diff options
111 files changed, 8324 insertions, 2220 deletions
diff --git a/arch/arm/include/asm/kprobes.h b/arch/arm/include/asm/kprobes.h index feec86768f9c..f82ec22eeb11 100644 --- a/arch/arm/include/asm/kprobes.h +++ b/arch/arm/include/asm/kprobes.h | |||
@@ -24,7 +24,6 @@ | |||
24 | #define MAX_INSN_SIZE 2 | 24 | #define MAX_INSN_SIZE 2 |
25 | #define MAX_STACK_SIZE 64 /* 32 would probably be OK */ | 25 | #define MAX_STACK_SIZE 64 /* 32 would probably be OK */ |
26 | 26 | ||
27 | #define regs_return_value(regs) ((regs)->ARM_r0) | ||
28 | #define flush_insn_slot(p) do { } while (0) | 27 | #define flush_insn_slot(p) do { } while (0) |
29 | #define kretprobe_blacklist_size 0 | 28 | #define kretprobe_blacklist_size 0 |
30 | 29 | ||
diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h index 96187ff58c24..451808ba1211 100644 --- a/arch/arm/include/asm/ptrace.h +++ b/arch/arm/include/asm/ptrace.h | |||
@@ -189,6 +189,11 @@ static inline int valid_user_regs(struct pt_regs *regs) | |||
189 | return 0; | 189 | return 0; |
190 | } | 190 | } |
191 | 191 | ||
192 | static inline long regs_return_value(struct pt_regs *regs) | ||
193 | { | ||
194 | return regs->ARM_r0; | ||
195 | } | ||
196 | |||
192 | #define instruction_pointer(regs) (regs)->ARM_pc | 197 | #define instruction_pointer(regs) (regs)->ARM_pc |
193 | 198 | ||
194 | #ifdef CONFIG_SMP | 199 | #ifdef CONFIG_SMP |
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 0f30c3a78fc1..d4c24d412a8d 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h | |||
@@ -129,6 +129,7 @@ extern void vfp_flush_hwstate(struct thread_info *); | |||
129 | /* | 129 | /* |
130 | * thread information flags: | 130 | * thread information flags: |
131 | * TIF_SYSCALL_TRACE - syscall trace active | 131 | * TIF_SYSCALL_TRACE - syscall trace active |
132 | * TIF_SYSCALL_AUDIT - syscall auditing active | ||
132 | * TIF_SIGPENDING - signal pending | 133 | * TIF_SIGPENDING - signal pending |
133 | * TIF_NEED_RESCHED - rescheduling necessary | 134 | * TIF_NEED_RESCHED - rescheduling necessary |
134 | * TIF_NOTIFY_RESUME - callback before returning to user | 135 | * TIF_NOTIFY_RESUME - callback before returning to user |
@@ -139,6 +140,7 @@ extern void vfp_flush_hwstate(struct thread_info *); | |||
139 | #define TIF_NEED_RESCHED 1 | 140 | #define TIF_NEED_RESCHED 1 |
140 | #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ | 141 | #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ |
141 | #define TIF_SYSCALL_TRACE 8 | 142 | #define TIF_SYSCALL_TRACE 8 |
143 | #define TIF_SYSCALL_AUDIT 9 | ||
142 | #define TIF_POLLING_NRFLAG 16 | 144 | #define TIF_POLLING_NRFLAG 16 |
143 | #define TIF_USING_IWMMXT 17 | 145 | #define TIF_USING_IWMMXT 17 |
144 | #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ | 146 | #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ |
@@ -149,11 +151,15 @@ extern void vfp_flush_hwstate(struct thread_info *); | |||
149 | #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) | 151 | #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) |
150 | #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) | 152 | #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) |
151 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) | 153 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) |
154 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) | ||
152 | #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) | 155 | #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) |
153 | #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) | 156 | #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) |
154 | #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) | 157 | #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) |
155 | #define _TIF_SECCOMP (1 << TIF_SECCOMP) | 158 | #define _TIF_SECCOMP (1 << TIF_SECCOMP) |
156 | 159 | ||
160 | /* Checks for any syscall work in entry-common.S */ | ||
161 | #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT) | ||
162 | |||
157 | /* | 163 | /* |
158 | * Change these and you break ASM code in entry-common.S | 164 | * Change these and you break ASM code in entry-common.S |
159 | */ | 165 | */ |
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index b2a27b6b0046..520889cf1b5b 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S | |||
@@ -87,7 +87,7 @@ ENTRY(ret_from_fork) | |||
87 | get_thread_info tsk | 87 | get_thread_info tsk |
88 | ldr r1, [tsk, #TI_FLAGS] @ check for syscall tracing | 88 | ldr r1, [tsk, #TI_FLAGS] @ check for syscall tracing |
89 | mov why, #1 | 89 | mov why, #1 |
90 | tst r1, #_TIF_SYSCALL_TRACE @ are we tracing syscalls? | 90 | tst r1, #_TIF_SYSCALL_WORK @ are we tracing syscalls? |
91 | beq ret_slow_syscall | 91 | beq ret_slow_syscall |
92 | mov r1, sp | 92 | mov r1, sp |
93 | mov r0, #1 @ trace exit [IP = 1] | 93 | mov r0, #1 @ trace exit [IP = 1] |
@@ -443,7 +443,7 @@ ENTRY(vector_swi) | |||
443 | 1: | 443 | 1: |
444 | #endif | 444 | #endif |
445 | 445 | ||
446 | tst r10, #_TIF_SYSCALL_TRACE @ are we tracing syscalls? | 446 | tst r10, #_TIF_SYSCALL_WORK @ are we tracing syscalls? |
447 | bne __sys_trace | 447 | bne __sys_trace |
448 | 448 | ||
449 | cmp scno, #NR_syscalls @ check upper syscall limit | 449 | cmp scno, #NR_syscalls @ check upper syscall limit |
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 483727ad6892..e1d5e1929fbd 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c | |||
@@ -906,11 +906,6 @@ asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno) | |||
906 | { | 906 | { |
907 | unsigned long ip; | 907 | unsigned long ip; |
908 | 908 | ||
909 | if (!test_thread_flag(TIF_SYSCALL_TRACE)) | ||
910 | return scno; | ||
911 | if (!(current->ptrace & PT_PTRACED)) | ||
912 | return scno; | ||
913 | |||
914 | /* | 909 | /* |
915 | * Save IP. IP is used to denote syscall entry/exit: | 910 | * Save IP. IP is used to denote syscall entry/exit: |
916 | * IP = 0 -> entry, = 1 -> exit | 911 | * IP = 0 -> entry, = 1 -> exit |
@@ -918,6 +913,17 @@ asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno) | |||
918 | ip = regs->ARM_ip; | 913 | ip = regs->ARM_ip; |
919 | regs->ARM_ip = why; | 914 | regs->ARM_ip = why; |
920 | 915 | ||
916 | if (!ip) | ||
917 | audit_syscall_exit(regs); | ||
918 | else | ||
919 | audit_syscall_entry(AUDIT_ARCH_ARMEB, scno, regs->ARM_r0, | ||
920 | regs->ARM_r1, regs->ARM_r2, regs->ARM_r3); | ||
921 | |||
922 | if (!test_thread_flag(TIF_SYSCALL_TRACE)) | ||
923 | return scno; | ||
924 | if (!(current->ptrace & PT_PTRACED)) | ||
925 | return scno; | ||
926 | |||
921 | current_thread_info()->syscall = scno; | 927 | current_thread_info()->syscall = scno; |
922 | 928 | ||
923 | /* the 0x80 provides a way for the tracing parent to distinguish | 929 | /* the 0x80 provides a way for the tracing parent to distinguish |
diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h index f5cb27614e35..68c98f5b3ca6 100644 --- a/arch/ia64/include/asm/ptrace.h +++ b/arch/ia64/include/asm/ptrace.h | |||
@@ -246,7 +246,18 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) | |||
246 | return regs->ar_bspstore; | 246 | return regs->ar_bspstore; |
247 | } | 247 | } |
248 | 248 | ||
249 | #define regs_return_value(regs) ((regs)->r8) | 249 | static inline int is_syscall_success(struct pt_regs *regs) |
250 | { | ||
251 | return regs->r10 != -1; | ||
252 | } | ||
253 | |||
254 | static inline long regs_return_value(struct pt_regs *regs) | ||
255 | { | ||
256 | if (is_syscall_success(regs)) | ||
257 | return regs->r8; | ||
258 | else | ||
259 | return -regs->r8; | ||
260 | } | ||
250 | 261 | ||
251 | /* Conserve space in histogram by encoding slot bits in address | 262 | /* Conserve space in histogram by encoding slot bits in address |
252 | * bits 2 and 3 rather than bits 0 and 1. | 263 | * bits 2 and 3 rather than bits 0 and 1. |
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 8848f43d819e..dad91661ddf9 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c | |||
@@ -1246,15 +1246,8 @@ syscall_trace_enter (long arg0, long arg1, long arg2, long arg3, | |||
1246 | if (test_thread_flag(TIF_RESTORE_RSE)) | 1246 | if (test_thread_flag(TIF_RESTORE_RSE)) |
1247 | ia64_sync_krbs(); | 1247 | ia64_sync_krbs(); |
1248 | 1248 | ||
1249 | if (unlikely(current->audit_context)) { | ||
1250 | long syscall; | ||
1251 | int arch; | ||
1252 | 1249 | ||
1253 | syscall = regs.r15; | 1250 | audit_syscall_entry(AUDIT_ARCH_IA64, regs.r15, arg0, arg1, arg2, arg3); |
1254 | arch = AUDIT_ARCH_IA64; | ||
1255 | |||
1256 | audit_syscall_entry(arch, syscall, arg0, arg1, arg2, arg3); | ||
1257 | } | ||
1258 | 1251 | ||
1259 | return 0; | 1252 | return 0; |
1260 | } | 1253 | } |
@@ -1268,14 +1261,7 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3, | |||
1268 | { | 1261 | { |
1269 | int step; | 1262 | int step; |
1270 | 1263 | ||
1271 | if (unlikely(current->audit_context)) { | 1264 | audit_syscall_exit(&regs); |
1272 | int success = AUDITSC_RESULT(regs.r10); | ||
1273 | long result = regs.r8; | ||
1274 | |||
1275 | if (success != AUDITSC_SUCCESS) | ||
1276 | result = -result; | ||
1277 | audit_syscall_exit(success, result); | ||
1278 | } | ||
1279 | 1265 | ||
1280 | step = test_thread_flag(TIF_SINGLESTEP); | 1266 | step = test_thread_flag(TIF_SINGLESTEP); |
1281 | if (step || test_thread_flag(TIF_SYSCALL_TRACE)) | 1267 | if (step || test_thread_flag(TIF_SYSCALL_TRACE)) |
diff --git a/arch/microblaze/include/asm/ptrace.h b/arch/microblaze/include/asm/ptrace.h index 816bee64b196..94e92c805859 100644 --- a/arch/microblaze/include/asm/ptrace.h +++ b/arch/microblaze/include/asm/ptrace.h | |||
@@ -61,6 +61,11 @@ struct pt_regs { | |||
61 | #define instruction_pointer(regs) ((regs)->pc) | 61 | #define instruction_pointer(regs) ((regs)->pc) |
62 | #define profile_pc(regs) instruction_pointer(regs) | 62 | #define profile_pc(regs) instruction_pointer(regs) |
63 | 63 | ||
64 | static inline long regs_return_value(struct pt_regs *regs) | ||
65 | { | ||
66 | return regs->r3; | ||
67 | } | ||
68 | |||
64 | #else /* __KERNEL__ */ | 69 | #else /* __KERNEL__ */ |
65 | 70 | ||
66 | /* pt_regs offsets used by gdbserver etc in ptrace syscalls */ | 71 | /* pt_regs offsets used by gdbserver etc in ptrace syscalls */ |
diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c index 043cb58f9c44..6eb2aa927d89 100644 --- a/arch/microblaze/kernel/ptrace.c +++ b/arch/microblaze/kernel/ptrace.c | |||
@@ -147,10 +147,8 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) | |||
147 | */ | 147 | */ |
148 | ret = -1L; | 148 | ret = -1L; |
149 | 149 | ||
150 | if (unlikely(current->audit_context)) | 150 | audit_syscall_entry(EM_MICROBLAZE, regs->r12, regs->r5, regs->r6, |
151 | audit_syscall_entry(EM_MICROBLAZE, regs->r12, | 151 | regs->r7, regs->r8); |
152 | regs->r5, regs->r6, | ||
153 | regs->r7, regs->r8); | ||
154 | 152 | ||
155 | return ret ?: regs->r12; | 153 | return ret ?: regs->r12; |
156 | } | 154 | } |
@@ -159,8 +157,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) | |||
159 | { | 157 | { |
160 | int step; | 158 | int step; |
161 | 159 | ||
162 | if (unlikely(current->audit_context)) | 160 | audit_syscall_exit(regs); |
163 | audit_syscall_exit(AUDITSC_RESULT(regs->r3), regs->r3); | ||
164 | 161 | ||
165 | step = test_thread_flag(TIF_SINGLESTEP); | 162 | step = test_thread_flag(TIF_SINGLESTEP); |
166 | if (step || test_thread_flag(TIF_SYSCALL_TRACE)) | 163 | if (step || test_thread_flag(TIF_SYSCALL_TRACE)) |
diff --git a/arch/microblaze/kernel/setup.c b/arch/microblaze/kernel/setup.c index 604cd9dd1333..d4fc1a971779 100644 --- a/arch/microblaze/kernel/setup.c +++ b/arch/microblaze/kernel/setup.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/cache.h> | 26 | #include <linux/cache.h> |
27 | #include <linux/of_platform.h> | 27 | #include <linux/of_platform.h> |
28 | #include <linux/dma-mapping.h> | 28 | #include <linux/dma-mapping.h> |
29 | #include <linux/cpu.h> | ||
29 | #include <asm/cacheflush.h> | 30 | #include <asm/cacheflush.h> |
30 | #include <asm/entry.h> | 31 | #include <asm/entry.h> |
31 | #include <asm/cpuinfo.h> | 32 | #include <asm/cpuinfo.h> |
@@ -226,5 +227,23 @@ static int __init setup_bus_notifier(void) | |||
226 | 227 | ||
227 | return 0; | 228 | return 0; |
228 | } | 229 | } |
229 | |||
230 | arch_initcall(setup_bus_notifier); | 230 | arch_initcall(setup_bus_notifier); |
231 | |||
232 | static DEFINE_PER_CPU(struct cpu, cpu_devices); | ||
233 | |||
234 | static int __init topology_init(void) | ||
235 | { | ||
236 | int i, ret; | ||
237 | |||
238 | for_each_present_cpu(i) { | ||
239 | struct cpu *c = &per_cpu(cpu_devices, i); | ||
240 | |||
241 | ret = register_cpu(c, i); | ||
242 | if (ret) | ||
243 | printk(KERN_WARNING "topology_init: register_cpu %d " | ||
244 | "failed (%d)\n", i, ret); | ||
245 | } | ||
246 | |||
247 | return 0; | ||
248 | } | ||
249 | subsys_initcall(topology_init); | ||
diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index 7b99c670e478..4b7f5252d2fd 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h | |||
@@ -137,7 +137,19 @@ extern int ptrace_set_watch_regs(struct task_struct *child, | |||
137 | */ | 137 | */ |
138 | #define user_mode(regs) (((regs)->cp0_status & KU_MASK) == KU_USER) | 138 | #define user_mode(regs) (((regs)->cp0_status & KU_MASK) == KU_USER) |
139 | 139 | ||
140 | #define regs_return_value(_regs) ((_regs)->regs[2]) | 140 | static inline int is_syscall_success(struct pt_regs *regs) |
141 | { | ||
142 | return !regs->regs[7]; | ||
143 | } | ||
144 | |||
145 | static inline long regs_return_value(struct pt_regs *regs) | ||
146 | { | ||
147 | if (is_syscall_success(regs)) | ||
148 | return regs->regs[2]; | ||
149 | else | ||
150 | return -regs->regs[2]; | ||
151 | } | ||
152 | |||
141 | #define instruction_pointer(regs) ((regs)->cp0_epc) | 153 | #define instruction_pointer(regs) ((regs)->cp0_epc) |
142 | #define profile_pc(regs) instruction_pointer(regs) | 154 | #define profile_pc(regs) instruction_pointer(regs) |
143 | 155 | ||
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 4e6ea1ffad46..7786b608d932 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c | |||
@@ -560,10 +560,9 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) | |||
560 | } | 560 | } |
561 | 561 | ||
562 | out: | 562 | out: |
563 | if (unlikely(current->audit_context)) | 563 | audit_syscall_entry(audit_arch(), regs->regs[2], |
564 | audit_syscall_entry(audit_arch(), regs->regs[2], | 564 | regs->regs[4], regs->regs[5], |
565 | regs->regs[4], regs->regs[5], | 565 | regs->regs[6], regs->regs[7]); |
566 | regs->regs[6], regs->regs[7]); | ||
567 | } | 566 | } |
568 | 567 | ||
569 | /* | 568 | /* |
@@ -572,9 +571,7 @@ out: | |||
572 | */ | 571 | */ |
573 | asmlinkage void syscall_trace_leave(struct pt_regs *regs) | 572 | asmlinkage void syscall_trace_leave(struct pt_regs *regs) |
574 | { | 573 | { |
575 | if (unlikely(current->audit_context)) | 574 | audit_syscall_exit(regs); |
576 | audit_syscall_exit(AUDITSC_RESULT(regs->regs[7]), | ||
577 | -regs->regs[2]); | ||
578 | 575 | ||
579 | if (!(current->ptrace & PT_PTRACED)) | 576 | if (!(current->ptrace & PT_PTRACED)) |
580 | return; | 577 | return; |
diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 48223f9b8728..78a205162fd7 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h | |||
@@ -86,7 +86,18 @@ struct pt_regs { | |||
86 | #define instruction_pointer(regs) ((regs)->nip) | 86 | #define instruction_pointer(regs) ((regs)->nip) |
87 | #define user_stack_pointer(regs) ((regs)->gpr[1]) | 87 | #define user_stack_pointer(regs) ((regs)->gpr[1]) |
88 | #define kernel_stack_pointer(regs) ((regs)->gpr[1]) | 88 | #define kernel_stack_pointer(regs) ((regs)->gpr[1]) |
89 | #define regs_return_value(regs) ((regs)->gpr[3]) | 89 | static inline int is_syscall_success(struct pt_regs *regs) |
90 | { | ||
91 | return !(regs->ccr & 0x10000000); | ||
92 | } | ||
93 | |||
94 | static inline long regs_return_value(struct pt_regs *regs) | ||
95 | { | ||
96 | if (is_syscall_success(regs)) | ||
97 | return regs->gpr[3]; | ||
98 | else | ||
99 | return -regs->gpr[3]; | ||
100 | } | ||
90 | 101 | ||
91 | #ifdef CONFIG_SMP | 102 | #ifdef CONFIG_SMP |
92 | extern unsigned long profile_pc(struct pt_regs *regs); | 103 | extern unsigned long profile_pc(struct pt_regs *regs); |
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 5de73dbd15c7..5b43325402bc 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c | |||
@@ -1724,22 +1724,20 @@ long do_syscall_trace_enter(struct pt_regs *regs) | |||
1724 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) | 1724 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
1725 | trace_sys_enter(regs, regs->gpr[0]); | 1725 | trace_sys_enter(regs, regs->gpr[0]); |
1726 | 1726 | ||
1727 | if (unlikely(current->audit_context)) { | ||
1728 | #ifdef CONFIG_PPC64 | 1727 | #ifdef CONFIG_PPC64 |
1729 | if (!is_32bit_task()) | 1728 | if (!is_32bit_task()) |
1730 | audit_syscall_entry(AUDIT_ARCH_PPC64, | 1729 | audit_syscall_entry(AUDIT_ARCH_PPC64, |
1731 | regs->gpr[0], | 1730 | regs->gpr[0], |
1732 | regs->gpr[3], regs->gpr[4], | 1731 | regs->gpr[3], regs->gpr[4], |
1733 | regs->gpr[5], regs->gpr[6]); | 1732 | regs->gpr[5], regs->gpr[6]); |
1734 | else | 1733 | else |
1735 | #endif | 1734 | #endif |
1736 | audit_syscall_entry(AUDIT_ARCH_PPC, | 1735 | audit_syscall_entry(AUDIT_ARCH_PPC, |
1737 | regs->gpr[0], | 1736 | regs->gpr[0], |
1738 | regs->gpr[3] & 0xffffffff, | 1737 | regs->gpr[3] & 0xffffffff, |
1739 | regs->gpr[4] & 0xffffffff, | 1738 | regs->gpr[4] & 0xffffffff, |
1740 | regs->gpr[5] & 0xffffffff, | 1739 | regs->gpr[5] & 0xffffffff, |
1741 | regs->gpr[6] & 0xffffffff); | 1740 | regs->gpr[6] & 0xffffffff); |
1742 | } | ||
1743 | 1741 | ||
1744 | return ret ?: regs->gpr[0]; | 1742 | return ret ?: regs->gpr[0]; |
1745 | } | 1743 | } |
@@ -1748,9 +1746,7 @@ void do_syscall_trace_leave(struct pt_regs *regs) | |||
1748 | { | 1746 | { |
1749 | int step; | 1747 | int step; |
1750 | 1748 | ||
1751 | if (unlikely(current->audit_context)) | 1749 | audit_syscall_exit(regs); |
1752 | audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS, | ||
1753 | regs->result); | ||
1754 | 1750 | ||
1755 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) | 1751 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
1756 | trace_sys_exit(regs, regs->result); | 1752 | trace_sys_exit(regs, regs->result); |
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 56da355678f4..aeb77f017985 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h | |||
@@ -541,9 +541,13 @@ struct user_regs_struct | |||
541 | #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) | 541 | #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) |
542 | #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) | 542 | #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) |
543 | #define user_stack_pointer(regs)((regs)->gprs[15]) | 543 | #define user_stack_pointer(regs)((regs)->gprs[15]) |
544 | #define regs_return_value(regs)((regs)->gprs[2]) | ||
545 | #define profile_pc(regs) instruction_pointer(regs) | 544 | #define profile_pc(regs) instruction_pointer(regs) |
546 | 545 | ||
546 | static inline long regs_return_value(struct pt_regs *regs) | ||
547 | { | ||
548 | return regs->gprs[2]; | ||
549 | } | ||
550 | |||
547 | int regs_query_register_offset(const char *name); | 551 | int regs_query_register_offset(const char *name); |
548 | const char *regs_query_register_name(unsigned int offset); | 552 | const char *regs_query_register_name(unsigned int offset); |
549 | unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset); | 553 | unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset); |
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 573bc29551ef..9d82ed4bcb27 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c | |||
@@ -740,20 +740,17 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) | |||
740 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) | 740 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
741 | trace_sys_enter(regs, regs->gprs[2]); | 741 | trace_sys_enter(regs, regs->gprs[2]); |
742 | 742 | ||
743 | if (unlikely(current->audit_context)) | 743 | audit_syscall_entry(is_compat_task() ? |
744 | audit_syscall_entry(is_compat_task() ? | 744 | AUDIT_ARCH_S390 : AUDIT_ARCH_S390X, |
745 | AUDIT_ARCH_S390 : AUDIT_ARCH_S390X, | 745 | regs->gprs[2], regs->orig_gpr2, |
746 | regs->gprs[2], regs->orig_gpr2, | 746 | regs->gprs[3], regs->gprs[4], |
747 | regs->gprs[3], regs->gprs[4], | 747 | regs->gprs[5]); |
748 | regs->gprs[5]); | ||
749 | return ret ?: regs->gprs[2]; | 748 | return ret ?: regs->gprs[2]; |
750 | } | 749 | } |
751 | 750 | ||
752 | asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) | 751 | asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) |
753 | { | 752 | { |
754 | if (unlikely(current->audit_context)) | 753 | audit_syscall_exit(regs); |
755 | audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), | ||
756 | regs->gprs[2]); | ||
757 | 754 | ||
758 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) | 755 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
759 | trace_sys_exit(regs, regs->gprs[2]); | 756 | trace_sys_exit(regs, regs->gprs[2]); |
diff --git a/arch/sh/include/asm/ptrace_32.h b/arch/sh/include/asm/ptrace_32.h index 6c2239cca1a2..2d3e906aa722 100644 --- a/arch/sh/include/asm/ptrace_32.h +++ b/arch/sh/include/asm/ptrace_32.h | |||
@@ -76,7 +76,10 @@ struct pt_dspregs { | |||
76 | #ifdef __KERNEL__ | 76 | #ifdef __KERNEL__ |
77 | 77 | ||
78 | #define MAX_REG_OFFSET offsetof(struct pt_regs, tra) | 78 | #define MAX_REG_OFFSET offsetof(struct pt_regs, tra) |
79 | #define regs_return_value(_regs) ((_regs)->regs[0]) | 79 | static inline long regs_return_value(struct pt_regs *regs) |
80 | { | ||
81 | return regs->regs[0]; | ||
82 | } | ||
80 | 83 | ||
81 | #endif /* __KERNEL__ */ | 84 | #endif /* __KERNEL__ */ |
82 | 85 | ||
diff --git a/arch/sh/include/asm/ptrace_64.h b/arch/sh/include/asm/ptrace_64.h index bf9be7764d69..eb3fcceaf64b 100644 --- a/arch/sh/include/asm/ptrace_64.h +++ b/arch/sh/include/asm/ptrace_64.h | |||
@@ -13,7 +13,10 @@ struct pt_regs { | |||
13 | #ifdef __KERNEL__ | 13 | #ifdef __KERNEL__ |
14 | 14 | ||
15 | #define MAX_REG_OFFSET offsetof(struct pt_regs, tregs[7]) | 15 | #define MAX_REG_OFFSET offsetof(struct pt_regs, tregs[7]) |
16 | #define regs_return_value(_regs) ((_regs)->regs[3]) | 16 | static inline long regs_return_value(struct pt_regs *regs) |
17 | { | ||
18 | return regs->regs[3]; | ||
19 | } | ||
17 | 20 | ||
18 | #endif /* __KERNEL__ */ | 21 | #endif /* __KERNEL__ */ |
19 | 22 | ||
diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 92b3c276339a..a3e651563763 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c | |||
@@ -518,10 +518,9 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) | |||
518 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) | 518 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
519 | trace_sys_enter(regs, regs->regs[0]); | 519 | trace_sys_enter(regs, regs->regs[0]); |
520 | 520 | ||
521 | if (unlikely(current->audit_context)) | 521 | audit_syscall_entry(audit_arch(), regs->regs[3], |
522 | audit_syscall_entry(audit_arch(), regs->regs[3], | 522 | regs->regs[4], regs->regs[5], |
523 | regs->regs[4], regs->regs[5], | 523 | regs->regs[6], regs->regs[7]); |
524 | regs->regs[6], regs->regs[7]); | ||
525 | 524 | ||
526 | return ret ?: regs->regs[0]; | 525 | return ret ?: regs->regs[0]; |
527 | } | 526 | } |
@@ -530,9 +529,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) | |||
530 | { | 529 | { |
531 | int step; | 530 | int step; |
532 | 531 | ||
533 | if (unlikely(current->audit_context)) | 532 | audit_syscall_exit(regs); |
534 | audit_syscall_exit(AUDITSC_RESULT(regs->regs[0]), | ||
535 | regs->regs[0]); | ||
536 | 533 | ||
537 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) | 534 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
538 | trace_sys_exit(regs, regs->regs[0]); | 535 | trace_sys_exit(regs, regs->regs[0]); |
diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c index c8f97649f354..3d0080b5c976 100644 --- a/arch/sh/kernel/ptrace_64.c +++ b/arch/sh/kernel/ptrace_64.c | |||
@@ -536,10 +536,9 @@ asmlinkage long long do_syscall_trace_enter(struct pt_regs *regs) | |||
536 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) | 536 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
537 | trace_sys_enter(regs, regs->regs[9]); | 537 | trace_sys_enter(regs, regs->regs[9]); |
538 | 538 | ||
539 | if (unlikely(current->audit_context)) | 539 | audit_syscall_entry(audit_arch(), regs->regs[1], |
540 | audit_syscall_entry(audit_arch(), regs->regs[1], | 540 | regs->regs[2], regs->regs[3], |
541 | regs->regs[2], regs->regs[3], | 541 | regs->regs[4], regs->regs[5]); |
542 | regs->regs[4], regs->regs[5]); | ||
543 | 542 | ||
544 | return ret ?: regs->regs[9]; | 543 | return ret ?: regs->regs[9]; |
545 | } | 544 | } |
@@ -548,9 +547,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) | |||
548 | { | 547 | { |
549 | int step; | 548 | int step; |
550 | 549 | ||
551 | if (unlikely(current->audit_context)) | 550 | audit_syscall_exit(regs); |
552 | audit_syscall_exit(AUDITSC_RESULT(regs->regs[9]), | ||
553 | regs->regs[9]); | ||
554 | 551 | ||
555 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) | 552 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
556 | trace_sys_exit(regs, regs->regs[9]); | 553 | trace_sys_exit(regs, regs->regs[9]); |
diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h index a0e1bcf843a1..c00c3b5c2806 100644 --- a/arch/sparc/include/asm/ptrace.h +++ b/arch/sparc/include/asm/ptrace.h | |||
@@ -207,7 +207,15 @@ do { current_thread_info()->syscall_noerror = 1; \ | |||
207 | #define instruction_pointer(regs) ((regs)->tpc) | 207 | #define instruction_pointer(regs) ((regs)->tpc) |
208 | #define instruction_pointer_set(regs, val) ((regs)->tpc = (val)) | 208 | #define instruction_pointer_set(regs, val) ((regs)->tpc = (val)) |
209 | #define user_stack_pointer(regs) ((regs)->u_regs[UREG_FP]) | 209 | #define user_stack_pointer(regs) ((regs)->u_regs[UREG_FP]) |
210 | #define regs_return_value(regs) ((regs)->u_regs[UREG_I0]) | 210 | static inline int is_syscall_success(struct pt_regs *regs) |
211 | { | ||
212 | return !(regs->tstate & (TSTATE_XCARRY | TSTATE_ICARRY)); | ||
213 | } | ||
214 | |||
215 | static inline long regs_return_value(struct pt_regs *regs) | ||
216 | { | ||
217 | return regs->u_regs[UREG_I0]; | ||
218 | } | ||
211 | #ifdef CONFIG_SMP | 219 | #ifdef CONFIG_SMP |
212 | extern unsigned long profile_pc(struct pt_regs *); | 220 | extern unsigned long profile_pc(struct pt_regs *); |
213 | #else | 221 | #else |
diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 96ee50a80661..9388844cd88c 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c | |||
@@ -1071,32 +1071,22 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs) | |||
1071 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) | 1071 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
1072 | trace_sys_enter(regs, regs->u_regs[UREG_G1]); | 1072 | trace_sys_enter(regs, regs->u_regs[UREG_G1]); |
1073 | 1073 | ||
1074 | if (unlikely(current->audit_context) && !ret) | 1074 | audit_syscall_entry((test_thread_flag(TIF_32BIT) ? |
1075 | audit_syscall_entry((test_thread_flag(TIF_32BIT) ? | 1075 | AUDIT_ARCH_SPARC : |
1076 | AUDIT_ARCH_SPARC : | 1076 | AUDIT_ARCH_SPARC64), |
1077 | AUDIT_ARCH_SPARC64), | 1077 | regs->u_regs[UREG_G1], |
1078 | regs->u_regs[UREG_G1], | 1078 | regs->u_regs[UREG_I0], |
1079 | regs->u_regs[UREG_I0], | 1079 | regs->u_regs[UREG_I1], |
1080 | regs->u_regs[UREG_I1], | 1080 | regs->u_regs[UREG_I2], |
1081 | regs->u_regs[UREG_I2], | 1081 | regs->u_regs[UREG_I3]); |
1082 | regs->u_regs[UREG_I3]); | ||
1083 | 1082 | ||
1084 | return ret; | 1083 | return ret; |
1085 | } | 1084 | } |
1086 | 1085 | ||
1087 | asmlinkage void syscall_trace_leave(struct pt_regs *regs) | 1086 | asmlinkage void syscall_trace_leave(struct pt_regs *regs) |
1088 | { | 1087 | { |
1089 | #ifdef CONFIG_AUDITSYSCALL | 1088 | audit_syscall_exit(regs); |
1090 | if (unlikely(current->audit_context)) { | ||
1091 | unsigned long tstate = regs->tstate; | ||
1092 | int result = AUDITSC_SUCCESS; | ||
1093 | 1089 | ||
1094 | if (unlikely(tstate & (TSTATE_XCARRY | TSTATE_ICARRY))) | ||
1095 | result = AUDITSC_FAILURE; | ||
1096 | |||
1097 | audit_syscall_exit(result, regs->u_regs[UREG_I0]); | ||
1098 | } | ||
1099 | #endif | ||
1100 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) | 1090 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
1101 | trace_sys_exit(regs, regs->u_regs[UREG_G1]); | 1091 | trace_sys_exit(regs, regs->u_regs[UREG_G1]); |
1102 | 1092 | ||
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index c9da32b0c707..06b190390505 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c | |||
@@ -167,17 +167,15 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit) | |||
167 | int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit; | 167 | int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit; |
168 | int tracesysgood; | 168 | int tracesysgood; |
169 | 169 | ||
170 | if (unlikely(current->audit_context)) { | 170 | if (!entryexit) |
171 | if (!entryexit) | 171 | audit_syscall_entry(HOST_AUDIT_ARCH, |
172 | audit_syscall_entry(HOST_AUDIT_ARCH, | 172 | UPT_SYSCALL_NR(regs), |
173 | UPT_SYSCALL_NR(regs), | 173 | UPT_SYSCALL_ARG1(regs), |
174 | UPT_SYSCALL_ARG1(regs), | 174 | UPT_SYSCALL_ARG2(regs), |
175 | UPT_SYSCALL_ARG2(regs), | 175 | UPT_SYSCALL_ARG3(regs), |
176 | UPT_SYSCALL_ARG3(regs), | 176 | UPT_SYSCALL_ARG4(regs)); |
177 | UPT_SYSCALL_ARG4(regs)); | 177 | else |
178 | else audit_syscall_exit(AUDITSC_RESULT(UPT_SYSCALL_RET(regs)), | 178 | audit_syscall_exit(regs); |
179 | UPT_SYSCALL_RET(regs)); | ||
180 | } | ||
181 | 179 | ||
182 | /* Fake a debug trap */ | 180 | /* Fake a debug trap */ |
183 | if (is_singlestep) | 181 | if (is_singlestep) |
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 1106261856c8..e3e734005e19 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <asm/segment.h> | 14 | #include <asm/segment.h> |
15 | #include <asm/irqflags.h> | 15 | #include <asm/irqflags.h> |
16 | #include <linux/linkage.h> | 16 | #include <linux/linkage.h> |
17 | #include <linux/err.h> | ||
17 | 18 | ||
18 | /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ | 19 | /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ |
19 | #include <linux/elf-em.h> | 20 | #include <linux/elf-em.h> |
@@ -189,7 +190,7 @@ sysexit_from_sys_call: | |||
189 | movl %ebx,%edx /* 3rd arg: 1st syscall arg */ | 190 | movl %ebx,%edx /* 3rd arg: 1st syscall arg */ |
190 | movl %eax,%esi /* 2nd arg: syscall number */ | 191 | movl %eax,%esi /* 2nd arg: syscall number */ |
191 | movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ | 192 | movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ |
192 | call audit_syscall_entry | 193 | call __audit_syscall_entry |
193 | movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ | 194 | movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ |
194 | cmpq $(IA32_NR_syscalls-1),%rax | 195 | cmpq $(IA32_NR_syscalls-1),%rax |
195 | ja ia32_badsys | 196 | ja ia32_badsys |
@@ -206,12 +207,13 @@ sysexit_from_sys_call: | |||
206 | TRACE_IRQS_ON | 207 | TRACE_IRQS_ON |
207 | sti | 208 | sti |
208 | movl %eax,%esi /* second arg, syscall return value */ | 209 | movl %eax,%esi /* second arg, syscall return value */ |
209 | cmpl $0,%eax /* is it < 0? */ | 210 | cmpl $-MAX_ERRNO,%eax /* is it an error ? */ |
210 | setl %al /* 1 if so, 0 if not */ | 211 | jbe 1f |
212 | movslq %eax, %rsi /* if error sign extend to 64 bits */ | ||
213 | 1: setbe %al /* 1 if error, 0 if not */ | ||
211 | movzbl %al,%edi /* zero-extend that into %edi */ | 214 | movzbl %al,%edi /* zero-extend that into %edi */ |
212 | inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ | 215 | call __audit_syscall_exit |
213 | call audit_syscall_exit | 216 | movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ |
214 | movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ | ||
215 | movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi | 217 | movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi |
216 | cli | 218 | cli |
217 | TRACE_IRQS_OFF | 219 | TRACE_IRQS_OFF |
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 4af9fd2450a5..79d97e68f042 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -42,6 +42,7 @@ | |||
42 | */ | 42 | */ |
43 | 43 | ||
44 | #include <linux/linkage.h> | 44 | #include <linux/linkage.h> |
45 | #include <linux/err.h> | ||
45 | #include <asm/thread_info.h> | 46 | #include <asm/thread_info.h> |
46 | #include <asm/irqflags.h> | 47 | #include <asm/irqflags.h> |
47 | #include <asm/errno.h> | 48 | #include <asm/errno.h> |
@@ -453,7 +454,7 @@ sysenter_audit: | |||
453 | movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ | 454 | movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ |
454 | movl %eax,%edx /* 2nd arg: syscall number */ | 455 | movl %eax,%edx /* 2nd arg: syscall number */ |
455 | movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ | 456 | movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ |
456 | call audit_syscall_entry | 457 | call __audit_syscall_entry |
457 | pushl_cfi %ebx | 458 | pushl_cfi %ebx |
458 | movl PT_EAX(%esp),%eax /* reload syscall number */ | 459 | movl PT_EAX(%esp),%eax /* reload syscall number */ |
459 | jmp sysenter_do_call | 460 | jmp sysenter_do_call |
@@ -464,11 +465,10 @@ sysexit_audit: | |||
464 | TRACE_IRQS_ON | 465 | TRACE_IRQS_ON |
465 | ENABLE_INTERRUPTS(CLBR_ANY) | 466 | ENABLE_INTERRUPTS(CLBR_ANY) |
466 | movl %eax,%edx /* second arg, syscall return value */ | 467 | movl %eax,%edx /* second arg, syscall return value */ |
467 | cmpl $0,%eax /* is it < 0? */ | 468 | cmpl $-MAX_ERRNO,%eax /* is it an error ? */ |
468 | setl %al /* 1 if so, 0 if not */ | 469 | setbe %al /* 1 if so, 0 if not */ |
469 | movzbl %al,%eax /* zero-extend that */ | 470 | movzbl %al,%eax /* zero-extend that */ |
470 | inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ | 471 | call __audit_syscall_exit |
471 | call audit_syscall_exit | ||
472 | DISABLE_INTERRUPTS(CLBR_ANY) | 472 | DISABLE_INTERRUPTS(CLBR_ANY) |
473 | TRACE_IRQS_OFF | 473 | TRACE_IRQS_OFF |
474 | movl TI_flags(%ebp), %ecx | 474 | movl TI_flags(%ebp), %ecx |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 940ba711fc28..3fe8239fd8fb 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -55,6 +55,7 @@ | |||
55 | #include <asm/paravirt.h> | 55 | #include <asm/paravirt.h> |
56 | #include <asm/ftrace.h> | 56 | #include <asm/ftrace.h> |
57 | #include <asm/percpu.h> | 57 | #include <asm/percpu.h> |
58 | #include <linux/err.h> | ||
58 | 59 | ||
59 | /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ | 60 | /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ |
60 | #include <linux/elf-em.h> | 61 | #include <linux/elf-em.h> |
@@ -548,7 +549,7 @@ badsys: | |||
548 | #ifdef CONFIG_AUDITSYSCALL | 549 | #ifdef CONFIG_AUDITSYSCALL |
549 | /* | 550 | /* |
550 | * Fast path for syscall audit without full syscall trace. | 551 | * Fast path for syscall audit without full syscall trace. |
551 | * We just call audit_syscall_entry() directly, and then | 552 | * We just call __audit_syscall_entry() directly, and then |
552 | * jump back to the normal fast path. | 553 | * jump back to the normal fast path. |
553 | */ | 554 | */ |
554 | auditsys: | 555 | auditsys: |
@@ -558,22 +559,21 @@ auditsys: | |||
558 | movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ | 559 | movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ |
559 | movq %rax,%rsi /* 2nd arg: syscall number */ | 560 | movq %rax,%rsi /* 2nd arg: syscall number */ |
560 | movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ | 561 | movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ |
561 | call audit_syscall_entry | 562 | call __audit_syscall_entry |
562 | LOAD_ARGS 0 /* reload call-clobbered registers */ | 563 | LOAD_ARGS 0 /* reload call-clobbered registers */ |
563 | jmp system_call_fastpath | 564 | jmp system_call_fastpath |
564 | 565 | ||
565 | /* | 566 | /* |
566 | * Return fast path for syscall audit. Call audit_syscall_exit() | 567 | * Return fast path for syscall audit. Call __audit_syscall_exit() |
567 | * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT | 568 | * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT |
568 | * masked off. | 569 | * masked off. |
569 | */ | 570 | */ |
570 | sysret_audit: | 571 | sysret_audit: |
571 | movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ | 572 | movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ |
572 | cmpq $0,%rsi /* is it < 0? */ | 573 | cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */ |
573 | setl %al /* 1 if so, 0 if not */ | 574 | setbe %al /* 1 if so, 0 if not */ |
574 | movzbl %al,%edi /* zero-extend that into %edi */ | 575 | movzbl %al,%edi /* zero-extend that into %edi */ |
575 | inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ | 576 | call __audit_syscall_exit |
576 | call audit_syscall_exit | ||
577 | movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi | 577 | movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi |
578 | jmp sysret_check | 578 | jmp sysret_check |
579 | #endif /* CONFIG_AUDITSYSCALL */ | 579 | #endif /* CONFIG_AUDITSYSCALL */ |
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 89a04c7b5bb6..50267386b766 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c | |||
@@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs) | |||
1392 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) | 1392 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
1393 | trace_sys_enter(regs, regs->orig_ax); | 1393 | trace_sys_enter(regs, regs->orig_ax); |
1394 | 1394 | ||
1395 | if (unlikely(current->audit_context)) { | 1395 | if (IS_IA32) |
1396 | if (IS_IA32) | 1396 | audit_syscall_entry(AUDIT_ARCH_I386, |
1397 | audit_syscall_entry(AUDIT_ARCH_I386, | 1397 | regs->orig_ax, |
1398 | regs->orig_ax, | 1398 | regs->bx, regs->cx, |
1399 | regs->bx, regs->cx, | 1399 | regs->dx, regs->si); |
1400 | regs->dx, regs->si); | ||
1401 | #ifdef CONFIG_X86_64 | 1400 | #ifdef CONFIG_X86_64 |
1402 | else | 1401 | else |
1403 | audit_syscall_entry(AUDIT_ARCH_X86_64, | 1402 | audit_syscall_entry(AUDIT_ARCH_X86_64, |
1404 | regs->orig_ax, | 1403 | regs->orig_ax, |
1405 | regs->di, regs->si, | 1404 | regs->di, regs->si, |
1406 | regs->dx, regs->r10); | 1405 | regs->dx, regs->r10); |
1407 | #endif | 1406 | #endif |
1408 | } | ||
1409 | 1407 | ||
1410 | return ret ?: regs->orig_ax; | 1408 | return ret ?: regs->orig_ax; |
1411 | } | 1409 | } |
@@ -1414,8 +1412,7 @@ void syscall_trace_leave(struct pt_regs *regs) | |||
1414 | { | 1412 | { |
1415 | bool step; | 1413 | bool step; |
1416 | 1414 | ||
1417 | if (unlikely(current->audit_context)) | 1415 | audit_syscall_exit(regs); |
1418 | audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); | ||
1419 | 1416 | ||
1420 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) | 1417 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
1421 | trace_sys_exit(regs, regs->ax); | 1418 | trace_sys_exit(regs, regs->ax); |
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 863f8753ab0a..af17e1c966dc 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c | |||
@@ -335,9 +335,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk | |||
335 | if (info->flags & VM86_SCREEN_BITMAP) | 335 | if (info->flags & VM86_SCREEN_BITMAP) |
336 | mark_screen_rdonly(tsk->mm); | 336 | mark_screen_rdonly(tsk->mm); |
337 | 337 | ||
338 | /*call audit_syscall_exit since we do not exit via the normal paths */ | 338 | /*call __audit_syscall_exit since we do not exit via the normal paths */ |
339 | if (unlikely(current->audit_context)) | 339 | if (unlikely(current->audit_context)) |
340 | audit_syscall_exit(AUDITSC_RESULT(0), 0); | 340 | __audit_syscall_exit(1, 0); |
341 | 341 | ||
342 | __asm__ __volatile__( | 342 | __asm__ __volatile__( |
343 | "movl %0,%%esp\n\t" | 343 | "movl %0,%%esp\n\t" |
diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h index 711b1621747f..5ef9344a8b24 100644 --- a/arch/x86/um/shared/sysdep/ptrace.h +++ b/arch/x86/um/shared/sysdep/ptrace.h | |||
@@ -3,3 +3,8 @@ | |||
3 | #else | 3 | #else |
4 | #include "ptrace_64.h" | 4 | #include "ptrace_64.h" |
5 | #endif | 5 | #endif |
6 | |||
7 | static inline long regs_return_value(struct uml_pt_regs *regs) | ||
8 | { | ||
9 | return UPT_SYSCALL_RET(regs); | ||
10 | } | ||
diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c index a0d042aa2967..2dff698ab02e 100644 --- a/arch/xtensa/kernel/ptrace.c +++ b/arch/xtensa/kernel/ptrace.c | |||
@@ -334,8 +334,7 @@ void do_syscall_trace_enter(struct pt_regs *regs) | |||
334 | do_syscall_trace(); | 334 | do_syscall_trace(); |
335 | 335 | ||
336 | #if 0 | 336 | #if 0 |
337 | if (unlikely(current->audit_context)) | 337 | audit_syscall_entry(current, AUDIT_ARCH_XTENSA..); |
338 | audit_syscall_entry(current, AUDIT_ARCH_XTENSA..); | ||
339 | #endif | 338 | #endif |
340 | } | 339 | } |
341 | 340 | ||
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 163263ddd381..ee55019066a1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -3117,18 +3117,17 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, | |||
3117 | */ | 3117 | */ |
3118 | static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) | 3118 | static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) |
3119 | { | 3119 | { |
3120 | struct cfq_queue *old_cfqq = cfqd->active_queue; | ||
3121 | |||
3122 | cfq_log_cfqq(cfqd, cfqq, "preempt"); | 3120 | cfq_log_cfqq(cfqd, cfqq, "preempt"); |
3123 | cfq_slice_expired(cfqd, 1); | ||
3124 | 3121 | ||
3125 | /* | 3122 | /* |
3126 | * workload type is changed, don't save slice, otherwise preempt | 3123 | * workload type is changed, don't save slice, otherwise preempt |
3127 | * doesn't happen | 3124 | * doesn't happen |
3128 | */ | 3125 | */ |
3129 | if (cfqq_type(old_cfqq) != cfqq_type(cfqq)) | 3126 | if (cfqq_type(cfqd->active_queue) != cfqq_type(cfqq)) |
3130 | cfqq->cfqg->saved_workload_slice = 0; | 3127 | cfqq->cfqg->saved_workload_slice = 0; |
3131 | 3128 | ||
3129 | cfq_slice_expired(cfqd, 1); | ||
3130 | |||
3132 | /* | 3131 | /* |
3133 | * Put the new queue at the front of the of the current list, | 3132 | * Put the new queue at the front of the of the current list, |
3134 | * so we know that it will be selected next. | 3133 | * so we know that it will be selected next. |
diff --git a/drivers/usb/host/ehci-xilinx-of.c b/drivers/usb/host/ehci-xilinx-of.c index 32793ce3d9e9..9c2cc4633894 100644 --- a/drivers/usb/host/ehci-xilinx-of.c +++ b/drivers/usb/host/ehci-xilinx-of.c | |||
@@ -183,7 +183,7 @@ static int __devinit ehci_hcd_xilinx_of_probe(struct platform_device *op) | |||
183 | } | 183 | } |
184 | 184 | ||
185 | irq = irq_of_parse_and_map(dn, 0); | 185 | irq = irq_of_parse_and_map(dn, 0); |
186 | if (irq == NO_IRQ) { | 186 | if (!irq) { |
187 | printk(KERN_ERR "%s: irq_of_parse_and_map failed\n", __FILE__); | 187 | printk(KERN_ERR "%s: irq_of_parse_and_map failed\n", __FILE__); |
188 | rv = -EBUSY; | 188 | rv = -EBUSY; |
189 | goto err_irq; | 189 | goto err_irq; |
diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c index 3832e303c33a..596e6a7b17d6 100644 --- a/drivers/xen/xen-balloon.c +++ b/drivers/xen/xen-balloon.c | |||
@@ -221,7 +221,7 @@ static int register_balloon(struct device *dev) | |||
221 | { | 221 | { |
222 | int i, error; | 222 | int i, error; |
223 | 223 | ||
224 | error = bus_register(&balloon_subsys); | 224 | error = subsys_system_register(&balloon_subsys, NULL); |
225 | if (error) | 225 | if (error) |
226 | return error; | 226 | return error; |
227 | 227 | ||
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index ecb9fd3be143..d33f01c08b60 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig | |||
@@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL | |||
31 | Linux website <http://acl.bestbits.at/>. | 31 | Linux website <http://acl.bestbits.at/>. |
32 | 32 | ||
33 | If you don't know what Access Control Lists are, say N | 33 | If you don't know what Access Control Lists are, say N |
34 | |||
35 | config BTRFS_FS_CHECK_INTEGRITY | ||
36 | bool "Btrfs with integrity check tool compiled in (DANGEROUS)" | ||
37 | depends on BTRFS_FS | ||
38 | help | ||
39 | Adds code that examines all block write requests (including | ||
40 | writes of the super block). The goal is to verify that the | ||
41 | state of the filesystem on disk is always consistent, i.e., | ||
42 | after a power-loss or kernel panic event the filesystem is | ||
43 | in a consistent state. | ||
44 | |||
45 | If the integrity check tool is included and activated in | ||
46 | the mount options, plenty of kernel memory is used, and | ||
47 | plenty of additional CPU cycles are spent. Enabling this | ||
48 | functionality is not intended for normal use. | ||
49 | |||
50 | In most cases, unless you are a btrfs developer who needs | ||
51 | to verify the integrity of (super)-block write requests | ||
52 | during the run of a regression test, say N | ||
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index c0ddfd29c5e5..0c4fa2befae7 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile | |||
@@ -8,6 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ | |||
8 | extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ | 8 | extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ |
9 | export.o tree-log.o free-space-cache.o zlib.o lzo.o \ | 9 | export.o tree-log.o free-space-cache.o zlib.o lzo.o \ |
10 | compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ | 10 | compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ |
11 | reada.o backref.o | 11 | reada.o backref.o ulist.o |
12 | 12 | ||
13 | btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o | 13 | btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o |
14 | btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o | ||
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 22c64fff1bd5..b9a843226de8 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c | |||
@@ -19,18 +19,789 @@ | |||
19 | #include "ctree.h" | 19 | #include "ctree.h" |
20 | #include "disk-io.h" | 20 | #include "disk-io.h" |
21 | #include "backref.h" | 21 | #include "backref.h" |
22 | #include "ulist.h" | ||
23 | #include "transaction.h" | ||
24 | #include "delayed-ref.h" | ||
22 | 25 | ||
23 | struct __data_ref { | 26 | /* |
27 | * this structure records all encountered refs on the way up to the root | ||
28 | */ | ||
29 | struct __prelim_ref { | ||
24 | struct list_head list; | 30 | struct list_head list; |
25 | u64 inum; | 31 | u64 root_id; |
26 | u64 root; | 32 | struct btrfs_key key; |
27 | u64 extent_data_item_offset; | 33 | int level; |
34 | int count; | ||
35 | u64 parent; | ||
36 | u64 wanted_disk_byte; | ||
28 | }; | 37 | }; |
29 | 38 | ||
30 | struct __shared_ref { | 39 | static int __add_prelim_ref(struct list_head *head, u64 root_id, |
31 | struct list_head list; | 40 | struct btrfs_key *key, int level, u64 parent, |
41 | u64 wanted_disk_byte, int count) | ||
42 | { | ||
43 | struct __prelim_ref *ref; | ||
44 | |||
45 | /* in case we're adding delayed refs, we're holding the refs spinlock */ | ||
46 | ref = kmalloc(sizeof(*ref), GFP_ATOMIC); | ||
47 | if (!ref) | ||
48 | return -ENOMEM; | ||
49 | |||
50 | ref->root_id = root_id; | ||
51 | if (key) | ||
52 | ref->key = *key; | ||
53 | else | ||
54 | memset(&ref->key, 0, sizeof(ref->key)); | ||
55 | |||
56 | ref->level = level; | ||
57 | ref->count = count; | ||
58 | ref->parent = parent; | ||
59 | ref->wanted_disk_byte = wanted_disk_byte; | ||
60 | list_add_tail(&ref->list, head); | ||
61 | |||
62 | return 0; | ||
63 | } | ||
64 | |||
65 | static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, | ||
66 | struct ulist *parents, | ||
67 | struct extent_buffer *eb, int level, | ||
68 | u64 wanted_objectid, u64 wanted_disk_byte) | ||
69 | { | ||
70 | int ret; | ||
71 | int slot; | ||
72 | struct btrfs_file_extent_item *fi; | ||
73 | struct btrfs_key key; | ||
32 | u64 disk_byte; | 74 | u64 disk_byte; |
33 | }; | 75 | |
76 | add_parent: | ||
77 | ret = ulist_add(parents, eb->start, 0, GFP_NOFS); | ||
78 | if (ret < 0) | ||
79 | return ret; | ||
80 | |||
81 | if (level != 0) | ||
82 | return 0; | ||
83 | |||
84 | /* | ||
85 | * if the current leaf is full with EXTENT_DATA items, we must | ||
86 | * check the next one if that holds a reference as well. | ||
87 | * ref->count cannot be used to skip this check. | ||
88 | * repeat this until we don't find any additional EXTENT_DATA items. | ||
89 | */ | ||
90 | while (1) { | ||
91 | ret = btrfs_next_leaf(root, path); | ||
92 | if (ret < 0) | ||
93 | return ret; | ||
94 | if (ret) | ||
95 | return 0; | ||
96 | |||
97 | eb = path->nodes[0]; | ||
98 | for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { | ||
99 | btrfs_item_key_to_cpu(eb, &key, slot); | ||
100 | if (key.objectid != wanted_objectid || | ||
101 | key.type != BTRFS_EXTENT_DATA_KEY) | ||
102 | return 0; | ||
103 | fi = btrfs_item_ptr(eb, slot, | ||
104 | struct btrfs_file_extent_item); | ||
105 | disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); | ||
106 | if (disk_byte == wanted_disk_byte) | ||
107 | goto add_parent; | ||
108 | } | ||
109 | } | ||
110 | |||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | /* | ||
115 | * resolve an indirect backref in the form (root_id, key, level) | ||
116 | * to a logical address | ||
117 | */ | ||
118 | static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, | ||
119 | struct __prelim_ref *ref, | ||
120 | struct ulist *parents) | ||
121 | { | ||
122 | struct btrfs_path *path; | ||
123 | struct btrfs_root *root; | ||
124 | struct btrfs_key root_key; | ||
125 | struct btrfs_key key = {0}; | ||
126 | struct extent_buffer *eb; | ||
127 | int ret = 0; | ||
128 | int root_level; | ||
129 | int level = ref->level; | ||
130 | |||
131 | path = btrfs_alloc_path(); | ||
132 | if (!path) | ||
133 | return -ENOMEM; | ||
134 | |||
135 | root_key.objectid = ref->root_id; | ||
136 | root_key.type = BTRFS_ROOT_ITEM_KEY; | ||
137 | root_key.offset = (u64)-1; | ||
138 | root = btrfs_read_fs_root_no_name(fs_info, &root_key); | ||
139 | if (IS_ERR(root)) { | ||
140 | ret = PTR_ERR(root); | ||
141 | goto out; | ||
142 | } | ||
143 | |||
144 | rcu_read_lock(); | ||
145 | root_level = btrfs_header_level(root->node); | ||
146 | rcu_read_unlock(); | ||
147 | |||
148 | if (root_level + 1 == level) | ||
149 | goto out; | ||
150 | |||
151 | path->lowest_level = level; | ||
152 | ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0); | ||
153 | pr_debug("search slot in root %llu (level %d, ref count %d) returned " | ||
154 | "%d for key (%llu %u %llu)\n", | ||
155 | (unsigned long long)ref->root_id, level, ref->count, ret, | ||
156 | (unsigned long long)ref->key.objectid, ref->key.type, | ||
157 | (unsigned long long)ref->key.offset); | ||
158 | if (ret < 0) | ||
159 | goto out; | ||
160 | |||
161 | eb = path->nodes[level]; | ||
162 | if (!eb) { | ||
163 | WARN_ON(1); | ||
164 | ret = 1; | ||
165 | goto out; | ||
166 | } | ||
167 | |||
168 | if (level == 0) { | ||
169 | if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) { | ||
170 | ret = btrfs_next_leaf(root, path); | ||
171 | if (ret) | ||
172 | goto out; | ||
173 | eb = path->nodes[0]; | ||
174 | } | ||
175 | |||
176 | btrfs_item_key_to_cpu(eb, &key, path->slots[0]); | ||
177 | } | ||
178 | |||
179 | /* the last two parameters will only be used for level == 0 */ | ||
180 | ret = add_all_parents(root, path, parents, eb, level, key.objectid, | ||
181 | ref->wanted_disk_byte); | ||
182 | out: | ||
183 | btrfs_free_path(path); | ||
184 | return ret; | ||
185 | } | ||
186 | |||
187 | /* | ||
188 | * resolve all indirect backrefs from the list | ||
189 | */ | ||
190 | static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, | ||
191 | struct list_head *head) | ||
192 | { | ||
193 | int err; | ||
194 | int ret = 0; | ||
195 | struct __prelim_ref *ref; | ||
196 | struct __prelim_ref *ref_safe; | ||
197 | struct __prelim_ref *new_ref; | ||
198 | struct ulist *parents; | ||
199 | struct ulist_node *node; | ||
200 | |||
201 | parents = ulist_alloc(GFP_NOFS); | ||
202 | if (!parents) | ||
203 | return -ENOMEM; | ||
204 | |||
205 | /* | ||
206 | * _safe allows us to insert directly after the current item without | ||
207 | * iterating over the newly inserted items. | ||
208 | * we're also allowed to re-assign ref during iteration. | ||
209 | */ | ||
210 | list_for_each_entry_safe(ref, ref_safe, head, list) { | ||
211 | if (ref->parent) /* already direct */ | ||
212 | continue; | ||
213 | if (ref->count == 0) | ||
214 | continue; | ||
215 | err = __resolve_indirect_ref(fs_info, ref, parents); | ||
216 | if (err) { | ||
217 | if (ret == 0) | ||
218 | ret = err; | ||
219 | continue; | ||
220 | } | ||
221 | |||
222 | /* we put the first parent into the ref at hand */ | ||
223 | node = ulist_next(parents, NULL); | ||
224 | ref->parent = node ? node->val : 0; | ||
225 | |||
226 | /* additional parents require new refs being added here */ | ||
227 | while ((node = ulist_next(parents, node))) { | ||
228 | new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); | ||
229 | if (!new_ref) { | ||
230 | ret = -ENOMEM; | ||
231 | break; | ||
232 | } | ||
233 | memcpy(new_ref, ref, sizeof(*ref)); | ||
234 | new_ref->parent = node->val; | ||
235 | list_add(&new_ref->list, &ref->list); | ||
236 | } | ||
237 | ulist_reinit(parents); | ||
238 | } | ||
239 | |||
240 | ulist_free(parents); | ||
241 | return ret; | ||
242 | } | ||
243 | |||
244 | /* | ||
245 | * merge two lists of backrefs and adjust counts accordingly | ||
246 | * | ||
247 | * mode = 1: merge identical keys, if key is set | ||
248 | * mode = 2: merge identical parents | ||
249 | */ | ||
250 | static int __merge_refs(struct list_head *head, int mode) | ||
251 | { | ||
252 | struct list_head *pos1; | ||
253 | |||
254 | list_for_each(pos1, head) { | ||
255 | struct list_head *n2; | ||
256 | struct list_head *pos2; | ||
257 | struct __prelim_ref *ref1; | ||
258 | |||
259 | ref1 = list_entry(pos1, struct __prelim_ref, list); | ||
260 | |||
261 | if (mode == 1 && ref1->key.type == 0) | ||
262 | continue; | ||
263 | for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; | ||
264 | pos2 = n2, n2 = pos2->next) { | ||
265 | struct __prelim_ref *ref2; | ||
266 | |||
267 | ref2 = list_entry(pos2, struct __prelim_ref, list); | ||
268 | |||
269 | if (mode == 1) { | ||
270 | if (memcmp(&ref1->key, &ref2->key, | ||
271 | sizeof(ref1->key)) || | ||
272 | ref1->level != ref2->level || | ||
273 | ref1->root_id != ref2->root_id) | ||
274 | continue; | ||
275 | ref1->count += ref2->count; | ||
276 | } else { | ||
277 | if (ref1->parent != ref2->parent) | ||
278 | continue; | ||
279 | ref1->count += ref2->count; | ||
280 | } | ||
281 | list_del(&ref2->list); | ||
282 | kfree(ref2); | ||
283 | } | ||
284 | |||
285 | } | ||
286 | return 0; | ||
287 | } | ||
288 | |||
289 | /* | ||
290 | * add all currently queued delayed refs from this head whose seq nr is | ||
291 | * smaller or equal that seq to the list | ||
292 | */ | ||
293 | static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, | ||
294 | struct btrfs_key *info_key, | ||
295 | struct list_head *prefs) | ||
296 | { | ||
297 | struct btrfs_delayed_extent_op *extent_op = head->extent_op; | ||
298 | struct rb_node *n = &head->node.rb_node; | ||
299 | int sgn; | ||
300 | int ret; | ||
301 | |||
302 | if (extent_op && extent_op->update_key) | ||
303 | btrfs_disk_key_to_cpu(info_key, &extent_op->key); | ||
304 | |||
305 | while ((n = rb_prev(n))) { | ||
306 | struct btrfs_delayed_ref_node *node; | ||
307 | node = rb_entry(n, struct btrfs_delayed_ref_node, | ||
308 | rb_node); | ||
309 | if (node->bytenr != head->node.bytenr) | ||
310 | break; | ||
311 | WARN_ON(node->is_head); | ||
312 | |||
313 | if (node->seq > seq) | ||
314 | continue; | ||
315 | |||
316 | switch (node->action) { | ||
317 | case BTRFS_ADD_DELAYED_EXTENT: | ||
318 | case BTRFS_UPDATE_DELAYED_HEAD: | ||
319 | WARN_ON(1); | ||
320 | continue; | ||
321 | case BTRFS_ADD_DELAYED_REF: | ||
322 | sgn = 1; | ||
323 | break; | ||
324 | case BTRFS_DROP_DELAYED_REF: | ||
325 | sgn = -1; | ||
326 | break; | ||
327 | default: | ||
328 | BUG_ON(1); | ||
329 | } | ||
330 | switch (node->type) { | ||
331 | case BTRFS_TREE_BLOCK_REF_KEY: { | ||
332 | struct btrfs_delayed_tree_ref *ref; | ||
333 | |||
334 | ref = btrfs_delayed_node_to_tree_ref(node); | ||
335 | ret = __add_prelim_ref(prefs, ref->root, info_key, | ||
336 | ref->level + 1, 0, node->bytenr, | ||
337 | node->ref_mod * sgn); | ||
338 | break; | ||
339 | } | ||
340 | case BTRFS_SHARED_BLOCK_REF_KEY: { | ||
341 | struct btrfs_delayed_tree_ref *ref; | ||
342 | |||
343 | ref = btrfs_delayed_node_to_tree_ref(node); | ||
344 | ret = __add_prelim_ref(prefs, ref->root, info_key, | ||
345 | ref->level + 1, ref->parent, | ||
346 | node->bytenr, | ||
347 | node->ref_mod * sgn); | ||
348 | break; | ||
349 | } | ||
350 | case BTRFS_EXTENT_DATA_REF_KEY: { | ||
351 | struct btrfs_delayed_data_ref *ref; | ||
352 | struct btrfs_key key; | ||
353 | |||
354 | ref = btrfs_delayed_node_to_data_ref(node); | ||
355 | |||
356 | key.objectid = ref->objectid; | ||
357 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
358 | key.offset = ref->offset; | ||
359 | ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0, | ||
360 | node->bytenr, | ||
361 | node->ref_mod * sgn); | ||
362 | break; | ||
363 | } | ||
364 | case BTRFS_SHARED_DATA_REF_KEY: { | ||
365 | struct btrfs_delayed_data_ref *ref; | ||
366 | struct btrfs_key key; | ||
367 | |||
368 | ref = btrfs_delayed_node_to_data_ref(node); | ||
369 | |||
370 | key.objectid = ref->objectid; | ||
371 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
372 | key.offset = ref->offset; | ||
373 | ret = __add_prelim_ref(prefs, ref->root, &key, 0, | ||
374 | ref->parent, node->bytenr, | ||
375 | node->ref_mod * sgn); | ||
376 | break; | ||
377 | } | ||
378 | default: | ||
379 | WARN_ON(1); | ||
380 | } | ||
381 | BUG_ON(ret); | ||
382 | } | ||
383 | |||
384 | return 0; | ||
385 | } | ||
386 | |||
387 | /* | ||
388 | * add all inline backrefs for bytenr to the list | ||
389 | */ | ||
390 | static int __add_inline_refs(struct btrfs_fs_info *fs_info, | ||
391 | struct btrfs_path *path, u64 bytenr, | ||
392 | struct btrfs_key *info_key, int *info_level, | ||
393 | struct list_head *prefs) | ||
394 | { | ||
395 | int ret; | ||
396 | int slot; | ||
397 | struct extent_buffer *leaf; | ||
398 | struct btrfs_key key; | ||
399 | unsigned long ptr; | ||
400 | unsigned long end; | ||
401 | struct btrfs_extent_item *ei; | ||
402 | u64 flags; | ||
403 | u64 item_size; | ||
404 | |||
405 | /* | ||
406 | * enumerate all inline refs | ||
407 | */ | ||
408 | leaf = path->nodes[0]; | ||
409 | slot = path->slots[0] - 1; | ||
410 | |||
411 | item_size = btrfs_item_size_nr(leaf, slot); | ||
412 | BUG_ON(item_size < sizeof(*ei)); | ||
413 | |||
414 | ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); | ||
415 | flags = btrfs_extent_flags(leaf, ei); | ||
416 | |||
417 | ptr = (unsigned long)(ei + 1); | ||
418 | end = (unsigned long)ei + item_size; | ||
419 | |||
420 | if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { | ||
421 | struct btrfs_tree_block_info *info; | ||
422 | struct btrfs_disk_key disk_key; | ||
423 | |||
424 | info = (struct btrfs_tree_block_info *)ptr; | ||
425 | *info_level = btrfs_tree_block_level(leaf, info); | ||
426 | btrfs_tree_block_key(leaf, info, &disk_key); | ||
427 | btrfs_disk_key_to_cpu(info_key, &disk_key); | ||
428 | ptr += sizeof(struct btrfs_tree_block_info); | ||
429 | BUG_ON(ptr > end); | ||
430 | } else { | ||
431 | BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); | ||
432 | } | ||
433 | |||
434 | while (ptr < end) { | ||
435 | struct btrfs_extent_inline_ref *iref; | ||
436 | u64 offset; | ||
437 | int type; | ||
438 | |||
439 | iref = (struct btrfs_extent_inline_ref *)ptr; | ||
440 | type = btrfs_extent_inline_ref_type(leaf, iref); | ||
441 | offset = btrfs_extent_inline_ref_offset(leaf, iref); | ||
442 | |||
443 | switch (type) { | ||
444 | case BTRFS_SHARED_BLOCK_REF_KEY: | ||
445 | ret = __add_prelim_ref(prefs, 0, info_key, | ||
446 | *info_level + 1, offset, | ||
447 | bytenr, 1); | ||
448 | break; | ||
449 | case BTRFS_SHARED_DATA_REF_KEY: { | ||
450 | struct btrfs_shared_data_ref *sdref; | ||
451 | int count; | ||
452 | |||
453 | sdref = (struct btrfs_shared_data_ref *)(iref + 1); | ||
454 | count = btrfs_shared_data_ref_count(leaf, sdref); | ||
455 | ret = __add_prelim_ref(prefs, 0, NULL, 0, offset, | ||
456 | bytenr, count); | ||
457 | break; | ||
458 | } | ||
459 | case BTRFS_TREE_BLOCK_REF_KEY: | ||
460 | ret = __add_prelim_ref(prefs, offset, info_key, | ||
461 | *info_level + 1, 0, bytenr, 1); | ||
462 | break; | ||
463 | case BTRFS_EXTENT_DATA_REF_KEY: { | ||
464 | struct btrfs_extent_data_ref *dref; | ||
465 | int count; | ||
466 | u64 root; | ||
467 | |||
468 | dref = (struct btrfs_extent_data_ref *)(&iref->offset); | ||
469 | count = btrfs_extent_data_ref_count(leaf, dref); | ||
470 | key.objectid = btrfs_extent_data_ref_objectid(leaf, | ||
471 | dref); | ||
472 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
473 | key.offset = btrfs_extent_data_ref_offset(leaf, dref); | ||
474 | root = btrfs_extent_data_ref_root(leaf, dref); | ||
475 | ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, | ||
476 | count); | ||
477 | break; | ||
478 | } | ||
479 | default: | ||
480 | WARN_ON(1); | ||
481 | } | ||
482 | BUG_ON(ret); | ||
483 | ptr += btrfs_extent_inline_ref_size(type); | ||
484 | } | ||
485 | |||
486 | return 0; | ||
487 | } | ||
488 | |||
489 | /* | ||
490 | * add all non-inline backrefs for bytenr to the list | ||
491 | */ | ||
492 | static int __add_keyed_refs(struct btrfs_fs_info *fs_info, | ||
493 | struct btrfs_path *path, u64 bytenr, | ||
494 | struct btrfs_key *info_key, int info_level, | ||
495 | struct list_head *prefs) | ||
496 | { | ||
497 | struct btrfs_root *extent_root = fs_info->extent_root; | ||
498 | int ret; | ||
499 | int slot; | ||
500 | struct extent_buffer *leaf; | ||
501 | struct btrfs_key key; | ||
502 | |||
503 | while (1) { | ||
504 | ret = btrfs_next_item(extent_root, path); | ||
505 | if (ret < 0) | ||
506 | break; | ||
507 | if (ret) { | ||
508 | ret = 0; | ||
509 | break; | ||
510 | } | ||
511 | |||
512 | slot = path->slots[0]; | ||
513 | leaf = path->nodes[0]; | ||
514 | btrfs_item_key_to_cpu(leaf, &key, slot); | ||
515 | |||
516 | if (key.objectid != bytenr) | ||
517 | break; | ||
518 | if (key.type < BTRFS_TREE_BLOCK_REF_KEY) | ||
519 | continue; | ||
520 | if (key.type > BTRFS_SHARED_DATA_REF_KEY) | ||
521 | break; | ||
522 | |||
523 | switch (key.type) { | ||
524 | case BTRFS_SHARED_BLOCK_REF_KEY: | ||
525 | ret = __add_prelim_ref(prefs, 0, info_key, | ||
526 | info_level + 1, key.offset, | ||
527 | bytenr, 1); | ||
528 | break; | ||
529 | case BTRFS_SHARED_DATA_REF_KEY: { | ||
530 | struct btrfs_shared_data_ref *sdref; | ||
531 | int count; | ||
532 | |||
533 | sdref = btrfs_item_ptr(leaf, slot, | ||
534 | struct btrfs_shared_data_ref); | ||
535 | count = btrfs_shared_data_ref_count(leaf, sdref); | ||
536 | ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset, | ||
537 | bytenr, count); | ||
538 | break; | ||
539 | } | ||
540 | case BTRFS_TREE_BLOCK_REF_KEY: | ||
541 | ret = __add_prelim_ref(prefs, key.offset, info_key, | ||
542 | info_level + 1, 0, bytenr, 1); | ||
543 | break; | ||
544 | case BTRFS_EXTENT_DATA_REF_KEY: { | ||
545 | struct btrfs_extent_data_ref *dref; | ||
546 | int count; | ||
547 | u64 root; | ||
548 | |||
549 | dref = btrfs_item_ptr(leaf, slot, | ||
550 | struct btrfs_extent_data_ref); | ||
551 | count = btrfs_extent_data_ref_count(leaf, dref); | ||
552 | key.objectid = btrfs_extent_data_ref_objectid(leaf, | ||
553 | dref); | ||
554 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
555 | key.offset = btrfs_extent_data_ref_offset(leaf, dref); | ||
556 | root = btrfs_extent_data_ref_root(leaf, dref); | ||
557 | ret = __add_prelim_ref(prefs, root, &key, 0, 0, | ||
558 | bytenr, count); | ||
559 | break; | ||
560 | } | ||
561 | default: | ||
562 | WARN_ON(1); | ||
563 | } | ||
564 | BUG_ON(ret); | ||
565 | } | ||
566 | |||
567 | return ret; | ||
568 | } | ||
569 | |||
570 | /* | ||
571 | * this adds all existing backrefs (inline backrefs, backrefs and delayed | ||
572 | * refs) for the given bytenr to the refs list, merges duplicates and resolves | ||
573 | * indirect refs to their parent bytenr. | ||
574 | * When roots are found, they're added to the roots list | ||
575 | * | ||
576 | * FIXME some caching might speed things up | ||
577 | */ | ||
578 | static int find_parent_nodes(struct btrfs_trans_handle *trans, | ||
579 | struct btrfs_fs_info *fs_info, u64 bytenr, | ||
580 | u64 seq, struct ulist *refs, struct ulist *roots) | ||
581 | { | ||
582 | struct btrfs_key key; | ||
583 | struct btrfs_path *path; | ||
584 | struct btrfs_key info_key = { 0 }; | ||
585 | struct btrfs_delayed_ref_root *delayed_refs = NULL; | ||
586 | struct btrfs_delayed_ref_head *head = NULL; | ||
587 | int info_level = 0; | ||
588 | int ret; | ||
589 | struct list_head prefs_delayed; | ||
590 | struct list_head prefs; | ||
591 | struct __prelim_ref *ref; | ||
592 | |||
593 | INIT_LIST_HEAD(&prefs); | ||
594 | INIT_LIST_HEAD(&prefs_delayed); | ||
595 | |||
596 | key.objectid = bytenr; | ||
597 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
598 | key.offset = (u64)-1; | ||
599 | |||
600 | path = btrfs_alloc_path(); | ||
601 | if (!path) | ||
602 | return -ENOMEM; | ||
603 | |||
604 | /* | ||
605 | * grab both a lock on the path and a lock on the delayed ref head. | ||
606 | * We need both to get a consistent picture of how the refs look | ||
607 | * at a specified point in time | ||
608 | */ | ||
609 | again: | ||
610 | ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); | ||
611 | if (ret < 0) | ||
612 | goto out; | ||
613 | BUG_ON(ret == 0); | ||
614 | |||
615 | /* | ||
616 | * look if there are updates for this ref queued and lock the head | ||
617 | */ | ||
618 | delayed_refs = &trans->transaction->delayed_refs; | ||
619 | spin_lock(&delayed_refs->lock); | ||
620 | head = btrfs_find_delayed_ref_head(trans, bytenr); | ||
621 | if (head) { | ||
622 | if (!mutex_trylock(&head->mutex)) { | ||
623 | atomic_inc(&head->node.refs); | ||
624 | spin_unlock(&delayed_refs->lock); | ||
625 | |||
626 | btrfs_release_path(path); | ||
627 | |||
628 | /* | ||
629 | * Mutex was contended, block until it's | ||
630 | * released and try again | ||
631 | */ | ||
632 | mutex_lock(&head->mutex); | ||
633 | mutex_unlock(&head->mutex); | ||
634 | btrfs_put_delayed_ref(&head->node); | ||
635 | goto again; | ||
636 | } | ||
637 | ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed); | ||
638 | if (ret) | ||
639 | goto out; | ||
640 | } | ||
641 | spin_unlock(&delayed_refs->lock); | ||
642 | |||
643 | if (path->slots[0]) { | ||
644 | struct extent_buffer *leaf; | ||
645 | int slot; | ||
646 | |||
647 | leaf = path->nodes[0]; | ||
648 | slot = path->slots[0] - 1; | ||
649 | btrfs_item_key_to_cpu(leaf, &key, slot); | ||
650 | if (key.objectid == bytenr && | ||
651 | key.type == BTRFS_EXTENT_ITEM_KEY) { | ||
652 | ret = __add_inline_refs(fs_info, path, bytenr, | ||
653 | &info_key, &info_level, &prefs); | ||
654 | if (ret) | ||
655 | goto out; | ||
656 | ret = __add_keyed_refs(fs_info, path, bytenr, &info_key, | ||
657 | info_level, &prefs); | ||
658 | if (ret) | ||
659 | goto out; | ||
660 | } | ||
661 | } | ||
662 | btrfs_release_path(path); | ||
663 | |||
664 | /* | ||
665 | * when adding the delayed refs above, the info_key might not have | ||
666 | * been known yet. Go over the list and replace the missing keys | ||
667 | */ | ||
668 | list_for_each_entry(ref, &prefs_delayed, list) { | ||
669 | if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0) | ||
670 | memcpy(&ref->key, &info_key, sizeof(ref->key)); | ||
671 | } | ||
672 | list_splice_init(&prefs_delayed, &prefs); | ||
673 | |||
674 | ret = __merge_refs(&prefs, 1); | ||
675 | if (ret) | ||
676 | goto out; | ||
677 | |||
678 | ret = __resolve_indirect_refs(fs_info, &prefs); | ||
679 | if (ret) | ||
680 | goto out; | ||
681 | |||
682 | ret = __merge_refs(&prefs, 2); | ||
683 | if (ret) | ||
684 | goto out; | ||
685 | |||
686 | while (!list_empty(&prefs)) { | ||
687 | ref = list_first_entry(&prefs, struct __prelim_ref, list); | ||
688 | list_del(&ref->list); | ||
689 | if (ref->count < 0) | ||
690 | WARN_ON(1); | ||
691 | if (ref->count && ref->root_id && ref->parent == 0) { | ||
692 | /* no parent == root of tree */ | ||
693 | ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); | ||
694 | BUG_ON(ret < 0); | ||
695 | } | ||
696 | if (ref->count && ref->parent) { | ||
697 | ret = ulist_add(refs, ref->parent, 0, GFP_NOFS); | ||
698 | BUG_ON(ret < 0); | ||
699 | } | ||
700 | kfree(ref); | ||
701 | } | ||
702 | |||
703 | out: | ||
704 | if (head) | ||
705 | mutex_unlock(&head->mutex); | ||
706 | btrfs_free_path(path); | ||
707 | while (!list_empty(&prefs)) { | ||
708 | ref = list_first_entry(&prefs, struct __prelim_ref, list); | ||
709 | list_del(&ref->list); | ||
710 | kfree(ref); | ||
711 | } | ||
712 | while (!list_empty(&prefs_delayed)) { | ||
713 | ref = list_first_entry(&prefs_delayed, struct __prelim_ref, | ||
714 | list); | ||
715 | list_del(&ref->list); | ||
716 | kfree(ref); | ||
717 | } | ||
718 | |||
719 | return ret; | ||
720 | } | ||
721 | |||
722 | /* | ||
723 | * Finds all leafs with a reference to the specified combination of bytenr and | ||
724 | * offset. key_list_head will point to a list of corresponding keys (caller must | ||
725 | * free each list element). The leafs will be stored in the leafs ulist, which | ||
726 | * must be freed with ulist_free. | ||
727 | * | ||
728 | * returns 0 on success, <0 on error | ||
729 | */ | ||
730 | static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, | ||
731 | struct btrfs_fs_info *fs_info, u64 bytenr, | ||
732 | u64 num_bytes, u64 seq, struct ulist **leafs) | ||
733 | { | ||
734 | struct ulist *tmp; | ||
735 | int ret; | ||
736 | |||
737 | tmp = ulist_alloc(GFP_NOFS); | ||
738 | if (!tmp) | ||
739 | return -ENOMEM; | ||
740 | *leafs = ulist_alloc(GFP_NOFS); | ||
741 | if (!*leafs) { | ||
742 | ulist_free(tmp); | ||
743 | return -ENOMEM; | ||
744 | } | ||
745 | |||
746 | ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp); | ||
747 | ulist_free(tmp); | ||
748 | |||
749 | if (ret < 0 && ret != -ENOENT) { | ||
750 | ulist_free(*leafs); | ||
751 | return ret; | ||
752 | } | ||
753 | |||
754 | return 0; | ||
755 | } | ||
756 | |||
757 | /* | ||
758 | * walk all backrefs for a given extent to find all roots that reference this | ||
759 | * extent. Walking a backref means finding all extents that reference this | ||
760 | * extent and in turn walk the backrefs of those, too. Naturally this is a | ||
761 | * recursive process, but here it is implemented in an iterative fashion: We | ||
762 | * find all referencing extents for the extent in question and put them on a | ||
763 | * list. In turn, we find all referencing extents for those, further appending | ||
764 | * to the list. The way we iterate the list allows adding more elements after | ||
765 | * the current while iterating. The process stops when we reach the end of the | ||
766 | * list. Found roots are added to the roots list. | ||
767 | * | ||
768 | * returns 0 on success, < 0 on error. | ||
769 | */ | ||
770 | int btrfs_find_all_roots(struct btrfs_trans_handle *trans, | ||
771 | struct btrfs_fs_info *fs_info, u64 bytenr, | ||
772 | u64 num_bytes, u64 seq, struct ulist **roots) | ||
773 | { | ||
774 | struct ulist *tmp; | ||
775 | struct ulist_node *node = NULL; | ||
776 | int ret; | ||
777 | |||
778 | tmp = ulist_alloc(GFP_NOFS); | ||
779 | if (!tmp) | ||
780 | return -ENOMEM; | ||
781 | *roots = ulist_alloc(GFP_NOFS); | ||
782 | if (!*roots) { | ||
783 | ulist_free(tmp); | ||
784 | return -ENOMEM; | ||
785 | } | ||
786 | |||
787 | while (1) { | ||
788 | ret = find_parent_nodes(trans, fs_info, bytenr, seq, | ||
789 | tmp, *roots); | ||
790 | if (ret < 0 && ret != -ENOENT) { | ||
791 | ulist_free(tmp); | ||
792 | ulist_free(*roots); | ||
793 | return ret; | ||
794 | } | ||
795 | node = ulist_next(tmp, node); | ||
796 | if (!node) | ||
797 | break; | ||
798 | bytenr = node->val; | ||
799 | } | ||
800 | |||
801 | ulist_free(tmp); | ||
802 | return 0; | ||
803 | } | ||
804 | |||
34 | 805 | ||
35 | static int __inode_info(u64 inum, u64 ioff, u8 key_type, | 806 | static int __inode_info(u64 inum, u64 ioff, u8 key_type, |
36 | struct btrfs_root *fs_root, struct btrfs_path *path, | 807 | struct btrfs_root *fs_root, struct btrfs_path *path, |
@@ -181,8 +952,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, | |||
181 | btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); | 952 | btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); |
182 | if (found_key->type != BTRFS_EXTENT_ITEM_KEY || | 953 | if (found_key->type != BTRFS_EXTENT_ITEM_KEY || |
183 | found_key->objectid > logical || | 954 | found_key->objectid > logical || |
184 | found_key->objectid + found_key->offset <= logical) | 955 | found_key->objectid + found_key->offset <= logical) { |
956 | pr_debug("logical %llu is not within any extent\n", | ||
957 | (unsigned long long)logical); | ||
185 | return -ENOENT; | 958 | return -ENOENT; |
959 | } | ||
186 | 960 | ||
187 | eb = path->nodes[0]; | 961 | eb = path->nodes[0]; |
188 | item_size = btrfs_item_size_nr(eb, path->slots[0]); | 962 | item_size = btrfs_item_size_nr(eb, path->slots[0]); |
@@ -191,6 +965,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, | |||
191 | ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); | 965 | ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); |
192 | flags = btrfs_extent_flags(eb, ei); | 966 | flags = btrfs_extent_flags(eb, ei); |
193 | 967 | ||
968 | pr_debug("logical %llu is at position %llu within the extent (%llu " | ||
969 | "EXTENT_ITEM %llu) flags %#llx size %u\n", | ||
970 | (unsigned long long)logical, | ||
971 | (unsigned long long)(logical - found_key->objectid), | ||
972 | (unsigned long long)found_key->objectid, | ||
973 | (unsigned long long)found_key->offset, | ||
974 | (unsigned long long)flags, item_size); | ||
194 | if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) | 975 | if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) |
195 | return BTRFS_EXTENT_FLAG_TREE_BLOCK; | 976 | return BTRFS_EXTENT_FLAG_TREE_BLOCK; |
196 | if (flags & BTRFS_EXTENT_FLAG_DATA) | 977 | if (flags & BTRFS_EXTENT_FLAG_DATA) |
@@ -287,128 +1068,11 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, | |||
287 | return 0; | 1068 | return 0; |
288 | } | 1069 | } |
289 | 1070 | ||
290 | static int __data_list_add(struct list_head *head, u64 inum, | 1071 | static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, |
291 | u64 extent_data_item_offset, u64 root) | 1072 | struct btrfs_path *path, u64 logical, |
292 | { | 1073 | u64 orig_extent_item_objectid, |
293 | struct __data_ref *ref; | 1074 | u64 extent_item_pos, u64 root, |
294 | 1075 | iterate_extent_inodes_t *iterate, void *ctx) | |
295 | ref = kmalloc(sizeof(*ref), GFP_NOFS); | ||
296 | if (!ref) | ||
297 | return -ENOMEM; | ||
298 | |||
299 | ref->inum = inum; | ||
300 | ref->extent_data_item_offset = extent_data_item_offset; | ||
301 | ref->root = root; | ||
302 | list_add_tail(&ref->list, head); | ||
303 | |||
304 | return 0; | ||
305 | } | ||
306 | |||
307 | static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb, | ||
308 | struct btrfs_extent_data_ref *dref) | ||
309 | { | ||
310 | return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref), | ||
311 | btrfs_extent_data_ref_offset(eb, dref), | ||
312 | btrfs_extent_data_ref_root(eb, dref)); | ||
313 | } | ||
314 | |||
315 | static int __shared_list_add(struct list_head *head, u64 disk_byte) | ||
316 | { | ||
317 | struct __shared_ref *ref; | ||
318 | |||
319 | ref = kmalloc(sizeof(*ref), GFP_NOFS); | ||
320 | if (!ref) | ||
321 | return -ENOMEM; | ||
322 | |||
323 | ref->disk_byte = disk_byte; | ||
324 | list_add_tail(&ref->list, head); | ||
325 | |||
326 | return 0; | ||
327 | } | ||
328 | |||
329 | static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info, | ||
330 | u64 logical, u64 inum, | ||
331 | u64 extent_data_item_offset, | ||
332 | u64 extent_offset, | ||
333 | struct btrfs_path *path, | ||
334 | struct list_head *data_refs, | ||
335 | iterate_extent_inodes_t *iterate, | ||
336 | void *ctx) | ||
337 | { | ||
338 | u64 ref_root; | ||
339 | u32 item_size; | ||
340 | struct btrfs_key key; | ||
341 | struct extent_buffer *eb; | ||
342 | struct btrfs_extent_item *ei; | ||
343 | struct btrfs_extent_inline_ref *eiref; | ||
344 | struct __data_ref *ref; | ||
345 | int ret; | ||
346 | int type; | ||
347 | int last; | ||
348 | unsigned long ptr = 0; | ||
349 | |||
350 | WARN_ON(!list_empty(data_refs)); | ||
351 | ret = extent_from_logical(fs_info, logical, path, &key); | ||
352 | if (ret & BTRFS_EXTENT_FLAG_DATA) | ||
353 | ret = -EIO; | ||
354 | if (ret < 0) | ||
355 | goto out; | ||
356 | |||
357 | eb = path->nodes[0]; | ||
358 | ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); | ||
359 | item_size = btrfs_item_size_nr(eb, path->slots[0]); | ||
360 | |||
361 | ret = 0; | ||
362 | ref_root = 0; | ||
363 | /* | ||
364 | * as done in iterate_extent_inodes, we first build a list of refs to | ||
365 | * iterate, then free the path and then iterate them to avoid deadlocks. | ||
366 | */ | ||
367 | do { | ||
368 | last = __get_extent_inline_ref(&ptr, eb, ei, item_size, | ||
369 | &eiref, &type); | ||
370 | if (last < 0) { | ||
371 | ret = last; | ||
372 | goto out; | ||
373 | } | ||
374 | if (type == BTRFS_TREE_BLOCK_REF_KEY || | ||
375 | type == BTRFS_SHARED_BLOCK_REF_KEY) { | ||
376 | ref_root = btrfs_extent_inline_ref_offset(eb, eiref); | ||
377 | ret = __data_list_add(data_refs, inum, | ||
378 | extent_data_item_offset, | ||
379 | ref_root); | ||
380 | } | ||
381 | } while (!ret && !last); | ||
382 | |||
383 | btrfs_release_path(path); | ||
384 | |||
385 | if (ref_root == 0) { | ||
386 | printk(KERN_ERR "btrfs: failed to find tree block ref " | ||
387 | "for shared data backref %llu\n", logical); | ||
388 | WARN_ON(1); | ||
389 | ret = -EIO; | ||
390 | } | ||
391 | |||
392 | out: | ||
393 | while (!list_empty(data_refs)) { | ||
394 | ref = list_first_entry(data_refs, struct __data_ref, list); | ||
395 | list_del(&ref->list); | ||
396 | if (!ret) | ||
397 | ret = iterate(ref->inum, extent_offset + | ||
398 | ref->extent_data_item_offset, | ||
399 | ref->root, ctx); | ||
400 | kfree(ref); | ||
401 | } | ||
402 | |||
403 | return ret; | ||
404 | } | ||
405 | |||
406 | static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, | ||
407 | u64 logical, u64 orig_extent_item_objectid, | ||
408 | u64 extent_offset, struct btrfs_path *path, | ||
409 | struct list_head *data_refs, | ||
410 | iterate_extent_inodes_t *iterate, | ||
411 | void *ctx) | ||
412 | { | 1076 | { |
413 | u64 disk_byte; | 1077 | u64 disk_byte; |
414 | struct btrfs_key key; | 1078 | struct btrfs_key key; |
@@ -416,8 +1080,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, | |||
416 | struct extent_buffer *eb; | 1080 | struct extent_buffer *eb; |
417 | int slot; | 1081 | int slot; |
418 | int nritems; | 1082 | int nritems; |
419 | int ret; | 1083 | int ret = 0; |
420 | int found = 0; | 1084 | int extent_type; |
1085 | u64 data_offset; | ||
1086 | u64 data_len; | ||
421 | 1087 | ||
422 | eb = read_tree_block(fs_info->tree_root, logical, | 1088 | eb = read_tree_block(fs_info->tree_root, logical, |
423 | fs_info->tree_root->leafsize, 0); | 1089 | fs_info->tree_root->leafsize, 0); |
@@ -435,149 +1101,99 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, | |||
435 | if (key.type != BTRFS_EXTENT_DATA_KEY) | 1101 | if (key.type != BTRFS_EXTENT_DATA_KEY) |
436 | continue; | 1102 | continue; |
437 | fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); | 1103 | fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); |
438 | if (!fi) { | 1104 | extent_type = btrfs_file_extent_type(eb, fi); |
439 | free_extent_buffer(eb); | 1105 | if (extent_type == BTRFS_FILE_EXTENT_INLINE) |
440 | return -EIO; | 1106 | continue; |
441 | } | 1107 | /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ |
442 | disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); | 1108 | disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); |
443 | if (disk_byte != orig_extent_item_objectid) { | 1109 | if (disk_byte != orig_extent_item_objectid) |
444 | if (found) | 1110 | continue; |
445 | break; | ||
446 | else | ||
447 | continue; | ||
448 | } | ||
449 | ++found; | ||
450 | ret = __iter_shared_inline_ref_inodes(fs_info, logical, | ||
451 | key.objectid, | ||
452 | key.offset, | ||
453 | extent_offset, path, | ||
454 | data_refs, | ||
455 | iterate, ctx); | ||
456 | if (ret) | ||
457 | break; | ||
458 | } | ||
459 | 1111 | ||
460 | if (!found) { | 1112 | data_offset = btrfs_file_extent_offset(eb, fi); |
461 | printk(KERN_ERR "btrfs: failed to follow shared data backref " | 1113 | data_len = btrfs_file_extent_num_bytes(eb, fi); |
462 | "to parent %llu\n", logical); | 1114 | |
463 | WARN_ON(1); | 1115 | if (extent_item_pos < data_offset || |
464 | ret = -EIO; | 1116 | extent_item_pos >= data_offset + data_len) |
1117 | continue; | ||
1118 | |||
1119 | pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " | ||
1120 | "root %llu\n", orig_extent_item_objectid, | ||
1121 | key.objectid, key.offset, root); | ||
1122 | ret = iterate(key.objectid, | ||
1123 | key.offset + (extent_item_pos - data_offset), | ||
1124 | root, ctx); | ||
1125 | if (ret) { | ||
1126 | pr_debug("stopping iteration because ret=%d\n", ret); | ||
1127 | break; | ||
1128 | } | ||
465 | } | 1129 | } |
466 | 1130 | ||
467 | free_extent_buffer(eb); | 1131 | free_extent_buffer(eb); |
1132 | |||
468 | return ret; | 1133 | return ret; |
469 | } | 1134 | } |
470 | 1135 | ||
471 | /* | 1136 | /* |
472 | * calls iterate() for every inode that references the extent identified by | 1137 | * calls iterate() for every inode that references the extent identified by |
473 | * the given parameters. will use the path given as a parameter and return it | 1138 | * the given parameters. |
474 | * released. | ||
475 | * when the iterator function returns a non-zero value, iteration stops. | 1139 | * when the iterator function returns a non-zero value, iteration stops. |
1140 | * path is guaranteed to be in released state when iterate() is called. | ||
476 | */ | 1141 | */ |
477 | int iterate_extent_inodes(struct btrfs_fs_info *fs_info, | 1142 | int iterate_extent_inodes(struct btrfs_fs_info *fs_info, |
478 | struct btrfs_path *path, | 1143 | struct btrfs_path *path, |
479 | u64 extent_item_objectid, | 1144 | u64 extent_item_objectid, u64 extent_item_pos, |
480 | u64 extent_offset, | ||
481 | iterate_extent_inodes_t *iterate, void *ctx) | 1145 | iterate_extent_inodes_t *iterate, void *ctx) |
482 | { | 1146 | { |
483 | unsigned long ptr = 0; | ||
484 | int last; | ||
485 | int ret; | 1147 | int ret; |
486 | int type; | ||
487 | u64 logical; | ||
488 | u32 item_size; | ||
489 | struct btrfs_extent_inline_ref *eiref; | ||
490 | struct btrfs_extent_data_ref *dref; | ||
491 | struct extent_buffer *eb; | ||
492 | struct btrfs_extent_item *ei; | ||
493 | struct btrfs_key key; | ||
494 | struct list_head data_refs = LIST_HEAD_INIT(data_refs); | 1148 | struct list_head data_refs = LIST_HEAD_INIT(data_refs); |
495 | struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); | 1149 | struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); |
496 | struct __data_ref *ref_d; | 1150 | struct btrfs_trans_handle *trans; |
497 | struct __shared_ref *ref_s; | 1151 | struct ulist *refs; |
498 | 1152 | struct ulist *roots; | |
499 | eb = path->nodes[0]; | 1153 | struct ulist_node *ref_node = NULL; |
500 | ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); | 1154 | struct ulist_node *root_node = NULL; |
501 | item_size = btrfs_item_size_nr(eb, path->slots[0]); | 1155 | struct seq_list seq_elem; |
502 | 1156 | struct btrfs_delayed_ref_root *delayed_refs; | |
503 | /* first we iterate the inline refs, ... */ | 1157 | |
504 | do { | 1158 | trans = btrfs_join_transaction(fs_info->extent_root); |
505 | last = __get_extent_inline_ref(&ptr, eb, ei, item_size, | 1159 | if (IS_ERR(trans)) |
506 | &eiref, &type); | 1160 | return PTR_ERR(trans); |
507 | if (last == -ENOENT) { | 1161 | |
508 | ret = 0; | 1162 | pr_debug("resolving all inodes for extent %llu\n", |
509 | break; | 1163 | extent_item_objectid); |
510 | } | 1164 | |
511 | if (last < 0) { | 1165 | delayed_refs = &trans->transaction->delayed_refs; |
512 | ret = last; | 1166 | spin_lock(&delayed_refs->lock); |
513 | break; | 1167 | btrfs_get_delayed_seq(delayed_refs, &seq_elem); |
514 | } | 1168 | spin_unlock(&delayed_refs->lock); |
1169 | |||
1170 | ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, | ||
1171 | extent_item_pos, seq_elem.seq, | ||
1172 | &refs); | ||
515 | 1173 | ||
516 | if (type == BTRFS_EXTENT_DATA_REF_KEY) { | 1174 | if (ret) |
517 | dref = (struct btrfs_extent_data_ref *)(&eiref->offset); | 1175 | goto out; |
518 | ret = __data_list_add_eb(&data_refs, eb, dref); | ||
519 | } else if (type == BTRFS_SHARED_DATA_REF_KEY) { | ||
520 | logical = btrfs_extent_inline_ref_offset(eb, eiref); | ||
521 | ret = __shared_list_add(&shared_refs, logical); | ||
522 | } | ||
523 | } while (!ret && !last); | ||
524 | 1176 | ||
525 | /* ... then we proceed to in-tree references and ... */ | 1177 | while (!ret && (ref_node = ulist_next(refs, ref_node))) { |
526 | while (!ret) { | 1178 | ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, |
527 | ++path->slots[0]; | 1179 | seq_elem.seq, &roots); |
528 | if (path->slots[0] > btrfs_header_nritems(eb)) { | 1180 | if (ret) |
529 | ret = btrfs_next_leaf(fs_info->extent_root, path); | ||
530 | if (ret) { | ||
531 | if (ret == 1) | ||
532 | ret = 0; /* we're done */ | ||
533 | break; | ||
534 | } | ||
535 | eb = path->nodes[0]; | ||
536 | } | ||
537 | btrfs_item_key_to_cpu(eb, &key, path->slots[0]); | ||
538 | if (key.objectid != extent_item_objectid) | ||
539 | break; | 1181 | break; |
540 | if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { | 1182 | while (!ret && (root_node = ulist_next(roots, root_node))) { |
541 | dref = btrfs_item_ptr(eb, path->slots[0], | 1183 | pr_debug("root %llu references leaf %llu\n", |
542 | struct btrfs_extent_data_ref); | 1184 | root_node->val, ref_node->val); |
543 | ret = __data_list_add_eb(&data_refs, eb, dref); | 1185 | ret = iterate_leaf_refs(fs_info, path, ref_node->val, |
544 | } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { | 1186 | extent_item_objectid, |
545 | ret = __shared_list_add(&shared_refs, key.offset); | 1187 | extent_item_pos, root_node->val, |
1188 | iterate, ctx); | ||
546 | } | 1189 | } |
547 | } | 1190 | } |
548 | 1191 | ||
549 | btrfs_release_path(path); | 1192 | ulist_free(refs); |
550 | 1193 | ulist_free(roots); | |
551 | /* | 1194 | out: |
552 | * ... only at the very end we can process the refs we found. this is | 1195 | btrfs_put_delayed_seq(delayed_refs, &seq_elem); |
553 | * because the iterator function we call is allowed to make tree lookups | 1196 | btrfs_end_transaction(trans, fs_info->extent_root); |
554 | * and we have to avoid deadlocks. additionally, we need more tree | ||
555 | * lookups ourselves for shared data refs. | ||
556 | */ | ||
557 | while (!list_empty(&data_refs)) { | ||
558 | ref_d = list_first_entry(&data_refs, struct __data_ref, list); | ||
559 | list_del(&ref_d->list); | ||
560 | if (!ret) | ||
561 | ret = iterate(ref_d->inum, extent_offset + | ||
562 | ref_d->extent_data_item_offset, | ||
563 | ref_d->root, ctx); | ||
564 | kfree(ref_d); | ||
565 | } | ||
566 | |||
567 | while (!list_empty(&shared_refs)) { | ||
568 | ref_s = list_first_entry(&shared_refs, struct __shared_ref, | ||
569 | list); | ||
570 | list_del(&ref_s->list); | ||
571 | if (!ret) | ||
572 | ret = __iter_shared_inline_ref(fs_info, | ||
573 | ref_s->disk_byte, | ||
574 | extent_item_objectid, | ||
575 | extent_offset, path, | ||
576 | &data_refs, | ||
577 | iterate, ctx); | ||
578 | kfree(ref_s); | ||
579 | } | ||
580 | |||
581 | return ret; | 1197 | return ret; |
582 | } | 1198 | } |
583 | 1199 | ||
@@ -586,19 +1202,20 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, | |||
586 | iterate_extent_inodes_t *iterate, void *ctx) | 1202 | iterate_extent_inodes_t *iterate, void *ctx) |
587 | { | 1203 | { |
588 | int ret; | 1204 | int ret; |
589 | u64 offset; | 1205 | u64 extent_item_pos; |
590 | struct btrfs_key found_key; | 1206 | struct btrfs_key found_key; |
591 | 1207 | ||
592 | ret = extent_from_logical(fs_info, logical, path, | 1208 | ret = extent_from_logical(fs_info, logical, path, |
593 | &found_key); | 1209 | &found_key); |
1210 | btrfs_release_path(path); | ||
594 | if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) | 1211 | if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) |
595 | ret = -EINVAL; | 1212 | ret = -EINVAL; |
596 | if (ret < 0) | 1213 | if (ret < 0) |
597 | return ret; | 1214 | return ret; |
598 | 1215 | ||
599 | offset = logical - found_key.objectid; | 1216 | extent_item_pos = logical - found_key.objectid; |
600 | ret = iterate_extent_inodes(fs_info, path, found_key.objectid, | 1217 | ret = iterate_extent_inodes(fs_info, path, found_key.objectid, |
601 | offset, iterate, ctx); | 1218 | extent_item_pos, iterate, ctx); |
602 | 1219 | ||
603 | return ret; | 1220 | return ret; |
604 | } | 1221 | } |
@@ -643,6 +1260,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, | |||
643 | for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { | 1260 | for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { |
644 | name_len = btrfs_inode_ref_name_len(eb, iref); | 1261 | name_len = btrfs_inode_ref_name_len(eb, iref); |
645 | /* path must be released before calling iterate()! */ | 1262 | /* path must be released before calling iterate()! */ |
1263 | pr_debug("following ref at offset %u for inode %llu in " | ||
1264 | "tree %llu\n", cur, | ||
1265 | (unsigned long long)found_key.objectid, | ||
1266 | (unsigned long long)fs_root->objectid); | ||
646 | ret = iterate(parent, iref, eb, ctx); | 1267 | ret = iterate(parent, iref, eb, ctx); |
647 | if (ret) { | 1268 | if (ret) { |
648 | free_extent_buffer(eb); | 1269 | free_extent_buffer(eb); |
@@ -683,10 +1304,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, | |||
683 | return PTR_ERR(fspath); | 1304 | return PTR_ERR(fspath); |
684 | 1305 | ||
685 | if (fspath > fspath_min) { | 1306 | if (fspath > fspath_min) { |
1307 | pr_debug("path resolved: %s\n", fspath); | ||
686 | ipath->fspath->val[i] = (u64)(unsigned long)fspath; | 1308 | ipath->fspath->val[i] = (u64)(unsigned long)fspath; |
687 | ++ipath->fspath->elem_cnt; | 1309 | ++ipath->fspath->elem_cnt; |
688 | ipath->fspath->bytes_left = fspath - fspath_min; | 1310 | ipath->fspath->bytes_left = fspath - fspath_min; |
689 | } else { | 1311 | } else { |
1312 | pr_debug("missed path, not enough space. missing bytes: %lu, " | ||
1313 | "constructed so far: %s\n", | ||
1314 | (unsigned long)(fspath_min - fspath), fspath_min); | ||
690 | ++ipath->fspath->elem_missed; | 1315 | ++ipath->fspath->elem_missed; |
691 | ipath->fspath->bytes_missing += fspath_min - fspath; | 1316 | ipath->fspath->bytes_missing += fspath_min - fspath; |
692 | ipath->fspath->bytes_left = 0; | 1317 | ipath->fspath->bytes_left = 0; |
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 92618837cb8f..d00dfa9ca934 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h | |||
@@ -20,6 +20,7 @@ | |||
20 | #define __BTRFS_BACKREF__ | 20 | #define __BTRFS_BACKREF__ |
21 | 21 | ||
22 | #include "ioctl.h" | 22 | #include "ioctl.h" |
23 | #include "ulist.h" | ||
23 | 24 | ||
24 | struct inode_fs_paths { | 25 | struct inode_fs_paths { |
25 | struct btrfs_path *btrfs_path; | 26 | struct btrfs_path *btrfs_path; |
@@ -54,6 +55,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, | |||
54 | 55 | ||
55 | int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); | 56 | int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); |
56 | 57 | ||
58 | int btrfs_find_all_roots(struct btrfs_trans_handle *trans, | ||
59 | struct btrfs_fs_info *fs_info, u64 bytenr, | ||
60 | u64 num_bytes, u64 seq, struct ulist **roots); | ||
61 | |||
57 | struct btrfs_data_container *init_data_container(u32 total_bytes); | 62 | struct btrfs_data_container *init_data_container(u32 total_bytes); |
58 | struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, | 63 | struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, |
59 | struct btrfs_path *path); | 64 | struct btrfs_path *path); |
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 634608d2a6d0..9b9b15fd5204 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h | |||
@@ -51,6 +51,9 @@ struct btrfs_inode { | |||
51 | /* held while logging the inode in tree-log.c */ | 51 | /* held while logging the inode in tree-log.c */ |
52 | struct mutex log_mutex; | 52 | struct mutex log_mutex; |
53 | 53 | ||
54 | /* held while doing delalloc reservations */ | ||
55 | struct mutex delalloc_mutex; | ||
56 | |||
54 | /* used to order data wrt metadata */ | 57 | /* used to order data wrt metadata */ |
55 | struct btrfs_ordered_inode_tree ordered_tree; | 58 | struct btrfs_ordered_inode_tree ordered_tree; |
56 | 59 | ||
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c new file mode 100644 index 000000000000..ad0b3ba735b7 --- /dev/null +++ b/fs/btrfs/check-integrity.c | |||
@@ -0,0 +1,3068 @@ | |||
1 | /* | ||
2 | * Copyright (C) STRATO AG 2011. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 02111-1307, USA. | ||
17 | */ | ||
18 | |||
19 | /* | ||
20 | * This module can be used to catch cases when the btrfs kernel | ||
21 | * code executes write requests to the disk that bring the file | ||
22 | * system into an inconsistent state. In such a state, a power loss | ||
23 | * or kernel panic event would cause the data on disk to be | ||
24 | * lost or at least damaged. | ||
25 | * | ||
26 | * Code is added that examines all block write requests during | ||
27 | * runtime (including writes of the super block). Three rules | ||
28 | * are verified and an error is printed on violation of the | ||
29 | * rules: | ||
30 | * 1. It is not allowed to write a disk block which is | ||
31 | * currently referenced by the super block (either directly | ||
32 | * or indirectly). | ||
33 | * 2. When a super block is written, it is verified that all | ||
34 | * referenced (directly or indirectly) blocks fulfill the | ||
35 | * following requirements: | ||
36 | * 2a. All referenced blocks have either been present when | ||
37 | * the file system was mounted, (i.e., they have been | ||
38 | * referenced by the super block) or they have been | ||
39 | * written since then and the write completion callback | ||
40 | * was called and a FLUSH request to the device where | ||
41 | * these blocks are located was received and completed. | ||
42 | * 2b. All referenced blocks need to have a generation | ||
43 | * number which is equal to the parent's number. | ||
44 | * | ||
45 | * One issue that was found using this module was that the log | ||
46 | * tree on disk became temporarily corrupted because disk blocks | ||
47 | * that had been in use for the log tree had been freed and | ||
48 | * reused too early, while being referenced by the written super | ||
49 | * block. | ||
50 | * | ||
51 | * The search term in the kernel log that can be used to filter | ||
52 | * on the existence of detected integrity issues is | ||
53 | * "btrfs: attempt". | ||
54 | * | ||
55 | * The integrity check is enabled via mount options. These | ||
56 | * mount options are only supported if the integrity check | ||
57 | * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY. | ||
58 | * | ||
59 | * Example #1, apply integrity checks to all metadata: | ||
60 | * mount /dev/sdb1 /mnt -o check_int | ||
61 | * | ||
62 | * Example #2, apply integrity checks to all metadata and | ||
63 | * to data extents: | ||
64 | * mount /dev/sdb1 /mnt -o check_int_data | ||
65 | * | ||
66 | * Example #3, apply integrity checks to all metadata and dump | ||
67 | * the tree that the super block references to kernel messages | ||
68 | * each time after a super block was written: | ||
69 | * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263 | ||
70 | * | ||
71 | * If the integrity check tool is included and activated in | ||
72 | * the mount options, plenty of kernel memory is used, and | ||
73 | * plenty of additional CPU cycles are spent. Enabling this | ||
74 | * functionality is not intended for normal use. In most | ||
75 | * cases, unless you are a btrfs developer who needs to verify | ||
76 | * the integrity of (super)-block write requests, do not | ||
77 | * enable the config option BTRFS_FS_CHECK_INTEGRITY to | ||
78 | * include and compile the integrity check tool. | ||
79 | */ | ||
80 | |||
81 | #include <linux/sched.h> | ||
82 | #include <linux/slab.h> | ||
83 | #include <linux/buffer_head.h> | ||
84 | #include <linux/mutex.h> | ||
85 | #include <linux/crc32c.h> | ||
86 | #include <linux/genhd.h> | ||
87 | #include <linux/blkdev.h> | ||
88 | #include "ctree.h" | ||
89 | #include "disk-io.h" | ||
90 | #include "transaction.h" | ||
91 | #include "extent_io.h" | ||
92 | #include "disk-io.h" | ||
93 | #include "volumes.h" | ||
94 | #include "print-tree.h" | ||
95 | #include "locking.h" | ||
96 | #include "check-integrity.h" | ||
97 | |||
98 | #define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 | ||
99 | #define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 | ||
100 | #define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100 | ||
101 | #define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051 | ||
102 | #define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807 | ||
103 | #define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530 | ||
104 | #define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 | ||
105 | #define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, | ||
106 | * excluding " [...]" */ | ||
107 | #define BTRFSIC_BLOCK_SIZE PAGE_SIZE | ||
108 | |||
109 | #define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) | ||
110 | |||
111 | /* | ||
112 | * The definition of the bitmask fields for the print_mask. | ||
113 | * They are specified with the mount option check_int_print_mask. | ||
114 | */ | ||
115 | #define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001 | ||
116 | #define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002 | ||
117 | #define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004 | ||
118 | #define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008 | ||
119 | #define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010 | ||
120 | #define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020 | ||
121 | #define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040 | ||
122 | #define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080 | ||
123 | #define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100 | ||
124 | #define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200 | ||
125 | #define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 | ||
126 | #define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 | ||
127 | #define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 | ||
128 | |||
129 | struct btrfsic_dev_state; | ||
130 | struct btrfsic_state; | ||
131 | |||
132 | struct btrfsic_block { | ||
133 | u32 magic_num; /* only used for debug purposes */ | ||
134 | unsigned int is_metadata:1; /* if it is meta-data, not data-data */ | ||
135 | unsigned int is_superblock:1; /* if it is one of the superblocks */ | ||
136 | unsigned int is_iodone:1; /* if is done by lower subsystem */ | ||
137 | unsigned int iodone_w_error:1; /* error was indicated to endio */ | ||
138 | unsigned int never_written:1; /* block was added because it was | ||
139 | * referenced, not because it was | ||
140 | * written */ | ||
141 | unsigned int mirror_num:2; /* large enough to hold | ||
142 | * BTRFS_SUPER_MIRROR_MAX */ | ||
143 | struct btrfsic_dev_state *dev_state; | ||
144 | u64 dev_bytenr; /* key, physical byte num on disk */ | ||
145 | u64 logical_bytenr; /* logical byte num on disk */ | ||
146 | u64 generation; | ||
147 | struct btrfs_disk_key disk_key; /* extra info to print in case of | ||
148 | * issues, will not always be correct */ | ||
149 | struct list_head collision_resolving_node; /* list node */ | ||
150 | struct list_head all_blocks_node; /* list node */ | ||
151 | |||
152 | /* the following two lists contain block_link items */ | ||
153 | struct list_head ref_to_list; /* list */ | ||
154 | struct list_head ref_from_list; /* list */ | ||
155 | struct btrfsic_block *next_in_same_bio; | ||
156 | void *orig_bio_bh_private; | ||
157 | union { | ||
158 | bio_end_io_t *bio; | ||
159 | bh_end_io_t *bh; | ||
160 | } orig_bio_bh_end_io; | ||
161 | int submit_bio_bh_rw; | ||
162 | u64 flush_gen; /* only valid if !never_written */ | ||
163 | }; | ||
164 | |||
165 | /* | ||
166 | * Elements of this type are allocated dynamically and are required because | ||
167 | * each block object can refer to, and be referred to from, multiple blocks. | ||
168 | * The key used to look them up in the hashtable is the (bdev, dev_bytenr) | ||
169 | * pair of the referenced block plus that of the referencing block. | ||
170 | * The fact that they are searchable via a hashtable and that a | ||
171 | * ref_cnt is maintained is not required for the btrfs integrity | ||
172 | * check algorithm itself, it is only used to make the output more | ||
173 | * beautiful in case that an error is detected (an error is defined | ||
174 | * as a write operation to a block while that block is still referenced). | ||
175 | */ | ||
176 | struct btrfsic_block_link { | ||
177 | u32 magic_num; /* only used for debug purposes */ | ||
178 | u32 ref_cnt; | ||
179 | struct list_head node_ref_to; /* list node */ | ||
180 | struct list_head node_ref_from; /* list node */ | ||
181 | struct list_head collision_resolving_node; /* list node */ | ||
182 | struct btrfsic_block *block_ref_to; | ||
183 | struct btrfsic_block *block_ref_from; | ||
184 | u64 parent_generation; | ||
185 | }; | ||
186 | |||
187 | struct btrfsic_dev_state { | ||
188 | u32 magic_num; /* only used for debug purposes */ | ||
189 | struct block_device *bdev; | ||
190 | struct btrfsic_state *state; | ||
191 | struct list_head collision_resolving_node; /* list node */ | ||
192 | struct btrfsic_block dummy_block_for_bio_bh_flush; | ||
193 | u64 last_flush_gen; | ||
194 | char name[BDEVNAME_SIZE]; | ||
195 | }; | ||
196 | |||
197 | struct btrfsic_block_hashtable { | ||
198 | struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE]; | ||
199 | }; | ||
200 | |||
201 | struct btrfsic_block_link_hashtable { | ||
202 | struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE]; | ||
203 | }; | ||
204 | |||
205 | struct btrfsic_dev_state_hashtable { | ||
206 | struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE]; | ||
207 | }; | ||
208 | |||
209 | struct btrfsic_block_data_ctx { | ||
210 | u64 start; /* virtual bytenr */ | ||
211 | u64 dev_bytenr; /* physical bytenr on device */ | ||
212 | u32 len; | ||
213 | struct btrfsic_dev_state *dev; | ||
214 | char *data; | ||
215 | struct buffer_head *bh; /* do not use if set to NULL */ | ||
216 | }; | ||
217 | |||
218 | /* This structure is used to implement recursion without occupying | ||
219 | * any stack space, refer to btrfsic_process_metablock() */ | ||
220 | struct btrfsic_stack_frame { | ||
221 | u32 magic; | ||
222 | u32 nr; | ||
223 | int error; | ||
224 | int i; | ||
225 | int limit_nesting; | ||
226 | int num_copies; | ||
227 | int mirror_num; | ||
228 | struct btrfsic_block *block; | ||
229 | struct btrfsic_block_data_ctx *block_ctx; | ||
230 | struct btrfsic_block *next_block; | ||
231 | struct btrfsic_block_data_ctx next_block_ctx; | ||
232 | struct btrfs_header *hdr; | ||
233 | struct btrfsic_stack_frame *prev; | ||
234 | }; | ||
235 | |||
236 | /* Some state per mounted filesystem */ | ||
237 | struct btrfsic_state { | ||
238 | u32 print_mask; | ||
239 | int include_extent_data; | ||
240 | int csum_size; | ||
241 | struct list_head all_blocks_list; | ||
242 | struct btrfsic_block_hashtable block_hashtable; | ||
243 | struct btrfsic_block_link_hashtable block_link_hashtable; | ||
244 | struct btrfs_root *root; | ||
245 | u64 max_superblock_generation; | ||
246 | struct btrfsic_block *latest_superblock; | ||
247 | }; | ||
248 | |||
249 | static void btrfsic_block_init(struct btrfsic_block *b); | ||
250 | static struct btrfsic_block *btrfsic_block_alloc(void); | ||
251 | static void btrfsic_block_free(struct btrfsic_block *b); | ||
252 | static void btrfsic_block_link_init(struct btrfsic_block_link *n); | ||
253 | static struct btrfsic_block_link *btrfsic_block_link_alloc(void); | ||
254 | static void btrfsic_block_link_free(struct btrfsic_block_link *n); | ||
255 | static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds); | ||
256 | static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void); | ||
257 | static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds); | ||
258 | static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h); | ||
259 | static void btrfsic_block_hashtable_add(struct btrfsic_block *b, | ||
260 | struct btrfsic_block_hashtable *h); | ||
261 | static void btrfsic_block_hashtable_remove(struct btrfsic_block *b); | ||
262 | static struct btrfsic_block *btrfsic_block_hashtable_lookup( | ||
263 | struct block_device *bdev, | ||
264 | u64 dev_bytenr, | ||
265 | struct btrfsic_block_hashtable *h); | ||
266 | static void btrfsic_block_link_hashtable_init( | ||
267 | struct btrfsic_block_link_hashtable *h); | ||
268 | static void btrfsic_block_link_hashtable_add( | ||
269 | struct btrfsic_block_link *l, | ||
270 | struct btrfsic_block_link_hashtable *h); | ||
271 | static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l); | ||
272 | static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( | ||
273 | struct block_device *bdev_ref_to, | ||
274 | u64 dev_bytenr_ref_to, | ||
275 | struct block_device *bdev_ref_from, | ||
276 | u64 dev_bytenr_ref_from, | ||
277 | struct btrfsic_block_link_hashtable *h); | ||
278 | static void btrfsic_dev_state_hashtable_init( | ||
279 | struct btrfsic_dev_state_hashtable *h); | ||
280 | static void btrfsic_dev_state_hashtable_add( | ||
281 | struct btrfsic_dev_state *ds, | ||
282 | struct btrfsic_dev_state_hashtable *h); | ||
283 | static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds); | ||
284 | static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( | ||
285 | struct block_device *bdev, | ||
286 | struct btrfsic_dev_state_hashtable *h); | ||
287 | static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void); | ||
288 | static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf); | ||
289 | static int btrfsic_process_superblock(struct btrfsic_state *state, | ||
290 | struct btrfs_fs_devices *fs_devices); | ||
291 | static int btrfsic_process_metablock(struct btrfsic_state *state, | ||
292 | struct btrfsic_block *block, | ||
293 | struct btrfsic_block_data_ctx *block_ctx, | ||
294 | struct btrfs_header *hdr, | ||
295 | int limit_nesting, int force_iodone_flag); | ||
296 | static int btrfsic_create_link_to_next_block( | ||
297 | struct btrfsic_state *state, | ||
298 | struct btrfsic_block *block, | ||
299 | struct btrfsic_block_data_ctx | ||
300 | *block_ctx, u64 next_bytenr, | ||
301 | int limit_nesting, | ||
302 | struct btrfsic_block_data_ctx *next_block_ctx, | ||
303 | struct btrfsic_block **next_blockp, | ||
304 | int force_iodone_flag, | ||
305 | int *num_copiesp, int *mirror_nump, | ||
306 | struct btrfs_disk_key *disk_key, | ||
307 | u64 parent_generation); | ||
308 | static int btrfsic_handle_extent_data(struct btrfsic_state *state, | ||
309 | struct btrfsic_block *block, | ||
310 | struct btrfsic_block_data_ctx *block_ctx, | ||
311 | u32 item_offset, int force_iodone_flag); | ||
312 | static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, | ||
313 | struct btrfsic_block_data_ctx *block_ctx_out, | ||
314 | int mirror_num); | ||
315 | static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, | ||
316 | u32 len, struct block_device *bdev, | ||
317 | struct btrfsic_block_data_ctx *block_ctx_out); | ||
318 | static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); | ||
319 | static int btrfsic_read_block(struct btrfsic_state *state, | ||
320 | struct btrfsic_block_data_ctx *block_ctx); | ||
321 | static void btrfsic_dump_database(struct btrfsic_state *state); | ||
322 | static int btrfsic_test_for_metadata(struct btrfsic_state *state, | ||
323 | const u8 *data, unsigned int size); | ||
324 | static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, | ||
325 | u64 dev_bytenr, u8 *mapped_data, | ||
326 | unsigned int len, struct bio *bio, | ||
327 | int *bio_is_patched, | ||
328 | struct buffer_head *bh, | ||
329 | int submit_bio_bh_rw); | ||
330 | static int btrfsic_process_written_superblock( | ||
331 | struct btrfsic_state *state, | ||
332 | struct btrfsic_block *const block, | ||
333 | struct btrfs_super_block *const super_hdr); | ||
334 | static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status); | ||
335 | static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); | ||
336 | static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, | ||
337 | const struct btrfsic_block *block, | ||
338 | int recursion_level); | ||
339 | static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, | ||
340 | struct btrfsic_block *const block, | ||
341 | int recursion_level); | ||
342 | static void btrfsic_print_add_link(const struct btrfsic_state *state, | ||
343 | const struct btrfsic_block_link *l); | ||
344 | static void btrfsic_print_rem_link(const struct btrfsic_state *state, | ||
345 | const struct btrfsic_block_link *l); | ||
346 | static char btrfsic_get_block_type(const struct btrfsic_state *state, | ||
347 | const struct btrfsic_block *block); | ||
348 | static void btrfsic_dump_tree(const struct btrfsic_state *state); | ||
349 | static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, | ||
350 | const struct btrfsic_block *block, | ||
351 | int indent_level); | ||
352 | static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( | ||
353 | struct btrfsic_state *state, | ||
354 | struct btrfsic_block_data_ctx *next_block_ctx, | ||
355 | struct btrfsic_block *next_block, | ||
356 | struct btrfsic_block *from_block, | ||
357 | u64 parent_generation); | ||
358 | static struct btrfsic_block *btrfsic_block_lookup_or_add( | ||
359 | struct btrfsic_state *state, | ||
360 | struct btrfsic_block_data_ctx *block_ctx, | ||
361 | const char *additional_string, | ||
362 | int is_metadata, | ||
363 | int is_iodone, | ||
364 | int never_written, | ||
365 | int mirror_num, | ||
366 | int *was_created); | ||
367 | static int btrfsic_process_superblock_dev_mirror( | ||
368 | struct btrfsic_state *state, | ||
369 | struct btrfsic_dev_state *dev_state, | ||
370 | struct btrfs_device *device, | ||
371 | int superblock_mirror_num, | ||
372 | struct btrfsic_dev_state **selected_dev_state, | ||
373 | struct btrfs_super_block *selected_super); | ||
374 | static struct btrfsic_dev_state *btrfsic_dev_state_lookup( | ||
375 | struct block_device *bdev); | ||
376 | static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, | ||
377 | u64 bytenr, | ||
378 | struct btrfsic_dev_state *dev_state, | ||
379 | u64 dev_bytenr, char *data); | ||
380 | |||
381 | static struct mutex btrfsic_mutex; | ||
382 | static int btrfsic_is_initialized; | ||
383 | static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable; | ||
384 | |||
385 | |||
386 | static void btrfsic_block_init(struct btrfsic_block *b) | ||
387 | { | ||
388 | b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER; | ||
389 | b->dev_state = NULL; | ||
390 | b->dev_bytenr = 0; | ||
391 | b->logical_bytenr = 0; | ||
392 | b->generation = BTRFSIC_GENERATION_UNKNOWN; | ||
393 | b->disk_key.objectid = 0; | ||
394 | b->disk_key.type = 0; | ||
395 | b->disk_key.offset = 0; | ||
396 | b->is_metadata = 0; | ||
397 | b->is_superblock = 0; | ||
398 | b->is_iodone = 0; | ||
399 | b->iodone_w_error = 0; | ||
400 | b->never_written = 0; | ||
401 | b->mirror_num = 0; | ||
402 | b->next_in_same_bio = NULL; | ||
403 | b->orig_bio_bh_private = NULL; | ||
404 | b->orig_bio_bh_end_io.bio = NULL; | ||
405 | INIT_LIST_HEAD(&b->collision_resolving_node); | ||
406 | INIT_LIST_HEAD(&b->all_blocks_node); | ||
407 | INIT_LIST_HEAD(&b->ref_to_list); | ||
408 | INIT_LIST_HEAD(&b->ref_from_list); | ||
409 | b->submit_bio_bh_rw = 0; | ||
410 | b->flush_gen = 0; | ||
411 | } | ||
412 | |||
413 | static struct btrfsic_block *btrfsic_block_alloc(void) | ||
414 | { | ||
415 | struct btrfsic_block *b; | ||
416 | |||
417 | b = kzalloc(sizeof(*b), GFP_NOFS); | ||
418 | if (NULL != b) | ||
419 | btrfsic_block_init(b); | ||
420 | |||
421 | return b; | ||
422 | } | ||
423 | |||
424 | static void btrfsic_block_free(struct btrfsic_block *b) | ||
425 | { | ||
426 | BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num)); | ||
427 | kfree(b); | ||
428 | } | ||
429 | |||
430 | static void btrfsic_block_link_init(struct btrfsic_block_link *l) | ||
431 | { | ||
432 | l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER; | ||
433 | l->ref_cnt = 1; | ||
434 | INIT_LIST_HEAD(&l->node_ref_to); | ||
435 | INIT_LIST_HEAD(&l->node_ref_from); | ||
436 | INIT_LIST_HEAD(&l->collision_resolving_node); | ||
437 | l->block_ref_to = NULL; | ||
438 | l->block_ref_from = NULL; | ||
439 | } | ||
440 | |||
441 | static struct btrfsic_block_link *btrfsic_block_link_alloc(void) | ||
442 | { | ||
443 | struct btrfsic_block_link *l; | ||
444 | |||
445 | l = kzalloc(sizeof(*l), GFP_NOFS); | ||
446 | if (NULL != l) | ||
447 | btrfsic_block_link_init(l); | ||
448 | |||
449 | return l; | ||
450 | } | ||
451 | |||
452 | static void btrfsic_block_link_free(struct btrfsic_block_link *l) | ||
453 | { | ||
454 | BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num)); | ||
455 | kfree(l); | ||
456 | } | ||
457 | |||
458 | static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) | ||
459 | { | ||
460 | ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; | ||
461 | ds->bdev = NULL; | ||
462 | ds->state = NULL; | ||
463 | ds->name[0] = '\0'; | ||
464 | INIT_LIST_HEAD(&ds->collision_resolving_node); | ||
465 | ds->last_flush_gen = 0; | ||
466 | btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); | ||
467 | ds->dummy_block_for_bio_bh_flush.is_iodone = 1; | ||
468 | ds->dummy_block_for_bio_bh_flush.dev_state = ds; | ||
469 | } | ||
470 | |||
471 | static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void) | ||
472 | { | ||
473 | struct btrfsic_dev_state *ds; | ||
474 | |||
475 | ds = kzalloc(sizeof(*ds), GFP_NOFS); | ||
476 | if (NULL != ds) | ||
477 | btrfsic_dev_state_init(ds); | ||
478 | |||
479 | return ds; | ||
480 | } | ||
481 | |||
482 | static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds) | ||
483 | { | ||
484 | BUG_ON(!(NULL == ds || | ||
485 | BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num)); | ||
486 | kfree(ds); | ||
487 | } | ||
488 | |||
489 | static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h) | ||
490 | { | ||
491 | int i; | ||
492 | |||
493 | for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++) | ||
494 | INIT_LIST_HEAD(h->table + i); | ||
495 | } | ||
496 | |||
497 | static void btrfsic_block_hashtable_add(struct btrfsic_block *b, | ||
498 | struct btrfsic_block_hashtable *h) | ||
499 | { | ||
500 | const unsigned int hashval = | ||
501 | (((unsigned int)(b->dev_bytenr >> 16)) ^ | ||
502 | ((unsigned int)((uintptr_t)b->dev_state->bdev))) & | ||
503 | (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); | ||
504 | |||
505 | list_add(&b->collision_resolving_node, h->table + hashval); | ||
506 | } | ||
507 | |||
508 | static void btrfsic_block_hashtable_remove(struct btrfsic_block *b) | ||
509 | { | ||
510 | list_del(&b->collision_resolving_node); | ||
511 | } | ||
512 | |||
513 | static struct btrfsic_block *btrfsic_block_hashtable_lookup( | ||
514 | struct block_device *bdev, | ||
515 | u64 dev_bytenr, | ||
516 | struct btrfsic_block_hashtable *h) | ||
517 | { | ||
518 | const unsigned int hashval = | ||
519 | (((unsigned int)(dev_bytenr >> 16)) ^ | ||
520 | ((unsigned int)((uintptr_t)bdev))) & | ||
521 | (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); | ||
522 | struct list_head *elem; | ||
523 | |||
524 | list_for_each(elem, h->table + hashval) { | ||
525 | struct btrfsic_block *const b = | ||
526 | list_entry(elem, struct btrfsic_block, | ||
527 | collision_resolving_node); | ||
528 | |||
529 | if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) | ||
530 | return b; | ||
531 | } | ||
532 | |||
533 | return NULL; | ||
534 | } | ||
535 | |||
536 | static void btrfsic_block_link_hashtable_init( | ||
537 | struct btrfsic_block_link_hashtable *h) | ||
538 | { | ||
539 | int i; | ||
540 | |||
541 | for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++) | ||
542 | INIT_LIST_HEAD(h->table + i); | ||
543 | } | ||
544 | |||
545 | static void btrfsic_block_link_hashtable_add( | ||
546 | struct btrfsic_block_link *l, | ||
547 | struct btrfsic_block_link_hashtable *h) | ||
548 | { | ||
549 | const unsigned int hashval = | ||
550 | (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^ | ||
551 | ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^ | ||
552 | ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^ | ||
553 | ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev))) | ||
554 | & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); | ||
555 | |||
556 | BUG_ON(NULL == l->block_ref_to); | ||
557 | BUG_ON(NULL == l->block_ref_from); | ||
558 | list_add(&l->collision_resolving_node, h->table + hashval); | ||
559 | } | ||
560 | |||
561 | static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l) | ||
562 | { | ||
563 | list_del(&l->collision_resolving_node); | ||
564 | } | ||
565 | |||
566 | static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( | ||
567 | struct block_device *bdev_ref_to, | ||
568 | u64 dev_bytenr_ref_to, | ||
569 | struct block_device *bdev_ref_from, | ||
570 | u64 dev_bytenr_ref_from, | ||
571 | struct btrfsic_block_link_hashtable *h) | ||
572 | { | ||
573 | const unsigned int hashval = | ||
574 | (((unsigned int)(dev_bytenr_ref_to >> 16)) ^ | ||
575 | ((unsigned int)(dev_bytenr_ref_from >> 16)) ^ | ||
576 | ((unsigned int)((uintptr_t)bdev_ref_to)) ^ | ||
577 | ((unsigned int)((uintptr_t)bdev_ref_from))) & | ||
578 | (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); | ||
579 | struct list_head *elem; | ||
580 | |||
581 | list_for_each(elem, h->table + hashval) { | ||
582 | struct btrfsic_block_link *const l = | ||
583 | list_entry(elem, struct btrfsic_block_link, | ||
584 | collision_resolving_node); | ||
585 | |||
586 | BUG_ON(NULL == l->block_ref_to); | ||
587 | BUG_ON(NULL == l->block_ref_from); | ||
588 | if (l->block_ref_to->dev_state->bdev == bdev_ref_to && | ||
589 | l->block_ref_to->dev_bytenr == dev_bytenr_ref_to && | ||
590 | l->block_ref_from->dev_state->bdev == bdev_ref_from && | ||
591 | l->block_ref_from->dev_bytenr == dev_bytenr_ref_from) | ||
592 | return l; | ||
593 | } | ||
594 | |||
595 | return NULL; | ||
596 | } | ||
597 | |||
598 | static void btrfsic_dev_state_hashtable_init( | ||
599 | struct btrfsic_dev_state_hashtable *h) | ||
600 | { | ||
601 | int i; | ||
602 | |||
603 | for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++) | ||
604 | INIT_LIST_HEAD(h->table + i); | ||
605 | } | ||
606 | |||
607 | static void btrfsic_dev_state_hashtable_add( | ||
608 | struct btrfsic_dev_state *ds, | ||
609 | struct btrfsic_dev_state_hashtable *h) | ||
610 | { | ||
611 | const unsigned int hashval = | ||
612 | (((unsigned int)((uintptr_t)ds->bdev)) & | ||
613 | (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); | ||
614 | |||
615 | list_add(&ds->collision_resolving_node, h->table + hashval); | ||
616 | } | ||
617 | |||
618 | static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) | ||
619 | { | ||
620 | list_del(&ds->collision_resolving_node); | ||
621 | } | ||
622 | |||
623 | static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( | ||
624 | struct block_device *bdev, | ||
625 | struct btrfsic_dev_state_hashtable *h) | ||
626 | { | ||
627 | const unsigned int hashval = | ||
628 | (((unsigned int)((uintptr_t)bdev)) & | ||
629 | (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); | ||
630 | struct list_head *elem; | ||
631 | |||
632 | list_for_each(elem, h->table + hashval) { | ||
633 | struct btrfsic_dev_state *const ds = | ||
634 | list_entry(elem, struct btrfsic_dev_state, | ||
635 | collision_resolving_node); | ||
636 | |||
637 | if (ds->bdev == bdev) | ||
638 | return ds; | ||
639 | } | ||
640 | |||
641 | return NULL; | ||
642 | } | ||
643 | |||
644 | static int btrfsic_process_superblock(struct btrfsic_state *state, | ||
645 | struct btrfs_fs_devices *fs_devices) | ||
646 | { | ||
647 | int ret; | ||
648 | struct btrfs_super_block *selected_super; | ||
649 | struct list_head *dev_head = &fs_devices->devices; | ||
650 | struct btrfs_device *device; | ||
651 | struct btrfsic_dev_state *selected_dev_state = NULL; | ||
652 | int pass; | ||
653 | |||
654 | BUG_ON(NULL == state); | ||
655 | selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS); | ||
656 | if (NULL == selected_super) { | ||
657 | printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); | ||
658 | return -1; | ||
659 | } | ||
660 | |||
661 | list_for_each_entry(device, dev_head, dev_list) { | ||
662 | int i; | ||
663 | struct btrfsic_dev_state *dev_state; | ||
664 | |||
665 | if (!device->bdev || !device->name) | ||
666 | continue; | ||
667 | |||
668 | dev_state = btrfsic_dev_state_lookup(device->bdev); | ||
669 | BUG_ON(NULL == dev_state); | ||
670 | for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { | ||
671 | ret = btrfsic_process_superblock_dev_mirror( | ||
672 | state, dev_state, device, i, | ||
673 | &selected_dev_state, selected_super); | ||
674 | if (0 != ret && 0 == i) { | ||
675 | kfree(selected_super); | ||
676 | return ret; | ||
677 | } | ||
678 | } | ||
679 | } | ||
680 | |||
681 | if (NULL == state->latest_superblock) { | ||
682 | printk(KERN_INFO "btrfsic: no superblock found!\n"); | ||
683 | kfree(selected_super); | ||
684 | return -1; | ||
685 | } | ||
686 | |||
687 | state->csum_size = btrfs_super_csum_size(selected_super); | ||
688 | |||
689 | for (pass = 0; pass < 3; pass++) { | ||
690 | int num_copies; | ||
691 | int mirror_num; | ||
692 | u64 next_bytenr; | ||
693 | |||
694 | switch (pass) { | ||
695 | case 0: | ||
696 | next_bytenr = btrfs_super_root(selected_super); | ||
697 | if (state->print_mask & | ||
698 | BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) | ||
699 | printk(KERN_INFO "root@%llu\n", | ||
700 | (unsigned long long)next_bytenr); | ||
701 | break; | ||
702 | case 1: | ||
703 | next_bytenr = btrfs_super_chunk_root(selected_super); | ||
704 | if (state->print_mask & | ||
705 | BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) | ||
706 | printk(KERN_INFO "chunk@%llu\n", | ||
707 | (unsigned long long)next_bytenr); | ||
708 | break; | ||
709 | case 2: | ||
710 | next_bytenr = btrfs_super_log_root(selected_super); | ||
711 | if (0 == next_bytenr) | ||
712 | continue; | ||
713 | if (state->print_mask & | ||
714 | BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) | ||
715 | printk(KERN_INFO "log@%llu\n", | ||
716 | (unsigned long long)next_bytenr); | ||
717 | break; | ||
718 | } | ||
719 | |||
720 | num_copies = | ||
721 | btrfs_num_copies(&state->root->fs_info->mapping_tree, | ||
722 | next_bytenr, PAGE_SIZE); | ||
723 | if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) | ||
724 | printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", | ||
725 | (unsigned long long)next_bytenr, num_copies); | ||
726 | |||
727 | for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { | ||
728 | struct btrfsic_block *next_block; | ||
729 | struct btrfsic_block_data_ctx tmp_next_block_ctx; | ||
730 | struct btrfsic_block_link *l; | ||
731 | struct btrfs_header *hdr; | ||
732 | |||
733 | ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, | ||
734 | &tmp_next_block_ctx, | ||
735 | mirror_num); | ||
736 | if (ret) { | ||
737 | printk(KERN_INFO "btrfsic:" | ||
738 | " btrfsic_map_block(root @%llu," | ||
739 | " mirror %d) failed!\n", | ||
740 | (unsigned long long)next_bytenr, | ||
741 | mirror_num); | ||
742 | kfree(selected_super); | ||
743 | return -1; | ||
744 | } | ||
745 | |||
746 | next_block = btrfsic_block_hashtable_lookup( | ||
747 | tmp_next_block_ctx.dev->bdev, | ||
748 | tmp_next_block_ctx.dev_bytenr, | ||
749 | &state->block_hashtable); | ||
750 | BUG_ON(NULL == next_block); | ||
751 | |||
752 | l = btrfsic_block_link_hashtable_lookup( | ||
753 | tmp_next_block_ctx.dev->bdev, | ||
754 | tmp_next_block_ctx.dev_bytenr, | ||
755 | state->latest_superblock->dev_state-> | ||
756 | bdev, | ||
757 | state->latest_superblock->dev_bytenr, | ||
758 | &state->block_link_hashtable); | ||
759 | BUG_ON(NULL == l); | ||
760 | |||
761 | ret = btrfsic_read_block(state, &tmp_next_block_ctx); | ||
762 | if (ret < (int)BTRFSIC_BLOCK_SIZE) { | ||
763 | printk(KERN_INFO | ||
764 | "btrfsic: read @logical %llu failed!\n", | ||
765 | (unsigned long long) | ||
766 | tmp_next_block_ctx.start); | ||
767 | btrfsic_release_block_ctx(&tmp_next_block_ctx); | ||
768 | kfree(selected_super); | ||
769 | return -1; | ||
770 | } | ||
771 | |||
772 | hdr = (struct btrfs_header *)tmp_next_block_ctx.data; | ||
773 | ret = btrfsic_process_metablock(state, | ||
774 | next_block, | ||
775 | &tmp_next_block_ctx, | ||
776 | hdr, | ||
777 | BTRFS_MAX_LEVEL + 3, 1); | ||
778 | btrfsic_release_block_ctx(&tmp_next_block_ctx); | ||
779 | } | ||
780 | } | ||
781 | |||
782 | kfree(selected_super); | ||
783 | return ret; | ||
784 | } | ||
785 | |||
786 | static int btrfsic_process_superblock_dev_mirror( | ||
787 | struct btrfsic_state *state, | ||
788 | struct btrfsic_dev_state *dev_state, | ||
789 | struct btrfs_device *device, | ||
790 | int superblock_mirror_num, | ||
791 | struct btrfsic_dev_state **selected_dev_state, | ||
792 | struct btrfs_super_block *selected_super) | ||
793 | { | ||
794 | struct btrfs_super_block *super_tmp; | ||
795 | u64 dev_bytenr; | ||
796 | struct buffer_head *bh; | ||
797 | struct btrfsic_block *superblock_tmp; | ||
798 | int pass; | ||
799 | struct block_device *const superblock_bdev = device->bdev; | ||
800 | |||
801 | /* super block bytenr is always the unmapped device bytenr */ | ||
802 | dev_bytenr = btrfs_sb_offset(superblock_mirror_num); | ||
803 | bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096); | ||
804 | if (NULL == bh) | ||
805 | return -1; | ||
806 | super_tmp = (struct btrfs_super_block *) | ||
807 | (bh->b_data + (dev_bytenr & 4095)); | ||
808 | |||
809 | if (btrfs_super_bytenr(super_tmp) != dev_bytenr || | ||
810 | strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, | ||
811 | sizeof(super_tmp->magic)) || | ||
812 | memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) { | ||
813 | brelse(bh); | ||
814 | return 0; | ||
815 | } | ||
816 | |||
817 | superblock_tmp = | ||
818 | btrfsic_block_hashtable_lookup(superblock_bdev, | ||
819 | dev_bytenr, | ||
820 | &state->block_hashtable); | ||
821 | if (NULL == superblock_tmp) { | ||
822 | superblock_tmp = btrfsic_block_alloc(); | ||
823 | if (NULL == superblock_tmp) { | ||
824 | printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); | ||
825 | brelse(bh); | ||
826 | return -1; | ||
827 | } | ||
828 | /* for superblock, only the dev_bytenr makes sense */ | ||
829 | superblock_tmp->dev_bytenr = dev_bytenr; | ||
830 | superblock_tmp->dev_state = dev_state; | ||
831 | superblock_tmp->logical_bytenr = dev_bytenr; | ||
832 | superblock_tmp->generation = btrfs_super_generation(super_tmp); | ||
833 | superblock_tmp->is_metadata = 1; | ||
834 | superblock_tmp->is_superblock = 1; | ||
835 | superblock_tmp->is_iodone = 1; | ||
836 | superblock_tmp->never_written = 0; | ||
837 | superblock_tmp->mirror_num = 1 + superblock_mirror_num; | ||
838 | if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) | ||
839 | printk(KERN_INFO "New initial S-block (bdev %p, %s)" | ||
840 | " @%llu (%s/%llu/%d)\n", | ||
841 | superblock_bdev, device->name, | ||
842 | (unsigned long long)dev_bytenr, | ||
843 | dev_state->name, | ||
844 | (unsigned long long)dev_bytenr, | ||
845 | superblock_mirror_num); | ||
846 | list_add(&superblock_tmp->all_blocks_node, | ||
847 | &state->all_blocks_list); | ||
848 | btrfsic_block_hashtable_add(superblock_tmp, | ||
849 | &state->block_hashtable); | ||
850 | } | ||
851 | |||
852 | /* select the one with the highest generation field */ | ||
853 | if (btrfs_super_generation(super_tmp) > | ||
854 | state->max_superblock_generation || | ||
855 | 0 == state->max_superblock_generation) { | ||
856 | memcpy(selected_super, super_tmp, sizeof(*selected_super)); | ||
857 | *selected_dev_state = dev_state; | ||
858 | state->max_superblock_generation = | ||
859 | btrfs_super_generation(super_tmp); | ||
860 | state->latest_superblock = superblock_tmp; | ||
861 | } | ||
862 | |||
863 | for (pass = 0; pass < 3; pass++) { | ||
864 | u64 next_bytenr; | ||
865 | int num_copies; | ||
866 | int mirror_num; | ||
867 | const char *additional_string = NULL; | ||
868 | struct btrfs_disk_key tmp_disk_key; | ||
869 | |||
870 | tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; | ||
871 | tmp_disk_key.offset = 0; | ||
872 | switch (pass) { | ||
873 | case 0: | ||
874 | tmp_disk_key.objectid = | ||
875 | cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); | ||
876 | additional_string = "initial root "; | ||
877 | next_bytenr = btrfs_super_root(super_tmp); | ||
878 | break; | ||
879 | case 1: | ||
880 | tmp_disk_key.objectid = | ||
881 | cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); | ||
882 | additional_string = "initial chunk "; | ||
883 | next_bytenr = btrfs_super_chunk_root(super_tmp); | ||
884 | break; | ||
885 | case 2: | ||
886 | tmp_disk_key.objectid = | ||
887 | cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); | ||
888 | additional_string = "initial log "; | ||
889 | next_bytenr = btrfs_super_log_root(super_tmp); | ||
890 | if (0 == next_bytenr) | ||
891 | continue; | ||
892 | break; | ||
893 | } | ||
894 | |||
895 | num_copies = | ||
896 | btrfs_num_copies(&state->root->fs_info->mapping_tree, | ||
897 | next_bytenr, PAGE_SIZE); | ||
898 | if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) | ||
899 | printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", | ||
900 | (unsigned long long)next_bytenr, num_copies); | ||
901 | for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { | ||
902 | struct btrfsic_block *next_block; | ||
903 | struct btrfsic_block_data_ctx tmp_next_block_ctx; | ||
904 | struct btrfsic_block_link *l; | ||
905 | |||
906 | if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE, | ||
907 | &tmp_next_block_ctx, | ||
908 | mirror_num)) { | ||
909 | printk(KERN_INFO "btrfsic: btrfsic_map_block(" | ||
910 | "bytenr @%llu, mirror %d) failed!\n", | ||
911 | (unsigned long long)next_bytenr, | ||
912 | mirror_num); | ||
913 | brelse(bh); | ||
914 | return -1; | ||
915 | } | ||
916 | |||
917 | next_block = btrfsic_block_lookup_or_add( | ||
918 | state, &tmp_next_block_ctx, | ||
919 | additional_string, 1, 1, 0, | ||
920 | mirror_num, NULL); | ||
921 | if (NULL == next_block) { | ||
922 | btrfsic_release_block_ctx(&tmp_next_block_ctx); | ||
923 | brelse(bh); | ||
924 | return -1; | ||
925 | } | ||
926 | |||
927 | next_block->disk_key = tmp_disk_key; | ||
928 | next_block->generation = BTRFSIC_GENERATION_UNKNOWN; | ||
929 | l = btrfsic_block_link_lookup_or_add( | ||
930 | state, &tmp_next_block_ctx, | ||
931 | next_block, superblock_tmp, | ||
932 | BTRFSIC_GENERATION_UNKNOWN); | ||
933 | btrfsic_release_block_ctx(&tmp_next_block_ctx); | ||
934 | if (NULL == l) { | ||
935 | brelse(bh); | ||
936 | return -1; | ||
937 | } | ||
938 | } | ||
939 | } | ||
940 | if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) | ||
941 | btrfsic_dump_tree_sub(state, superblock_tmp, 0); | ||
942 | |||
943 | brelse(bh); | ||
944 | return 0; | ||
945 | } | ||
946 | |||
947 | static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) | ||
948 | { | ||
949 | struct btrfsic_stack_frame *sf; | ||
950 | |||
951 | sf = kzalloc(sizeof(*sf), GFP_NOFS); | ||
952 | if (NULL == sf) | ||
953 | printk(KERN_INFO "btrfsic: alloc memory failed!\n"); | ||
954 | else | ||
955 | sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; | ||
956 | return sf; | ||
957 | } | ||
958 | |||
959 | static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) | ||
960 | { | ||
961 | BUG_ON(!(NULL == sf || | ||
962 | BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic)); | ||
963 | kfree(sf); | ||
964 | } | ||
965 | |||
966 | static int btrfsic_process_metablock( | ||
967 | struct btrfsic_state *state, | ||
968 | struct btrfsic_block *const first_block, | ||
969 | struct btrfsic_block_data_ctx *const first_block_ctx, | ||
970 | struct btrfs_header *const first_hdr, | ||
971 | int first_limit_nesting, int force_iodone_flag) | ||
972 | { | ||
973 | struct btrfsic_stack_frame initial_stack_frame = { 0 }; | ||
974 | struct btrfsic_stack_frame *sf; | ||
975 | struct btrfsic_stack_frame *next_stack; | ||
976 | |||
977 | sf = &initial_stack_frame; | ||
978 | sf->error = 0; | ||
979 | sf->i = -1; | ||
980 | sf->limit_nesting = first_limit_nesting; | ||
981 | sf->block = first_block; | ||
982 | sf->block_ctx = first_block_ctx; | ||
983 | sf->next_block = NULL; | ||
984 | sf->hdr = first_hdr; | ||
985 | sf->prev = NULL; | ||
986 | |||
987 | continue_with_new_stack_frame: | ||
988 | sf->block->generation = le64_to_cpu(sf->hdr->generation); | ||
989 | if (0 == sf->hdr->level) { | ||
990 | struct btrfs_leaf *const leafhdr = | ||
991 | (struct btrfs_leaf *)sf->hdr; | ||
992 | |||
993 | if (-1 == sf->i) { | ||
994 | sf->nr = le32_to_cpu(leafhdr->header.nritems); | ||
995 | |||
996 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
997 | printk(KERN_INFO | ||
998 | "leaf %llu items %d generation %llu" | ||
999 | " owner %llu\n", | ||
1000 | (unsigned long long) | ||
1001 | sf->block_ctx->start, | ||
1002 | sf->nr, | ||
1003 | (unsigned long long) | ||
1004 | le64_to_cpu(leafhdr->header.generation), | ||
1005 | (unsigned long long) | ||
1006 | le64_to_cpu(leafhdr->header.owner)); | ||
1007 | } | ||
1008 | |||
1009 | continue_with_current_leaf_stack_frame: | ||
1010 | if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { | ||
1011 | sf->i++; | ||
1012 | sf->num_copies = 0; | ||
1013 | } | ||
1014 | |||
1015 | if (sf->i < sf->nr) { | ||
1016 | struct btrfs_item *disk_item = leafhdr->items + sf->i; | ||
1017 | struct btrfs_disk_key *disk_key = &disk_item->key; | ||
1018 | u8 type; | ||
1019 | const u32 item_offset = le32_to_cpu(disk_item->offset); | ||
1020 | |||
1021 | type = disk_key->type; | ||
1022 | |||
1023 | if (BTRFS_ROOT_ITEM_KEY == type) { | ||
1024 | const struct btrfs_root_item *const root_item = | ||
1025 | (struct btrfs_root_item *) | ||
1026 | (sf->block_ctx->data + | ||
1027 | offsetof(struct btrfs_leaf, items) + | ||
1028 | item_offset); | ||
1029 | const u64 next_bytenr = | ||
1030 | le64_to_cpu(root_item->bytenr); | ||
1031 | |||
1032 | sf->error = | ||
1033 | btrfsic_create_link_to_next_block( | ||
1034 | state, | ||
1035 | sf->block, | ||
1036 | sf->block_ctx, | ||
1037 | next_bytenr, | ||
1038 | sf->limit_nesting, | ||
1039 | &sf->next_block_ctx, | ||
1040 | &sf->next_block, | ||
1041 | force_iodone_flag, | ||
1042 | &sf->num_copies, | ||
1043 | &sf->mirror_num, | ||
1044 | disk_key, | ||
1045 | le64_to_cpu(root_item-> | ||
1046 | generation)); | ||
1047 | if (sf->error) | ||
1048 | goto one_stack_frame_backwards; | ||
1049 | |||
1050 | if (NULL != sf->next_block) { | ||
1051 | struct btrfs_header *const next_hdr = | ||
1052 | (struct btrfs_header *) | ||
1053 | sf->next_block_ctx.data; | ||
1054 | |||
1055 | next_stack = | ||
1056 | btrfsic_stack_frame_alloc(); | ||
1057 | if (NULL == next_stack) { | ||
1058 | btrfsic_release_block_ctx( | ||
1059 | &sf-> | ||
1060 | next_block_ctx); | ||
1061 | goto one_stack_frame_backwards; | ||
1062 | } | ||
1063 | |||
1064 | next_stack->i = -1; | ||
1065 | next_stack->block = sf->next_block; | ||
1066 | next_stack->block_ctx = | ||
1067 | &sf->next_block_ctx; | ||
1068 | next_stack->next_block = NULL; | ||
1069 | next_stack->hdr = next_hdr; | ||
1070 | next_stack->limit_nesting = | ||
1071 | sf->limit_nesting - 1; | ||
1072 | next_stack->prev = sf; | ||
1073 | sf = next_stack; | ||
1074 | goto continue_with_new_stack_frame; | ||
1075 | } | ||
1076 | } else if (BTRFS_EXTENT_DATA_KEY == type && | ||
1077 | state->include_extent_data) { | ||
1078 | sf->error = btrfsic_handle_extent_data( | ||
1079 | state, | ||
1080 | sf->block, | ||
1081 | sf->block_ctx, | ||
1082 | item_offset, | ||
1083 | force_iodone_flag); | ||
1084 | if (sf->error) | ||
1085 | goto one_stack_frame_backwards; | ||
1086 | } | ||
1087 | |||
1088 | goto continue_with_current_leaf_stack_frame; | ||
1089 | } | ||
1090 | } else { | ||
1091 | struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; | ||
1092 | |||
1093 | if (-1 == sf->i) { | ||
1094 | sf->nr = le32_to_cpu(nodehdr->header.nritems); | ||
1095 | |||
1096 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
1097 | printk(KERN_INFO "node %llu level %d items %d" | ||
1098 | " generation %llu owner %llu\n", | ||
1099 | (unsigned long long) | ||
1100 | sf->block_ctx->start, | ||
1101 | nodehdr->header.level, sf->nr, | ||
1102 | (unsigned long long) | ||
1103 | le64_to_cpu(nodehdr->header.generation), | ||
1104 | (unsigned long long) | ||
1105 | le64_to_cpu(nodehdr->header.owner)); | ||
1106 | } | ||
1107 | |||
1108 | continue_with_current_node_stack_frame: | ||
1109 | if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { | ||
1110 | sf->i++; | ||
1111 | sf->num_copies = 0; | ||
1112 | } | ||
1113 | |||
1114 | if (sf->i < sf->nr) { | ||
1115 | struct btrfs_key_ptr *disk_key_ptr = | ||
1116 | nodehdr->ptrs + sf->i; | ||
1117 | const u64 next_bytenr = | ||
1118 | le64_to_cpu(disk_key_ptr->blockptr); | ||
1119 | |||
1120 | sf->error = btrfsic_create_link_to_next_block( | ||
1121 | state, | ||
1122 | sf->block, | ||
1123 | sf->block_ctx, | ||
1124 | next_bytenr, | ||
1125 | sf->limit_nesting, | ||
1126 | &sf->next_block_ctx, | ||
1127 | &sf->next_block, | ||
1128 | force_iodone_flag, | ||
1129 | &sf->num_copies, | ||
1130 | &sf->mirror_num, | ||
1131 | &disk_key_ptr->key, | ||
1132 | le64_to_cpu(disk_key_ptr->generation)); | ||
1133 | if (sf->error) | ||
1134 | goto one_stack_frame_backwards; | ||
1135 | |||
1136 | if (NULL != sf->next_block) { | ||
1137 | struct btrfs_header *const next_hdr = | ||
1138 | (struct btrfs_header *) | ||
1139 | sf->next_block_ctx.data; | ||
1140 | |||
1141 | next_stack = btrfsic_stack_frame_alloc(); | ||
1142 | if (NULL == next_stack) | ||
1143 | goto one_stack_frame_backwards; | ||
1144 | |||
1145 | next_stack->i = -1; | ||
1146 | next_stack->block = sf->next_block; | ||
1147 | next_stack->block_ctx = &sf->next_block_ctx; | ||
1148 | next_stack->next_block = NULL; | ||
1149 | next_stack->hdr = next_hdr; | ||
1150 | next_stack->limit_nesting = | ||
1151 | sf->limit_nesting - 1; | ||
1152 | next_stack->prev = sf; | ||
1153 | sf = next_stack; | ||
1154 | goto continue_with_new_stack_frame; | ||
1155 | } | ||
1156 | |||
1157 | goto continue_with_current_node_stack_frame; | ||
1158 | } | ||
1159 | } | ||
1160 | |||
1161 | one_stack_frame_backwards: | ||
1162 | if (NULL != sf->prev) { | ||
1163 | struct btrfsic_stack_frame *const prev = sf->prev; | ||
1164 | |||
1165 | /* the one for the initial block is freed in the caller */ | ||
1166 | btrfsic_release_block_ctx(sf->block_ctx); | ||
1167 | |||
1168 | if (sf->error) { | ||
1169 | prev->error = sf->error; | ||
1170 | btrfsic_stack_frame_free(sf); | ||
1171 | sf = prev; | ||
1172 | goto one_stack_frame_backwards; | ||
1173 | } | ||
1174 | |||
1175 | btrfsic_stack_frame_free(sf); | ||
1176 | sf = prev; | ||
1177 | goto continue_with_new_stack_frame; | ||
1178 | } else { | ||
1179 | BUG_ON(&initial_stack_frame != sf); | ||
1180 | } | ||
1181 | |||
1182 | return sf->error; | ||
1183 | } | ||
1184 | |||
1185 | static int btrfsic_create_link_to_next_block( | ||
1186 | struct btrfsic_state *state, | ||
1187 | struct btrfsic_block *block, | ||
1188 | struct btrfsic_block_data_ctx *block_ctx, | ||
1189 | u64 next_bytenr, | ||
1190 | int limit_nesting, | ||
1191 | struct btrfsic_block_data_ctx *next_block_ctx, | ||
1192 | struct btrfsic_block **next_blockp, | ||
1193 | int force_iodone_flag, | ||
1194 | int *num_copiesp, int *mirror_nump, | ||
1195 | struct btrfs_disk_key *disk_key, | ||
1196 | u64 parent_generation) | ||
1197 | { | ||
1198 | struct btrfsic_block *next_block = NULL; | ||
1199 | int ret; | ||
1200 | struct btrfsic_block_link *l; | ||
1201 | int did_alloc_block_link; | ||
1202 | int block_was_created; | ||
1203 | |||
1204 | *next_blockp = NULL; | ||
1205 | if (0 == *num_copiesp) { | ||
1206 | *num_copiesp = | ||
1207 | btrfs_num_copies(&state->root->fs_info->mapping_tree, | ||
1208 | next_bytenr, PAGE_SIZE); | ||
1209 | if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) | ||
1210 | printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", | ||
1211 | (unsigned long long)next_bytenr, *num_copiesp); | ||
1212 | *mirror_nump = 1; | ||
1213 | } | ||
1214 | |||
1215 | if (*mirror_nump > *num_copiesp) | ||
1216 | return 0; | ||
1217 | |||
1218 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
1219 | printk(KERN_INFO | ||
1220 | "btrfsic_create_link_to_next_block(mirror_num=%d)\n", | ||
1221 | *mirror_nump); | ||
1222 | ret = btrfsic_map_block(state, next_bytenr, | ||
1223 | BTRFSIC_BLOCK_SIZE, | ||
1224 | next_block_ctx, *mirror_nump); | ||
1225 | if (ret) { | ||
1226 | printk(KERN_INFO | ||
1227 | "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", | ||
1228 | (unsigned long long)next_bytenr, *mirror_nump); | ||
1229 | btrfsic_release_block_ctx(next_block_ctx); | ||
1230 | *next_blockp = NULL; | ||
1231 | return -1; | ||
1232 | } | ||
1233 | |||
1234 | next_block = btrfsic_block_lookup_or_add(state, | ||
1235 | next_block_ctx, "referenced ", | ||
1236 | 1, force_iodone_flag, | ||
1237 | !force_iodone_flag, | ||
1238 | *mirror_nump, | ||
1239 | &block_was_created); | ||
1240 | if (NULL == next_block) { | ||
1241 | btrfsic_release_block_ctx(next_block_ctx); | ||
1242 | *next_blockp = NULL; | ||
1243 | return -1; | ||
1244 | } | ||
1245 | if (block_was_created) { | ||
1246 | l = NULL; | ||
1247 | next_block->generation = BTRFSIC_GENERATION_UNKNOWN; | ||
1248 | } else { | ||
1249 | if (next_block->logical_bytenr != next_bytenr && | ||
1250 | !(!next_block->is_metadata && | ||
1251 | 0 == next_block->logical_bytenr)) { | ||
1252 | printk(KERN_INFO | ||
1253 | "Referenced block @%llu (%s/%llu/%d)" | ||
1254 | " found in hash table, %c," | ||
1255 | " bytenr mismatch (!= stored %llu).\n", | ||
1256 | (unsigned long long)next_bytenr, | ||
1257 | next_block_ctx->dev->name, | ||
1258 | (unsigned long long)next_block_ctx->dev_bytenr, | ||
1259 | *mirror_nump, | ||
1260 | btrfsic_get_block_type(state, next_block), | ||
1261 | (unsigned long long)next_block->logical_bytenr); | ||
1262 | } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
1263 | printk(KERN_INFO | ||
1264 | "Referenced block @%llu (%s/%llu/%d)" | ||
1265 | " found in hash table, %c.\n", | ||
1266 | (unsigned long long)next_bytenr, | ||
1267 | next_block_ctx->dev->name, | ||
1268 | (unsigned long long)next_block_ctx->dev_bytenr, | ||
1269 | *mirror_nump, | ||
1270 | btrfsic_get_block_type(state, next_block)); | ||
1271 | next_block->logical_bytenr = next_bytenr; | ||
1272 | |||
1273 | next_block->mirror_num = *mirror_nump; | ||
1274 | l = btrfsic_block_link_hashtable_lookup( | ||
1275 | next_block_ctx->dev->bdev, | ||
1276 | next_block_ctx->dev_bytenr, | ||
1277 | block_ctx->dev->bdev, | ||
1278 | block_ctx->dev_bytenr, | ||
1279 | &state->block_link_hashtable); | ||
1280 | } | ||
1281 | |||
1282 | next_block->disk_key = *disk_key; | ||
1283 | if (NULL == l) { | ||
1284 | l = btrfsic_block_link_alloc(); | ||
1285 | if (NULL == l) { | ||
1286 | printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); | ||
1287 | btrfsic_release_block_ctx(next_block_ctx); | ||
1288 | *next_blockp = NULL; | ||
1289 | return -1; | ||
1290 | } | ||
1291 | |||
1292 | did_alloc_block_link = 1; | ||
1293 | l->block_ref_to = next_block; | ||
1294 | l->block_ref_from = block; | ||
1295 | l->ref_cnt = 1; | ||
1296 | l->parent_generation = parent_generation; | ||
1297 | |||
1298 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
1299 | btrfsic_print_add_link(state, l); | ||
1300 | |||
1301 | list_add(&l->node_ref_to, &block->ref_to_list); | ||
1302 | list_add(&l->node_ref_from, &next_block->ref_from_list); | ||
1303 | |||
1304 | btrfsic_block_link_hashtable_add(l, | ||
1305 | &state->block_link_hashtable); | ||
1306 | } else { | ||
1307 | did_alloc_block_link = 0; | ||
1308 | if (0 == limit_nesting) { | ||
1309 | l->ref_cnt++; | ||
1310 | l->parent_generation = parent_generation; | ||
1311 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
1312 | btrfsic_print_add_link(state, l); | ||
1313 | } | ||
1314 | } | ||
1315 | |||
1316 | if (limit_nesting > 0 && did_alloc_block_link) { | ||
1317 | ret = btrfsic_read_block(state, next_block_ctx); | ||
1318 | if (ret < (int)BTRFSIC_BLOCK_SIZE) { | ||
1319 | printk(KERN_INFO | ||
1320 | "btrfsic: read block @logical %llu failed!\n", | ||
1321 | (unsigned long long)next_bytenr); | ||
1322 | btrfsic_release_block_ctx(next_block_ctx); | ||
1323 | *next_blockp = NULL; | ||
1324 | return -1; | ||
1325 | } | ||
1326 | |||
1327 | *next_blockp = next_block; | ||
1328 | } else { | ||
1329 | *next_blockp = NULL; | ||
1330 | } | ||
1331 | (*mirror_nump)++; | ||
1332 | |||
1333 | return 0; | ||
1334 | } | ||
1335 | |||
1336 | static int btrfsic_handle_extent_data( | ||
1337 | struct btrfsic_state *state, | ||
1338 | struct btrfsic_block *block, | ||
1339 | struct btrfsic_block_data_ctx *block_ctx, | ||
1340 | u32 item_offset, int force_iodone_flag) | ||
1341 | { | ||
1342 | int ret; | ||
1343 | struct btrfs_file_extent_item *file_extent_item = | ||
1344 | (struct btrfs_file_extent_item *)(block_ctx->data + | ||
1345 | offsetof(struct btrfs_leaf, | ||
1346 | items) + item_offset); | ||
1347 | u64 next_bytenr = | ||
1348 | le64_to_cpu(file_extent_item->disk_bytenr) + | ||
1349 | le64_to_cpu(file_extent_item->offset); | ||
1350 | u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes); | ||
1351 | u64 generation = le64_to_cpu(file_extent_item->generation); | ||
1352 | struct btrfsic_block_link *l; | ||
1353 | |||
1354 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) | ||
1355 | printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," | ||
1356 | " offset = %llu, num_bytes = %llu\n", | ||
1357 | file_extent_item->type, | ||
1358 | (unsigned long long) | ||
1359 | le64_to_cpu(file_extent_item->disk_bytenr), | ||
1360 | (unsigned long long) | ||
1361 | le64_to_cpu(file_extent_item->offset), | ||
1362 | (unsigned long long) | ||
1363 | le64_to_cpu(file_extent_item->num_bytes)); | ||
1364 | if (BTRFS_FILE_EXTENT_REG != file_extent_item->type || | ||
1365 | ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr)) | ||
1366 | return 0; | ||
1367 | while (num_bytes > 0) { | ||
1368 | u32 chunk_len; | ||
1369 | int num_copies; | ||
1370 | int mirror_num; | ||
1371 | |||
1372 | if (num_bytes > BTRFSIC_BLOCK_SIZE) | ||
1373 | chunk_len = BTRFSIC_BLOCK_SIZE; | ||
1374 | else | ||
1375 | chunk_len = num_bytes; | ||
1376 | |||
1377 | num_copies = | ||
1378 | btrfs_num_copies(&state->root->fs_info->mapping_tree, | ||
1379 | next_bytenr, PAGE_SIZE); | ||
1380 | if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) | ||
1381 | printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", | ||
1382 | (unsigned long long)next_bytenr, num_copies); | ||
1383 | for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { | ||
1384 | struct btrfsic_block_data_ctx next_block_ctx; | ||
1385 | struct btrfsic_block *next_block; | ||
1386 | int block_was_created; | ||
1387 | |||
1388 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
1389 | printk(KERN_INFO "btrfsic_handle_extent_data(" | ||
1390 | "mirror_num=%d)\n", mirror_num); | ||
1391 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) | ||
1392 | printk(KERN_INFO | ||
1393 | "\tdisk_bytenr = %llu, num_bytes %u\n", | ||
1394 | (unsigned long long)next_bytenr, | ||
1395 | chunk_len); | ||
1396 | ret = btrfsic_map_block(state, next_bytenr, | ||
1397 | chunk_len, &next_block_ctx, | ||
1398 | mirror_num); | ||
1399 | if (ret) { | ||
1400 | printk(KERN_INFO | ||
1401 | "btrfsic: btrfsic_map_block(@%llu," | ||
1402 | " mirror=%d) failed!\n", | ||
1403 | (unsigned long long)next_bytenr, | ||
1404 | mirror_num); | ||
1405 | return -1; | ||
1406 | } | ||
1407 | |||
1408 | next_block = btrfsic_block_lookup_or_add( | ||
1409 | state, | ||
1410 | &next_block_ctx, | ||
1411 | "referenced ", | ||
1412 | 0, | ||
1413 | force_iodone_flag, | ||
1414 | !force_iodone_flag, | ||
1415 | mirror_num, | ||
1416 | &block_was_created); | ||
1417 | if (NULL == next_block) { | ||
1418 | printk(KERN_INFO | ||
1419 | "btrfsic: error, kmalloc failed!\n"); | ||
1420 | btrfsic_release_block_ctx(&next_block_ctx); | ||
1421 | return -1; | ||
1422 | } | ||
1423 | if (!block_was_created) { | ||
1424 | if (next_block->logical_bytenr != next_bytenr && | ||
1425 | !(!next_block->is_metadata && | ||
1426 | 0 == next_block->logical_bytenr)) { | ||
1427 | printk(KERN_INFO | ||
1428 | "Referenced block" | ||
1429 | " @%llu (%s/%llu/%d)" | ||
1430 | " found in hash table, D," | ||
1431 | " bytenr mismatch" | ||
1432 | " (!= stored %llu).\n", | ||
1433 | (unsigned long long)next_bytenr, | ||
1434 | next_block_ctx.dev->name, | ||
1435 | (unsigned long long) | ||
1436 | next_block_ctx.dev_bytenr, | ||
1437 | mirror_num, | ||
1438 | (unsigned long long) | ||
1439 | next_block->logical_bytenr); | ||
1440 | } | ||
1441 | next_block->logical_bytenr = next_bytenr; | ||
1442 | next_block->mirror_num = mirror_num; | ||
1443 | } | ||
1444 | |||
1445 | l = btrfsic_block_link_lookup_or_add(state, | ||
1446 | &next_block_ctx, | ||
1447 | next_block, block, | ||
1448 | generation); | ||
1449 | btrfsic_release_block_ctx(&next_block_ctx); | ||
1450 | if (NULL == l) | ||
1451 | return -1; | ||
1452 | } | ||
1453 | |||
1454 | next_bytenr += chunk_len; | ||
1455 | num_bytes -= chunk_len; | ||
1456 | } | ||
1457 | |||
1458 | return 0; | ||
1459 | } | ||
1460 | |||
1461 | static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, | ||
1462 | struct btrfsic_block_data_ctx *block_ctx_out, | ||
1463 | int mirror_num) | ||
1464 | { | ||
1465 | int ret; | ||
1466 | u64 length; | ||
1467 | struct btrfs_bio *multi = NULL; | ||
1468 | struct btrfs_device *device; | ||
1469 | |||
1470 | length = len; | ||
1471 | ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, | ||
1472 | bytenr, &length, &multi, mirror_num); | ||
1473 | |||
1474 | device = multi->stripes[0].dev; | ||
1475 | block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); | ||
1476 | block_ctx_out->dev_bytenr = multi->stripes[0].physical; | ||
1477 | block_ctx_out->start = bytenr; | ||
1478 | block_ctx_out->len = len; | ||
1479 | block_ctx_out->data = NULL; | ||
1480 | block_ctx_out->bh = NULL; | ||
1481 | |||
1482 | if (0 == ret) | ||
1483 | kfree(multi); | ||
1484 | if (NULL == block_ctx_out->dev) { | ||
1485 | ret = -ENXIO; | ||
1486 | printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); | ||
1487 | } | ||
1488 | |||
1489 | return ret; | ||
1490 | } | ||
1491 | |||
1492 | static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, | ||
1493 | u32 len, struct block_device *bdev, | ||
1494 | struct btrfsic_block_data_ctx *block_ctx_out) | ||
1495 | { | ||
1496 | block_ctx_out->dev = btrfsic_dev_state_lookup(bdev); | ||
1497 | block_ctx_out->dev_bytenr = bytenr; | ||
1498 | block_ctx_out->start = bytenr; | ||
1499 | block_ctx_out->len = len; | ||
1500 | block_ctx_out->data = NULL; | ||
1501 | block_ctx_out->bh = NULL; | ||
1502 | if (NULL != block_ctx_out->dev) { | ||
1503 | return 0; | ||
1504 | } else { | ||
1505 | printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n"); | ||
1506 | return -ENXIO; | ||
1507 | } | ||
1508 | } | ||
1509 | |||
1510 | static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) | ||
1511 | { | ||
1512 | if (NULL != block_ctx->bh) { | ||
1513 | brelse(block_ctx->bh); | ||
1514 | block_ctx->bh = NULL; | ||
1515 | } | ||
1516 | } | ||
1517 | |||
1518 | static int btrfsic_read_block(struct btrfsic_state *state, | ||
1519 | struct btrfsic_block_data_ctx *block_ctx) | ||
1520 | { | ||
1521 | block_ctx->bh = NULL; | ||
1522 | if (block_ctx->dev_bytenr & 4095) { | ||
1523 | printk(KERN_INFO | ||
1524 | "btrfsic: read_block() with unaligned bytenr %llu\n", | ||
1525 | (unsigned long long)block_ctx->dev_bytenr); | ||
1526 | return -1; | ||
1527 | } | ||
1528 | if (block_ctx->len > 4096) { | ||
1529 | printk(KERN_INFO | ||
1530 | "btrfsic: read_block() with too huge size %d\n", | ||
1531 | block_ctx->len); | ||
1532 | return -1; | ||
1533 | } | ||
1534 | |||
1535 | block_ctx->bh = __bread(block_ctx->dev->bdev, | ||
1536 | block_ctx->dev_bytenr >> 12, 4096); | ||
1537 | if (NULL == block_ctx->bh) | ||
1538 | return -1; | ||
1539 | block_ctx->data = block_ctx->bh->b_data; | ||
1540 | |||
1541 | return block_ctx->len; | ||
1542 | } | ||
1543 | |||
1544 | static void btrfsic_dump_database(struct btrfsic_state *state) | ||
1545 | { | ||
1546 | struct list_head *elem_all; | ||
1547 | |||
1548 | BUG_ON(NULL == state); | ||
1549 | |||
1550 | printk(KERN_INFO "all_blocks_list:\n"); | ||
1551 | list_for_each(elem_all, &state->all_blocks_list) { | ||
1552 | const struct btrfsic_block *const b_all = | ||
1553 | list_entry(elem_all, struct btrfsic_block, | ||
1554 | all_blocks_node); | ||
1555 | struct list_head *elem_ref_to; | ||
1556 | struct list_head *elem_ref_from; | ||
1557 | |||
1558 | printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", | ||
1559 | btrfsic_get_block_type(state, b_all), | ||
1560 | (unsigned long long)b_all->logical_bytenr, | ||
1561 | b_all->dev_state->name, | ||
1562 | (unsigned long long)b_all->dev_bytenr, | ||
1563 | b_all->mirror_num); | ||
1564 | |||
1565 | list_for_each(elem_ref_to, &b_all->ref_to_list) { | ||
1566 | const struct btrfsic_block_link *const l = | ||
1567 | list_entry(elem_ref_to, | ||
1568 | struct btrfsic_block_link, | ||
1569 | node_ref_to); | ||
1570 | |||
1571 | printk(KERN_INFO " %c @%llu (%s/%llu/%d)" | ||
1572 | " refers %u* to" | ||
1573 | " %c @%llu (%s/%llu/%d)\n", | ||
1574 | btrfsic_get_block_type(state, b_all), | ||
1575 | (unsigned long long)b_all->logical_bytenr, | ||
1576 | b_all->dev_state->name, | ||
1577 | (unsigned long long)b_all->dev_bytenr, | ||
1578 | b_all->mirror_num, | ||
1579 | l->ref_cnt, | ||
1580 | btrfsic_get_block_type(state, l->block_ref_to), | ||
1581 | (unsigned long long) | ||
1582 | l->block_ref_to->logical_bytenr, | ||
1583 | l->block_ref_to->dev_state->name, | ||
1584 | (unsigned long long)l->block_ref_to->dev_bytenr, | ||
1585 | l->block_ref_to->mirror_num); | ||
1586 | } | ||
1587 | |||
1588 | list_for_each(elem_ref_from, &b_all->ref_from_list) { | ||
1589 | const struct btrfsic_block_link *const l = | ||
1590 | list_entry(elem_ref_from, | ||
1591 | struct btrfsic_block_link, | ||
1592 | node_ref_from); | ||
1593 | |||
1594 | printk(KERN_INFO " %c @%llu (%s/%llu/%d)" | ||
1595 | " is ref %u* from" | ||
1596 | " %c @%llu (%s/%llu/%d)\n", | ||
1597 | btrfsic_get_block_type(state, b_all), | ||
1598 | (unsigned long long)b_all->logical_bytenr, | ||
1599 | b_all->dev_state->name, | ||
1600 | (unsigned long long)b_all->dev_bytenr, | ||
1601 | b_all->mirror_num, | ||
1602 | l->ref_cnt, | ||
1603 | btrfsic_get_block_type(state, l->block_ref_from), | ||
1604 | (unsigned long long) | ||
1605 | l->block_ref_from->logical_bytenr, | ||
1606 | l->block_ref_from->dev_state->name, | ||
1607 | (unsigned long long) | ||
1608 | l->block_ref_from->dev_bytenr, | ||
1609 | l->block_ref_from->mirror_num); | ||
1610 | } | ||
1611 | |||
1612 | printk(KERN_INFO "\n"); | ||
1613 | } | ||
1614 | } | ||
1615 | |||
1616 | /* | ||
1617 | * Test whether the disk block contains a tree block (leaf or node) | ||
1618 | * (note that this test fails for the super block) | ||
1619 | */ | ||
1620 | static int btrfsic_test_for_metadata(struct btrfsic_state *state, | ||
1621 | const u8 *data, unsigned int size) | ||
1622 | { | ||
1623 | struct btrfs_header *h; | ||
1624 | u8 csum[BTRFS_CSUM_SIZE]; | ||
1625 | u32 crc = ~(u32)0; | ||
1626 | int fail = 0; | ||
1627 | int crc_fail = 0; | ||
1628 | |||
1629 | h = (struct btrfs_header *)data; | ||
1630 | |||
1631 | if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) | ||
1632 | fail++; | ||
1633 | |||
1634 | crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE); | ||
1635 | btrfs_csum_final(crc, csum); | ||
1636 | if (memcmp(csum, h->csum, state->csum_size)) | ||
1637 | crc_fail++; | ||
1638 | |||
1639 | return fail || crc_fail; | ||
1640 | } | ||
1641 | |||
1642 | static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, | ||
1643 | u64 dev_bytenr, | ||
1644 | u8 *mapped_data, unsigned int len, | ||
1645 | struct bio *bio, | ||
1646 | int *bio_is_patched, | ||
1647 | struct buffer_head *bh, | ||
1648 | int submit_bio_bh_rw) | ||
1649 | { | ||
1650 | int is_metadata; | ||
1651 | struct btrfsic_block *block; | ||
1652 | struct btrfsic_block_data_ctx block_ctx; | ||
1653 | int ret; | ||
1654 | struct btrfsic_state *state = dev_state->state; | ||
1655 | struct block_device *bdev = dev_state->bdev; | ||
1656 | |||
1657 | WARN_ON(len > PAGE_SIZE); | ||
1658 | is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len)); | ||
1659 | if (NULL != bio_is_patched) | ||
1660 | *bio_is_patched = 0; | ||
1661 | |||
1662 | block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, | ||
1663 | &state->block_hashtable); | ||
1664 | if (NULL != block) { | ||
1665 | u64 bytenr; | ||
1666 | struct list_head *elem_ref_to; | ||
1667 | struct list_head *tmp_ref_to; | ||
1668 | |||
1669 | if (block->is_superblock) { | ||
1670 | bytenr = le64_to_cpu(((struct btrfs_super_block *) | ||
1671 | mapped_data)->bytenr); | ||
1672 | is_metadata = 1; | ||
1673 | if (state->print_mask & | ||
1674 | BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { | ||
1675 | printk(KERN_INFO | ||
1676 | "[before new superblock is written]:\n"); | ||
1677 | btrfsic_dump_tree_sub(state, block, 0); | ||
1678 | } | ||
1679 | } | ||
1680 | if (is_metadata) { | ||
1681 | if (!block->is_superblock) { | ||
1682 | bytenr = le64_to_cpu(((struct btrfs_header *) | ||
1683 | mapped_data)->bytenr); | ||
1684 | btrfsic_cmp_log_and_dev_bytenr(state, bytenr, | ||
1685 | dev_state, | ||
1686 | dev_bytenr, | ||
1687 | mapped_data); | ||
1688 | } | ||
1689 | if (block->logical_bytenr != bytenr) { | ||
1690 | printk(KERN_INFO | ||
1691 | "Written block @%llu (%s/%llu/%d)" | ||
1692 | " found in hash table, %c," | ||
1693 | " bytenr mismatch" | ||
1694 | " (!= stored %llu).\n", | ||
1695 | (unsigned long long)bytenr, | ||
1696 | dev_state->name, | ||
1697 | (unsigned long long)dev_bytenr, | ||
1698 | block->mirror_num, | ||
1699 | btrfsic_get_block_type(state, block), | ||
1700 | (unsigned long long) | ||
1701 | block->logical_bytenr); | ||
1702 | block->logical_bytenr = bytenr; | ||
1703 | } else if (state->print_mask & | ||
1704 | BTRFSIC_PRINT_MASK_VERBOSE) | ||
1705 | printk(KERN_INFO | ||
1706 | "Written block @%llu (%s/%llu/%d)" | ||
1707 | " found in hash table, %c.\n", | ||
1708 | (unsigned long long)bytenr, | ||
1709 | dev_state->name, | ||
1710 | (unsigned long long)dev_bytenr, | ||
1711 | block->mirror_num, | ||
1712 | btrfsic_get_block_type(state, block)); | ||
1713 | } else { | ||
1714 | bytenr = block->logical_bytenr; | ||
1715 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
1716 | printk(KERN_INFO | ||
1717 | "Written block @%llu (%s/%llu/%d)" | ||
1718 | " found in hash table, %c.\n", | ||
1719 | (unsigned long long)bytenr, | ||
1720 | dev_state->name, | ||
1721 | (unsigned long long)dev_bytenr, | ||
1722 | block->mirror_num, | ||
1723 | btrfsic_get_block_type(state, block)); | ||
1724 | } | ||
1725 | |||
1726 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
1727 | printk(KERN_INFO | ||
1728 | "ref_to_list: %cE, ref_from_list: %cE\n", | ||
1729 | list_empty(&block->ref_to_list) ? ' ' : '!', | ||
1730 | list_empty(&block->ref_from_list) ? ' ' : '!'); | ||
1731 | if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { | ||
1732 | printk(KERN_INFO "btrfs: attempt to overwrite %c-block" | ||
1733 | " @%llu (%s/%llu/%d), old(gen=%llu," | ||
1734 | " objectid=%llu, type=%d, offset=%llu)," | ||
1735 | " new(gen=%llu)," | ||
1736 | " which is referenced by most recent superblock" | ||
1737 | " (superblockgen=%llu)!\n", | ||
1738 | btrfsic_get_block_type(state, block), | ||
1739 | (unsigned long long)bytenr, | ||
1740 | dev_state->name, | ||
1741 | (unsigned long long)dev_bytenr, | ||
1742 | block->mirror_num, | ||
1743 | (unsigned long long)block->generation, | ||
1744 | (unsigned long long) | ||
1745 | le64_to_cpu(block->disk_key.objectid), | ||
1746 | block->disk_key.type, | ||
1747 | (unsigned long long) | ||
1748 | le64_to_cpu(block->disk_key.offset), | ||
1749 | (unsigned long long) | ||
1750 | le64_to_cpu(((struct btrfs_header *) | ||
1751 | mapped_data)->generation), | ||
1752 | (unsigned long long) | ||
1753 | state->max_superblock_generation); | ||
1754 | btrfsic_dump_tree(state); | ||
1755 | } | ||
1756 | |||
1757 | if (!block->is_iodone && !block->never_written) { | ||
1758 | printk(KERN_INFO "btrfs: attempt to overwrite %c-block" | ||
1759 | " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu," | ||
1760 | " which is not yet iodone!\n", | ||
1761 | btrfsic_get_block_type(state, block), | ||
1762 | (unsigned long long)bytenr, | ||
1763 | dev_state->name, | ||
1764 | (unsigned long long)dev_bytenr, | ||
1765 | block->mirror_num, | ||
1766 | (unsigned long long)block->generation, | ||
1767 | (unsigned long long) | ||
1768 | le64_to_cpu(((struct btrfs_header *) | ||
1769 | mapped_data)->generation)); | ||
1770 | /* it would not be safe to go on */ | ||
1771 | btrfsic_dump_tree(state); | ||
1772 | return; | ||
1773 | } | ||
1774 | |||
1775 | /* | ||
1776 | * Clear all references of this block. Do not free | ||
1777 | * the block itself even if is not referenced anymore | ||
1778 | * because it still carries valueable information | ||
1779 | * like whether it was ever written and IO completed. | ||
1780 | */ | ||
1781 | list_for_each_safe(elem_ref_to, tmp_ref_to, | ||
1782 | &block->ref_to_list) { | ||
1783 | struct btrfsic_block_link *const l = | ||
1784 | list_entry(elem_ref_to, | ||
1785 | struct btrfsic_block_link, | ||
1786 | node_ref_to); | ||
1787 | |||
1788 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
1789 | btrfsic_print_rem_link(state, l); | ||
1790 | l->ref_cnt--; | ||
1791 | if (0 == l->ref_cnt) { | ||
1792 | list_del(&l->node_ref_to); | ||
1793 | list_del(&l->node_ref_from); | ||
1794 | btrfsic_block_link_hashtable_remove(l); | ||
1795 | btrfsic_block_link_free(l); | ||
1796 | } | ||
1797 | } | ||
1798 | |||
1799 | if (block->is_superblock) | ||
1800 | ret = btrfsic_map_superblock(state, bytenr, len, | ||
1801 | bdev, &block_ctx); | ||
1802 | else | ||
1803 | ret = btrfsic_map_block(state, bytenr, len, | ||
1804 | &block_ctx, 0); | ||
1805 | if (ret) { | ||
1806 | printk(KERN_INFO | ||
1807 | "btrfsic: btrfsic_map_block(root @%llu)" | ||
1808 | " failed!\n", (unsigned long long)bytenr); | ||
1809 | return; | ||
1810 | } | ||
1811 | block_ctx.data = mapped_data; | ||
1812 | /* the following is required in case of writes to mirrors, | ||
1813 | * use the same that was used for the lookup */ | ||
1814 | block_ctx.dev = dev_state; | ||
1815 | block_ctx.dev_bytenr = dev_bytenr; | ||
1816 | |||
1817 | if (is_metadata || state->include_extent_data) { | ||
1818 | block->never_written = 0; | ||
1819 | block->iodone_w_error = 0; | ||
1820 | if (NULL != bio) { | ||
1821 | block->is_iodone = 0; | ||
1822 | BUG_ON(NULL == bio_is_patched); | ||
1823 | if (!*bio_is_patched) { | ||
1824 | block->orig_bio_bh_private = | ||
1825 | bio->bi_private; | ||
1826 | block->orig_bio_bh_end_io.bio = | ||
1827 | bio->bi_end_io; | ||
1828 | block->next_in_same_bio = NULL; | ||
1829 | bio->bi_private = block; | ||
1830 | bio->bi_end_io = btrfsic_bio_end_io; | ||
1831 | *bio_is_patched = 1; | ||
1832 | } else { | ||
1833 | struct btrfsic_block *chained_block = | ||
1834 | (struct btrfsic_block *) | ||
1835 | bio->bi_private; | ||
1836 | |||
1837 | BUG_ON(NULL == chained_block); | ||
1838 | block->orig_bio_bh_private = | ||
1839 | chained_block->orig_bio_bh_private; | ||
1840 | block->orig_bio_bh_end_io.bio = | ||
1841 | chained_block->orig_bio_bh_end_io. | ||
1842 | bio; | ||
1843 | block->next_in_same_bio = chained_block; | ||
1844 | bio->bi_private = block; | ||
1845 | } | ||
1846 | } else if (NULL != bh) { | ||
1847 | block->is_iodone = 0; | ||
1848 | block->orig_bio_bh_private = bh->b_private; | ||
1849 | block->orig_bio_bh_end_io.bh = bh->b_end_io; | ||
1850 | block->next_in_same_bio = NULL; | ||
1851 | bh->b_private = block; | ||
1852 | bh->b_end_io = btrfsic_bh_end_io; | ||
1853 | } else { | ||
1854 | block->is_iodone = 1; | ||
1855 | block->orig_bio_bh_private = NULL; | ||
1856 | block->orig_bio_bh_end_io.bio = NULL; | ||
1857 | block->next_in_same_bio = NULL; | ||
1858 | } | ||
1859 | } | ||
1860 | |||
1861 | block->flush_gen = dev_state->last_flush_gen + 1; | ||
1862 | block->submit_bio_bh_rw = submit_bio_bh_rw; | ||
1863 | if (is_metadata) { | ||
1864 | block->logical_bytenr = bytenr; | ||
1865 | block->is_metadata = 1; | ||
1866 | if (block->is_superblock) { | ||
1867 | ret = btrfsic_process_written_superblock( | ||
1868 | state, | ||
1869 | block, | ||
1870 | (struct btrfs_super_block *) | ||
1871 | mapped_data); | ||
1872 | if (state->print_mask & | ||
1873 | BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { | ||
1874 | printk(KERN_INFO | ||
1875 | "[after new superblock is written]:\n"); | ||
1876 | btrfsic_dump_tree_sub(state, block, 0); | ||
1877 | } | ||
1878 | } else { | ||
1879 | block->mirror_num = 0; /* unknown */ | ||
1880 | ret = btrfsic_process_metablock( | ||
1881 | state, | ||
1882 | block, | ||
1883 | &block_ctx, | ||
1884 | (struct btrfs_header *) | ||
1885 | block_ctx.data, | ||
1886 | 0, 0); | ||
1887 | } | ||
1888 | if (ret) | ||
1889 | printk(KERN_INFO | ||
1890 | "btrfsic: btrfsic_process_metablock" | ||
1891 | "(root @%llu) failed!\n", | ||
1892 | (unsigned long long)dev_bytenr); | ||
1893 | } else { | ||
1894 | block->is_metadata = 0; | ||
1895 | block->mirror_num = 0; /* unknown */ | ||
1896 | block->generation = BTRFSIC_GENERATION_UNKNOWN; | ||
1897 | if (!state->include_extent_data | ||
1898 | && list_empty(&block->ref_from_list)) { | ||
1899 | /* | ||
1900 | * disk block is overwritten with extent | ||
1901 | * data (not meta data) and we are configured | ||
1902 | * to not include extent data: take the | ||
1903 | * chance and free the block's memory | ||
1904 | */ | ||
1905 | btrfsic_block_hashtable_remove(block); | ||
1906 | list_del(&block->all_blocks_node); | ||
1907 | btrfsic_block_free(block); | ||
1908 | } | ||
1909 | } | ||
1910 | btrfsic_release_block_ctx(&block_ctx); | ||
1911 | } else { | ||
1912 | /* block has not been found in hash table */ | ||
1913 | u64 bytenr; | ||
1914 | |||
1915 | if (!is_metadata) { | ||
1916 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
1917 | printk(KERN_INFO "Written block (%s/%llu/?)" | ||
1918 | " !found in hash table, D.\n", | ||
1919 | dev_state->name, | ||
1920 | (unsigned long long)dev_bytenr); | ||
1921 | if (!state->include_extent_data) | ||
1922 | return; /* ignore that written D block */ | ||
1923 | |||
1924 | /* this is getting ugly for the | ||
1925 | * include_extent_data case... */ | ||
1926 | bytenr = 0; /* unknown */ | ||
1927 | block_ctx.start = bytenr; | ||
1928 | block_ctx.len = len; | ||
1929 | block_ctx.bh = NULL; | ||
1930 | } else { | ||
1931 | bytenr = le64_to_cpu(((struct btrfs_header *) | ||
1932 | mapped_data)->bytenr); | ||
1933 | btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, | ||
1934 | dev_bytenr, | ||
1935 | mapped_data); | ||
1936 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
1937 | printk(KERN_INFO | ||
1938 | "Written block @%llu (%s/%llu/?)" | ||
1939 | " !found in hash table, M.\n", | ||
1940 | (unsigned long long)bytenr, | ||
1941 | dev_state->name, | ||
1942 | (unsigned long long)dev_bytenr); | ||
1943 | |||
1944 | ret = btrfsic_map_block(state, bytenr, len, &block_ctx, | ||
1945 | 0); | ||
1946 | if (ret) { | ||
1947 | printk(KERN_INFO | ||
1948 | "btrfsic: btrfsic_map_block(root @%llu)" | ||
1949 | " failed!\n", | ||
1950 | (unsigned long long)dev_bytenr); | ||
1951 | return; | ||
1952 | } | ||
1953 | } | ||
1954 | block_ctx.data = mapped_data; | ||
1955 | /* the following is required in case of writes to mirrors, | ||
1956 | * use the same that was used for the lookup */ | ||
1957 | block_ctx.dev = dev_state; | ||
1958 | block_ctx.dev_bytenr = dev_bytenr; | ||
1959 | |||
1960 | block = btrfsic_block_alloc(); | ||
1961 | if (NULL == block) { | ||
1962 | printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); | ||
1963 | btrfsic_release_block_ctx(&block_ctx); | ||
1964 | return; | ||
1965 | } | ||
1966 | block->dev_state = dev_state; | ||
1967 | block->dev_bytenr = dev_bytenr; | ||
1968 | block->logical_bytenr = bytenr; | ||
1969 | block->is_metadata = is_metadata; | ||
1970 | block->never_written = 0; | ||
1971 | block->iodone_w_error = 0; | ||
1972 | block->mirror_num = 0; /* unknown */ | ||
1973 | block->flush_gen = dev_state->last_flush_gen + 1; | ||
1974 | block->submit_bio_bh_rw = submit_bio_bh_rw; | ||
1975 | if (NULL != bio) { | ||
1976 | block->is_iodone = 0; | ||
1977 | BUG_ON(NULL == bio_is_patched); | ||
1978 | if (!*bio_is_patched) { | ||
1979 | block->orig_bio_bh_private = bio->bi_private; | ||
1980 | block->orig_bio_bh_end_io.bio = bio->bi_end_io; | ||
1981 | block->next_in_same_bio = NULL; | ||
1982 | bio->bi_private = block; | ||
1983 | bio->bi_end_io = btrfsic_bio_end_io; | ||
1984 | *bio_is_patched = 1; | ||
1985 | } else { | ||
1986 | struct btrfsic_block *chained_block = | ||
1987 | (struct btrfsic_block *) | ||
1988 | bio->bi_private; | ||
1989 | |||
1990 | BUG_ON(NULL == chained_block); | ||
1991 | block->orig_bio_bh_private = | ||
1992 | chained_block->orig_bio_bh_private; | ||
1993 | block->orig_bio_bh_end_io.bio = | ||
1994 | chained_block->orig_bio_bh_end_io.bio; | ||
1995 | block->next_in_same_bio = chained_block; | ||
1996 | bio->bi_private = block; | ||
1997 | } | ||
1998 | } else if (NULL != bh) { | ||
1999 | block->is_iodone = 0; | ||
2000 | block->orig_bio_bh_private = bh->b_private; | ||
2001 | block->orig_bio_bh_end_io.bh = bh->b_end_io; | ||
2002 | block->next_in_same_bio = NULL; | ||
2003 | bh->b_private = block; | ||
2004 | bh->b_end_io = btrfsic_bh_end_io; | ||
2005 | } else { | ||
2006 | block->is_iodone = 1; | ||
2007 | block->orig_bio_bh_private = NULL; | ||
2008 | block->orig_bio_bh_end_io.bio = NULL; | ||
2009 | block->next_in_same_bio = NULL; | ||
2010 | } | ||
2011 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
2012 | printk(KERN_INFO | ||
2013 | "New written %c-block @%llu (%s/%llu/%d)\n", | ||
2014 | is_metadata ? 'M' : 'D', | ||
2015 | (unsigned long long)block->logical_bytenr, | ||
2016 | block->dev_state->name, | ||
2017 | (unsigned long long)block->dev_bytenr, | ||
2018 | block->mirror_num); | ||
2019 | list_add(&block->all_blocks_node, &state->all_blocks_list); | ||
2020 | btrfsic_block_hashtable_add(block, &state->block_hashtable); | ||
2021 | |||
2022 | if (is_metadata) { | ||
2023 | ret = btrfsic_process_metablock(state, block, | ||
2024 | &block_ctx, | ||
2025 | (struct btrfs_header *) | ||
2026 | block_ctx.data, 0, 0); | ||
2027 | if (ret) | ||
2028 | printk(KERN_INFO | ||
2029 | "btrfsic: process_metablock(root @%llu)" | ||
2030 | " failed!\n", | ||
2031 | (unsigned long long)dev_bytenr); | ||
2032 | } | ||
2033 | btrfsic_release_block_ctx(&block_ctx); | ||
2034 | } | ||
2035 | } | ||
2036 | |||
2037 | static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) | ||
2038 | { | ||
2039 | struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; | ||
2040 | int iodone_w_error; | ||
2041 | |||
2042 | /* mutex is not held! This is not save if IO is not yet completed | ||
2043 | * on umount */ | ||
2044 | iodone_w_error = 0; | ||
2045 | if (bio_error_status) | ||
2046 | iodone_w_error = 1; | ||
2047 | |||
2048 | BUG_ON(NULL == block); | ||
2049 | bp->bi_private = block->orig_bio_bh_private; | ||
2050 | bp->bi_end_io = block->orig_bio_bh_end_io.bio; | ||
2051 | |||
2052 | do { | ||
2053 | struct btrfsic_block *next_block; | ||
2054 | struct btrfsic_dev_state *const dev_state = block->dev_state; | ||
2055 | |||
2056 | if ((dev_state->state->print_mask & | ||
2057 | BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) | ||
2058 | printk(KERN_INFO | ||
2059 | "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", | ||
2060 | bio_error_status, | ||
2061 | btrfsic_get_block_type(dev_state->state, block), | ||
2062 | (unsigned long long)block->logical_bytenr, | ||
2063 | dev_state->name, | ||
2064 | (unsigned long long)block->dev_bytenr, | ||
2065 | block->mirror_num); | ||
2066 | next_block = block->next_in_same_bio; | ||
2067 | block->iodone_w_error = iodone_w_error; | ||
2068 | if (block->submit_bio_bh_rw & REQ_FLUSH) { | ||
2069 | dev_state->last_flush_gen++; | ||
2070 | if ((dev_state->state->print_mask & | ||
2071 | BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) | ||
2072 | printk(KERN_INFO | ||
2073 | "bio_end_io() new %s flush_gen=%llu\n", | ||
2074 | dev_state->name, | ||
2075 | (unsigned long long) | ||
2076 | dev_state->last_flush_gen); | ||
2077 | } | ||
2078 | if (block->submit_bio_bh_rw & REQ_FUA) | ||
2079 | block->flush_gen = 0; /* FUA completed means block is | ||
2080 | * on disk */ | ||
2081 | block->is_iodone = 1; /* for FLUSH, this releases the block */ | ||
2082 | block = next_block; | ||
2083 | } while (NULL != block); | ||
2084 | |||
2085 | bp->bi_end_io(bp, bio_error_status); | ||
2086 | } | ||
2087 | |||
2088 | static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) | ||
2089 | { | ||
2090 | struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private; | ||
2091 | int iodone_w_error = !uptodate; | ||
2092 | struct btrfsic_dev_state *dev_state; | ||
2093 | |||
2094 | BUG_ON(NULL == block); | ||
2095 | dev_state = block->dev_state; | ||
2096 | if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) | ||
2097 | printk(KERN_INFO | ||
2098 | "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", | ||
2099 | iodone_w_error, | ||
2100 | btrfsic_get_block_type(dev_state->state, block), | ||
2101 | (unsigned long long)block->logical_bytenr, | ||
2102 | block->dev_state->name, | ||
2103 | (unsigned long long)block->dev_bytenr, | ||
2104 | block->mirror_num); | ||
2105 | |||
2106 | block->iodone_w_error = iodone_w_error; | ||
2107 | if (block->submit_bio_bh_rw & REQ_FLUSH) { | ||
2108 | dev_state->last_flush_gen++; | ||
2109 | if ((dev_state->state->print_mask & | ||
2110 | BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) | ||
2111 | printk(KERN_INFO | ||
2112 | "bh_end_io() new %s flush_gen=%llu\n", | ||
2113 | dev_state->name, | ||
2114 | (unsigned long long)dev_state->last_flush_gen); | ||
2115 | } | ||
2116 | if (block->submit_bio_bh_rw & REQ_FUA) | ||
2117 | block->flush_gen = 0; /* FUA completed means block is on disk */ | ||
2118 | |||
2119 | bh->b_private = block->orig_bio_bh_private; | ||
2120 | bh->b_end_io = block->orig_bio_bh_end_io.bh; | ||
2121 | block->is_iodone = 1; /* for FLUSH, this releases the block */ | ||
2122 | bh->b_end_io(bh, uptodate); | ||
2123 | } | ||
2124 | |||
2125 | static int btrfsic_process_written_superblock( | ||
2126 | struct btrfsic_state *state, | ||
2127 | struct btrfsic_block *const superblock, | ||
2128 | struct btrfs_super_block *const super_hdr) | ||
2129 | { | ||
2130 | int pass; | ||
2131 | |||
2132 | superblock->generation = btrfs_super_generation(super_hdr); | ||
2133 | if (!(superblock->generation > state->max_superblock_generation || | ||
2134 | 0 == state->max_superblock_generation)) { | ||
2135 | if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) | ||
2136 | printk(KERN_INFO | ||
2137 | "btrfsic: superblock @%llu (%s/%llu/%d)" | ||
2138 | " with old gen %llu <= %llu\n", | ||
2139 | (unsigned long long)superblock->logical_bytenr, | ||
2140 | superblock->dev_state->name, | ||
2141 | (unsigned long long)superblock->dev_bytenr, | ||
2142 | superblock->mirror_num, | ||
2143 | (unsigned long long) | ||
2144 | btrfs_super_generation(super_hdr), | ||
2145 | (unsigned long long) | ||
2146 | state->max_superblock_generation); | ||
2147 | } else { | ||
2148 | if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) | ||
2149 | printk(KERN_INFO | ||
2150 | "btrfsic: got new superblock @%llu (%s/%llu/%d)" | ||
2151 | " with new gen %llu > %llu\n", | ||
2152 | (unsigned long long)superblock->logical_bytenr, | ||
2153 | superblock->dev_state->name, | ||
2154 | (unsigned long long)superblock->dev_bytenr, | ||
2155 | superblock->mirror_num, | ||
2156 | (unsigned long long) | ||
2157 | btrfs_super_generation(super_hdr), | ||
2158 | (unsigned long long) | ||
2159 | state->max_superblock_generation); | ||
2160 | |||
2161 | state->max_superblock_generation = | ||
2162 | btrfs_super_generation(super_hdr); | ||
2163 | state->latest_superblock = superblock; | ||
2164 | } | ||
2165 | |||
2166 | for (pass = 0; pass < 3; pass++) { | ||
2167 | int ret; | ||
2168 | u64 next_bytenr; | ||
2169 | struct btrfsic_block *next_block; | ||
2170 | struct btrfsic_block_data_ctx tmp_next_block_ctx; | ||
2171 | struct btrfsic_block_link *l; | ||
2172 | int num_copies; | ||
2173 | int mirror_num; | ||
2174 | const char *additional_string = NULL; | ||
2175 | struct btrfs_disk_key tmp_disk_key; | ||
2176 | |||
2177 | tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; | ||
2178 | tmp_disk_key.offset = 0; | ||
2179 | |||
2180 | switch (pass) { | ||
2181 | case 0: | ||
2182 | tmp_disk_key.objectid = | ||
2183 | cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); | ||
2184 | additional_string = "root "; | ||
2185 | next_bytenr = btrfs_super_root(super_hdr); | ||
2186 | if (state->print_mask & | ||
2187 | BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) | ||
2188 | printk(KERN_INFO "root@%llu\n", | ||
2189 | (unsigned long long)next_bytenr); | ||
2190 | break; | ||
2191 | case 1: | ||
2192 | tmp_disk_key.objectid = | ||
2193 | cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); | ||
2194 | additional_string = "chunk "; | ||
2195 | next_bytenr = btrfs_super_chunk_root(super_hdr); | ||
2196 | if (state->print_mask & | ||
2197 | BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) | ||
2198 | printk(KERN_INFO "chunk@%llu\n", | ||
2199 | (unsigned long long)next_bytenr); | ||
2200 | break; | ||
2201 | case 2: | ||
2202 | tmp_disk_key.objectid = | ||
2203 | cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); | ||
2204 | additional_string = "log "; | ||
2205 | next_bytenr = btrfs_super_log_root(super_hdr); | ||
2206 | if (0 == next_bytenr) | ||
2207 | continue; | ||
2208 | if (state->print_mask & | ||
2209 | BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) | ||
2210 | printk(KERN_INFO "log@%llu\n", | ||
2211 | (unsigned long long)next_bytenr); | ||
2212 | break; | ||
2213 | } | ||
2214 | |||
2215 | num_copies = | ||
2216 | btrfs_num_copies(&state->root->fs_info->mapping_tree, | ||
2217 | next_bytenr, PAGE_SIZE); | ||
2218 | if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) | ||
2219 | printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", | ||
2220 | (unsigned long long)next_bytenr, num_copies); | ||
2221 | for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { | ||
2222 | int was_created; | ||
2223 | |||
2224 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
2225 | printk(KERN_INFO | ||
2226 | "btrfsic_process_written_superblock(" | ||
2227 | "mirror_num=%d)\n", mirror_num); | ||
2228 | ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, | ||
2229 | &tmp_next_block_ctx, | ||
2230 | mirror_num); | ||
2231 | if (ret) { | ||
2232 | printk(KERN_INFO | ||
2233 | "btrfsic: btrfsic_map_block(@%llu," | ||
2234 | " mirror=%d) failed!\n", | ||
2235 | (unsigned long long)next_bytenr, | ||
2236 | mirror_num); | ||
2237 | return -1; | ||
2238 | } | ||
2239 | |||
2240 | next_block = btrfsic_block_lookup_or_add( | ||
2241 | state, | ||
2242 | &tmp_next_block_ctx, | ||
2243 | additional_string, | ||
2244 | 1, 0, 1, | ||
2245 | mirror_num, | ||
2246 | &was_created); | ||
2247 | if (NULL == next_block) { | ||
2248 | printk(KERN_INFO | ||
2249 | "btrfsic: error, kmalloc failed!\n"); | ||
2250 | btrfsic_release_block_ctx(&tmp_next_block_ctx); | ||
2251 | return -1; | ||
2252 | } | ||
2253 | |||
2254 | next_block->disk_key = tmp_disk_key; | ||
2255 | if (was_created) | ||
2256 | next_block->generation = | ||
2257 | BTRFSIC_GENERATION_UNKNOWN; | ||
2258 | l = btrfsic_block_link_lookup_or_add( | ||
2259 | state, | ||
2260 | &tmp_next_block_ctx, | ||
2261 | next_block, | ||
2262 | superblock, | ||
2263 | BTRFSIC_GENERATION_UNKNOWN); | ||
2264 | btrfsic_release_block_ctx(&tmp_next_block_ctx); | ||
2265 | if (NULL == l) | ||
2266 | return -1; | ||
2267 | } | ||
2268 | } | ||
2269 | |||
2270 | if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) { | ||
2271 | WARN_ON(1); | ||
2272 | btrfsic_dump_tree(state); | ||
2273 | } | ||
2274 | |||
2275 | return 0; | ||
2276 | } | ||
2277 | |||
2278 | static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, | ||
2279 | struct btrfsic_block *const block, | ||
2280 | int recursion_level) | ||
2281 | { | ||
2282 | struct list_head *elem_ref_to; | ||
2283 | int ret = 0; | ||
2284 | |||
2285 | if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { | ||
2286 | /* | ||
2287 | * Note that this situation can happen and does not | ||
2288 | * indicate an error in regular cases. It happens | ||
2289 | * when disk blocks are freed and later reused. | ||
2290 | * The check-integrity module is not aware of any | ||
2291 | * block free operations, it just recognizes block | ||
2292 | * write operations. Therefore it keeps the linkage | ||
2293 | * information for a block until a block is | ||
2294 | * rewritten. This can temporarily cause incorrect | ||
2295 | * and even circular linkage informations. This | ||
2296 | * causes no harm unless such blocks are referenced | ||
2297 | * by the most recent super block. | ||
2298 | */ | ||
2299 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
2300 | printk(KERN_INFO | ||
2301 | "btrfsic: abort cyclic linkage (case 1).\n"); | ||
2302 | |||
2303 | return ret; | ||
2304 | } | ||
2305 | |||
2306 | /* | ||
2307 | * This algorithm is recursive because the amount of used stack | ||
2308 | * space is very small and the max recursion depth is limited. | ||
2309 | */ | ||
2310 | list_for_each(elem_ref_to, &block->ref_to_list) { | ||
2311 | const struct btrfsic_block_link *const l = | ||
2312 | list_entry(elem_ref_to, struct btrfsic_block_link, | ||
2313 | node_ref_to); | ||
2314 | |||
2315 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
2316 | printk(KERN_INFO | ||
2317 | "rl=%d, %c @%llu (%s/%llu/%d)" | ||
2318 | " %u* refers to %c @%llu (%s/%llu/%d)\n", | ||
2319 | recursion_level, | ||
2320 | btrfsic_get_block_type(state, block), | ||
2321 | (unsigned long long)block->logical_bytenr, | ||
2322 | block->dev_state->name, | ||
2323 | (unsigned long long)block->dev_bytenr, | ||
2324 | block->mirror_num, | ||
2325 | l->ref_cnt, | ||
2326 | btrfsic_get_block_type(state, l->block_ref_to), | ||
2327 | (unsigned long long) | ||
2328 | l->block_ref_to->logical_bytenr, | ||
2329 | l->block_ref_to->dev_state->name, | ||
2330 | (unsigned long long)l->block_ref_to->dev_bytenr, | ||
2331 | l->block_ref_to->mirror_num); | ||
2332 | if (l->block_ref_to->never_written) { | ||
2333 | printk(KERN_INFO "btrfs: attempt to write superblock" | ||
2334 | " which references block %c @%llu (%s/%llu/%d)" | ||
2335 | " which is never written!\n", | ||
2336 | btrfsic_get_block_type(state, l->block_ref_to), | ||
2337 | (unsigned long long) | ||
2338 | l->block_ref_to->logical_bytenr, | ||
2339 | l->block_ref_to->dev_state->name, | ||
2340 | (unsigned long long)l->block_ref_to->dev_bytenr, | ||
2341 | l->block_ref_to->mirror_num); | ||
2342 | ret = -1; | ||
2343 | } else if (!l->block_ref_to->is_iodone) { | ||
2344 | printk(KERN_INFO "btrfs: attempt to write superblock" | ||
2345 | " which references block %c @%llu (%s/%llu/%d)" | ||
2346 | " which is not yet iodone!\n", | ||
2347 | btrfsic_get_block_type(state, l->block_ref_to), | ||
2348 | (unsigned long long) | ||
2349 | l->block_ref_to->logical_bytenr, | ||
2350 | l->block_ref_to->dev_state->name, | ||
2351 | (unsigned long long)l->block_ref_to->dev_bytenr, | ||
2352 | l->block_ref_to->mirror_num); | ||
2353 | ret = -1; | ||
2354 | } else if (l->parent_generation != | ||
2355 | l->block_ref_to->generation && | ||
2356 | BTRFSIC_GENERATION_UNKNOWN != | ||
2357 | l->parent_generation && | ||
2358 | BTRFSIC_GENERATION_UNKNOWN != | ||
2359 | l->block_ref_to->generation) { | ||
2360 | printk(KERN_INFO "btrfs: attempt to write superblock" | ||
2361 | " which references block %c @%llu (%s/%llu/%d)" | ||
2362 | " with generation %llu !=" | ||
2363 | " parent generation %llu!\n", | ||
2364 | btrfsic_get_block_type(state, l->block_ref_to), | ||
2365 | (unsigned long long) | ||
2366 | l->block_ref_to->logical_bytenr, | ||
2367 | l->block_ref_to->dev_state->name, | ||
2368 | (unsigned long long)l->block_ref_to->dev_bytenr, | ||
2369 | l->block_ref_to->mirror_num, | ||
2370 | (unsigned long long)l->block_ref_to->generation, | ||
2371 | (unsigned long long)l->parent_generation); | ||
2372 | ret = -1; | ||
2373 | } else if (l->block_ref_to->flush_gen > | ||
2374 | l->block_ref_to->dev_state->last_flush_gen) { | ||
2375 | printk(KERN_INFO "btrfs: attempt to write superblock" | ||
2376 | " which references block %c @%llu (%s/%llu/%d)" | ||
2377 | " which is not flushed out of disk's write cache" | ||
2378 | " (block flush_gen=%llu," | ||
2379 | " dev->flush_gen=%llu)!\n", | ||
2380 | btrfsic_get_block_type(state, l->block_ref_to), | ||
2381 | (unsigned long long) | ||
2382 | l->block_ref_to->logical_bytenr, | ||
2383 | l->block_ref_to->dev_state->name, | ||
2384 | (unsigned long long)l->block_ref_to->dev_bytenr, | ||
2385 | l->block_ref_to->mirror_num, | ||
2386 | (unsigned long long)block->flush_gen, | ||
2387 | (unsigned long long) | ||
2388 | l->block_ref_to->dev_state->last_flush_gen); | ||
2389 | ret = -1; | ||
2390 | } else if (-1 == btrfsic_check_all_ref_blocks(state, | ||
2391 | l->block_ref_to, | ||
2392 | recursion_level + | ||
2393 | 1)) { | ||
2394 | ret = -1; | ||
2395 | } | ||
2396 | } | ||
2397 | |||
2398 | return ret; | ||
2399 | } | ||
2400 | |||
2401 | static int btrfsic_is_block_ref_by_superblock( | ||
2402 | const struct btrfsic_state *state, | ||
2403 | const struct btrfsic_block *block, | ||
2404 | int recursion_level) | ||
2405 | { | ||
2406 | struct list_head *elem_ref_from; | ||
2407 | |||
2408 | if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { | ||
2409 | /* refer to comment at "abort cyclic linkage (case 1)" */ | ||
2410 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
2411 | printk(KERN_INFO | ||
2412 | "btrfsic: abort cyclic linkage (case 2).\n"); | ||
2413 | |||
2414 | return 0; | ||
2415 | } | ||
2416 | |||
2417 | /* | ||
2418 | * This algorithm is recursive because the amount of used stack space | ||
2419 | * is very small and the max recursion depth is limited. | ||
2420 | */ | ||
2421 | list_for_each(elem_ref_from, &block->ref_from_list) { | ||
2422 | const struct btrfsic_block_link *const l = | ||
2423 | list_entry(elem_ref_from, struct btrfsic_block_link, | ||
2424 | node_ref_from); | ||
2425 | |||
2426 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
2427 | printk(KERN_INFO | ||
2428 | "rl=%d, %c @%llu (%s/%llu/%d)" | ||
2429 | " is ref %u* from %c @%llu (%s/%llu/%d)\n", | ||
2430 | recursion_level, | ||
2431 | btrfsic_get_block_type(state, block), | ||
2432 | (unsigned long long)block->logical_bytenr, | ||
2433 | block->dev_state->name, | ||
2434 | (unsigned long long)block->dev_bytenr, | ||
2435 | block->mirror_num, | ||
2436 | l->ref_cnt, | ||
2437 | btrfsic_get_block_type(state, l->block_ref_from), | ||
2438 | (unsigned long long) | ||
2439 | l->block_ref_from->logical_bytenr, | ||
2440 | l->block_ref_from->dev_state->name, | ||
2441 | (unsigned long long) | ||
2442 | l->block_ref_from->dev_bytenr, | ||
2443 | l->block_ref_from->mirror_num); | ||
2444 | if (l->block_ref_from->is_superblock && | ||
2445 | state->latest_superblock->dev_bytenr == | ||
2446 | l->block_ref_from->dev_bytenr && | ||
2447 | state->latest_superblock->dev_state->bdev == | ||
2448 | l->block_ref_from->dev_state->bdev) | ||
2449 | return 1; | ||
2450 | else if (btrfsic_is_block_ref_by_superblock(state, | ||
2451 | l->block_ref_from, | ||
2452 | recursion_level + | ||
2453 | 1)) | ||
2454 | return 1; | ||
2455 | } | ||
2456 | |||
2457 | return 0; | ||
2458 | } | ||
2459 | |||
2460 | static void btrfsic_print_add_link(const struct btrfsic_state *state, | ||
2461 | const struct btrfsic_block_link *l) | ||
2462 | { | ||
2463 | printk(KERN_INFO | ||
2464 | "Add %u* link from %c @%llu (%s/%llu/%d)" | ||
2465 | " to %c @%llu (%s/%llu/%d).\n", | ||
2466 | l->ref_cnt, | ||
2467 | btrfsic_get_block_type(state, l->block_ref_from), | ||
2468 | (unsigned long long)l->block_ref_from->logical_bytenr, | ||
2469 | l->block_ref_from->dev_state->name, | ||
2470 | (unsigned long long)l->block_ref_from->dev_bytenr, | ||
2471 | l->block_ref_from->mirror_num, | ||
2472 | btrfsic_get_block_type(state, l->block_ref_to), | ||
2473 | (unsigned long long)l->block_ref_to->logical_bytenr, | ||
2474 | l->block_ref_to->dev_state->name, | ||
2475 | (unsigned long long)l->block_ref_to->dev_bytenr, | ||
2476 | l->block_ref_to->mirror_num); | ||
2477 | } | ||
2478 | |||
2479 | static void btrfsic_print_rem_link(const struct btrfsic_state *state, | ||
2480 | const struct btrfsic_block_link *l) | ||
2481 | { | ||
2482 | printk(KERN_INFO | ||
2483 | "Rem %u* link from %c @%llu (%s/%llu/%d)" | ||
2484 | " to %c @%llu (%s/%llu/%d).\n", | ||
2485 | l->ref_cnt, | ||
2486 | btrfsic_get_block_type(state, l->block_ref_from), | ||
2487 | (unsigned long long)l->block_ref_from->logical_bytenr, | ||
2488 | l->block_ref_from->dev_state->name, | ||
2489 | (unsigned long long)l->block_ref_from->dev_bytenr, | ||
2490 | l->block_ref_from->mirror_num, | ||
2491 | btrfsic_get_block_type(state, l->block_ref_to), | ||
2492 | (unsigned long long)l->block_ref_to->logical_bytenr, | ||
2493 | l->block_ref_to->dev_state->name, | ||
2494 | (unsigned long long)l->block_ref_to->dev_bytenr, | ||
2495 | l->block_ref_to->mirror_num); | ||
2496 | } | ||
2497 | |||
2498 | static char btrfsic_get_block_type(const struct btrfsic_state *state, | ||
2499 | const struct btrfsic_block *block) | ||
2500 | { | ||
2501 | if (block->is_superblock && | ||
2502 | state->latest_superblock->dev_bytenr == block->dev_bytenr && | ||
2503 | state->latest_superblock->dev_state->bdev == block->dev_state->bdev) | ||
2504 | return 'S'; | ||
2505 | else if (block->is_superblock) | ||
2506 | return 's'; | ||
2507 | else if (block->is_metadata) | ||
2508 | return 'M'; | ||
2509 | else | ||
2510 | return 'D'; | ||
2511 | } | ||
2512 | |||
2513 | static void btrfsic_dump_tree(const struct btrfsic_state *state) | ||
2514 | { | ||
2515 | btrfsic_dump_tree_sub(state, state->latest_superblock, 0); | ||
2516 | } | ||
2517 | |||
2518 | static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, | ||
2519 | const struct btrfsic_block *block, | ||
2520 | int indent_level) | ||
2521 | { | ||
2522 | struct list_head *elem_ref_to; | ||
2523 | int indent_add; | ||
2524 | static char buf[80]; | ||
2525 | int cursor_position; | ||
2526 | |||
2527 | /* | ||
2528 | * Should better fill an on-stack buffer with a complete line and | ||
2529 | * dump it at once when it is time to print a newline character. | ||
2530 | */ | ||
2531 | |||
2532 | /* | ||
2533 | * This algorithm is recursive because the amount of used stack space | ||
2534 | * is very small and the max recursion depth is limited. | ||
2535 | */ | ||
2536 | indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)", | ||
2537 | btrfsic_get_block_type(state, block), | ||
2538 | (unsigned long long)block->logical_bytenr, | ||
2539 | block->dev_state->name, | ||
2540 | (unsigned long long)block->dev_bytenr, | ||
2541 | block->mirror_num); | ||
2542 | if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { | ||
2543 | printk("[...]\n"); | ||
2544 | return; | ||
2545 | } | ||
2546 | printk(buf); | ||
2547 | indent_level += indent_add; | ||
2548 | if (list_empty(&block->ref_to_list)) { | ||
2549 | printk("\n"); | ||
2550 | return; | ||
2551 | } | ||
2552 | if (block->mirror_num > 1 && | ||
2553 | !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) { | ||
2554 | printk(" [...]\n"); | ||
2555 | return; | ||
2556 | } | ||
2557 | |||
2558 | cursor_position = indent_level; | ||
2559 | list_for_each(elem_ref_to, &block->ref_to_list) { | ||
2560 | const struct btrfsic_block_link *const l = | ||
2561 | list_entry(elem_ref_to, struct btrfsic_block_link, | ||
2562 | node_ref_to); | ||
2563 | |||
2564 | while (cursor_position < indent_level) { | ||
2565 | printk(" "); | ||
2566 | cursor_position++; | ||
2567 | } | ||
2568 | if (l->ref_cnt > 1) | ||
2569 | indent_add = sprintf(buf, " %d*--> ", l->ref_cnt); | ||
2570 | else | ||
2571 | indent_add = sprintf(buf, " --> "); | ||
2572 | if (indent_level + indent_add > | ||
2573 | BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { | ||
2574 | printk("[...]\n"); | ||
2575 | cursor_position = 0; | ||
2576 | continue; | ||
2577 | } | ||
2578 | |||
2579 | printk(buf); | ||
2580 | |||
2581 | btrfsic_dump_tree_sub(state, l->block_ref_to, | ||
2582 | indent_level + indent_add); | ||
2583 | cursor_position = 0; | ||
2584 | } | ||
2585 | } | ||
2586 | |||
2587 | static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( | ||
2588 | struct btrfsic_state *state, | ||
2589 | struct btrfsic_block_data_ctx *next_block_ctx, | ||
2590 | struct btrfsic_block *next_block, | ||
2591 | struct btrfsic_block *from_block, | ||
2592 | u64 parent_generation) | ||
2593 | { | ||
2594 | struct btrfsic_block_link *l; | ||
2595 | |||
2596 | l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev, | ||
2597 | next_block_ctx->dev_bytenr, | ||
2598 | from_block->dev_state->bdev, | ||
2599 | from_block->dev_bytenr, | ||
2600 | &state->block_link_hashtable); | ||
2601 | if (NULL == l) { | ||
2602 | l = btrfsic_block_link_alloc(); | ||
2603 | if (NULL == l) { | ||
2604 | printk(KERN_INFO | ||
2605 | "btrfsic: error, kmalloc" " failed!\n"); | ||
2606 | return NULL; | ||
2607 | } | ||
2608 | |||
2609 | l->block_ref_to = next_block; | ||
2610 | l->block_ref_from = from_block; | ||
2611 | l->ref_cnt = 1; | ||
2612 | l->parent_generation = parent_generation; | ||
2613 | |||
2614 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
2615 | btrfsic_print_add_link(state, l); | ||
2616 | |||
2617 | list_add(&l->node_ref_to, &from_block->ref_to_list); | ||
2618 | list_add(&l->node_ref_from, &next_block->ref_from_list); | ||
2619 | |||
2620 | btrfsic_block_link_hashtable_add(l, | ||
2621 | &state->block_link_hashtable); | ||
2622 | } else { | ||
2623 | l->ref_cnt++; | ||
2624 | l->parent_generation = parent_generation; | ||
2625 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
2626 | btrfsic_print_add_link(state, l); | ||
2627 | } | ||
2628 | |||
2629 | return l; | ||
2630 | } | ||
2631 | |||
2632 | static struct btrfsic_block *btrfsic_block_lookup_or_add( | ||
2633 | struct btrfsic_state *state, | ||
2634 | struct btrfsic_block_data_ctx *block_ctx, | ||
2635 | const char *additional_string, | ||
2636 | int is_metadata, | ||
2637 | int is_iodone, | ||
2638 | int never_written, | ||
2639 | int mirror_num, | ||
2640 | int *was_created) | ||
2641 | { | ||
2642 | struct btrfsic_block *block; | ||
2643 | |||
2644 | block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev, | ||
2645 | block_ctx->dev_bytenr, | ||
2646 | &state->block_hashtable); | ||
2647 | if (NULL == block) { | ||
2648 | struct btrfsic_dev_state *dev_state; | ||
2649 | |||
2650 | block = btrfsic_block_alloc(); | ||
2651 | if (NULL == block) { | ||
2652 | printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); | ||
2653 | return NULL; | ||
2654 | } | ||
2655 | dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev); | ||
2656 | if (NULL == dev_state) { | ||
2657 | printk(KERN_INFO | ||
2658 | "btrfsic: error, lookup dev_state failed!\n"); | ||
2659 | btrfsic_block_free(block); | ||
2660 | return NULL; | ||
2661 | } | ||
2662 | block->dev_state = dev_state; | ||
2663 | block->dev_bytenr = block_ctx->dev_bytenr; | ||
2664 | block->logical_bytenr = block_ctx->start; | ||
2665 | block->is_metadata = is_metadata; | ||
2666 | block->is_iodone = is_iodone; | ||
2667 | block->never_written = never_written; | ||
2668 | block->mirror_num = mirror_num; | ||
2669 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
2670 | printk(KERN_INFO | ||
2671 | "New %s%c-block @%llu (%s/%llu/%d)\n", | ||
2672 | additional_string, | ||
2673 | btrfsic_get_block_type(state, block), | ||
2674 | (unsigned long long)block->logical_bytenr, | ||
2675 | dev_state->name, | ||
2676 | (unsigned long long)block->dev_bytenr, | ||
2677 | mirror_num); | ||
2678 | list_add(&block->all_blocks_node, &state->all_blocks_list); | ||
2679 | btrfsic_block_hashtable_add(block, &state->block_hashtable); | ||
2680 | if (NULL != was_created) | ||
2681 | *was_created = 1; | ||
2682 | } else { | ||
2683 | if (NULL != was_created) | ||
2684 | *was_created = 0; | ||
2685 | } | ||
2686 | |||
2687 | return block; | ||
2688 | } | ||
2689 | |||
2690 | static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, | ||
2691 | u64 bytenr, | ||
2692 | struct btrfsic_dev_state *dev_state, | ||
2693 | u64 dev_bytenr, char *data) | ||
2694 | { | ||
2695 | int num_copies; | ||
2696 | int mirror_num; | ||
2697 | int ret; | ||
2698 | struct btrfsic_block_data_ctx block_ctx; | ||
2699 | int match = 0; | ||
2700 | |||
2701 | num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, | ||
2702 | bytenr, PAGE_SIZE); | ||
2703 | |||
2704 | for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { | ||
2705 | ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, | ||
2706 | &block_ctx, mirror_num); | ||
2707 | if (ret) { | ||
2708 | printk(KERN_INFO "btrfsic:" | ||
2709 | " btrfsic_map_block(logical @%llu," | ||
2710 | " mirror %d) failed!\n", | ||
2711 | (unsigned long long)bytenr, mirror_num); | ||
2712 | continue; | ||
2713 | } | ||
2714 | |||
2715 | if (dev_state->bdev == block_ctx.dev->bdev && | ||
2716 | dev_bytenr == block_ctx.dev_bytenr) { | ||
2717 | match++; | ||
2718 | btrfsic_release_block_ctx(&block_ctx); | ||
2719 | break; | ||
2720 | } | ||
2721 | btrfsic_release_block_ctx(&block_ctx); | ||
2722 | } | ||
2723 | |||
2724 | if (!match) { | ||
2725 | printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio," | ||
2726 | " buffer->log_bytenr=%llu, submit_bio(bdev=%s," | ||
2727 | " phys_bytenr=%llu)!\n", | ||
2728 | (unsigned long long)bytenr, dev_state->name, | ||
2729 | (unsigned long long)dev_bytenr); | ||
2730 | for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { | ||
2731 | ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, | ||
2732 | &block_ctx, mirror_num); | ||
2733 | if (ret) | ||
2734 | continue; | ||
2735 | |||
2736 | printk(KERN_INFO "Read logical bytenr @%llu maps to" | ||
2737 | " (%s/%llu/%d)\n", | ||
2738 | (unsigned long long)bytenr, | ||
2739 | block_ctx.dev->name, | ||
2740 | (unsigned long long)block_ctx.dev_bytenr, | ||
2741 | mirror_num); | ||
2742 | } | ||
2743 | WARN_ON(1); | ||
2744 | } | ||
2745 | } | ||
2746 | |||
2747 | static struct btrfsic_dev_state *btrfsic_dev_state_lookup( | ||
2748 | struct block_device *bdev) | ||
2749 | { | ||
2750 | struct btrfsic_dev_state *ds; | ||
2751 | |||
2752 | ds = btrfsic_dev_state_hashtable_lookup(bdev, | ||
2753 | &btrfsic_dev_state_hashtable); | ||
2754 | return ds; | ||
2755 | } | ||
2756 | |||
2757 | int btrfsic_submit_bh(int rw, struct buffer_head *bh) | ||
2758 | { | ||
2759 | struct btrfsic_dev_state *dev_state; | ||
2760 | |||
2761 | if (!btrfsic_is_initialized) | ||
2762 | return submit_bh(rw, bh); | ||
2763 | |||
2764 | mutex_lock(&btrfsic_mutex); | ||
2765 | /* since btrfsic_submit_bh() might also be called before | ||
2766 | * btrfsic_mount(), this might return NULL */ | ||
2767 | dev_state = btrfsic_dev_state_lookup(bh->b_bdev); | ||
2768 | |||
2769 | /* Only called to write the superblock (incl. FLUSH/FUA) */ | ||
2770 | if (NULL != dev_state && | ||
2771 | (rw & WRITE) && bh->b_size > 0) { | ||
2772 | u64 dev_bytenr; | ||
2773 | |||
2774 | dev_bytenr = 4096 * bh->b_blocknr; | ||
2775 | if (dev_state->state->print_mask & | ||
2776 | BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) | ||
2777 | printk(KERN_INFO | ||
2778 | "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu)," | ||
2779 | " size=%lu, data=%p, bdev=%p)\n", | ||
2780 | rw, bh->b_blocknr, | ||
2781 | (unsigned long long)dev_bytenr, bh->b_size, | ||
2782 | bh->b_data, bh->b_bdev); | ||
2783 | btrfsic_process_written_block(dev_state, dev_bytenr, | ||
2784 | bh->b_data, bh->b_size, NULL, | ||
2785 | NULL, bh, rw); | ||
2786 | } else if (NULL != dev_state && (rw & REQ_FLUSH)) { | ||
2787 | if (dev_state->state->print_mask & | ||
2788 | BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) | ||
2789 | printk(KERN_INFO | ||
2790 | "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n", | ||
2791 | rw, bh->b_bdev); | ||
2792 | if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { | ||
2793 | if ((dev_state->state->print_mask & | ||
2794 | (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | | ||
2795 | BTRFSIC_PRINT_MASK_VERBOSE))) | ||
2796 | printk(KERN_INFO | ||
2797 | "btrfsic_submit_bh(%s) with FLUSH" | ||
2798 | " but dummy block already in use" | ||
2799 | " (ignored)!\n", | ||
2800 | dev_state->name); | ||
2801 | } else { | ||
2802 | struct btrfsic_block *const block = | ||
2803 | &dev_state->dummy_block_for_bio_bh_flush; | ||
2804 | |||
2805 | block->is_iodone = 0; | ||
2806 | block->never_written = 0; | ||
2807 | block->iodone_w_error = 0; | ||
2808 | block->flush_gen = dev_state->last_flush_gen + 1; | ||
2809 | block->submit_bio_bh_rw = rw; | ||
2810 | block->orig_bio_bh_private = bh->b_private; | ||
2811 | block->orig_bio_bh_end_io.bh = bh->b_end_io; | ||
2812 | block->next_in_same_bio = NULL; | ||
2813 | bh->b_private = block; | ||
2814 | bh->b_end_io = btrfsic_bh_end_io; | ||
2815 | } | ||
2816 | } | ||
2817 | mutex_unlock(&btrfsic_mutex); | ||
2818 | return submit_bh(rw, bh); | ||
2819 | } | ||
2820 | |||
2821 | void btrfsic_submit_bio(int rw, struct bio *bio) | ||
2822 | { | ||
2823 | struct btrfsic_dev_state *dev_state; | ||
2824 | |||
2825 | if (!btrfsic_is_initialized) { | ||
2826 | submit_bio(rw, bio); | ||
2827 | return; | ||
2828 | } | ||
2829 | |||
2830 | mutex_lock(&btrfsic_mutex); | ||
2831 | /* since btrfsic_submit_bio() is also called before | ||
2832 | * btrfsic_mount(), this might return NULL */ | ||
2833 | dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); | ||
2834 | if (NULL != dev_state && | ||
2835 | (rw & WRITE) && NULL != bio->bi_io_vec) { | ||
2836 | unsigned int i; | ||
2837 | u64 dev_bytenr; | ||
2838 | int bio_is_patched; | ||
2839 | |||
2840 | dev_bytenr = 512 * bio->bi_sector; | ||
2841 | bio_is_patched = 0; | ||
2842 | if (dev_state->state->print_mask & | ||
2843 | BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) | ||
2844 | printk(KERN_INFO | ||
2845 | "submit_bio(rw=0x%x, bi_vcnt=%u," | ||
2846 | " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n", | ||
2847 | rw, bio->bi_vcnt, bio->bi_sector, | ||
2848 | (unsigned long long)dev_bytenr, | ||
2849 | bio->bi_bdev); | ||
2850 | |||
2851 | for (i = 0; i < bio->bi_vcnt; i++) { | ||
2852 | u8 *mapped_data; | ||
2853 | |||
2854 | mapped_data = kmap(bio->bi_io_vec[i].bv_page); | ||
2855 | if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | | ||
2856 | BTRFSIC_PRINT_MASK_VERBOSE) == | ||
2857 | (dev_state->state->print_mask & | ||
2858 | (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | | ||
2859 | BTRFSIC_PRINT_MASK_VERBOSE))) | ||
2860 | printk(KERN_INFO | ||
2861 | "#%u: page=%p, mapped=%p, len=%u," | ||
2862 | " offset=%u\n", | ||
2863 | i, bio->bi_io_vec[i].bv_page, | ||
2864 | mapped_data, | ||
2865 | bio->bi_io_vec[i].bv_len, | ||
2866 | bio->bi_io_vec[i].bv_offset); | ||
2867 | btrfsic_process_written_block(dev_state, dev_bytenr, | ||
2868 | mapped_data, | ||
2869 | bio->bi_io_vec[i].bv_len, | ||
2870 | bio, &bio_is_patched, | ||
2871 | NULL, rw); | ||
2872 | kunmap(bio->bi_io_vec[i].bv_page); | ||
2873 | dev_bytenr += bio->bi_io_vec[i].bv_len; | ||
2874 | } | ||
2875 | } else if (NULL != dev_state && (rw & REQ_FLUSH)) { | ||
2876 | if (dev_state->state->print_mask & | ||
2877 | BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) | ||
2878 | printk(KERN_INFO | ||
2879 | "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n", | ||
2880 | rw, bio->bi_bdev); | ||
2881 | if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { | ||
2882 | if ((dev_state->state->print_mask & | ||
2883 | (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | | ||
2884 | BTRFSIC_PRINT_MASK_VERBOSE))) | ||
2885 | printk(KERN_INFO | ||
2886 | "btrfsic_submit_bio(%s) with FLUSH" | ||
2887 | " but dummy block already in use" | ||
2888 | " (ignored)!\n", | ||
2889 | dev_state->name); | ||
2890 | } else { | ||
2891 | struct btrfsic_block *const block = | ||
2892 | &dev_state->dummy_block_for_bio_bh_flush; | ||
2893 | |||
2894 | block->is_iodone = 0; | ||
2895 | block->never_written = 0; | ||
2896 | block->iodone_w_error = 0; | ||
2897 | block->flush_gen = dev_state->last_flush_gen + 1; | ||
2898 | block->submit_bio_bh_rw = rw; | ||
2899 | block->orig_bio_bh_private = bio->bi_private; | ||
2900 | block->orig_bio_bh_end_io.bio = bio->bi_end_io; | ||
2901 | block->next_in_same_bio = NULL; | ||
2902 | bio->bi_private = block; | ||
2903 | bio->bi_end_io = btrfsic_bio_end_io; | ||
2904 | } | ||
2905 | } | ||
2906 | mutex_unlock(&btrfsic_mutex); | ||
2907 | |||
2908 | submit_bio(rw, bio); | ||
2909 | } | ||
2910 | |||
2911 | int btrfsic_mount(struct btrfs_root *root, | ||
2912 | struct btrfs_fs_devices *fs_devices, | ||
2913 | int including_extent_data, u32 print_mask) | ||
2914 | { | ||
2915 | int ret; | ||
2916 | struct btrfsic_state *state; | ||
2917 | struct list_head *dev_head = &fs_devices->devices; | ||
2918 | struct btrfs_device *device; | ||
2919 | |||
2920 | state = kzalloc(sizeof(*state), GFP_NOFS); | ||
2921 | if (NULL == state) { | ||
2922 | printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); | ||
2923 | return -1; | ||
2924 | } | ||
2925 | |||
2926 | if (!btrfsic_is_initialized) { | ||
2927 | mutex_init(&btrfsic_mutex); | ||
2928 | btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable); | ||
2929 | btrfsic_is_initialized = 1; | ||
2930 | } | ||
2931 | mutex_lock(&btrfsic_mutex); | ||
2932 | state->root = root; | ||
2933 | state->print_mask = print_mask; | ||
2934 | state->include_extent_data = including_extent_data; | ||
2935 | state->csum_size = 0; | ||
2936 | INIT_LIST_HEAD(&state->all_blocks_list); | ||
2937 | btrfsic_block_hashtable_init(&state->block_hashtable); | ||
2938 | btrfsic_block_link_hashtable_init(&state->block_link_hashtable); | ||
2939 | state->max_superblock_generation = 0; | ||
2940 | state->latest_superblock = NULL; | ||
2941 | |||
2942 | list_for_each_entry(device, dev_head, dev_list) { | ||
2943 | struct btrfsic_dev_state *ds; | ||
2944 | char *p; | ||
2945 | |||
2946 | if (!device->bdev || !device->name) | ||
2947 | continue; | ||
2948 | |||
2949 | ds = btrfsic_dev_state_alloc(); | ||
2950 | if (NULL == ds) { | ||
2951 | printk(KERN_INFO | ||
2952 | "btrfs check-integrity: kmalloc() failed!\n"); | ||
2953 | mutex_unlock(&btrfsic_mutex); | ||
2954 | return -1; | ||
2955 | } | ||
2956 | ds->bdev = device->bdev; | ||
2957 | ds->state = state; | ||
2958 | bdevname(ds->bdev, ds->name); | ||
2959 | ds->name[BDEVNAME_SIZE - 1] = '\0'; | ||
2960 | for (p = ds->name; *p != '\0'; p++); | ||
2961 | while (p > ds->name && *p != '/') | ||
2962 | p--; | ||
2963 | if (*p == '/') | ||
2964 | p++; | ||
2965 | strlcpy(ds->name, p, sizeof(ds->name)); | ||
2966 | btrfsic_dev_state_hashtable_add(ds, | ||
2967 | &btrfsic_dev_state_hashtable); | ||
2968 | } | ||
2969 | |||
2970 | ret = btrfsic_process_superblock(state, fs_devices); | ||
2971 | if (0 != ret) { | ||
2972 | mutex_unlock(&btrfsic_mutex); | ||
2973 | btrfsic_unmount(root, fs_devices); | ||
2974 | return ret; | ||
2975 | } | ||
2976 | |||
2977 | if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE) | ||
2978 | btrfsic_dump_database(state); | ||
2979 | if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE) | ||
2980 | btrfsic_dump_tree(state); | ||
2981 | |||
2982 | mutex_unlock(&btrfsic_mutex); | ||
2983 | return 0; | ||
2984 | } | ||
2985 | |||
2986 | void btrfsic_unmount(struct btrfs_root *root, | ||
2987 | struct btrfs_fs_devices *fs_devices) | ||
2988 | { | ||
2989 | struct list_head *elem_all; | ||
2990 | struct list_head *tmp_all; | ||
2991 | struct btrfsic_state *state; | ||
2992 | struct list_head *dev_head = &fs_devices->devices; | ||
2993 | struct btrfs_device *device; | ||
2994 | |||
2995 | if (!btrfsic_is_initialized) | ||
2996 | return; | ||
2997 | |||
2998 | mutex_lock(&btrfsic_mutex); | ||
2999 | |||
3000 | state = NULL; | ||
3001 | list_for_each_entry(device, dev_head, dev_list) { | ||
3002 | struct btrfsic_dev_state *ds; | ||
3003 | |||
3004 | if (!device->bdev || !device->name) | ||
3005 | continue; | ||
3006 | |||
3007 | ds = btrfsic_dev_state_hashtable_lookup( | ||
3008 | device->bdev, | ||
3009 | &btrfsic_dev_state_hashtable); | ||
3010 | if (NULL != ds) { | ||
3011 | state = ds->state; | ||
3012 | btrfsic_dev_state_hashtable_remove(ds); | ||
3013 | btrfsic_dev_state_free(ds); | ||
3014 | } | ||
3015 | } | ||
3016 | |||
3017 | if (NULL == state) { | ||
3018 | printk(KERN_INFO | ||
3019 | "btrfsic: error, cannot find state information" | ||
3020 | " on umount!\n"); | ||
3021 | mutex_unlock(&btrfsic_mutex); | ||
3022 | return; | ||
3023 | } | ||
3024 | |||
3025 | /* | ||
3026 | * Don't care about keeping the lists' state up to date, | ||
3027 | * just free all memory that was allocated dynamically. | ||
3028 | * Free the blocks and the block_links. | ||
3029 | */ | ||
3030 | list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) { | ||
3031 | struct btrfsic_block *const b_all = | ||
3032 | list_entry(elem_all, struct btrfsic_block, | ||
3033 | all_blocks_node); | ||
3034 | struct list_head *elem_ref_to; | ||
3035 | struct list_head *tmp_ref_to; | ||
3036 | |||
3037 | list_for_each_safe(elem_ref_to, tmp_ref_to, | ||
3038 | &b_all->ref_to_list) { | ||
3039 | struct btrfsic_block_link *const l = | ||
3040 | list_entry(elem_ref_to, | ||
3041 | struct btrfsic_block_link, | ||
3042 | node_ref_to); | ||
3043 | |||
3044 | if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) | ||
3045 | btrfsic_print_rem_link(state, l); | ||
3046 | |||
3047 | l->ref_cnt--; | ||
3048 | if (0 == l->ref_cnt) | ||
3049 | btrfsic_block_link_free(l); | ||
3050 | } | ||
3051 | |||
3052 | if (b_all->is_iodone) | ||
3053 | btrfsic_block_free(b_all); | ||
3054 | else | ||
3055 | printk(KERN_INFO "btrfs: attempt to free %c-block" | ||
3056 | " @%llu (%s/%llu/%d) on umount which is" | ||
3057 | " not yet iodone!\n", | ||
3058 | btrfsic_get_block_type(state, b_all), | ||
3059 | (unsigned long long)b_all->logical_bytenr, | ||
3060 | b_all->dev_state->name, | ||
3061 | (unsigned long long)b_all->dev_bytenr, | ||
3062 | b_all->mirror_num); | ||
3063 | } | ||
3064 | |||
3065 | mutex_unlock(&btrfsic_mutex); | ||
3066 | |||
3067 | kfree(state); | ||
3068 | } | ||
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h new file mode 100644 index 000000000000..8b59175cc502 --- /dev/null +++ b/fs/btrfs/check-integrity.h | |||
@@ -0,0 +1,36 @@ | |||
1 | /* | ||
2 | * Copyright (C) STRATO AG 2011. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 02111-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #if !defined(__BTRFS_CHECK_INTEGRITY__) | ||
20 | #define __BTRFS_CHECK_INTEGRITY__ | ||
21 | |||
22 | #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY | ||
23 | int btrfsic_submit_bh(int rw, struct buffer_head *bh); | ||
24 | void btrfsic_submit_bio(int rw, struct bio *bio); | ||
25 | #else | ||
26 | #define btrfsic_submit_bh submit_bh | ||
27 | #define btrfsic_submit_bio submit_bio | ||
28 | #endif | ||
29 | |||
30 | int btrfsic_mount(struct btrfs_root *root, | ||
31 | struct btrfs_fs_devices *fs_devices, | ||
32 | int including_extent_data, u32 print_mask); | ||
33 | void btrfsic_unmount(struct btrfs_root *root, | ||
34 | struct btrfs_fs_devices *fs_devices); | ||
35 | |||
36 | #endif | ||
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dede441bdeee..0639a555e16e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c | |||
@@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, | |||
240 | 240 | ||
241 | cow = btrfs_alloc_free_block(trans, root, buf->len, 0, | 241 | cow = btrfs_alloc_free_block(trans, root, buf->len, 0, |
242 | new_root_objectid, &disk_key, level, | 242 | new_root_objectid, &disk_key, level, |
243 | buf->start, 0); | 243 | buf->start, 0, 1); |
244 | if (IS_ERR(cow)) | 244 | if (IS_ERR(cow)) |
245 | return PTR_ERR(cow); | 245 | return PTR_ERR(cow); |
246 | 246 | ||
@@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, | |||
261 | 261 | ||
262 | WARN_ON(btrfs_header_generation(buf) > trans->transid); | 262 | WARN_ON(btrfs_header_generation(buf) > trans->transid); |
263 | if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) | 263 | if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) |
264 | ret = btrfs_inc_ref(trans, root, cow, 1); | 264 | ret = btrfs_inc_ref(trans, root, cow, 1, 1); |
265 | else | 265 | else |
266 | ret = btrfs_inc_ref(trans, root, cow, 0); | 266 | ret = btrfs_inc_ref(trans, root, cow, 0, 1); |
267 | 267 | ||
268 | if (ret) | 268 | if (ret) |
269 | return ret; | 269 | return ret; |
@@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, | |||
350 | if ((owner == root->root_key.objectid || | 350 | if ((owner == root->root_key.objectid || |
351 | root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && | 351 | root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && |
352 | !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { | 352 | !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { |
353 | ret = btrfs_inc_ref(trans, root, buf, 1); | 353 | ret = btrfs_inc_ref(trans, root, buf, 1, 1); |
354 | BUG_ON(ret); | 354 | BUG_ON(ret); |
355 | 355 | ||
356 | if (root->root_key.objectid == | 356 | if (root->root_key.objectid == |
357 | BTRFS_TREE_RELOC_OBJECTID) { | 357 | BTRFS_TREE_RELOC_OBJECTID) { |
358 | ret = btrfs_dec_ref(trans, root, buf, 0); | 358 | ret = btrfs_dec_ref(trans, root, buf, 0, 1); |
359 | BUG_ON(ret); | 359 | BUG_ON(ret); |
360 | ret = btrfs_inc_ref(trans, root, cow, 1); | 360 | ret = btrfs_inc_ref(trans, root, cow, 1, 1); |
361 | BUG_ON(ret); | 361 | BUG_ON(ret); |
362 | } | 362 | } |
363 | new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; | 363 | new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; |
@@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, | |||
365 | 365 | ||
366 | if (root->root_key.objectid == | 366 | if (root->root_key.objectid == |
367 | BTRFS_TREE_RELOC_OBJECTID) | 367 | BTRFS_TREE_RELOC_OBJECTID) |
368 | ret = btrfs_inc_ref(trans, root, cow, 1); | 368 | ret = btrfs_inc_ref(trans, root, cow, 1, 1); |
369 | else | 369 | else |
370 | ret = btrfs_inc_ref(trans, root, cow, 0); | 370 | ret = btrfs_inc_ref(trans, root, cow, 0, 1); |
371 | BUG_ON(ret); | 371 | BUG_ON(ret); |
372 | } | 372 | } |
373 | if (new_flags != 0) { | 373 | if (new_flags != 0) { |
@@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, | |||
381 | if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { | 381 | if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { |
382 | if (root->root_key.objectid == | 382 | if (root->root_key.objectid == |
383 | BTRFS_TREE_RELOC_OBJECTID) | 383 | BTRFS_TREE_RELOC_OBJECTID) |
384 | ret = btrfs_inc_ref(trans, root, cow, 1); | 384 | ret = btrfs_inc_ref(trans, root, cow, 1, 1); |
385 | else | 385 | else |
386 | ret = btrfs_inc_ref(trans, root, cow, 0); | 386 | ret = btrfs_inc_ref(trans, root, cow, 0, 1); |
387 | BUG_ON(ret); | 387 | BUG_ON(ret); |
388 | ret = btrfs_dec_ref(trans, root, buf, 1); | 388 | ret = btrfs_dec_ref(trans, root, buf, 1, 1); |
389 | BUG_ON(ret); | 389 | BUG_ON(ret); |
390 | } | 390 | } |
391 | clean_tree_block(trans, root, buf); | 391 | clean_tree_block(trans, root, buf); |
@@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, | |||
446 | 446 | ||
447 | cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, | 447 | cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, |
448 | root->root_key.objectid, &disk_key, | 448 | root->root_key.objectid, &disk_key, |
449 | level, search_start, empty_size); | 449 | level, search_start, empty_size, 1); |
450 | if (IS_ERR(cow)) | 450 | if (IS_ERR(cow)) |
451 | return PTR_ERR(cow); | 451 | return PTR_ERR(cow); |
452 | 452 | ||
@@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, | |||
484 | rcu_assign_pointer(root->node, cow); | 484 | rcu_assign_pointer(root->node, cow); |
485 | 485 | ||
486 | btrfs_free_tree_block(trans, root, buf, parent_start, | 486 | btrfs_free_tree_block(trans, root, buf, parent_start, |
487 | last_ref); | 487 | last_ref, 1); |
488 | free_extent_buffer(buf); | 488 | free_extent_buffer(buf); |
489 | add_root_to_dirty_list(root); | 489 | add_root_to_dirty_list(root); |
490 | } else { | 490 | } else { |
@@ -500,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, | |||
500 | trans->transid); | 500 | trans->transid); |
501 | btrfs_mark_buffer_dirty(parent); | 501 | btrfs_mark_buffer_dirty(parent); |
502 | btrfs_free_tree_block(trans, root, buf, parent_start, | 502 | btrfs_free_tree_block(trans, root, buf, parent_start, |
503 | last_ref); | 503 | last_ref, 1); |
504 | } | 504 | } |
505 | if (unlock_orig) | 505 | if (unlock_orig) |
506 | btrfs_tree_unlock(buf); | 506 | btrfs_tree_unlock(buf); |
@@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
957 | free_extent_buffer(mid); | 957 | free_extent_buffer(mid); |
958 | 958 | ||
959 | root_sub_used(root, mid->len); | 959 | root_sub_used(root, mid->len); |
960 | btrfs_free_tree_block(trans, root, mid, 0, 1); | 960 | btrfs_free_tree_block(trans, root, mid, 0, 1, 0); |
961 | /* once for the root ptr */ | 961 | /* once for the root ptr */ |
962 | free_extent_buffer(mid); | 962 | free_extent_buffer(mid); |
963 | return 0; | 963 | return 0; |
@@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
1015 | if (wret) | 1015 | if (wret) |
1016 | ret = wret; | 1016 | ret = wret; |
1017 | root_sub_used(root, right->len); | 1017 | root_sub_used(root, right->len); |
1018 | btrfs_free_tree_block(trans, root, right, 0, 1); | 1018 | btrfs_free_tree_block(trans, root, right, 0, 1, 0); |
1019 | free_extent_buffer(right); | 1019 | free_extent_buffer(right); |
1020 | right = NULL; | 1020 | right = NULL; |
1021 | } else { | 1021 | } else { |
@@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
1055 | if (wret) | 1055 | if (wret) |
1056 | ret = wret; | 1056 | ret = wret; |
1057 | root_sub_used(root, mid->len); | 1057 | root_sub_used(root, mid->len); |
1058 | btrfs_free_tree_block(trans, root, mid, 0, 1); | 1058 | btrfs_free_tree_block(trans, root, mid, 0, 1, 0); |
1059 | free_extent_buffer(mid); | 1059 | free_extent_buffer(mid); |
1060 | mid = NULL; | 1060 | mid = NULL; |
1061 | } else { | 1061 | } else { |
@@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, | |||
2089 | 2089 | ||
2090 | c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, | 2090 | c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, |
2091 | root->root_key.objectid, &lower_key, | 2091 | root->root_key.objectid, &lower_key, |
2092 | level, root->node->start, 0); | 2092 | level, root->node->start, 0, 0); |
2093 | if (IS_ERR(c)) | 2093 | if (IS_ERR(c)) |
2094 | return PTR_ERR(c); | 2094 | return PTR_ERR(c); |
2095 | 2095 | ||
@@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, | |||
2216 | 2216 | ||
2217 | split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, | 2217 | split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, |
2218 | root->root_key.objectid, | 2218 | root->root_key.objectid, |
2219 | &disk_key, level, c->start, 0); | 2219 | &disk_key, level, c->start, 0, 0); |
2220 | if (IS_ERR(split)) | 2220 | if (IS_ERR(split)) |
2221 | return PTR_ERR(split); | 2221 | return PTR_ERR(split); |
2222 | 2222 | ||
@@ -2970,7 +2970,7 @@ again: | |||
2970 | 2970 | ||
2971 | right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, | 2971 | right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, |
2972 | root->root_key.objectid, | 2972 | root->root_key.objectid, |
2973 | &disk_key, 0, l->start, 0); | 2973 | &disk_key, 0, l->start, 0, 0); |
2974 | if (IS_ERR(right)) | 2974 | if (IS_ERR(right)) |
2975 | return PTR_ERR(right); | 2975 | return PTR_ERR(right); |
2976 | 2976 | ||
@@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, | |||
3781 | 3781 | ||
3782 | root_sub_used(root, leaf->len); | 3782 | root_sub_used(root, leaf->len); |
3783 | 3783 | ||
3784 | btrfs_free_tree_block(trans, root, leaf, 0, 1); | 3784 | btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); |
3785 | return 0; | 3785 | return 0; |
3786 | } | 3786 | } |
3787 | /* | 3787 | /* |
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 67385033323d..27ebe61d3ccc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h | |||
@@ -86,6 +86,9 @@ struct btrfs_ordered_sum; | |||
86 | /* holds checksums of all the data extents */ | 86 | /* holds checksums of all the data extents */ |
87 | #define BTRFS_CSUM_TREE_OBJECTID 7ULL | 87 | #define BTRFS_CSUM_TREE_OBJECTID 7ULL |
88 | 88 | ||
89 | /* for storing balance parameters in the root tree */ | ||
90 | #define BTRFS_BALANCE_OBJECTID -4ULL | ||
91 | |||
89 | /* orhpan objectid for tracking unlinked/truncated files */ | 92 | /* orhpan objectid for tracking unlinked/truncated files */ |
90 | #define BTRFS_ORPHAN_OBJECTID -5ULL | 93 | #define BTRFS_ORPHAN_OBJECTID -5ULL |
91 | 94 | ||
@@ -692,6 +695,54 @@ struct btrfs_root_ref { | |||
692 | __le16 name_len; | 695 | __le16 name_len; |
693 | } __attribute__ ((__packed__)); | 696 | } __attribute__ ((__packed__)); |
694 | 697 | ||
698 | struct btrfs_disk_balance_args { | ||
699 | /* | ||
700 | * profiles to operate on, single is denoted by | ||
701 | * BTRFS_AVAIL_ALLOC_BIT_SINGLE | ||
702 | */ | ||
703 | __le64 profiles; | ||
704 | |||
705 | /* usage filter */ | ||
706 | __le64 usage; | ||
707 | |||
708 | /* devid filter */ | ||
709 | __le64 devid; | ||
710 | |||
711 | /* devid subset filter [pstart..pend) */ | ||
712 | __le64 pstart; | ||
713 | __le64 pend; | ||
714 | |||
715 | /* btrfs virtual address space subset filter [vstart..vend) */ | ||
716 | __le64 vstart; | ||
717 | __le64 vend; | ||
718 | |||
719 | /* | ||
720 | * profile to convert to, single is denoted by | ||
721 | * BTRFS_AVAIL_ALLOC_BIT_SINGLE | ||
722 | */ | ||
723 | __le64 target; | ||
724 | |||
725 | /* BTRFS_BALANCE_ARGS_* */ | ||
726 | __le64 flags; | ||
727 | |||
728 | __le64 unused[8]; | ||
729 | } __attribute__ ((__packed__)); | ||
730 | |||
731 | /* | ||
732 | * store balance parameters to disk so that balance can be properly | ||
733 | * resumed after crash or unmount | ||
734 | */ | ||
735 | struct btrfs_balance_item { | ||
736 | /* BTRFS_BALANCE_* */ | ||
737 | __le64 flags; | ||
738 | |||
739 | struct btrfs_disk_balance_args data; | ||
740 | struct btrfs_disk_balance_args meta; | ||
741 | struct btrfs_disk_balance_args sys; | ||
742 | |||
743 | __le64 unused[4]; | ||
744 | } __attribute__ ((__packed__)); | ||
745 | |||
695 | #define BTRFS_FILE_EXTENT_INLINE 0 | 746 | #define BTRFS_FILE_EXTENT_INLINE 0 |
696 | #define BTRFS_FILE_EXTENT_REG 1 | 747 | #define BTRFS_FILE_EXTENT_REG 1 |
697 | #define BTRFS_FILE_EXTENT_PREALLOC 2 | 748 | #define BTRFS_FILE_EXTENT_PREALLOC 2 |
@@ -751,14 +802,32 @@ struct btrfs_csum_item { | |||
751 | } __attribute__ ((__packed__)); | 802 | } __attribute__ ((__packed__)); |
752 | 803 | ||
753 | /* different types of block groups (and chunks) */ | 804 | /* different types of block groups (and chunks) */ |
754 | #define BTRFS_BLOCK_GROUP_DATA (1 << 0) | 805 | #define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) |
755 | #define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) | 806 | #define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) |
756 | #define BTRFS_BLOCK_GROUP_METADATA (1 << 2) | 807 | #define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) |
757 | #define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) | 808 | #define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) |
758 | #define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) | 809 | #define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) |
759 | #define BTRFS_BLOCK_GROUP_DUP (1 << 5) | 810 | #define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) |
760 | #define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) | 811 | #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) |
761 | #define BTRFS_NR_RAID_TYPES 5 | 812 | #define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE |
813 | #define BTRFS_NR_RAID_TYPES 5 | ||
814 | |||
815 | #define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ | ||
816 | BTRFS_BLOCK_GROUP_SYSTEM | \ | ||
817 | BTRFS_BLOCK_GROUP_METADATA) | ||
818 | |||
819 | #define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ | ||
820 | BTRFS_BLOCK_GROUP_RAID1 | \ | ||
821 | BTRFS_BLOCK_GROUP_DUP | \ | ||
822 | BTRFS_BLOCK_GROUP_RAID10) | ||
823 | /* | ||
824 | * We need a bit for restriper to be able to tell when chunks of type | ||
825 | * SINGLE are available. This "extended" profile format is used in | ||
826 | * fs_info->avail_*_alloc_bits (in-memory) and balance item fields | ||
827 | * (on-disk). The corresponding on-disk bit in chunk.type is reserved | ||
828 | * to avoid remappings between two formats in future. | ||
829 | */ | ||
830 | #define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) | ||
762 | 831 | ||
763 | struct btrfs_block_group_item { | 832 | struct btrfs_block_group_item { |
764 | __le64 used; | 833 | __le64 used; |
@@ -916,6 +985,7 @@ struct btrfs_block_group_cache { | |||
916 | struct reloc_control; | 985 | struct reloc_control; |
917 | struct btrfs_device; | 986 | struct btrfs_device; |
918 | struct btrfs_fs_devices; | 987 | struct btrfs_fs_devices; |
988 | struct btrfs_balance_control; | ||
919 | struct btrfs_delayed_root; | 989 | struct btrfs_delayed_root; |
920 | struct btrfs_fs_info { | 990 | struct btrfs_fs_info { |
921 | u8 fsid[BTRFS_FSID_SIZE]; | 991 | u8 fsid[BTRFS_FSID_SIZE]; |
@@ -971,7 +1041,7 @@ struct btrfs_fs_info { | |||
971 | * is required instead of the faster short fsync log commits | 1041 | * is required instead of the faster short fsync log commits |
972 | */ | 1042 | */ |
973 | u64 last_trans_log_full_commit; | 1043 | u64 last_trans_log_full_commit; |
974 | unsigned long mount_opt:20; | 1044 | unsigned long mount_opt:21; |
975 | unsigned long compress_type:4; | 1045 | unsigned long compress_type:4; |
976 | u64 max_inline; | 1046 | u64 max_inline; |
977 | u64 alloc_start; | 1047 | u64 alloc_start; |
@@ -1132,12 +1202,23 @@ struct btrfs_fs_info { | |||
1132 | spinlock_t ref_cache_lock; | 1202 | spinlock_t ref_cache_lock; |
1133 | u64 total_ref_cache_size; | 1203 | u64 total_ref_cache_size; |
1134 | 1204 | ||
1205 | /* | ||
1206 | * these three are in extended format (availability of single | ||
1207 | * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other | ||
1208 | * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) | ||
1209 | */ | ||
1135 | u64 avail_data_alloc_bits; | 1210 | u64 avail_data_alloc_bits; |
1136 | u64 avail_metadata_alloc_bits; | 1211 | u64 avail_metadata_alloc_bits; |
1137 | u64 avail_system_alloc_bits; | 1212 | u64 avail_system_alloc_bits; |
1138 | u64 data_alloc_profile; | 1213 | |
1139 | u64 metadata_alloc_profile; | 1214 | /* restriper state */ |
1140 | u64 system_alloc_profile; | 1215 | spinlock_t balance_lock; |
1216 | struct mutex balance_mutex; | ||
1217 | atomic_t balance_running; | ||
1218 | atomic_t balance_pause_req; | ||
1219 | atomic_t balance_cancel_req; | ||
1220 | struct btrfs_balance_control *balance_ctl; | ||
1221 | wait_queue_head_t balance_wait_q; | ||
1141 | 1222 | ||
1142 | unsigned data_chunk_allocations; | 1223 | unsigned data_chunk_allocations; |
1143 | unsigned metadata_ratio; | 1224 | unsigned metadata_ratio; |
@@ -1155,6 +1236,10 @@ struct btrfs_fs_info { | |||
1155 | int scrub_workers_refcnt; | 1236 | int scrub_workers_refcnt; |
1156 | struct btrfs_workers scrub_workers; | 1237 | struct btrfs_workers scrub_workers; |
1157 | 1238 | ||
1239 | #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY | ||
1240 | u32 check_integrity_print_mask; | ||
1241 | #endif | ||
1242 | |||
1158 | /* filesystem state */ | 1243 | /* filesystem state */ |
1159 | u64 fs_state; | 1244 | u64 fs_state; |
1160 | 1245 | ||
@@ -1383,6 +1468,8 @@ struct btrfs_ioctl_defrag_range_args { | |||
1383 | #define BTRFS_DEV_ITEM_KEY 216 | 1468 | #define BTRFS_DEV_ITEM_KEY 216 |
1384 | #define BTRFS_CHUNK_ITEM_KEY 228 | 1469 | #define BTRFS_CHUNK_ITEM_KEY 228 |
1385 | 1470 | ||
1471 | #define BTRFS_BALANCE_ITEM_KEY 248 | ||
1472 | |||
1386 | /* | 1473 | /* |
1387 | * string items are for debugging. They just store a short string of | 1474 | * string items are for debugging. They just store a short string of |
1388 | * data in the FS | 1475 | * data in the FS |
@@ -1413,6 +1500,9 @@ struct btrfs_ioctl_defrag_range_args { | |||
1413 | #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) | 1500 | #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) |
1414 | #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) | 1501 | #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) |
1415 | #define BTRFS_MOUNT_RECOVERY (1 << 18) | 1502 | #define BTRFS_MOUNT_RECOVERY (1 << 18) |
1503 | #define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) | ||
1504 | #define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) | ||
1505 | #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) | ||
1416 | 1506 | ||
1417 | #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) | 1507 | #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) |
1418 | #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) | 1508 | #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) |
@@ -2077,8 +2167,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, | |||
2077 | BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, | 2167 | BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, |
2078 | num_devices, 64); | 2168 | num_devices, 64); |
2079 | 2169 | ||
2080 | /* struct btrfs_super_block */ | 2170 | /* struct btrfs_balance_item */ |
2171 | BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); | ||
2081 | 2172 | ||
2173 | static inline void btrfs_balance_data(struct extent_buffer *eb, | ||
2174 | struct btrfs_balance_item *bi, | ||
2175 | struct btrfs_disk_balance_args *ba) | ||
2176 | { | ||
2177 | read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); | ||
2178 | } | ||
2179 | |||
2180 | static inline void btrfs_set_balance_data(struct extent_buffer *eb, | ||
2181 | struct btrfs_balance_item *bi, | ||
2182 | struct btrfs_disk_balance_args *ba) | ||
2183 | { | ||
2184 | write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); | ||
2185 | } | ||
2186 | |||
2187 | static inline void btrfs_balance_meta(struct extent_buffer *eb, | ||
2188 | struct btrfs_balance_item *bi, | ||
2189 | struct btrfs_disk_balance_args *ba) | ||
2190 | { | ||
2191 | read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); | ||
2192 | } | ||
2193 | |||
2194 | static inline void btrfs_set_balance_meta(struct extent_buffer *eb, | ||
2195 | struct btrfs_balance_item *bi, | ||
2196 | struct btrfs_disk_balance_args *ba) | ||
2197 | { | ||
2198 | write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); | ||
2199 | } | ||
2200 | |||
2201 | static inline void btrfs_balance_sys(struct extent_buffer *eb, | ||
2202 | struct btrfs_balance_item *bi, | ||
2203 | struct btrfs_disk_balance_args *ba) | ||
2204 | { | ||
2205 | read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); | ||
2206 | } | ||
2207 | |||
2208 | static inline void btrfs_set_balance_sys(struct extent_buffer *eb, | ||
2209 | struct btrfs_balance_item *bi, | ||
2210 | struct btrfs_disk_balance_args *ba) | ||
2211 | { | ||
2212 | write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); | ||
2213 | } | ||
2214 | |||
2215 | static inline void | ||
2216 | btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, | ||
2217 | struct btrfs_disk_balance_args *disk) | ||
2218 | { | ||
2219 | memset(cpu, 0, sizeof(*cpu)); | ||
2220 | |||
2221 | cpu->profiles = le64_to_cpu(disk->profiles); | ||
2222 | cpu->usage = le64_to_cpu(disk->usage); | ||
2223 | cpu->devid = le64_to_cpu(disk->devid); | ||
2224 | cpu->pstart = le64_to_cpu(disk->pstart); | ||
2225 | cpu->pend = le64_to_cpu(disk->pend); | ||
2226 | cpu->vstart = le64_to_cpu(disk->vstart); | ||
2227 | cpu->vend = le64_to_cpu(disk->vend); | ||
2228 | cpu->target = le64_to_cpu(disk->target); | ||
2229 | cpu->flags = le64_to_cpu(disk->flags); | ||
2230 | } | ||
2231 | |||
2232 | static inline void | ||
2233 | btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, | ||
2234 | struct btrfs_balance_args *cpu) | ||
2235 | { | ||
2236 | memset(disk, 0, sizeof(*disk)); | ||
2237 | |||
2238 | disk->profiles = cpu_to_le64(cpu->profiles); | ||
2239 | disk->usage = cpu_to_le64(cpu->usage); | ||
2240 | disk->devid = cpu_to_le64(cpu->devid); | ||
2241 | disk->pstart = cpu_to_le64(cpu->pstart); | ||
2242 | disk->pend = cpu_to_le64(cpu->pend); | ||
2243 | disk->vstart = cpu_to_le64(cpu->vstart); | ||
2244 | disk->vend = cpu_to_le64(cpu->vend); | ||
2245 | disk->target = cpu_to_le64(cpu->target); | ||
2246 | disk->flags = cpu_to_le64(cpu->flags); | ||
2247 | } | ||
2248 | |||
2249 | /* struct btrfs_super_block */ | ||
2082 | BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); | 2250 | BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); |
2083 | BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); | 2251 | BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); |
2084 | BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, | 2252 | BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, |
@@ -2196,7 +2364,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, | |||
2196 | return btrfs_item_size(eb, e) - offset; | 2364 | return btrfs_item_size(eb, e) - offset; |
2197 | } | 2365 | } |
2198 | 2366 | ||
2199 | static inline struct btrfs_root *btrfs_sb(struct super_block *sb) | 2367 | static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) |
2200 | { | 2368 | { |
2201 | return sb->s_fs_info; | 2369 | return sb->s_fs_info; |
2202 | } | 2370 | } |
@@ -2277,11 +2445,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, | |||
2277 | struct btrfs_root *root, u32 blocksize, | 2445 | struct btrfs_root *root, u32 blocksize, |
2278 | u64 parent, u64 root_objectid, | 2446 | u64 parent, u64 root_objectid, |
2279 | struct btrfs_disk_key *key, int level, | 2447 | struct btrfs_disk_key *key, int level, |
2280 | u64 hint, u64 empty_size); | 2448 | u64 hint, u64 empty_size, int for_cow); |
2281 | void btrfs_free_tree_block(struct btrfs_trans_handle *trans, | 2449 | void btrfs_free_tree_block(struct btrfs_trans_handle *trans, |
2282 | struct btrfs_root *root, | 2450 | struct btrfs_root *root, |
2283 | struct extent_buffer *buf, | 2451 | struct extent_buffer *buf, |
2284 | u64 parent, int last_ref); | 2452 | u64 parent, int last_ref, int for_cow); |
2285 | struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, | 2453 | struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, |
2286 | struct btrfs_root *root, | 2454 | struct btrfs_root *root, |
2287 | u64 bytenr, u32 blocksize, | 2455 | u64 bytenr, u32 blocksize, |
@@ -2301,17 +2469,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, | |||
2301 | u64 search_end, struct btrfs_key *ins, | 2469 | u64 search_end, struct btrfs_key *ins, |
2302 | u64 data); | 2470 | u64 data); |
2303 | int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 2471 | int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
2304 | struct extent_buffer *buf, int full_backref); | 2472 | struct extent_buffer *buf, int full_backref, int for_cow); |
2305 | int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 2473 | int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
2306 | struct extent_buffer *buf, int full_backref); | 2474 | struct extent_buffer *buf, int full_backref, int for_cow); |
2307 | int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, | 2475 | int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, |
2308 | struct btrfs_root *root, | 2476 | struct btrfs_root *root, |
2309 | u64 bytenr, u64 num_bytes, u64 flags, | 2477 | u64 bytenr, u64 num_bytes, u64 flags, |
2310 | int is_data); | 2478 | int is_data); |
2311 | int btrfs_free_extent(struct btrfs_trans_handle *trans, | 2479 | int btrfs_free_extent(struct btrfs_trans_handle *trans, |
2312 | struct btrfs_root *root, | 2480 | struct btrfs_root *root, |
2313 | u64 bytenr, u64 num_bytes, u64 parent, | 2481 | u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, |
2314 | u64 root_objectid, u64 owner, u64 offset); | 2482 | u64 owner, u64 offset, int for_cow); |
2315 | 2483 | ||
2316 | int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); | 2484 | int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); |
2317 | int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, | 2485 | int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, |
@@ -2323,7 +2491,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, | |||
2323 | int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, | 2491 | int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, |
2324 | struct btrfs_root *root, | 2492 | struct btrfs_root *root, |
2325 | u64 bytenr, u64 num_bytes, u64 parent, | 2493 | u64 bytenr, u64 num_bytes, u64 parent, |
2326 | u64 root_objectid, u64 owner, u64 offset); | 2494 | u64 root_objectid, u64 owner, u64 offset, int for_cow); |
2327 | 2495 | ||
2328 | int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, | 2496 | int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, |
2329 | struct btrfs_root *root); | 2497 | struct btrfs_root *root); |
@@ -2482,10 +2650,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, | |||
2482 | } | 2650 | } |
2483 | 2651 | ||
2484 | int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); | 2652 | int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); |
2653 | static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) | ||
2654 | { | ||
2655 | ++p->slots[0]; | ||
2656 | if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) | ||
2657 | return btrfs_next_leaf(root, p); | ||
2658 | return 0; | ||
2659 | } | ||
2485 | int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); | 2660 | int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); |
2486 | int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); | 2661 | int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); |
2487 | void btrfs_drop_snapshot(struct btrfs_root *root, | 2662 | void btrfs_drop_snapshot(struct btrfs_root *root, |
2488 | struct btrfs_block_rsv *block_rsv, int update_ref); | 2663 | struct btrfs_block_rsv *block_rsv, int update_ref, |
2664 | int for_reloc); | ||
2489 | int btrfs_drop_subtree(struct btrfs_trans_handle *trans, | 2665 | int btrfs_drop_subtree(struct btrfs_trans_handle *trans, |
2490 | struct btrfs_root *root, | 2666 | struct btrfs_root *root, |
2491 | struct extent_buffer *node, | 2667 | struct extent_buffer *node, |
@@ -2500,6 +2676,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) | |||
2500 | } | 2676 | } |
2501 | static inline void free_fs_info(struct btrfs_fs_info *fs_info) | 2677 | static inline void free_fs_info(struct btrfs_fs_info *fs_info) |
2502 | { | 2678 | { |
2679 | kfree(fs_info->balance_ctl); | ||
2503 | kfree(fs_info->delayed_root); | 2680 | kfree(fs_info->delayed_root); |
2504 | kfree(fs_info->extent_root); | 2681 | kfree(fs_info->extent_root); |
2505 | kfree(fs_info->tree_root); | 2682 | kfree(fs_info->tree_root); |
@@ -2510,6 +2687,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) | |||
2510 | kfree(fs_info->super_for_commit); | 2687 | kfree(fs_info->super_for_commit); |
2511 | kfree(fs_info); | 2688 | kfree(fs_info); |
2512 | } | 2689 | } |
2690 | /** | ||
2691 | * profile_is_valid - tests whether a given profile is valid and reduced | ||
2692 | * @flags: profile to validate | ||
2693 | * @extended: if true @flags is treated as an extended profile | ||
2694 | */ | ||
2695 | static inline int profile_is_valid(u64 flags, int extended) | ||
2696 | { | ||
2697 | u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK; | ||
2698 | |||
2699 | flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; | ||
2700 | if (extended) | ||
2701 | mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; | ||
2702 | |||
2703 | if (flags & mask) | ||
2704 | return 0; | ||
2705 | /* true if zero or exactly one bit set */ | ||
2706 | return (flags & (~flags + 1)) == flags; | ||
2707 | } | ||
2513 | 2708 | ||
2514 | /* root-item.c */ | 2709 | /* root-item.c */ |
2515 | int btrfs_find_root_ref(struct btrfs_root *tree_root, | 2710 | int btrfs_find_root_ref(struct btrfs_root *tree_root, |
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 9c1eccc2c503..fe4cd0f1cef1 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c | |||
@@ -595,8 +595,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, | |||
595 | 595 | ||
596 | num_bytes = btrfs_calc_trans_metadata_size(root, 1); | 596 | num_bytes = btrfs_calc_trans_metadata_size(root, 1); |
597 | ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); | 597 | ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); |
598 | if (!ret) | 598 | if (!ret) { |
599 | trace_btrfs_space_reservation(root->fs_info, "delayed_item", | ||
600 | item->key.objectid, | ||
601 | num_bytes, 1); | ||
599 | item->bytes_reserved = num_bytes; | 602 | item->bytes_reserved = num_bytes; |
603 | } | ||
600 | 604 | ||
601 | return ret; | 605 | return ret; |
602 | } | 606 | } |
@@ -610,6 +614,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, | |||
610 | return; | 614 | return; |
611 | 615 | ||
612 | rsv = &root->fs_info->delayed_block_rsv; | 616 | rsv = &root->fs_info->delayed_block_rsv; |
617 | trace_btrfs_space_reservation(root->fs_info, "delayed_item", | ||
618 | item->key.objectid, item->bytes_reserved, | ||
619 | 0); | ||
613 | btrfs_block_rsv_release(root, rsv, | 620 | btrfs_block_rsv_release(root, rsv, |
614 | item->bytes_reserved); | 621 | item->bytes_reserved); |
615 | } | 622 | } |
@@ -624,7 +631,7 @@ static int btrfs_delayed_inode_reserve_metadata( | |||
624 | struct btrfs_block_rsv *dst_rsv; | 631 | struct btrfs_block_rsv *dst_rsv; |
625 | u64 num_bytes; | 632 | u64 num_bytes; |
626 | int ret; | 633 | int ret; |
627 | int release = false; | 634 | bool release = false; |
628 | 635 | ||
629 | src_rsv = trans->block_rsv; | 636 | src_rsv = trans->block_rsv; |
630 | dst_rsv = &root->fs_info->delayed_block_rsv; | 637 | dst_rsv = &root->fs_info->delayed_block_rsv; |
@@ -651,8 +658,13 @@ static int btrfs_delayed_inode_reserve_metadata( | |||
651 | */ | 658 | */ |
652 | if (ret == -EAGAIN) | 659 | if (ret == -EAGAIN) |
653 | ret = -ENOSPC; | 660 | ret = -ENOSPC; |
654 | if (!ret) | 661 | if (!ret) { |
655 | node->bytes_reserved = num_bytes; | 662 | node->bytes_reserved = num_bytes; |
663 | trace_btrfs_space_reservation(root->fs_info, | ||
664 | "delayed_inode", | ||
665 | btrfs_ino(inode), | ||
666 | num_bytes, 1); | ||
667 | } | ||
656 | return ret; | 668 | return ret; |
657 | } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { | 669 | } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { |
658 | spin_lock(&BTRFS_I(inode)->lock); | 670 | spin_lock(&BTRFS_I(inode)->lock); |
@@ -707,11 +719,17 @@ out: | |||
707 | * reservation here. I think it may be time for a documentation page on | 719 | * reservation here. I think it may be time for a documentation page on |
708 | * how block rsvs. work. | 720 | * how block rsvs. work. |
709 | */ | 721 | */ |
710 | if (!ret) | 722 | if (!ret) { |
723 | trace_btrfs_space_reservation(root->fs_info, "delayed_inode", | ||
724 | btrfs_ino(inode), num_bytes, 1); | ||
711 | node->bytes_reserved = num_bytes; | 725 | node->bytes_reserved = num_bytes; |
726 | } | ||
712 | 727 | ||
713 | if (release) | 728 | if (release) { |
729 | trace_btrfs_space_reservation(root->fs_info, "delalloc", | ||
730 | btrfs_ino(inode), num_bytes, 0); | ||
714 | btrfs_block_rsv_release(root, src_rsv, num_bytes); | 731 | btrfs_block_rsv_release(root, src_rsv, num_bytes); |
732 | } | ||
715 | 733 | ||
716 | return ret; | 734 | return ret; |
717 | } | 735 | } |
@@ -725,6 +743,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, | |||
725 | return; | 743 | return; |
726 | 744 | ||
727 | rsv = &root->fs_info->delayed_block_rsv; | 745 | rsv = &root->fs_info->delayed_block_rsv; |
746 | trace_btrfs_space_reservation(root->fs_info, "delayed_inode", | ||
747 | node->inode_id, node->bytes_reserved, 0); | ||
728 | btrfs_block_rsv_release(root, rsv, | 748 | btrfs_block_rsv_release(root, rsv, |
729 | node->bytes_reserved); | 749 | node->bytes_reserved); |
730 | node->bytes_reserved = 0; | 750 | node->bytes_reserved = 0; |
@@ -1372,13 +1392,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, | |||
1372 | goto release_node; | 1392 | goto release_node; |
1373 | } | 1393 | } |
1374 | 1394 | ||
1375 | ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); | ||
1376 | /* | ||
1377 | * we have reserved enough space when we start a new transaction, | ||
1378 | * so reserving metadata failure is impossible | ||
1379 | */ | ||
1380 | BUG_ON(ret); | ||
1381 | |||
1382 | delayed_item->key.objectid = btrfs_ino(dir); | 1395 | delayed_item->key.objectid = btrfs_ino(dir); |
1383 | btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); | 1396 | btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); |
1384 | delayed_item->key.offset = index; | 1397 | delayed_item->key.offset = index; |
@@ -1391,6 +1404,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, | |||
1391 | dir_item->type = type; | 1404 | dir_item->type = type; |
1392 | memcpy((char *)(dir_item + 1), name, name_len); | 1405 | memcpy((char *)(dir_item + 1), name, name_len); |
1393 | 1406 | ||
1407 | ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); | ||
1408 | /* | ||
1409 | * we have reserved enough space when we start a new transaction, | ||
1410 | * so reserving metadata failure is impossible | ||
1411 | */ | ||
1412 | BUG_ON(ret); | ||
1413 | |||
1414 | |||
1394 | mutex_lock(&delayed_node->mutex); | 1415 | mutex_lock(&delayed_node->mutex); |
1395 | ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); | 1416 | ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); |
1396 | if (unlikely(ret)) { | 1417 | if (unlikely(ret)) { |
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 125cf76fcd08..66e4f29505a3 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c | |||
@@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2, | |||
101 | return -1; | 101 | return -1; |
102 | if (ref1->type > ref2->type) | 102 | if (ref1->type > ref2->type) |
103 | return 1; | 103 | return 1; |
104 | /* merging of sequenced refs is not allowed */ | ||
105 | if (ref1->seq < ref2->seq) | ||
106 | return -1; | ||
107 | if (ref1->seq > ref2->seq) | ||
108 | return 1; | ||
104 | if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || | 109 | if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || |
105 | ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { | 110 | ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { |
106 | return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), | 111 | return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), |
@@ -150,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, | |||
150 | 155 | ||
151 | /* | 156 | /* |
152 | * find an head entry based on bytenr. This returns the delayed ref | 157 | * find an head entry based on bytenr. This returns the delayed ref |
153 | * head if it was able to find one, or NULL if nothing was in that spot | 158 | * head if it was able to find one, or NULL if nothing was in that spot. |
159 | * If return_bigger is given, the next bigger entry is returned if no exact | ||
160 | * match is found. | ||
154 | */ | 161 | */ |
155 | static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, | 162 | static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, |
156 | u64 bytenr, | 163 | u64 bytenr, |
157 | struct btrfs_delayed_ref_node **last) | 164 | struct btrfs_delayed_ref_node **last, |
165 | int return_bigger) | ||
158 | { | 166 | { |
159 | struct rb_node *n = root->rb_node; | 167 | struct rb_node *n; |
160 | struct btrfs_delayed_ref_node *entry; | 168 | struct btrfs_delayed_ref_node *entry; |
161 | int cmp; | 169 | int cmp = 0; |
162 | 170 | ||
171 | again: | ||
172 | n = root->rb_node; | ||
173 | entry = NULL; | ||
163 | while (n) { | 174 | while (n) { |
164 | entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); | 175 | entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); |
165 | WARN_ON(!entry->in_tree); | 176 | WARN_ON(!entry->in_tree); |
@@ -182,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, | |||
182 | else | 193 | else |
183 | return entry; | 194 | return entry; |
184 | } | 195 | } |
196 | if (entry && return_bigger) { | ||
197 | if (cmp > 0) { | ||
198 | n = rb_next(&entry->rb_node); | ||
199 | if (!n) | ||
200 | n = rb_first(root); | ||
201 | entry = rb_entry(n, struct btrfs_delayed_ref_node, | ||
202 | rb_node); | ||
203 | bytenr = entry->bytenr; | ||
204 | return_bigger = 0; | ||
205 | goto again; | ||
206 | } | ||
207 | return entry; | ||
208 | } | ||
185 | return NULL; | 209 | return NULL; |
186 | } | 210 | } |
187 | 211 | ||
@@ -209,6 +233,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, | |||
209 | return 0; | 233 | return 0; |
210 | } | 234 | } |
211 | 235 | ||
236 | int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, | ||
237 | u64 seq) | ||
238 | { | ||
239 | struct seq_list *elem; | ||
240 | |||
241 | assert_spin_locked(&delayed_refs->lock); | ||
242 | if (list_empty(&delayed_refs->seq_head)) | ||
243 | return 0; | ||
244 | |||
245 | elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list); | ||
246 | if (seq >= elem->seq) { | ||
247 | pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n", | ||
248 | seq, elem->seq, delayed_refs); | ||
249 | return 1; | ||
250 | } | ||
251 | return 0; | ||
252 | } | ||
253 | |||
212 | int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, | 254 | int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, |
213 | struct list_head *cluster, u64 start) | 255 | struct list_head *cluster, u64 start) |
214 | { | 256 | { |
@@ -223,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, | |||
223 | node = rb_first(&delayed_refs->root); | 265 | node = rb_first(&delayed_refs->root); |
224 | } else { | 266 | } else { |
225 | ref = NULL; | 267 | ref = NULL; |
226 | find_ref_head(&delayed_refs->root, start, &ref); | 268 | find_ref_head(&delayed_refs->root, start + 1, &ref, 1); |
227 | if (ref) { | 269 | if (ref) { |
228 | struct btrfs_delayed_ref_node *tmp; | ||
229 | |||
230 | node = rb_prev(&ref->rb_node); | ||
231 | while (node) { | ||
232 | tmp = rb_entry(node, | ||
233 | struct btrfs_delayed_ref_node, | ||
234 | rb_node); | ||
235 | if (tmp->bytenr < start) | ||
236 | break; | ||
237 | ref = tmp; | ||
238 | node = rb_prev(&ref->rb_node); | ||
239 | } | ||
240 | node = &ref->rb_node; | 270 | node = &ref->rb_node; |
241 | } else | 271 | } else |
242 | node = rb_first(&delayed_refs->root); | 272 | node = rb_first(&delayed_refs->root); |
@@ -390,7 +420,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, | |||
390 | * this does all the dirty work in terms of maintaining the correct | 420 | * this does all the dirty work in terms of maintaining the correct |
391 | * overall modification count. | 421 | * overall modification count. |
392 | */ | 422 | */ |
393 | static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, | 423 | static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, |
424 | struct btrfs_trans_handle *trans, | ||
394 | struct btrfs_delayed_ref_node *ref, | 425 | struct btrfs_delayed_ref_node *ref, |
395 | u64 bytenr, u64 num_bytes, | 426 | u64 bytenr, u64 num_bytes, |
396 | int action, int is_data) | 427 | int action, int is_data) |
@@ -437,6 +468,7 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, | |||
437 | ref->action = 0; | 468 | ref->action = 0; |
438 | ref->is_head = 1; | 469 | ref->is_head = 1; |
439 | ref->in_tree = 1; | 470 | ref->in_tree = 1; |
471 | ref->seq = 0; | ||
440 | 472 | ||
441 | head_ref = btrfs_delayed_node_to_head(ref); | 473 | head_ref = btrfs_delayed_node_to_head(ref); |
442 | head_ref->must_insert_reserved = must_insert_reserved; | 474 | head_ref->must_insert_reserved = must_insert_reserved; |
@@ -468,14 +500,17 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, | |||
468 | /* | 500 | /* |
469 | * helper to insert a delayed tree ref into the rbtree. | 501 | * helper to insert a delayed tree ref into the rbtree. |
470 | */ | 502 | */ |
471 | static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, | 503 | static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, |
504 | struct btrfs_trans_handle *trans, | ||
472 | struct btrfs_delayed_ref_node *ref, | 505 | struct btrfs_delayed_ref_node *ref, |
473 | u64 bytenr, u64 num_bytes, u64 parent, | 506 | u64 bytenr, u64 num_bytes, u64 parent, |
474 | u64 ref_root, int level, int action) | 507 | u64 ref_root, int level, int action, |
508 | int for_cow) | ||
475 | { | 509 | { |
476 | struct btrfs_delayed_ref_node *existing; | 510 | struct btrfs_delayed_ref_node *existing; |
477 | struct btrfs_delayed_tree_ref *full_ref; | 511 | struct btrfs_delayed_tree_ref *full_ref; |
478 | struct btrfs_delayed_ref_root *delayed_refs; | 512 | struct btrfs_delayed_ref_root *delayed_refs; |
513 | u64 seq = 0; | ||
479 | 514 | ||
480 | if (action == BTRFS_ADD_DELAYED_EXTENT) | 515 | if (action == BTRFS_ADD_DELAYED_EXTENT) |
481 | action = BTRFS_ADD_DELAYED_REF; | 516 | action = BTRFS_ADD_DELAYED_REF; |
@@ -491,14 +526,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, | |||
491 | ref->is_head = 0; | 526 | ref->is_head = 0; |
492 | ref->in_tree = 1; | 527 | ref->in_tree = 1; |
493 | 528 | ||
529 | if (need_ref_seq(for_cow, ref_root)) | ||
530 | seq = inc_delayed_seq(delayed_refs); | ||
531 | ref->seq = seq; | ||
532 | |||
494 | full_ref = btrfs_delayed_node_to_tree_ref(ref); | 533 | full_ref = btrfs_delayed_node_to_tree_ref(ref); |
495 | if (parent) { | 534 | full_ref->parent = parent; |
496 | full_ref->parent = parent; | 535 | full_ref->root = ref_root; |
536 | if (parent) | ||
497 | ref->type = BTRFS_SHARED_BLOCK_REF_KEY; | 537 | ref->type = BTRFS_SHARED_BLOCK_REF_KEY; |
498 | } else { | 538 | else |
499 | full_ref->root = ref_root; | ||
500 | ref->type = BTRFS_TREE_BLOCK_REF_KEY; | 539 | ref->type = BTRFS_TREE_BLOCK_REF_KEY; |
501 | } | ||
502 | full_ref->level = level; | 540 | full_ref->level = level; |
503 | 541 | ||
504 | trace_btrfs_delayed_tree_ref(ref, full_ref, action); | 542 | trace_btrfs_delayed_tree_ref(ref, full_ref, action); |
@@ -522,15 +560,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, | |||
522 | /* | 560 | /* |
523 | * helper to insert a delayed data ref into the rbtree. | 561 | * helper to insert a delayed data ref into the rbtree. |
524 | */ | 562 | */ |
525 | static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, | 563 | static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, |
564 | struct btrfs_trans_handle *trans, | ||
526 | struct btrfs_delayed_ref_node *ref, | 565 | struct btrfs_delayed_ref_node *ref, |
527 | u64 bytenr, u64 num_bytes, u64 parent, | 566 | u64 bytenr, u64 num_bytes, u64 parent, |
528 | u64 ref_root, u64 owner, u64 offset, | 567 | u64 ref_root, u64 owner, u64 offset, |
529 | int action) | 568 | int action, int for_cow) |
530 | { | 569 | { |
531 | struct btrfs_delayed_ref_node *existing; | 570 | struct btrfs_delayed_ref_node *existing; |
532 | struct btrfs_delayed_data_ref *full_ref; | 571 | struct btrfs_delayed_data_ref *full_ref; |
533 | struct btrfs_delayed_ref_root *delayed_refs; | 572 | struct btrfs_delayed_ref_root *delayed_refs; |
573 | u64 seq = 0; | ||
534 | 574 | ||
535 | if (action == BTRFS_ADD_DELAYED_EXTENT) | 575 | if (action == BTRFS_ADD_DELAYED_EXTENT) |
536 | action = BTRFS_ADD_DELAYED_REF; | 576 | action = BTRFS_ADD_DELAYED_REF; |
@@ -546,14 +586,18 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, | |||
546 | ref->is_head = 0; | 586 | ref->is_head = 0; |
547 | ref->in_tree = 1; | 587 | ref->in_tree = 1; |
548 | 588 | ||
589 | if (need_ref_seq(for_cow, ref_root)) | ||
590 | seq = inc_delayed_seq(delayed_refs); | ||
591 | ref->seq = seq; | ||
592 | |||
549 | full_ref = btrfs_delayed_node_to_data_ref(ref); | 593 | full_ref = btrfs_delayed_node_to_data_ref(ref); |
550 | if (parent) { | 594 | full_ref->parent = parent; |
551 | full_ref->parent = parent; | 595 | full_ref->root = ref_root; |
596 | if (parent) | ||
552 | ref->type = BTRFS_SHARED_DATA_REF_KEY; | 597 | ref->type = BTRFS_SHARED_DATA_REF_KEY; |
553 | } else { | 598 | else |
554 | full_ref->root = ref_root; | ||
555 | ref->type = BTRFS_EXTENT_DATA_REF_KEY; | 599 | ref->type = BTRFS_EXTENT_DATA_REF_KEY; |
556 | } | 600 | |
557 | full_ref->objectid = owner; | 601 | full_ref->objectid = owner; |
558 | full_ref->offset = offset; | 602 | full_ref->offset = offset; |
559 | 603 | ||
@@ -580,10 +624,12 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, | |||
580 | * to make sure the delayed ref is eventually processed before this | 624 | * to make sure the delayed ref is eventually processed before this |
581 | * transaction commits. | 625 | * transaction commits. |
582 | */ | 626 | */ |
583 | int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, | 627 | int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, |
628 | struct btrfs_trans_handle *trans, | ||
584 | u64 bytenr, u64 num_bytes, u64 parent, | 629 | u64 bytenr, u64 num_bytes, u64 parent, |
585 | u64 ref_root, int level, int action, | 630 | u64 ref_root, int level, int action, |
586 | struct btrfs_delayed_extent_op *extent_op) | 631 | struct btrfs_delayed_extent_op *extent_op, |
632 | int for_cow) | ||
587 | { | 633 | { |
588 | struct btrfs_delayed_tree_ref *ref; | 634 | struct btrfs_delayed_tree_ref *ref; |
589 | struct btrfs_delayed_ref_head *head_ref; | 635 | struct btrfs_delayed_ref_head *head_ref; |
@@ -610,13 +656,17 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, | |||
610 | * insert both the head node and the new ref without dropping | 656 | * insert both the head node and the new ref without dropping |
611 | * the spin lock | 657 | * the spin lock |
612 | */ | 658 | */ |
613 | ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, | 659 | ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, |
614 | action, 0); | 660 | num_bytes, action, 0); |
615 | BUG_ON(ret); | 661 | BUG_ON(ret); |
616 | 662 | ||
617 | ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, | 663 | ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, |
618 | parent, ref_root, level, action); | 664 | num_bytes, parent, ref_root, level, action, |
665 | for_cow); | ||
619 | BUG_ON(ret); | 666 | BUG_ON(ret); |
667 | if (!need_ref_seq(for_cow, ref_root) && | ||
668 | waitqueue_active(&delayed_refs->seq_wait)) | ||
669 | wake_up(&delayed_refs->seq_wait); | ||
620 | spin_unlock(&delayed_refs->lock); | 670 | spin_unlock(&delayed_refs->lock); |
621 | return 0; | 671 | return 0; |
622 | } | 672 | } |
@@ -624,11 +674,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, | |||
624 | /* | 674 | /* |
625 | * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. | 675 | * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. |
626 | */ | 676 | */ |
627 | int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, | 677 | int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, |
678 | struct btrfs_trans_handle *trans, | ||
628 | u64 bytenr, u64 num_bytes, | 679 | u64 bytenr, u64 num_bytes, |
629 | u64 parent, u64 ref_root, | 680 | u64 parent, u64 ref_root, |
630 | u64 owner, u64 offset, int action, | 681 | u64 owner, u64 offset, int action, |
631 | struct btrfs_delayed_extent_op *extent_op) | 682 | struct btrfs_delayed_extent_op *extent_op, |
683 | int for_cow) | ||
632 | { | 684 | { |
633 | struct btrfs_delayed_data_ref *ref; | 685 | struct btrfs_delayed_data_ref *ref; |
634 | struct btrfs_delayed_ref_head *head_ref; | 686 | struct btrfs_delayed_ref_head *head_ref; |
@@ -655,18 +707,23 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, | |||
655 | * insert both the head node and the new ref without dropping | 707 | * insert both the head node and the new ref without dropping |
656 | * the spin lock | 708 | * the spin lock |
657 | */ | 709 | */ |
658 | ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, | 710 | ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, |
659 | action, 1); | 711 | num_bytes, action, 1); |
660 | BUG_ON(ret); | 712 | BUG_ON(ret); |
661 | 713 | ||
662 | ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, | 714 | ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, |
663 | parent, ref_root, owner, offset, action); | 715 | num_bytes, parent, ref_root, owner, offset, |
716 | action, for_cow); | ||
664 | BUG_ON(ret); | 717 | BUG_ON(ret); |
718 | if (!need_ref_seq(for_cow, ref_root) && | ||
719 | waitqueue_active(&delayed_refs->seq_wait)) | ||
720 | wake_up(&delayed_refs->seq_wait); | ||
665 | spin_unlock(&delayed_refs->lock); | 721 | spin_unlock(&delayed_refs->lock); |
666 | return 0; | 722 | return 0; |
667 | } | 723 | } |
668 | 724 | ||
669 | int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, | 725 | int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, |
726 | struct btrfs_trans_handle *trans, | ||
670 | u64 bytenr, u64 num_bytes, | 727 | u64 bytenr, u64 num_bytes, |
671 | struct btrfs_delayed_extent_op *extent_op) | 728 | struct btrfs_delayed_extent_op *extent_op) |
672 | { | 729 | { |
@@ -683,11 +740,13 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, | |||
683 | delayed_refs = &trans->transaction->delayed_refs; | 740 | delayed_refs = &trans->transaction->delayed_refs; |
684 | spin_lock(&delayed_refs->lock); | 741 | spin_lock(&delayed_refs->lock); |
685 | 742 | ||
686 | ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, | 743 | ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, |
687 | num_bytes, BTRFS_UPDATE_DELAYED_HEAD, | 744 | num_bytes, BTRFS_UPDATE_DELAYED_HEAD, |
688 | extent_op->is_data); | 745 | extent_op->is_data); |
689 | BUG_ON(ret); | 746 | BUG_ON(ret); |
690 | 747 | ||
748 | if (waitqueue_active(&delayed_refs->seq_wait)) | ||
749 | wake_up(&delayed_refs->seq_wait); | ||
691 | spin_unlock(&delayed_refs->lock); | 750 | spin_unlock(&delayed_refs->lock); |
692 | return 0; | 751 | return 0; |
693 | } | 752 | } |
@@ -704,7 +763,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) | |||
704 | struct btrfs_delayed_ref_root *delayed_refs; | 763 | struct btrfs_delayed_ref_root *delayed_refs; |
705 | 764 | ||
706 | delayed_refs = &trans->transaction->delayed_refs; | 765 | delayed_refs = &trans->transaction->delayed_refs; |
707 | ref = find_ref_head(&delayed_refs->root, bytenr, NULL); | 766 | ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0); |
708 | if (ref) | 767 | if (ref) |
709 | return btrfs_delayed_node_to_head(ref); | 768 | return btrfs_delayed_node_to_head(ref); |
710 | return NULL; | 769 | return NULL; |
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index e287e3b0eab0..d8f244d94925 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h | |||
@@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node { | |||
33 | /* the size of the extent */ | 33 | /* the size of the extent */ |
34 | u64 num_bytes; | 34 | u64 num_bytes; |
35 | 35 | ||
36 | /* seq number to keep track of insertion order */ | ||
37 | u64 seq; | ||
38 | |||
36 | /* ref count on this data structure */ | 39 | /* ref count on this data structure */ |
37 | atomic_t refs; | 40 | atomic_t refs; |
38 | 41 | ||
@@ -98,19 +101,15 @@ struct btrfs_delayed_ref_head { | |||
98 | 101 | ||
99 | struct btrfs_delayed_tree_ref { | 102 | struct btrfs_delayed_tree_ref { |
100 | struct btrfs_delayed_ref_node node; | 103 | struct btrfs_delayed_ref_node node; |
101 | union { | 104 | u64 root; |
102 | u64 root; | 105 | u64 parent; |
103 | u64 parent; | ||
104 | }; | ||
105 | int level; | 106 | int level; |
106 | }; | 107 | }; |
107 | 108 | ||
108 | struct btrfs_delayed_data_ref { | 109 | struct btrfs_delayed_data_ref { |
109 | struct btrfs_delayed_ref_node node; | 110 | struct btrfs_delayed_ref_node node; |
110 | union { | 111 | u64 root; |
111 | u64 root; | 112 | u64 parent; |
112 | u64 parent; | ||
113 | }; | ||
114 | u64 objectid; | 113 | u64 objectid; |
115 | u64 offset; | 114 | u64 offset; |
116 | }; | 115 | }; |
@@ -140,6 +139,26 @@ struct btrfs_delayed_ref_root { | |||
140 | int flushing; | 139 | int flushing; |
141 | 140 | ||
142 | u64 run_delayed_start; | 141 | u64 run_delayed_start; |
142 | |||
143 | /* | ||
144 | * seq number of delayed refs. We need to know if a backref was being | ||
145 | * added before the currently processed ref or afterwards. | ||
146 | */ | ||
147 | u64 seq; | ||
148 | |||
149 | /* | ||
150 | * seq_list holds a list of all seq numbers that are currently being | ||
151 | * added to the list. While walking backrefs (btrfs_find_all_roots, | ||
152 | * qgroups), which might take some time, no newer ref must be processed, | ||
153 | * as it might influence the outcome of the walk. | ||
154 | */ | ||
155 | struct list_head seq_head; | ||
156 | |||
157 | /* | ||
158 | * when the only refs we have in the list must not be processed, we want | ||
159 | * to wait for more refs to show up or for the end of backref walking. | ||
160 | */ | ||
161 | wait_queue_head_t seq_wait; | ||
143 | }; | 162 | }; |
144 | 163 | ||
145 | static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) | 164 | static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) |
@@ -151,16 +170,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) | |||
151 | } | 170 | } |
152 | } | 171 | } |
153 | 172 | ||
154 | int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, | 173 | int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, |
174 | struct btrfs_trans_handle *trans, | ||
155 | u64 bytenr, u64 num_bytes, u64 parent, | 175 | u64 bytenr, u64 num_bytes, u64 parent, |
156 | u64 ref_root, int level, int action, | 176 | u64 ref_root, int level, int action, |
157 | struct btrfs_delayed_extent_op *extent_op); | 177 | struct btrfs_delayed_extent_op *extent_op, |
158 | int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, | 178 | int for_cow); |
179 | int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, | ||
180 | struct btrfs_trans_handle *trans, | ||
159 | u64 bytenr, u64 num_bytes, | 181 | u64 bytenr, u64 num_bytes, |
160 | u64 parent, u64 ref_root, | 182 | u64 parent, u64 ref_root, |
161 | u64 owner, u64 offset, int action, | 183 | u64 owner, u64 offset, int action, |
162 | struct btrfs_delayed_extent_op *extent_op); | 184 | struct btrfs_delayed_extent_op *extent_op, |
163 | int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, | 185 | int for_cow); |
186 | int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, | ||
187 | struct btrfs_trans_handle *trans, | ||
164 | u64 bytenr, u64 num_bytes, | 188 | u64 bytenr, u64 num_bytes, |
165 | struct btrfs_delayed_extent_op *extent_op); | 189 | struct btrfs_delayed_extent_op *extent_op); |
166 | 190 | ||
@@ -170,6 +194,60 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, | |||
170 | struct btrfs_delayed_ref_head *head); | 194 | struct btrfs_delayed_ref_head *head); |
171 | int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, | 195 | int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, |
172 | struct list_head *cluster, u64 search_start); | 196 | struct list_head *cluster, u64 search_start); |
197 | |||
198 | struct seq_list { | ||
199 | struct list_head list; | ||
200 | u64 seq; | ||
201 | }; | ||
202 | |||
203 | static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) | ||
204 | { | ||
205 | assert_spin_locked(&delayed_refs->lock); | ||
206 | ++delayed_refs->seq; | ||
207 | return delayed_refs->seq; | ||
208 | } | ||
209 | |||
210 | static inline void | ||
211 | btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, | ||
212 | struct seq_list *elem) | ||
213 | { | ||
214 | assert_spin_locked(&delayed_refs->lock); | ||
215 | elem->seq = delayed_refs->seq; | ||
216 | list_add_tail(&elem->list, &delayed_refs->seq_head); | ||
217 | } | ||
218 | |||
219 | static inline void | ||
220 | btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, | ||
221 | struct seq_list *elem) | ||
222 | { | ||
223 | spin_lock(&delayed_refs->lock); | ||
224 | list_del(&elem->list); | ||
225 | wake_up(&delayed_refs->seq_wait); | ||
226 | spin_unlock(&delayed_refs->lock); | ||
227 | } | ||
228 | |||
229 | int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, | ||
230 | u64 seq); | ||
231 | |||
232 | /* | ||
233 | * delayed refs with a ref_seq > 0 must be held back during backref walking. | ||
234 | * this only applies to items in one of the fs-trees. for_cow items never need | ||
235 | * to be held back, so they won't get a ref_seq number. | ||
236 | */ | ||
237 | static inline int need_ref_seq(int for_cow, u64 rootid) | ||
238 | { | ||
239 | if (for_cow) | ||
240 | return 0; | ||
241 | |||
242 | if (rootid == BTRFS_FS_TREE_OBJECTID) | ||
243 | return 1; | ||
244 | |||
245 | if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) | ||
246 | return 1; | ||
247 | |||
248 | return 0; | ||
249 | } | ||
250 | |||
173 | /* | 251 | /* |
174 | * a node might live in a head or a regular ref, this lets you | 252 | * a node might live in a head or a regular ref, this lets you |
175 | * test for the proper type to use. | 253 | * test for the proper type to use. |
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d8525662ca7a..7aa9cd36bf1b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
@@ -43,6 +43,7 @@ | |||
43 | #include "tree-log.h" | 43 | #include "tree-log.h" |
44 | #include "free-space-cache.h" | 44 | #include "free-space-cache.h" |
45 | #include "inode-map.h" | 45 | #include "inode-map.h" |
46 | #include "check-integrity.h" | ||
46 | 47 | ||
47 | static struct extent_io_ops btree_extent_io_ops; | 48 | static struct extent_io_ops btree_extent_io_ops; |
48 | static void end_workqueue_fn(struct btrfs_work *work); | 49 | static void end_workqueue_fn(struct btrfs_work *work); |
@@ -1143,7 +1144,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, | |||
1143 | root->orphan_item_inserted = 0; | 1144 | root->orphan_item_inserted = 0; |
1144 | root->orphan_cleanup_state = 0; | 1145 | root->orphan_cleanup_state = 0; |
1145 | 1146 | ||
1146 | root->fs_info = fs_info; | ||
1147 | root->objectid = objectid; | 1147 | root->objectid = objectid; |
1148 | root->last_trans = 0; | 1148 | root->last_trans = 0; |
1149 | root->highest_objectid = 0; | 1149 | root->highest_objectid = 0; |
@@ -1217,6 +1217,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root, | |||
1217 | return 0; | 1217 | return 0; |
1218 | } | 1218 | } |
1219 | 1219 | ||
1220 | static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) | ||
1221 | { | ||
1222 | struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); | ||
1223 | if (root) | ||
1224 | root->fs_info = fs_info; | ||
1225 | return root; | ||
1226 | } | ||
1227 | |||
1220 | static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, | 1228 | static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, |
1221 | struct btrfs_fs_info *fs_info) | 1229 | struct btrfs_fs_info *fs_info) |
1222 | { | 1230 | { |
@@ -1224,7 +1232,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, | |||
1224 | struct btrfs_root *tree_root = fs_info->tree_root; | 1232 | struct btrfs_root *tree_root = fs_info->tree_root; |
1225 | struct extent_buffer *leaf; | 1233 | struct extent_buffer *leaf; |
1226 | 1234 | ||
1227 | root = kzalloc(sizeof(*root), GFP_NOFS); | 1235 | root = btrfs_alloc_root(fs_info); |
1228 | if (!root) | 1236 | if (!root) |
1229 | return ERR_PTR(-ENOMEM); | 1237 | return ERR_PTR(-ENOMEM); |
1230 | 1238 | ||
@@ -1244,7 +1252,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, | |||
1244 | root->ref_cows = 0; | 1252 | root->ref_cows = 0; |
1245 | 1253 | ||
1246 | leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, | 1254 | leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, |
1247 | BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); | 1255 | BTRFS_TREE_LOG_OBJECTID, NULL, |
1256 | 0, 0, 0, 0); | ||
1248 | if (IS_ERR(leaf)) { | 1257 | if (IS_ERR(leaf)) { |
1249 | kfree(root); | 1258 | kfree(root); |
1250 | return ERR_CAST(leaf); | 1259 | return ERR_CAST(leaf); |
@@ -1318,7 +1327,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, | |||
1318 | u32 blocksize; | 1327 | u32 blocksize; |
1319 | int ret = 0; | 1328 | int ret = 0; |
1320 | 1329 | ||
1321 | root = kzalloc(sizeof(*root), GFP_NOFS); | 1330 | root = btrfs_alloc_root(fs_info); |
1322 | if (!root) | 1331 | if (!root) |
1323 | return ERR_PTR(-ENOMEM); | 1332 | return ERR_PTR(-ENOMEM); |
1324 | if (location->offset == (u64)-1) { | 1333 | if (location->offset == (u64)-1) { |
@@ -1874,9 +1883,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) | |||
1874 | } | 1883 | } |
1875 | 1884 | ||
1876 | 1885 | ||
1877 | struct btrfs_root *open_ctree(struct super_block *sb, | 1886 | int open_ctree(struct super_block *sb, |
1878 | struct btrfs_fs_devices *fs_devices, | 1887 | struct btrfs_fs_devices *fs_devices, |
1879 | char *options) | 1888 | char *options) |
1880 | { | 1889 | { |
1881 | u32 sectorsize; | 1890 | u32 sectorsize; |
1882 | u32 nodesize; | 1891 | u32 nodesize; |
@@ -1888,8 +1897,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1888 | struct btrfs_key location; | 1897 | struct btrfs_key location; |
1889 | struct buffer_head *bh; | 1898 | struct buffer_head *bh; |
1890 | struct btrfs_super_block *disk_super; | 1899 | struct btrfs_super_block *disk_super; |
1891 | struct btrfs_root *tree_root = btrfs_sb(sb); | 1900 | struct btrfs_fs_info *fs_info = btrfs_sb(sb); |
1892 | struct btrfs_fs_info *fs_info = tree_root->fs_info; | 1901 | struct btrfs_root *tree_root; |
1893 | struct btrfs_root *extent_root; | 1902 | struct btrfs_root *extent_root; |
1894 | struct btrfs_root *csum_root; | 1903 | struct btrfs_root *csum_root; |
1895 | struct btrfs_root *chunk_root; | 1904 | struct btrfs_root *chunk_root; |
@@ -1900,16 +1909,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1900 | int num_backups_tried = 0; | 1909 | int num_backups_tried = 0; |
1901 | int backup_index = 0; | 1910 | int backup_index = 0; |
1902 | 1911 | ||
1903 | extent_root = fs_info->extent_root = | 1912 | tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); |
1904 | kzalloc(sizeof(struct btrfs_root), GFP_NOFS); | 1913 | extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info); |
1905 | csum_root = fs_info->csum_root = | 1914 | csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); |
1906 | kzalloc(sizeof(struct btrfs_root), GFP_NOFS); | 1915 | chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); |
1907 | chunk_root = fs_info->chunk_root = | 1916 | dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); |
1908 | kzalloc(sizeof(struct btrfs_root), GFP_NOFS); | ||
1909 | dev_root = fs_info->dev_root = | ||
1910 | kzalloc(sizeof(struct btrfs_root), GFP_NOFS); | ||
1911 | 1917 | ||
1912 | if (!extent_root || !csum_root || !chunk_root || !dev_root) { | 1918 | if (!tree_root || !extent_root || !csum_root || |
1919 | !chunk_root || !dev_root) { | ||
1913 | err = -ENOMEM; | 1920 | err = -ENOMEM; |
1914 | goto fail; | 1921 | goto fail; |
1915 | } | 1922 | } |
@@ -1998,6 +2005,17 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1998 | init_waitqueue_head(&fs_info->scrub_pause_wait); | 2005 | init_waitqueue_head(&fs_info->scrub_pause_wait); |
1999 | init_rwsem(&fs_info->scrub_super_lock); | 2006 | init_rwsem(&fs_info->scrub_super_lock); |
2000 | fs_info->scrub_workers_refcnt = 0; | 2007 | fs_info->scrub_workers_refcnt = 0; |
2008 | #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY | ||
2009 | fs_info->check_integrity_print_mask = 0; | ||
2010 | #endif | ||
2011 | |||
2012 | spin_lock_init(&fs_info->balance_lock); | ||
2013 | mutex_init(&fs_info->balance_mutex); | ||
2014 | atomic_set(&fs_info->balance_running, 0); | ||
2015 | atomic_set(&fs_info->balance_pause_req, 0); | ||
2016 | atomic_set(&fs_info->balance_cancel_req, 0); | ||
2017 | fs_info->balance_ctl = NULL; | ||
2018 | init_waitqueue_head(&fs_info->balance_wait_q); | ||
2001 | 2019 | ||
2002 | sb->s_blocksize = 4096; | 2020 | sb->s_blocksize = 4096; |
2003 | sb->s_blocksize_bits = blksize_bits(4096); | 2021 | sb->s_blocksize_bits = blksize_bits(4096); |
@@ -2267,9 +2285,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
2267 | (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), | 2285 | (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), |
2268 | BTRFS_UUID_SIZE); | 2286 | BTRFS_UUID_SIZE); |
2269 | 2287 | ||
2270 | mutex_lock(&fs_info->chunk_mutex); | ||
2271 | ret = btrfs_read_chunk_tree(chunk_root); | 2288 | ret = btrfs_read_chunk_tree(chunk_root); |
2272 | mutex_unlock(&fs_info->chunk_mutex); | ||
2273 | if (ret) { | 2289 | if (ret) { |
2274 | printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", | 2290 | printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", |
2275 | sb->s_id); | 2291 | sb->s_id); |
@@ -2318,9 +2334,6 @@ retry_root_backup: | |||
2318 | 2334 | ||
2319 | fs_info->generation = generation; | 2335 | fs_info->generation = generation; |
2320 | fs_info->last_trans_committed = generation; | 2336 | fs_info->last_trans_committed = generation; |
2321 | fs_info->data_alloc_profile = (u64)-1; | ||
2322 | fs_info->metadata_alloc_profile = (u64)-1; | ||
2323 | fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; | ||
2324 | 2337 | ||
2325 | ret = btrfs_init_space_info(fs_info); | 2338 | ret = btrfs_init_space_info(fs_info); |
2326 | if (ret) { | 2339 | if (ret) { |
@@ -2353,6 +2366,19 @@ retry_root_backup: | |||
2353 | btrfs_set_opt(fs_info->mount_opt, SSD); | 2366 | btrfs_set_opt(fs_info->mount_opt, SSD); |
2354 | } | 2367 | } |
2355 | 2368 | ||
2369 | #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY | ||
2370 | if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { | ||
2371 | ret = btrfsic_mount(tree_root, fs_devices, | ||
2372 | btrfs_test_opt(tree_root, | ||
2373 | CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? | ||
2374 | 1 : 0, | ||
2375 | fs_info->check_integrity_print_mask); | ||
2376 | if (ret) | ||
2377 | printk(KERN_WARNING "btrfs: failed to initialize" | ||
2378 | " integrity check module %s\n", sb->s_id); | ||
2379 | } | ||
2380 | #endif | ||
2381 | |||
2356 | /* do not make disk changes in broken FS */ | 2382 | /* do not make disk changes in broken FS */ |
2357 | if (btrfs_super_log_root(disk_super) != 0 && | 2383 | if (btrfs_super_log_root(disk_super) != 0 && |
2358 | !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { | 2384 | !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { |
@@ -2368,7 +2394,7 @@ retry_root_backup: | |||
2368 | btrfs_level_size(tree_root, | 2394 | btrfs_level_size(tree_root, |
2369 | btrfs_super_log_root_level(disk_super)); | 2395 | btrfs_super_log_root_level(disk_super)); |
2370 | 2396 | ||
2371 | log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); | 2397 | log_tree_root = btrfs_alloc_root(fs_info); |
2372 | if (!log_tree_root) { | 2398 | if (!log_tree_root) { |
2373 | err = -ENOMEM; | 2399 | err = -ENOMEM; |
2374 | goto fail_trans_kthread; | 2400 | goto fail_trans_kthread; |
@@ -2423,13 +2449,17 @@ retry_root_backup: | |||
2423 | if (!err) | 2449 | if (!err) |
2424 | err = btrfs_orphan_cleanup(fs_info->tree_root); | 2450 | err = btrfs_orphan_cleanup(fs_info->tree_root); |
2425 | up_read(&fs_info->cleanup_work_sem); | 2451 | up_read(&fs_info->cleanup_work_sem); |
2452 | |||
2453 | if (!err) | ||
2454 | err = btrfs_recover_balance(fs_info->tree_root); | ||
2455 | |||
2426 | if (err) { | 2456 | if (err) { |
2427 | close_ctree(tree_root); | 2457 | close_ctree(tree_root); |
2428 | return ERR_PTR(err); | 2458 | return err; |
2429 | } | 2459 | } |
2430 | } | 2460 | } |
2431 | 2461 | ||
2432 | return tree_root; | 2462 | return 0; |
2433 | 2463 | ||
2434 | fail_trans_kthread: | 2464 | fail_trans_kthread: |
2435 | kthread_stop(fs_info->transaction_kthread); | 2465 | kthread_stop(fs_info->transaction_kthread); |
@@ -2475,8 +2505,7 @@ fail_srcu: | |||
2475 | cleanup_srcu_struct(&fs_info->subvol_srcu); | 2505 | cleanup_srcu_struct(&fs_info->subvol_srcu); |
2476 | fail: | 2506 | fail: |
2477 | btrfs_close_devices(fs_info->fs_devices); | 2507 | btrfs_close_devices(fs_info->fs_devices); |
2478 | free_fs_info(fs_info); | 2508 | return err; |
2479 | return ERR_PTR(err); | ||
2480 | 2509 | ||
2481 | recovery_tree_root: | 2510 | recovery_tree_root: |
2482 | if (!btrfs_test_opt(tree_root, RECOVERY)) | 2511 | if (!btrfs_test_opt(tree_root, RECOVERY)) |
@@ -2631,7 +2660,7 @@ static int write_dev_supers(struct btrfs_device *device, | |||
2631 | * we fua the first super. The others we allow | 2660 | * we fua the first super. The others we allow |
2632 | * to go down lazy. | 2661 | * to go down lazy. |
2633 | */ | 2662 | */ |
2634 | ret = submit_bh(WRITE_FUA, bh); | 2663 | ret = btrfsic_submit_bh(WRITE_FUA, bh); |
2635 | if (ret) | 2664 | if (ret) |
2636 | errors++; | 2665 | errors++; |
2637 | } | 2666 | } |
@@ -2708,7 +2737,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) | |||
2708 | device->flush_bio = bio; | 2737 | device->flush_bio = bio; |
2709 | 2738 | ||
2710 | bio_get(bio); | 2739 | bio_get(bio); |
2711 | submit_bio(WRITE_FLUSH, bio); | 2740 | btrfsic_submit_bio(WRITE_FLUSH, bio); |
2712 | 2741 | ||
2713 | return 0; | 2742 | return 0; |
2714 | } | 2743 | } |
@@ -2972,6 +3001,9 @@ int close_ctree(struct btrfs_root *root) | |||
2972 | fs_info->closing = 1; | 3001 | fs_info->closing = 1; |
2973 | smp_mb(); | 3002 | smp_mb(); |
2974 | 3003 | ||
3004 | /* pause restriper - we want to resume on mount */ | ||
3005 | btrfs_pause_balance(root->fs_info); | ||
3006 | |||
2975 | btrfs_scrub_cancel(root); | 3007 | btrfs_scrub_cancel(root); |
2976 | 3008 | ||
2977 | /* wait for any defraggers to finish */ | 3009 | /* wait for any defraggers to finish */ |
@@ -2979,7 +3011,7 @@ int close_ctree(struct btrfs_root *root) | |||
2979 | (atomic_read(&fs_info->defrag_running) == 0)); | 3011 | (atomic_read(&fs_info->defrag_running) == 0)); |
2980 | 3012 | ||
2981 | /* clear out the rbtree of defraggable inodes */ | 3013 | /* clear out the rbtree of defraggable inodes */ |
2982 | btrfs_run_defrag_inodes(root->fs_info); | 3014 | btrfs_run_defrag_inodes(fs_info); |
2983 | 3015 | ||
2984 | /* | 3016 | /* |
2985 | * Here come 2 situations when btrfs is broken to flip readonly: | 3017 | * Here come 2 situations when btrfs is broken to flip readonly: |
@@ -3008,8 +3040,8 @@ int close_ctree(struct btrfs_root *root) | |||
3008 | 3040 | ||
3009 | btrfs_put_block_group_cache(fs_info); | 3041 | btrfs_put_block_group_cache(fs_info); |
3010 | 3042 | ||
3011 | kthread_stop(root->fs_info->transaction_kthread); | 3043 | kthread_stop(fs_info->transaction_kthread); |
3012 | kthread_stop(root->fs_info->cleaner_kthread); | 3044 | kthread_stop(fs_info->cleaner_kthread); |
3013 | 3045 | ||
3014 | fs_info->closing = 2; | 3046 | fs_info->closing = 2; |
3015 | smp_mb(); | 3047 | smp_mb(); |
@@ -3027,14 +3059,14 @@ int close_ctree(struct btrfs_root *root) | |||
3027 | free_extent_buffer(fs_info->extent_root->commit_root); | 3059 | free_extent_buffer(fs_info->extent_root->commit_root); |
3028 | free_extent_buffer(fs_info->tree_root->node); | 3060 | free_extent_buffer(fs_info->tree_root->node); |
3029 | free_extent_buffer(fs_info->tree_root->commit_root); | 3061 | free_extent_buffer(fs_info->tree_root->commit_root); |
3030 | free_extent_buffer(root->fs_info->chunk_root->node); | 3062 | free_extent_buffer(fs_info->chunk_root->node); |
3031 | free_extent_buffer(root->fs_info->chunk_root->commit_root); | 3063 | free_extent_buffer(fs_info->chunk_root->commit_root); |
3032 | free_extent_buffer(root->fs_info->dev_root->node); | 3064 | free_extent_buffer(fs_info->dev_root->node); |
3033 | free_extent_buffer(root->fs_info->dev_root->commit_root); | 3065 | free_extent_buffer(fs_info->dev_root->commit_root); |
3034 | free_extent_buffer(root->fs_info->csum_root->node); | 3066 | free_extent_buffer(fs_info->csum_root->node); |
3035 | free_extent_buffer(root->fs_info->csum_root->commit_root); | 3067 | free_extent_buffer(fs_info->csum_root->commit_root); |
3036 | 3068 | ||
3037 | btrfs_free_block_groups(root->fs_info); | 3069 | btrfs_free_block_groups(fs_info); |
3038 | 3070 | ||
3039 | del_fs_roots(fs_info); | 3071 | del_fs_roots(fs_info); |
3040 | 3072 | ||
@@ -3054,14 +3086,17 @@ int close_ctree(struct btrfs_root *root) | |||
3054 | btrfs_stop_workers(&fs_info->caching_workers); | 3086 | btrfs_stop_workers(&fs_info->caching_workers); |
3055 | btrfs_stop_workers(&fs_info->readahead_workers); | 3087 | btrfs_stop_workers(&fs_info->readahead_workers); |
3056 | 3088 | ||
3089 | #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY | ||
3090 | if (btrfs_test_opt(root, CHECK_INTEGRITY)) | ||
3091 | btrfsic_unmount(root, fs_info->fs_devices); | ||
3092 | #endif | ||
3093 | |||
3057 | btrfs_close_devices(fs_info->fs_devices); | 3094 | btrfs_close_devices(fs_info->fs_devices); |
3058 | btrfs_mapping_tree_free(&fs_info->mapping_tree); | 3095 | btrfs_mapping_tree_free(&fs_info->mapping_tree); |
3059 | 3096 | ||
3060 | bdi_destroy(&fs_info->bdi); | 3097 | bdi_destroy(&fs_info->bdi); |
3061 | cleanup_srcu_struct(&fs_info->subvol_srcu); | 3098 | cleanup_srcu_struct(&fs_info->subvol_srcu); |
3062 | 3099 | ||
3063 | free_fs_info(fs_info); | ||
3064 | |||
3065 | return 0; | 3100 | return 0; |
3066 | } | 3101 | } |
3067 | 3102 | ||
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c99d0a8f13fa..e4bc4741319b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h | |||
@@ -46,9 +46,9 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, | |||
46 | u64 bytenr, u32 blocksize); | 46 | u64 bytenr, u32 blocksize); |
47 | int clean_tree_block(struct btrfs_trans_handle *trans, | 47 | int clean_tree_block(struct btrfs_trans_handle *trans, |
48 | struct btrfs_root *root, struct extent_buffer *buf); | 48 | struct btrfs_root *root, struct extent_buffer *buf); |
49 | struct btrfs_root *open_ctree(struct super_block *sb, | 49 | int open_ctree(struct super_block *sb, |
50 | struct btrfs_fs_devices *fs_devices, | 50 | struct btrfs_fs_devices *fs_devices, |
51 | char *options); | 51 | char *options); |
52 | int close_ctree(struct btrfs_root *root); | 52 | int close_ctree(struct btrfs_root *root); |
53 | int write_ctree_super(struct btrfs_trans_handle *trans, | 53 | int write_ctree_super(struct btrfs_trans_handle *trans, |
54 | struct btrfs_root *root, int max_mirrors); | 54 | struct btrfs_root *root, int max_mirrors); |
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 1b8dc33778f9..5f77166fd01c 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c | |||
@@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, | |||
67 | u64 root_objectid, u32 generation, | 67 | u64 root_objectid, u32 generation, |
68 | int check_generation) | 68 | int check_generation) |
69 | { | 69 | { |
70 | struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; | 70 | struct btrfs_fs_info *fs_info = btrfs_sb(sb); |
71 | struct btrfs_root *root; | 71 | struct btrfs_root *root; |
72 | struct inode *inode; | 72 | struct inode *inode; |
73 | struct btrfs_key key; | 73 | struct btrfs_key key; |
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5fbe576d2ba..700879ed64cf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, | |||
618 | struct list_head *head = &info->space_info; | 618 | struct list_head *head = &info->space_info; |
619 | struct btrfs_space_info *found; | 619 | struct btrfs_space_info *found; |
620 | 620 | ||
621 | flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | | 621 | flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; |
622 | BTRFS_BLOCK_GROUP_METADATA; | ||
623 | 622 | ||
624 | rcu_read_lock(); | 623 | rcu_read_lock(); |
625 | list_for_each_entry_rcu(found, head, list) { | 624 | list_for_each_entry_rcu(found, head, list) { |
@@ -1872,20 +1871,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, | |||
1872 | int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, | 1871 | int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, |
1873 | struct btrfs_root *root, | 1872 | struct btrfs_root *root, |
1874 | u64 bytenr, u64 num_bytes, u64 parent, | 1873 | u64 bytenr, u64 num_bytes, u64 parent, |
1875 | u64 root_objectid, u64 owner, u64 offset) | 1874 | u64 root_objectid, u64 owner, u64 offset, int for_cow) |
1876 | { | 1875 | { |
1877 | int ret; | 1876 | int ret; |
1877 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
1878 | |||
1878 | BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && | 1879 | BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && |
1879 | root_objectid == BTRFS_TREE_LOG_OBJECTID); | 1880 | root_objectid == BTRFS_TREE_LOG_OBJECTID); |
1880 | 1881 | ||
1881 | if (owner < BTRFS_FIRST_FREE_OBJECTID) { | 1882 | if (owner < BTRFS_FIRST_FREE_OBJECTID) { |
1882 | ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, | 1883 | ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, |
1884 | num_bytes, | ||
1883 | parent, root_objectid, (int)owner, | 1885 | parent, root_objectid, (int)owner, |
1884 | BTRFS_ADD_DELAYED_REF, NULL); | 1886 | BTRFS_ADD_DELAYED_REF, NULL, for_cow); |
1885 | } else { | 1887 | } else { |
1886 | ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, | 1888 | ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, |
1889 | num_bytes, | ||
1887 | parent, root_objectid, owner, offset, | 1890 | parent, root_objectid, owner, offset, |
1888 | BTRFS_ADD_DELAYED_REF, NULL); | 1891 | BTRFS_ADD_DELAYED_REF, NULL, for_cow); |
1889 | } | 1892 | } |
1890 | return ret; | 1893 | return ret; |
1891 | } | 1894 | } |
@@ -2233,6 +2236,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
2233 | } | 2236 | } |
2234 | 2237 | ||
2235 | /* | 2238 | /* |
2239 | * locked_ref is the head node, so we have to go one | ||
2240 | * node back for any delayed ref updates | ||
2241 | */ | ||
2242 | ref = select_delayed_ref(locked_ref); | ||
2243 | |||
2244 | if (ref && ref->seq && | ||
2245 | btrfs_check_delayed_seq(delayed_refs, ref->seq)) { | ||
2246 | /* | ||
2247 | * there are still refs with lower seq numbers in the | ||
2248 | * process of being added. Don't run this ref yet. | ||
2249 | */ | ||
2250 | list_del_init(&locked_ref->cluster); | ||
2251 | mutex_unlock(&locked_ref->mutex); | ||
2252 | locked_ref = NULL; | ||
2253 | delayed_refs->num_heads_ready++; | ||
2254 | spin_unlock(&delayed_refs->lock); | ||
2255 | cond_resched(); | ||
2256 | spin_lock(&delayed_refs->lock); | ||
2257 | continue; | ||
2258 | } | ||
2259 | |||
2260 | /* | ||
2236 | * record the must insert reserved flag before we | 2261 | * record the must insert reserved flag before we |
2237 | * drop the spin lock. | 2262 | * drop the spin lock. |
2238 | */ | 2263 | */ |
@@ -2242,11 +2267,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
2242 | extent_op = locked_ref->extent_op; | 2267 | extent_op = locked_ref->extent_op; |
2243 | locked_ref->extent_op = NULL; | 2268 | locked_ref->extent_op = NULL; |
2244 | 2269 | ||
2245 | /* | ||
2246 | * locked_ref is the head node, so we have to go one | ||
2247 | * node back for any delayed ref updates | ||
2248 | */ | ||
2249 | ref = select_delayed_ref(locked_ref); | ||
2250 | if (!ref) { | 2270 | if (!ref) { |
2251 | /* All delayed refs have been processed, Go ahead | 2271 | /* All delayed refs have been processed, Go ahead |
2252 | * and send the head node to run_one_delayed_ref, | 2272 | * and send the head node to run_one_delayed_ref, |
@@ -2267,9 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
2267 | BUG_ON(ret); | 2287 | BUG_ON(ret); |
2268 | kfree(extent_op); | 2288 | kfree(extent_op); |
2269 | 2289 | ||
2270 | cond_resched(); | 2290 | goto next; |
2271 | spin_lock(&delayed_refs->lock); | ||
2272 | continue; | ||
2273 | } | 2291 | } |
2274 | 2292 | ||
2275 | list_del_init(&locked_ref->cluster); | 2293 | list_del_init(&locked_ref->cluster); |
@@ -2279,7 +2297,12 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
2279 | ref->in_tree = 0; | 2297 | ref->in_tree = 0; |
2280 | rb_erase(&ref->rb_node, &delayed_refs->root); | 2298 | rb_erase(&ref->rb_node, &delayed_refs->root); |
2281 | delayed_refs->num_entries--; | 2299 | delayed_refs->num_entries--; |
2282 | 2300 | /* | |
2301 | * we modified num_entries, but as we're currently running | ||
2302 | * delayed refs, skip | ||
2303 | * wake_up(&delayed_refs->seq_wait); | ||
2304 | * here. | ||
2305 | */ | ||
2283 | spin_unlock(&delayed_refs->lock); | 2306 | spin_unlock(&delayed_refs->lock); |
2284 | 2307 | ||
2285 | ret = run_one_delayed_ref(trans, root, ref, extent_op, | 2308 | ret = run_one_delayed_ref(trans, root, ref, extent_op, |
@@ -2289,13 +2312,34 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
2289 | btrfs_put_delayed_ref(ref); | 2312 | btrfs_put_delayed_ref(ref); |
2290 | kfree(extent_op); | 2313 | kfree(extent_op); |
2291 | count++; | 2314 | count++; |
2292 | 2315 | next: | |
2316 | do_chunk_alloc(trans, root->fs_info->extent_root, | ||
2317 | 2 * 1024 * 1024, | ||
2318 | btrfs_get_alloc_profile(root, 0), | ||
2319 | CHUNK_ALLOC_NO_FORCE); | ||
2293 | cond_resched(); | 2320 | cond_resched(); |
2294 | spin_lock(&delayed_refs->lock); | 2321 | spin_lock(&delayed_refs->lock); |
2295 | } | 2322 | } |
2296 | return count; | 2323 | return count; |
2297 | } | 2324 | } |
2298 | 2325 | ||
2326 | |||
2327 | static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, | ||
2328 | unsigned long num_refs) | ||
2329 | { | ||
2330 | struct list_head *first_seq = delayed_refs->seq_head.next; | ||
2331 | |||
2332 | spin_unlock(&delayed_refs->lock); | ||
2333 | pr_debug("waiting for more refs (num %ld, first %p)\n", | ||
2334 | num_refs, first_seq); | ||
2335 | wait_event(delayed_refs->seq_wait, | ||
2336 | num_refs != delayed_refs->num_entries || | ||
2337 | delayed_refs->seq_head.next != first_seq); | ||
2338 | pr_debug("done waiting for more refs (num %ld, first %p)\n", | ||
2339 | delayed_refs->num_entries, delayed_refs->seq_head.next); | ||
2340 | spin_lock(&delayed_refs->lock); | ||
2341 | } | ||
2342 | |||
2299 | /* | 2343 | /* |
2300 | * this starts processing the delayed reference count updates and | 2344 | * this starts processing the delayed reference count updates and |
2301 | * extent insertions we have queued up so far. count can be | 2345 | * extent insertions we have queued up so far. count can be |
@@ -2311,15 +2355,23 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, | |||
2311 | struct btrfs_delayed_ref_node *ref; | 2355 | struct btrfs_delayed_ref_node *ref; |
2312 | struct list_head cluster; | 2356 | struct list_head cluster; |
2313 | int ret; | 2357 | int ret; |
2358 | u64 delayed_start; | ||
2314 | int run_all = count == (unsigned long)-1; | 2359 | int run_all = count == (unsigned long)-1; |
2315 | int run_most = 0; | 2360 | int run_most = 0; |
2361 | unsigned long num_refs = 0; | ||
2362 | int consider_waiting; | ||
2316 | 2363 | ||
2317 | if (root == root->fs_info->extent_root) | 2364 | if (root == root->fs_info->extent_root) |
2318 | root = root->fs_info->tree_root; | 2365 | root = root->fs_info->tree_root; |
2319 | 2366 | ||
2367 | do_chunk_alloc(trans, root->fs_info->extent_root, | ||
2368 | 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), | ||
2369 | CHUNK_ALLOC_NO_FORCE); | ||
2370 | |||
2320 | delayed_refs = &trans->transaction->delayed_refs; | 2371 | delayed_refs = &trans->transaction->delayed_refs; |
2321 | INIT_LIST_HEAD(&cluster); | 2372 | INIT_LIST_HEAD(&cluster); |
2322 | again: | 2373 | again: |
2374 | consider_waiting = 0; | ||
2323 | spin_lock(&delayed_refs->lock); | 2375 | spin_lock(&delayed_refs->lock); |
2324 | if (count == 0) { | 2376 | if (count == 0) { |
2325 | count = delayed_refs->num_entries * 2; | 2377 | count = delayed_refs->num_entries * 2; |
@@ -2336,11 +2388,35 @@ again: | |||
2336 | * of refs to process starting at the first one we are able to | 2388 | * of refs to process starting at the first one we are able to |
2337 | * lock | 2389 | * lock |
2338 | */ | 2390 | */ |
2391 | delayed_start = delayed_refs->run_delayed_start; | ||
2339 | ret = btrfs_find_ref_cluster(trans, &cluster, | 2392 | ret = btrfs_find_ref_cluster(trans, &cluster, |
2340 | delayed_refs->run_delayed_start); | 2393 | delayed_refs->run_delayed_start); |
2341 | if (ret) | 2394 | if (ret) |
2342 | break; | 2395 | break; |
2343 | 2396 | ||
2397 | if (delayed_start >= delayed_refs->run_delayed_start) { | ||
2398 | if (consider_waiting == 0) { | ||
2399 | /* | ||
2400 | * btrfs_find_ref_cluster looped. let's do one | ||
2401 | * more cycle. if we don't run any delayed ref | ||
2402 | * during that cycle (because we can't because | ||
2403 | * all of them are blocked) and if the number of | ||
2404 | * refs doesn't change, we avoid busy waiting. | ||
2405 | */ | ||
2406 | consider_waiting = 1; | ||
2407 | num_refs = delayed_refs->num_entries; | ||
2408 | } else { | ||
2409 | wait_for_more_refs(delayed_refs, num_refs); | ||
2410 | /* | ||
2411 | * after waiting, things have changed. we | ||
2412 | * dropped the lock and someone else might have | ||
2413 | * run some refs, built new clusters and so on. | ||
2414 | * therefore, we restart staleness detection. | ||
2415 | */ | ||
2416 | consider_waiting = 0; | ||
2417 | } | ||
2418 | } | ||
2419 | |||
2344 | ret = run_clustered_refs(trans, root, &cluster); | 2420 | ret = run_clustered_refs(trans, root, &cluster); |
2345 | BUG_ON(ret < 0); | 2421 | BUG_ON(ret < 0); |
2346 | 2422 | ||
@@ -2348,6 +2424,11 @@ again: | |||
2348 | 2424 | ||
2349 | if (count == 0) | 2425 | if (count == 0) |
2350 | break; | 2426 | break; |
2427 | |||
2428 | if (ret || delayed_refs->run_delayed_start == 0) { | ||
2429 | /* refs were run, let's reset staleness detection */ | ||
2430 | consider_waiting = 0; | ||
2431 | } | ||
2351 | } | 2432 | } |
2352 | 2433 | ||
2353 | if (run_all) { | 2434 | if (run_all) { |
@@ -2405,7 +2486,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, | |||
2405 | extent_op->update_key = 0; | 2486 | extent_op->update_key = 0; |
2406 | extent_op->is_data = is_data ? 1 : 0; | 2487 | extent_op->is_data = is_data ? 1 : 0; |
2407 | 2488 | ||
2408 | ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); | 2489 | ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, |
2490 | num_bytes, extent_op); | ||
2409 | if (ret) | 2491 | if (ret) |
2410 | kfree(extent_op); | 2492 | kfree(extent_op); |
2411 | return ret; | 2493 | return ret; |
@@ -2590,7 +2672,7 @@ out: | |||
2590 | static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, | 2672 | static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, |
2591 | struct btrfs_root *root, | 2673 | struct btrfs_root *root, |
2592 | struct extent_buffer *buf, | 2674 | struct extent_buffer *buf, |
2593 | int full_backref, int inc) | 2675 | int full_backref, int inc, int for_cow) |
2594 | { | 2676 | { |
2595 | u64 bytenr; | 2677 | u64 bytenr; |
2596 | u64 num_bytes; | 2678 | u64 num_bytes; |
@@ -2603,7 +2685,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, | |||
2603 | int level; | 2685 | int level; |
2604 | int ret = 0; | 2686 | int ret = 0; |
2605 | int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, | 2687 | int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, |
2606 | u64, u64, u64, u64, u64, u64); | 2688 | u64, u64, u64, u64, u64, u64, int); |
2607 | 2689 | ||
2608 | ref_root = btrfs_header_owner(buf); | 2690 | ref_root = btrfs_header_owner(buf); |
2609 | nritems = btrfs_header_nritems(buf); | 2691 | nritems = btrfs_header_nritems(buf); |
@@ -2640,14 +2722,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, | |||
2640 | key.offset -= btrfs_file_extent_offset(buf, fi); | 2722 | key.offset -= btrfs_file_extent_offset(buf, fi); |
2641 | ret = process_func(trans, root, bytenr, num_bytes, | 2723 | ret = process_func(trans, root, bytenr, num_bytes, |
2642 | parent, ref_root, key.objectid, | 2724 | parent, ref_root, key.objectid, |
2643 | key.offset); | 2725 | key.offset, for_cow); |
2644 | if (ret) | 2726 | if (ret) |
2645 | goto fail; | 2727 | goto fail; |
2646 | } else { | 2728 | } else { |
2647 | bytenr = btrfs_node_blockptr(buf, i); | 2729 | bytenr = btrfs_node_blockptr(buf, i); |
2648 | num_bytes = btrfs_level_size(root, level - 1); | 2730 | num_bytes = btrfs_level_size(root, level - 1); |
2649 | ret = process_func(trans, root, bytenr, num_bytes, | 2731 | ret = process_func(trans, root, bytenr, num_bytes, |
2650 | parent, ref_root, level - 1, 0); | 2732 | parent, ref_root, level - 1, 0, |
2733 | for_cow); | ||
2651 | if (ret) | 2734 | if (ret) |
2652 | goto fail; | 2735 | goto fail; |
2653 | } | 2736 | } |
@@ -2659,15 +2742,15 @@ fail: | |||
2659 | } | 2742 | } |
2660 | 2743 | ||
2661 | int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 2744 | int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
2662 | struct extent_buffer *buf, int full_backref) | 2745 | struct extent_buffer *buf, int full_backref, int for_cow) |
2663 | { | 2746 | { |
2664 | return __btrfs_mod_ref(trans, root, buf, full_backref, 1); | 2747 | return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); |
2665 | } | 2748 | } |
2666 | 2749 | ||
2667 | int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 2750 | int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
2668 | struct extent_buffer *buf, int full_backref) | 2751 | struct extent_buffer *buf, int full_backref, int for_cow) |
2669 | { | 2752 | { |
2670 | return __btrfs_mod_ref(trans, root, buf, full_backref, 0); | 2753 | return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); |
2671 | } | 2754 | } |
2672 | 2755 | ||
2673 | static int write_one_cache_group(struct btrfs_trans_handle *trans, | 2756 | static int write_one_cache_group(struct btrfs_trans_handle *trans, |
@@ -2993,9 +3076,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, | |||
2993 | INIT_LIST_HEAD(&found->block_groups[i]); | 3076 | INIT_LIST_HEAD(&found->block_groups[i]); |
2994 | init_rwsem(&found->groups_sem); | 3077 | init_rwsem(&found->groups_sem); |
2995 | spin_lock_init(&found->lock); | 3078 | spin_lock_init(&found->lock); |
2996 | found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | | 3079 | found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; |
2997 | BTRFS_BLOCK_GROUP_SYSTEM | | ||
2998 | BTRFS_BLOCK_GROUP_METADATA); | ||
2999 | found->total_bytes = total_bytes; | 3080 | found->total_bytes = total_bytes; |
3000 | found->disk_total = total_bytes * factor; | 3081 | found->disk_total = total_bytes * factor; |
3001 | found->bytes_used = bytes_used; | 3082 | found->bytes_used = bytes_used; |
@@ -3016,20 +3097,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, | |||
3016 | 3097 | ||
3017 | static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) | 3098 | static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) |
3018 | { | 3099 | { |
3019 | u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | | 3100 | u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; |
3020 | BTRFS_BLOCK_GROUP_RAID1 | | 3101 | |
3021 | BTRFS_BLOCK_GROUP_RAID10 | | 3102 | /* chunk -> extended profile */ |
3022 | BTRFS_BLOCK_GROUP_DUP); | 3103 | if (extra_flags == 0) |
3023 | if (extra_flags) { | 3104 | extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; |
3024 | if (flags & BTRFS_BLOCK_GROUP_DATA) | 3105 | |
3025 | fs_info->avail_data_alloc_bits |= extra_flags; | 3106 | if (flags & BTRFS_BLOCK_GROUP_DATA) |
3026 | if (flags & BTRFS_BLOCK_GROUP_METADATA) | 3107 | fs_info->avail_data_alloc_bits |= extra_flags; |
3027 | fs_info->avail_metadata_alloc_bits |= extra_flags; | 3108 | if (flags & BTRFS_BLOCK_GROUP_METADATA) |
3028 | if (flags & BTRFS_BLOCK_GROUP_SYSTEM) | 3109 | fs_info->avail_metadata_alloc_bits |= extra_flags; |
3029 | fs_info->avail_system_alloc_bits |= extra_flags; | 3110 | if (flags & BTRFS_BLOCK_GROUP_SYSTEM) |
3030 | } | 3111 | fs_info->avail_system_alloc_bits |= extra_flags; |
3031 | } | 3112 | } |
3032 | 3113 | ||
3114 | /* | ||
3115 | * @flags: available profiles in extended format (see ctree.h) | ||
3116 | * | ||
3117 | * Returns reduced profile in chunk format. If profile changing is in | ||
3118 | * progress (either running or paused) picks the target profile (if it's | ||
3119 | * already available), otherwise falls back to plain reducing. | ||
3120 | */ | ||
3033 | u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) | 3121 | u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) |
3034 | { | 3122 | { |
3035 | /* | 3123 | /* |
@@ -3040,6 +3128,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) | |||
3040 | u64 num_devices = root->fs_info->fs_devices->rw_devices + | 3128 | u64 num_devices = root->fs_info->fs_devices->rw_devices + |
3041 | root->fs_info->fs_devices->missing_devices; | 3129 | root->fs_info->fs_devices->missing_devices; |
3042 | 3130 | ||
3131 | /* pick restriper's target profile if it's available */ | ||
3132 | spin_lock(&root->fs_info->balance_lock); | ||
3133 | if (root->fs_info->balance_ctl) { | ||
3134 | struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; | ||
3135 | u64 tgt = 0; | ||
3136 | |||
3137 | if ((flags & BTRFS_BLOCK_GROUP_DATA) && | ||
3138 | (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && | ||
3139 | (flags & bctl->data.target)) { | ||
3140 | tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; | ||
3141 | } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) && | ||
3142 | (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && | ||
3143 | (flags & bctl->sys.target)) { | ||
3144 | tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; | ||
3145 | } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) && | ||
3146 | (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && | ||
3147 | (flags & bctl->meta.target)) { | ||
3148 | tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; | ||
3149 | } | ||
3150 | |||
3151 | if (tgt) { | ||
3152 | spin_unlock(&root->fs_info->balance_lock); | ||
3153 | flags = tgt; | ||
3154 | goto out; | ||
3155 | } | ||
3156 | } | ||
3157 | spin_unlock(&root->fs_info->balance_lock); | ||
3158 | |||
3043 | if (num_devices == 1) | 3159 | if (num_devices == 1) |
3044 | flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); | 3160 | flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); |
3045 | if (num_devices < 4) | 3161 | if (num_devices < 4) |
@@ -3059,22 +3175,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) | |||
3059 | if ((flags & BTRFS_BLOCK_GROUP_RAID0) && | 3175 | if ((flags & BTRFS_BLOCK_GROUP_RAID0) && |
3060 | ((flags & BTRFS_BLOCK_GROUP_RAID1) | | 3176 | ((flags & BTRFS_BLOCK_GROUP_RAID1) | |
3061 | (flags & BTRFS_BLOCK_GROUP_RAID10) | | 3177 | (flags & BTRFS_BLOCK_GROUP_RAID10) | |
3062 | (flags & BTRFS_BLOCK_GROUP_DUP))) | 3178 | (flags & BTRFS_BLOCK_GROUP_DUP))) { |
3063 | flags &= ~BTRFS_BLOCK_GROUP_RAID0; | 3179 | flags &= ~BTRFS_BLOCK_GROUP_RAID0; |
3180 | } | ||
3181 | |||
3182 | out: | ||
3183 | /* extended -> chunk profile */ | ||
3184 | flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; | ||
3064 | return flags; | 3185 | return flags; |
3065 | } | 3186 | } |
3066 | 3187 | ||
3067 | static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) | 3188 | static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) |
3068 | { | 3189 | { |
3069 | if (flags & BTRFS_BLOCK_GROUP_DATA) | 3190 | if (flags & BTRFS_BLOCK_GROUP_DATA) |
3070 | flags |= root->fs_info->avail_data_alloc_bits & | 3191 | flags |= root->fs_info->avail_data_alloc_bits; |
3071 | root->fs_info->data_alloc_profile; | ||
3072 | else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) | 3192 | else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) |
3073 | flags |= root->fs_info->avail_system_alloc_bits & | 3193 | flags |= root->fs_info->avail_system_alloc_bits; |
3074 | root->fs_info->system_alloc_profile; | ||
3075 | else if (flags & BTRFS_BLOCK_GROUP_METADATA) | 3194 | else if (flags & BTRFS_BLOCK_GROUP_METADATA) |
3076 | flags |= root->fs_info->avail_metadata_alloc_bits & | 3195 | flags |= root->fs_info->avail_metadata_alloc_bits; |
3077 | root->fs_info->metadata_alloc_profile; | 3196 | |
3078 | return btrfs_reduce_alloc_profile(root, flags); | 3197 | return btrfs_reduce_alloc_profile(root, flags); |
3079 | } | 3198 | } |
3080 | 3199 | ||
@@ -3191,6 +3310,8 @@ commit_trans: | |||
3191 | return -ENOSPC; | 3310 | return -ENOSPC; |
3192 | } | 3311 | } |
3193 | data_sinfo->bytes_may_use += bytes; | 3312 | data_sinfo->bytes_may_use += bytes; |
3313 | trace_btrfs_space_reservation(root->fs_info, "space_info", | ||
3314 | (u64)data_sinfo, bytes, 1); | ||
3194 | spin_unlock(&data_sinfo->lock); | 3315 | spin_unlock(&data_sinfo->lock); |
3195 | 3316 | ||
3196 | return 0; | 3317 | return 0; |
@@ -3210,6 +3331,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) | |||
3210 | data_sinfo = BTRFS_I(inode)->space_info; | 3331 | data_sinfo = BTRFS_I(inode)->space_info; |
3211 | spin_lock(&data_sinfo->lock); | 3332 | spin_lock(&data_sinfo->lock); |
3212 | data_sinfo->bytes_may_use -= bytes; | 3333 | data_sinfo->bytes_may_use -= bytes; |
3334 | trace_btrfs_space_reservation(root->fs_info, "space_info", | ||
3335 | (u64)data_sinfo, bytes, 0); | ||
3213 | spin_unlock(&data_sinfo->lock); | 3336 | spin_unlock(&data_sinfo->lock); |
3214 | } | 3337 | } |
3215 | 3338 | ||
@@ -3257,27 +3380,15 @@ static int should_alloc_chunk(struct btrfs_root *root, | |||
3257 | if (num_bytes - num_allocated < thresh) | 3380 | if (num_bytes - num_allocated < thresh) |
3258 | return 1; | 3381 | return 1; |
3259 | } | 3382 | } |
3260 | |||
3261 | /* | ||
3262 | * we have two similar checks here, one based on percentage | ||
3263 | * and once based on a hard number of 256MB. The idea | ||
3264 | * is that if we have a good amount of free | ||
3265 | * room, don't allocate a chunk. A good mount is | ||
3266 | * less than 80% utilized of the chunks we have allocated, | ||
3267 | * or more than 256MB free | ||
3268 | */ | ||
3269 | if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes) | ||
3270 | return 0; | ||
3271 | |||
3272 | if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) | ||
3273 | return 0; | ||
3274 | |||
3275 | thresh = btrfs_super_total_bytes(root->fs_info->super_copy); | 3383 | thresh = btrfs_super_total_bytes(root->fs_info->super_copy); |
3276 | 3384 | ||
3277 | /* 256MB or 5% of the FS */ | 3385 | /* 256MB or 2% of the FS */ |
3278 | thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); | 3386 | thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); |
3387 | /* system chunks need a much small threshold */ | ||
3388 | if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM) | ||
3389 | thresh = 32 * 1024 * 1024; | ||
3279 | 3390 | ||
3280 | if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) | 3391 | if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) |
3281 | return 0; | 3392 | return 0; |
3282 | return 1; | 3393 | return 1; |
3283 | } | 3394 | } |
@@ -3291,7 +3402,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, | |||
3291 | int wait_for_alloc = 0; | 3402 | int wait_for_alloc = 0; |
3292 | int ret = 0; | 3403 | int ret = 0; |
3293 | 3404 | ||
3294 | flags = btrfs_reduce_alloc_profile(extent_root, flags); | 3405 | BUG_ON(!profile_is_valid(flags, 0)); |
3295 | 3406 | ||
3296 | space_info = __find_space_info(extent_root->fs_info, flags); | 3407 | space_info = __find_space_info(extent_root->fs_info, flags); |
3297 | if (!space_info) { | 3408 | if (!space_info) { |
@@ -3582,6 +3693,10 @@ again: | |||
3582 | if (used <= space_info->total_bytes) { | 3693 | if (used <= space_info->total_bytes) { |
3583 | if (used + orig_bytes <= space_info->total_bytes) { | 3694 | if (used + orig_bytes <= space_info->total_bytes) { |
3584 | space_info->bytes_may_use += orig_bytes; | 3695 | space_info->bytes_may_use += orig_bytes; |
3696 | trace_btrfs_space_reservation(root->fs_info, | ||
3697 | "space_info", | ||
3698 | (u64)space_info, | ||
3699 | orig_bytes, 1); | ||
3585 | ret = 0; | 3700 | ret = 0; |
3586 | } else { | 3701 | } else { |
3587 | /* | 3702 | /* |
@@ -3649,6 +3764,10 @@ again: | |||
3649 | 3764 | ||
3650 | if (used + num_bytes < space_info->total_bytes + avail) { | 3765 | if (used + num_bytes < space_info->total_bytes + avail) { |
3651 | space_info->bytes_may_use += orig_bytes; | 3766 | space_info->bytes_may_use += orig_bytes; |
3767 | trace_btrfs_space_reservation(root->fs_info, | ||
3768 | "space_info", | ||
3769 | (u64)space_info, | ||
3770 | orig_bytes, 1); | ||
3652 | ret = 0; | 3771 | ret = 0; |
3653 | } else { | 3772 | } else { |
3654 | wait_ordered = true; | 3773 | wait_ordered = true; |
@@ -3755,7 +3874,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, | |||
3755 | spin_unlock(&block_rsv->lock); | 3874 | spin_unlock(&block_rsv->lock); |
3756 | } | 3875 | } |
3757 | 3876 | ||
3758 | static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, | 3877 | static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, |
3878 | struct btrfs_block_rsv *block_rsv, | ||
3759 | struct btrfs_block_rsv *dest, u64 num_bytes) | 3879 | struct btrfs_block_rsv *dest, u64 num_bytes) |
3760 | { | 3880 | { |
3761 | struct btrfs_space_info *space_info = block_rsv->space_info; | 3881 | struct btrfs_space_info *space_info = block_rsv->space_info; |
@@ -3791,6 +3911,9 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, | |||
3791 | if (num_bytes) { | 3911 | if (num_bytes) { |
3792 | spin_lock(&space_info->lock); | 3912 | spin_lock(&space_info->lock); |
3793 | space_info->bytes_may_use -= num_bytes; | 3913 | space_info->bytes_may_use -= num_bytes; |
3914 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
3915 | (u64)space_info, | ||
3916 | num_bytes, 0); | ||
3794 | space_info->reservation_progress++; | 3917 | space_info->reservation_progress++; |
3795 | spin_unlock(&space_info->lock); | 3918 | spin_unlock(&space_info->lock); |
3796 | } | 3919 | } |
@@ -3947,7 +4070,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root, | |||
3947 | if (global_rsv->full || global_rsv == block_rsv || | 4070 | if (global_rsv->full || global_rsv == block_rsv || |
3948 | block_rsv->space_info != global_rsv->space_info) | 4071 | block_rsv->space_info != global_rsv->space_info) |
3949 | global_rsv = NULL; | 4072 | global_rsv = NULL; |
3950 | block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); | 4073 | block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, |
4074 | num_bytes); | ||
3951 | } | 4075 | } |
3952 | 4076 | ||
3953 | /* | 4077 | /* |
@@ -4006,11 +4130,15 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) | |||
4006 | num_bytes = sinfo->total_bytes - num_bytes; | 4130 | num_bytes = sinfo->total_bytes - num_bytes; |
4007 | block_rsv->reserved += num_bytes; | 4131 | block_rsv->reserved += num_bytes; |
4008 | sinfo->bytes_may_use += num_bytes; | 4132 | sinfo->bytes_may_use += num_bytes; |
4133 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
4134 | (u64)sinfo, num_bytes, 1); | ||
4009 | } | 4135 | } |
4010 | 4136 | ||
4011 | if (block_rsv->reserved >= block_rsv->size) { | 4137 | if (block_rsv->reserved >= block_rsv->size) { |
4012 | num_bytes = block_rsv->reserved - block_rsv->size; | 4138 | num_bytes = block_rsv->reserved - block_rsv->size; |
4013 | sinfo->bytes_may_use -= num_bytes; | 4139 | sinfo->bytes_may_use -= num_bytes; |
4140 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
4141 | (u64)sinfo, num_bytes, 0); | ||
4014 | sinfo->reservation_progress++; | 4142 | sinfo->reservation_progress++; |
4015 | block_rsv->reserved = block_rsv->size; | 4143 | block_rsv->reserved = block_rsv->size; |
4016 | block_rsv->full = 1; | 4144 | block_rsv->full = 1; |
@@ -4045,7 +4173,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) | |||
4045 | 4173 | ||
4046 | static void release_global_block_rsv(struct btrfs_fs_info *fs_info) | 4174 | static void release_global_block_rsv(struct btrfs_fs_info *fs_info) |
4047 | { | 4175 | { |
4048 | block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); | 4176 | block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, |
4177 | (u64)-1); | ||
4049 | WARN_ON(fs_info->delalloc_block_rsv.size > 0); | 4178 | WARN_ON(fs_info->delalloc_block_rsv.size > 0); |
4050 | WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); | 4179 | WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); |
4051 | WARN_ON(fs_info->trans_block_rsv.size > 0); | 4180 | WARN_ON(fs_info->trans_block_rsv.size > 0); |
@@ -4062,6 +4191,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, | |||
4062 | if (!trans->bytes_reserved) | 4191 | if (!trans->bytes_reserved) |
4063 | return; | 4192 | return; |
4064 | 4193 | ||
4194 | trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans, | ||
4195 | trans->bytes_reserved, 0); | ||
4065 | btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); | 4196 | btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); |
4066 | trans->bytes_reserved = 0; | 4197 | trans->bytes_reserved = 0; |
4067 | } | 4198 | } |
@@ -4079,6 +4210,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, | |||
4079 | * when we are truly done with the orphan item. | 4210 | * when we are truly done with the orphan item. |
4080 | */ | 4211 | */ |
4081 | u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); | 4212 | u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); |
4213 | trace_btrfs_space_reservation(root->fs_info, "orphan", | ||
4214 | btrfs_ino(inode), num_bytes, 1); | ||
4082 | return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); | 4215 | return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); |
4083 | } | 4216 | } |
4084 | 4217 | ||
@@ -4086,6 +4219,8 @@ void btrfs_orphan_release_metadata(struct inode *inode) | |||
4086 | { | 4219 | { |
4087 | struct btrfs_root *root = BTRFS_I(inode)->root; | 4220 | struct btrfs_root *root = BTRFS_I(inode)->root; |
4088 | u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); | 4221 | u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); |
4222 | trace_btrfs_space_reservation(root->fs_info, "orphan", | ||
4223 | btrfs_ino(inode), num_bytes, 0); | ||
4089 | btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); | 4224 | btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); |
4090 | } | 4225 | } |
4091 | 4226 | ||
@@ -4213,12 +4348,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4213 | /* Need to be holding the i_mutex here if we aren't free space cache */ | 4348 | /* Need to be holding the i_mutex here if we aren't free space cache */ |
4214 | if (btrfs_is_free_space_inode(root, inode)) | 4349 | if (btrfs_is_free_space_inode(root, inode)) |
4215 | flush = 0; | 4350 | flush = 0; |
4216 | else | ||
4217 | WARN_ON(!mutex_is_locked(&inode->i_mutex)); | ||
4218 | 4351 | ||
4219 | if (flush && btrfs_transaction_in_commit(root->fs_info)) | 4352 | if (flush && btrfs_transaction_in_commit(root->fs_info)) |
4220 | schedule_timeout(1); | 4353 | schedule_timeout(1); |
4221 | 4354 | ||
4355 | mutex_lock(&BTRFS_I(inode)->delalloc_mutex); | ||
4222 | num_bytes = ALIGN(num_bytes, root->sectorsize); | 4356 | num_bytes = ALIGN(num_bytes, root->sectorsize); |
4223 | 4357 | ||
4224 | spin_lock(&BTRFS_I(inode)->lock); | 4358 | spin_lock(&BTRFS_I(inode)->lock); |
@@ -4266,8 +4400,14 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4266 | if (dropped) | 4400 | if (dropped) |
4267 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | 4401 | to_free += btrfs_calc_trans_metadata_size(root, dropped); |
4268 | 4402 | ||
4269 | if (to_free) | 4403 | if (to_free) { |
4270 | btrfs_block_rsv_release(root, block_rsv, to_free); | 4404 | btrfs_block_rsv_release(root, block_rsv, to_free); |
4405 | trace_btrfs_space_reservation(root->fs_info, | ||
4406 | "delalloc", | ||
4407 | btrfs_ino(inode), | ||
4408 | to_free, 0); | ||
4409 | } | ||
4410 | mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); | ||
4271 | return ret; | 4411 | return ret; |
4272 | } | 4412 | } |
4273 | 4413 | ||
@@ -4278,7 +4418,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4278 | } | 4418 | } |
4279 | BTRFS_I(inode)->reserved_extents += nr_extents; | 4419 | BTRFS_I(inode)->reserved_extents += nr_extents; |
4280 | spin_unlock(&BTRFS_I(inode)->lock); | 4420 | spin_unlock(&BTRFS_I(inode)->lock); |
4421 | mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); | ||
4281 | 4422 | ||
4423 | if (to_reserve) | ||
4424 | trace_btrfs_space_reservation(root->fs_info,"delalloc", | ||
4425 | btrfs_ino(inode), to_reserve, 1); | ||
4282 | block_rsv_add_bytes(block_rsv, to_reserve, 1); | 4426 | block_rsv_add_bytes(block_rsv, to_reserve, 1); |
4283 | 4427 | ||
4284 | return 0; | 4428 | return 0; |
@@ -4308,6 +4452,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) | |||
4308 | if (dropped > 0) | 4452 | if (dropped > 0) |
4309 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | 4453 | to_free += btrfs_calc_trans_metadata_size(root, dropped); |
4310 | 4454 | ||
4455 | trace_btrfs_space_reservation(root->fs_info, "delalloc", | ||
4456 | btrfs_ino(inode), to_free, 0); | ||
4311 | btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, | 4457 | btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, |
4312 | to_free); | 4458 | to_free); |
4313 | } | 4459 | } |
@@ -4562,7 +4708,10 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, | |||
4562 | cache->reserved += num_bytes; | 4708 | cache->reserved += num_bytes; |
4563 | space_info->bytes_reserved += num_bytes; | 4709 | space_info->bytes_reserved += num_bytes; |
4564 | if (reserve == RESERVE_ALLOC) { | 4710 | if (reserve == RESERVE_ALLOC) { |
4565 | BUG_ON(space_info->bytes_may_use < num_bytes); | 4711 | trace_btrfs_space_reservation(cache->fs_info, |
4712 | "space_info", | ||
4713 | (u64)space_info, | ||
4714 | num_bytes, 0); | ||
4566 | space_info->bytes_may_use -= num_bytes; | 4715 | space_info->bytes_may_use -= num_bytes; |
4567 | } | 4716 | } |
4568 | } | 4717 | } |
@@ -4928,6 +5077,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, | |||
4928 | rb_erase(&head->node.rb_node, &delayed_refs->root); | 5077 | rb_erase(&head->node.rb_node, &delayed_refs->root); |
4929 | 5078 | ||
4930 | delayed_refs->num_entries--; | 5079 | delayed_refs->num_entries--; |
5080 | if (waitqueue_active(&delayed_refs->seq_wait)) | ||
5081 | wake_up(&delayed_refs->seq_wait); | ||
4931 | 5082 | ||
4932 | /* | 5083 | /* |
4933 | * we don't take a ref on the node because we're removing it from the | 5084 | * we don't take a ref on the node because we're removing it from the |
@@ -4955,16 +5106,17 @@ out: | |||
4955 | void btrfs_free_tree_block(struct btrfs_trans_handle *trans, | 5106 | void btrfs_free_tree_block(struct btrfs_trans_handle *trans, |
4956 | struct btrfs_root *root, | 5107 | struct btrfs_root *root, |
4957 | struct extent_buffer *buf, | 5108 | struct extent_buffer *buf, |
4958 | u64 parent, int last_ref) | 5109 | u64 parent, int last_ref, int for_cow) |
4959 | { | 5110 | { |
4960 | struct btrfs_block_group_cache *cache = NULL; | 5111 | struct btrfs_block_group_cache *cache = NULL; |
4961 | int ret; | 5112 | int ret; |
4962 | 5113 | ||
4963 | if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { | 5114 | if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { |
4964 | ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, | 5115 | ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, |
4965 | parent, root->root_key.objectid, | 5116 | buf->start, buf->len, |
4966 | btrfs_header_level(buf), | 5117 | parent, root->root_key.objectid, |
4967 | BTRFS_DROP_DELAYED_REF, NULL); | 5118 | btrfs_header_level(buf), |
5119 | BTRFS_DROP_DELAYED_REF, NULL, for_cow); | ||
4968 | BUG_ON(ret); | 5120 | BUG_ON(ret); |
4969 | } | 5121 | } |
4970 | 5122 | ||
@@ -4999,12 +5151,12 @@ out: | |||
4999 | btrfs_put_block_group(cache); | 5151 | btrfs_put_block_group(cache); |
5000 | } | 5152 | } |
5001 | 5153 | ||
5002 | int btrfs_free_extent(struct btrfs_trans_handle *trans, | 5154 | int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
5003 | struct btrfs_root *root, | 5155 | u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, |
5004 | u64 bytenr, u64 num_bytes, u64 parent, | 5156 | u64 owner, u64 offset, int for_cow) |
5005 | u64 root_objectid, u64 owner, u64 offset) | ||
5006 | { | 5157 | { |
5007 | int ret; | 5158 | int ret; |
5159 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
5008 | 5160 | ||
5009 | /* | 5161 | /* |
5010 | * tree log blocks never actually go into the extent allocation | 5162 | * tree log blocks never actually go into the extent allocation |
@@ -5016,14 +5168,17 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, | |||
5016 | btrfs_pin_extent(root, bytenr, num_bytes, 1); | 5168 | btrfs_pin_extent(root, bytenr, num_bytes, 1); |
5017 | ret = 0; | 5169 | ret = 0; |
5018 | } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { | 5170 | } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { |
5019 | ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, | 5171 | ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, |
5172 | num_bytes, | ||
5020 | parent, root_objectid, (int)owner, | 5173 | parent, root_objectid, (int)owner, |
5021 | BTRFS_DROP_DELAYED_REF, NULL); | 5174 | BTRFS_DROP_DELAYED_REF, NULL, for_cow); |
5022 | BUG_ON(ret); | 5175 | BUG_ON(ret); |
5023 | } else { | 5176 | } else { |
5024 | ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, | 5177 | ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, |
5025 | parent, root_objectid, owner, | 5178 | num_bytes, |
5026 | offset, BTRFS_DROP_DELAYED_REF, NULL); | 5179 | parent, root_objectid, owner, |
5180 | offset, BTRFS_DROP_DELAYED_REF, | ||
5181 | NULL, for_cow); | ||
5027 | BUG_ON(ret); | 5182 | BUG_ON(ret); |
5028 | } | 5183 | } |
5029 | return ret; | 5184 | return ret; |
@@ -5146,6 +5301,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, | |||
5146 | ins->objectid = 0; | 5301 | ins->objectid = 0; |
5147 | ins->offset = 0; | 5302 | ins->offset = 0; |
5148 | 5303 | ||
5304 | trace_find_free_extent(orig_root, num_bytes, empty_size, data); | ||
5305 | |||
5149 | space_info = __find_space_info(root->fs_info, data); | 5306 | space_info = __find_space_info(root->fs_info, data); |
5150 | if (!space_info) { | 5307 | if (!space_info) { |
5151 | printk(KERN_ERR "No space info for %llu\n", data); | 5308 | printk(KERN_ERR "No space info for %llu\n", data); |
@@ -5295,15 +5452,6 @@ alloc: | |||
5295 | if (unlikely(block_group->ro)) | 5452 | if (unlikely(block_group->ro)) |
5296 | goto loop; | 5453 | goto loop; |
5297 | 5454 | ||
5298 | spin_lock(&block_group->free_space_ctl->tree_lock); | ||
5299 | if (cached && | ||
5300 | block_group->free_space_ctl->free_space < | ||
5301 | num_bytes + empty_cluster + empty_size) { | ||
5302 | spin_unlock(&block_group->free_space_ctl->tree_lock); | ||
5303 | goto loop; | ||
5304 | } | ||
5305 | spin_unlock(&block_group->free_space_ctl->tree_lock); | ||
5306 | |||
5307 | /* | 5455 | /* |
5308 | * Ok we want to try and use the cluster allocator, so | 5456 | * Ok we want to try and use the cluster allocator, so |
5309 | * lets look there | 5457 | * lets look there |
@@ -5331,6 +5479,8 @@ alloc: | |||
5331 | if (offset) { | 5479 | if (offset) { |
5332 | /* we have a block, we're done */ | 5480 | /* we have a block, we're done */ |
5333 | spin_unlock(&last_ptr->refill_lock); | 5481 | spin_unlock(&last_ptr->refill_lock); |
5482 | trace_btrfs_reserve_extent_cluster(root, | ||
5483 | block_group, search_start, num_bytes); | ||
5334 | goto checks; | 5484 | goto checks; |
5335 | } | 5485 | } |
5336 | 5486 | ||
@@ -5349,8 +5499,15 @@ refill_cluster: | |||
5349 | * plenty of times and not have found | 5499 | * plenty of times and not have found |
5350 | * anything, so we are likely way too | 5500 | * anything, so we are likely way too |
5351 | * fragmented for the clustering stuff to find | 5501 | * fragmented for the clustering stuff to find |
5352 | * anything. */ | 5502 | * anything. |
5353 | if (loop >= LOOP_NO_EMPTY_SIZE) { | 5503 | * |
5504 | * However, if the cluster is taken from the | ||
5505 | * current block group, release the cluster | ||
5506 | * first, so that we stand a better chance of | ||
5507 | * succeeding in the unclustered | ||
5508 | * allocation. */ | ||
5509 | if (loop >= LOOP_NO_EMPTY_SIZE && | ||
5510 | last_ptr->block_group != block_group) { | ||
5354 | spin_unlock(&last_ptr->refill_lock); | 5511 | spin_unlock(&last_ptr->refill_lock); |
5355 | goto unclustered_alloc; | 5512 | goto unclustered_alloc; |
5356 | } | 5513 | } |
@@ -5361,6 +5518,11 @@ refill_cluster: | |||
5361 | */ | 5518 | */ |
5362 | btrfs_return_cluster_to_free_space(NULL, last_ptr); | 5519 | btrfs_return_cluster_to_free_space(NULL, last_ptr); |
5363 | 5520 | ||
5521 | if (loop >= LOOP_NO_EMPTY_SIZE) { | ||
5522 | spin_unlock(&last_ptr->refill_lock); | ||
5523 | goto unclustered_alloc; | ||
5524 | } | ||
5525 | |||
5364 | /* allocate a cluster in this block group */ | 5526 | /* allocate a cluster in this block group */ |
5365 | ret = btrfs_find_space_cluster(trans, root, | 5527 | ret = btrfs_find_space_cluster(trans, root, |
5366 | block_group, last_ptr, | 5528 | block_group, last_ptr, |
@@ -5377,6 +5539,9 @@ refill_cluster: | |||
5377 | if (offset) { | 5539 | if (offset) { |
5378 | /* we found one, proceed */ | 5540 | /* we found one, proceed */ |
5379 | spin_unlock(&last_ptr->refill_lock); | 5541 | spin_unlock(&last_ptr->refill_lock); |
5542 | trace_btrfs_reserve_extent_cluster(root, | ||
5543 | block_group, search_start, | ||
5544 | num_bytes); | ||
5380 | goto checks; | 5545 | goto checks; |
5381 | } | 5546 | } |
5382 | } else if (!cached && loop > LOOP_CACHING_NOWAIT | 5547 | } else if (!cached && loop > LOOP_CACHING_NOWAIT |
@@ -5401,6 +5566,15 @@ refill_cluster: | |||
5401 | } | 5566 | } |
5402 | 5567 | ||
5403 | unclustered_alloc: | 5568 | unclustered_alloc: |
5569 | spin_lock(&block_group->free_space_ctl->tree_lock); | ||
5570 | if (cached && | ||
5571 | block_group->free_space_ctl->free_space < | ||
5572 | num_bytes + empty_cluster + empty_size) { | ||
5573 | spin_unlock(&block_group->free_space_ctl->tree_lock); | ||
5574 | goto loop; | ||
5575 | } | ||
5576 | spin_unlock(&block_group->free_space_ctl->tree_lock); | ||
5577 | |||
5404 | offset = btrfs_find_space_for_alloc(block_group, search_start, | 5578 | offset = btrfs_find_space_for_alloc(block_group, search_start, |
5405 | num_bytes, empty_size); | 5579 | num_bytes, empty_size); |
5406 | /* | 5580 | /* |
@@ -5438,9 +5612,6 @@ checks: | |||
5438 | goto loop; | 5612 | goto loop; |
5439 | } | 5613 | } |
5440 | 5614 | ||
5441 | ins->objectid = search_start; | ||
5442 | ins->offset = num_bytes; | ||
5443 | |||
5444 | if (offset < search_start) | 5615 | if (offset < search_start) |
5445 | btrfs_add_free_space(used_block_group, offset, | 5616 | btrfs_add_free_space(used_block_group, offset, |
5446 | search_start - offset); | 5617 | search_start - offset); |
@@ -5457,6 +5628,8 @@ checks: | |||
5457 | ins->objectid = search_start; | 5628 | ins->objectid = search_start; |
5458 | ins->offset = num_bytes; | 5629 | ins->offset = num_bytes; |
5459 | 5630 | ||
5631 | trace_btrfs_reserve_extent(orig_root, block_group, | ||
5632 | search_start, num_bytes); | ||
5460 | if (offset < search_start) | 5633 | if (offset < search_start) |
5461 | btrfs_add_free_space(used_block_group, offset, | 5634 | btrfs_add_free_space(used_block_group, offset, |
5462 | search_start - offset); | 5635 | search_start - offset); |
@@ -5842,9 +6015,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, | |||
5842 | 6015 | ||
5843 | BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); | 6016 | BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); |
5844 | 6017 | ||
5845 | ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, | 6018 | ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, |
5846 | 0, root_objectid, owner, offset, | 6019 | ins->offset, 0, |
5847 | BTRFS_ADD_DELAYED_EXTENT, NULL); | 6020 | root_objectid, owner, offset, |
6021 | BTRFS_ADD_DELAYED_EXTENT, NULL, 0); | ||
5848 | return ret; | 6022 | return ret; |
5849 | } | 6023 | } |
5850 | 6024 | ||
@@ -5997,10 +6171,11 @@ use_block_rsv(struct btrfs_trans_handle *trans, | |||
5997 | return ERR_PTR(-ENOSPC); | 6171 | return ERR_PTR(-ENOSPC); |
5998 | } | 6172 | } |
5999 | 6173 | ||
6000 | static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) | 6174 | static void unuse_block_rsv(struct btrfs_fs_info *fs_info, |
6175 | struct btrfs_block_rsv *block_rsv, u32 blocksize) | ||
6001 | { | 6176 | { |
6002 | block_rsv_add_bytes(block_rsv, blocksize, 0); | 6177 | block_rsv_add_bytes(block_rsv, blocksize, 0); |
6003 | block_rsv_release_bytes(block_rsv, NULL, 0); | 6178 | block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); |
6004 | } | 6179 | } |
6005 | 6180 | ||
6006 | /* | 6181 | /* |
@@ -6014,7 +6189,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, | |||
6014 | struct btrfs_root *root, u32 blocksize, | 6189 | struct btrfs_root *root, u32 blocksize, |
6015 | u64 parent, u64 root_objectid, | 6190 | u64 parent, u64 root_objectid, |
6016 | struct btrfs_disk_key *key, int level, | 6191 | struct btrfs_disk_key *key, int level, |
6017 | u64 hint, u64 empty_size) | 6192 | u64 hint, u64 empty_size, int for_cow) |
6018 | { | 6193 | { |
6019 | struct btrfs_key ins; | 6194 | struct btrfs_key ins; |
6020 | struct btrfs_block_rsv *block_rsv; | 6195 | struct btrfs_block_rsv *block_rsv; |
@@ -6030,7 +6205,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, | |||
6030 | ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, | 6205 | ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, |
6031 | empty_size, hint, (u64)-1, &ins, 0); | 6206 | empty_size, hint, (u64)-1, &ins, 0); |
6032 | if (ret) { | 6207 | if (ret) { |
6033 | unuse_block_rsv(block_rsv, blocksize); | 6208 | unuse_block_rsv(root->fs_info, block_rsv, blocksize); |
6034 | return ERR_PTR(ret); | 6209 | return ERR_PTR(ret); |
6035 | } | 6210 | } |
6036 | 6211 | ||
@@ -6058,10 +6233,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, | |||
6058 | extent_op->update_flags = 1; | 6233 | extent_op->update_flags = 1; |
6059 | extent_op->is_data = 0; | 6234 | extent_op->is_data = 0; |
6060 | 6235 | ||
6061 | ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, | 6236 | ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, |
6237 | ins.objectid, | ||
6062 | ins.offset, parent, root_objectid, | 6238 | ins.offset, parent, root_objectid, |
6063 | level, BTRFS_ADD_DELAYED_EXTENT, | 6239 | level, BTRFS_ADD_DELAYED_EXTENT, |
6064 | extent_op); | 6240 | extent_op, for_cow); |
6065 | BUG_ON(ret); | 6241 | BUG_ON(ret); |
6066 | } | 6242 | } |
6067 | return buf; | 6243 | return buf; |
@@ -6078,6 +6254,7 @@ struct walk_control { | |||
6078 | int keep_locks; | 6254 | int keep_locks; |
6079 | int reada_slot; | 6255 | int reada_slot; |
6080 | int reada_count; | 6256 | int reada_count; |
6257 | int for_reloc; | ||
6081 | }; | 6258 | }; |
6082 | 6259 | ||
6083 | #define DROP_REFERENCE 1 | 6260 | #define DROP_REFERENCE 1 |
@@ -6216,9 +6393,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, | |||
6216 | /* wc->stage == UPDATE_BACKREF */ | 6393 | /* wc->stage == UPDATE_BACKREF */ |
6217 | if (!(wc->flags[level] & flag)) { | 6394 | if (!(wc->flags[level] & flag)) { |
6218 | BUG_ON(!path->locks[level]); | 6395 | BUG_ON(!path->locks[level]); |
6219 | ret = btrfs_inc_ref(trans, root, eb, 1); | 6396 | ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); |
6220 | BUG_ON(ret); | 6397 | BUG_ON(ret); |
6221 | ret = btrfs_dec_ref(trans, root, eb, 0); | 6398 | ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); |
6222 | BUG_ON(ret); | 6399 | BUG_ON(ret); |
6223 | ret = btrfs_set_disk_extent_flags(trans, root, eb->start, | 6400 | ret = btrfs_set_disk_extent_flags(trans, root, eb->start, |
6224 | eb->len, flag, 0); | 6401 | eb->len, flag, 0); |
@@ -6362,7 +6539,7 @@ skip: | |||
6362 | } | 6539 | } |
6363 | 6540 | ||
6364 | ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, | 6541 | ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, |
6365 | root->root_key.objectid, level - 1, 0); | 6542 | root->root_key.objectid, level - 1, 0, 0); |
6366 | BUG_ON(ret); | 6543 | BUG_ON(ret); |
6367 | } | 6544 | } |
6368 | btrfs_tree_unlock(next); | 6545 | btrfs_tree_unlock(next); |
@@ -6436,9 +6613,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, | |||
6436 | if (wc->refs[level] == 1) { | 6613 | if (wc->refs[level] == 1) { |
6437 | if (level == 0) { | 6614 | if (level == 0) { |
6438 | if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) | 6615 | if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) |
6439 | ret = btrfs_dec_ref(trans, root, eb, 1); | 6616 | ret = btrfs_dec_ref(trans, root, eb, 1, |
6617 | wc->for_reloc); | ||
6440 | else | 6618 | else |
6441 | ret = btrfs_dec_ref(trans, root, eb, 0); | 6619 | ret = btrfs_dec_ref(trans, root, eb, 0, |
6620 | wc->for_reloc); | ||
6442 | BUG_ON(ret); | 6621 | BUG_ON(ret); |
6443 | } | 6622 | } |
6444 | /* make block locked assertion in clean_tree_block happy */ | 6623 | /* make block locked assertion in clean_tree_block happy */ |
@@ -6465,7 +6644,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, | |||
6465 | btrfs_header_owner(path->nodes[level + 1])); | 6644 | btrfs_header_owner(path->nodes[level + 1])); |
6466 | } | 6645 | } |
6467 | 6646 | ||
6468 | btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); | 6647 | btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); |
6469 | out: | 6648 | out: |
6470 | wc->refs[level] = 0; | 6649 | wc->refs[level] = 0; |
6471 | wc->flags[level] = 0; | 6650 | wc->flags[level] = 0; |
@@ -6549,7 +6728,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, | |||
6549 | * blocks are properly updated. | 6728 | * blocks are properly updated. |
6550 | */ | 6729 | */ |
6551 | void btrfs_drop_snapshot(struct btrfs_root *root, | 6730 | void btrfs_drop_snapshot(struct btrfs_root *root, |
6552 | struct btrfs_block_rsv *block_rsv, int update_ref) | 6731 | struct btrfs_block_rsv *block_rsv, int update_ref, |
6732 | int for_reloc) | ||
6553 | { | 6733 | { |
6554 | struct btrfs_path *path; | 6734 | struct btrfs_path *path; |
6555 | struct btrfs_trans_handle *trans; | 6735 | struct btrfs_trans_handle *trans; |
@@ -6637,6 +6817,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root, | |||
6637 | wc->stage = DROP_REFERENCE; | 6817 | wc->stage = DROP_REFERENCE; |
6638 | wc->update_ref = update_ref; | 6818 | wc->update_ref = update_ref; |
6639 | wc->keep_locks = 0; | 6819 | wc->keep_locks = 0; |
6820 | wc->for_reloc = for_reloc; | ||
6640 | wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); | 6821 | wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); |
6641 | 6822 | ||
6642 | while (1) { | 6823 | while (1) { |
@@ -6721,6 +6902,7 @@ out: | |||
6721 | * drop subtree rooted at tree block 'node'. | 6902 | * drop subtree rooted at tree block 'node'. |
6722 | * | 6903 | * |
6723 | * NOTE: this function will unlock and release tree block 'node' | 6904 | * NOTE: this function will unlock and release tree block 'node' |
6905 | * only used by relocation code | ||
6724 | */ | 6906 | */ |
6725 | int btrfs_drop_subtree(struct btrfs_trans_handle *trans, | 6907 | int btrfs_drop_subtree(struct btrfs_trans_handle *trans, |
6726 | struct btrfs_root *root, | 6908 | struct btrfs_root *root, |
@@ -6765,6 +6947,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, | |||
6765 | wc->stage = DROP_REFERENCE; | 6947 | wc->stage = DROP_REFERENCE; |
6766 | wc->update_ref = 0; | 6948 | wc->update_ref = 0; |
6767 | wc->keep_locks = 1; | 6949 | wc->keep_locks = 1; |
6950 | wc->for_reloc = 1; | ||
6768 | wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); | 6951 | wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); |
6769 | 6952 | ||
6770 | while (1) { | 6953 | while (1) { |
@@ -6792,6 +6975,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) | |||
6792 | u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | | 6975 | u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | |
6793 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; | 6976 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; |
6794 | 6977 | ||
6978 | if (root->fs_info->balance_ctl) { | ||
6979 | struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; | ||
6980 | u64 tgt = 0; | ||
6981 | |||
6982 | /* pick restriper's target profile and return */ | ||
6983 | if (flags & BTRFS_BLOCK_GROUP_DATA && | ||
6984 | bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { | ||
6985 | tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; | ||
6986 | } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && | ||
6987 | bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { | ||
6988 | tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; | ||
6989 | } else if (flags & BTRFS_BLOCK_GROUP_METADATA && | ||
6990 | bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { | ||
6991 | tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; | ||
6992 | } | ||
6993 | |||
6994 | if (tgt) { | ||
6995 | /* extended -> chunk profile */ | ||
6996 | tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; | ||
6997 | return tgt; | ||
6998 | } | ||
6999 | } | ||
7000 | |||
6795 | /* | 7001 | /* |
6796 | * we add in the count of missing devices because we want | 7002 | * we add in the count of missing devices because we want |
6797 | * to make sure that any RAID levels on a degraded FS | 7003 | * to make sure that any RAID levels on a degraded FS |
@@ -7085,7 +7291,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) | |||
7085 | * space to fit our block group in. | 7291 | * space to fit our block group in. |
7086 | */ | 7292 | */ |
7087 | if (device->total_bytes > device->bytes_used + min_free) { | 7293 | if (device->total_bytes > device->bytes_used + min_free) { |
7088 | ret = find_free_dev_extent(NULL, device, min_free, | 7294 | ret = find_free_dev_extent(device, min_free, |
7089 | &dev_offset, NULL); | 7295 | &dev_offset, NULL); |
7090 | if (!ret) | 7296 | if (!ret) |
7091 | dev_nr++; | 7297 | dev_nr++; |
@@ -7447,6 +7653,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, | |||
7447 | ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, | 7653 | ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, |
7448 | &cache->space_info); | 7654 | &cache->space_info); |
7449 | BUG_ON(ret); | 7655 | BUG_ON(ret); |
7656 | update_global_block_rsv(root->fs_info); | ||
7450 | 7657 | ||
7451 | spin_lock(&cache->space_info->lock); | 7658 | spin_lock(&cache->space_info->lock); |
7452 | cache->space_info->bytes_readonly += cache->bytes_super; | 7659 | cache->space_info->bytes_readonly += cache->bytes_super; |
@@ -7466,6 +7673,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, | |||
7466 | return 0; | 7673 | return 0; |
7467 | } | 7674 | } |
7468 | 7675 | ||
7676 | static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) | ||
7677 | { | ||
7678 | u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; | ||
7679 | |||
7680 | /* chunk -> extended profile */ | ||
7681 | if (extra_flags == 0) | ||
7682 | extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; | ||
7683 | |||
7684 | if (flags & BTRFS_BLOCK_GROUP_DATA) | ||
7685 | fs_info->avail_data_alloc_bits &= ~extra_flags; | ||
7686 | if (flags & BTRFS_BLOCK_GROUP_METADATA) | ||
7687 | fs_info->avail_metadata_alloc_bits &= ~extra_flags; | ||
7688 | if (flags & BTRFS_BLOCK_GROUP_SYSTEM) | ||
7689 | fs_info->avail_system_alloc_bits &= ~extra_flags; | ||
7690 | } | ||
7691 | |||
7469 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | 7692 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, |
7470 | struct btrfs_root *root, u64 group_start) | 7693 | struct btrfs_root *root, u64 group_start) |
7471 | { | 7694 | { |
@@ -7476,6 +7699,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
7476 | struct btrfs_key key; | 7699 | struct btrfs_key key; |
7477 | struct inode *inode; | 7700 | struct inode *inode; |
7478 | int ret; | 7701 | int ret; |
7702 | int index; | ||
7479 | int factor; | 7703 | int factor; |
7480 | 7704 | ||
7481 | root = root->fs_info->extent_root; | 7705 | root = root->fs_info->extent_root; |
@@ -7491,6 +7715,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
7491 | free_excluded_extents(root, block_group); | 7715 | free_excluded_extents(root, block_group); |
7492 | 7716 | ||
7493 | memcpy(&key, &block_group->key, sizeof(key)); | 7717 | memcpy(&key, &block_group->key, sizeof(key)); |
7718 | index = get_block_group_index(block_group); | ||
7494 | if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | | 7719 | if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | |
7495 | BTRFS_BLOCK_GROUP_RAID1 | | 7720 | BTRFS_BLOCK_GROUP_RAID1 | |
7496 | BTRFS_BLOCK_GROUP_RAID10)) | 7721 | BTRFS_BLOCK_GROUP_RAID10)) |
@@ -7565,6 +7790,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
7565 | * are still on the list after taking the semaphore | 7790 | * are still on the list after taking the semaphore |
7566 | */ | 7791 | */ |
7567 | list_del_init(&block_group->list); | 7792 | list_del_init(&block_group->list); |
7793 | if (list_empty(&block_group->space_info->block_groups[index])) | ||
7794 | clear_avail_alloc_bits(root->fs_info, block_group->flags); | ||
7568 | up_write(&block_group->space_info->groups_sem); | 7795 | up_write(&block_group->space_info->groups_sem); |
7569 | 7796 | ||
7570 | if (block_group->cached == BTRFS_CACHE_STARTED) | 7797 | if (block_group->cached == BTRFS_CACHE_STARTED) |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 49f3c9dc09f4..9d09a4f81875 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include "ctree.h" | 18 | #include "ctree.h" |
19 | #include "btrfs_inode.h" | 19 | #include "btrfs_inode.h" |
20 | #include "volumes.h" | 20 | #include "volumes.h" |
21 | #include "check-integrity.h" | ||
21 | 22 | ||
22 | static struct kmem_cache *extent_state_cache; | 23 | static struct kmem_cache *extent_state_cache; |
23 | static struct kmem_cache *extent_buffer_cache; | 24 | static struct kmem_cache *extent_buffer_cache; |
@@ -1895,7 +1896,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, | |||
1895 | } | 1896 | } |
1896 | bio->bi_bdev = dev->bdev; | 1897 | bio->bi_bdev = dev->bdev; |
1897 | bio_add_page(bio, page, length, start-page_offset(page)); | 1898 | bio_add_page(bio, page, length, start-page_offset(page)); |
1898 | submit_bio(WRITE_SYNC, bio); | 1899 | btrfsic_submit_bio(WRITE_SYNC, bio); |
1899 | wait_for_completion(&compl); | 1900 | wait_for_completion(&compl); |
1900 | 1901 | ||
1901 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { | 1902 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { |
@@ -2393,7 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, | |||
2393 | ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, | 2394 | ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, |
2394 | mirror_num, bio_flags, start); | 2395 | mirror_num, bio_flags, start); |
2395 | else | 2396 | else |
2396 | submit_bio(rw, bio); | 2397 | btrfsic_submit_bio(rw, bio); |
2397 | 2398 | ||
2398 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | 2399 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) |
2399 | ret = -EOPNOTSUPP; | 2400 | ret = -EOPNOTSUPP; |
@@ -3579,6 +3580,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, | |||
3579 | atomic_set(&eb->blocking_writers, 0); | 3580 | atomic_set(&eb->blocking_writers, 0); |
3580 | atomic_set(&eb->spinning_readers, 0); | 3581 | atomic_set(&eb->spinning_readers, 0); |
3581 | atomic_set(&eb->spinning_writers, 0); | 3582 | atomic_set(&eb->spinning_writers, 0); |
3583 | eb->lock_nested = 0; | ||
3582 | init_waitqueue_head(&eb->write_lock_wq); | 3584 | init_waitqueue_head(&eb->write_lock_wq); |
3583 | init_waitqueue_head(&eb->read_lock_wq); | 3585 | init_waitqueue_head(&eb->read_lock_wq); |
3584 | 3586 | ||
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7604c3001322..bc6a042cb6fc 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h | |||
@@ -129,6 +129,7 @@ struct extent_buffer { | |||
129 | struct list_head leak_list; | 129 | struct list_head leak_list; |
130 | struct rcu_head rcu_head; | 130 | struct rcu_head rcu_head; |
131 | atomic_t refs; | 131 | atomic_t refs; |
132 | pid_t lock_owner; | ||
132 | 133 | ||
133 | /* count of read lock holders on the extent buffer */ | 134 | /* count of read lock holders on the extent buffer */ |
134 | atomic_t write_locks; | 135 | atomic_t write_locks; |
@@ -137,6 +138,7 @@ struct extent_buffer { | |||
137 | atomic_t blocking_readers; | 138 | atomic_t blocking_readers; |
138 | atomic_t spinning_readers; | 139 | atomic_t spinning_readers; |
139 | atomic_t spinning_writers; | 140 | atomic_t spinning_writers; |
141 | int lock_nested; | ||
140 | 142 | ||
141 | /* protects write locks */ | 143 | /* protects write locks */ |
142 | rwlock_t lock; | 144 | rwlock_t lock; |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 034d98503229..859ba2dd8890 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -678,7 +678,7 @@ next_slot: | |||
678 | disk_bytenr, num_bytes, 0, | 678 | disk_bytenr, num_bytes, 0, |
679 | root->root_key.objectid, | 679 | root->root_key.objectid, |
680 | new_key.objectid, | 680 | new_key.objectid, |
681 | start - extent_offset); | 681 | start - extent_offset, 0); |
682 | BUG_ON(ret); | 682 | BUG_ON(ret); |
683 | *hint_byte = disk_bytenr; | 683 | *hint_byte = disk_bytenr; |
684 | } | 684 | } |
@@ -753,7 +753,7 @@ next_slot: | |||
753 | disk_bytenr, num_bytes, 0, | 753 | disk_bytenr, num_bytes, 0, |
754 | root->root_key.objectid, | 754 | root->root_key.objectid, |
755 | key.objectid, key.offset - | 755 | key.objectid, key.offset - |
756 | extent_offset); | 756 | extent_offset, 0); |
757 | BUG_ON(ret); | 757 | BUG_ON(ret); |
758 | inode_sub_bytes(inode, | 758 | inode_sub_bytes(inode, |
759 | extent_end - key.offset); | 759 | extent_end - key.offset); |
@@ -962,7 +962,7 @@ again: | |||
962 | 962 | ||
963 | ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, | 963 | ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, |
964 | root->root_key.objectid, | 964 | root->root_key.objectid, |
965 | ino, orig_offset); | 965 | ino, orig_offset, 0); |
966 | BUG_ON(ret); | 966 | BUG_ON(ret); |
967 | 967 | ||
968 | if (split == start) { | 968 | if (split == start) { |
@@ -989,7 +989,7 @@ again: | |||
989 | del_nr++; | 989 | del_nr++; |
990 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, | 990 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, |
991 | 0, root->root_key.objectid, | 991 | 0, root->root_key.objectid, |
992 | ino, orig_offset); | 992 | ino, orig_offset, 0); |
993 | BUG_ON(ret); | 993 | BUG_ON(ret); |
994 | } | 994 | } |
995 | other_start = 0; | 995 | other_start = 0; |
@@ -1006,7 +1006,7 @@ again: | |||
1006 | del_nr++; | 1006 | del_nr++; |
1007 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, | 1007 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, |
1008 | 0, root->root_key.objectid, | 1008 | 0, root->root_key.objectid, |
1009 | ino, orig_offset); | 1009 | ino, orig_offset, 0); |
1010 | BUG_ON(ret); | 1010 | BUG_ON(ret); |
1011 | } | 1011 | } |
1012 | if (del_nr == 0) { | 1012 | if (del_nr == 0) { |
@@ -1274,7 +1274,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, | |||
1274 | dirty_pages); | 1274 | dirty_pages); |
1275 | if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) | 1275 | if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) |
1276 | btrfs_btree_balance_dirty(root, 1); | 1276 | btrfs_btree_balance_dirty(root, 1); |
1277 | btrfs_throttle(root); | ||
1278 | 1277 | ||
1279 | pos += copied; | 1278 | pos += copied; |
1280 | num_written += copied; | 1279 | num_written += copied; |
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 9a897bf79538..d20ff87ca603 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c | |||
@@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl) | |||
319 | io_ctl_unmap_page(io_ctl); | 319 | io_ctl_unmap_page(io_ctl); |
320 | 320 | ||
321 | for (i = 0; i < io_ctl->num_pages; i++) { | 321 | for (i = 0; i < io_ctl->num_pages; i++) { |
322 | ClearPageChecked(io_ctl->pages[i]); | 322 | if (io_ctl->pages[i]) { |
323 | unlock_page(io_ctl->pages[i]); | 323 | ClearPageChecked(io_ctl->pages[i]); |
324 | page_cache_release(io_ctl->pages[i]); | 324 | unlock_page(io_ctl->pages[i]); |
325 | page_cache_release(io_ctl->pages[i]); | ||
326 | } | ||
325 | } | 327 | } |
326 | } | 328 | } |
327 | 329 | ||
@@ -635,7 +637,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, | |||
635 | if (!num_entries) | 637 | if (!num_entries) |
636 | return 0; | 638 | return 0; |
637 | 639 | ||
638 | io_ctl_init(&io_ctl, inode, root); | 640 | ret = io_ctl_init(&io_ctl, inode, root); |
641 | if (ret) | ||
642 | return ret; | ||
643 | |||
639 | ret = readahead_cache(inode); | 644 | ret = readahead_cache(inode); |
640 | if (ret) | 645 | if (ret) |
641 | goto out; | 646 | goto out; |
@@ -838,7 +843,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, | |||
838 | struct io_ctl io_ctl; | 843 | struct io_ctl io_ctl; |
839 | struct list_head bitmap_list; | 844 | struct list_head bitmap_list; |
840 | struct btrfs_key key; | 845 | struct btrfs_key key; |
841 | u64 start, end, len; | 846 | u64 start, extent_start, extent_end, len; |
842 | int entries = 0; | 847 | int entries = 0; |
843 | int bitmaps = 0; | 848 | int bitmaps = 0; |
844 | int ret; | 849 | int ret; |
@@ -849,7 +854,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, | |||
849 | if (!i_size_read(inode)) | 854 | if (!i_size_read(inode)) |
850 | return -1; | 855 | return -1; |
851 | 856 | ||
852 | io_ctl_init(&io_ctl, inode, root); | 857 | ret = io_ctl_init(&io_ctl, inode, root); |
858 | if (ret) | ||
859 | return -1; | ||
853 | 860 | ||
854 | /* Get the cluster for this block_group if it exists */ | 861 | /* Get the cluster for this block_group if it exists */ |
855 | if (block_group && !list_empty(&block_group->cluster_list)) | 862 | if (block_group && !list_empty(&block_group->cluster_list)) |
@@ -857,25 +864,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, | |||
857 | struct btrfs_free_cluster, | 864 | struct btrfs_free_cluster, |
858 | block_group_list); | 865 | block_group_list); |
859 | 866 | ||
860 | /* | ||
861 | * We shouldn't have switched the pinned extents yet so this is the | ||
862 | * right one | ||
863 | */ | ||
864 | unpin = root->fs_info->pinned_extents; | ||
865 | |||
866 | /* Lock all pages first so we can lock the extent safely. */ | 867 | /* Lock all pages first so we can lock the extent safely. */ |
867 | io_ctl_prepare_pages(&io_ctl, inode, 0); | 868 | io_ctl_prepare_pages(&io_ctl, inode, 0); |
868 | 869 | ||
869 | lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, | 870 | lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, |
870 | 0, &cached_state, GFP_NOFS); | 871 | 0, &cached_state, GFP_NOFS); |
871 | 872 | ||
872 | /* | ||
873 | * When searching for pinned extents, we need to start at our start | ||
874 | * offset. | ||
875 | */ | ||
876 | if (block_group) | ||
877 | start = block_group->key.objectid; | ||
878 | |||
879 | node = rb_first(&ctl->free_space_offset); | 873 | node = rb_first(&ctl->free_space_offset); |
880 | if (!node && cluster) { | 874 | if (!node && cluster) { |
881 | node = rb_first(&cluster->root); | 875 | node = rb_first(&cluster->root); |
@@ -918,9 +912,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, | |||
918 | * We want to add any pinned extents to our free space cache | 912 | * We want to add any pinned extents to our free space cache |
919 | * so we don't leak the space | 913 | * so we don't leak the space |
920 | */ | 914 | */ |
915 | |||
916 | /* | ||
917 | * We shouldn't have switched the pinned extents yet so this is the | ||
918 | * right one | ||
919 | */ | ||
920 | unpin = root->fs_info->pinned_extents; | ||
921 | |||
922 | if (block_group) | ||
923 | start = block_group->key.objectid; | ||
924 | |||
921 | while (block_group && (start < block_group->key.objectid + | 925 | while (block_group && (start < block_group->key.objectid + |
922 | block_group->key.offset)) { | 926 | block_group->key.offset)) { |
923 | ret = find_first_extent_bit(unpin, start, &start, &end, | 927 | ret = find_first_extent_bit(unpin, start, |
928 | &extent_start, &extent_end, | ||
924 | EXTENT_DIRTY); | 929 | EXTENT_DIRTY); |
925 | if (ret) { | 930 | if (ret) { |
926 | ret = 0; | 931 | ret = 0; |
@@ -928,20 +933,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, | |||
928 | } | 933 | } |
929 | 934 | ||
930 | /* This pinned extent is out of our range */ | 935 | /* This pinned extent is out of our range */ |
931 | if (start >= block_group->key.objectid + | 936 | if (extent_start >= block_group->key.objectid + |
932 | block_group->key.offset) | 937 | block_group->key.offset) |
933 | break; | 938 | break; |
934 | 939 | ||
935 | len = block_group->key.objectid + | 940 | extent_start = max(extent_start, start); |
936 | block_group->key.offset - start; | 941 | extent_end = min(block_group->key.objectid + |
937 | len = min(len, end + 1 - start); | 942 | block_group->key.offset, extent_end + 1); |
943 | len = extent_end - extent_start; | ||
938 | 944 | ||
939 | entries++; | 945 | entries++; |
940 | ret = io_ctl_add_entry(&io_ctl, start, len, NULL); | 946 | ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL); |
941 | if (ret) | 947 | if (ret) |
942 | goto out_nospc; | 948 | goto out_nospc; |
943 | 949 | ||
944 | start = end + 1; | 950 | start = extent_end; |
945 | } | 951 | } |
946 | 952 | ||
947 | /* Write out the bitmaps */ | 953 | /* Write out the bitmaps */ |
@@ -2283,23 +2289,23 @@ out: | |||
2283 | static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, | 2289 | static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, |
2284 | struct btrfs_free_space *entry, | 2290 | struct btrfs_free_space *entry, |
2285 | struct btrfs_free_cluster *cluster, | 2291 | struct btrfs_free_cluster *cluster, |
2286 | u64 offset, u64 bytes, u64 min_bytes) | 2292 | u64 offset, u64 bytes, |
2293 | u64 cont1_bytes, u64 min_bytes) | ||
2287 | { | 2294 | { |
2288 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; | 2295 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; |
2289 | unsigned long next_zero; | 2296 | unsigned long next_zero; |
2290 | unsigned long i; | 2297 | unsigned long i; |
2291 | unsigned long search_bits; | 2298 | unsigned long want_bits; |
2292 | unsigned long total_bits; | 2299 | unsigned long min_bits; |
2293 | unsigned long found_bits; | 2300 | unsigned long found_bits; |
2294 | unsigned long start = 0; | 2301 | unsigned long start = 0; |
2295 | unsigned long total_found = 0; | 2302 | unsigned long total_found = 0; |
2296 | int ret; | 2303 | int ret; |
2297 | bool found = false; | ||
2298 | 2304 | ||
2299 | i = offset_to_bit(entry->offset, block_group->sectorsize, | 2305 | i = offset_to_bit(entry->offset, block_group->sectorsize, |
2300 | max_t(u64, offset, entry->offset)); | 2306 | max_t(u64, offset, entry->offset)); |
2301 | search_bits = bytes_to_bits(bytes, block_group->sectorsize); | 2307 | want_bits = bytes_to_bits(bytes, block_group->sectorsize); |
2302 | total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); | 2308 | min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); |
2303 | 2309 | ||
2304 | again: | 2310 | again: |
2305 | found_bits = 0; | 2311 | found_bits = 0; |
@@ -2308,7 +2314,7 @@ again: | |||
2308 | i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { | 2314 | i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { |
2309 | next_zero = find_next_zero_bit(entry->bitmap, | 2315 | next_zero = find_next_zero_bit(entry->bitmap, |
2310 | BITS_PER_BITMAP, i); | 2316 | BITS_PER_BITMAP, i); |
2311 | if (next_zero - i >= search_bits) { | 2317 | if (next_zero - i >= min_bits) { |
2312 | found_bits = next_zero - i; | 2318 | found_bits = next_zero - i; |
2313 | break; | 2319 | break; |
2314 | } | 2320 | } |
@@ -2318,10 +2324,9 @@ again: | |||
2318 | if (!found_bits) | 2324 | if (!found_bits) |
2319 | return -ENOSPC; | 2325 | return -ENOSPC; |
2320 | 2326 | ||
2321 | if (!found) { | 2327 | if (!total_found) { |
2322 | start = i; | 2328 | start = i; |
2323 | cluster->max_size = 0; | 2329 | cluster->max_size = 0; |
2324 | found = true; | ||
2325 | } | 2330 | } |
2326 | 2331 | ||
2327 | total_found += found_bits; | 2332 | total_found += found_bits; |
@@ -2329,13 +2334,8 @@ again: | |||
2329 | if (cluster->max_size < found_bits * block_group->sectorsize) | 2334 | if (cluster->max_size < found_bits * block_group->sectorsize) |
2330 | cluster->max_size = found_bits * block_group->sectorsize; | 2335 | cluster->max_size = found_bits * block_group->sectorsize; |
2331 | 2336 | ||
2332 | if (total_found < total_bits) { | 2337 | if (total_found < want_bits || cluster->max_size < cont1_bytes) { |
2333 | i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); | 2338 | i = next_zero + 1; |
2334 | if (i - start > total_bits * 2) { | ||
2335 | total_found = 0; | ||
2336 | cluster->max_size = 0; | ||
2337 | found = false; | ||
2338 | } | ||
2339 | goto again; | 2339 | goto again; |
2340 | } | 2340 | } |
2341 | 2341 | ||
@@ -2346,28 +2346,31 @@ again: | |||
2346 | &entry->offset_index, 1); | 2346 | &entry->offset_index, 1); |
2347 | BUG_ON(ret); | 2347 | BUG_ON(ret); |
2348 | 2348 | ||
2349 | trace_btrfs_setup_cluster(block_group, cluster, | ||
2350 | total_found * block_group->sectorsize, 1); | ||
2349 | return 0; | 2351 | return 0; |
2350 | } | 2352 | } |
2351 | 2353 | ||
2352 | /* | 2354 | /* |
2353 | * This searches the block group for just extents to fill the cluster with. | 2355 | * This searches the block group for just extents to fill the cluster with. |
2356 | * Try to find a cluster with at least bytes total bytes, at least one | ||
2357 | * extent of cont1_bytes, and other clusters of at least min_bytes. | ||
2354 | */ | 2358 | */ |
2355 | static noinline int | 2359 | static noinline int |
2356 | setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, | 2360 | setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, |
2357 | struct btrfs_free_cluster *cluster, | 2361 | struct btrfs_free_cluster *cluster, |
2358 | struct list_head *bitmaps, u64 offset, u64 bytes, | 2362 | struct list_head *bitmaps, u64 offset, u64 bytes, |
2359 | u64 min_bytes) | 2363 | u64 cont1_bytes, u64 min_bytes) |
2360 | { | 2364 | { |
2361 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; | 2365 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; |
2362 | struct btrfs_free_space *first = NULL; | 2366 | struct btrfs_free_space *first = NULL; |
2363 | struct btrfs_free_space *entry = NULL; | 2367 | struct btrfs_free_space *entry = NULL; |
2364 | struct btrfs_free_space *prev = NULL; | ||
2365 | struct btrfs_free_space *last; | 2368 | struct btrfs_free_space *last; |
2366 | struct rb_node *node; | 2369 | struct rb_node *node; |
2367 | u64 window_start; | 2370 | u64 window_start; |
2368 | u64 window_free; | 2371 | u64 window_free; |
2369 | u64 max_extent; | 2372 | u64 max_extent; |
2370 | u64 max_gap = 128 * 1024; | 2373 | u64 total_size = 0; |
2371 | 2374 | ||
2372 | entry = tree_search_offset(ctl, offset, 0, 1); | 2375 | entry = tree_search_offset(ctl, offset, 0, 1); |
2373 | if (!entry) | 2376 | if (!entry) |
@@ -2377,8 +2380,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, | |||
2377 | * We don't want bitmaps, so just move along until we find a normal | 2380 | * We don't want bitmaps, so just move along until we find a normal |
2378 | * extent entry. | 2381 | * extent entry. |
2379 | */ | 2382 | */ |
2380 | while (entry->bitmap) { | 2383 | while (entry->bitmap || entry->bytes < min_bytes) { |
2381 | if (list_empty(&entry->list)) | 2384 | if (entry->bitmap && list_empty(&entry->list)) |
2382 | list_add_tail(&entry->list, bitmaps); | 2385 | list_add_tail(&entry->list, bitmaps); |
2383 | node = rb_next(&entry->offset_index); | 2386 | node = rb_next(&entry->offset_index); |
2384 | if (!node) | 2387 | if (!node) |
@@ -2391,12 +2394,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, | |||
2391 | max_extent = entry->bytes; | 2394 | max_extent = entry->bytes; |
2392 | first = entry; | 2395 | first = entry; |
2393 | last = entry; | 2396 | last = entry; |
2394 | prev = entry; | ||
2395 | 2397 | ||
2396 | while (window_free <= min_bytes) { | 2398 | for (node = rb_next(&entry->offset_index); node; |
2397 | node = rb_next(&entry->offset_index); | 2399 | node = rb_next(&entry->offset_index)) { |
2398 | if (!node) | ||
2399 | return -ENOSPC; | ||
2400 | entry = rb_entry(node, struct btrfs_free_space, offset_index); | 2400 | entry = rb_entry(node, struct btrfs_free_space, offset_index); |
2401 | 2401 | ||
2402 | if (entry->bitmap) { | 2402 | if (entry->bitmap) { |
@@ -2405,26 +2405,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, | |||
2405 | continue; | 2405 | continue; |
2406 | } | 2406 | } |
2407 | 2407 | ||
2408 | /* | 2408 | if (entry->bytes < min_bytes) |
2409 | * we haven't filled the empty size and the window is | 2409 | continue; |
2410 | * very large. reset and try again | 2410 | |
2411 | */ | 2411 | last = entry; |
2412 | if (entry->offset - (prev->offset + prev->bytes) > max_gap || | 2412 | window_free += entry->bytes; |
2413 | entry->offset - window_start > (min_bytes * 2)) { | 2413 | if (entry->bytes > max_extent) |
2414 | first = entry; | ||
2415 | window_start = entry->offset; | ||
2416 | window_free = entry->bytes; | ||
2417 | last = entry; | ||
2418 | max_extent = entry->bytes; | 2414 | max_extent = entry->bytes; |
2419 | } else { | ||
2420 | last = entry; | ||
2421 | window_free += entry->bytes; | ||
2422 | if (entry->bytes > max_extent) | ||
2423 | max_extent = entry->bytes; | ||
2424 | } | ||
2425 | prev = entry; | ||
2426 | } | 2415 | } |
2427 | 2416 | ||
2417 | if (window_free < bytes || max_extent < cont1_bytes) | ||
2418 | return -ENOSPC; | ||
2419 | |||
2428 | cluster->window_start = first->offset; | 2420 | cluster->window_start = first->offset; |
2429 | 2421 | ||
2430 | node = &first->offset_index; | 2422 | node = &first->offset_index; |
@@ -2438,17 +2430,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, | |||
2438 | 2430 | ||
2439 | entry = rb_entry(node, struct btrfs_free_space, offset_index); | 2431 | entry = rb_entry(node, struct btrfs_free_space, offset_index); |
2440 | node = rb_next(&entry->offset_index); | 2432 | node = rb_next(&entry->offset_index); |
2441 | if (entry->bitmap) | 2433 | if (entry->bitmap || entry->bytes < min_bytes) |
2442 | continue; | 2434 | continue; |
2443 | 2435 | ||
2444 | rb_erase(&entry->offset_index, &ctl->free_space_offset); | 2436 | rb_erase(&entry->offset_index, &ctl->free_space_offset); |
2445 | ret = tree_insert_offset(&cluster->root, entry->offset, | 2437 | ret = tree_insert_offset(&cluster->root, entry->offset, |
2446 | &entry->offset_index, 0); | 2438 | &entry->offset_index, 0); |
2439 | total_size += entry->bytes; | ||
2447 | BUG_ON(ret); | 2440 | BUG_ON(ret); |
2448 | } while (node && entry != last); | 2441 | } while (node && entry != last); |
2449 | 2442 | ||
2450 | cluster->max_size = max_extent; | 2443 | cluster->max_size = max_extent; |
2451 | 2444 | trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); | |
2452 | return 0; | 2445 | return 0; |
2453 | } | 2446 | } |
2454 | 2447 | ||
@@ -2460,7 +2453,7 @@ static noinline int | |||
2460 | setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, | 2453 | setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, |
2461 | struct btrfs_free_cluster *cluster, | 2454 | struct btrfs_free_cluster *cluster, |
2462 | struct list_head *bitmaps, u64 offset, u64 bytes, | 2455 | struct list_head *bitmaps, u64 offset, u64 bytes, |
2463 | u64 min_bytes) | 2456 | u64 cont1_bytes, u64 min_bytes) |
2464 | { | 2457 | { |
2465 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; | 2458 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; |
2466 | struct btrfs_free_space *entry; | 2459 | struct btrfs_free_space *entry; |
@@ -2485,7 +2478,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, | |||
2485 | if (entry->bytes < min_bytes) | 2478 | if (entry->bytes < min_bytes) |
2486 | continue; | 2479 | continue; |
2487 | ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, | 2480 | ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, |
2488 | bytes, min_bytes); | 2481 | bytes, cont1_bytes, min_bytes); |
2489 | if (!ret) | 2482 | if (!ret) |
2490 | return 0; | 2483 | return 0; |
2491 | } | 2484 | } |
@@ -2499,7 +2492,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, | |||
2499 | 2492 | ||
2500 | /* | 2493 | /* |
2501 | * here we try to find a cluster of blocks in a block group. The goal | 2494 | * here we try to find a cluster of blocks in a block group. The goal |
2502 | * is to find at least bytes free and up to empty_size + bytes free. | 2495 | * is to find at least bytes+empty_size. |
2503 | * We might not find them all in one contiguous area. | 2496 | * We might not find them all in one contiguous area. |
2504 | * | 2497 | * |
2505 | * returns zero and sets up cluster if things worked out, otherwise | 2498 | * returns zero and sets up cluster if things worked out, otherwise |
@@ -2515,23 +2508,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, | |||
2515 | struct btrfs_free_space *entry, *tmp; | 2508 | struct btrfs_free_space *entry, *tmp; |
2516 | LIST_HEAD(bitmaps); | 2509 | LIST_HEAD(bitmaps); |
2517 | u64 min_bytes; | 2510 | u64 min_bytes; |
2511 | u64 cont1_bytes; | ||
2518 | int ret; | 2512 | int ret; |
2519 | 2513 | ||
2520 | /* for metadata, allow allocates with more holes */ | 2514 | /* |
2515 | * Choose the minimum extent size we'll require for this | ||
2516 | * cluster. For SSD_SPREAD, don't allow any fragmentation. | ||
2517 | * For metadata, allow allocates with smaller extents. For | ||
2518 | * data, keep it dense. | ||
2519 | */ | ||
2521 | if (btrfs_test_opt(root, SSD_SPREAD)) { | 2520 | if (btrfs_test_opt(root, SSD_SPREAD)) { |
2522 | min_bytes = bytes + empty_size; | 2521 | cont1_bytes = min_bytes = bytes + empty_size; |
2523 | } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { | 2522 | } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { |
2524 | /* | 2523 | cont1_bytes = bytes; |
2525 | * we want to do larger allocations when we are | 2524 | min_bytes = block_group->sectorsize; |
2526 | * flushing out the delayed refs, it helps prevent | 2525 | } else { |
2527 | * making more work as we go along. | 2526 | cont1_bytes = max(bytes, (bytes + empty_size) >> 2); |
2528 | */ | 2527 | min_bytes = block_group->sectorsize; |
2529 | if (trans->transaction->delayed_refs.flushing) | 2528 | } |
2530 | min_bytes = max(bytes, (bytes + empty_size) >> 1); | ||
2531 | else | ||
2532 | min_bytes = max(bytes, (bytes + empty_size) >> 4); | ||
2533 | } else | ||
2534 | min_bytes = max(bytes, (bytes + empty_size) >> 2); | ||
2535 | 2529 | ||
2536 | spin_lock(&ctl->tree_lock); | 2530 | spin_lock(&ctl->tree_lock); |
2537 | 2531 | ||
@@ -2539,7 +2533,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, | |||
2539 | * If we know we don't have enough space to make a cluster don't even | 2533 | * If we know we don't have enough space to make a cluster don't even |
2540 | * bother doing all the work to try and find one. | 2534 | * bother doing all the work to try and find one. |
2541 | */ | 2535 | */ |
2542 | if (ctl->free_space < min_bytes) { | 2536 | if (ctl->free_space < bytes) { |
2543 | spin_unlock(&ctl->tree_lock); | 2537 | spin_unlock(&ctl->tree_lock); |
2544 | return -ENOSPC; | 2538 | return -ENOSPC; |
2545 | } | 2539 | } |
@@ -2552,11 +2546,17 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, | |||
2552 | goto out; | 2546 | goto out; |
2553 | } | 2547 | } |
2554 | 2548 | ||
2549 | trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, | ||
2550 | min_bytes); | ||
2551 | |||
2552 | INIT_LIST_HEAD(&bitmaps); | ||
2555 | ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, | 2553 | ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, |
2556 | bytes, min_bytes); | 2554 | bytes + empty_size, |
2555 | cont1_bytes, min_bytes); | ||
2557 | if (ret) | 2556 | if (ret) |
2558 | ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, | 2557 | ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, |
2559 | offset, bytes, min_bytes); | 2558 | offset, bytes + empty_size, |
2559 | cont1_bytes, min_bytes); | ||
2560 | 2560 | ||
2561 | /* Clear our temporary list */ | 2561 | /* Clear our temporary list */ |
2562 | list_for_each_entry_safe(entry, tmp, &bitmaps, list) | 2562 | list_for_each_entry_safe(entry, tmp, &bitmaps, list) |
@@ -2567,6 +2567,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, | |||
2567 | list_add_tail(&cluster->block_group_list, | 2567 | list_add_tail(&cluster->block_group_list, |
2568 | &block_group->cluster_list); | 2568 | &block_group->cluster_list); |
2569 | cluster->block_group = block_group; | 2569 | cluster->block_group = block_group; |
2570 | } else { | ||
2571 | trace_btrfs_failed_cluster_setup(block_group); | ||
2570 | } | 2572 | } |
2571 | out: | 2573 | out: |
2572 | spin_unlock(&cluster->lock); | 2574 | spin_unlock(&cluster->lock); |
@@ -2588,17 +2590,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) | |||
2588 | cluster->block_group = NULL; | 2590 | cluster->block_group = NULL; |
2589 | } | 2591 | } |
2590 | 2592 | ||
2591 | int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, | 2593 | static int do_trimming(struct btrfs_block_group_cache *block_group, |
2592 | u64 *trimmed, u64 start, u64 end, u64 minlen) | 2594 | u64 *total_trimmed, u64 start, u64 bytes, |
2595 | u64 reserved_start, u64 reserved_bytes) | ||
2593 | { | 2596 | { |
2594 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; | 2597 | struct btrfs_space_info *space_info = block_group->space_info; |
2595 | struct btrfs_free_space *entry = NULL; | ||
2596 | struct btrfs_fs_info *fs_info = block_group->fs_info; | 2598 | struct btrfs_fs_info *fs_info = block_group->fs_info; |
2597 | u64 bytes = 0; | 2599 | int ret; |
2598 | u64 actually_trimmed; | 2600 | int update = 0; |
2599 | int ret = 0; | 2601 | u64 trimmed = 0; |
2600 | 2602 | ||
2601 | *trimmed = 0; | 2603 | spin_lock(&space_info->lock); |
2604 | spin_lock(&block_group->lock); | ||
2605 | if (!block_group->ro) { | ||
2606 | block_group->reserved += reserved_bytes; | ||
2607 | space_info->bytes_reserved += reserved_bytes; | ||
2608 | update = 1; | ||
2609 | } | ||
2610 | spin_unlock(&block_group->lock); | ||
2611 | spin_unlock(&space_info->lock); | ||
2612 | |||
2613 | ret = btrfs_error_discard_extent(fs_info->extent_root, | ||
2614 | start, bytes, &trimmed); | ||
2615 | if (!ret) | ||
2616 | *total_trimmed += trimmed; | ||
2617 | |||
2618 | btrfs_add_free_space(block_group, reserved_start, reserved_bytes); | ||
2619 | |||
2620 | if (update) { | ||
2621 | spin_lock(&space_info->lock); | ||
2622 | spin_lock(&block_group->lock); | ||
2623 | if (block_group->ro) | ||
2624 | space_info->bytes_readonly += reserved_bytes; | ||
2625 | block_group->reserved -= reserved_bytes; | ||
2626 | space_info->bytes_reserved -= reserved_bytes; | ||
2627 | spin_unlock(&space_info->lock); | ||
2628 | spin_unlock(&block_group->lock); | ||
2629 | } | ||
2630 | |||
2631 | return ret; | ||
2632 | } | ||
2633 | |||
2634 | static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, | ||
2635 | u64 *total_trimmed, u64 start, u64 end, u64 minlen) | ||
2636 | { | ||
2637 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; | ||
2638 | struct btrfs_free_space *entry; | ||
2639 | struct rb_node *node; | ||
2640 | int ret = 0; | ||
2641 | u64 extent_start; | ||
2642 | u64 extent_bytes; | ||
2643 | u64 bytes; | ||
2602 | 2644 | ||
2603 | while (start < end) { | 2645 | while (start < end) { |
2604 | spin_lock(&ctl->tree_lock); | 2646 | spin_lock(&ctl->tree_lock); |
@@ -2609,81 +2651,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, | |||
2609 | } | 2651 | } |
2610 | 2652 | ||
2611 | entry = tree_search_offset(ctl, start, 0, 1); | 2653 | entry = tree_search_offset(ctl, start, 0, 1); |
2612 | if (!entry) | 2654 | if (!entry) { |
2613 | entry = tree_search_offset(ctl, | ||
2614 | offset_to_bitmap(ctl, start), | ||
2615 | 1, 1); | ||
2616 | |||
2617 | if (!entry || entry->offset >= end) { | ||
2618 | spin_unlock(&ctl->tree_lock); | 2655 | spin_unlock(&ctl->tree_lock); |
2619 | break; | 2656 | break; |
2620 | } | 2657 | } |
2621 | 2658 | ||
2622 | if (entry->bitmap) { | 2659 | /* skip bitmaps */ |
2623 | ret = search_bitmap(ctl, entry, &start, &bytes); | 2660 | while (entry->bitmap) { |
2624 | if (!ret) { | 2661 | node = rb_next(&entry->offset_index); |
2625 | if (start >= end) { | 2662 | if (!node) { |
2626 | spin_unlock(&ctl->tree_lock); | ||
2627 | break; | ||
2628 | } | ||
2629 | bytes = min(bytes, end - start); | ||
2630 | bitmap_clear_bits(ctl, entry, start, bytes); | ||
2631 | if (entry->bytes == 0) | ||
2632 | free_bitmap(ctl, entry); | ||
2633 | } else { | ||
2634 | start = entry->offset + BITS_PER_BITMAP * | ||
2635 | block_group->sectorsize; | ||
2636 | spin_unlock(&ctl->tree_lock); | 2663 | spin_unlock(&ctl->tree_lock); |
2637 | ret = 0; | 2664 | goto out; |
2638 | continue; | ||
2639 | } | 2665 | } |
2640 | } else { | 2666 | entry = rb_entry(node, struct btrfs_free_space, |
2641 | start = entry->offset; | 2667 | offset_index); |
2642 | bytes = min(entry->bytes, end - start); | ||
2643 | unlink_free_space(ctl, entry); | ||
2644 | kmem_cache_free(btrfs_free_space_cachep, entry); | ||
2645 | } | 2668 | } |
2646 | 2669 | ||
2670 | if (entry->offset >= end) { | ||
2671 | spin_unlock(&ctl->tree_lock); | ||
2672 | break; | ||
2673 | } | ||
2674 | |||
2675 | extent_start = entry->offset; | ||
2676 | extent_bytes = entry->bytes; | ||
2677 | start = max(start, extent_start); | ||
2678 | bytes = min(extent_start + extent_bytes, end) - start; | ||
2679 | if (bytes < minlen) { | ||
2680 | spin_unlock(&ctl->tree_lock); | ||
2681 | goto next; | ||
2682 | } | ||
2683 | |||
2684 | unlink_free_space(ctl, entry); | ||
2685 | kmem_cache_free(btrfs_free_space_cachep, entry); | ||
2686 | |||
2647 | spin_unlock(&ctl->tree_lock); | 2687 | spin_unlock(&ctl->tree_lock); |
2648 | 2688 | ||
2649 | if (bytes >= minlen) { | 2689 | ret = do_trimming(block_group, total_trimmed, start, bytes, |
2650 | struct btrfs_space_info *space_info; | 2690 | extent_start, extent_bytes); |
2651 | int update = 0; | 2691 | if (ret) |
2652 | 2692 | break; | |
2653 | space_info = block_group->space_info; | 2693 | next: |
2654 | spin_lock(&space_info->lock); | 2694 | start += bytes; |
2655 | spin_lock(&block_group->lock); | ||
2656 | if (!block_group->ro) { | ||
2657 | block_group->reserved += bytes; | ||
2658 | space_info->bytes_reserved += bytes; | ||
2659 | update = 1; | ||
2660 | } | ||
2661 | spin_unlock(&block_group->lock); | ||
2662 | spin_unlock(&space_info->lock); | ||
2663 | |||
2664 | ret = btrfs_error_discard_extent(fs_info->extent_root, | ||
2665 | start, | ||
2666 | bytes, | ||
2667 | &actually_trimmed); | ||
2668 | |||
2669 | btrfs_add_free_space(block_group, start, bytes); | ||
2670 | if (update) { | ||
2671 | spin_lock(&space_info->lock); | ||
2672 | spin_lock(&block_group->lock); | ||
2673 | if (block_group->ro) | ||
2674 | space_info->bytes_readonly += bytes; | ||
2675 | block_group->reserved -= bytes; | ||
2676 | space_info->bytes_reserved -= bytes; | ||
2677 | spin_unlock(&space_info->lock); | ||
2678 | spin_unlock(&block_group->lock); | ||
2679 | } | ||
2680 | 2695 | ||
2681 | if (ret) | 2696 | if (fatal_signal_pending(current)) { |
2682 | break; | 2697 | ret = -ERESTARTSYS; |
2683 | *trimmed += actually_trimmed; | 2698 | break; |
2699 | } | ||
2700 | |||
2701 | cond_resched(); | ||
2702 | } | ||
2703 | out: | ||
2704 | return ret; | ||
2705 | } | ||
2706 | |||
2707 | static int trim_bitmaps(struct btrfs_block_group_cache *block_group, | ||
2708 | u64 *total_trimmed, u64 start, u64 end, u64 minlen) | ||
2709 | { | ||
2710 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; | ||
2711 | struct btrfs_free_space *entry; | ||
2712 | int ret = 0; | ||
2713 | int ret2; | ||
2714 | u64 bytes; | ||
2715 | u64 offset = offset_to_bitmap(ctl, start); | ||
2716 | |||
2717 | while (offset < end) { | ||
2718 | bool next_bitmap = false; | ||
2719 | |||
2720 | spin_lock(&ctl->tree_lock); | ||
2721 | |||
2722 | if (ctl->free_space < minlen) { | ||
2723 | spin_unlock(&ctl->tree_lock); | ||
2724 | break; | ||
2725 | } | ||
2726 | |||
2727 | entry = tree_search_offset(ctl, offset, 1, 0); | ||
2728 | if (!entry) { | ||
2729 | spin_unlock(&ctl->tree_lock); | ||
2730 | next_bitmap = true; | ||
2731 | goto next; | ||
2732 | } | ||
2733 | |||
2734 | bytes = minlen; | ||
2735 | ret2 = search_bitmap(ctl, entry, &start, &bytes); | ||
2736 | if (ret2 || start >= end) { | ||
2737 | spin_unlock(&ctl->tree_lock); | ||
2738 | next_bitmap = true; | ||
2739 | goto next; | ||
2740 | } | ||
2741 | |||
2742 | bytes = min(bytes, end - start); | ||
2743 | if (bytes < minlen) { | ||
2744 | spin_unlock(&ctl->tree_lock); | ||
2745 | goto next; | ||
2746 | } | ||
2747 | |||
2748 | bitmap_clear_bits(ctl, entry, start, bytes); | ||
2749 | if (entry->bytes == 0) | ||
2750 | free_bitmap(ctl, entry); | ||
2751 | |||
2752 | spin_unlock(&ctl->tree_lock); | ||
2753 | |||
2754 | ret = do_trimming(block_group, total_trimmed, start, bytes, | ||
2755 | start, bytes); | ||
2756 | if (ret) | ||
2757 | break; | ||
2758 | next: | ||
2759 | if (next_bitmap) { | ||
2760 | offset += BITS_PER_BITMAP * ctl->unit; | ||
2761 | } else { | ||
2762 | start += bytes; | ||
2763 | if (start >= offset + BITS_PER_BITMAP * ctl->unit) | ||
2764 | offset += BITS_PER_BITMAP * ctl->unit; | ||
2684 | } | 2765 | } |
2685 | start += bytes; | ||
2686 | bytes = 0; | ||
2687 | 2766 | ||
2688 | if (fatal_signal_pending(current)) { | 2767 | if (fatal_signal_pending(current)) { |
2689 | ret = -ERESTARTSYS; | 2768 | ret = -ERESTARTSYS; |
@@ -2696,6 +2775,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, | |||
2696 | return ret; | 2775 | return ret; |
2697 | } | 2776 | } |
2698 | 2777 | ||
2778 | int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, | ||
2779 | u64 *trimmed, u64 start, u64 end, u64 minlen) | ||
2780 | { | ||
2781 | int ret; | ||
2782 | |||
2783 | *trimmed = 0; | ||
2784 | |||
2785 | ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); | ||
2786 | if (ret) | ||
2787 | return ret; | ||
2788 | |||
2789 | ret = trim_bitmaps(block_group, trimmed, start, end, minlen); | ||
2790 | |||
2791 | return ret; | ||
2792 | } | ||
2793 | |||
2699 | /* | 2794 | /* |
2700 | * Find the left-most item in the cache tree, and then return the | 2795 | * Find the left-most item in the cache tree, and then return the |
2701 | * smallest inode number in the item. | 2796 | * smallest inode number in the item. |
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index f8962a957d65..213ffa86ce1b 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c | |||
@@ -438,6 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root, | |||
438 | trans->bytes_reserved); | 438 | trans->bytes_reserved); |
439 | if (ret) | 439 | if (ret) |
440 | goto out; | 440 | goto out; |
441 | trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, | ||
442 | trans->bytes_reserved, 1); | ||
441 | again: | 443 | again: |
442 | inode = lookup_free_ino_inode(root, path); | 444 | inode = lookup_free_ino_inode(root, path); |
443 | if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { | 445 | if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { |
@@ -498,6 +500,8 @@ again: | |||
498 | out_put: | 500 | out_put: |
499 | iput(inode); | 501 | iput(inode); |
500 | out_release: | 502 | out_release: |
503 | trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, | ||
504 | trans->bytes_reserved, 0); | ||
501 | btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); | 505 | btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); |
502 | out: | 506 | out: |
503 | trans->block_rsv = rsv; | 507 | trans->block_rsv = rsv; |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 81b235a61f8c..0da19a0ea00d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -1951,12 +1951,28 @@ enum btrfs_orphan_cleanup_state { | |||
1951 | void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, | 1951 | void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, |
1952 | struct btrfs_root *root) | 1952 | struct btrfs_root *root) |
1953 | { | 1953 | { |
1954 | struct btrfs_block_rsv *block_rsv; | ||
1954 | int ret; | 1955 | int ret; |
1955 | 1956 | ||
1956 | if (!list_empty(&root->orphan_list) || | 1957 | if (!list_empty(&root->orphan_list) || |
1957 | root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) | 1958 | root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) |
1958 | return; | 1959 | return; |
1959 | 1960 | ||
1961 | spin_lock(&root->orphan_lock); | ||
1962 | if (!list_empty(&root->orphan_list)) { | ||
1963 | spin_unlock(&root->orphan_lock); | ||
1964 | return; | ||
1965 | } | ||
1966 | |||
1967 | if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { | ||
1968 | spin_unlock(&root->orphan_lock); | ||
1969 | return; | ||
1970 | } | ||
1971 | |||
1972 | block_rsv = root->orphan_block_rsv; | ||
1973 | root->orphan_block_rsv = NULL; | ||
1974 | spin_unlock(&root->orphan_lock); | ||
1975 | |||
1960 | if (root->orphan_item_inserted && | 1976 | if (root->orphan_item_inserted && |
1961 | btrfs_root_refs(&root->root_item) > 0) { | 1977 | btrfs_root_refs(&root->root_item) > 0) { |
1962 | ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, | 1978 | ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, |
@@ -1965,10 +1981,9 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, | |||
1965 | root->orphan_item_inserted = 0; | 1981 | root->orphan_item_inserted = 0; |
1966 | } | 1982 | } |
1967 | 1983 | ||
1968 | if (root->orphan_block_rsv) { | 1984 | if (block_rsv) { |
1969 | WARN_ON(root->orphan_block_rsv->size > 0); | 1985 | WARN_ON(block_rsv->size > 0); |
1970 | btrfs_free_block_rsv(root, root->orphan_block_rsv); | 1986 | btrfs_free_block_rsv(root, block_rsv); |
1971 | root->orphan_block_rsv = NULL; | ||
1972 | } | 1987 | } |
1973 | } | 1988 | } |
1974 | 1989 | ||
@@ -2224,14 +2239,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) | |||
2224 | continue; | 2239 | continue; |
2225 | } | 2240 | } |
2226 | nr_truncate++; | 2241 | nr_truncate++; |
2227 | /* | ||
2228 | * Need to hold the imutex for reservation purposes, not | ||
2229 | * a huge deal here but I have a WARN_ON in | ||
2230 | * btrfs_delalloc_reserve_space to catch offenders. | ||
2231 | */ | ||
2232 | mutex_lock(&inode->i_mutex); | ||
2233 | ret = btrfs_truncate(inode); | 2242 | ret = btrfs_truncate(inode); |
2234 | mutex_unlock(&inode->i_mutex); | ||
2235 | } else { | 2243 | } else { |
2236 | nr_unlink++; | 2244 | nr_unlink++; |
2237 | } | 2245 | } |
@@ -2845,7 +2853,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans, | |||
2845 | BUG_ON(!root->fs_info->enospc_unlink); | 2853 | BUG_ON(!root->fs_info->enospc_unlink); |
2846 | root->fs_info->enospc_unlink = 0; | 2854 | root->fs_info->enospc_unlink = 0; |
2847 | } | 2855 | } |
2848 | btrfs_end_transaction_throttle(trans, root); | 2856 | btrfs_end_transaction(trans, root); |
2849 | } | 2857 | } |
2850 | 2858 | ||
2851 | static int btrfs_unlink(struct inode *dir, struct dentry *dentry) | 2859 | static int btrfs_unlink(struct inode *dir, struct dentry *dentry) |
@@ -3009,7 +3017,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | |||
3009 | int pending_del_nr = 0; | 3017 | int pending_del_nr = 0; |
3010 | int pending_del_slot = 0; | 3018 | int pending_del_slot = 0; |
3011 | int extent_type = -1; | 3019 | int extent_type = -1; |
3012 | int encoding; | ||
3013 | int ret; | 3020 | int ret; |
3014 | int err = 0; | 3021 | int err = 0; |
3015 | u64 ino = btrfs_ino(inode); | 3022 | u64 ino = btrfs_ino(inode); |
@@ -3059,7 +3066,6 @@ search_again: | |||
3059 | leaf = path->nodes[0]; | 3066 | leaf = path->nodes[0]; |
3060 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | 3067 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); |
3061 | found_type = btrfs_key_type(&found_key); | 3068 | found_type = btrfs_key_type(&found_key); |
3062 | encoding = 0; | ||
3063 | 3069 | ||
3064 | if (found_key.objectid != ino) | 3070 | if (found_key.objectid != ino) |
3065 | break; | 3071 | break; |
@@ -3072,10 +3078,6 @@ search_again: | |||
3072 | fi = btrfs_item_ptr(leaf, path->slots[0], | 3078 | fi = btrfs_item_ptr(leaf, path->slots[0], |
3073 | struct btrfs_file_extent_item); | 3079 | struct btrfs_file_extent_item); |
3074 | extent_type = btrfs_file_extent_type(leaf, fi); | 3080 | extent_type = btrfs_file_extent_type(leaf, fi); |
3075 | encoding = btrfs_file_extent_compression(leaf, fi); | ||
3076 | encoding |= btrfs_file_extent_encryption(leaf, fi); | ||
3077 | encoding |= btrfs_file_extent_other_encoding(leaf, fi); | ||
3078 | |||
3079 | if (extent_type != BTRFS_FILE_EXTENT_INLINE) { | 3081 | if (extent_type != BTRFS_FILE_EXTENT_INLINE) { |
3080 | item_end += | 3082 | item_end += |
3081 | btrfs_file_extent_num_bytes(leaf, fi); | 3083 | btrfs_file_extent_num_bytes(leaf, fi); |
@@ -3103,7 +3105,7 @@ search_again: | |||
3103 | if (extent_type != BTRFS_FILE_EXTENT_INLINE) { | 3105 | if (extent_type != BTRFS_FILE_EXTENT_INLINE) { |
3104 | u64 num_dec; | 3106 | u64 num_dec; |
3105 | extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); | 3107 | extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); |
3106 | if (!del_item && !encoding) { | 3108 | if (!del_item) { |
3107 | u64 orig_num_bytes = | 3109 | u64 orig_num_bytes = |
3108 | btrfs_file_extent_num_bytes(leaf, fi); | 3110 | btrfs_file_extent_num_bytes(leaf, fi); |
3109 | extent_num_bytes = new_size - | 3111 | extent_num_bytes = new_size - |
@@ -3179,7 +3181,7 @@ delete: | |||
3179 | ret = btrfs_free_extent(trans, root, extent_start, | 3181 | ret = btrfs_free_extent(trans, root, extent_start, |
3180 | extent_num_bytes, 0, | 3182 | extent_num_bytes, 0, |
3181 | btrfs_header_owner(leaf), | 3183 | btrfs_header_owner(leaf), |
3182 | ino, extent_offset); | 3184 | ino, extent_offset, 0); |
3183 | BUG_ON(ret); | 3185 | BUG_ON(ret); |
3184 | } | 3186 | } |
3185 | 3187 | ||
@@ -3434,7 +3436,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) | |||
3434 | i_size_write(inode, newsize); | 3436 | i_size_write(inode, newsize); |
3435 | btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); | 3437 | btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); |
3436 | ret = btrfs_update_inode(trans, root, inode); | 3438 | ret = btrfs_update_inode(trans, root, inode); |
3437 | btrfs_end_transaction_throttle(trans, root); | 3439 | btrfs_end_transaction(trans, root); |
3438 | } else { | 3440 | } else { |
3439 | 3441 | ||
3440 | /* | 3442 | /* |
@@ -4655,7 +4657,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, | |||
4655 | } | 4657 | } |
4656 | out_unlock: | 4658 | out_unlock: |
4657 | nr = trans->blocks_used; | 4659 | nr = trans->blocks_used; |
4658 | btrfs_end_transaction_throttle(trans, root); | 4660 | btrfs_end_transaction(trans, root); |
4659 | btrfs_btree_balance_dirty(root, nr); | 4661 | btrfs_btree_balance_dirty(root, nr); |
4660 | if (drop_inode) { | 4662 | if (drop_inode) { |
4661 | inode_dec_link_count(inode); | 4663 | inode_dec_link_count(inode); |
@@ -4723,7 +4725,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, | |||
4723 | } | 4725 | } |
4724 | out_unlock: | 4726 | out_unlock: |
4725 | nr = trans->blocks_used; | 4727 | nr = trans->blocks_used; |
4726 | btrfs_end_transaction_throttle(trans, root); | 4728 | btrfs_end_transaction(trans, root); |
4727 | if (drop_inode) { | 4729 | if (drop_inode) { |
4728 | inode_dec_link_count(inode); | 4730 | inode_dec_link_count(inode); |
4729 | iput(inode); | 4731 | iput(inode); |
@@ -4782,7 +4784,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, | |||
4782 | } | 4784 | } |
4783 | 4785 | ||
4784 | nr = trans->blocks_used; | 4786 | nr = trans->blocks_used; |
4785 | btrfs_end_transaction_throttle(trans, root); | 4787 | btrfs_end_transaction(trans, root); |
4786 | fail: | 4788 | fail: |
4787 | if (drop_inode) { | 4789 | if (drop_inode) { |
4788 | inode_dec_link_count(inode); | 4790 | inode_dec_link_count(inode); |
@@ -4848,7 +4850,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
4848 | 4850 | ||
4849 | out_fail: | 4851 | out_fail: |
4850 | nr = trans->blocks_used; | 4852 | nr = trans->blocks_used; |
4851 | btrfs_end_transaction_throttle(trans, root); | 4853 | btrfs_end_transaction(trans, root); |
4852 | if (drop_on_err) | 4854 | if (drop_on_err) |
4853 | iput(inode); | 4855 | iput(inode); |
4854 | btrfs_btree_balance_dirty(root, nr); | 4856 | btrfs_btree_balance_dirty(root, nr); |
@@ -5121,7 +5123,7 @@ again: | |||
5121 | } | 5123 | } |
5122 | flush_dcache_page(page); | 5124 | flush_dcache_page(page); |
5123 | } else if (create && PageUptodate(page)) { | 5125 | } else if (create && PageUptodate(page)) { |
5124 | WARN_ON(1); | 5126 | BUG(); |
5125 | if (!trans) { | 5127 | if (!trans) { |
5126 | kunmap(page); | 5128 | kunmap(page); |
5127 | free_extent_map(em); | 5129 | free_extent_map(em); |
@@ -6402,10 +6404,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
6402 | u64 page_start; | 6404 | u64 page_start; |
6403 | u64 page_end; | 6405 | u64 page_end; |
6404 | 6406 | ||
6405 | /* Need this to keep space reservations serialized */ | ||
6406 | mutex_lock(&inode->i_mutex); | ||
6407 | ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); | 6407 | ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); |
6408 | mutex_unlock(&inode->i_mutex); | ||
6409 | if (!ret) | 6408 | if (!ret) |
6410 | ret = btrfs_update_time(vma->vm_file); | 6409 | ret = btrfs_update_time(vma->vm_file); |
6411 | if (ret) { | 6410 | if (ret) { |
@@ -6494,8 +6493,8 @@ out_unlock: | |||
6494 | if (!ret) | 6493 | if (!ret) |
6495 | return VM_FAULT_LOCKED; | 6494 | return VM_FAULT_LOCKED; |
6496 | unlock_page(page); | 6495 | unlock_page(page); |
6497 | btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); | ||
6498 | out: | 6496 | out: |
6497 | btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); | ||
6499 | return ret; | 6498 | return ret; |
6500 | } | 6499 | } |
6501 | 6500 | ||
@@ -6668,7 +6667,7 @@ end_trans: | |||
6668 | err = ret; | 6667 | err = ret; |
6669 | 6668 | ||
6670 | nr = trans->blocks_used; | 6669 | nr = trans->blocks_used; |
6671 | ret = btrfs_end_transaction_throttle(trans, root); | 6670 | ret = btrfs_end_transaction(trans, root); |
6672 | btrfs_btree_balance_dirty(root, nr); | 6671 | btrfs_btree_balance_dirty(root, nr); |
6673 | } | 6672 | } |
6674 | 6673 | ||
@@ -6749,6 +6748,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) | |||
6749 | extent_io_tree_init(&ei->io_tree, &inode->i_data); | 6748 | extent_io_tree_init(&ei->io_tree, &inode->i_data); |
6750 | extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); | 6749 | extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); |
6751 | mutex_init(&ei->log_mutex); | 6750 | mutex_init(&ei->log_mutex); |
6751 | mutex_init(&ei->delalloc_mutex); | ||
6752 | btrfs_ordered_inode_tree_init(&ei->ordered_tree); | 6752 | btrfs_ordered_inode_tree_init(&ei->ordered_tree); |
6753 | INIT_LIST_HEAD(&ei->i_orphan); | 6753 | INIT_LIST_HEAD(&ei->i_orphan); |
6754 | INIT_LIST_HEAD(&ei->delalloc_inodes); | 6754 | INIT_LIST_HEAD(&ei->delalloc_inodes); |
@@ -7074,7 +7074,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
7074 | btrfs_end_log_trans(root); | 7074 | btrfs_end_log_trans(root); |
7075 | } | 7075 | } |
7076 | out_fail: | 7076 | out_fail: |
7077 | btrfs_end_transaction_throttle(trans, root); | 7077 | btrfs_end_transaction(trans, root); |
7078 | out_notrans: | 7078 | out_notrans: |
7079 | if (old_ino == BTRFS_FIRST_FREE_OBJECTID) | 7079 | if (old_ino == BTRFS_FIRST_FREE_OBJECTID) |
7080 | up_read(&root->fs_info->subvol_sem); | 7080 | up_read(&root->fs_info->subvol_sem); |
@@ -7246,7 +7246,7 @@ out_unlock: | |||
7246 | if (!err) | 7246 | if (!err) |
7247 | d_instantiate(dentry, inode); | 7247 | d_instantiate(dentry, inode); |
7248 | nr = trans->blocks_used; | 7248 | nr = trans->blocks_used; |
7249 | btrfs_end_transaction_throttle(trans, root); | 7249 | btrfs_end_transaction(trans, root); |
7250 | if (drop_inode) { | 7250 | if (drop_inode) { |
7251 | inode_dec_link_count(inode); | 7251 | inode_dec_link_count(inode); |
7252 | iput(inode); | 7252 | iput(inode); |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5441ff1480fd..ab620014bcc3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
@@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) | |||
176 | struct btrfs_trans_handle *trans; | 176 | struct btrfs_trans_handle *trans; |
177 | unsigned int flags, oldflags; | 177 | unsigned int flags, oldflags; |
178 | int ret; | 178 | int ret; |
179 | u64 ip_oldflags; | ||
180 | unsigned int i_oldflags; | ||
179 | 181 | ||
180 | if (btrfs_root_readonly(root)) | 182 | if (btrfs_root_readonly(root)) |
181 | return -EROFS; | 183 | return -EROFS; |
@@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) | |||
192 | 194 | ||
193 | mutex_lock(&inode->i_mutex); | 195 | mutex_lock(&inode->i_mutex); |
194 | 196 | ||
197 | ip_oldflags = ip->flags; | ||
198 | i_oldflags = inode->i_flags; | ||
199 | |||
195 | flags = btrfs_mask_flags(inode->i_mode, flags); | 200 | flags = btrfs_mask_flags(inode->i_mode, flags); |
196 | oldflags = btrfs_flags_to_ioctl(ip->flags); | 201 | oldflags = btrfs_flags_to_ioctl(ip->flags); |
197 | if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { | 202 | if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { |
@@ -249,19 +254,24 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) | |||
249 | ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); | 254 | ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); |
250 | } | 255 | } |
251 | 256 | ||
252 | trans = btrfs_join_transaction(root); | 257 | trans = btrfs_start_transaction(root, 1); |
253 | BUG_ON(IS_ERR(trans)); | 258 | if (IS_ERR(trans)) { |
259 | ret = PTR_ERR(trans); | ||
260 | goto out_drop; | ||
261 | } | ||
254 | 262 | ||
255 | btrfs_update_iflags(inode); | 263 | btrfs_update_iflags(inode); |
256 | inode->i_ctime = CURRENT_TIME; | 264 | inode->i_ctime = CURRENT_TIME; |
257 | ret = btrfs_update_inode(trans, root, inode); | 265 | ret = btrfs_update_inode(trans, root, inode); |
258 | BUG_ON(ret); | ||
259 | 266 | ||
260 | btrfs_end_transaction(trans, root); | 267 | btrfs_end_transaction(trans, root); |
268 | out_drop: | ||
269 | if (ret) { | ||
270 | ip->flags = ip_oldflags; | ||
271 | inode->i_flags = i_oldflags; | ||
272 | } | ||
261 | 273 | ||
262 | mnt_drop_write_file(file); | 274 | mnt_drop_write_file(file); |
263 | |||
264 | ret = 0; | ||
265 | out_unlock: | 275 | out_unlock: |
266 | mutex_unlock(&inode->i_mutex); | 276 | mutex_unlock(&inode->i_mutex); |
267 | return ret; | 277 | return ret; |
@@ -276,14 +286,13 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg) | |||
276 | 286 | ||
277 | static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) | 287 | static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) |
278 | { | 288 | { |
279 | struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info; | 289 | struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb); |
280 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
281 | struct btrfs_device *device; | 290 | struct btrfs_device *device; |
282 | struct request_queue *q; | 291 | struct request_queue *q; |
283 | struct fstrim_range range; | 292 | struct fstrim_range range; |
284 | u64 minlen = ULLONG_MAX; | 293 | u64 minlen = ULLONG_MAX; |
285 | u64 num_devices = 0; | 294 | u64 num_devices = 0; |
286 | u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); | 295 | u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); |
287 | int ret; | 296 | int ret; |
288 | 297 | ||
289 | if (!capable(CAP_SYS_ADMIN)) | 298 | if (!capable(CAP_SYS_ADMIN)) |
@@ -312,7 +321,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) | |||
312 | 321 | ||
313 | range.len = min(range.len, total_bytes - range.start); | 322 | range.len = min(range.len, total_bytes - range.start); |
314 | range.minlen = max(range.minlen, minlen); | 323 | range.minlen = max(range.minlen, minlen); |
315 | ret = btrfs_trim_fs(root, &range); | 324 | ret = btrfs_trim_fs(fs_info->tree_root, &range); |
316 | if (ret < 0) | 325 | if (ret < 0) |
317 | return ret; | 326 | return ret; |
318 | 327 | ||
@@ -358,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root, | |||
358 | return PTR_ERR(trans); | 367 | return PTR_ERR(trans); |
359 | 368 | ||
360 | leaf = btrfs_alloc_free_block(trans, root, root->leafsize, | 369 | leaf = btrfs_alloc_free_block(trans, root, root->leafsize, |
361 | 0, objectid, NULL, 0, 0, 0); | 370 | 0, objectid, NULL, 0, 0, 0, 0); |
362 | if (IS_ERR(leaf)) { | 371 | if (IS_ERR(leaf)) { |
363 | ret = PTR_ERR(leaf); | 372 | ret = PTR_ERR(leaf); |
364 | goto fail; | 373 | goto fail; |
@@ -858,10 +867,8 @@ static int cluster_pages_for_defrag(struct inode *inode, | |||
858 | return 0; | 867 | return 0; |
859 | file_end = (isize - 1) >> PAGE_CACHE_SHIFT; | 868 | file_end = (isize - 1) >> PAGE_CACHE_SHIFT; |
860 | 869 | ||
861 | mutex_lock(&inode->i_mutex); | ||
862 | ret = btrfs_delalloc_reserve_space(inode, | 870 | ret = btrfs_delalloc_reserve_space(inode, |
863 | num_pages << PAGE_CACHE_SHIFT); | 871 | num_pages << PAGE_CACHE_SHIFT); |
864 | mutex_unlock(&inode->i_mutex); | ||
865 | if (ret) | 872 | if (ret) |
866 | return ret; | 873 | return ret; |
867 | again: | 874 | again: |
@@ -1203,13 +1210,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1203 | if (!capable(CAP_SYS_ADMIN)) | 1210 | if (!capable(CAP_SYS_ADMIN)) |
1204 | return -EPERM; | 1211 | return -EPERM; |
1205 | 1212 | ||
1213 | mutex_lock(&root->fs_info->volume_mutex); | ||
1214 | if (root->fs_info->balance_ctl) { | ||
1215 | printk(KERN_INFO "btrfs: balance in progress\n"); | ||
1216 | ret = -EINVAL; | ||
1217 | goto out; | ||
1218 | } | ||
1219 | |||
1206 | vol_args = memdup_user(arg, sizeof(*vol_args)); | 1220 | vol_args = memdup_user(arg, sizeof(*vol_args)); |
1207 | if (IS_ERR(vol_args)) | 1221 | if (IS_ERR(vol_args)) { |
1208 | return PTR_ERR(vol_args); | 1222 | ret = PTR_ERR(vol_args); |
1223 | goto out; | ||
1224 | } | ||
1209 | 1225 | ||
1210 | vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; | 1226 | vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; |
1211 | 1227 | ||
1212 | mutex_lock(&root->fs_info->volume_mutex); | ||
1213 | sizestr = vol_args->name; | 1228 | sizestr = vol_args->name; |
1214 | devstr = strchr(sizestr, ':'); | 1229 | devstr = strchr(sizestr, ':'); |
1215 | if (devstr) { | 1230 | if (devstr) { |
@@ -1226,7 +1241,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1226 | printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", | 1241 | printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", |
1227 | (unsigned long long)devid); | 1242 | (unsigned long long)devid); |
1228 | ret = -EINVAL; | 1243 | ret = -EINVAL; |
1229 | goto out_unlock; | 1244 | goto out_free; |
1230 | } | 1245 | } |
1231 | if (!strcmp(sizestr, "max")) | 1246 | if (!strcmp(sizestr, "max")) |
1232 | new_size = device->bdev->bd_inode->i_size; | 1247 | new_size = device->bdev->bd_inode->i_size; |
@@ -1241,7 +1256,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1241 | new_size = memparse(sizestr, NULL); | 1256 | new_size = memparse(sizestr, NULL); |
1242 | if (new_size == 0) { | 1257 | if (new_size == 0) { |
1243 | ret = -EINVAL; | 1258 | ret = -EINVAL; |
1244 | goto out_unlock; | 1259 | goto out_free; |
1245 | } | 1260 | } |
1246 | } | 1261 | } |
1247 | 1262 | ||
@@ -1250,7 +1265,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1250 | if (mod < 0) { | 1265 | if (mod < 0) { |
1251 | if (new_size > old_size) { | 1266 | if (new_size > old_size) { |
1252 | ret = -EINVAL; | 1267 | ret = -EINVAL; |
1253 | goto out_unlock; | 1268 | goto out_free; |
1254 | } | 1269 | } |
1255 | new_size = old_size - new_size; | 1270 | new_size = old_size - new_size; |
1256 | } else if (mod > 0) { | 1271 | } else if (mod > 0) { |
@@ -1259,11 +1274,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1259 | 1274 | ||
1260 | if (new_size < 256 * 1024 * 1024) { | 1275 | if (new_size < 256 * 1024 * 1024) { |
1261 | ret = -EINVAL; | 1276 | ret = -EINVAL; |
1262 | goto out_unlock; | 1277 | goto out_free; |
1263 | } | 1278 | } |
1264 | if (new_size > device->bdev->bd_inode->i_size) { | 1279 | if (new_size > device->bdev->bd_inode->i_size) { |
1265 | ret = -EFBIG; | 1280 | ret = -EFBIG; |
1266 | goto out_unlock; | 1281 | goto out_free; |
1267 | } | 1282 | } |
1268 | 1283 | ||
1269 | do_div(new_size, root->sectorsize); | 1284 | do_div(new_size, root->sectorsize); |
@@ -1276,7 +1291,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1276 | trans = btrfs_start_transaction(root, 0); | 1291 | trans = btrfs_start_transaction(root, 0); |
1277 | if (IS_ERR(trans)) { | 1292 | if (IS_ERR(trans)) { |
1278 | ret = PTR_ERR(trans); | 1293 | ret = PTR_ERR(trans); |
1279 | goto out_unlock; | 1294 | goto out_free; |
1280 | } | 1295 | } |
1281 | ret = btrfs_grow_device(trans, device, new_size); | 1296 | ret = btrfs_grow_device(trans, device, new_size); |
1282 | btrfs_commit_transaction(trans, root); | 1297 | btrfs_commit_transaction(trans, root); |
@@ -1284,9 +1299,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1284 | ret = btrfs_shrink_device(device, new_size); | 1299 | ret = btrfs_shrink_device(device, new_size); |
1285 | } | 1300 | } |
1286 | 1301 | ||
1287 | out_unlock: | 1302 | out_free: |
1288 | mutex_unlock(&root->fs_info->volume_mutex); | ||
1289 | kfree(vol_args); | 1303 | kfree(vol_args); |
1304 | out: | ||
1305 | mutex_unlock(&root->fs_info->volume_mutex); | ||
1290 | return ret; | 1306 | return ret; |
1291 | } | 1307 | } |
1292 | 1308 | ||
@@ -2052,14 +2068,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) | |||
2052 | if (!capable(CAP_SYS_ADMIN)) | 2068 | if (!capable(CAP_SYS_ADMIN)) |
2053 | return -EPERM; | 2069 | return -EPERM; |
2054 | 2070 | ||
2071 | mutex_lock(&root->fs_info->volume_mutex); | ||
2072 | if (root->fs_info->balance_ctl) { | ||
2073 | printk(KERN_INFO "btrfs: balance in progress\n"); | ||
2074 | ret = -EINVAL; | ||
2075 | goto out; | ||
2076 | } | ||
2077 | |||
2055 | vol_args = memdup_user(arg, sizeof(*vol_args)); | 2078 | vol_args = memdup_user(arg, sizeof(*vol_args)); |
2056 | if (IS_ERR(vol_args)) | 2079 | if (IS_ERR(vol_args)) { |
2057 | return PTR_ERR(vol_args); | 2080 | ret = PTR_ERR(vol_args); |
2081 | goto out; | ||
2082 | } | ||
2058 | 2083 | ||
2059 | vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; | 2084 | vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; |
2060 | ret = btrfs_init_new_device(root, vol_args->name); | 2085 | ret = btrfs_init_new_device(root, vol_args->name); |
2061 | 2086 | ||
2062 | kfree(vol_args); | 2087 | kfree(vol_args); |
2088 | out: | ||
2089 | mutex_unlock(&root->fs_info->volume_mutex); | ||
2063 | return ret; | 2090 | return ret; |
2064 | } | 2091 | } |
2065 | 2092 | ||
@@ -2074,14 +2101,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) | |||
2074 | if (root->fs_info->sb->s_flags & MS_RDONLY) | 2101 | if (root->fs_info->sb->s_flags & MS_RDONLY) |
2075 | return -EROFS; | 2102 | return -EROFS; |
2076 | 2103 | ||
2104 | mutex_lock(&root->fs_info->volume_mutex); | ||
2105 | if (root->fs_info->balance_ctl) { | ||
2106 | printk(KERN_INFO "btrfs: balance in progress\n"); | ||
2107 | ret = -EINVAL; | ||
2108 | goto out; | ||
2109 | } | ||
2110 | |||
2077 | vol_args = memdup_user(arg, sizeof(*vol_args)); | 2111 | vol_args = memdup_user(arg, sizeof(*vol_args)); |
2078 | if (IS_ERR(vol_args)) | 2112 | if (IS_ERR(vol_args)) { |
2079 | return PTR_ERR(vol_args); | 2113 | ret = PTR_ERR(vol_args); |
2114 | goto out; | ||
2115 | } | ||
2080 | 2116 | ||
2081 | vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; | 2117 | vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; |
2082 | ret = btrfs_rm_device(root, vol_args->name); | 2118 | ret = btrfs_rm_device(root, vol_args->name); |
2083 | 2119 | ||
2084 | kfree(vol_args); | 2120 | kfree(vol_args); |
2121 | out: | ||
2122 | mutex_unlock(&root->fs_info->volume_mutex); | ||
2085 | return ret; | 2123 | return ret; |
2086 | } | 2124 | } |
2087 | 2125 | ||
@@ -2427,7 +2465,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, | |||
2427 | disko, diskl, 0, | 2465 | disko, diskl, 0, |
2428 | root->root_key.objectid, | 2466 | root->root_key.objectid, |
2429 | btrfs_ino(inode), | 2467 | btrfs_ino(inode), |
2430 | new_key.offset - datao); | 2468 | new_key.offset - datao, |
2469 | 0); | ||
2431 | BUG_ON(ret); | 2470 | BUG_ON(ret); |
2432 | } | 2471 | } |
2433 | } else if (type == BTRFS_FILE_EXTENT_INLINE) { | 2472 | } else if (type == BTRFS_FILE_EXTENT_INLINE) { |
@@ -2977,7 +3016,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, | |||
2977 | { | 3016 | { |
2978 | int ret = 0; | 3017 | int ret = 0; |
2979 | int size; | 3018 | int size; |
2980 | u64 extent_offset; | 3019 | u64 extent_item_pos; |
2981 | struct btrfs_ioctl_logical_ino_args *loi; | 3020 | struct btrfs_ioctl_logical_ino_args *loi; |
2982 | struct btrfs_data_container *inodes = NULL; | 3021 | struct btrfs_data_container *inodes = NULL; |
2983 | struct btrfs_path *path = NULL; | 3022 | struct btrfs_path *path = NULL; |
@@ -3008,15 +3047,17 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, | |||
3008 | } | 3047 | } |
3009 | 3048 | ||
3010 | ret = extent_from_logical(root->fs_info, loi->logical, path, &key); | 3049 | ret = extent_from_logical(root->fs_info, loi->logical, path, &key); |
3050 | btrfs_release_path(path); | ||
3011 | 3051 | ||
3012 | if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) | 3052 | if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) |
3013 | ret = -ENOENT; | 3053 | ret = -ENOENT; |
3014 | if (ret < 0) | 3054 | if (ret < 0) |
3015 | goto out; | 3055 | goto out; |
3016 | 3056 | ||
3017 | extent_offset = loi->logical - key.objectid; | 3057 | extent_item_pos = loi->logical - key.objectid; |
3018 | ret = iterate_extent_inodes(root->fs_info, path, key.objectid, | 3058 | ret = iterate_extent_inodes(root->fs_info, path, key.objectid, |
3019 | extent_offset, build_ino_list, inodes); | 3059 | extent_item_pos, build_ino_list, |
3060 | inodes); | ||
3020 | 3061 | ||
3021 | if (ret < 0) | 3062 | if (ret < 0) |
3022 | goto out; | 3063 | goto out; |
@@ -3034,6 +3075,163 @@ out: | |||
3034 | return ret; | 3075 | return ret; |
3035 | } | 3076 | } |
3036 | 3077 | ||
3078 | void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, | ||
3079 | struct btrfs_ioctl_balance_args *bargs) | ||
3080 | { | ||
3081 | struct btrfs_balance_control *bctl = fs_info->balance_ctl; | ||
3082 | |||
3083 | bargs->flags = bctl->flags; | ||
3084 | |||
3085 | if (atomic_read(&fs_info->balance_running)) | ||
3086 | bargs->state |= BTRFS_BALANCE_STATE_RUNNING; | ||
3087 | if (atomic_read(&fs_info->balance_pause_req)) | ||
3088 | bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; | ||
3089 | if (atomic_read(&fs_info->balance_cancel_req)) | ||
3090 | bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; | ||
3091 | |||
3092 | memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); | ||
3093 | memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); | ||
3094 | memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); | ||
3095 | |||
3096 | if (lock) { | ||
3097 | spin_lock(&fs_info->balance_lock); | ||
3098 | memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); | ||
3099 | spin_unlock(&fs_info->balance_lock); | ||
3100 | } else { | ||
3101 | memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); | ||
3102 | } | ||
3103 | } | ||
3104 | |||
3105 | static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) | ||
3106 | { | ||
3107 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
3108 | struct btrfs_ioctl_balance_args *bargs; | ||
3109 | struct btrfs_balance_control *bctl; | ||
3110 | int ret; | ||
3111 | |||
3112 | if (!capable(CAP_SYS_ADMIN)) | ||
3113 | return -EPERM; | ||
3114 | |||
3115 | if (fs_info->sb->s_flags & MS_RDONLY) | ||
3116 | return -EROFS; | ||
3117 | |||
3118 | mutex_lock(&fs_info->volume_mutex); | ||
3119 | mutex_lock(&fs_info->balance_mutex); | ||
3120 | |||
3121 | if (arg) { | ||
3122 | bargs = memdup_user(arg, sizeof(*bargs)); | ||
3123 | if (IS_ERR(bargs)) { | ||
3124 | ret = PTR_ERR(bargs); | ||
3125 | goto out; | ||
3126 | } | ||
3127 | |||
3128 | if (bargs->flags & BTRFS_BALANCE_RESUME) { | ||
3129 | if (!fs_info->balance_ctl) { | ||
3130 | ret = -ENOTCONN; | ||
3131 | goto out_bargs; | ||
3132 | } | ||
3133 | |||
3134 | bctl = fs_info->balance_ctl; | ||
3135 | spin_lock(&fs_info->balance_lock); | ||
3136 | bctl->flags |= BTRFS_BALANCE_RESUME; | ||
3137 | spin_unlock(&fs_info->balance_lock); | ||
3138 | |||
3139 | goto do_balance; | ||
3140 | } | ||
3141 | } else { | ||
3142 | bargs = NULL; | ||
3143 | } | ||
3144 | |||
3145 | if (fs_info->balance_ctl) { | ||
3146 | ret = -EINPROGRESS; | ||
3147 | goto out_bargs; | ||
3148 | } | ||
3149 | |||
3150 | bctl = kzalloc(sizeof(*bctl), GFP_NOFS); | ||
3151 | if (!bctl) { | ||
3152 | ret = -ENOMEM; | ||
3153 | goto out_bargs; | ||
3154 | } | ||
3155 | |||
3156 | bctl->fs_info = fs_info; | ||
3157 | if (arg) { | ||
3158 | memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); | ||
3159 | memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); | ||
3160 | memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); | ||
3161 | |||
3162 | bctl->flags = bargs->flags; | ||
3163 | } else { | ||
3164 | /* balance everything - no filters */ | ||
3165 | bctl->flags |= BTRFS_BALANCE_TYPE_MASK; | ||
3166 | } | ||
3167 | |||
3168 | do_balance: | ||
3169 | ret = btrfs_balance(bctl, bargs); | ||
3170 | /* | ||
3171 | * bctl is freed in __cancel_balance or in free_fs_info if | ||
3172 | * restriper was paused all the way until unmount | ||
3173 | */ | ||
3174 | if (arg) { | ||
3175 | if (copy_to_user(arg, bargs, sizeof(*bargs))) | ||
3176 | ret = -EFAULT; | ||
3177 | } | ||
3178 | |||
3179 | out_bargs: | ||
3180 | kfree(bargs); | ||
3181 | out: | ||
3182 | mutex_unlock(&fs_info->balance_mutex); | ||
3183 | mutex_unlock(&fs_info->volume_mutex); | ||
3184 | return ret; | ||
3185 | } | ||
3186 | |||
3187 | static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) | ||
3188 | { | ||
3189 | if (!capable(CAP_SYS_ADMIN)) | ||
3190 | return -EPERM; | ||
3191 | |||
3192 | switch (cmd) { | ||
3193 | case BTRFS_BALANCE_CTL_PAUSE: | ||
3194 | return btrfs_pause_balance(root->fs_info); | ||
3195 | case BTRFS_BALANCE_CTL_CANCEL: | ||
3196 | return btrfs_cancel_balance(root->fs_info); | ||
3197 | } | ||
3198 | |||
3199 | return -EINVAL; | ||
3200 | } | ||
3201 | |||
3202 | static long btrfs_ioctl_balance_progress(struct btrfs_root *root, | ||
3203 | void __user *arg) | ||
3204 | { | ||
3205 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
3206 | struct btrfs_ioctl_balance_args *bargs; | ||
3207 | int ret = 0; | ||
3208 | |||
3209 | if (!capable(CAP_SYS_ADMIN)) | ||
3210 | return -EPERM; | ||
3211 | |||
3212 | mutex_lock(&fs_info->balance_mutex); | ||
3213 | if (!fs_info->balance_ctl) { | ||
3214 | ret = -ENOTCONN; | ||
3215 | goto out; | ||
3216 | } | ||
3217 | |||
3218 | bargs = kzalloc(sizeof(*bargs), GFP_NOFS); | ||
3219 | if (!bargs) { | ||
3220 | ret = -ENOMEM; | ||
3221 | goto out; | ||
3222 | } | ||
3223 | |||
3224 | update_ioctl_balance_args(fs_info, 1, bargs); | ||
3225 | |||
3226 | if (copy_to_user(arg, bargs, sizeof(*bargs))) | ||
3227 | ret = -EFAULT; | ||
3228 | |||
3229 | kfree(bargs); | ||
3230 | out: | ||
3231 | mutex_unlock(&fs_info->balance_mutex); | ||
3232 | return ret; | ||
3233 | } | ||
3234 | |||
3037 | long btrfs_ioctl(struct file *file, unsigned int | 3235 | long btrfs_ioctl(struct file *file, unsigned int |
3038 | cmd, unsigned long arg) | 3236 | cmd, unsigned long arg) |
3039 | { | 3237 | { |
@@ -3078,7 +3276,7 @@ long btrfs_ioctl(struct file *file, unsigned int | |||
3078 | case BTRFS_IOC_DEV_INFO: | 3276 | case BTRFS_IOC_DEV_INFO: |
3079 | return btrfs_ioctl_dev_info(root, argp); | 3277 | return btrfs_ioctl_dev_info(root, argp); |
3080 | case BTRFS_IOC_BALANCE: | 3278 | case BTRFS_IOC_BALANCE: |
3081 | return btrfs_balance(root->fs_info->dev_root); | 3279 | return btrfs_ioctl_balance(root, NULL); |
3082 | case BTRFS_IOC_CLONE: | 3280 | case BTRFS_IOC_CLONE: |
3083 | return btrfs_ioctl_clone(file, arg, 0, 0, 0); | 3281 | return btrfs_ioctl_clone(file, arg, 0, 0, 0); |
3084 | case BTRFS_IOC_CLONE_RANGE: | 3282 | case BTRFS_IOC_CLONE_RANGE: |
@@ -3110,6 +3308,12 @@ long btrfs_ioctl(struct file *file, unsigned int | |||
3110 | return btrfs_ioctl_scrub_cancel(root, argp); | 3308 | return btrfs_ioctl_scrub_cancel(root, argp); |
3111 | case BTRFS_IOC_SCRUB_PROGRESS: | 3309 | case BTRFS_IOC_SCRUB_PROGRESS: |
3112 | return btrfs_ioctl_scrub_progress(root, argp); | 3310 | return btrfs_ioctl_scrub_progress(root, argp); |
3311 | case BTRFS_IOC_BALANCE_V2: | ||
3312 | return btrfs_ioctl_balance(root, argp); | ||
3313 | case BTRFS_IOC_BALANCE_CTL: | ||
3314 | return btrfs_ioctl_balance_ctl(root, arg); | ||
3315 | case BTRFS_IOC_BALANCE_PROGRESS: | ||
3316 | return btrfs_ioctl_balance_progress(root, argp); | ||
3113 | } | 3317 | } |
3114 | 3318 | ||
3115 | return -ENOTTY; | 3319 | return -ENOTTY; |
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 252ae9915de8..4f69028a68c4 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h | |||
@@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args { | |||
109 | __u64 reserved[124]; /* pad to 1k */ | 109 | __u64 reserved[124]; /* pad to 1k */ |
110 | }; | 110 | }; |
111 | 111 | ||
112 | /* balance control ioctl modes */ | ||
113 | #define BTRFS_BALANCE_CTL_PAUSE 1 | ||
114 | #define BTRFS_BALANCE_CTL_CANCEL 2 | ||
115 | |||
116 | /* | ||
117 | * this is packed, because it should be exactly the same as its disk | ||
118 | * byte order counterpart (struct btrfs_disk_balance_args) | ||
119 | */ | ||
120 | struct btrfs_balance_args { | ||
121 | __u64 profiles; | ||
122 | __u64 usage; | ||
123 | __u64 devid; | ||
124 | __u64 pstart; | ||
125 | __u64 pend; | ||
126 | __u64 vstart; | ||
127 | __u64 vend; | ||
128 | |||
129 | __u64 target; | ||
130 | |||
131 | __u64 flags; | ||
132 | |||
133 | __u64 unused[8]; | ||
134 | } __attribute__ ((__packed__)); | ||
135 | |||
136 | /* report balance progress to userspace */ | ||
137 | struct btrfs_balance_progress { | ||
138 | __u64 expected; /* estimated # of chunks that will be | ||
139 | * relocated to fulfill the request */ | ||
140 | __u64 considered; /* # of chunks we have considered so far */ | ||
141 | __u64 completed; /* # of chunks relocated so far */ | ||
142 | }; | ||
143 | |||
144 | #define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) | ||
145 | #define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) | ||
146 | #define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2) | ||
147 | |||
148 | struct btrfs_ioctl_balance_args { | ||
149 | __u64 flags; /* in/out */ | ||
150 | __u64 state; /* out */ | ||
151 | |||
152 | struct btrfs_balance_args data; /* in/out */ | ||
153 | struct btrfs_balance_args meta; /* in/out */ | ||
154 | struct btrfs_balance_args sys; /* in/out */ | ||
155 | |||
156 | struct btrfs_balance_progress stat; /* out */ | ||
157 | |||
158 | __u64 unused[72]; /* pad to 1k */ | ||
159 | }; | ||
160 | |||
112 | #define BTRFS_INO_LOOKUP_PATH_MAX 4080 | 161 | #define BTRFS_INO_LOOKUP_PATH_MAX 4080 |
113 | struct btrfs_ioctl_ino_lookup_args { | 162 | struct btrfs_ioctl_ino_lookup_args { |
114 | __u64 treeid; | 163 | __u64 treeid; |
@@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args { | |||
272 | struct btrfs_ioctl_dev_info_args) | 321 | struct btrfs_ioctl_dev_info_args) |
273 | #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ | 322 | #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ |
274 | struct btrfs_ioctl_fs_info_args) | 323 | struct btrfs_ioctl_fs_info_args) |
324 | #define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ | ||
325 | struct btrfs_ioctl_balance_args) | ||
326 | #define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) | ||
327 | #define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \ | ||
328 | struct btrfs_ioctl_balance_args) | ||
275 | #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ | 329 | #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ |
276 | struct btrfs_ioctl_ino_path_args) | 330 | struct btrfs_ioctl_ino_path_args) |
277 | #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ | 331 | #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ |
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index d77b67c4b275..5e178d8f7167 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c | |||
@@ -33,6 +33,14 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb); | |||
33 | */ | 33 | */ |
34 | void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) | 34 | void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) |
35 | { | 35 | { |
36 | if (eb->lock_nested) { | ||
37 | read_lock(&eb->lock); | ||
38 | if (eb->lock_nested && current->pid == eb->lock_owner) { | ||
39 | read_unlock(&eb->lock); | ||
40 | return; | ||
41 | } | ||
42 | read_unlock(&eb->lock); | ||
43 | } | ||
36 | if (rw == BTRFS_WRITE_LOCK) { | 44 | if (rw == BTRFS_WRITE_LOCK) { |
37 | if (atomic_read(&eb->blocking_writers) == 0) { | 45 | if (atomic_read(&eb->blocking_writers) == 0) { |
38 | WARN_ON(atomic_read(&eb->spinning_writers) != 1); | 46 | WARN_ON(atomic_read(&eb->spinning_writers) != 1); |
@@ -57,6 +65,14 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) | |||
57 | */ | 65 | */ |
58 | void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) | 66 | void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) |
59 | { | 67 | { |
68 | if (eb->lock_nested) { | ||
69 | read_lock(&eb->lock); | ||
70 | if (&eb->lock_nested && current->pid == eb->lock_owner) { | ||
71 | read_unlock(&eb->lock); | ||
72 | return; | ||
73 | } | ||
74 | read_unlock(&eb->lock); | ||
75 | } | ||
60 | if (rw == BTRFS_WRITE_LOCK_BLOCKING) { | 76 | if (rw == BTRFS_WRITE_LOCK_BLOCKING) { |
61 | BUG_ON(atomic_read(&eb->blocking_writers) != 1); | 77 | BUG_ON(atomic_read(&eb->blocking_writers) != 1); |
62 | write_lock(&eb->lock); | 78 | write_lock(&eb->lock); |
@@ -81,12 +97,25 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) | |||
81 | void btrfs_tree_read_lock(struct extent_buffer *eb) | 97 | void btrfs_tree_read_lock(struct extent_buffer *eb) |
82 | { | 98 | { |
83 | again: | 99 | again: |
100 | read_lock(&eb->lock); | ||
101 | if (atomic_read(&eb->blocking_writers) && | ||
102 | current->pid == eb->lock_owner) { | ||
103 | /* | ||
104 | * This extent is already write-locked by our thread. We allow | ||
105 | * an additional read lock to be added because it's for the same | ||
106 | * thread. btrfs_find_all_roots() depends on this as it may be | ||
107 | * called on a partly (write-)locked tree. | ||
108 | */ | ||
109 | BUG_ON(eb->lock_nested); | ||
110 | eb->lock_nested = 1; | ||
111 | read_unlock(&eb->lock); | ||
112 | return; | ||
113 | } | ||
114 | read_unlock(&eb->lock); | ||
84 | wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); | 115 | wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); |
85 | read_lock(&eb->lock); | 116 | read_lock(&eb->lock); |
86 | if (atomic_read(&eb->blocking_writers)) { | 117 | if (atomic_read(&eb->blocking_writers)) { |
87 | read_unlock(&eb->lock); | 118 | read_unlock(&eb->lock); |
88 | wait_event(eb->write_lock_wq, | ||
89 | atomic_read(&eb->blocking_writers) == 0); | ||
90 | goto again; | 119 | goto again; |
91 | } | 120 | } |
92 | atomic_inc(&eb->read_locks); | 121 | atomic_inc(&eb->read_locks); |
@@ -129,6 +158,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) | |||
129 | } | 158 | } |
130 | atomic_inc(&eb->write_locks); | 159 | atomic_inc(&eb->write_locks); |
131 | atomic_inc(&eb->spinning_writers); | 160 | atomic_inc(&eb->spinning_writers); |
161 | eb->lock_owner = current->pid; | ||
132 | return 1; | 162 | return 1; |
133 | } | 163 | } |
134 | 164 | ||
@@ -137,6 +167,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) | |||
137 | */ | 167 | */ |
138 | void btrfs_tree_read_unlock(struct extent_buffer *eb) | 168 | void btrfs_tree_read_unlock(struct extent_buffer *eb) |
139 | { | 169 | { |
170 | if (eb->lock_nested) { | ||
171 | read_lock(&eb->lock); | ||
172 | if (eb->lock_nested && current->pid == eb->lock_owner) { | ||
173 | eb->lock_nested = 0; | ||
174 | read_unlock(&eb->lock); | ||
175 | return; | ||
176 | } | ||
177 | read_unlock(&eb->lock); | ||
178 | } | ||
140 | btrfs_assert_tree_read_locked(eb); | 179 | btrfs_assert_tree_read_locked(eb); |
141 | WARN_ON(atomic_read(&eb->spinning_readers) == 0); | 180 | WARN_ON(atomic_read(&eb->spinning_readers) == 0); |
142 | atomic_dec(&eb->spinning_readers); | 181 | atomic_dec(&eb->spinning_readers); |
@@ -149,6 +188,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) | |||
149 | */ | 188 | */ |
150 | void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) | 189 | void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) |
151 | { | 190 | { |
191 | if (eb->lock_nested) { | ||
192 | read_lock(&eb->lock); | ||
193 | if (eb->lock_nested && current->pid == eb->lock_owner) { | ||
194 | eb->lock_nested = 0; | ||
195 | read_unlock(&eb->lock); | ||
196 | return; | ||
197 | } | ||
198 | read_unlock(&eb->lock); | ||
199 | } | ||
152 | btrfs_assert_tree_read_locked(eb); | 200 | btrfs_assert_tree_read_locked(eb); |
153 | WARN_ON(atomic_read(&eb->blocking_readers) == 0); | 201 | WARN_ON(atomic_read(&eb->blocking_readers) == 0); |
154 | if (atomic_dec_and_test(&eb->blocking_readers)) | 202 | if (atomic_dec_and_test(&eb->blocking_readers)) |
@@ -181,6 +229,7 @@ again: | |||
181 | WARN_ON(atomic_read(&eb->spinning_writers)); | 229 | WARN_ON(atomic_read(&eb->spinning_writers)); |
182 | atomic_inc(&eb->spinning_writers); | 230 | atomic_inc(&eb->spinning_writers); |
183 | atomic_inc(&eb->write_locks); | 231 | atomic_inc(&eb->write_locks); |
232 | eb->lock_owner = current->pid; | ||
184 | return 0; | 233 | return 0; |
185 | } | 234 | } |
186 | 235 | ||
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index cfb55434a469..8c1aae2c845d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c | |||
@@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans, | |||
1604 | ret = btrfs_inc_extent_ref(trans, root, new_bytenr, | 1604 | ret = btrfs_inc_extent_ref(trans, root, new_bytenr, |
1605 | num_bytes, parent, | 1605 | num_bytes, parent, |
1606 | btrfs_header_owner(leaf), | 1606 | btrfs_header_owner(leaf), |
1607 | key.objectid, key.offset); | 1607 | key.objectid, key.offset, 1); |
1608 | BUG_ON(ret); | 1608 | BUG_ON(ret); |
1609 | 1609 | ||
1610 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, | 1610 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, |
1611 | parent, btrfs_header_owner(leaf), | 1611 | parent, btrfs_header_owner(leaf), |
1612 | key.objectid, key.offset); | 1612 | key.objectid, key.offset, 1); |
1613 | BUG_ON(ret); | 1613 | BUG_ON(ret); |
1614 | } | 1614 | } |
1615 | if (dirty) | 1615 | if (dirty) |
@@ -1778,21 +1778,23 @@ again: | |||
1778 | 1778 | ||
1779 | ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, | 1779 | ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, |
1780 | path->nodes[level]->start, | 1780 | path->nodes[level]->start, |
1781 | src->root_key.objectid, level - 1, 0); | 1781 | src->root_key.objectid, level - 1, 0, |
1782 | 1); | ||
1782 | BUG_ON(ret); | 1783 | BUG_ON(ret); |
1783 | ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, | 1784 | ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, |
1784 | 0, dest->root_key.objectid, level - 1, | 1785 | 0, dest->root_key.objectid, level - 1, |
1785 | 0); | 1786 | 0, 1); |
1786 | BUG_ON(ret); | 1787 | BUG_ON(ret); |
1787 | 1788 | ||
1788 | ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, | 1789 | ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, |
1789 | path->nodes[level]->start, | 1790 | path->nodes[level]->start, |
1790 | src->root_key.objectid, level - 1, 0); | 1791 | src->root_key.objectid, level - 1, 0, |
1792 | 1); | ||
1791 | BUG_ON(ret); | 1793 | BUG_ON(ret); |
1792 | 1794 | ||
1793 | ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, | 1795 | ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, |
1794 | 0, dest->root_key.objectid, level - 1, | 1796 | 0, dest->root_key.objectid, level - 1, |
1795 | 0); | 1797 | 0, 1); |
1796 | BUG_ON(ret); | 1798 | BUG_ON(ret); |
1797 | 1799 | ||
1798 | btrfs_unlock_up_safe(path, 0); | 1800 | btrfs_unlock_up_safe(path, 0); |
@@ -2244,7 +2246,7 @@ again: | |||
2244 | } else { | 2246 | } else { |
2245 | list_del_init(&reloc_root->root_list); | 2247 | list_del_init(&reloc_root->root_list); |
2246 | } | 2248 | } |
2247 | btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); | 2249 | btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); |
2248 | } | 2250 | } |
2249 | 2251 | ||
2250 | if (found) { | 2252 | if (found) { |
@@ -2558,7 +2560,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, | |||
2558 | node->eb->start, blocksize, | 2560 | node->eb->start, blocksize, |
2559 | upper->eb->start, | 2561 | upper->eb->start, |
2560 | btrfs_header_owner(upper->eb), | 2562 | btrfs_header_owner(upper->eb), |
2561 | node->level, 0); | 2563 | node->level, 0, 1); |
2562 | BUG_ON(ret); | 2564 | BUG_ON(ret); |
2563 | 2565 | ||
2564 | ret = btrfs_drop_subtree(trans, root, eb, upper->eb); | 2566 | ret = btrfs_drop_subtree(trans, root, eb, upper->eb); |
@@ -2947,9 +2949,7 @@ static int relocate_file_extent_cluster(struct inode *inode, | |||
2947 | index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; | 2949 | index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; |
2948 | last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; | 2950 | last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; |
2949 | while (index <= last_index) { | 2951 | while (index <= last_index) { |
2950 | mutex_lock(&inode->i_mutex); | ||
2951 | ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); | 2952 | ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); |
2952 | mutex_unlock(&inode->i_mutex); | ||
2953 | if (ret) | 2953 | if (ret) |
2954 | goto out; | 2954 | goto out; |
2955 | 2955 | ||
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ddf2c90d3fc0..9770cc5bfb76 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include "transaction.h" | 25 | #include "transaction.h" |
26 | #include "backref.h" | 26 | #include "backref.h" |
27 | #include "extent_io.h" | 27 | #include "extent_io.h" |
28 | #include "check-integrity.h" | ||
28 | 29 | ||
29 | /* | 30 | /* |
30 | * This is only the first step towards a full-features scrub. It reads all | 31 | * This is only the first step towards a full-features scrub. It reads all |
@@ -309,7 +310,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, | |||
309 | u8 ref_level; | 310 | u8 ref_level; |
310 | unsigned long ptr = 0; | 311 | unsigned long ptr = 0; |
311 | const int bufsize = 4096; | 312 | const int bufsize = 4096; |
312 | u64 extent_offset; | 313 | u64 extent_item_pos; |
313 | 314 | ||
314 | path = btrfs_alloc_path(); | 315 | path = btrfs_alloc_path(); |
315 | 316 | ||
@@ -329,12 +330,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, | |||
329 | if (ret < 0) | 330 | if (ret < 0) |
330 | goto out; | 331 | goto out; |
331 | 332 | ||
332 | extent_offset = swarn.logical - found_key.objectid; | 333 | extent_item_pos = swarn.logical - found_key.objectid; |
333 | swarn.extent_item_size = found_key.offset; | 334 | swarn.extent_item_size = found_key.offset; |
334 | 335 | ||
335 | eb = path->nodes[0]; | 336 | eb = path->nodes[0]; |
336 | ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); | 337 | ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); |
337 | item_size = btrfs_item_size_nr(eb, path->slots[0]); | 338 | item_size = btrfs_item_size_nr(eb, path->slots[0]); |
339 | btrfs_release_path(path); | ||
338 | 340 | ||
339 | if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { | 341 | if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { |
340 | do { | 342 | do { |
@@ -351,7 +353,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, | |||
351 | } else { | 353 | } else { |
352 | swarn.path = path; | 354 | swarn.path = path; |
353 | iterate_extent_inodes(fs_info, path, found_key.objectid, | 355 | iterate_extent_inodes(fs_info, path, found_key.objectid, |
354 | extent_offset, | 356 | extent_item_pos, |
355 | scrub_print_warning_inode, &swarn); | 357 | scrub_print_warning_inode, &swarn); |
356 | } | 358 | } |
357 | 359 | ||
@@ -732,7 +734,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, | |||
732 | bio_add_page(bio, page, PAGE_SIZE, 0); | 734 | bio_add_page(bio, page, PAGE_SIZE, 0); |
733 | bio->bi_end_io = scrub_fixup_end_io; | 735 | bio->bi_end_io = scrub_fixup_end_io; |
734 | bio->bi_private = &complete; | 736 | bio->bi_private = &complete; |
735 | submit_bio(rw, bio); | 737 | btrfsic_submit_bio(rw, bio); |
736 | 738 | ||
737 | /* this will also unplug the queue */ | 739 | /* this will also unplug the queue */ |
738 | wait_for_completion(&complete); | 740 | wait_for_completion(&complete); |
@@ -958,7 +960,7 @@ static int scrub_submit(struct scrub_dev *sdev) | |||
958 | sdev->curr = -1; | 960 | sdev->curr = -1; |
959 | atomic_inc(&sdev->in_flight); | 961 | atomic_inc(&sdev->in_flight); |
960 | 962 | ||
961 | submit_bio(READ, sbio->bio); | 963 | btrfsic_submit_bio(READ, sbio->bio); |
962 | 964 | ||
963 | return 0; | 965 | return 0; |
964 | } | 966 | } |
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ae488aa1966a..3ce97b217cbe 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c | |||
@@ -147,13 +147,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, | |||
147 | 147 | ||
148 | static void btrfs_put_super(struct super_block *sb) | 148 | static void btrfs_put_super(struct super_block *sb) |
149 | { | 149 | { |
150 | struct btrfs_root *root = btrfs_sb(sb); | 150 | (void)close_ctree(btrfs_sb(sb)->tree_root); |
151 | int ret; | 151 | /* FIXME: need to fix VFS to return error? */ |
152 | 152 | /* AV: return it _where_? ->put_super() can be triggered by any number | |
153 | ret = close_ctree(root); | 153 | * of async events, up to and including delivery of SIGKILL to the |
154 | sb->s_fs_info = NULL; | 154 | * last process that kept it busy. Or segfault in the aforementioned |
155 | 155 | * process... Whom would you report that to? | |
156 | (void)ret; /* FIXME: need to fix VFS to return error? */ | 156 | */ |
157 | } | 157 | } |
158 | 158 | ||
159 | enum { | 159 | enum { |
@@ -163,8 +163,11 @@ enum { | |||
163 | Opt_compress_type, Opt_compress_force, Opt_compress_force_type, | 163 | Opt_compress_type, Opt_compress_force, Opt_compress_force_type, |
164 | Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, | 164 | Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, |
165 | Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, | 165 | Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, |
166 | Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, | 166 | Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, |
167 | Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, | 167 | Opt_no_space_cache, Opt_recovery, Opt_skip_balance, |
168 | Opt_check_integrity, Opt_check_integrity_including_extent_data, | ||
169 | Opt_check_integrity_print_mask, | ||
170 | Opt_err, | ||
168 | }; | 171 | }; |
169 | 172 | ||
170 | static match_table_t tokens = { | 173 | static match_table_t tokens = { |
@@ -199,6 +202,10 @@ static match_table_t tokens = { | |||
199 | {Opt_inode_cache, "inode_cache"}, | 202 | {Opt_inode_cache, "inode_cache"}, |
200 | {Opt_no_space_cache, "nospace_cache"}, | 203 | {Opt_no_space_cache, "nospace_cache"}, |
201 | {Opt_recovery, "recovery"}, | 204 | {Opt_recovery, "recovery"}, |
205 | {Opt_skip_balance, "skip_balance"}, | ||
206 | {Opt_check_integrity, "check_int"}, | ||
207 | {Opt_check_integrity_including_extent_data, "check_int_data"}, | ||
208 | {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, | ||
202 | {Opt_err, NULL}, | 209 | {Opt_err, NULL}, |
203 | }; | 210 | }; |
204 | 211 | ||
@@ -397,6 +404,40 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) | |||
397 | printk(KERN_INFO "btrfs: enabling auto recovery"); | 404 | printk(KERN_INFO "btrfs: enabling auto recovery"); |
398 | btrfs_set_opt(info->mount_opt, RECOVERY); | 405 | btrfs_set_opt(info->mount_opt, RECOVERY); |
399 | break; | 406 | break; |
407 | case Opt_skip_balance: | ||
408 | btrfs_set_opt(info->mount_opt, SKIP_BALANCE); | ||
409 | break; | ||
410 | #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY | ||
411 | case Opt_check_integrity_including_extent_data: | ||
412 | printk(KERN_INFO "btrfs: enabling check integrity" | ||
413 | " including extent data\n"); | ||
414 | btrfs_set_opt(info->mount_opt, | ||
415 | CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); | ||
416 | btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); | ||
417 | break; | ||
418 | case Opt_check_integrity: | ||
419 | printk(KERN_INFO "btrfs: enabling check integrity\n"); | ||
420 | btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); | ||
421 | break; | ||
422 | case Opt_check_integrity_print_mask: | ||
423 | intarg = 0; | ||
424 | match_int(&args[0], &intarg); | ||
425 | if (intarg) { | ||
426 | info->check_integrity_print_mask = intarg; | ||
427 | printk(KERN_INFO "btrfs:" | ||
428 | " check_integrity_print_mask 0x%x\n", | ||
429 | info->check_integrity_print_mask); | ||
430 | } | ||
431 | break; | ||
432 | #else | ||
433 | case Opt_check_integrity_including_extent_data: | ||
434 | case Opt_check_integrity: | ||
435 | case Opt_check_integrity_print_mask: | ||
436 | printk(KERN_ERR "btrfs: support for check_integrity*" | ||
437 | " not compiled in!\n"); | ||
438 | ret = -EINVAL; | ||
439 | goto out; | ||
440 | #endif | ||
400 | case Opt_err: | 441 | case Opt_err: |
401 | printk(KERN_INFO "btrfs: unrecognized mount option " | 442 | printk(KERN_INFO "btrfs: unrecognized mount option " |
402 | "'%s'\n", p); | 443 | "'%s'\n", p); |
@@ -500,7 +541,8 @@ out: | |||
500 | static struct dentry *get_default_root(struct super_block *sb, | 541 | static struct dentry *get_default_root(struct super_block *sb, |
501 | u64 subvol_objectid) | 542 | u64 subvol_objectid) |
502 | { | 543 | { |
503 | struct btrfs_root *root = sb->s_fs_info; | 544 | struct btrfs_fs_info *fs_info = btrfs_sb(sb); |
545 | struct btrfs_root *root = fs_info->tree_root; | ||
504 | struct btrfs_root *new_root; | 546 | struct btrfs_root *new_root; |
505 | struct btrfs_dir_item *di; | 547 | struct btrfs_dir_item *di; |
506 | struct btrfs_path *path; | 548 | struct btrfs_path *path; |
@@ -530,7 +572,7 @@ static struct dentry *get_default_root(struct super_block *sb, | |||
530 | * will mount by default if we haven't been given a specific subvolume | 572 | * will mount by default if we haven't been given a specific subvolume |
531 | * to mount. | 573 | * to mount. |
532 | */ | 574 | */ |
533 | dir_id = btrfs_super_root_dir(root->fs_info->super_copy); | 575 | dir_id = btrfs_super_root_dir(fs_info->super_copy); |
534 | di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); | 576 | di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); |
535 | if (IS_ERR(di)) { | 577 | if (IS_ERR(di)) { |
536 | btrfs_free_path(path); | 578 | btrfs_free_path(path); |
@@ -544,7 +586,7 @@ static struct dentry *get_default_root(struct super_block *sb, | |||
544 | */ | 586 | */ |
545 | btrfs_free_path(path); | 587 | btrfs_free_path(path); |
546 | dir_id = BTRFS_FIRST_FREE_OBJECTID; | 588 | dir_id = BTRFS_FIRST_FREE_OBJECTID; |
547 | new_root = root->fs_info->fs_root; | 589 | new_root = fs_info->fs_root; |
548 | goto setup_root; | 590 | goto setup_root; |
549 | } | 591 | } |
550 | 592 | ||
@@ -552,7 +594,7 @@ static struct dentry *get_default_root(struct super_block *sb, | |||
552 | btrfs_free_path(path); | 594 | btrfs_free_path(path); |
553 | 595 | ||
554 | find_root: | 596 | find_root: |
555 | new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); | 597 | new_root = btrfs_read_fs_root_no_name(fs_info, &location); |
556 | if (IS_ERR(new_root)) | 598 | if (IS_ERR(new_root)) |
557 | return ERR_CAST(new_root); | 599 | return ERR_CAST(new_root); |
558 | 600 | ||
@@ -588,7 +630,7 @@ static int btrfs_fill_super(struct super_block *sb, | |||
588 | { | 630 | { |
589 | struct inode *inode; | 631 | struct inode *inode; |
590 | struct dentry *root_dentry; | 632 | struct dentry *root_dentry; |
591 | struct btrfs_root *tree_root; | 633 | struct btrfs_fs_info *fs_info = btrfs_sb(sb); |
592 | struct btrfs_key key; | 634 | struct btrfs_key key; |
593 | int err; | 635 | int err; |
594 | 636 | ||
@@ -603,18 +645,16 @@ static int btrfs_fill_super(struct super_block *sb, | |||
603 | sb->s_flags |= MS_POSIXACL; | 645 | sb->s_flags |= MS_POSIXACL; |
604 | #endif | 646 | #endif |
605 | 647 | ||
606 | tree_root = open_ctree(sb, fs_devices, (char *)data); | 648 | err = open_ctree(sb, fs_devices, (char *)data); |
607 | 649 | if (err) { | |
608 | if (IS_ERR(tree_root)) { | ||
609 | printk("btrfs: open_ctree failed\n"); | 650 | printk("btrfs: open_ctree failed\n"); |
610 | return PTR_ERR(tree_root); | 651 | return err; |
611 | } | 652 | } |
612 | sb->s_fs_info = tree_root; | ||
613 | 653 | ||
614 | key.objectid = BTRFS_FIRST_FREE_OBJECTID; | 654 | key.objectid = BTRFS_FIRST_FREE_OBJECTID; |
615 | key.type = BTRFS_INODE_ITEM_KEY; | 655 | key.type = BTRFS_INODE_ITEM_KEY; |
616 | key.offset = 0; | 656 | key.offset = 0; |
617 | inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL); | 657 | inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL); |
618 | if (IS_ERR(inode)) { | 658 | if (IS_ERR(inode)) { |
619 | err = PTR_ERR(inode); | 659 | err = PTR_ERR(inode); |
620 | goto fail_close; | 660 | goto fail_close; |
@@ -631,23 +671,25 @@ static int btrfs_fill_super(struct super_block *sb, | |||
631 | 671 | ||
632 | save_mount_options(sb, data); | 672 | save_mount_options(sb, data); |
633 | cleancache_init_fs(sb); | 673 | cleancache_init_fs(sb); |
674 | sb->s_flags |= MS_ACTIVE; | ||
634 | return 0; | 675 | return 0; |
635 | 676 | ||
636 | fail_close: | 677 | fail_close: |
637 | close_ctree(tree_root); | 678 | close_ctree(fs_info->tree_root); |
638 | return err; | 679 | return err; |
639 | } | 680 | } |
640 | 681 | ||
641 | int btrfs_sync_fs(struct super_block *sb, int wait) | 682 | int btrfs_sync_fs(struct super_block *sb, int wait) |
642 | { | 683 | { |
643 | struct btrfs_trans_handle *trans; | 684 | struct btrfs_trans_handle *trans; |
644 | struct btrfs_root *root = btrfs_sb(sb); | 685 | struct btrfs_fs_info *fs_info = btrfs_sb(sb); |
686 | struct btrfs_root *root = fs_info->tree_root; | ||
645 | int ret; | 687 | int ret; |
646 | 688 | ||
647 | trace_btrfs_sync_fs(wait); | 689 | trace_btrfs_sync_fs(wait); |
648 | 690 | ||
649 | if (!wait) { | 691 | if (!wait) { |
650 | filemap_flush(root->fs_info->btree_inode->i_mapping); | 692 | filemap_flush(fs_info->btree_inode->i_mapping); |
651 | return 0; | 693 | return 0; |
652 | } | 694 | } |
653 | 695 | ||
@@ -663,8 +705,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) | |||
663 | 705 | ||
664 | static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) | 706 | static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) |
665 | { | 707 | { |
666 | struct btrfs_root *root = btrfs_sb(dentry->d_sb); | 708 | struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); |
667 | struct btrfs_fs_info *info = root->fs_info; | 709 | struct btrfs_root *root = info->tree_root; |
668 | char *compress_type; | 710 | char *compress_type; |
669 | 711 | ||
670 | if (btrfs_test_opt(root, DEGRADED)) | 712 | if (btrfs_test_opt(root, DEGRADED)) |
@@ -722,28 +764,25 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) | |||
722 | seq_puts(seq, ",autodefrag"); | 764 | seq_puts(seq, ",autodefrag"); |
723 | if (btrfs_test_opt(root, INODE_MAP_CACHE)) | 765 | if (btrfs_test_opt(root, INODE_MAP_CACHE)) |
724 | seq_puts(seq, ",inode_cache"); | 766 | seq_puts(seq, ",inode_cache"); |
767 | if (btrfs_test_opt(root, SKIP_BALANCE)) | ||
768 | seq_puts(seq, ",skip_balance"); | ||
725 | return 0; | 769 | return 0; |
726 | } | 770 | } |
727 | 771 | ||
728 | static int btrfs_test_super(struct super_block *s, void *data) | 772 | static int btrfs_test_super(struct super_block *s, void *data) |
729 | { | 773 | { |
730 | struct btrfs_root *test_root = data; | 774 | struct btrfs_fs_info *p = data; |
731 | struct btrfs_root *root = btrfs_sb(s); | 775 | struct btrfs_fs_info *fs_info = btrfs_sb(s); |
732 | 776 | ||
733 | /* | 777 | return fs_info->fs_devices == p->fs_devices; |
734 | * If this super block is going away, return false as it | ||
735 | * can't match as an existing super block. | ||
736 | */ | ||
737 | if (!atomic_read(&s->s_active)) | ||
738 | return 0; | ||
739 | return root->fs_info->fs_devices == test_root->fs_info->fs_devices; | ||
740 | } | 778 | } |
741 | 779 | ||
742 | static int btrfs_set_super(struct super_block *s, void *data) | 780 | static int btrfs_set_super(struct super_block *s, void *data) |
743 | { | 781 | { |
744 | s->s_fs_info = data; | 782 | int err = set_anon_super(s, data); |
745 | 783 | if (!err) | |
746 | return set_anon_super(s, data); | 784 | s->s_fs_info = data; |
785 | return err; | ||
747 | } | 786 | } |
748 | 787 | ||
749 | /* | 788 | /* |
@@ -903,12 +942,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, | |||
903 | if (!fs_info) | 942 | if (!fs_info) |
904 | return ERR_PTR(-ENOMEM); | 943 | return ERR_PTR(-ENOMEM); |
905 | 944 | ||
906 | fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); | ||
907 | if (!fs_info->tree_root) { | ||
908 | error = -ENOMEM; | ||
909 | goto error_fs_info; | ||
910 | } | ||
911 | fs_info->tree_root->fs_info = fs_info; | ||
912 | fs_info->fs_devices = fs_devices; | 945 | fs_info->fs_devices = fs_devices; |
913 | 946 | ||
914 | fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); | 947 | fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); |
@@ -928,43 +961,30 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, | |||
928 | } | 961 | } |
929 | 962 | ||
930 | bdev = fs_devices->latest_bdev; | 963 | bdev = fs_devices->latest_bdev; |
931 | s = sget(fs_type, btrfs_test_super, btrfs_set_super, | 964 | s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info); |
932 | fs_info->tree_root); | ||
933 | if (IS_ERR(s)) { | 965 | if (IS_ERR(s)) { |
934 | error = PTR_ERR(s); | 966 | error = PTR_ERR(s); |
935 | goto error_close_devices; | 967 | goto error_close_devices; |
936 | } | 968 | } |
937 | 969 | ||
938 | if (s->s_root) { | 970 | if (s->s_root) { |
939 | if ((flags ^ s->s_flags) & MS_RDONLY) { | ||
940 | deactivate_locked_super(s); | ||
941 | error = -EBUSY; | ||
942 | goto error_close_devices; | ||
943 | } | ||
944 | |||
945 | btrfs_close_devices(fs_devices); | 971 | btrfs_close_devices(fs_devices); |
946 | free_fs_info(fs_info); | 972 | free_fs_info(fs_info); |
973 | if ((flags ^ s->s_flags) & MS_RDONLY) | ||
974 | error = -EBUSY; | ||
947 | } else { | 975 | } else { |
948 | char b[BDEVNAME_SIZE]; | 976 | char b[BDEVNAME_SIZE]; |
949 | 977 | ||
950 | s->s_flags = flags | MS_NOSEC; | 978 | s->s_flags = flags | MS_NOSEC; |
951 | strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); | 979 | strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); |
952 | btrfs_sb(s)->fs_info->bdev_holder = fs_type; | 980 | btrfs_sb(s)->bdev_holder = fs_type; |
953 | error = btrfs_fill_super(s, fs_devices, data, | 981 | error = btrfs_fill_super(s, fs_devices, data, |
954 | flags & MS_SILENT ? 1 : 0); | 982 | flags & MS_SILENT ? 1 : 0); |
955 | if (error) { | ||
956 | deactivate_locked_super(s); | ||
957 | return ERR_PTR(error); | ||
958 | } | ||
959 | |||
960 | s->s_flags |= MS_ACTIVE; | ||
961 | } | 983 | } |
962 | 984 | ||
963 | root = get_default_root(s, subvol_objectid); | 985 | root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); |
964 | if (IS_ERR(root)) { | 986 | if (IS_ERR(root)) |
965 | deactivate_locked_super(s); | 987 | deactivate_locked_super(s); |
966 | return root; | ||
967 | } | ||
968 | 988 | ||
969 | return root; | 989 | return root; |
970 | 990 | ||
@@ -977,7 +997,8 @@ error_fs_info: | |||
977 | 997 | ||
978 | static int btrfs_remount(struct super_block *sb, int *flags, char *data) | 998 | static int btrfs_remount(struct super_block *sb, int *flags, char *data) |
979 | { | 999 | { |
980 | struct btrfs_root *root = btrfs_sb(sb); | 1000 | struct btrfs_fs_info *fs_info = btrfs_sb(sb); |
1001 | struct btrfs_root *root = fs_info->tree_root; | ||
981 | int ret; | 1002 | int ret; |
982 | 1003 | ||
983 | ret = btrfs_parse_options(root, data); | 1004 | ret = btrfs_parse_options(root, data); |
@@ -993,13 +1014,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) | |||
993 | ret = btrfs_commit_super(root); | 1014 | ret = btrfs_commit_super(root); |
994 | WARN_ON(ret); | 1015 | WARN_ON(ret); |
995 | } else { | 1016 | } else { |
996 | if (root->fs_info->fs_devices->rw_devices == 0) | 1017 | if (fs_info->fs_devices->rw_devices == 0) |
997 | return -EACCES; | 1018 | return -EACCES; |
998 | 1019 | ||
999 | if (btrfs_super_log_root(root->fs_info->super_copy) != 0) | 1020 | if (btrfs_super_log_root(fs_info->super_copy) != 0) |
1000 | return -EINVAL; | 1021 | return -EINVAL; |
1001 | 1022 | ||
1002 | ret = btrfs_cleanup_fs_roots(root->fs_info); | 1023 | ret = btrfs_cleanup_fs_roots(fs_info); |
1003 | WARN_ON(ret); | 1024 | WARN_ON(ret); |
1004 | 1025 | ||
1005 | /* recover relocation */ | 1026 | /* recover relocation */ |
@@ -1168,18 +1189,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) | |||
1168 | 1189 | ||
1169 | static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) | 1190 | static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) |
1170 | { | 1191 | { |
1171 | struct btrfs_root *root = btrfs_sb(dentry->d_sb); | 1192 | struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); |
1172 | struct btrfs_super_block *disk_super = root->fs_info->super_copy; | 1193 | struct btrfs_super_block *disk_super = fs_info->super_copy; |
1173 | struct list_head *head = &root->fs_info->space_info; | 1194 | struct list_head *head = &fs_info->space_info; |
1174 | struct btrfs_space_info *found; | 1195 | struct btrfs_space_info *found; |
1175 | u64 total_used = 0; | 1196 | u64 total_used = 0; |
1176 | u64 total_free_data = 0; | 1197 | u64 total_free_data = 0; |
1177 | int bits = dentry->d_sb->s_blocksize_bits; | 1198 | int bits = dentry->d_sb->s_blocksize_bits; |
1178 | __be32 *fsid = (__be32 *)root->fs_info->fsid; | 1199 | __be32 *fsid = (__be32 *)fs_info->fsid; |
1179 | int ret; | 1200 | int ret; |
1180 | 1201 | ||
1181 | /* holding chunk_mutex to avoid allocating new chunks */ | 1202 | /* holding chunk_mutex to avoid allocating new chunks */ |
1182 | mutex_lock(&root->fs_info->chunk_mutex); | 1203 | mutex_lock(&fs_info->chunk_mutex); |
1183 | rcu_read_lock(); | 1204 | rcu_read_lock(); |
1184 | list_for_each_entry_rcu(found, head, list) { | 1205 | list_for_each_entry_rcu(found, head, list) { |
1185 | if (found->flags & BTRFS_BLOCK_GROUP_DATA) { | 1206 | if (found->flags & BTRFS_BLOCK_GROUP_DATA) { |
@@ -1198,14 +1219,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
1198 | buf->f_bsize = dentry->d_sb->s_blocksize; | 1219 | buf->f_bsize = dentry->d_sb->s_blocksize; |
1199 | buf->f_type = BTRFS_SUPER_MAGIC; | 1220 | buf->f_type = BTRFS_SUPER_MAGIC; |
1200 | buf->f_bavail = total_free_data; | 1221 | buf->f_bavail = total_free_data; |
1201 | ret = btrfs_calc_avail_data_space(root, &total_free_data); | 1222 | ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); |
1202 | if (ret) { | 1223 | if (ret) { |
1203 | mutex_unlock(&root->fs_info->chunk_mutex); | 1224 | mutex_unlock(&fs_info->chunk_mutex); |
1204 | return ret; | 1225 | return ret; |
1205 | } | 1226 | } |
1206 | buf->f_bavail += total_free_data; | 1227 | buf->f_bavail += total_free_data; |
1207 | buf->f_bavail = buf->f_bavail >> bits; | 1228 | buf->f_bavail = buf->f_bavail >> bits; |
1208 | mutex_unlock(&root->fs_info->chunk_mutex); | 1229 | mutex_unlock(&fs_info->chunk_mutex); |
1209 | 1230 | ||
1210 | /* We treat it as constant endianness (it doesn't matter _which_) | 1231 | /* We treat it as constant endianness (it doesn't matter _which_) |
1211 | because we want the fsid to come out the same whether mounted | 1232 | because we want the fsid to come out the same whether mounted |
@@ -1219,11 +1240,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
1219 | return 0; | 1240 | return 0; |
1220 | } | 1241 | } |
1221 | 1242 | ||
1243 | static void btrfs_kill_super(struct super_block *sb) | ||
1244 | { | ||
1245 | struct btrfs_fs_info *fs_info = btrfs_sb(sb); | ||
1246 | kill_anon_super(sb); | ||
1247 | free_fs_info(fs_info); | ||
1248 | } | ||
1249 | |||
1222 | static struct file_system_type btrfs_fs_type = { | 1250 | static struct file_system_type btrfs_fs_type = { |
1223 | .owner = THIS_MODULE, | 1251 | .owner = THIS_MODULE, |
1224 | .name = "btrfs", | 1252 | .name = "btrfs", |
1225 | .mount = btrfs_mount, | 1253 | .mount = btrfs_mount, |
1226 | .kill_sb = kill_anon_super, | 1254 | .kill_sb = btrfs_kill_super, |
1227 | .fs_flags = FS_REQUIRES_DEV, | 1255 | .fs_flags = FS_REQUIRES_DEV, |
1228 | }; | 1256 | }; |
1229 | 1257 | ||
@@ -1257,17 +1285,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, | |||
1257 | 1285 | ||
1258 | static int btrfs_freeze(struct super_block *sb) | 1286 | static int btrfs_freeze(struct super_block *sb) |
1259 | { | 1287 | { |
1260 | struct btrfs_root *root = btrfs_sb(sb); | 1288 | struct btrfs_fs_info *fs_info = btrfs_sb(sb); |
1261 | mutex_lock(&root->fs_info->transaction_kthread_mutex); | 1289 | mutex_lock(&fs_info->transaction_kthread_mutex); |
1262 | mutex_lock(&root->fs_info->cleaner_mutex); | 1290 | mutex_lock(&fs_info->cleaner_mutex); |
1263 | return 0; | 1291 | return 0; |
1264 | } | 1292 | } |
1265 | 1293 | ||
1266 | static int btrfs_unfreeze(struct super_block *sb) | 1294 | static int btrfs_unfreeze(struct super_block *sb) |
1267 | { | 1295 | { |
1268 | struct btrfs_root *root = btrfs_sb(sb); | 1296 | struct btrfs_fs_info *fs_info = btrfs_sb(sb); |
1269 | mutex_unlock(&root->fs_info->cleaner_mutex); | 1297 | mutex_unlock(&fs_info->cleaner_mutex); |
1270 | mutex_unlock(&root->fs_info->transaction_kthread_mutex); | 1298 | mutex_unlock(&fs_info->transaction_kthread_mutex); |
1271 | return 0; | 1299 | return 0; |
1272 | } | 1300 | } |
1273 | 1301 | ||
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 81376d94cd3c..287a6728b1ad 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
@@ -36,6 +36,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) | |||
36 | WARN_ON(atomic_read(&transaction->use_count) == 0); | 36 | WARN_ON(atomic_read(&transaction->use_count) == 0); |
37 | if (atomic_dec_and_test(&transaction->use_count)) { | 37 | if (atomic_dec_and_test(&transaction->use_count)) { |
38 | BUG_ON(!list_empty(&transaction->list)); | 38 | BUG_ON(!list_empty(&transaction->list)); |
39 | WARN_ON(transaction->delayed_refs.root.rb_node); | ||
40 | WARN_ON(!list_empty(&transaction->delayed_refs.seq_head)); | ||
39 | memset(transaction, 0, sizeof(*transaction)); | 41 | memset(transaction, 0, sizeof(*transaction)); |
40 | kmem_cache_free(btrfs_transaction_cachep, transaction); | 42 | kmem_cache_free(btrfs_transaction_cachep, transaction); |
41 | } | 43 | } |
@@ -108,8 +110,11 @@ loop: | |||
108 | cur_trans->delayed_refs.num_heads = 0; | 110 | cur_trans->delayed_refs.num_heads = 0; |
109 | cur_trans->delayed_refs.flushing = 0; | 111 | cur_trans->delayed_refs.flushing = 0; |
110 | cur_trans->delayed_refs.run_delayed_start = 0; | 112 | cur_trans->delayed_refs.run_delayed_start = 0; |
113 | cur_trans->delayed_refs.seq = 1; | ||
114 | init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); | ||
111 | spin_lock_init(&cur_trans->commit_lock); | 115 | spin_lock_init(&cur_trans->commit_lock); |
112 | spin_lock_init(&cur_trans->delayed_refs.lock); | 116 | spin_lock_init(&cur_trans->delayed_refs.lock); |
117 | INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); | ||
113 | 118 | ||
114 | INIT_LIST_HEAD(&cur_trans->pending_snapshots); | 119 | INIT_LIST_HEAD(&cur_trans->pending_snapshots); |
115 | list_add_tail(&cur_trans->list, &root->fs_info->trans_list); | 120 | list_add_tail(&cur_trans->list, &root->fs_info->trans_list); |
@@ -321,6 +326,8 @@ again: | |||
321 | } | 326 | } |
322 | 327 | ||
323 | if (num_bytes) { | 328 | if (num_bytes) { |
329 | trace_btrfs_space_reservation(root->fs_info, "transaction", | ||
330 | (u64)h, num_bytes, 1); | ||
324 | h->block_rsv = &root->fs_info->trans_block_rsv; | 331 | h->block_rsv = &root->fs_info->trans_block_rsv; |
325 | h->bytes_reserved = num_bytes; | 332 | h->bytes_reserved = num_bytes; |
326 | } | 333 | } |
@@ -467,19 +474,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
467 | 474 | ||
468 | btrfs_trans_release_metadata(trans, root); | 475 | btrfs_trans_release_metadata(trans, root); |
469 | trans->block_rsv = NULL; | 476 | trans->block_rsv = NULL; |
470 | while (count < 4) { | 477 | while (count < 2) { |
471 | unsigned long cur = trans->delayed_ref_updates; | 478 | unsigned long cur = trans->delayed_ref_updates; |
472 | trans->delayed_ref_updates = 0; | 479 | trans->delayed_ref_updates = 0; |
473 | if (cur && | 480 | if (cur && |
474 | trans->transaction->delayed_refs.num_heads_ready > 64) { | 481 | trans->transaction->delayed_refs.num_heads_ready > 64) { |
475 | trans->delayed_ref_updates = 0; | 482 | trans->delayed_ref_updates = 0; |
476 | |||
477 | /* | ||
478 | * do a full flush if the transaction is trying | ||
479 | * to close | ||
480 | */ | ||
481 | if (trans->transaction->delayed_refs.flushing) | ||
482 | cur = 0; | ||
483 | btrfs_run_delayed_refs(trans, root, cur); | 483 | btrfs_run_delayed_refs(trans, root, cur); |
484 | } else { | 484 | } else { |
485 | break; | 485 | break; |
@@ -1393,9 +1393,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) | |||
1393 | 1393 | ||
1394 | if (btrfs_header_backref_rev(root->node) < | 1394 | if (btrfs_header_backref_rev(root->node) < |
1395 | BTRFS_MIXED_BACKREF_REV) | 1395 | BTRFS_MIXED_BACKREF_REV) |
1396 | btrfs_drop_snapshot(root, NULL, 0); | 1396 | btrfs_drop_snapshot(root, NULL, 0, 0); |
1397 | else | 1397 | else |
1398 | btrfs_drop_snapshot(root, NULL, 1); | 1398 | btrfs_drop_snapshot(root, NULL, 1, 0); |
1399 | } | 1399 | } |
1400 | return 0; | 1400 | return 0; |
1401 | } | 1401 | } |
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3568374d419d..cb877e0886a7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
@@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, | |||
589 | ret = btrfs_inc_extent_ref(trans, root, | 589 | ret = btrfs_inc_extent_ref(trans, root, |
590 | ins.objectid, ins.offset, | 590 | ins.objectid, ins.offset, |
591 | 0, root->root_key.objectid, | 591 | 0, root->root_key.objectid, |
592 | key->objectid, offset); | 592 | key->objectid, offset, 0); |
593 | BUG_ON(ret); | 593 | BUG_ON(ret); |
594 | } else { | 594 | } else { |
595 | /* | 595 | /* |
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c new file mode 100644 index 000000000000..12f5147bd2b1 --- /dev/null +++ b/fs/btrfs/ulist.c | |||
@@ -0,0 +1,220 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2011 STRATO AG | ||
3 | * written by Arne Jansen <sensille@gmx.net> | ||
4 | * Distributed under the GNU GPL license version 2. | ||
5 | */ | ||
6 | |||
7 | #include <linux/slab.h> | ||
8 | #include <linux/module.h> | ||
9 | #include "ulist.h" | ||
10 | |||
11 | /* | ||
12 | * ulist is a generic data structure to hold a collection of unique u64 | ||
13 | * values. The only operations it supports are adding to the list and | |
14 | * enumerating it. | ||
15 | * It is possible to store an auxiliary value along with the key. | ||
16 | * | ||
17 | * The implementation is preliminary and can probably be sped up | ||
18 | * significantly. A first step would be to store the values in an rbtree | ||
19 | * as soon as ULIST_SIZE is exceeded. | ||
20 | * | ||
21 | * A sample usage for ulists is the enumeration of directed graphs without | ||
22 | * visiting a node twice. The pseudo-code could look like this: | ||
23 | * | ||
24 | * ulist = ulist_alloc(); | ||
25 | * ulist_add(ulist, root); | ||
26 | * elem = NULL; | ||
27 | * | ||
28 | * while ((elem = ulist_next(ulist, elem))) { | |
29 | * for (all child nodes n in elem) | ||
30 | * ulist_add(ulist, n); | ||
31 | * do something useful with the node; | ||
32 | * } | ||
33 | * ulist_free(ulist); | ||
34 | * | ||
35 | * This assumes the graph nodes are addressable by u64. This stems from the | |
36 | * usage for tree enumeration in btrfs, where the logical addresses are | ||
37 | * 64 bit. | ||
38 | * | ||
39 | * It is also useful for tree enumeration which could be done elegantly | ||
40 | * recursively, but is not possible due to kernel stack limitations. The | ||
41 | * loop would be similar to the above. | ||
42 | */ | ||
43 | |||
44 | /** | ||
45 | * ulist_init - freshly initialize a ulist | ||
46 | * @ulist: the ulist to initialize | ||
47 | * | ||
48 | * Note: don't use this function to init an already used ulist, use | ||
49 | * ulist_reinit instead. | ||
50 | */ | ||
51 | void ulist_init(struct ulist *ulist) | ||
52 | { | ||
53 | ulist->nnodes = 0; | ||
54 | ulist->nodes = ulist->int_nodes; | ||
55 | ulist->nodes_alloced = ULIST_SIZE; | ||
56 | } | ||
57 | EXPORT_SYMBOL(ulist_init); | ||
58 | |||
59 | /** | ||
60 | * ulist_fini - free up additionally allocated memory for the ulist | ||
61 | * @ulist: the ulist from which to free the additional memory | ||
62 | * | ||
63 | * This is useful in cases where the base 'struct ulist' has been statically | ||
64 | * allocated. | ||
65 | */ | ||
66 | void ulist_fini(struct ulist *ulist) | ||
67 | { | ||
68 | /* | ||
69 | * The first ULIST_SIZE elements are stored inline in struct ulist. | ||
70 | * Only if more elements are allocated do they need to be freed. | |
71 | */ | ||
72 | if (ulist->nodes_alloced > ULIST_SIZE) | ||
73 | kfree(ulist->nodes); | ||
74 | ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */ | ||
75 | } | ||
76 | EXPORT_SYMBOL(ulist_fini); | ||
77 | |||
78 | /** | ||
79 | * ulist_reinit - prepare a ulist for reuse | ||
80 | * @ulist: ulist to be reused | ||
81 | * | ||
82 | * Free up all additional memory allocated for the list elements and reinit | ||
83 | * the ulist. | ||
84 | */ | ||
85 | void ulist_reinit(struct ulist *ulist) | ||
86 | { | ||
87 | ulist_fini(ulist); | ||
88 | ulist_init(ulist); | ||
89 | } | ||
90 | EXPORT_SYMBOL(ulist_reinit); | ||
91 | |||
92 | /** | ||
93 | * ulist_alloc - dynamically allocate a ulist | ||
94 | * @gfp_mask: allocation flags to use for the base allocation | |
95 | * | ||
96 | * The allocated ulist will be returned in an initialized state. | ||
97 | */ | ||
98 | struct ulist *ulist_alloc(unsigned long gfp_mask) | ||
99 | { | ||
100 | struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); | ||
101 | |||
102 | if (!ulist) | ||
103 | return NULL; | ||
104 | |||
105 | ulist_init(ulist); | ||
106 | |||
107 | return ulist; | ||
108 | } | ||
109 | EXPORT_SYMBOL(ulist_alloc); | ||
110 | |||
111 | /** | ||
112 | * ulist_free - free dynamically allocated ulist | ||
113 | * @ulist: ulist to free | ||
114 | * | ||
115 | * It is not necessary to call ulist_fini before. | ||
116 | */ | ||
117 | void ulist_free(struct ulist *ulist) | ||
118 | { | ||
119 | if (!ulist) | ||
120 | return; | ||
121 | ulist_fini(ulist); | ||
122 | kfree(ulist); | ||
123 | } | ||
124 | EXPORT_SYMBOL(ulist_free); | ||
125 | |||
126 | /** | ||
127 | * ulist_add - add an element to the ulist | ||
128 | * @ulist: ulist to add the element to | ||
129 | * @val: value to add to ulist | ||
130 | * @aux: auxiliary value to store along with val | ||
131 | * @gfp_mask: flags to use for allocation | ||
132 | * | ||
133 | * Note: locking must be provided by the caller. In case of rwlocks write | ||
134 | * locking is needed | ||
135 | * | ||
136 | * Add an element to a ulist. The @val will only be added if it doesn't | ||
137 | * already exist. If it is added, the auxiliary value @aux is stored along with | ||
138 | * it. In case @val already exists in the ulist, @aux is ignored, even if | ||
139 | * it differs from the already stored value. | ||
140 | * | ||
141 | * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been | ||
142 | * inserted. | ||
143 | * In case of allocation failure -ENOMEM is returned and the ulist stays | ||
144 | * unaltered. | ||
145 | */ | ||
146 | int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, | ||
147 | unsigned long gfp_mask) | ||
148 | { | ||
149 | int i; | ||
150 | |||
151 | for (i = 0; i < ulist->nnodes; ++i) { | ||
152 | if (ulist->nodes[i].val == val) | ||
153 | return 0; | ||
154 | } | ||
155 | |||
156 | if (ulist->nnodes >= ulist->nodes_alloced) { | ||
157 | u64 new_alloced = ulist->nodes_alloced + 128; | ||
158 | struct ulist_node *new_nodes; | ||
159 | void *old = NULL; | ||
160 | |||
161 | /* | ||
162 | * if nodes_alloced == ULIST_SIZE no memory has been allocated | ||
163 | * yet, so pass NULL to krealloc | ||
164 | */ | ||
165 | if (ulist->nodes_alloced > ULIST_SIZE) | ||
166 | old = ulist->nodes; | ||
167 | |||
168 | new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced, | ||
169 | gfp_mask); | ||
170 | if (!new_nodes) | ||
171 | return -ENOMEM; | ||
172 | |||
173 | if (!old) | ||
174 | memcpy(new_nodes, ulist->int_nodes, | ||
175 | sizeof(ulist->int_nodes)); | ||
176 | |||
177 | ulist->nodes = new_nodes; | ||
178 | ulist->nodes_alloced = new_alloced; | ||
179 | } | ||
180 | ulist->nodes[ulist->nnodes].val = val; | ||
181 | ulist->nodes[ulist->nnodes].aux = aux; | ||
182 | ++ulist->nnodes; | ||
183 | |||
184 | return 1; | ||
185 | } | ||
186 | EXPORT_SYMBOL(ulist_add); | ||
187 | |||
188 | /** | ||
189 | * ulist_next - iterate ulist | ||
190 | * @ulist: ulist to iterate | ||
191 | * @prev: previously returned element or %NULL to start iteration | ||
192 | * | ||
193 | * Note: locking must be provided by the caller. In case of rwlocks only read | ||
194 | * locking is needed | ||
195 | * | ||
196 | * This function is used to iterate an ulist. The iteration is started with | ||
197 | * @prev = %NULL. It returns the next element from the ulist or %NULL when the | ||
198 | * end is reached. No guarantee is made with respect to the order in which | ||
199 | * the elements are returned. They might neither be returned in order of | ||
200 | * addition nor in ascending order. | ||
201 | * It is allowed to call ulist_add during an enumeration. Newly added items | ||
202 | * are guaranteed to show up in the running enumeration. | ||
203 | */ | ||
204 | struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev) | ||
205 | { | ||
206 | int next; | ||
207 | |||
208 | if (ulist->nnodes == 0) | ||
209 | return NULL; | ||
210 | |||
211 | if (!prev) | ||
212 | return &ulist->nodes[0]; | ||
213 | |||
214 | next = (prev - ulist->nodes) + 1; | ||
215 | if (next < 0 || next >= ulist->nnodes) | ||
216 | return NULL; | ||
217 | |||
218 | return &ulist->nodes[next]; | ||
219 | } | ||
220 | EXPORT_SYMBOL(ulist_next); | ||
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h new file mode 100644 index 000000000000..2e25dec58ec0 --- /dev/null +++ b/fs/btrfs/ulist.h | |||
@@ -0,0 +1,68 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2011 STRATO AG | ||
3 | * written by Arne Jansen <sensille@gmx.net> | ||
4 | * Distributed under the GNU GPL license version 2. | ||
5 | * | ||
6 | */ | ||
7 | |||
8 | #ifndef __ULIST__ | ||
9 | #define __ULIST__ | ||
10 | |||
11 | /* | ||
12 | * ulist is a generic data structure to hold a collection of unique u64 | ||
13 | * values. The only operations it supports are adding to the list and | ||
14 | * enumerating it. | ||
15 | * It is possible to store an auxiliary value along with the key. | ||
16 | * | ||
17 | * The implementation is preliminary and can probably be sped up | ||
18 | * significantly. A first step would be to store the values in an rbtree | ||
19 | * as soon as ULIST_SIZE is exceeded. | ||
20 | */ | ||
21 | |||
22 | /* | ||
23 | * number of elements statically allocated inside struct ulist | ||
24 | */ | ||
25 | #define ULIST_SIZE 16 | ||
26 | |||
27 | /* | ||
28 | * element of the list | ||
29 | */ | ||
30 | struct ulist_node { | ||
31 | u64 val; /* value to store */ | ||
32 | unsigned long aux; /* auxiliary value saved along with the val */ | ||
33 | }; | ||
34 | |||
35 | struct ulist { | ||
36 | /* | ||
37 | * number of elements stored in list | ||
38 | */ | ||
39 | unsigned long nnodes; | ||
40 | |||
41 | /* | ||
42 | * number of nodes we already have room for | ||
43 | */ | ||
44 | unsigned long nodes_alloced; | ||
45 | |||
46 | /* | ||
47 | * pointer to the array storing the elements. The first ULIST_SIZE | ||
48 | * elements are stored inline. In this case it points to int_nodes. | ||
49 | * After exceeding ULIST_SIZE, dynamic memory is allocated. | ||
50 | */ | ||
51 | struct ulist_node *nodes; | ||
52 | |||
53 | /* | ||
54 | * inline storage space for the first ULIST_SIZE entries | ||
55 | */ | ||
56 | struct ulist_node int_nodes[ULIST_SIZE]; | ||
57 | }; | ||
58 | |||
59 | void ulist_init(struct ulist *ulist); | ||
60 | void ulist_fini(struct ulist *ulist); | ||
61 | void ulist_reinit(struct ulist *ulist); | ||
62 | struct ulist *ulist_alloc(unsigned long gfp_mask); | ||
63 | void ulist_free(struct ulist *ulist); | ||
64 | int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, | ||
65 | unsigned long gfp_mask); | ||
66 | struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); | ||
67 | |||
68 | #endif | ||
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9d..0b4e2af7954d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/random.h> | 23 | #include <linux/random.h> |
24 | #include <linux/iocontext.h> | 24 | #include <linux/iocontext.h> |
25 | #include <linux/capability.h> | 25 | #include <linux/capability.h> |
26 | #include <linux/kthread.h> | ||
26 | #include <asm/div64.h> | 27 | #include <asm/div64.h> |
27 | #include "compat.h" | 28 | #include "compat.h" |
28 | #include "ctree.h" | 29 | #include "ctree.h" |
@@ -32,6 +33,7 @@ | |||
32 | #include "print-tree.h" | 33 | #include "print-tree.h" |
33 | #include "volumes.h" | 34 | #include "volumes.h" |
34 | #include "async-thread.h" | 35 | #include "async-thread.h" |
36 | #include "check-integrity.h" | ||
35 | 37 | ||
36 | static int init_first_rw_device(struct btrfs_trans_handle *trans, | 38 | static int init_first_rw_device(struct btrfs_trans_handle *trans, |
37 | struct btrfs_root *root, | 39 | struct btrfs_root *root, |
@@ -246,7 +248,7 @@ loop_lock: | |||
246 | sync_pending = 0; | 248 | sync_pending = 0; |
247 | } | 249 | } |
248 | 250 | ||
249 | submit_bio(cur->bi_rw, cur); | 251 | btrfsic_submit_bio(cur->bi_rw, cur); |
250 | num_run++; | 252 | num_run++; |
251 | batch_run++; | 253 | batch_run++; |
252 | if (need_resched()) | 254 | if (need_resched()) |
@@ -706,8 +708,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | |||
706 | u64 devid; | 708 | u64 devid; |
707 | u64 transid; | 709 | u64 transid; |
708 | 710 | ||
709 | mutex_lock(&uuid_mutex); | ||
710 | |||
711 | flags |= FMODE_EXCL; | 711 | flags |= FMODE_EXCL; |
712 | bdev = blkdev_get_by_path(path, flags, holder); | 712 | bdev = blkdev_get_by_path(path, flags, holder); |
713 | 713 | ||
@@ -716,6 +716,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | |||
716 | goto error; | 716 | goto error; |
717 | } | 717 | } |
718 | 718 | ||
719 | mutex_lock(&uuid_mutex); | ||
719 | ret = set_blocksize(bdev, 4096); | 720 | ret = set_blocksize(bdev, 4096); |
720 | if (ret) | 721 | if (ret) |
721 | goto error_close; | 722 | goto error_close; |
@@ -737,9 +738,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | |||
737 | 738 | ||
738 | brelse(bh); | 739 | brelse(bh); |
739 | error_close: | 740 | error_close: |
741 | mutex_unlock(&uuid_mutex); | ||
740 | blkdev_put(bdev, flags); | 742 | blkdev_put(bdev, flags); |
741 | error: | 743 | error: |
742 | mutex_unlock(&uuid_mutex); | ||
743 | return ret; | 744 | return ret; |
744 | } | 745 | } |
745 | 746 | ||
@@ -829,7 +830,6 @@ out: | |||
829 | 830 | ||
830 | /* | 831 | /* |
831 | * find_free_dev_extent - find free space in the specified device | 832 | * find_free_dev_extent - find free space in the specified device |
832 | * @trans: transaction handler | ||
833 | * @device: the device which we search the free space in | 833 | * @device: the device which we search the free space in |
834 | * @num_bytes: the size of the free space that we need | 834 | * @num_bytes: the size of the free space that we need |
835 | * @start: store the start of the free space. | 835 | * @start: store the start of the free space. |
@@ -848,8 +848,7 @@ out: | |||
848 | * But if we don't find suitable free space, it is used to store the size of | 848 | * But if we don't find suitable free space, it is used to store the size of |
849 | * the max free space. | 849 | * the max free space. |
850 | */ | 850 | */ |
851 | int find_free_dev_extent(struct btrfs_trans_handle *trans, | 851 | int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, |
852 | struct btrfs_device *device, u64 num_bytes, | ||
853 | u64 *start, u64 *len) | 852 | u64 *start, u64 *len) |
854 | { | 853 | { |
855 | struct btrfs_key key; | 854 | struct btrfs_key key; |
@@ -893,7 +892,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, | |||
893 | key.offset = search_start; | 892 | key.offset = search_start; |
894 | key.type = BTRFS_DEV_EXTENT_KEY; | 893 | key.type = BTRFS_DEV_EXTENT_KEY; |
895 | 894 | ||
896 | ret = btrfs_search_slot(trans, root, &key, path, 0, 0); | 895 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); |
897 | if (ret < 0) | 896 | if (ret < 0) |
898 | goto out; | 897 | goto out; |
899 | if (ret > 0) { | 898 | if (ret > 0) { |
@@ -1282,7 +1281,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1282 | bool clear_super = false; | 1281 | bool clear_super = false; |
1283 | 1282 | ||
1284 | mutex_lock(&uuid_mutex); | 1283 | mutex_lock(&uuid_mutex); |
1285 | mutex_lock(&root->fs_info->volume_mutex); | ||
1286 | 1284 | ||
1287 | all_avail = root->fs_info->avail_data_alloc_bits | | 1285 | all_avail = root->fs_info->avail_data_alloc_bits | |
1288 | root->fs_info->avail_system_alloc_bits | | 1286 | root->fs_info->avail_system_alloc_bits | |
@@ -1452,7 +1450,6 @@ error_close: | |||
1452 | if (bdev) | 1450 | if (bdev) |
1453 | blkdev_put(bdev, FMODE_READ | FMODE_EXCL); | 1451 | blkdev_put(bdev, FMODE_READ | FMODE_EXCL); |
1454 | out: | 1452 | out: |
1455 | mutex_unlock(&root->fs_info->volume_mutex); | ||
1456 | mutex_unlock(&uuid_mutex); | 1453 | mutex_unlock(&uuid_mutex); |
1457 | return ret; | 1454 | return ret; |
1458 | error_undo: | 1455 | error_undo: |
@@ -1469,8 +1466,7 @@ error_undo: | |||
1469 | /* | 1466 | /* |
1470 | * does all the dirty work required for changing file system's UUID. | 1467 | * does all the dirty work required for changing file system's UUID. |
1471 | */ | 1468 | */ |
1472 | static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, | 1469 | static int btrfs_prepare_sprout(struct btrfs_root *root) |
1473 | struct btrfs_root *root) | ||
1474 | { | 1470 | { |
1475 | struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; | 1471 | struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; |
1476 | struct btrfs_fs_devices *old_devices; | 1472 | struct btrfs_fs_devices *old_devices; |
@@ -1629,7 +1625,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1629 | } | 1625 | } |
1630 | 1626 | ||
1631 | filemap_write_and_wait(bdev->bd_inode->i_mapping); | 1627 | filemap_write_and_wait(bdev->bd_inode->i_mapping); |
1632 | mutex_lock(&root->fs_info->volume_mutex); | ||
1633 | 1628 | ||
1634 | devices = &root->fs_info->fs_devices->devices; | 1629 | devices = &root->fs_info->fs_devices->devices; |
1635 | /* | 1630 | /* |
@@ -1695,7 +1690,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1695 | 1690 | ||
1696 | if (seeding_dev) { | 1691 | if (seeding_dev) { |
1697 | sb->s_flags &= ~MS_RDONLY; | 1692 | sb->s_flags &= ~MS_RDONLY; |
1698 | ret = btrfs_prepare_sprout(trans, root); | 1693 | ret = btrfs_prepare_sprout(root); |
1699 | BUG_ON(ret); | 1694 | BUG_ON(ret); |
1700 | } | 1695 | } |
1701 | 1696 | ||
@@ -1757,8 +1752,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1757 | ret = btrfs_relocate_sys_chunks(root); | 1752 | ret = btrfs_relocate_sys_chunks(root); |
1758 | BUG_ON(ret); | 1753 | BUG_ON(ret); |
1759 | } | 1754 | } |
1760 | out: | 1755 | |
1761 | mutex_unlock(&root->fs_info->volume_mutex); | ||
1762 | return ret; | 1756 | return ret; |
1763 | error: | 1757 | error: |
1764 | blkdev_put(bdev, FMODE_EXCL); | 1758 | blkdev_put(bdev, FMODE_EXCL); |
@@ -1766,7 +1760,7 @@ error: | |||
1766 | mutex_unlock(&uuid_mutex); | 1760 | mutex_unlock(&uuid_mutex); |
1767 | up_write(&sb->s_umount); | 1761 | up_write(&sb->s_umount); |
1768 | } | 1762 | } |
1769 | goto out; | 1763 | return ret; |
1770 | } | 1764 | } |
1771 | 1765 | ||
1772 | static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, | 1766 | static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, |
@@ -2077,6 +2071,362 @@ error: | |||
2077 | return ret; | 2071 | return ret; |
2078 | } | 2072 | } |
2079 | 2073 | ||
2074 | static int insert_balance_item(struct btrfs_root *root, | ||
2075 | struct btrfs_balance_control *bctl) | ||
2076 | { | ||
2077 | struct btrfs_trans_handle *trans; | ||
2078 | struct btrfs_balance_item *item; | ||
2079 | struct btrfs_disk_balance_args disk_bargs; | ||
2080 | struct btrfs_path *path; | ||
2081 | struct extent_buffer *leaf; | ||
2082 | struct btrfs_key key; | ||
2083 | int ret, err; | ||
2084 | |||
2085 | path = btrfs_alloc_path(); | ||
2086 | if (!path) | ||
2087 | return -ENOMEM; | ||
2088 | |||
2089 | trans = btrfs_start_transaction(root, 0); | ||
2090 | if (IS_ERR(trans)) { | ||
2091 | btrfs_free_path(path); | ||
2092 | return PTR_ERR(trans); | ||
2093 | } | ||
2094 | |||
2095 | key.objectid = BTRFS_BALANCE_OBJECTID; | ||
2096 | key.type = BTRFS_BALANCE_ITEM_KEY; | ||
2097 | key.offset = 0; | ||
2098 | |||
2099 | ret = btrfs_insert_empty_item(trans, root, path, &key, | ||
2100 | sizeof(*item)); | ||
2101 | if (ret) | ||
2102 | goto out; | ||
2103 | |||
2104 | leaf = path->nodes[0]; | ||
2105 | item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); | ||
2106 | |||
2107 | memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); | ||
2108 | |||
2109 | btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); | ||
2110 | btrfs_set_balance_data(leaf, item, &disk_bargs); | ||
2111 | btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); | ||
2112 | btrfs_set_balance_meta(leaf, item, &disk_bargs); | ||
2113 | btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); | ||
2114 | btrfs_set_balance_sys(leaf, item, &disk_bargs); | ||
2115 | |||
2116 | btrfs_set_balance_flags(leaf, item, bctl->flags); | ||
2117 | |||
2118 | btrfs_mark_buffer_dirty(leaf); | ||
2119 | out: | ||
2120 | btrfs_free_path(path); | ||
2121 | err = btrfs_commit_transaction(trans, root); | ||
2122 | if (err && !ret) | ||
2123 | ret = err; | ||
2124 | return ret; | ||
2125 | } | ||
2126 | |||
2127 | static int del_balance_item(struct btrfs_root *root) | ||
2128 | { | ||
2129 | struct btrfs_trans_handle *trans; | ||
2130 | struct btrfs_path *path; | ||
2131 | struct btrfs_key key; | ||
2132 | int ret, err; | ||
2133 | |||
2134 | path = btrfs_alloc_path(); | ||
2135 | if (!path) | ||
2136 | return -ENOMEM; | ||
2137 | |||
2138 | trans = btrfs_start_transaction(root, 0); | ||
2139 | if (IS_ERR(trans)) { | ||
2140 | btrfs_free_path(path); | ||
2141 | return PTR_ERR(trans); | ||
2142 | } | ||
2143 | |||
2144 | key.objectid = BTRFS_BALANCE_OBJECTID; | ||
2145 | key.type = BTRFS_BALANCE_ITEM_KEY; | ||
2146 | key.offset = 0; | ||
2147 | |||
2148 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
2149 | if (ret < 0) | ||
2150 | goto out; | ||
2151 | if (ret > 0) { | ||
2152 | ret = -ENOENT; | ||
2153 | goto out; | ||
2154 | } | ||
2155 | |||
2156 | ret = btrfs_del_item(trans, root, path); | ||
2157 | out: | ||
2158 | btrfs_free_path(path); | ||
2159 | err = btrfs_commit_transaction(trans, root); | ||
2160 | if (err && !ret) | ||
2161 | ret = err; | ||
2162 | return ret; | ||
2163 | } | ||
2164 | |||
2165 | /* | ||
2166 | * This is a heuristic used to reduce the number of chunks balanced on | ||
2167 | * resume after balance was interrupted. | ||
2168 | */ | ||
2169 | static void update_balance_args(struct btrfs_balance_control *bctl) | ||
2170 | { | ||
2171 | /* | ||
2172 | * Turn on soft mode for chunk types that were being converted. | ||
2173 | */ | ||
2174 | if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) | ||
2175 | bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; | ||
2176 | if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) | ||
2177 | bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; | ||
2178 | if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) | ||
2179 | bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; | ||
2180 | |||
2181 | /* | ||
2182 | * Turn on usage filter if it is not already used. The idea is | ||
2183 | * that chunks that we have already balanced should be | ||
2184 | * reasonably full. Don't do it for chunks that are being | ||
2185 | * converted - that will keep us from relocating unconverted | ||
2186 | * (albeit full) chunks. | ||
2187 | */ | ||
2188 | if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && | ||
2189 | !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { | ||
2190 | bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; | ||
2191 | bctl->data.usage = 90; | ||
2192 | } | ||
2193 | if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && | ||
2194 | !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { | ||
2195 | bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; | ||
2196 | bctl->sys.usage = 90; | ||
2197 | } | ||
2198 | if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && | ||
2199 | !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { | ||
2200 | bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; | ||
2201 | bctl->meta.usage = 90; | ||
2202 | } | ||
2203 | } | ||
2204 | |||
2205 | /* | ||
2206 | * Should be called with both balance and volume mutexes held to | ||
2207 | * serialize other volume operations (add_dev/rm_dev/resize) with | ||
2208 | * restriper. Same goes for unset_balance_control. | ||
2209 | */ | ||
2210 | static void set_balance_control(struct btrfs_balance_control *bctl) | ||
2211 | { | ||
2212 | struct btrfs_fs_info *fs_info = bctl->fs_info; | ||
2213 | |||
2214 | BUG_ON(fs_info->balance_ctl); | ||
2215 | |||
2216 | spin_lock(&fs_info->balance_lock); | ||
2217 | fs_info->balance_ctl = bctl; | ||
2218 | spin_unlock(&fs_info->balance_lock); | ||
2219 | } | ||
2220 | |||
2221 | static void unset_balance_control(struct btrfs_fs_info *fs_info) | ||
2222 | { | ||
2223 | struct btrfs_balance_control *bctl = fs_info->balance_ctl; | ||
2224 | |||
2225 | BUG_ON(!fs_info->balance_ctl); | ||
2226 | |||
2227 | spin_lock(&fs_info->balance_lock); | ||
2228 | fs_info->balance_ctl = NULL; | ||
2229 | spin_unlock(&fs_info->balance_lock); | ||
2230 | |||
2231 | kfree(bctl); | ||
2232 | } | ||
2233 | |||
2234 | /* | ||
2235 | * Balance filters. Return 1 if chunk should be filtered out | ||
2236 | * (should not be balanced). | ||
2237 | */ | ||
2238 | static int chunk_profiles_filter(u64 chunk_profile, | ||
2239 | struct btrfs_balance_args *bargs) | ||
2240 | { | ||
2241 | chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; | ||
2242 | |||
2243 | if (chunk_profile == 0) | ||
2244 | chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; | ||
2245 | |||
2246 | if (bargs->profiles & chunk_profile) | ||
2247 | return 0; | ||
2248 | |||
2249 | return 1; | ||
2250 | } | ||
2251 | |||
2252 | static u64 div_factor_fine(u64 num, int factor) | ||
2253 | { | ||
2254 | if (factor <= 0) | ||
2255 | return 0; | ||
2256 | if (factor >= 100) | ||
2257 | return num; | ||
2258 | |||
2259 | num *= factor; | ||
2260 | do_div(num, 100); | ||
2261 | return num; | ||
2262 | } | ||
2263 | |||
2264 | static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, | ||
2265 | struct btrfs_balance_args *bargs) | ||
2266 | { | ||
2267 | struct btrfs_block_group_cache *cache; | ||
2268 | u64 chunk_used, user_thresh; | ||
2269 | int ret = 1; | ||
2270 | |||
2271 | cache = btrfs_lookup_block_group(fs_info, chunk_offset); | ||
2272 | chunk_used = btrfs_block_group_used(&cache->item); | ||
2273 | |||
2274 | user_thresh = div_factor_fine(cache->key.offset, bargs->usage); | ||
2275 | if (chunk_used < user_thresh) | ||
2276 | ret = 0; | ||
2277 | |||
2278 | btrfs_put_block_group(cache); | ||
2279 | return ret; | ||
2280 | } | ||
2281 | |||
2282 | static int chunk_devid_filter(struct extent_buffer *leaf, | ||
2283 | struct btrfs_chunk *chunk, | ||
2284 | struct btrfs_balance_args *bargs) | ||
2285 | { | ||
2286 | struct btrfs_stripe *stripe; | ||
2287 | int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); | ||
2288 | int i; | ||
2289 | |||
2290 | for (i = 0; i < num_stripes; i++) { | ||
2291 | stripe = btrfs_stripe_nr(chunk, i); | ||
2292 | if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) | ||
2293 | return 0; | ||
2294 | } | ||
2295 | |||
2296 | return 1; | ||
2297 | } | ||
2298 | |||
2299 | /* [pstart, pend) */ | ||
2300 | static int chunk_drange_filter(struct extent_buffer *leaf, | ||
2301 | struct btrfs_chunk *chunk, | ||
2302 | u64 chunk_offset, | ||
2303 | struct btrfs_balance_args *bargs) | ||
2304 | { | ||
2305 | struct btrfs_stripe *stripe; | ||
2306 | int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); | ||
2307 | u64 stripe_offset; | ||
2308 | u64 stripe_length; | ||
2309 | int factor; | ||
2310 | int i; | ||
2311 | |||
2312 | if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) | ||
2313 | return 0; | ||
2314 | |||
2315 | if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | | ||
2316 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) | ||
2317 | factor = 2; | ||
2318 | else | ||
2319 | factor = 1; | ||
2320 | factor = num_stripes / factor; | ||
2321 | |||
2322 | for (i = 0; i < num_stripes; i++) { | ||
2323 | stripe = btrfs_stripe_nr(chunk, i); | ||
2324 | if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) | ||
2325 | continue; | ||
2326 | |||
2327 | stripe_offset = btrfs_stripe_offset(leaf, stripe); | ||
2328 | stripe_length = btrfs_chunk_length(leaf, chunk); | ||
2329 | do_div(stripe_length, factor); | ||
2330 | |||
2331 | if (stripe_offset < bargs->pend && | ||
2332 | stripe_offset + stripe_length > bargs->pstart) | ||
2333 | return 0; | ||
2334 | } | ||
2335 | |||
2336 | return 1; | ||
2337 | } | ||
2338 | |||
2339 | /* [vstart, vend) */ | ||
2340 | static int chunk_vrange_filter(struct extent_buffer *leaf, | ||
2341 | struct btrfs_chunk *chunk, | ||
2342 | u64 chunk_offset, | ||
2343 | struct btrfs_balance_args *bargs) | ||
2344 | { | ||
2345 | if (chunk_offset < bargs->vend && | ||
2346 | chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) | ||
2347 | /* at least part of the chunk is inside this vrange */ | ||
2348 | return 0; | ||
2349 | |||
2350 | return 1; | ||
2351 | } | ||
2352 | |||
2353 | static int chunk_soft_convert_filter(u64 chunk_profile, | ||
2354 | struct btrfs_balance_args *bargs) | ||
2355 | { | ||
2356 | if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) | ||
2357 | return 0; | ||
2358 | |||
2359 | chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; | ||
2360 | |||
2361 | if (chunk_profile == 0) | ||
2362 | chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; | ||
2363 | |||
2364 | if (bargs->target & chunk_profile) | ||
2365 | return 1; | ||
2366 | |||
2367 | return 0; | ||
2368 | } | ||
2369 | |||
2370 | static int should_balance_chunk(struct btrfs_root *root, | ||
2371 | struct extent_buffer *leaf, | ||
2372 | struct btrfs_chunk *chunk, u64 chunk_offset) | ||
2373 | { | ||
2374 | struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; | ||
2375 | struct btrfs_balance_args *bargs = NULL; | ||
2376 | u64 chunk_type = btrfs_chunk_type(leaf, chunk); | ||
2377 | |||
2378 | /* type filter */ | ||
2379 | if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & | ||
2380 | (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { | ||
2381 | return 0; | ||
2382 | } | ||
2383 | |||
2384 | if (chunk_type & BTRFS_BLOCK_GROUP_DATA) | ||
2385 | bargs = &bctl->data; | ||
2386 | else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) | ||
2387 | bargs = &bctl->sys; | ||
2388 | else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) | ||
2389 | bargs = &bctl->meta; | ||
2390 | |||
2391 | /* profiles filter */ | ||
2392 | if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && | ||
2393 | chunk_profiles_filter(chunk_type, bargs)) { | ||
2394 | return 0; | ||
2395 | } | ||
2396 | |||
2397 | /* usage filter */ | ||
2398 | if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && | ||
2399 | chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { | ||
2400 | return 0; | ||
2401 | } | ||
2402 | |||
2403 | /* devid filter */ | ||
2404 | if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && | ||
2405 | chunk_devid_filter(leaf, chunk, bargs)) { | ||
2406 | return 0; | ||
2407 | } | ||
2408 | |||
2409 | /* drange filter, makes sense only with devid filter */ | ||
2410 | if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && | ||
2411 | chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { | ||
2412 | return 0; | ||
2413 | } | ||
2414 | |||
2415 | /* vrange filter */ | ||
2416 | if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && | ||
2417 | chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { | ||
2418 | return 0; | ||
2419 | } | ||
2420 | |||
2421 | /* soft profile changing mode */ | ||
2422 | if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && | ||
2423 | chunk_soft_convert_filter(chunk_type, bargs)) { | ||
2424 | return 0; | ||
2425 | } | ||
2426 | |||
2427 | return 1; | ||
2428 | } | ||
2429 | |||
2080 | static u64 div_factor(u64 num, int factor) | 2430 | static u64 div_factor(u64 num, int factor) |
2081 | { | 2431 | { |
2082 | if (factor == 10) | 2432 | if (factor == 10) |
@@ -2086,29 +2436,28 @@ static u64 div_factor(u64 num, int factor) | |||
2086 | return num; | 2436 | return num; |
2087 | } | 2437 | } |
2088 | 2438 | ||
2089 | int btrfs_balance(struct btrfs_root *dev_root) | 2439 | static int __btrfs_balance(struct btrfs_fs_info *fs_info) |
2090 | { | 2440 | { |
2091 | int ret; | 2441 | struct btrfs_balance_control *bctl = fs_info->balance_ctl; |
2092 | struct list_head *devices = &dev_root->fs_info->fs_devices->devices; | 2442 | struct btrfs_root *chunk_root = fs_info->chunk_root; |
2443 | struct btrfs_root *dev_root = fs_info->dev_root; | ||
2444 | struct list_head *devices; | ||
2093 | struct btrfs_device *device; | 2445 | struct btrfs_device *device; |
2094 | u64 old_size; | 2446 | u64 old_size; |
2095 | u64 size_to_free; | 2447 | u64 size_to_free; |
2448 | struct btrfs_chunk *chunk; | ||
2096 | struct btrfs_path *path; | 2449 | struct btrfs_path *path; |
2097 | struct btrfs_key key; | 2450 | struct btrfs_key key; |
2098 | struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; | ||
2099 | struct btrfs_trans_handle *trans; | ||
2100 | struct btrfs_key found_key; | 2451 | struct btrfs_key found_key; |
2101 | 2452 | struct btrfs_trans_handle *trans; | |
2102 | if (dev_root->fs_info->sb->s_flags & MS_RDONLY) | 2453 | struct extent_buffer *leaf; |
2103 | return -EROFS; | 2454 | int slot; |
2104 | 2455 | int ret; | |
2105 | if (!capable(CAP_SYS_ADMIN)) | 2456 | int enospc_errors = 0; |
2106 | return -EPERM; | 2457 | bool counting = true; |
2107 | |||
2108 | mutex_lock(&dev_root->fs_info->volume_mutex); | ||
2109 | dev_root = dev_root->fs_info->dev_root; | ||
2110 | 2458 | ||
2111 | /* step one make some room on all the devices */ | 2459 | /* step one make some room on all the devices */ |
2460 | devices = &fs_info->fs_devices->devices; | ||
2112 | list_for_each_entry(device, devices, dev_list) { | 2461 | list_for_each_entry(device, devices, dev_list) { |
2113 | old_size = device->total_bytes; | 2462 | old_size = device->total_bytes; |
2114 | size_to_free = div_factor(old_size, 1); | 2463 | size_to_free = div_factor(old_size, 1); |
@@ -2137,11 +2486,23 @@ int btrfs_balance(struct btrfs_root *dev_root) | |||
2137 | ret = -ENOMEM; | 2486 | ret = -ENOMEM; |
2138 | goto error; | 2487 | goto error; |
2139 | } | 2488 | } |
2489 | |||
2490 | /* zero out stat counters */ | ||
2491 | spin_lock(&fs_info->balance_lock); | ||
2492 | memset(&bctl->stat, 0, sizeof(bctl->stat)); | ||
2493 | spin_unlock(&fs_info->balance_lock); | ||
2494 | again: | ||
2140 | key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; | 2495 | key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; |
2141 | key.offset = (u64)-1; | 2496 | key.offset = (u64)-1; |
2142 | key.type = BTRFS_CHUNK_ITEM_KEY; | 2497 | key.type = BTRFS_CHUNK_ITEM_KEY; |
2143 | 2498 | ||
2144 | while (1) { | 2499 | while (1) { |
2500 | if ((!counting && atomic_read(&fs_info->balance_pause_req)) || | ||
2501 | atomic_read(&fs_info->balance_cancel_req)) { | ||
2502 | ret = -ECANCELED; | ||
2503 | goto error; | ||
2504 | } | ||
2505 | |||
2145 | ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); | 2506 | ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); |
2146 | if (ret < 0) | 2507 | if (ret < 0) |
2147 | goto error; | 2508 | goto error; |
@@ -2151,15 +2512,19 @@ int btrfs_balance(struct btrfs_root *dev_root) | |||
2151 | * failed | 2512 | * failed |
2152 | */ | 2513 | */ |
2153 | if (ret == 0) | 2514 | if (ret == 0) |
2154 | break; | 2515 | BUG(); /* FIXME break ? */ |
2155 | 2516 | ||
2156 | ret = btrfs_previous_item(chunk_root, path, 0, | 2517 | ret = btrfs_previous_item(chunk_root, path, 0, |
2157 | BTRFS_CHUNK_ITEM_KEY); | 2518 | BTRFS_CHUNK_ITEM_KEY); |
2158 | if (ret) | 2519 | if (ret) { |
2520 | ret = 0; | ||
2159 | break; | 2521 | break; |
2522 | } | ||
2523 | |||
2524 | leaf = path->nodes[0]; | ||
2525 | slot = path->slots[0]; | ||
2526 | btrfs_item_key_to_cpu(leaf, &found_key, slot); | ||
2160 | 2527 | ||
2161 | btrfs_item_key_to_cpu(path->nodes[0], &found_key, | ||
2162 | path->slots[0]); | ||
2163 | if (found_key.objectid != key.objectid) | 2528 | if (found_key.objectid != key.objectid) |
2164 | break; | 2529 | break; |
2165 | 2530 | ||
@@ -2167,22 +2532,375 @@ int btrfs_balance(struct btrfs_root *dev_root) | |||
2167 | if (found_key.offset == 0) | 2532 | if (found_key.offset == 0) |
2168 | break; | 2533 | break; |
2169 | 2534 | ||
2535 | chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); | ||
2536 | |||
2537 | if (!counting) { | ||
2538 | spin_lock(&fs_info->balance_lock); | ||
2539 | bctl->stat.considered++; | ||
2540 | spin_unlock(&fs_info->balance_lock); | ||
2541 | } | ||
2542 | |||
2543 | ret = should_balance_chunk(chunk_root, leaf, chunk, | ||
2544 | found_key.offset); | ||
2170 | btrfs_release_path(path); | 2545 | btrfs_release_path(path); |
2546 | if (!ret) | ||
2547 | goto loop; | ||
2548 | |||
2549 | if (counting) { | ||
2550 | spin_lock(&fs_info->balance_lock); | ||
2551 | bctl->stat.expected++; | ||
2552 | spin_unlock(&fs_info->balance_lock); | ||
2553 | goto loop; | ||
2554 | } | ||
2555 | |||
2171 | ret = btrfs_relocate_chunk(chunk_root, | 2556 | ret = btrfs_relocate_chunk(chunk_root, |
2172 | chunk_root->root_key.objectid, | 2557 | chunk_root->root_key.objectid, |
2173 | found_key.objectid, | 2558 | found_key.objectid, |
2174 | found_key.offset); | 2559 | found_key.offset); |
2175 | if (ret && ret != -ENOSPC) | 2560 | if (ret && ret != -ENOSPC) |
2176 | goto error; | 2561 | goto error; |
2562 | if (ret == -ENOSPC) { | ||
2563 | enospc_errors++; | ||
2564 | } else { | ||
2565 | spin_lock(&fs_info->balance_lock); | ||
2566 | bctl->stat.completed++; | ||
2567 | spin_unlock(&fs_info->balance_lock); | ||
2568 | } | ||
2569 | loop: | ||
2177 | key.offset = found_key.offset - 1; | 2570 | key.offset = found_key.offset - 1; |
2178 | } | 2571 | } |
2179 | ret = 0; | 2572 | |
2573 | if (counting) { | ||
2574 | btrfs_release_path(path); | ||
2575 | counting = false; | ||
2576 | goto again; | ||
2577 | } | ||
2180 | error: | 2578 | error: |
2181 | btrfs_free_path(path); | 2579 | btrfs_free_path(path); |
2182 | mutex_unlock(&dev_root->fs_info->volume_mutex); | 2580 | if (enospc_errors) { |
2581 | printk(KERN_INFO "btrfs: %d enospc errors during balance\n", | ||
2582 | enospc_errors); | ||
2583 | if (!ret) | ||
2584 | ret = -ENOSPC; | ||
2585 | } | ||
2586 | |||
2183 | return ret; | 2587 | return ret; |
2184 | } | 2588 | } |
2185 | 2589 | ||
2590 | static inline int balance_need_close(struct btrfs_fs_info *fs_info) | ||
2591 | { | ||
2592 | /* cancel requested || normal exit path */ | ||
2593 | return atomic_read(&fs_info->balance_cancel_req) || | ||
2594 | (atomic_read(&fs_info->balance_pause_req) == 0 && | ||
2595 | atomic_read(&fs_info->balance_cancel_req) == 0); | ||
2596 | } | ||
2597 | |||
2598 | static void __cancel_balance(struct btrfs_fs_info *fs_info) | ||
2599 | { | ||
2600 | int ret; | ||
2601 | |||
2602 | unset_balance_control(fs_info); | ||
2603 | ret = del_balance_item(fs_info->tree_root); | ||
2604 | BUG_ON(ret); | ||
2605 | } | ||
2606 | |||
2607 | void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, | ||
2608 | struct btrfs_ioctl_balance_args *bargs); | ||
2609 | |||
2610 | /* | ||
2611 | * Should be called with both balance and volume mutexes held | ||
2612 | */ | ||
2613 | int btrfs_balance(struct btrfs_balance_control *bctl, | ||
2614 | struct btrfs_ioctl_balance_args *bargs) | ||
2615 | { | ||
2616 | struct btrfs_fs_info *fs_info = bctl->fs_info; | ||
2617 | u64 allowed; | ||
2618 | int ret; | ||
2619 | |||
2620 | if (btrfs_fs_closing(fs_info) || | ||
2621 | atomic_read(&fs_info->balance_pause_req) || | ||
2622 | atomic_read(&fs_info->balance_cancel_req)) { | ||
2623 | ret = -EINVAL; | ||
2624 | goto out; | ||
2625 | } | ||
2626 | |||
2627 | /* | ||
2628 | * In case of mixed groups both data and meta should be picked, | ||
2629 | * and identical options should be given for both of them. | ||
2630 | */ | ||
2631 | allowed = btrfs_super_incompat_flags(fs_info->super_copy); | ||
2632 | if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && | ||
2633 | (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) { | ||
2634 | if (!(bctl->flags & BTRFS_BALANCE_DATA) || | ||
2635 | !(bctl->flags & BTRFS_BALANCE_METADATA) || | ||
2636 | memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { | ||
2637 | printk(KERN_ERR "btrfs: with mixed groups data and " | ||
2638 | "metadata balance options must be the same\n"); | ||
2639 | ret = -EINVAL; | ||
2640 | goto out; | ||
2641 | } | ||
2642 | } | ||
2643 | |||
2644 | /* | ||
2645 | * Profile changing sanity checks. Skip them if a simple | ||
2646 | * balance is requested. | ||
2647 | */ | ||
2648 | if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) & | ||
2649 | BTRFS_BALANCE_ARGS_CONVERT)) | ||
2650 | goto do_balance; | ||
2651 | |||
2652 | allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; | ||
2653 | if (fs_info->fs_devices->num_devices == 1) | ||
2654 | allowed |= BTRFS_BLOCK_GROUP_DUP; | ||
2655 | else if (fs_info->fs_devices->num_devices < 4) | ||
2656 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); | ||
2657 | else | ||
2658 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | | ||
2659 | BTRFS_BLOCK_GROUP_RAID10); | ||
2660 | |||
2661 | if (!profile_is_valid(bctl->data.target, 1) || | ||
2662 | bctl->data.target & ~allowed) { | ||
2663 | printk(KERN_ERR "btrfs: unable to start balance with target " | ||
2664 | "data profile %llu\n", | ||
2665 | (unsigned long long)bctl->data.target); | ||
2666 | ret = -EINVAL; | ||
2667 | goto out; | ||
2668 | } | ||
2669 | if (!profile_is_valid(bctl->meta.target, 1) || | ||
2670 | bctl->meta.target & ~allowed) { | ||
2671 | printk(KERN_ERR "btrfs: unable to start balance with target " | ||
2672 | "metadata profile %llu\n", | ||
2673 | (unsigned long long)bctl->meta.target); | ||
2674 | ret = -EINVAL; | ||
2675 | goto out; | ||
2676 | } | ||
2677 | if (!profile_is_valid(bctl->sys.target, 1) || | ||
2678 | bctl->sys.target & ~allowed) { | ||
2679 | printk(KERN_ERR "btrfs: unable to start balance with target " | ||
2680 | "system profile %llu\n", | ||
2681 | (unsigned long long)bctl->sys.target); | ||
2682 | ret = -EINVAL; | ||
2683 | goto out; | ||
2684 | } | ||
2685 | |||
2686 | if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) { | ||
2687 | printk(KERN_ERR "btrfs: dup for data is not allowed\n"); | ||
2688 | ret = -EINVAL; | ||
2689 | goto out; | ||
2690 | } | ||
2691 | |||
2692 | /* allow to reduce meta or sys integrity only if force set */ | ||
2693 | allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | | ||
2694 | BTRFS_BLOCK_GROUP_RAID10; | ||
2695 | if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && | ||
2696 | (fs_info->avail_system_alloc_bits & allowed) && | ||
2697 | !(bctl->sys.target & allowed)) || | ||
2698 | ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && | ||
2699 | (fs_info->avail_metadata_alloc_bits & allowed) && | ||
2700 | !(bctl->meta.target & allowed))) { | ||
2701 | if (bctl->flags & BTRFS_BALANCE_FORCE) { | ||
2702 | printk(KERN_INFO "btrfs: force reducing metadata " | ||
2703 | "integrity\n"); | ||
2704 | } else { | ||
2705 | printk(KERN_ERR "btrfs: balance will reduce metadata " | ||
2706 | "integrity, use force if you want this\n"); | ||
2707 | ret = -EINVAL; | ||
2708 | goto out; | ||
2709 | } | ||
2710 | } | ||
2711 | |||
2712 | do_balance: | ||
2713 | ret = insert_balance_item(fs_info->tree_root, bctl); | ||
2714 | if (ret && ret != -EEXIST) | ||
2715 | goto out; | ||
2716 | |||
2717 | if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { | ||
2718 | BUG_ON(ret == -EEXIST); | ||
2719 | set_balance_control(bctl); | ||
2720 | } else { | ||
2721 | BUG_ON(ret != -EEXIST); | ||
2722 | spin_lock(&fs_info->balance_lock); | ||
2723 | update_balance_args(bctl); | ||
2724 | spin_unlock(&fs_info->balance_lock); | ||
2725 | } | ||
2726 | |||
2727 | atomic_inc(&fs_info->balance_running); | ||
2728 | mutex_unlock(&fs_info->balance_mutex); | ||
2729 | |||
2730 | ret = __btrfs_balance(fs_info); | ||
2731 | |||
2732 | mutex_lock(&fs_info->balance_mutex); | ||
2733 | atomic_dec(&fs_info->balance_running); | ||
2734 | |||
2735 | if (bargs) { | ||
2736 | memset(bargs, 0, sizeof(*bargs)); | ||
2737 | update_ioctl_balance_args(fs_info, 0, bargs); | ||
2738 | } | ||
2739 | |||
2740 | if ((ret && ret != -ECANCELED && ret != -ENOSPC) || | ||
2741 | balance_need_close(fs_info)) { | ||
2742 | __cancel_balance(fs_info); | ||
2743 | } | ||
2744 | |||
2745 | wake_up(&fs_info->balance_wait_q); | ||
2746 | |||
2747 | return ret; | ||
2748 | out: | ||
2749 | if (bctl->flags & BTRFS_BALANCE_RESUME) | ||
2750 | __cancel_balance(fs_info); | ||
2751 | else | ||
2752 | kfree(bctl); | ||
2753 | return ret; | ||
2754 | } | ||
2755 | |||
2756 | static int balance_kthread(void *data) | ||
2757 | { | ||
2758 | struct btrfs_balance_control *bctl = | ||
2759 | (struct btrfs_balance_control *)data; | ||
2760 | struct btrfs_fs_info *fs_info = bctl->fs_info; | ||
2761 | int ret = 0; | ||
2762 | |||
2763 | mutex_lock(&fs_info->volume_mutex); | ||
2764 | mutex_lock(&fs_info->balance_mutex); | ||
2765 | |||
2766 | set_balance_control(bctl); | ||
2767 | |||
2768 | if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { | ||
2769 | printk(KERN_INFO "btrfs: force skipping balance\n"); | ||
2770 | } else { | ||
2771 | printk(KERN_INFO "btrfs: continuing balance\n"); | ||
2772 | ret = btrfs_balance(bctl, NULL); | ||
2773 | } | ||
2774 | |||
2775 | mutex_unlock(&fs_info->balance_mutex); | ||
2776 | mutex_unlock(&fs_info->volume_mutex); | ||
2777 | return ret; | ||
2778 | } | ||
2779 | |||
2780 | int btrfs_recover_balance(struct btrfs_root *tree_root) | ||
2781 | { | ||
2782 | struct task_struct *tsk; | ||
2783 | struct btrfs_balance_control *bctl; | ||
2784 | struct btrfs_balance_item *item; | ||
2785 | struct btrfs_disk_balance_args disk_bargs; | ||
2786 | struct btrfs_path *path; | ||
2787 | struct extent_buffer *leaf; | ||
2788 | struct btrfs_key key; | ||
2789 | int ret; | ||
2790 | |||
2791 | path = btrfs_alloc_path(); | ||
2792 | if (!path) | ||
2793 | return -ENOMEM; | ||
2794 | |||
2795 | bctl = kzalloc(sizeof(*bctl), GFP_NOFS); | ||
2796 | if (!bctl) { | ||
2797 | ret = -ENOMEM; | ||
2798 | goto out; | ||
2799 | } | ||
2800 | |||
2801 | key.objectid = BTRFS_BALANCE_OBJECTID; | ||
2802 | key.type = BTRFS_BALANCE_ITEM_KEY; | ||
2803 | key.offset = 0; | ||
2804 | |||
2805 | ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); | ||
2806 | if (ret < 0) | ||
2807 | goto out_bctl; | ||
2808 | if (ret > 0) { /* ret = -ENOENT; */ | ||
2809 | ret = 0; | ||
2810 | goto out_bctl; | ||
2811 | } | ||
2812 | |||
2813 | leaf = path->nodes[0]; | ||
2814 | item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); | ||
2815 | |||
2816 | bctl->fs_info = tree_root->fs_info; | ||
2817 | bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME; | ||
2818 | |||
2819 | btrfs_balance_data(leaf, item, &disk_bargs); | ||
2820 | btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); | ||
2821 | btrfs_balance_meta(leaf, item, &disk_bargs); | ||
2822 | btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); | ||
2823 | btrfs_balance_sys(leaf, item, &disk_bargs); | ||
2824 | btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); | ||
2825 | |||
2826 | tsk = kthread_run(balance_kthread, bctl, "btrfs-balance"); | ||
2827 | if (IS_ERR(tsk)) | ||
2828 | ret = PTR_ERR(tsk); | ||
2829 | else | ||
2830 | goto out; | ||
2831 | |||
2832 | out_bctl: | ||
2833 | kfree(bctl); | ||
2834 | out: | ||
2835 | btrfs_free_path(path); | ||
2836 | return ret; | ||
2837 | } | ||
2838 | |||
2839 | int btrfs_pause_balance(struct btrfs_fs_info *fs_info) | ||
2840 | { | ||
2841 | int ret = 0; | ||
2842 | |||
2843 | mutex_lock(&fs_info->balance_mutex); | ||
2844 | if (!fs_info->balance_ctl) { | ||
2845 | mutex_unlock(&fs_info->balance_mutex); | ||
2846 | return -ENOTCONN; | ||
2847 | } | ||
2848 | |||
2849 | if (atomic_read(&fs_info->balance_running)) { | ||
2850 | atomic_inc(&fs_info->balance_pause_req); | ||
2851 | mutex_unlock(&fs_info->balance_mutex); | ||
2852 | |||
2853 | wait_event(fs_info->balance_wait_q, | ||
2854 | atomic_read(&fs_info->balance_running) == 0); | ||
2855 | |||
2856 | mutex_lock(&fs_info->balance_mutex); | ||
2857 | /* we are good with balance_ctl ripped off from under us */ | ||
2858 | BUG_ON(atomic_read(&fs_info->balance_running)); | ||
2859 | atomic_dec(&fs_info->balance_pause_req); | ||
2860 | } else { | ||
2861 | ret = -ENOTCONN; | ||
2862 | } | ||
2863 | |||
2864 | mutex_unlock(&fs_info->balance_mutex); | ||
2865 | return ret; | ||
2866 | } | ||
2867 | |||
2868 | int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) | ||
2869 | { | ||
2870 | mutex_lock(&fs_info->balance_mutex); | ||
2871 | if (!fs_info->balance_ctl) { | ||
2872 | mutex_unlock(&fs_info->balance_mutex); | ||
2873 | return -ENOTCONN; | ||
2874 | } | ||
2875 | |||
2876 | atomic_inc(&fs_info->balance_cancel_req); | ||
2877 | /* | ||
2878 | * if we are running just wait and return, balance item is | ||
2879 | * deleted in btrfs_balance in this case | ||
2880 | */ | ||
2881 | if (atomic_read(&fs_info->balance_running)) { | ||
2882 | mutex_unlock(&fs_info->balance_mutex); | ||
2883 | wait_event(fs_info->balance_wait_q, | ||
2884 | atomic_read(&fs_info->balance_running) == 0); | ||
2885 | mutex_lock(&fs_info->balance_mutex); | ||
2886 | } else { | ||
2887 | /* __cancel_balance needs volume_mutex */ | ||
2888 | mutex_unlock(&fs_info->balance_mutex); | ||
2889 | mutex_lock(&fs_info->volume_mutex); | ||
2890 | mutex_lock(&fs_info->balance_mutex); | ||
2891 | |||
2892 | if (fs_info->balance_ctl) | ||
2893 | __cancel_balance(fs_info); | ||
2894 | |||
2895 | mutex_unlock(&fs_info->volume_mutex); | ||
2896 | } | ||
2897 | |||
2898 | BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); | ||
2899 | atomic_dec(&fs_info->balance_cancel_req); | ||
2900 | mutex_unlock(&fs_info->balance_mutex); | ||
2901 | return 0; | ||
2902 | } | ||
2903 | |||
2186 | /* | 2904 | /* |
2187 | * shrinking a device means finding all of the device extents past | 2905 | * shrinking a device means finding all of the device extents past |
2188 | * the new size, and then following the back refs to the chunks. | 2906 | * the new size, and then following the back refs to the chunks. |
@@ -2323,8 +3041,7 @@ done: | |||
2323 | return ret; | 3041 | return ret; |
2324 | } | 3042 | } |
2325 | 3043 | ||
2326 | static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, | 3044 | static int btrfs_add_system_chunk(struct btrfs_root *root, |
2327 | struct btrfs_root *root, | ||
2328 | struct btrfs_key *key, | 3045 | struct btrfs_key *key, |
2329 | struct btrfs_chunk *chunk, int item_size) | 3046 | struct btrfs_chunk *chunk, int item_size) |
2330 | { | 3047 | { |
@@ -2441,10 +3158,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
2441 | max_stripe_size = 1024 * 1024 * 1024; | 3158 | max_stripe_size = 1024 * 1024 * 1024; |
2442 | max_chunk_size = 10 * max_stripe_size; | 3159 | max_chunk_size = 10 * max_stripe_size; |
2443 | } else if (type & BTRFS_BLOCK_GROUP_METADATA) { | 3160 | } else if (type & BTRFS_BLOCK_GROUP_METADATA) { |
2444 | max_stripe_size = 256 * 1024 * 1024; | 3161 | /* for larger filesystems, use larger metadata chunks */ |
3162 | if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) | ||
3163 | max_stripe_size = 1024 * 1024 * 1024; | ||
3164 | else | ||
3165 | max_stripe_size = 256 * 1024 * 1024; | ||
2445 | max_chunk_size = max_stripe_size; | 3166 | max_chunk_size = max_stripe_size; |
2446 | } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { | 3167 | } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { |
2447 | max_stripe_size = 8 * 1024 * 1024; | 3168 | max_stripe_size = 32 * 1024 * 1024; |
2448 | max_chunk_size = 2 * max_stripe_size; | 3169 | max_chunk_size = 2 * max_stripe_size; |
2449 | } else { | 3170 | } else { |
2450 | printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", | 3171 | printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", |
@@ -2496,7 +3217,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
2496 | if (total_avail == 0) | 3217 | if (total_avail == 0) |
2497 | continue; | 3218 | continue; |
2498 | 3219 | ||
2499 | ret = find_free_dev_extent(trans, device, | 3220 | ret = find_free_dev_extent(device, |
2500 | max_stripe_size * dev_stripes, | 3221 | max_stripe_size * dev_stripes, |
2501 | &dev_offset, &max_avail); | 3222 | &dev_offset, &max_avail); |
2502 | if (ret && ret != -ENOSPC) | 3223 | if (ret && ret != -ENOSPC) |
@@ -2687,7 +3408,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, | |||
2687 | BUG_ON(ret); | 3408 | BUG_ON(ret); |
2688 | 3409 | ||
2689 | if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { | 3410 | if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { |
2690 | ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, | 3411 | ret = btrfs_add_system_chunk(chunk_root, &key, chunk, |
2691 | item_size); | 3412 | item_size); |
2692 | BUG_ON(ret); | 3413 | BUG_ON(ret); |
2693 | } | 3414 | } |
@@ -2752,8 +3473,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, | |||
2752 | return ret; | 3473 | return ret; |
2753 | 3474 | ||
2754 | alloc_profile = BTRFS_BLOCK_GROUP_METADATA | | 3475 | alloc_profile = BTRFS_BLOCK_GROUP_METADATA | |
2755 | (fs_info->metadata_alloc_profile & | 3476 | fs_info->avail_metadata_alloc_bits; |
2756 | fs_info->avail_metadata_alloc_bits); | ||
2757 | alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); | 3477 | alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); |
2758 | 3478 | ||
2759 | ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, | 3479 | ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, |
@@ -2763,8 +3483,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, | |||
2763 | sys_chunk_offset = chunk_offset + chunk_size; | 3483 | sys_chunk_offset = chunk_offset + chunk_size; |
2764 | 3484 | ||
2765 | alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | | 3485 | alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | |
2766 | (fs_info->system_alloc_profile & | 3486 | fs_info->avail_system_alloc_bits; |
2767 | fs_info->avail_system_alloc_bits); | ||
2768 | alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); | 3487 | alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); |
2769 | 3488 | ||
2770 | ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, | 3489 | ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, |
@@ -2901,26 +3620,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | |||
2901 | u64 stripe_nr; | 3620 | u64 stripe_nr; |
2902 | u64 stripe_nr_orig; | 3621 | u64 stripe_nr_orig; |
2903 | u64 stripe_nr_end; | 3622 | u64 stripe_nr_end; |
2904 | int stripes_allocated = 8; | ||
2905 | int stripes_required = 1; | ||
2906 | int stripe_index; | 3623 | int stripe_index; |
2907 | int i; | 3624 | int i; |
3625 | int ret = 0; | ||
2908 | int num_stripes; | 3626 | int num_stripes; |
2909 | int max_errors = 0; | 3627 | int max_errors = 0; |
2910 | struct btrfs_bio *bbio = NULL; | 3628 | struct btrfs_bio *bbio = NULL; |
2911 | 3629 | ||
2912 | if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) | ||
2913 | stripes_allocated = 1; | ||
2914 | again: | ||
2915 | if (bbio_ret) { | ||
2916 | bbio = kzalloc(btrfs_bio_size(stripes_allocated), | ||
2917 | GFP_NOFS); | ||
2918 | if (!bbio) | ||
2919 | return -ENOMEM; | ||
2920 | |||
2921 | atomic_set(&bbio->error, 0); | ||
2922 | } | ||
2923 | |||
2924 | read_lock(&em_tree->lock); | 3630 | read_lock(&em_tree->lock); |
2925 | em = lookup_extent_mapping(em_tree, logical, *length); | 3631 | em = lookup_extent_mapping(em_tree, logical, *length); |
2926 | read_unlock(&em_tree->lock); | 3632 | read_unlock(&em_tree->lock); |
@@ -2939,32 +3645,6 @@ again: | |||
2939 | if (mirror_num > map->num_stripes) | 3645 | if (mirror_num > map->num_stripes) |
2940 | mirror_num = 0; | 3646 | mirror_num = 0; |
2941 | 3647 | ||
2942 | /* if our btrfs_bio struct is too small, back off and try again */ | ||
2943 | if (rw & REQ_WRITE) { | ||
2944 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | | ||
2945 | BTRFS_BLOCK_GROUP_DUP)) { | ||
2946 | stripes_required = map->num_stripes; | ||
2947 | max_errors = 1; | ||
2948 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { | ||
2949 | stripes_required = map->sub_stripes; | ||
2950 | max_errors = 1; | ||
2951 | } | ||
2952 | } | ||
2953 | if (rw & REQ_DISCARD) { | ||
2954 | if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | | ||
2955 | BTRFS_BLOCK_GROUP_RAID1 | | ||
2956 | BTRFS_BLOCK_GROUP_DUP | | ||
2957 | BTRFS_BLOCK_GROUP_RAID10)) { | ||
2958 | stripes_required = map->num_stripes; | ||
2959 | } | ||
2960 | } | ||
2961 | if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && | ||
2962 | stripes_allocated < stripes_required) { | ||
2963 | stripes_allocated = map->num_stripes; | ||
2964 | free_extent_map(em); | ||
2965 | kfree(bbio); | ||
2966 | goto again; | ||
2967 | } | ||
2968 | stripe_nr = offset; | 3648 | stripe_nr = offset; |
2969 | /* | 3649 | /* |
2970 | * stripe_nr counts the total number of stripes we have to stride | 3650 | * stripe_nr counts the total number of stripes we have to stride |
@@ -2980,10 +3660,7 @@ again: | |||
2980 | 3660 | ||
2981 | if (rw & REQ_DISCARD) | 3661 | if (rw & REQ_DISCARD) |
2982 | *length = min_t(u64, em->len - offset, *length); | 3662 | *length = min_t(u64, em->len - offset, *length); |
2983 | else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | | 3663 | else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { |
2984 | BTRFS_BLOCK_GROUP_RAID1 | | ||
2985 | BTRFS_BLOCK_GROUP_RAID10 | | ||
2986 | BTRFS_BLOCK_GROUP_DUP)) { | ||
2987 | /* we limit the length of each bio to what fits in a stripe */ | 3664 | /* we limit the length of each bio to what fits in a stripe */ |
2988 | *length = min_t(u64, em->len - offset, | 3665 | *length = min_t(u64, em->len - offset, |
2989 | map->stripe_len - stripe_offset); | 3666 | map->stripe_len - stripe_offset); |
@@ -3059,81 +3736,55 @@ again: | |||
3059 | } | 3736 | } |
3060 | BUG_ON(stripe_index >= map->num_stripes); | 3737 | BUG_ON(stripe_index >= map->num_stripes); |
3061 | 3738 | ||
3739 | bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); | ||
3740 | if (!bbio) { | ||
3741 | ret = -ENOMEM; | ||
3742 | goto out; | ||
3743 | } | ||
3744 | atomic_set(&bbio->error, 0); | ||
3745 | |||
3062 | if (rw & REQ_DISCARD) { | 3746 | if (rw & REQ_DISCARD) { |
3747 | int factor = 0; | ||
3748 | int sub_stripes = 0; | ||
3749 | u64 stripes_per_dev = 0; | ||
3750 | u32 remaining_stripes = 0; | ||
3751 | |||
3752 | if (map->type & | ||
3753 | (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { | ||
3754 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) | ||
3755 | sub_stripes = 1; | ||
3756 | else | ||
3757 | sub_stripes = map->sub_stripes; | ||
3758 | |||
3759 | factor = map->num_stripes / sub_stripes; | ||
3760 | stripes_per_dev = div_u64_rem(stripe_nr_end - | ||
3761 | stripe_nr_orig, | ||
3762 | factor, | ||
3763 | &remaining_stripes); | ||
3764 | } | ||
3765 | |||
3063 | for (i = 0; i < num_stripes; i++) { | 3766 | for (i = 0; i < num_stripes; i++) { |
3064 | bbio->stripes[i].physical = | 3767 | bbio->stripes[i].physical = |
3065 | map->stripes[stripe_index].physical + | 3768 | map->stripes[stripe_index].physical + |
3066 | stripe_offset + stripe_nr * map->stripe_len; | 3769 | stripe_offset + stripe_nr * map->stripe_len; |
3067 | bbio->stripes[i].dev = map->stripes[stripe_index].dev; | 3770 | bbio->stripes[i].dev = map->stripes[stripe_index].dev; |
3068 | 3771 | ||
3069 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | 3772 | if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | |
3070 | u64 stripes; | 3773 | BTRFS_BLOCK_GROUP_RAID10)) { |
3071 | u32 last_stripe = 0; | 3774 | bbio->stripes[i].length = stripes_per_dev * |
3072 | int j; | 3775 | map->stripe_len; |
3073 | 3776 | if (i / sub_stripes < remaining_stripes) | |
3074 | div_u64_rem(stripe_nr_end - 1, | 3777 | bbio->stripes[i].length += |
3075 | map->num_stripes, | 3778 | map->stripe_len; |
3076 | &last_stripe); | 3779 | if (i < sub_stripes) |
3077 | |||
3078 | for (j = 0; j < map->num_stripes; j++) { | ||
3079 | u32 test; | ||
3080 | |||
3081 | div_u64_rem(stripe_nr_end - 1 - j, | ||
3082 | map->num_stripes, &test); | ||
3083 | if (test == stripe_index) | ||
3084 | break; | ||
3085 | } | ||
3086 | stripes = stripe_nr_end - 1 - j; | ||
3087 | do_div(stripes, map->num_stripes); | ||
3088 | bbio->stripes[i].length = map->stripe_len * | ||
3089 | (stripes - stripe_nr + 1); | ||
3090 | |||
3091 | if (i == 0) { | ||
3092 | bbio->stripes[i].length -= | ||
3093 | stripe_offset; | ||
3094 | stripe_offset = 0; | ||
3095 | } | ||
3096 | if (stripe_index == last_stripe) | ||
3097 | bbio->stripes[i].length -= | ||
3098 | stripe_end_offset; | ||
3099 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { | ||
3100 | u64 stripes; | ||
3101 | int j; | ||
3102 | int factor = map->num_stripes / | ||
3103 | map->sub_stripes; | ||
3104 | u32 last_stripe = 0; | ||
3105 | |||
3106 | div_u64_rem(stripe_nr_end - 1, | ||
3107 | factor, &last_stripe); | ||
3108 | last_stripe *= map->sub_stripes; | ||
3109 | |||
3110 | for (j = 0; j < factor; j++) { | ||
3111 | u32 test; | ||
3112 | |||
3113 | div_u64_rem(stripe_nr_end - 1 - j, | ||
3114 | factor, &test); | ||
3115 | |||
3116 | if (test == | ||
3117 | stripe_index / map->sub_stripes) | ||
3118 | break; | ||
3119 | } | ||
3120 | stripes = stripe_nr_end - 1 - j; | ||
3121 | do_div(stripes, factor); | ||
3122 | bbio->stripes[i].length = map->stripe_len * | ||
3123 | (stripes - stripe_nr + 1); | ||
3124 | |||
3125 | if (i < map->sub_stripes) { | ||
3126 | bbio->stripes[i].length -= | 3780 | bbio->stripes[i].length -= |
3127 | stripe_offset; | 3781 | stripe_offset; |
3128 | if (i == map->sub_stripes - 1) | 3782 | if ((i / sub_stripes + 1) % |
3129 | stripe_offset = 0; | 3783 | sub_stripes == remaining_stripes) |
3130 | } | ||
3131 | if (stripe_index >= last_stripe && | ||
3132 | stripe_index <= (last_stripe + | ||
3133 | map->sub_stripes - 1)) { | ||
3134 | bbio->stripes[i].length -= | 3784 | bbio->stripes[i].length -= |
3135 | stripe_end_offset; | 3785 | stripe_end_offset; |
3136 | } | 3786 | if (i == sub_stripes - 1) |
3787 | stripe_offset = 0; | ||
3137 | } else | 3788 | } else |
3138 | bbio->stripes[i].length = *length; | 3789 | bbio->stripes[i].length = *length; |
3139 | 3790 | ||
@@ -3155,15 +3806,22 @@ again: | |||
3155 | stripe_index++; | 3806 | stripe_index++; |
3156 | } | 3807 | } |
3157 | } | 3808 | } |
3158 | if (bbio_ret) { | 3809 | |
3159 | *bbio_ret = bbio; | 3810 | if (rw & REQ_WRITE) { |
3160 | bbio->num_stripes = num_stripes; | 3811 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | |
3161 | bbio->max_errors = max_errors; | 3812 | BTRFS_BLOCK_GROUP_RAID10 | |
3162 | bbio->mirror_num = mirror_num; | 3813 | BTRFS_BLOCK_GROUP_DUP)) { |
3814 | max_errors = 1; | ||
3815 | } | ||
3163 | } | 3816 | } |
3817 | |||
3818 | *bbio_ret = bbio; | ||
3819 | bbio->num_stripes = num_stripes; | ||
3820 | bbio->max_errors = max_errors; | ||
3821 | bbio->mirror_num = mirror_num; | ||
3164 | out: | 3822 | out: |
3165 | free_extent_map(em); | 3823 | free_extent_map(em); |
3166 | return 0; | 3824 | return ret; |
3167 | } | 3825 | } |
3168 | 3826 | ||
3169 | int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | 3827 | int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, |
@@ -3304,7 +3962,7 @@ static noinline int schedule_bio(struct btrfs_root *root, | |||
3304 | /* don't bother with additional async steps for reads, right now */ | 3962 | /* don't bother with additional async steps for reads, right now */ |
3305 | if (!(rw & REQ_WRITE)) { | 3963 | if (!(rw & REQ_WRITE)) { |
3306 | bio_get(bio); | 3964 | bio_get(bio); |
3307 | submit_bio(rw, bio); | 3965 | btrfsic_submit_bio(rw, bio); |
3308 | bio_put(bio); | 3966 | bio_put(bio); |
3309 | return 0; | 3967 | return 0; |
3310 | } | 3968 | } |
@@ -3399,7 +4057,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
3399 | if (async_submit) | 4057 | if (async_submit) |
3400 | schedule_bio(root, dev, rw, bio); | 4058 | schedule_bio(root, dev, rw, bio); |
3401 | else | 4059 | else |
3402 | submit_bio(rw, bio); | 4060 | btrfsic_submit_bio(rw, bio); |
3403 | } else { | 4061 | } else { |
3404 | bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; | 4062 | bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; |
3405 | bio->bi_sector = logical >> 9; | 4063 | bio->bi_sector = logical >> 9; |
@@ -3568,7 +4226,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) | |||
3568 | struct btrfs_fs_devices *fs_devices; | 4226 | struct btrfs_fs_devices *fs_devices; |
3569 | int ret; | 4227 | int ret; |
3570 | 4228 | ||
3571 | mutex_lock(&uuid_mutex); | 4229 | BUG_ON(!mutex_is_locked(&uuid_mutex)); |
3572 | 4230 | ||
3573 | fs_devices = root->fs_info->fs_devices->seed; | 4231 | fs_devices = root->fs_info->fs_devices->seed; |
3574 | while (fs_devices) { | 4232 | while (fs_devices) { |
@@ -3606,7 +4264,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) | |||
3606 | fs_devices->seed = root->fs_info->fs_devices->seed; | 4264 | fs_devices->seed = root->fs_info->fs_devices->seed; |
3607 | root->fs_info->fs_devices->seed = fs_devices; | 4265 | root->fs_info->fs_devices->seed = fs_devices; |
3608 | out: | 4266 | out: |
3609 | mutex_unlock(&uuid_mutex); | ||
3610 | return ret; | 4267 | return ret; |
3611 | } | 4268 | } |
3612 | 4269 | ||
@@ -3749,6 +4406,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) | |||
3749 | if (!path) | 4406 | if (!path) |
3750 | return -ENOMEM; | 4407 | return -ENOMEM; |
3751 | 4408 | ||
4409 | mutex_lock(&uuid_mutex); | ||
4410 | lock_chunks(root); | ||
4411 | |||
3752 | /* first we search for all of the device items, and then we | 4412 | /* first we search for all of the device items, and then we |
3753 | * read in all of the chunk items. This way we can create chunk | 4413 | * read in all of the chunk items. This way we can create chunk |
3754 | * mappings that reference all of the devices that are afound | 4414 | * mappings that reference all of the devices that are afound |
@@ -3799,6 +4459,9 @@ again: | |||
3799 | } | 4459 | } |
3800 | ret = 0; | 4460 | ret = 0; |
3801 | error: | 4461 | error: |
4462 | unlock_chunks(root); | ||
4463 | mutex_unlock(&uuid_mutex); | ||
4464 | |||
3802 | btrfs_free_path(path); | 4465 | btrfs_free_path(path); |
3803 | return ret; | 4466 | return ret; |
3804 | } | 4467 | } |
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 78f2d4d4f37f..19ac95048b88 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h | |||
@@ -186,6 +186,51 @@ struct map_lookup { | |||
186 | #define map_lookup_size(n) (sizeof(struct map_lookup) + \ | 186 | #define map_lookup_size(n) (sizeof(struct map_lookup) + \ |
187 | (sizeof(struct btrfs_bio_stripe) * (n))) | 187 | (sizeof(struct btrfs_bio_stripe) * (n))) |
188 | 188 | ||
189 | /* | ||
190 | * Restriper's general type filter | ||
191 | */ | ||
192 | #define BTRFS_BALANCE_DATA (1ULL << 0) | ||
193 | #define BTRFS_BALANCE_SYSTEM (1ULL << 1) | ||
194 | #define BTRFS_BALANCE_METADATA (1ULL << 2) | ||
195 | |||
196 | #define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ | ||
197 | BTRFS_BALANCE_SYSTEM | \ | ||
198 | BTRFS_BALANCE_METADATA) | ||
199 | |||
200 | #define BTRFS_BALANCE_FORCE (1ULL << 3) | ||
201 | #define BTRFS_BALANCE_RESUME (1ULL << 4) | ||
202 | |||
203 | /* | ||
204 | * Balance filters | ||
205 | */ | ||
206 | #define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) | ||
207 | #define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) | ||
208 | #define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) | ||
209 | #define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) | ||
210 | #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) | ||
211 | |||
212 | /* | ||
213 | * Profile changing flags. When SOFT is set we won't relocate chunk if | ||
214 | * it already has the target profile (even though it may be | ||
215 | * half-filled). | ||
216 | */ | ||
217 | #define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) | ||
218 | #define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) | ||
219 | |||
220 | struct btrfs_balance_args; | ||
221 | struct btrfs_balance_progress; | ||
222 | struct btrfs_balance_control { | ||
223 | struct btrfs_fs_info *fs_info; | ||
224 | |||
225 | struct btrfs_balance_args data; | ||
226 | struct btrfs_balance_args meta; | ||
227 | struct btrfs_balance_args sys; | ||
228 | |||
229 | u64 flags; | ||
230 | |||
231 | struct btrfs_balance_progress stat; | ||
232 | }; | ||
233 | |||
189 | int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, | 234 | int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, |
190 | u64 end, u64 *length); | 235 | u64 end, u64 *length); |
191 | 236 | ||
@@ -228,9 +273,12 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, | |||
228 | u8 *uuid, u8 *fsid); | 273 | u8 *uuid, u8 *fsid); |
229 | int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); | 274 | int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); |
230 | int btrfs_init_new_device(struct btrfs_root *root, char *path); | 275 | int btrfs_init_new_device(struct btrfs_root *root, char *path); |
231 | int btrfs_balance(struct btrfs_root *dev_root); | 276 | int btrfs_balance(struct btrfs_balance_control *bctl, |
277 | struct btrfs_ioctl_balance_args *bargs); | ||
278 | int btrfs_recover_balance(struct btrfs_root *tree_root); | ||
279 | int btrfs_pause_balance(struct btrfs_fs_info *fs_info); | ||
280 | int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); | ||
232 | int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); | 281 | int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); |
233 | int find_free_dev_extent(struct btrfs_trans_handle *trans, | 282 | int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, |
234 | struct btrfs_device *device, u64 num_bytes, | ||
235 | u64 *start, u64 *max_avail); | 283 | u64 *start, u64 *max_avail); |
236 | #endif | 284 | #endif |
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3848b04e310e..e7a5659087e6 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c | |||
@@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, | |||
200 | ret = btrfs_update_inode(trans, root, inode); | 200 | ret = btrfs_update_inode(trans, root, inode); |
201 | BUG_ON(ret); | 201 | BUG_ON(ret); |
202 | out: | 202 | out: |
203 | btrfs_end_transaction_throttle(trans, root); | 203 | btrfs_end_transaction(trans, root); |
204 | return ret; | 204 | return ret; |
205 | } | 205 | } |
206 | 206 | ||
diff --git a/fs/namei.c b/fs/namei.c index c283a1ec008e..208c6aa4a989 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
@@ -140,21 +140,19 @@ static int do_getname(const char __user *filename, char *page) | |||
140 | 140 | ||
141 | static char *getname_flags(const char __user *filename, int flags, int *empty) | 141 | static char *getname_flags(const char __user *filename, int flags, int *empty) |
142 | { | 142 | { |
143 | char *tmp, *result; | 143 | char *result = __getname(); |
144 | 144 | int retval; | |
145 | result = ERR_PTR(-ENOMEM); | 145 | |
146 | tmp = __getname(); | 146 | if (!result) |
147 | if (tmp) { | 147 | return ERR_PTR(-ENOMEM); |
148 | int retval = do_getname(filename, tmp); | 148 | |
149 | 149 | retval = do_getname(filename, result); | |
150 | result = tmp; | 150 | if (retval < 0) { |
151 | if (retval < 0) { | 151 | if (retval == -ENOENT && empty) |
152 | if (retval == -ENOENT && empty) | 152 | *empty = 1; |
153 | *empty = 1; | 153 | if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { |
154 | if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { | 154 | __putname(result); |
155 | __putname(tmp); | 155 | return ERR_PTR(retval); |
156 | result = ERR_PTR(retval); | ||
157 | } | ||
158 | } | 156 | } |
159 | } | 157 | } |
160 | audit_getname(result); | 158 | audit_getname(result); |
diff --git a/fs/proc/base.c b/fs/proc/base.c index 5485a5388ecb..9cde9edf9c4d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -198,65 +198,7 @@ static int proc_root_link(struct dentry *dentry, struct path *path) | |||
198 | return result; | 198 | return result; |
199 | } | 199 | } |
200 | 200 | ||
201 | static struct mm_struct *__check_mem_permission(struct task_struct *task) | 201 | static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) |
202 | { | ||
203 | struct mm_struct *mm; | ||
204 | |||
205 | mm = get_task_mm(task); | ||
206 | if (!mm) | ||
207 | return ERR_PTR(-EINVAL); | ||
208 | |||
209 | /* | ||
210 | * A task can always look at itself, in case it chooses | ||
211 | * to use system calls instead of load instructions. | ||
212 | */ | ||
213 | if (task == current) | ||
214 | return mm; | ||
215 | |||
216 | /* | ||
217 | * If current is actively ptrace'ing, and would also be | ||
218 | * permitted to freshly attach with ptrace now, permit it. | ||
219 | */ | ||
220 | if (task_is_stopped_or_traced(task)) { | ||
221 | int match; | ||
222 | rcu_read_lock(); | ||
223 | match = (ptrace_parent(task) == current); | ||
224 | rcu_read_unlock(); | ||
225 | if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH)) | ||
226 | return mm; | ||
227 | } | ||
228 | |||
229 | /* | ||
230 | * No one else is allowed. | ||
231 | */ | ||
232 | mmput(mm); | ||
233 | return ERR_PTR(-EPERM); | ||
234 | } | ||
235 | |||
236 | /* | ||
237 | * If current may access user memory in @task return a reference to the | ||
238 | * corresponding mm, otherwise ERR_PTR. | ||
239 | */ | ||
240 | static struct mm_struct *check_mem_permission(struct task_struct *task) | ||
241 | { | ||
242 | struct mm_struct *mm; | ||
243 | int err; | ||
244 | |||
245 | /* | ||
246 | * Avoid racing if task exec's as we might get a new mm but validate | ||
247 | * against old credentials. | ||
248 | */ | ||
249 | err = mutex_lock_killable(&task->signal->cred_guard_mutex); | ||
250 | if (err) | ||
251 | return ERR_PTR(err); | ||
252 | |||
253 | mm = __check_mem_permission(task); | ||
254 | mutex_unlock(&task->signal->cred_guard_mutex); | ||
255 | |||
256 | return mm; | ||
257 | } | ||
258 | |||
259 | struct mm_struct *mm_for_maps(struct task_struct *task) | ||
260 | { | 202 | { |
261 | struct mm_struct *mm; | 203 | struct mm_struct *mm; |
262 | int err; | 204 | int err; |
@@ -267,7 +209,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task) | |||
267 | 209 | ||
268 | mm = get_task_mm(task); | 210 | mm = get_task_mm(task); |
269 | if (mm && mm != current->mm && | 211 | if (mm && mm != current->mm && |
270 | !ptrace_may_access(task, PTRACE_MODE_READ)) { | 212 | !ptrace_may_access(task, mode)) { |
271 | mmput(mm); | 213 | mmput(mm); |
272 | mm = ERR_PTR(-EACCES); | 214 | mm = ERR_PTR(-EACCES); |
273 | } | 215 | } |
@@ -276,6 +218,11 @@ struct mm_struct *mm_for_maps(struct task_struct *task) | |||
276 | return mm; | 218 | return mm; |
277 | } | 219 | } |
278 | 220 | ||
221 | struct mm_struct *mm_for_maps(struct task_struct *task) | ||
222 | { | ||
223 | return mm_access(task, PTRACE_MODE_READ); | ||
224 | } | ||
225 | |||
279 | static int proc_pid_cmdline(struct task_struct *task, char * buffer) | 226 | static int proc_pid_cmdline(struct task_struct *task, char * buffer) |
280 | { | 227 | { |
281 | int res = 0; | 228 | int res = 0; |
@@ -752,38 +699,39 @@ static const struct file_operations proc_single_file_operations = { | |||
752 | 699 | ||
753 | static int mem_open(struct inode* inode, struct file* file) | 700 | static int mem_open(struct inode* inode, struct file* file) |
754 | { | 701 | { |
755 | file->private_data = (void*)((long)current->self_exec_id); | 702 | struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); |
703 | struct mm_struct *mm; | ||
704 | |||
705 | if (!task) | ||
706 | return -ESRCH; | ||
707 | |||
708 | mm = mm_access(task, PTRACE_MODE_ATTACH); | ||
709 | put_task_struct(task); | ||
710 | |||
711 | if (IS_ERR(mm)) | ||
712 | return PTR_ERR(mm); | ||
713 | |||
756 | /* OK to pass negative loff_t, we can catch out-of-range */ | 714 | /* OK to pass negative loff_t, we can catch out-of-range */ |
757 | file->f_mode |= FMODE_UNSIGNED_OFFSET; | 715 | file->f_mode |= FMODE_UNSIGNED_OFFSET; |
716 | file->private_data = mm; | ||
717 | |||
758 | return 0; | 718 | return 0; |
759 | } | 719 | } |
760 | 720 | ||
761 | static ssize_t mem_read(struct file * file, char __user * buf, | 721 | static ssize_t mem_read(struct file * file, char __user * buf, |
762 | size_t count, loff_t *ppos) | 722 | size_t count, loff_t *ppos) |
763 | { | 723 | { |
764 | struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); | 724 | int ret; |
765 | char *page; | 725 | char *page; |
766 | unsigned long src = *ppos; | 726 | unsigned long src = *ppos; |
767 | int ret = -ESRCH; | 727 | struct mm_struct *mm = file->private_data; |
768 | struct mm_struct *mm; | ||
769 | 728 | ||
770 | if (!task) | 729 | if (!mm) |
771 | goto out_no_task; | 730 | return 0; |
772 | 731 | ||
773 | ret = -ENOMEM; | ||
774 | page = (char *)__get_free_page(GFP_TEMPORARY); | 732 | page = (char *)__get_free_page(GFP_TEMPORARY); |
775 | if (!page) | 733 | if (!page) |
776 | goto out; | 734 | return -ENOMEM; |
777 | |||
778 | mm = check_mem_permission(task); | ||
779 | ret = PTR_ERR(mm); | ||
780 | if (IS_ERR(mm)) | ||
781 | goto out_free; | ||
782 | |||
783 | ret = -EIO; | ||
784 | |||
785 | if (file->private_data != (void*)((long)current->self_exec_id)) | ||
786 | goto out_put; | ||
787 | 735 | ||
788 | ret = 0; | 736 | ret = 0; |
789 | 737 | ||
@@ -810,13 +758,7 @@ static ssize_t mem_read(struct file * file, char __user * buf, | |||
810 | } | 758 | } |
811 | *ppos = src; | 759 | *ppos = src; |
812 | 760 | ||
813 | out_put: | ||
814 | mmput(mm); | ||
815 | out_free: | ||
816 | free_page((unsigned long) page); | 761 | free_page((unsigned long) page); |
817 | out: | ||
818 | put_task_struct(task); | ||
819 | out_no_task: | ||
820 | return ret; | 762 | return ret; |
821 | } | 763 | } |
822 | 764 | ||
@@ -825,27 +767,15 @@ static ssize_t mem_write(struct file * file, const char __user *buf, | |||
825 | { | 767 | { |
826 | int copied; | 768 | int copied; |
827 | char *page; | 769 | char *page; |
828 | struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); | ||
829 | unsigned long dst = *ppos; | 770 | unsigned long dst = *ppos; |
830 | struct mm_struct *mm; | 771 | struct mm_struct *mm = file->private_data; |
831 | 772 | ||
832 | copied = -ESRCH; | 773 | if (!mm) |
833 | if (!task) | 774 | return 0; |
834 | goto out_no_task; | ||
835 | 775 | ||
836 | copied = -ENOMEM; | ||
837 | page = (char *)__get_free_page(GFP_TEMPORARY); | 776 | page = (char *)__get_free_page(GFP_TEMPORARY); |
838 | if (!page) | 777 | if (!page) |
839 | goto out_task; | 778 | return -ENOMEM; |
840 | |||
841 | mm = check_mem_permission(task); | ||
842 | copied = PTR_ERR(mm); | ||
843 | if (IS_ERR(mm)) | ||
844 | goto out_free; | ||
845 | |||
846 | copied = -EIO; | ||
847 | if (file->private_data != (void *)((long)current->self_exec_id)) | ||
848 | goto out_mm; | ||
849 | 779 | ||
850 | copied = 0; | 780 | copied = 0; |
851 | while (count > 0) { | 781 | while (count > 0) { |
@@ -869,13 +799,7 @@ static ssize_t mem_write(struct file * file, const char __user *buf, | |||
869 | } | 799 | } |
870 | *ppos = dst; | 800 | *ppos = dst; |
871 | 801 | ||
872 | out_mm: | ||
873 | mmput(mm); | ||
874 | out_free: | ||
875 | free_page((unsigned long) page); | 802 | free_page((unsigned long) page); |
876 | out_task: | ||
877 | put_task_struct(task); | ||
878 | out_no_task: | ||
879 | return copied; | 803 | return copied; |
880 | } | 804 | } |
881 | 805 | ||
@@ -895,11 +819,20 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig) | |||
895 | return file->f_pos; | 819 | return file->f_pos; |
896 | } | 820 | } |
897 | 821 | ||
822 | static int mem_release(struct inode *inode, struct file *file) | ||
823 | { | ||
824 | struct mm_struct *mm = file->private_data; | ||
825 | |||
826 | mmput(mm); | ||
827 | return 0; | ||
828 | } | ||
829 | |||
898 | static const struct file_operations proc_mem_operations = { | 830 | static const struct file_operations proc_mem_operations = { |
899 | .llseek = mem_lseek, | 831 | .llseek = mem_lseek, |
900 | .read = mem_read, | 832 | .read = mem_read, |
901 | .write = mem_write, | 833 | .write = mem_write, |
902 | .open = mem_open, | 834 | .open = mem_open, |
835 | .release = mem_release, | ||
903 | }; | 836 | }; |
904 | 837 | ||
905 | static ssize_t environ_read(struct file *file, char __user *buf, | 838 | static ssize_t environ_read(struct file *file, char __user *buf, |
@@ -1199,9 +1132,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, | |||
1199 | ssize_t length; | 1132 | ssize_t length; |
1200 | uid_t loginuid; | 1133 | uid_t loginuid; |
1201 | 1134 | ||
1202 | if (!capable(CAP_AUDIT_CONTROL)) | ||
1203 | return -EPERM; | ||
1204 | |||
1205 | rcu_read_lock(); | 1135 | rcu_read_lock(); |
1206 | if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { | 1136 | if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { |
1207 | rcu_read_unlock(); | 1137 | rcu_read_unlock(); |
@@ -1230,7 +1160,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, | |||
1230 | goto out_free_page; | 1160 | goto out_free_page; |
1231 | 1161 | ||
1232 | } | 1162 | } |
1233 | length = audit_set_loginuid(current, loginuid); | 1163 | length = audit_set_loginuid(loginuid); |
1234 | if (likely(length == 0)) | 1164 | if (likely(length == 0)) |
1235 | length = count; | 1165 | length = count; |
1236 | 1166 | ||
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 574d4ee9b625..74b9baf36ac3 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c | |||
@@ -111,8 +111,7 @@ xfs_ioend_new_eof( | |||
111 | xfs_fsize_t bsize; | 111 | xfs_fsize_t bsize; |
112 | 112 | ||
113 | bsize = ioend->io_offset + ioend->io_size; | 113 | bsize = ioend->io_offset + ioend->io_size; |
114 | isize = MAX(ip->i_size, ip->i_new_size); | 114 | isize = MIN(i_size_read(VFS_I(ip)), bsize); |
115 | isize = MIN(isize, bsize); | ||
116 | return isize > ip->i_d.di_size ? isize : 0; | 115 | return isize > ip->i_d.di_size ? isize : 0; |
117 | } | 116 | } |
118 | 117 | ||
@@ -126,11 +125,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) | |||
126 | } | 125 | } |
127 | 126 | ||
128 | /* | 127 | /* |
129 | * Update on-disk file size now that data has been written to disk. The | 128 | * Update on-disk file size now that data has been written to disk. |
130 | * current in-memory file size is i_size. If a write is beyond eof i_new_size | ||
131 | * will be the intended file size until i_size is updated. If this write does | ||
132 | * not extend all the way to the valid file size then restrict this update to | ||
133 | * the end of the write. | ||
134 | * | 129 | * |
135 | * This function does not block as blocking on the inode lock in IO completion | 130 | * This function does not block as blocking on the inode lock in IO completion |
136 | * can lead to IO completion order dependency deadlocks.. If it can't get the | 131 | * can lead to IO completion order dependency deadlocks.. If it can't get the |
@@ -1279,6 +1274,15 @@ xfs_end_io_direct_write( | |||
1279 | struct xfs_ioend *ioend = iocb->private; | 1274 | struct xfs_ioend *ioend = iocb->private; |
1280 | 1275 | ||
1281 | /* | 1276 | /* |
1277 | * While the generic direct I/O code updates the inode size, it does | ||
1278 | * so only after the end_io handler is called, which means our | ||
1279 | * end_io handler thinks the on-disk size is outside the in-core | ||
1280 | * size. To prevent this just update it a little bit earlier here. | ||
1281 | */ | ||
1282 | if (offset + size > i_size_read(ioend->io_inode)) | ||
1283 | i_size_write(ioend->io_inode, offset + size); | ||
1284 | |||
1285 | /* | ||
1282 | * blockdev_direct_IO can return an error even after the I/O | 1286 | * blockdev_direct_IO can return an error even after the I/O |
1283 | * completion handler was called. Thus we need to protect | 1287 | * completion handler was called. Thus we need to protect |
1284 | * against double-freeing. | 1288 | * against double-freeing. |
@@ -1340,12 +1344,11 @@ xfs_vm_write_failed( | |||
1340 | 1344 | ||
1341 | if (to > inode->i_size) { | 1345 | if (to > inode->i_size) { |
1342 | /* | 1346 | /* |
1343 | * punch out the delalloc blocks we have already allocated. We | 1347 | * Punch out the delalloc blocks we have already allocated. |
1344 | * don't call xfs_setattr() to do this as we may be in the | 1348 | * |
1345 | * middle of a multi-iovec write and so the vfs inode->i_size | 1349 | * Don't bother with xfs_setattr given that nothing can have |
1346 | * will not match the xfs ip->i_size and so it will zero too | 1350 | * made it to disk yet as the page is still locked at this |
1347 | * much. Hence we jus truncate the page cache to zero what is | 1351 | * point. |
1348 | * necessary and punch the delalloc blocks directly. | ||
1349 | */ | 1352 | */ |
1350 | struct xfs_inode *ip = XFS_I(inode); | 1353 | struct xfs_inode *ip = XFS_I(inode); |
1351 | xfs_fileoff_t start_fsb; | 1354 | xfs_fileoff_t start_fsb; |
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 1e5d97f86ea8..08b9ac644c31 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c | |||
@@ -827,10 +827,6 @@ xfs_attr_inactive(xfs_inode_t *dp) | |||
827 | if (error) | 827 | if (error) |
828 | goto out; | 828 | goto out; |
829 | 829 | ||
830 | /* | ||
831 | * Commit the last in the sequence of transactions. | ||
832 | */ | ||
833 | xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); | ||
834 | error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); | 830 | error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); |
835 | xfs_iunlock(dp, XFS_ILOCK_EXCL); | 831 | xfs_iunlock(dp, XFS_ILOCK_EXCL); |
836 | 832 | ||
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index c1b55e596551..d25eafd4d28d 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c | |||
@@ -271,10 +271,6 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) | |||
271 | dp = args->dp; | 271 | dp = args->dp; |
272 | mp = dp->i_mount; | 272 | mp = dp->i_mount; |
273 | dp->i_d.di_forkoff = forkoff; | 273 | dp->i_d.di_forkoff = forkoff; |
274 | dp->i_df.if_ext_max = | ||
275 | XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); | ||
276 | dp->i_afp->if_ext_max = | ||
277 | XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); | ||
278 | 274 | ||
279 | ifp = dp->i_afp; | 275 | ifp = dp->i_afp; |
280 | ASSERT(ifp->if_flags & XFS_IFINLINE); | 276 | ASSERT(ifp->if_flags & XFS_IFINLINE); |
@@ -326,7 +322,6 @@ xfs_attr_fork_reset( | |||
326 | ASSERT(ip->i_d.di_anextents == 0); | 322 | ASSERT(ip->i_d.di_anextents == 0); |
327 | ASSERT(ip->i_afp == NULL); | 323 | ASSERT(ip->i_afp == NULL); |
328 | 324 | ||
329 | ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); | ||
330 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | 325 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
331 | } | 326 | } |
332 | 327 | ||
@@ -389,10 +384,6 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) | |||
389 | (args->op_flags & XFS_DA_OP_ADDNAME) || | 384 | (args->op_flags & XFS_DA_OP_ADDNAME) || |
390 | !(mp->m_flags & XFS_MOUNT_ATTR2) || | 385 | !(mp->m_flags & XFS_MOUNT_ATTR2) || |
391 | dp->i_d.di_format == XFS_DINODE_FMT_BTREE); | 386 | dp->i_d.di_format == XFS_DINODE_FMT_BTREE); |
392 | dp->i_afp->if_ext_max = | ||
393 | XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); | ||
394 | dp->i_df.if_ext_max = | ||
395 | XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); | ||
396 | xfs_trans_log_inode(args->trans, dp, | 387 | xfs_trans_log_inode(args->trans, dp, |
397 | XFS_ILOG_CORE | XFS_ILOG_ADATA); | 388 | XFS_ILOG_CORE | XFS_ILOG_ADATA); |
398 | } | 389 | } |
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index d0ab78837057..188ef2fbd628 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c | |||
@@ -249,7 +249,27 @@ xfs_bmbt_lookup_ge( | |||
249 | } | 249 | } |
250 | 250 | ||
251 | /* | 251 | /* |
252 | * Update the record referred to by cur to the value given | 252 | * Check if the inode needs to be converted to btree format. |
253 | */ | ||
254 | static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) | ||
255 | { | ||
256 | return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && | ||
257 | XFS_IFORK_NEXTENTS(ip, whichfork) > | ||
258 | XFS_IFORK_MAXEXT(ip, whichfork); | ||
259 | } | ||
260 | |||
261 | /* | ||
262 | * Check if the inode should be converted to extent format. | ||
263 | */ | ||
264 | static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) | ||
265 | { | ||
266 | return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && | ||
267 | XFS_IFORK_NEXTENTS(ip, whichfork) <= | ||
268 | XFS_IFORK_MAXEXT(ip, whichfork); | ||
269 | } | ||
270 | |||
271 | /* | ||
272 | * Update the record referred to by cur to the value given | ||
253 | * by [off, bno, len, state]. | 273 | * by [off, bno, len, state]. |
254 | * This either works (return 0) or gets an EFSCORRUPTED error. | 274 | * This either works (return 0) or gets an EFSCORRUPTED error. |
255 | */ | 275 | */ |
@@ -683,8 +703,8 @@ xfs_bmap_add_extent_delay_real( | |||
683 | goto done; | 703 | goto done; |
684 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 704 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
685 | } | 705 | } |
686 | if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && | 706 | |
687 | bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { | 707 | if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { |
688 | error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, | 708 | error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, |
689 | bma->firstblock, bma->flist, | 709 | bma->firstblock, bma->flist, |
690 | &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); | 710 | &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); |
@@ -767,8 +787,8 @@ xfs_bmap_add_extent_delay_real( | |||
767 | goto done; | 787 | goto done; |
768 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 788 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
769 | } | 789 | } |
770 | if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && | 790 | |
771 | bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { | 791 | if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { |
772 | error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, | 792 | error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, |
773 | bma->firstblock, bma->flist, &bma->cur, 1, | 793 | bma->firstblock, bma->flist, &bma->cur, 1, |
774 | &tmp_rval, XFS_DATA_FORK); | 794 | &tmp_rval, XFS_DATA_FORK); |
@@ -836,8 +856,8 @@ xfs_bmap_add_extent_delay_real( | |||
836 | goto done; | 856 | goto done; |
837 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 857 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
838 | } | 858 | } |
839 | if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && | 859 | |
840 | bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { | 860 | if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { |
841 | error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, | 861 | error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, |
842 | bma->firstblock, bma->flist, &bma->cur, | 862 | bma->firstblock, bma->flist, &bma->cur, |
843 | 1, &tmp_rval, XFS_DATA_FORK); | 863 | 1, &tmp_rval, XFS_DATA_FORK); |
@@ -884,8 +904,7 @@ xfs_bmap_add_extent_delay_real( | |||
884 | } | 904 | } |
885 | 905 | ||
886 | /* convert to a btree if necessary */ | 906 | /* convert to a btree if necessary */ |
887 | if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && | 907 | if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { |
888 | XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) { | ||
889 | int tmp_logflags; /* partial log flag return val */ | 908 | int tmp_logflags; /* partial log flag return val */ |
890 | 909 | ||
891 | ASSERT(bma->cur == NULL); | 910 | ASSERT(bma->cur == NULL); |
@@ -1421,8 +1440,7 @@ xfs_bmap_add_extent_unwritten_real( | |||
1421 | } | 1440 | } |
1422 | 1441 | ||
1423 | /* convert to a btree if necessary */ | 1442 | /* convert to a btree if necessary */ |
1424 | if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && | 1443 | if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { |
1425 | XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { | ||
1426 | int tmp_logflags; /* partial log flag return val */ | 1444 | int tmp_logflags; /* partial log flag return val */ |
1427 | 1445 | ||
1428 | ASSERT(cur == NULL); | 1446 | ASSERT(cur == NULL); |
@@ -1812,8 +1830,7 @@ xfs_bmap_add_extent_hole_real( | |||
1812 | } | 1830 | } |
1813 | 1831 | ||
1814 | /* convert to a btree if necessary */ | 1832 | /* convert to a btree if necessary */ |
1815 | if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && | 1833 | if (xfs_bmap_needs_btree(bma->ip, whichfork)) { |
1816 | XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) { | ||
1817 | int tmp_logflags; /* partial log flag return val */ | 1834 | int tmp_logflags; /* partial log flag return val */ |
1818 | 1835 | ||
1819 | ASSERT(bma->cur == NULL); | 1836 | ASSERT(bma->cur == NULL); |
@@ -3037,8 +3054,7 @@ xfs_bmap_extents_to_btree( | |||
3037 | 3054 | ||
3038 | ifp = XFS_IFORK_PTR(ip, whichfork); | 3055 | ifp = XFS_IFORK_PTR(ip, whichfork); |
3039 | ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); | 3056 | ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); |
3040 | ASSERT(ifp->if_ext_max == | 3057 | |
3041 | XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); | ||
3042 | /* | 3058 | /* |
3043 | * Make space in the inode incore. | 3059 | * Make space in the inode incore. |
3044 | */ | 3060 | */ |
@@ -3184,13 +3200,8 @@ xfs_bmap_forkoff_reset( | |||
3184 | ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { | 3200 | ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { |
3185 | uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; | 3201 | uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; |
3186 | 3202 | ||
3187 | if (dfl_forkoff > ip->i_d.di_forkoff) { | 3203 | if (dfl_forkoff > ip->i_d.di_forkoff) |
3188 | ip->i_d.di_forkoff = dfl_forkoff; | 3204 | ip->i_d.di_forkoff = dfl_forkoff; |
3189 | ip->i_df.if_ext_max = | ||
3190 | XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); | ||
3191 | ip->i_afp->if_ext_max = | ||
3192 | XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t); | ||
3193 | } | ||
3194 | } | 3205 | } |
3195 | } | 3206 | } |
3196 | 3207 | ||
@@ -3430,8 +3441,6 @@ xfs_bmap_add_attrfork( | |||
3430 | int error; /* error return value */ | 3441 | int error; /* error return value */ |
3431 | 3442 | ||
3432 | ASSERT(XFS_IFORK_Q(ip) == 0); | 3443 | ASSERT(XFS_IFORK_Q(ip) == 0); |
3433 | ASSERT(ip->i_df.if_ext_max == | ||
3434 | XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); | ||
3435 | 3444 | ||
3436 | mp = ip->i_mount; | 3445 | mp = ip->i_mount; |
3437 | ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); | 3446 | ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); |
@@ -3486,12 +3495,9 @@ xfs_bmap_add_attrfork( | |||
3486 | error = XFS_ERROR(EINVAL); | 3495 | error = XFS_ERROR(EINVAL); |
3487 | goto error1; | 3496 | goto error1; |
3488 | } | 3497 | } |
3489 | ip->i_df.if_ext_max = | 3498 | |
3490 | XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); | ||
3491 | ASSERT(ip->i_afp == NULL); | 3499 | ASSERT(ip->i_afp == NULL); |
3492 | ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); | 3500 | ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); |
3493 | ip->i_afp->if_ext_max = | ||
3494 | XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); | ||
3495 | ip->i_afp->if_flags = XFS_IFEXTENTS; | 3501 | ip->i_afp->if_flags = XFS_IFEXTENTS; |
3496 | logflags = 0; | 3502 | logflags = 0; |
3497 | xfs_bmap_init(&flist, &firstblock); | 3503 | xfs_bmap_init(&flist, &firstblock); |
@@ -3535,20 +3541,17 @@ xfs_bmap_add_attrfork( | |||
3535 | } else | 3541 | } else |
3536 | spin_unlock(&mp->m_sb_lock); | 3542 | spin_unlock(&mp->m_sb_lock); |
3537 | } | 3543 | } |
3538 | if ((error = xfs_bmap_finish(&tp, &flist, &committed))) | 3544 | |
3545 | error = xfs_bmap_finish(&tp, &flist, &committed); | ||
3546 | if (error) | ||
3539 | goto error2; | 3547 | goto error2; |
3540 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); | 3548 | return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); |
3541 | ASSERT(ip->i_df.if_ext_max == | ||
3542 | XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); | ||
3543 | return error; | ||
3544 | error2: | 3549 | error2: |
3545 | xfs_bmap_cancel(&flist); | 3550 | xfs_bmap_cancel(&flist); |
3546 | error1: | 3551 | error1: |
3547 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 3552 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
3548 | error0: | 3553 | error0: |
3549 | xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); | 3554 | xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); |
3550 | ASSERT(ip->i_df.if_ext_max == | ||
3551 | XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); | ||
3552 | return error; | 3555 | return error; |
3553 | } | 3556 | } |
3554 | 3557 | ||
@@ -3994,11 +3997,8 @@ xfs_bmap_one_block( | |||
3994 | xfs_bmbt_irec_t s; /* internal version of extent */ | 3997 | xfs_bmbt_irec_t s; /* internal version of extent */ |
3995 | 3998 | ||
3996 | #ifndef DEBUG | 3999 | #ifndef DEBUG |
3997 | if (whichfork == XFS_DATA_FORK) { | 4000 | if (whichfork == XFS_DATA_FORK) |
3998 | return S_ISREG(ip->i_d.di_mode) ? | 4001 | return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; |
3999 | (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : | ||
4000 | (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); | ||
4001 | } | ||
4002 | #endif /* !DEBUG */ | 4002 | #endif /* !DEBUG */ |
4003 | if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) | 4003 | if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) |
4004 | return 0; | 4004 | return 0; |
@@ -4010,7 +4010,7 @@ xfs_bmap_one_block( | |||
4010 | xfs_bmbt_get_all(ep, &s); | 4010 | xfs_bmbt_get_all(ep, &s); |
4011 | rval = s.br_startoff == 0 && s.br_blockcount == 1; | 4011 | rval = s.br_startoff == 0 && s.br_blockcount == 1; |
4012 | if (rval && whichfork == XFS_DATA_FORK) | 4012 | if (rval && whichfork == XFS_DATA_FORK) |
4013 | ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize); | 4013 | ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); |
4014 | return rval; | 4014 | return rval; |
4015 | } | 4015 | } |
4016 | 4016 | ||
@@ -4379,8 +4379,6 @@ xfs_bmapi_read( | |||
4379 | XFS_STATS_INC(xs_blk_mapr); | 4379 | XFS_STATS_INC(xs_blk_mapr); |
4380 | 4380 | ||
4381 | ifp = XFS_IFORK_PTR(ip, whichfork); | 4381 | ifp = XFS_IFORK_PTR(ip, whichfork); |
4382 | ASSERT(ifp->if_ext_max == | ||
4383 | XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); | ||
4384 | 4382 | ||
4385 | if (!(ifp->if_flags & XFS_IFEXTENTS)) { | 4383 | if (!(ifp->if_flags & XFS_IFEXTENTS)) { |
4386 | error = xfs_iread_extents(NULL, ip, whichfork); | 4384 | error = xfs_iread_extents(NULL, ip, whichfork); |
@@ -4871,8 +4869,6 @@ xfs_bmapi_write( | |||
4871 | return XFS_ERROR(EIO); | 4869 | return XFS_ERROR(EIO); |
4872 | 4870 | ||
4873 | ifp = XFS_IFORK_PTR(ip, whichfork); | 4871 | ifp = XFS_IFORK_PTR(ip, whichfork); |
4874 | ASSERT(ifp->if_ext_max == | ||
4875 | XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); | ||
4876 | 4872 | ||
4877 | XFS_STATS_INC(xs_blk_mapw); | 4873 | XFS_STATS_INC(xs_blk_mapw); |
4878 | 4874 | ||
@@ -4981,8 +4977,7 @@ xfs_bmapi_write( | |||
4981 | /* | 4977 | /* |
4982 | * Transform from btree to extents, give it cur. | 4978 | * Transform from btree to extents, give it cur. |
4983 | */ | 4979 | */ |
4984 | if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && | 4980 | if (xfs_bmap_wants_extents(ip, whichfork)) { |
4985 | XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { | ||
4986 | int tmp_logflags = 0; | 4981 | int tmp_logflags = 0; |
4987 | 4982 | ||
4988 | ASSERT(bma.cur); | 4983 | ASSERT(bma.cur); |
@@ -4992,10 +4987,10 @@ xfs_bmapi_write( | |||
4992 | if (error) | 4987 | if (error) |
4993 | goto error0; | 4988 | goto error0; |
4994 | } | 4989 | } |
4995 | ASSERT(ifp->if_ext_max == | 4990 | |
4996 | XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); | ||
4997 | ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || | 4991 | ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || |
4998 | XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); | 4992 | XFS_IFORK_NEXTENTS(ip, whichfork) > |
4993 | XFS_IFORK_MAXEXT(ip, whichfork)); | ||
4999 | error = 0; | 4994 | error = 0; |
5000 | error0: | 4995 | error0: |
5001 | /* | 4996 | /* |
@@ -5095,8 +5090,7 @@ xfs_bunmapi( | |||
5095 | 5090 | ||
5096 | ASSERT(len > 0); | 5091 | ASSERT(len > 0); |
5097 | ASSERT(nexts >= 0); | 5092 | ASSERT(nexts >= 0); |
5098 | ASSERT(ifp->if_ext_max == | 5093 | |
5099 | XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); | ||
5100 | if (!(ifp->if_flags & XFS_IFEXTENTS) && | 5094 | if (!(ifp->if_flags & XFS_IFEXTENTS) && |
5101 | (error = xfs_iread_extents(tp, ip, whichfork))) | 5095 | (error = xfs_iread_extents(tp, ip, whichfork))) |
5102 | return error; | 5096 | return error; |
@@ -5322,7 +5316,8 @@ xfs_bunmapi( | |||
5322 | */ | 5316 | */ |
5323 | if (!wasdel && xfs_trans_get_block_res(tp) == 0 && | 5317 | if (!wasdel && xfs_trans_get_block_res(tp) == 0 && |
5324 | XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && | 5318 | XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && |
5325 | XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max && | 5319 | XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */ |
5320 | XFS_IFORK_MAXEXT(ip, whichfork) && | ||
5326 | del.br_startoff > got.br_startoff && | 5321 | del.br_startoff > got.br_startoff && |
5327 | del.br_startoff + del.br_blockcount < | 5322 | del.br_startoff + del.br_blockcount < |
5328 | got.br_startoff + got.br_blockcount) { | 5323 | got.br_startoff + got.br_blockcount) { |
@@ -5353,13 +5348,11 @@ nodelete: | |||
5353 | } | 5348 | } |
5354 | } | 5349 | } |
5355 | *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; | 5350 | *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; |
5356 | ASSERT(ifp->if_ext_max == | 5351 | |
5357 | XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); | ||
5358 | /* | 5352 | /* |
5359 | * Convert to a btree if necessary. | 5353 | * Convert to a btree if necessary. |
5360 | */ | 5354 | */ |
5361 | if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && | 5355 | if (xfs_bmap_needs_btree(ip, whichfork)) { |
5362 | XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { | ||
5363 | ASSERT(cur == NULL); | 5356 | ASSERT(cur == NULL); |
5364 | error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, | 5357 | error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, |
5365 | &cur, 0, &tmp_logflags, whichfork); | 5358 | &cur, 0, &tmp_logflags, whichfork); |
@@ -5370,8 +5363,7 @@ nodelete: | |||
5370 | /* | 5363 | /* |
5371 | * transform from btree to extents, give it cur | 5364 | * transform from btree to extents, give it cur |
5372 | */ | 5365 | */ |
5373 | else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && | 5366 | else if (xfs_bmap_wants_extents(ip, whichfork)) { |
5374 | XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { | ||
5375 | ASSERT(cur != NULL); | 5367 | ASSERT(cur != NULL); |
5376 | error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, | 5368 | error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, |
5377 | whichfork); | 5369 | whichfork); |
@@ -5382,8 +5374,6 @@ nodelete: | |||
5382 | /* | 5374 | /* |
5383 | * transform from extents to local? | 5375 | * transform from extents to local? |
5384 | */ | 5376 | */ |
5385 | ASSERT(ifp->if_ext_max == | ||
5386 | XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); | ||
5387 | error = 0; | 5377 | error = 0; |
5388 | error0: | 5378 | error0: |
5389 | /* | 5379 | /* |
@@ -5434,7 +5424,7 @@ xfs_getbmapx_fix_eof_hole( | |||
5434 | if (startblock == HOLESTARTBLOCK) { | 5424 | if (startblock == HOLESTARTBLOCK) { |
5435 | mp = ip->i_mount; | 5425 | mp = ip->i_mount; |
5436 | out->bmv_block = -1; | 5426 | out->bmv_block = -1; |
5437 | fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size)); | 5427 | fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); |
5438 | fixlen -= out->bmv_offset; | 5428 | fixlen -= out->bmv_offset; |
5439 | if (prealloced && out->bmv_offset + out->bmv_length == end) { | 5429 | if (prealloced && out->bmv_offset + out->bmv_length == end) { |
5440 | /* Came to hole at EOF. Trim it. */ | 5430 | /* Came to hole at EOF. Trim it. */ |
@@ -5522,7 +5512,7 @@ xfs_getbmap( | |||
5522 | fixlen = XFS_MAXIOFFSET(mp); | 5512 | fixlen = XFS_MAXIOFFSET(mp); |
5523 | } else { | 5513 | } else { |
5524 | prealloced = 0; | 5514 | prealloced = 0; |
5525 | fixlen = ip->i_size; | 5515 | fixlen = XFS_ISIZE(ip); |
5526 | } | 5516 | } |
5527 | } | 5517 | } |
5528 | 5518 | ||
@@ -5551,7 +5541,7 @@ xfs_getbmap( | |||
5551 | 5541 | ||
5552 | xfs_ilock(ip, XFS_IOLOCK_SHARED); | 5542 | xfs_ilock(ip, XFS_IOLOCK_SHARED); |
5553 | if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { | 5543 | if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { |
5554 | if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { | 5544 | if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { |
5555 | error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); | 5545 | error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); |
5556 | if (error) | 5546 | if (error) |
5557 | goto out_unlock_iolock; | 5547 | goto out_unlock_iolock; |
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 654dc6f05bac..dd974a55c77d 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c | |||
@@ -163,12 +163,14 @@ xfs_swap_extents_check_format( | |||
163 | 163 | ||
164 | /* Check temp in extent form to max in target */ | 164 | /* Check temp in extent form to max in target */ |
165 | if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && | 165 | if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && |
166 | XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) | 166 | XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > |
167 | XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) | ||
167 | return EINVAL; | 168 | return EINVAL; |
168 | 169 | ||
169 | /* Check target in extent form to max in temp */ | 170 | /* Check target in extent form to max in temp */ |
170 | if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && | 171 | if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && |
171 | XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) | 172 | XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > |
173 | XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) | ||
172 | return EINVAL; | 174 | return EINVAL; |
173 | 175 | ||
174 | /* | 176 | /* |
@@ -180,18 +182,25 @@ xfs_swap_extents_check_format( | |||
180 | * (a common defrag case) which will occur when the temp inode is in | 182 | * (a common defrag case) which will occur when the temp inode is in |
181 | * extent format... | 183 | * extent format... |
182 | */ | 184 | */ |
183 | if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && | 185 | if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { |
184 | ((XFS_IFORK_BOFF(ip) && | 186 | if (XFS_IFORK_BOFF(ip) && |
185 | tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) || | 187 | tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) |
186 | XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max)) | 188 | return EINVAL; |
187 | return EINVAL; | 189 | if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= |
190 | XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) | ||
191 | return EINVAL; | ||
192 | } | ||
188 | 193 | ||
189 | /* Reciprocal target->temp btree format checks */ | 194 | /* Reciprocal target->temp btree format checks */ |
190 | if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && | 195 | if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { |
191 | ((XFS_IFORK_BOFF(tip) && | 196 | if (XFS_IFORK_BOFF(tip) && |
192 | ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) || | 197 | ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) |
193 | XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max)) | 198 | return EINVAL; |
194 | return EINVAL; | 199 | |
200 | if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= | ||
201 | XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) | ||
202 | return EINVAL; | ||
203 | } | ||
195 | 204 | ||
196 | return 0; | 205 | return 0; |
197 | } | 206 | } |
@@ -349,16 +358,6 @@ xfs_swap_extents( | |||
349 | *tifp = *tempifp; /* struct copy */ | 358 | *tifp = *tempifp; /* struct copy */ |
350 | 359 | ||
351 | /* | 360 | /* |
352 | * Fix the in-memory data fork values that are dependent on the fork | ||
353 | * offset in the inode. We can't assume they remain the same as attr2 | ||
354 | * has dynamic fork offsets. | ||
355 | */ | ||
356 | ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) / | ||
357 | (uint)sizeof(xfs_bmbt_rec_t); | ||
358 | tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) / | ||
359 | (uint)sizeof(xfs_bmbt_rec_t); | ||
360 | |||
361 | /* | ||
362 | * Fix the on-disk inode values | 361 | * Fix the on-disk inode values |
363 | */ | 362 | */ |
364 | tmp = (__uint64_t)ip->i_d.di_nblocks; | 363 | tmp = (__uint64_t)ip->i_d.di_nblocks; |
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f675f3d9d7b3..7e5bc872f2b4 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c | |||
@@ -327,7 +327,7 @@ xfs_file_aio_read( | |||
327 | mp->m_rtdev_targp : mp->m_ddev_targp; | 327 | mp->m_rtdev_targp : mp->m_ddev_targp; |
328 | if ((iocb->ki_pos & target->bt_smask) || | 328 | if ((iocb->ki_pos & target->bt_smask) || |
329 | (size & target->bt_smask)) { | 329 | (size & target->bt_smask)) { |
330 | if (iocb->ki_pos == ip->i_size) | 330 | if (iocb->ki_pos == i_size_read(inode)) |
331 | return 0; | 331 | return 0; |
332 | return -XFS_ERROR(EINVAL); | 332 | return -XFS_ERROR(EINVAL); |
333 | } | 333 | } |
@@ -412,51 +412,6 @@ xfs_file_splice_read( | |||
412 | return ret; | 412 | return ret; |
413 | } | 413 | } |
414 | 414 | ||
415 | STATIC void | ||
416 | xfs_aio_write_isize_update( | ||
417 | struct inode *inode, | ||
418 | loff_t *ppos, | ||
419 | ssize_t bytes_written) | ||
420 | { | ||
421 | struct xfs_inode *ip = XFS_I(inode); | ||
422 | xfs_fsize_t isize = i_size_read(inode); | ||
423 | |||
424 | if (bytes_written > 0) | ||
425 | XFS_STATS_ADD(xs_write_bytes, bytes_written); | ||
426 | |||
427 | if (unlikely(bytes_written < 0 && bytes_written != -EFAULT && | ||
428 | *ppos > isize)) | ||
429 | *ppos = isize; | ||
430 | |||
431 | if (*ppos > ip->i_size) { | ||
432 | xfs_rw_ilock(ip, XFS_ILOCK_EXCL); | ||
433 | if (*ppos > ip->i_size) | ||
434 | ip->i_size = *ppos; | ||
435 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); | ||
436 | } | ||
437 | } | ||
438 | |||
439 | /* | ||
440 | * If this was a direct or synchronous I/O that failed (such as ENOSPC) then | ||
441 | * part of the I/O may have been written to disk before the error occurred. In | ||
442 | * this case the on-disk file size may have been adjusted beyond the in-memory | ||
443 | * file size and now needs to be truncated back. | ||
444 | */ | ||
445 | STATIC void | ||
446 | xfs_aio_write_newsize_update( | ||
447 | struct xfs_inode *ip, | ||
448 | xfs_fsize_t new_size) | ||
449 | { | ||
450 | if (new_size == ip->i_new_size) { | ||
451 | xfs_rw_ilock(ip, XFS_ILOCK_EXCL); | ||
452 | if (new_size == ip->i_new_size) | ||
453 | ip->i_new_size = 0; | ||
454 | if (ip->i_d.di_size > ip->i_size) | ||
455 | ip->i_d.di_size = ip->i_size; | ||
456 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); | ||
457 | } | ||
458 | } | ||
459 | |||
460 | /* | 415 | /* |
461 | * xfs_file_splice_write() does not use xfs_rw_ilock() because | 416 | * xfs_file_splice_write() does not use xfs_rw_ilock() because |
462 | * generic_file_splice_write() takes the i_mutex itself. This, in theory, | 417 | * generic_file_splice_write() takes the i_mutex itself. This, in theory, |
@@ -475,7 +430,6 @@ xfs_file_splice_write( | |||
475 | { | 430 | { |
476 | struct inode *inode = outfilp->f_mapping->host; | 431 | struct inode *inode = outfilp->f_mapping->host; |
477 | struct xfs_inode *ip = XFS_I(inode); | 432 | struct xfs_inode *ip = XFS_I(inode); |
478 | xfs_fsize_t new_size; | ||
479 | int ioflags = 0; | 433 | int ioflags = 0; |
480 | ssize_t ret; | 434 | ssize_t ret; |
481 | 435 | ||
@@ -489,19 +443,12 @@ xfs_file_splice_write( | |||
489 | 443 | ||
490 | xfs_ilock(ip, XFS_IOLOCK_EXCL); | 444 | xfs_ilock(ip, XFS_IOLOCK_EXCL); |
491 | 445 | ||
492 | new_size = *ppos + count; | ||
493 | |||
494 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
495 | if (new_size > ip->i_size) | ||
496 | ip->i_new_size = new_size; | ||
497 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
498 | |||
499 | trace_xfs_file_splice_write(ip, count, *ppos, ioflags); | 446 | trace_xfs_file_splice_write(ip, count, *ppos, ioflags); |
500 | 447 | ||
501 | ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); | 448 | ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); |
449 | if (ret > 0) | ||
450 | XFS_STATS_ADD(xs_write_bytes, ret); | ||
502 | 451 | ||
503 | xfs_aio_write_isize_update(inode, ppos, ret); | ||
504 | xfs_aio_write_newsize_update(ip, new_size); | ||
505 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); | 452 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); |
506 | return ret; | 453 | return ret; |
507 | } | 454 | } |
@@ -689,28 +636,26 @@ out_lock: | |||
689 | /* | 636 | /* |
690 | * Common pre-write limit and setup checks. | 637 | * Common pre-write limit and setup checks. |
691 | * | 638 | * |
692 | * Returns with iolock held according to @iolock. | 639 | * Called with the iolocked held either shared and exclusive according to |
640 | * @iolock, and returns with it held. Might upgrade the iolock to exclusive | ||
641 | * if called for a direct write beyond i_size. | ||
693 | */ | 642 | */ |
694 | STATIC ssize_t | 643 | STATIC ssize_t |
695 | xfs_file_aio_write_checks( | 644 | xfs_file_aio_write_checks( |
696 | struct file *file, | 645 | struct file *file, |
697 | loff_t *pos, | 646 | loff_t *pos, |
698 | size_t *count, | 647 | size_t *count, |
699 | xfs_fsize_t *new_sizep, | ||
700 | int *iolock) | 648 | int *iolock) |
701 | { | 649 | { |
702 | struct inode *inode = file->f_mapping->host; | 650 | struct inode *inode = file->f_mapping->host; |
703 | struct xfs_inode *ip = XFS_I(inode); | 651 | struct xfs_inode *ip = XFS_I(inode); |
704 | xfs_fsize_t new_size; | ||
705 | int error = 0; | 652 | int error = 0; |
706 | 653 | ||
707 | xfs_rw_ilock(ip, XFS_ILOCK_EXCL); | 654 | xfs_rw_ilock(ip, XFS_ILOCK_EXCL); |
708 | *new_sizep = 0; | ||
709 | restart: | 655 | restart: |
710 | error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); | 656 | error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); |
711 | if (error) { | 657 | if (error) { |
712 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); | 658 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); |
713 | *iolock = 0; | ||
714 | return error; | 659 | return error; |
715 | } | 660 | } |
716 | 661 | ||
@@ -720,36 +665,21 @@ restart: | |||
720 | /* | 665 | /* |
721 | * If the offset is beyond the size of the file, we need to zero any | 666 | * If the offset is beyond the size of the file, we need to zero any |
722 | * blocks that fall between the existing EOF and the start of this | 667 | * blocks that fall between the existing EOF and the start of this |
723 | * write. There is no need to issue zeroing if another in-flght IO ends | 668 | * write. If zeroing is needed and we are currently holding the |
724 | * at or before this one If zeronig is needed and we are currently | 669 | * iolock shared, we need to update it to exclusive which involves |
725 | * holding the iolock shared, we need to update it to exclusive which | 670 | * dropping all locks and relocking to maintain correct locking order. |
726 | * involves dropping all locks and relocking to maintain correct locking | 671 | * If we do this, restart the function to ensure all checks and values |
727 | * order. If we do this, restart the function to ensure all checks and | 672 | * are still valid. |
728 | * values are still valid. | ||
729 | */ | 673 | */ |
730 | if ((ip->i_new_size && *pos > ip->i_new_size) || | 674 | if (*pos > i_size_read(inode)) { |
731 | (!ip->i_new_size && *pos > ip->i_size)) { | ||
732 | if (*iolock == XFS_IOLOCK_SHARED) { | 675 | if (*iolock == XFS_IOLOCK_SHARED) { |
733 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); | 676 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); |
734 | *iolock = XFS_IOLOCK_EXCL; | 677 | *iolock = XFS_IOLOCK_EXCL; |
735 | xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); | 678 | xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); |
736 | goto restart; | 679 | goto restart; |
737 | } | 680 | } |
738 | error = -xfs_zero_eof(ip, *pos, ip->i_size); | 681 | error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); |
739 | } | 682 | } |
740 | |||
741 | /* | ||
742 | * If this IO extends beyond EOF, we may need to update ip->i_new_size. | ||
743 | * We have already zeroed space beyond EOF (if necessary). Only update | ||
744 | * ip->i_new_size if this IO ends beyond any other in-flight writes. | ||
745 | */ | ||
746 | new_size = *pos + *count; | ||
747 | if (new_size > ip->i_size) { | ||
748 | if (new_size > ip->i_new_size) | ||
749 | ip->i_new_size = new_size; | ||
750 | *new_sizep = new_size; | ||
751 | } | ||
752 | |||
753 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); | 683 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); |
754 | if (error) | 684 | if (error) |
755 | return error; | 685 | return error; |
@@ -794,9 +724,7 @@ xfs_file_dio_aio_write( | |||
794 | const struct iovec *iovp, | 724 | const struct iovec *iovp, |
795 | unsigned long nr_segs, | 725 | unsigned long nr_segs, |
796 | loff_t pos, | 726 | loff_t pos, |
797 | size_t ocount, | 727 | size_t ocount) |
798 | xfs_fsize_t *new_size, | ||
799 | int *iolock) | ||
800 | { | 728 | { |
801 | struct file *file = iocb->ki_filp; | 729 | struct file *file = iocb->ki_filp; |
802 | struct address_space *mapping = file->f_mapping; | 730 | struct address_space *mapping = file->f_mapping; |
@@ -806,10 +734,10 @@ xfs_file_dio_aio_write( | |||
806 | ssize_t ret = 0; | 734 | ssize_t ret = 0; |
807 | size_t count = ocount; | 735 | size_t count = ocount; |
808 | int unaligned_io = 0; | 736 | int unaligned_io = 0; |
737 | int iolock; | ||
809 | struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? | 738 | struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? |
810 | mp->m_rtdev_targp : mp->m_ddev_targp; | 739 | mp->m_rtdev_targp : mp->m_ddev_targp; |
811 | 740 | ||
812 | *iolock = 0; | ||
813 | if ((pos & target->bt_smask) || (count & target->bt_smask)) | 741 | if ((pos & target->bt_smask) || (count & target->bt_smask)) |
814 | return -XFS_ERROR(EINVAL); | 742 | return -XFS_ERROR(EINVAL); |
815 | 743 | ||
@@ -824,31 +752,31 @@ xfs_file_dio_aio_write( | |||
824 | * EOF zeroing cases and fill out the new inode size as appropriate. | 752 | * EOF zeroing cases and fill out the new inode size as appropriate. |
825 | */ | 753 | */ |
826 | if (unaligned_io || mapping->nrpages) | 754 | if (unaligned_io || mapping->nrpages) |
827 | *iolock = XFS_IOLOCK_EXCL; | 755 | iolock = XFS_IOLOCK_EXCL; |
828 | else | 756 | else |
829 | *iolock = XFS_IOLOCK_SHARED; | 757 | iolock = XFS_IOLOCK_SHARED; |
830 | xfs_rw_ilock(ip, *iolock); | 758 | xfs_rw_ilock(ip, iolock); |
831 | 759 | ||
832 | /* | 760 | /* |
833 | * Recheck if there are cached pages that need invalidate after we got | 761 | * Recheck if there are cached pages that need invalidate after we got |
834 | * the iolock to protect against other threads adding new pages while | 762 | * the iolock to protect against other threads adding new pages while |
835 | * we were waiting for the iolock. | 763 | * we were waiting for the iolock. |
836 | */ | 764 | */ |
837 | if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { | 765 | if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { |
838 | xfs_rw_iunlock(ip, *iolock); | 766 | xfs_rw_iunlock(ip, iolock); |
839 | *iolock = XFS_IOLOCK_EXCL; | 767 | iolock = XFS_IOLOCK_EXCL; |
840 | xfs_rw_ilock(ip, *iolock); | 768 | xfs_rw_ilock(ip, iolock); |
841 | } | 769 | } |
842 | 770 | ||
843 | ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); | 771 | ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); |
844 | if (ret) | 772 | if (ret) |
845 | return ret; | 773 | goto out; |
846 | 774 | ||
847 | if (mapping->nrpages) { | 775 | if (mapping->nrpages) { |
848 | ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, | 776 | ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, |
849 | FI_REMAPF_LOCKED); | 777 | FI_REMAPF_LOCKED); |
850 | if (ret) | 778 | if (ret) |
851 | return ret; | 779 | goto out; |
852 | } | 780 | } |
853 | 781 | ||
854 | /* | 782 | /* |
@@ -857,15 +785,18 @@ xfs_file_dio_aio_write( | |||
857 | */ | 785 | */ |
858 | if (unaligned_io) | 786 | if (unaligned_io) |
859 | inode_dio_wait(inode); | 787 | inode_dio_wait(inode); |
860 | else if (*iolock == XFS_IOLOCK_EXCL) { | 788 | else if (iolock == XFS_IOLOCK_EXCL) { |
861 | xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); | 789 | xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); |
862 | *iolock = XFS_IOLOCK_SHARED; | 790 | iolock = XFS_IOLOCK_SHARED; |
863 | } | 791 | } |
864 | 792 | ||
865 | trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); | 793 | trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); |
866 | ret = generic_file_direct_write(iocb, iovp, | 794 | ret = generic_file_direct_write(iocb, iovp, |
867 | &nr_segs, pos, &iocb->ki_pos, count, ocount); | 795 | &nr_segs, pos, &iocb->ki_pos, count, ocount); |
868 | 796 | ||
797 | out: | ||
798 | xfs_rw_iunlock(ip, iolock); | ||
799 | |||
869 | /* No fallback to buffered IO on errors for XFS. */ | 800 | /* No fallback to buffered IO on errors for XFS. */ |
870 | ASSERT(ret < 0 || ret == count); | 801 | ASSERT(ret < 0 || ret == count); |
871 | return ret; | 802 | return ret; |
@@ -877,9 +808,7 @@ xfs_file_buffered_aio_write( | |||
877 | const struct iovec *iovp, | 808 | const struct iovec *iovp, |
878 | unsigned long nr_segs, | 809 | unsigned long nr_segs, |
879 | loff_t pos, | 810 | loff_t pos, |
880 | size_t ocount, | 811 | size_t ocount) |
881 | xfs_fsize_t *new_size, | ||
882 | int *iolock) | ||
883 | { | 812 | { |
884 | struct file *file = iocb->ki_filp; | 813 | struct file *file = iocb->ki_filp; |
885 | struct address_space *mapping = file->f_mapping; | 814 | struct address_space *mapping = file->f_mapping; |
@@ -887,14 +816,14 @@ xfs_file_buffered_aio_write( | |||
887 | struct xfs_inode *ip = XFS_I(inode); | 816 | struct xfs_inode *ip = XFS_I(inode); |
888 | ssize_t ret; | 817 | ssize_t ret; |
889 | int enospc = 0; | 818 | int enospc = 0; |
819 | int iolock = XFS_IOLOCK_EXCL; | ||
890 | size_t count = ocount; | 820 | size_t count = ocount; |
891 | 821 | ||
892 | *iolock = XFS_IOLOCK_EXCL; | 822 | xfs_rw_ilock(ip, iolock); |
893 | xfs_rw_ilock(ip, *iolock); | ||
894 | 823 | ||
895 | ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); | 824 | ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); |
896 | if (ret) | 825 | if (ret) |
897 | return ret; | 826 | goto out; |
898 | 827 | ||
899 | /* We can write back this queue in page reclaim */ | 828 | /* We can write back this queue in page reclaim */ |
900 | current->backing_dev_info = mapping->backing_dev_info; | 829 | current->backing_dev_info = mapping->backing_dev_info; |
@@ -908,13 +837,15 @@ write_retry: | |||
908 | * page locks and retry *once* | 837 | * page locks and retry *once* |
909 | */ | 838 | */ |
910 | if (ret == -ENOSPC && !enospc) { | 839 | if (ret == -ENOSPC && !enospc) { |
911 | ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); | ||
912 | if (ret) | ||
913 | return ret; | ||
914 | enospc = 1; | 840 | enospc = 1; |
915 | goto write_retry; | 841 | ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); |
842 | if (!ret) | ||
843 | goto write_retry; | ||
916 | } | 844 | } |
845 | |||
917 | current->backing_dev_info = NULL; | 846 | current->backing_dev_info = NULL; |
847 | out: | ||
848 | xfs_rw_iunlock(ip, iolock); | ||
918 | return ret; | 849 | return ret; |
919 | } | 850 | } |
920 | 851 | ||
@@ -930,9 +861,7 @@ xfs_file_aio_write( | |||
930 | struct inode *inode = mapping->host; | 861 | struct inode *inode = mapping->host; |
931 | struct xfs_inode *ip = XFS_I(inode); | 862 | struct xfs_inode *ip = XFS_I(inode); |
932 | ssize_t ret; | 863 | ssize_t ret; |
933 | int iolock; | ||
934 | size_t ocount = 0; | 864 | size_t ocount = 0; |
935 | xfs_fsize_t new_size = 0; | ||
936 | 865 | ||
937 | XFS_STATS_INC(xs_write_calls); | 866 | XFS_STATS_INC(xs_write_calls); |
938 | 867 | ||
@@ -951,33 +880,22 @@ xfs_file_aio_write( | |||
951 | return -EIO; | 880 | return -EIO; |
952 | 881 | ||
953 | if (unlikely(file->f_flags & O_DIRECT)) | 882 | if (unlikely(file->f_flags & O_DIRECT)) |
954 | ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, | 883 | ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); |
955 | ocount, &new_size, &iolock); | ||
956 | else | 884 | else |
957 | ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, | 885 | ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, |
958 | ocount, &new_size, &iolock); | 886 | ocount); |
959 | |||
960 | xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); | ||
961 | 887 | ||
962 | if (ret <= 0) | 888 | if (ret > 0) { |
963 | goto out_unlock; | 889 | ssize_t err; |
964 | 890 | ||
965 | /* Handle various SYNC-type writes */ | 891 | XFS_STATS_ADD(xs_write_bytes, ret); |
966 | if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { | ||
967 | loff_t end = pos + ret - 1; | ||
968 | int error; | ||
969 | 892 | ||
970 | xfs_rw_iunlock(ip, iolock); | 893 | /* Handle various SYNC-type writes */ |
971 | error = xfs_file_fsync(file, pos, end, | 894 | err = generic_write_sync(file, pos, ret); |
972 | (file->f_flags & __O_SYNC) ? 0 : 1); | 895 | if (err < 0) |
973 | xfs_rw_ilock(ip, iolock); | 896 | ret = err; |
974 | if (error) | ||
975 | ret = error; | ||
976 | } | 897 | } |
977 | 898 | ||
978 | out_unlock: | ||
979 | xfs_aio_write_newsize_update(ip, new_size); | ||
980 | xfs_rw_iunlock(ip, iolock); | ||
981 | return ret; | 899 | return ret; |
982 | } | 900 | } |
983 | 901 | ||
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index ed88ed16811c..652b875a9d4c 100644 --- a/fs/xfs/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c | |||
@@ -90,7 +90,7 @@ xfs_wait_on_pages( | |||
90 | 90 | ||
91 | if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { | 91 | if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { |
92 | return -filemap_fdatawait_range(mapping, first, | 92 | return -filemap_fdatawait_range(mapping, first, |
93 | last == -1 ? ip->i_size - 1 : last); | 93 | last == -1 ? XFS_ISIZE(ip) - 1 : last); |
94 | } | 94 | } |
95 | return 0; | 95 | return 0; |
96 | } | 96 | } |
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 3960a066d7ff..8c3e46394d48 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c | |||
@@ -77,7 +77,7 @@ xfs_inode_alloc( | |||
77 | 77 | ||
78 | ASSERT(atomic_read(&ip->i_pincount) == 0); | 78 | ASSERT(atomic_read(&ip->i_pincount) == 0); |
79 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); | 79 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); |
80 | ASSERT(completion_done(&ip->i_flush)); | 80 | ASSERT(!xfs_isiflocked(ip)); |
81 | ASSERT(ip->i_ino == 0); | 81 | ASSERT(ip->i_ino == 0); |
82 | 82 | ||
83 | mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); | 83 | mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); |
@@ -94,8 +94,6 @@ xfs_inode_alloc( | |||
94 | ip->i_update_core = 0; | 94 | ip->i_update_core = 0; |
95 | ip->i_delayed_blks = 0; | 95 | ip->i_delayed_blks = 0; |
96 | memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); | 96 | memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); |
97 | ip->i_size = 0; | ||
98 | ip->i_new_size = 0; | ||
99 | 97 | ||
100 | return ip; | 98 | return ip; |
101 | } | 99 | } |
@@ -150,7 +148,7 @@ xfs_inode_free( | |||
150 | /* asserts to verify all state is correct here */ | 148 | /* asserts to verify all state is correct here */ |
151 | ASSERT(atomic_read(&ip->i_pincount) == 0); | 149 | ASSERT(atomic_read(&ip->i_pincount) == 0); |
152 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); | 150 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); |
153 | ASSERT(completion_done(&ip->i_flush)); | 151 | ASSERT(!xfs_isiflocked(ip)); |
154 | 152 | ||
155 | /* | 153 | /* |
156 | * Because we use RCU freeing we need to ensure the inode always | 154 | * Because we use RCU freeing we need to ensure the inode always |
@@ -450,8 +448,6 @@ again: | |||
450 | 448 | ||
451 | *ipp = ip; | 449 | *ipp = ip; |
452 | 450 | ||
453 | ASSERT(ip->i_df.if_ext_max == | ||
454 | XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t)); | ||
455 | /* | 451 | /* |
456 | * If we have a real type for an on-disk inode, we can set ops(&unlock) | 452 | * If we have a real type for an on-disk inode, we can set ops(&unlock) |
457 | * now. If it's a new inode being created, xfs_ialloc will handle it. | 453 | * now. If it's a new inode being created, xfs_ialloc will handle it. |
@@ -715,3 +711,19 @@ xfs_isilocked( | |||
715 | return 0; | 711 | return 0; |
716 | } | 712 | } |
717 | #endif | 713 | #endif |
714 | |||
715 | void | ||
716 | __xfs_iflock( | ||
717 | struct xfs_inode *ip) | ||
718 | { | ||
719 | wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); | ||
720 | DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); | ||
721 | |||
722 | do { | ||
723 | prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); | ||
724 | if (xfs_isiflocked(ip)) | ||
725 | io_schedule(); | ||
726 | } while (!xfs_iflock_nowait(ip)); | ||
727 | |||
728 | finish_wait(wq, &wait.wait); | ||
729 | } | ||
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9dda7cc32848..b21022499c2e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c | |||
@@ -299,11 +299,8 @@ xfs_iformat( | |||
299 | { | 299 | { |
300 | xfs_attr_shortform_t *atp; | 300 | xfs_attr_shortform_t *atp; |
301 | int size; | 301 | int size; |
302 | int error; | 302 | int error = 0; |
303 | xfs_fsize_t di_size; | 303 | xfs_fsize_t di_size; |
304 | ip->i_df.if_ext_max = | ||
305 | XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); | ||
306 | error = 0; | ||
307 | 304 | ||
308 | if (unlikely(be32_to_cpu(dip->di_nextents) + | 305 | if (unlikely(be32_to_cpu(dip->di_nextents) + |
309 | be16_to_cpu(dip->di_anextents) > | 306 | be16_to_cpu(dip->di_anextents) > |
@@ -350,7 +347,6 @@ xfs_iformat( | |||
350 | return XFS_ERROR(EFSCORRUPTED); | 347 | return XFS_ERROR(EFSCORRUPTED); |
351 | } | 348 | } |
352 | ip->i_d.di_size = 0; | 349 | ip->i_d.di_size = 0; |
353 | ip->i_size = 0; | ||
354 | ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); | 350 | ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); |
355 | break; | 351 | break; |
356 | 352 | ||
@@ -409,10 +405,10 @@ xfs_iformat( | |||
409 | } | 405 | } |
410 | if (!XFS_DFORK_Q(dip)) | 406 | if (!XFS_DFORK_Q(dip)) |
411 | return 0; | 407 | return 0; |
408 | |||
412 | ASSERT(ip->i_afp == NULL); | 409 | ASSERT(ip->i_afp == NULL); |
413 | ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); | 410 | ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); |
414 | ip->i_afp->if_ext_max = | 411 | |
415 | XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); | ||
416 | switch (dip->di_aformat) { | 412 | switch (dip->di_aformat) { |
417 | case XFS_DINODE_FMT_LOCAL: | 413 | case XFS_DINODE_FMT_LOCAL: |
418 | atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); | 414 | atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); |
@@ -604,10 +600,11 @@ xfs_iformat_btree( | |||
604 | * or the number of extents is greater than the number of | 600 | * or the number of extents is greater than the number of |
605 | * blocks. | 601 | * blocks. |
606 | */ | 602 | */ |
607 | if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max | 603 | if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= |
608 | || XFS_BMDR_SPACE_CALC(nrecs) > | 604 | XFS_IFORK_MAXEXT(ip, whichfork) || |
609 | XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) | 605 | XFS_BMDR_SPACE_CALC(nrecs) > |
610 | || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { | 606 | XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) || |
607 | XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { | ||
611 | xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", | 608 | xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", |
612 | (unsigned long long) ip->i_ino); | 609 | (unsigned long long) ip->i_ino); |
613 | XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, | 610 | XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, |
@@ -835,12 +832,6 @@ xfs_iread( | |||
835 | * with the uninitialized part of it. | 832 | * with the uninitialized part of it. |
836 | */ | 833 | */ |
837 | ip->i_d.di_mode = 0; | 834 | ip->i_d.di_mode = 0; |
838 | /* | ||
839 | * Initialize the per-fork minima and maxima for a new | ||
840 | * inode here. xfs_iformat will do it for old inodes. | ||
841 | */ | ||
842 | ip->i_df.if_ext_max = | ||
843 | XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); | ||
844 | } | 835 | } |
845 | 836 | ||
846 | /* | 837 | /* |
@@ -861,7 +852,6 @@ xfs_iread( | |||
861 | } | 852 | } |
862 | 853 | ||
863 | ip->i_delayed_blks = 0; | 854 | ip->i_delayed_blks = 0; |
864 | ip->i_size = ip->i_d.di_size; | ||
865 | 855 | ||
866 | /* | 856 | /* |
867 | * Mark the buffer containing the inode as something to keep | 857 | * Mark the buffer containing the inode as something to keep |
@@ -1051,7 +1041,6 @@ xfs_ialloc( | |||
1051 | } | 1041 | } |
1052 | 1042 | ||
1053 | ip->i_d.di_size = 0; | 1043 | ip->i_d.di_size = 0; |
1054 | ip->i_size = 0; | ||
1055 | ip->i_d.di_nextents = 0; | 1044 | ip->i_d.di_nextents = 0; |
1056 | ASSERT(ip->i_d.di_nblocks == 0); | 1045 | ASSERT(ip->i_d.di_nblocks == 0); |
1057 | 1046 | ||
@@ -1166,52 +1155,6 @@ xfs_ialloc( | |||
1166 | } | 1155 | } |
1167 | 1156 | ||
1168 | /* | 1157 | /* |
1169 | * Check to make sure that there are no blocks allocated to the | ||
1170 | * file beyond the size of the file. We don't check this for | ||
1171 | * files with fixed size extents or real time extents, but we | ||
1172 | * at least do it for regular files. | ||
1173 | */ | ||
1174 | #ifdef DEBUG | ||
1175 | STATIC void | ||
1176 | xfs_isize_check( | ||
1177 | struct xfs_inode *ip, | ||
1178 | xfs_fsize_t isize) | ||
1179 | { | ||
1180 | struct xfs_mount *mp = ip->i_mount; | ||
1181 | xfs_fileoff_t map_first; | ||
1182 | int nimaps; | ||
1183 | xfs_bmbt_irec_t imaps[2]; | ||
1184 | int error; | ||
1185 | |||
1186 | if (!S_ISREG(ip->i_d.di_mode)) | ||
1187 | return; | ||
1188 | |||
1189 | if (XFS_IS_REALTIME_INODE(ip)) | ||
1190 | return; | ||
1191 | |||
1192 | if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) | ||
1193 | return; | ||
1194 | |||
1195 | nimaps = 2; | ||
1196 | map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); | ||
1197 | /* | ||
1198 | * The filesystem could be shutting down, so bmapi may return | ||
1199 | * an error. | ||
1200 | */ | ||
1201 | error = xfs_bmapi_read(ip, map_first, | ||
1202 | (XFS_B_TO_FSB(mp, | ||
1203 | (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first), | ||
1204 | imaps, &nimaps, XFS_BMAPI_ENTIRE); | ||
1205 | if (error) | ||
1206 | return; | ||
1207 | ASSERT(nimaps == 1); | ||
1208 | ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); | ||
1209 | } | ||
1210 | #else /* DEBUG */ | ||
1211 | #define xfs_isize_check(ip, isize) | ||
1212 | #endif /* DEBUG */ | ||
1213 | |||
1214 | /* | ||
1215 | * Free up the underlying blocks past new_size. The new size must be smaller | 1158 | * Free up the underlying blocks past new_size. The new size must be smaller |
1216 | * than the current size. This routine can be used both for the attribute and | 1159 | * than the current size. This routine can be used both for the attribute and |
1217 | * data fork, and does not modify the inode size, which is left to the caller. | 1160 | * data fork, and does not modify the inode size, which is left to the caller. |
@@ -1252,12 +1195,14 @@ xfs_itruncate_extents( | |||
1252 | int done = 0; | 1195 | int done = 0; |
1253 | 1196 | ||
1254 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); | 1197 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); |
1255 | ASSERT(new_size <= ip->i_size); | 1198 | ASSERT(new_size <= XFS_ISIZE(ip)); |
1256 | ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); | 1199 | ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); |
1257 | ASSERT(ip->i_itemp != NULL); | 1200 | ASSERT(ip->i_itemp != NULL); |
1258 | ASSERT(ip->i_itemp->ili_lock_flags == 0); | 1201 | ASSERT(ip->i_itemp->ili_lock_flags == 0); |
1259 | ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); | 1202 | ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); |
1260 | 1203 | ||
1204 | trace_xfs_itruncate_extents_start(ip, new_size); | ||
1205 | |||
1261 | /* | 1206 | /* |
1262 | * Since it is possible for space to become allocated beyond | 1207 | * Since it is possible for space to become allocated beyond |
1263 | * the end of the file (in a crash where the space is allocated | 1208 | * the end of the file (in a crash where the space is allocated |
@@ -1325,6 +1270,14 @@ xfs_itruncate_extents( | |||
1325 | goto out; | 1270 | goto out; |
1326 | } | 1271 | } |
1327 | 1272 | ||
1273 | /* | ||
1274 | * Always re-log the inode so that our permanent transaction can keep | ||
1275 | * on rolling it forward in the log. | ||
1276 | */ | ||
1277 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | ||
1278 | |||
1279 | trace_xfs_itruncate_extents_end(ip, new_size); | ||
1280 | |||
1328 | out: | 1281 | out: |
1329 | *tpp = tp; | 1282 | *tpp = tp; |
1330 | return error; | 1283 | return error; |
@@ -1338,74 +1291,6 @@ out_bmap_cancel: | |||
1338 | goto out; | 1291 | goto out; |
1339 | } | 1292 | } |
1340 | 1293 | ||
1341 | int | ||
1342 | xfs_itruncate_data( | ||
1343 | struct xfs_trans **tpp, | ||
1344 | struct xfs_inode *ip, | ||
1345 | xfs_fsize_t new_size) | ||
1346 | { | ||
1347 | int error; | ||
1348 | |||
1349 | trace_xfs_itruncate_data_start(ip, new_size); | ||
1350 | |||
1351 | /* | ||
1352 | * The first thing we do is set the size to new_size permanently on | ||
1353 | * disk. This way we don't have to worry about anyone ever being able | ||
1354 | * to look at the data being freed even in the face of a crash. | ||
1355 | * What we're getting around here is the case where we free a block, it | ||
1356 | * is allocated to another file, it is written to, and then we crash. | ||
1357 | * If the new data gets written to the file but the log buffers | ||
1358 | * containing the free and reallocation don't, then we'd end up with | ||
1359 | * garbage in the blocks being freed. As long as we make the new_size | ||
1360 | * permanent before actually freeing any blocks it doesn't matter if | ||
1361 | * they get written to. | ||
1362 | */ | ||
1363 | if (ip->i_d.di_nextents > 0) { | ||
1364 | /* | ||
1365 | * If we are not changing the file size then do not update | ||
1366 | * the on-disk file size - we may be called from | ||
1367 | * xfs_inactive_free_eofblocks(). If we update the on-disk | ||
1368 | * file size and then the system crashes before the contents | ||
1369 | * of the file are flushed to disk then the files may be | ||
1370 | * full of holes (ie NULL files bug). | ||
1371 | */ | ||
1372 | if (ip->i_size != new_size) { | ||
1373 | ip->i_d.di_size = new_size; | ||
1374 | ip->i_size = new_size; | ||
1375 | xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); | ||
1376 | } | ||
1377 | } | ||
1378 | |||
1379 | error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size); | ||
1380 | if (error) | ||
1381 | return error; | ||
1382 | |||
1383 | /* | ||
1384 | * If we are not changing the file size then do not update the on-disk | ||
1385 | * file size - we may be called from xfs_inactive_free_eofblocks(). | ||
1386 | * If we update the on-disk file size and then the system crashes | ||
1387 | * before the contents of the file are flushed to disk then the files | ||
1388 | * may be full of holes (ie NULL files bug). | ||
1389 | */ | ||
1390 | xfs_isize_check(ip, new_size); | ||
1391 | if (ip->i_size != new_size) { | ||
1392 | ip->i_d.di_size = new_size; | ||
1393 | ip->i_size = new_size; | ||
1394 | } | ||
1395 | |||
1396 | ASSERT(new_size != 0 || ip->i_delayed_blks == 0); | ||
1397 | ASSERT(new_size != 0 || ip->i_d.di_nextents == 0); | ||
1398 | |||
1399 | /* | ||
1400 | * Always re-log the inode so that our permanent transaction can keep | ||
1401 | * on rolling it forward in the log. | ||
1402 | */ | ||
1403 | xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); | ||
1404 | |||
1405 | trace_xfs_itruncate_data_end(ip, new_size); | ||
1406 | return 0; | ||
1407 | } | ||
1408 | |||
1409 | /* | 1294 | /* |
1410 | * This is called when the inode's link count goes to 0. | 1295 | * This is called when the inode's link count goes to 0. |
1411 | * We place the on-disk inode on a list in the AGI. It | 1296 | * We place the on-disk inode on a list in the AGI. It |
@@ -1824,8 +1709,7 @@ xfs_ifree( | |||
1824 | ASSERT(ip->i_d.di_nlink == 0); | 1709 | ASSERT(ip->i_d.di_nlink == 0); |
1825 | ASSERT(ip->i_d.di_nextents == 0); | 1710 | ASSERT(ip->i_d.di_nextents == 0); |
1826 | ASSERT(ip->i_d.di_anextents == 0); | 1711 | ASSERT(ip->i_d.di_anextents == 0); |
1827 | ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || | 1712 | ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); |
1828 | (!S_ISREG(ip->i_d.di_mode))); | ||
1829 | ASSERT(ip->i_d.di_nblocks == 0); | 1713 | ASSERT(ip->i_d.di_nblocks == 0); |
1830 | 1714 | ||
1831 | /* | 1715 | /* |
@@ -1844,8 +1728,6 @@ xfs_ifree( | |||
1844 | ip->i_d.di_flags = 0; | 1728 | ip->i_d.di_flags = 0; |
1845 | ip->i_d.di_dmevmask = 0; | 1729 | ip->i_d.di_dmevmask = 0; |
1846 | ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ | 1730 | ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ |
1847 | ip->i_df.if_ext_max = | ||
1848 | XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); | ||
1849 | ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; | 1731 | ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; |
1850 | ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; | 1732 | ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; |
1851 | /* | 1733 | /* |
@@ -2151,7 +2033,7 @@ xfs_idestroy_fork( | |||
2151 | * once someone is waiting for it to be unpinned. | 2033 | * once someone is waiting for it to be unpinned. |
2152 | */ | 2034 | */ |
2153 | static void | 2035 | static void |
2154 | xfs_iunpin_nowait( | 2036 | xfs_iunpin( |
2155 | struct xfs_inode *ip) | 2037 | struct xfs_inode *ip) |
2156 | { | 2038 | { |
2157 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); | 2039 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); |
@@ -2163,14 +2045,29 @@ xfs_iunpin_nowait( | |||
2163 | 2045 | ||
2164 | } | 2046 | } |
2165 | 2047 | ||
2048 | static void | ||
2049 | __xfs_iunpin_wait( | ||
2050 | struct xfs_inode *ip) | ||
2051 | { | ||
2052 | wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); | ||
2053 | DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); | ||
2054 | |||
2055 | xfs_iunpin(ip); | ||
2056 | |||
2057 | do { | ||
2058 | prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); | ||
2059 | if (xfs_ipincount(ip)) | ||
2060 | io_schedule(); | ||
2061 | } while (xfs_ipincount(ip)); | ||
2062 | finish_wait(wq, &wait.wait); | ||
2063 | } | ||
2064 | |||
2166 | void | 2065 | void |
2167 | xfs_iunpin_wait( | 2066 | xfs_iunpin_wait( |
2168 | struct xfs_inode *ip) | 2067 | struct xfs_inode *ip) |
2169 | { | 2068 | { |
2170 | if (xfs_ipincount(ip)) { | 2069 | if (xfs_ipincount(ip)) |
2171 | xfs_iunpin_nowait(ip); | 2070 | __xfs_iunpin_wait(ip); |
2172 | wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0)); | ||
2173 | } | ||
2174 | } | 2071 | } |
2175 | 2072 | ||
2176 | /* | 2073 | /* |
@@ -2510,9 +2407,9 @@ xfs_iflush( | |||
2510 | XFS_STATS_INC(xs_iflush_count); | 2407 | XFS_STATS_INC(xs_iflush_count); |
2511 | 2408 | ||
2512 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); | 2409 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); |
2513 | ASSERT(!completion_done(&ip->i_flush)); | 2410 | ASSERT(xfs_isiflocked(ip)); |
2514 | ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || | 2411 | ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || |
2515 | ip->i_d.di_nextents > ip->i_df.if_ext_max); | 2412 | ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); |
2516 | 2413 | ||
2517 | iip = ip->i_itemp; | 2414 | iip = ip->i_itemp; |
2518 | mp = ip->i_mount; | 2415 | mp = ip->i_mount; |
@@ -2529,7 +2426,7 @@ xfs_iflush( | |||
2529 | * out for us if they occur after the log force completes. | 2426 | * out for us if they occur after the log force completes. |
2530 | */ | 2427 | */ |
2531 | if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { | 2428 | if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { |
2532 | xfs_iunpin_nowait(ip); | 2429 | xfs_iunpin(ip); |
2533 | xfs_ifunlock(ip); | 2430 | xfs_ifunlock(ip); |
2534 | return EAGAIN; | 2431 | return EAGAIN; |
2535 | } | 2432 | } |
@@ -2626,9 +2523,9 @@ xfs_iflush_int( | |||
2626 | #endif | 2523 | #endif |
2627 | 2524 | ||
2628 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); | 2525 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); |
2629 | ASSERT(!completion_done(&ip->i_flush)); | 2526 | ASSERT(xfs_isiflocked(ip)); |
2630 | ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || | 2527 | ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || |
2631 | ip->i_d.di_nextents > ip->i_df.if_ext_max); | 2528 | ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); |
2632 | 2529 | ||
2633 | iip = ip->i_itemp; | 2530 | iip = ip->i_itemp; |
2634 | mp = ip->i_mount; | 2531 | mp = ip->i_mount; |
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index f0e6b151ba37..2f27b7454085 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h | |||
@@ -66,7 +66,6 @@ typedef struct xfs_ifork { | |||
66 | struct xfs_btree_block *if_broot; /* file's incore btree root */ | 66 | struct xfs_btree_block *if_broot; /* file's incore btree root */ |
67 | short if_broot_bytes; /* bytes allocated for root */ | 67 | short if_broot_bytes; /* bytes allocated for root */ |
68 | unsigned char if_flags; /* per-fork flags */ | 68 | unsigned char if_flags; /* per-fork flags */ |
69 | unsigned char if_ext_max; /* max # of extent records */ | ||
70 | union { | 69 | union { |
71 | xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ | 70 | xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ |
72 | xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ | 71 | xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ |
@@ -206,12 +205,12 @@ typedef struct xfs_icdinode { | |||
206 | ((w) == XFS_DATA_FORK ? \ | 205 | ((w) == XFS_DATA_FORK ? \ |
207 | ((ip)->i_d.di_nextents = (n)) : \ | 206 | ((ip)->i_d.di_nextents = (n)) : \ |
208 | ((ip)->i_d.di_anextents = (n))) | 207 | ((ip)->i_d.di_anextents = (n))) |
209 | 208 | #define XFS_IFORK_MAXEXT(ip, w) \ | |
209 | (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) | ||
210 | 210 | ||
211 | 211 | ||
212 | #ifdef __KERNEL__ | 212 | #ifdef __KERNEL__ |
213 | 213 | ||
214 | struct bhv_desc; | ||
215 | struct xfs_buf; | 214 | struct xfs_buf; |
216 | struct xfs_bmap_free; | 215 | struct xfs_bmap_free; |
217 | struct xfs_bmbt_irec; | 216 | struct xfs_bmbt_irec; |
@@ -220,12 +219,6 @@ struct xfs_mount; | |||
220 | struct xfs_trans; | 219 | struct xfs_trans; |
221 | struct xfs_dquot; | 220 | struct xfs_dquot; |
222 | 221 | ||
223 | typedef struct dm_attrs_s { | ||
224 | __uint32_t da_dmevmask; /* DMIG event mask */ | ||
225 | __uint16_t da_dmstate; /* DMIG state info */ | ||
226 | __uint16_t da_pad; /* DMIG extra padding */ | ||
227 | } dm_attrs_t; | ||
228 | |||
229 | typedef struct xfs_inode { | 222 | typedef struct xfs_inode { |
230 | /* Inode linking and identification information. */ | 223 | /* Inode linking and identification information. */ |
231 | struct xfs_mount *i_mount; /* fs mount struct ptr */ | 224 | struct xfs_mount *i_mount; /* fs mount struct ptr */ |
@@ -244,27 +237,19 @@ typedef struct xfs_inode { | |||
244 | struct xfs_inode_log_item *i_itemp; /* logging information */ | 237 | struct xfs_inode_log_item *i_itemp; /* logging information */ |
245 | mrlock_t i_lock; /* inode lock */ | 238 | mrlock_t i_lock; /* inode lock */ |
246 | mrlock_t i_iolock; /* inode IO lock */ | 239 | mrlock_t i_iolock; /* inode IO lock */ |
247 | struct completion i_flush; /* inode flush completion q */ | ||
248 | atomic_t i_pincount; /* inode pin count */ | 240 | atomic_t i_pincount; /* inode pin count */ |
249 | wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ | ||
250 | spinlock_t i_flags_lock; /* inode i_flags lock */ | 241 | spinlock_t i_flags_lock; /* inode i_flags lock */ |
251 | /* Miscellaneous state. */ | 242 | /* Miscellaneous state. */ |
252 | unsigned short i_flags; /* see defined flags below */ | 243 | unsigned long i_flags; /* see defined flags below */ |
253 | unsigned char i_update_core; /* timestamps/size is dirty */ | 244 | unsigned char i_update_core; /* timestamps/size is dirty */ |
254 | unsigned int i_delayed_blks; /* count of delay alloc blks */ | 245 | unsigned int i_delayed_blks; /* count of delay alloc blks */ |
255 | 246 | ||
256 | xfs_icdinode_t i_d; /* most of ondisk inode */ | 247 | xfs_icdinode_t i_d; /* most of ondisk inode */ |
257 | 248 | ||
258 | xfs_fsize_t i_size; /* in-memory size */ | ||
259 | xfs_fsize_t i_new_size; /* size when write completes */ | ||
260 | |||
261 | /* VFS inode */ | 249 | /* VFS inode */ |
262 | struct inode i_vnode; /* embedded VFS inode */ | 250 | struct inode i_vnode; /* embedded VFS inode */ |
263 | } xfs_inode_t; | 251 | } xfs_inode_t; |
264 | 252 | ||
265 | #define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \ | ||
266 | (ip)->i_size : (ip)->i_d.di_size; | ||
267 | |||
268 | /* Convert from vfs inode to xfs inode */ | 253 | /* Convert from vfs inode to xfs inode */ |
269 | static inline struct xfs_inode *XFS_I(struct inode *inode) | 254 | static inline struct xfs_inode *XFS_I(struct inode *inode) |
270 | { | 255 | { |
@@ -278,6 +263,18 @@ static inline struct inode *VFS_I(struct xfs_inode *ip) | |||
278 | } | 263 | } |
279 | 264 | ||
280 | /* | 265 | /* |
266 | * For regular files we only update the on-disk filesize when actually | ||
267 | * writing data back to disk. Until then only the copy in the VFS inode | ||
268 | * is uptodate. | ||
269 | */ | ||
270 | static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip) | ||
271 | { | ||
272 | if (S_ISREG(ip->i_d.di_mode)) | ||
273 | return i_size_read(VFS_I(ip)); | ||
274 | return ip->i_d.di_size; | ||
275 | } | ||
276 | |||
277 | /* | ||
281 | * i_flags helper functions | 278 | * i_flags helper functions |
282 | */ | 279 | */ |
283 | static inline void | 280 | static inline void |
@@ -331,6 +328,19 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) | |||
331 | return ret; | 328 | return ret; |
332 | } | 329 | } |
333 | 330 | ||
331 | static inline int | ||
332 | xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags) | ||
333 | { | ||
334 | int ret; | ||
335 | |||
336 | spin_lock(&ip->i_flags_lock); | ||
337 | ret = ip->i_flags & flags; | ||
338 | if (!ret) | ||
339 | ip->i_flags |= flags; | ||
340 | spin_unlock(&ip->i_flags_lock); | ||
341 | return ret; | ||
342 | } | ||
343 | |||
334 | /* | 344 | /* |
335 | * Project quota id helpers (previously projid was 16bit only | 345 | * Project quota id helpers (previously projid was 16bit only |
336 | * and using two 16bit values to hold new 32bit projid was chosen | 346 | * and using two 16bit values to hold new 32bit projid was chosen |
@@ -351,35 +361,19 @@ xfs_set_projid(struct xfs_inode *ip, | |||
351 | } | 361 | } |
352 | 362 | ||
353 | /* | 363 | /* |
354 | * Manage the i_flush queue embedded in the inode. This completion | ||
355 | * queue synchronizes processes attempting to flush the in-core | ||
356 | * inode back to disk. | ||
357 | */ | ||
358 | static inline void xfs_iflock(xfs_inode_t *ip) | ||
359 | { | ||
360 | wait_for_completion(&ip->i_flush); | ||
361 | } | ||
362 | |||
363 | static inline int xfs_iflock_nowait(xfs_inode_t *ip) | ||
364 | { | ||
365 | return try_wait_for_completion(&ip->i_flush); | ||
366 | } | ||
367 | |||
368 | static inline void xfs_ifunlock(xfs_inode_t *ip) | ||
369 | { | ||
370 | complete(&ip->i_flush); | ||
371 | } | ||
372 | |||
373 | /* | ||
374 | * In-core inode flags. | 364 | * In-core inode flags. |
375 | */ | 365 | */ |
376 | #define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */ | 366 | #define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ |
377 | #define XFS_ISTALE 0x0002 /* inode has been staled */ | 367 | #define XFS_ISTALE (1 << 1) /* inode has been staled */ |
378 | #define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ | 368 | #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ |
379 | #define XFS_INEW 0x0008 /* inode has just been allocated */ | 369 | #define XFS_INEW (1 << 3) /* inode has just been allocated */ |
380 | #define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ | 370 | #define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */ |
381 | #define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ | 371 | #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ |
382 | #define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ | 372 | #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ |
373 | #define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ | ||
374 | #define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) | ||
375 | #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ | ||
376 | #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) | ||
383 | 377 | ||
384 | /* | 378 | /* |
385 | * Per-lifetime flags need to be reset when re-using a reclaimable inode during | 379 | * Per-lifetime flags need to be reset when re-using a reclaimable inode during |
@@ -392,6 +386,34 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) | |||
392 | XFS_IFILESTREAM); | 386 | XFS_IFILESTREAM); |
393 | 387 | ||
394 | /* | 388 | /* |
389 | * Synchronize processes attempting to flush the in-core inode back to disk. | ||
390 | */ | ||
391 | |||
392 | extern void __xfs_iflock(struct xfs_inode *ip); | ||
393 | |||
394 | static inline int xfs_iflock_nowait(struct xfs_inode *ip) | ||
395 | { | ||
396 | return !xfs_iflags_test_and_set(ip, XFS_IFLOCK); | ||
397 | } | ||
398 | |||
399 | static inline void xfs_iflock(struct xfs_inode *ip) | ||
400 | { | ||
401 | if (!xfs_iflock_nowait(ip)) | ||
402 | __xfs_iflock(ip); | ||
403 | } | ||
404 | |||
405 | static inline void xfs_ifunlock(struct xfs_inode *ip) | ||
406 | { | ||
407 | xfs_iflags_clear(ip, XFS_IFLOCK); | ||
408 | wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); | ||
409 | } | ||
410 | |||
411 | static inline int xfs_isiflocked(struct xfs_inode *ip) | ||
412 | { | ||
413 | return xfs_iflags_test(ip, XFS_IFLOCK); | ||
414 | } | ||
415 | |||
416 | /* | ||
395 | * Flags for inode locking. | 417 | * Flags for inode locking. |
396 | * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) | 418 | * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) |
397 | * 1<<16 - 1<<32-1 -- lockdep annotation (integers) | 419 | * 1<<16 - 1<<32-1 -- lockdep annotation (integers) |
@@ -491,8 +513,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *, | |||
491 | struct xfs_bmap_free *); | 513 | struct xfs_bmap_free *); |
492 | int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, | 514 | int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, |
493 | int, xfs_fsize_t); | 515 | int, xfs_fsize_t); |
494 | int xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *, | ||
495 | xfs_fsize_t); | ||
496 | int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); | 516 | int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); |
497 | 517 | ||
498 | void xfs_iext_realloc(xfs_inode_t *, int, int); | 518 | void xfs_iext_realloc(xfs_inode_t *, int, int); |
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index cfd6c7f8cc3c..91d71dcd4852 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c | |||
@@ -79,8 +79,6 @@ xfs_inode_item_size( | |||
79 | break; | 79 | break; |
80 | 80 | ||
81 | case XFS_DINODE_FMT_BTREE: | 81 | case XFS_DINODE_FMT_BTREE: |
82 | ASSERT(ip->i_df.if_ext_max == | ||
83 | XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); | ||
84 | iip->ili_format.ilf_fields &= | 82 | iip->ili_format.ilf_fields &= |
85 | ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | | 83 | ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | |
86 | XFS_ILOG_DEV | XFS_ILOG_UUID); | 84 | XFS_ILOG_DEV | XFS_ILOG_UUID); |
@@ -557,7 +555,7 @@ xfs_inode_item_unpin( | |||
557 | trace_xfs_inode_unpin(ip, _RET_IP_); | 555 | trace_xfs_inode_unpin(ip, _RET_IP_); |
558 | ASSERT(atomic_read(&ip->i_pincount) > 0); | 556 | ASSERT(atomic_read(&ip->i_pincount) > 0); |
559 | if (atomic_dec_and_test(&ip->i_pincount)) | 557 | if (atomic_dec_and_test(&ip->i_pincount)) |
560 | wake_up(&ip->i_ipin_wait); | 558 | wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); |
561 | } | 559 | } |
562 | 560 | ||
563 | /* | 561 | /* |
@@ -719,7 +717,7 @@ xfs_inode_item_pushbuf( | |||
719 | * If a flush is not in progress anymore, chances are that the | 717 | * If a flush is not in progress anymore, chances are that the |
720 | * inode was taken off the AIL. So, just get out. | 718 | * inode was taken off the AIL. So, just get out. |
721 | */ | 719 | */ |
722 | if (completion_done(&ip->i_flush) || | 720 | if (!xfs_isiflocked(ip) || |
723 | !(lip->li_flags & XFS_LI_IN_AIL)) { | 721 | !(lip->li_flags & XFS_LI_IN_AIL)) { |
724 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 722 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
725 | return true; | 723 | return true; |
@@ -752,7 +750,7 @@ xfs_inode_item_push( | |||
752 | struct xfs_inode *ip = iip->ili_inode; | 750 | struct xfs_inode *ip = iip->ili_inode; |
753 | 751 | ||
754 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); | 752 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); |
755 | ASSERT(!completion_done(&ip->i_flush)); | 753 | ASSERT(xfs_isiflocked(ip)); |
756 | 754 | ||
757 | /* | 755 | /* |
758 | * Since we were able to lock the inode's flush lock and | 756 | * Since we were able to lock the inode's flush lock and |
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 9afa282aa937..246c7d57c6f9 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c | |||
@@ -57,26 +57,26 @@ xfs_iomap_eof_align_last_fsb( | |||
57 | xfs_fileoff_t *last_fsb) | 57 | xfs_fileoff_t *last_fsb) |
58 | { | 58 | { |
59 | xfs_fileoff_t new_last_fsb = 0; | 59 | xfs_fileoff_t new_last_fsb = 0; |
60 | xfs_extlen_t align; | 60 | xfs_extlen_t align = 0; |
61 | int eof, error; | 61 | int eof, error; |
62 | 62 | ||
63 | if (XFS_IS_REALTIME_INODE(ip)) | 63 | if (!XFS_IS_REALTIME_INODE(ip)) { |
64 | ; | 64 | /* |
65 | /* | 65 | * Round up the allocation request to a stripe unit |
66 | * If mounted with the "-o swalloc" option, roundup the allocation | 66 | * (m_dalign) boundary if the file size is >= stripe unit |
67 | * request to a stripe width boundary if the file size is >= | 67 | * size, and we are allocating past the allocation eof. |
68 | * stripe width and we are allocating past the allocation eof. | 68 | * |
69 | */ | 69 | * If mounted with the "-o swalloc" option the alignment is |
70 | else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && | 70 | * increased from the strip unit size to the stripe width. |
71 | (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth))) | 71 | */ |
72 | new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); | 72 | if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) |
73 | /* | 73 | align = mp->m_swidth; |
74 | * Roundup the allocation request to a stripe unit (m_dalign) boundary | 74 | else if (mp->m_dalign) |
75 | * if the file size is >= stripe unit size, and we are allocating past | 75 | align = mp->m_dalign; |
76 | * the allocation eof. | 76 | |
77 | */ | 77 | if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align)) |
78 | else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign))) | 78 | new_last_fsb = roundup_64(*last_fsb, align); |
79 | new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); | 79 | } |
80 | 80 | ||
81 | /* | 81 | /* |
82 | * Always round up the allocation request to an extent boundary | 82 | * Always round up the allocation request to an extent boundary |
@@ -154,7 +154,7 @@ xfs_iomap_write_direct( | |||
154 | 154 | ||
155 | offset_fsb = XFS_B_TO_FSBT(mp, offset); | 155 | offset_fsb = XFS_B_TO_FSBT(mp, offset); |
156 | last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); | 156 | last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); |
157 | if ((offset + count) > ip->i_size) { | 157 | if ((offset + count) > XFS_ISIZE(ip)) { |
158 | error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); | 158 | error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); |
159 | if (error) | 159 | if (error) |
160 | goto error_out; | 160 | goto error_out; |
@@ -211,7 +211,7 @@ xfs_iomap_write_direct( | |||
211 | xfs_trans_ijoin(tp, ip, 0); | 211 | xfs_trans_ijoin(tp, ip, 0); |
212 | 212 | ||
213 | bmapi_flag = 0; | 213 | bmapi_flag = 0; |
214 | if (offset < ip->i_size || extsz) | 214 | if (offset < XFS_ISIZE(ip) || extsz) |
215 | bmapi_flag |= XFS_BMAPI_PREALLOC; | 215 | bmapi_flag |= XFS_BMAPI_PREALLOC; |
216 | 216 | ||
217 | /* | 217 | /* |
@@ -286,7 +286,7 @@ xfs_iomap_eof_want_preallocate( | |||
286 | int found_delalloc = 0; | 286 | int found_delalloc = 0; |
287 | 287 | ||
288 | *prealloc = 0; | 288 | *prealloc = 0; |
289 | if ((offset + count) <= ip->i_size) | 289 | if (offset + count <= XFS_ISIZE(ip)) |
290 | return 0; | 290 | return 0; |
291 | 291 | ||
292 | /* | 292 | /* |
@@ -340,7 +340,7 @@ xfs_iomap_prealloc_size( | |||
340 | * if we pass in alloc_blocks = 0. Hence the "+ 1" to | 340 | * if we pass in alloc_blocks = 0. Hence the "+ 1" to |
341 | * ensure we always pass in a non-zero value. | 341 | * ensure we always pass in a non-zero value. |
342 | */ | 342 | */ |
343 | alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; | 343 | alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1; |
344 | alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, | 344 | alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, |
345 | rounddown_pow_of_two(alloc_blocks)); | 345 | rounddown_pow_of_two(alloc_blocks)); |
346 | 346 | ||
@@ -564,7 +564,7 @@ xfs_iomap_write_allocate( | |||
564 | * back.... | 564 | * back.... |
565 | */ | 565 | */ |
566 | nimaps = 1; | 566 | nimaps = 1; |
567 | end_fsb = XFS_B_TO_FSB(mp, ip->i_size); | 567 | end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); |
568 | error = xfs_bmap_last_offset(NULL, ip, &last_block, | 568 | error = xfs_bmap_last_offset(NULL, ip, &last_block, |
569 | XFS_DATA_FORK); | 569 | XFS_DATA_FORK); |
570 | if (error) | 570 | if (error) |
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f9babd179223..ab302539e5b9 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c | |||
@@ -750,6 +750,7 @@ xfs_setattr_size( | |||
750 | struct xfs_mount *mp = ip->i_mount; | 750 | struct xfs_mount *mp = ip->i_mount; |
751 | struct inode *inode = VFS_I(ip); | 751 | struct inode *inode = VFS_I(ip); |
752 | int mask = iattr->ia_valid; | 752 | int mask = iattr->ia_valid; |
753 | xfs_off_t oldsize, newsize; | ||
753 | struct xfs_trans *tp; | 754 | struct xfs_trans *tp; |
754 | int error; | 755 | int error; |
755 | uint lock_flags; | 756 | uint lock_flags; |
@@ -777,11 +778,13 @@ xfs_setattr_size( | |||
777 | lock_flags |= XFS_IOLOCK_EXCL; | 778 | lock_flags |= XFS_IOLOCK_EXCL; |
778 | xfs_ilock(ip, lock_flags); | 779 | xfs_ilock(ip, lock_flags); |
779 | 780 | ||
781 | oldsize = inode->i_size; | ||
782 | newsize = iattr->ia_size; | ||
783 | |||
780 | /* | 784 | /* |
781 | * Short circuit the truncate case for zero length files. | 785 | * Short circuit the truncate case for zero length files. |
782 | */ | 786 | */ |
783 | if (iattr->ia_size == 0 && | 787 | if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { |
784 | ip->i_size == 0 && ip->i_d.di_nextents == 0) { | ||
785 | if (!(mask & (ATTR_CTIME|ATTR_MTIME))) | 788 | if (!(mask & (ATTR_CTIME|ATTR_MTIME))) |
786 | goto out_unlock; | 789 | goto out_unlock; |
787 | 790 | ||
@@ -807,14 +810,14 @@ xfs_setattr_size( | |||
807 | * the inode to the transaction, because the inode cannot be unlocked | 810 | * the inode to the transaction, because the inode cannot be unlocked |
808 | * once it is a part of the transaction. | 811 | * once it is a part of the transaction. |
809 | */ | 812 | */ |
810 | if (iattr->ia_size > ip->i_size) { | 813 | if (newsize > oldsize) { |
811 | /* | 814 | /* |
812 | * Do the first part of growing a file: zero any data in the | 815 | * Do the first part of growing a file: zero any data in the |
813 | * last block that is beyond the old EOF. We need to do this | 816 | * last block that is beyond the old EOF. We need to do this |
814 | * before the inode is joined to the transaction to modify | 817 | * before the inode is joined to the transaction to modify |
815 | * i_size. | 818 | * i_size. |
816 | */ | 819 | */ |
817 | error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); | 820 | error = xfs_zero_eof(ip, newsize, oldsize); |
818 | if (error) | 821 | if (error) |
819 | goto out_unlock; | 822 | goto out_unlock; |
820 | } | 823 | } |
@@ -833,8 +836,8 @@ xfs_setattr_size( | |||
833 | * here and prevents waiting for other data not within the range we | 836 | * here and prevents waiting for other data not within the range we |
834 | * care about here. | 837 | * care about here. |
835 | */ | 838 | */ |
836 | if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { | 839 | if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { |
837 | error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, | 840 | error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, |
838 | FI_NONE); | 841 | FI_NONE); |
839 | if (error) | 842 | if (error) |
840 | goto out_unlock; | 843 | goto out_unlock; |
@@ -845,8 +848,7 @@ xfs_setattr_size( | |||
845 | */ | 848 | */ |
846 | inode_dio_wait(inode); | 849 | inode_dio_wait(inode); |
847 | 850 | ||
848 | error = -block_truncate_page(inode->i_mapping, iattr->ia_size, | 851 | error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); |
849 | xfs_get_blocks); | ||
850 | if (error) | 852 | if (error) |
851 | goto out_unlock; | 853 | goto out_unlock; |
852 | 854 | ||
@@ -857,7 +859,7 @@ xfs_setattr_size( | |||
857 | if (error) | 859 | if (error) |
858 | goto out_trans_cancel; | 860 | goto out_trans_cancel; |
859 | 861 | ||
860 | truncate_setsize(inode, iattr->ia_size); | 862 | truncate_setsize(inode, newsize); |
861 | 863 | ||
862 | commit_flags = XFS_TRANS_RELEASE_LOG_RES; | 864 | commit_flags = XFS_TRANS_RELEASE_LOG_RES; |
863 | lock_flags |= XFS_ILOCK_EXCL; | 865 | lock_flags |= XFS_ILOCK_EXCL; |
@@ -876,19 +878,29 @@ xfs_setattr_size( | |||
876 | * these flags set. For all other operations the VFS sets these flags | 878 | * these flags set. For all other operations the VFS sets these flags |
877 | * explicitly if it wants a timestamp update. | 879 | * explicitly if it wants a timestamp update. |
878 | */ | 880 | */ |
879 | if (iattr->ia_size != ip->i_size && | 881 | if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { |
880 | (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { | ||
881 | iattr->ia_ctime = iattr->ia_mtime = | 882 | iattr->ia_ctime = iattr->ia_mtime = |
882 | current_fs_time(inode->i_sb); | 883 | current_fs_time(inode->i_sb); |
883 | mask |= ATTR_CTIME | ATTR_MTIME; | 884 | mask |= ATTR_CTIME | ATTR_MTIME; |
884 | } | 885 | } |
885 | 886 | ||
886 | if (iattr->ia_size > ip->i_size) { | 887 | /* |
887 | ip->i_d.di_size = iattr->ia_size; | 888 | * The first thing we do is set the size to newsize permanently on |
888 | ip->i_size = iattr->ia_size; | 889 | * disk. This way we don't have to worry about anyone ever being able |
889 | } else if (iattr->ia_size <= ip->i_size || | 890 | * to look at the data being freed even in the face of a crash. |
890 | (iattr->ia_size == 0 && ip->i_d.di_nextents)) { | 891 | * What we're getting around here is the case where we free a block, it |
891 | error = xfs_itruncate_data(&tp, ip, iattr->ia_size); | 892 | * is allocated to another file, it is written to, and then we crash. |
893 | * If the new data gets written to the file but the log buffers | ||
894 | * containing the free and reallocation don't, then we'd end up with | ||
895 | * garbage in the blocks being freed. As long as we make the new size | ||
896 | * permanent before actually freeing any blocks it doesn't matter if | ||
897 | * they get written to. | ||
898 | */ | ||
899 | ip->i_d.di_size = newsize; | ||
900 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | ||
901 | |||
902 | if (newsize <= oldsize) { | ||
903 | error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); | ||
892 | if (error) | 904 | if (error) |
893 | goto out_trans_abort; | 905 | goto out_trans_abort; |
894 | 906 | ||
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 5cc3dde1bc90..eafbcff81f3a 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include "xfs_mount.h" | 31 | #include "xfs_mount.h" |
32 | #include "xfs_bmap_btree.h" | 32 | #include "xfs_bmap_btree.h" |
33 | #include "xfs_inode.h" | 33 | #include "xfs_inode.h" |
34 | #include "xfs_inode_item.h" | ||
34 | #include "xfs_itable.h" | 35 | #include "xfs_itable.h" |
35 | #include "xfs_bmap.h" | 36 | #include "xfs_bmap.h" |
36 | #include "xfs_rtalloc.h" | 37 | #include "xfs_rtalloc.h" |
@@ -263,13 +264,18 @@ xfs_qm_scall_trunc_qfile( | |||
263 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 264 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
264 | xfs_trans_ijoin(tp, ip, 0); | 265 | xfs_trans_ijoin(tp, ip, 0); |
265 | 266 | ||
266 | error = xfs_itruncate_data(&tp, ip, 0); | 267 | ip->i_d.di_size = 0; |
268 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | ||
269 | |||
270 | error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); | ||
267 | if (error) { | 271 | if (error) { |
268 | xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | | 272 | xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | |
269 | XFS_TRANS_ABORT); | 273 | XFS_TRANS_ABORT); |
270 | goto out_unlock; | 274 | goto out_unlock; |
271 | } | 275 | } |
272 | 276 | ||
277 | ASSERT(ip->i_d.di_nextents == 0); | ||
278 | |||
273 | xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | 279 | xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
274 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); | 280 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); |
275 | 281 | ||
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 281961c1d81a..ee5b695c99a7 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c | |||
@@ -828,14 +828,6 @@ xfs_fs_inode_init_once( | |||
828 | /* xfs inode */ | 828 | /* xfs inode */ |
829 | atomic_set(&ip->i_pincount, 0); | 829 | atomic_set(&ip->i_pincount, 0); |
830 | spin_lock_init(&ip->i_flags_lock); | 830 | spin_lock_init(&ip->i_flags_lock); |
831 | init_waitqueue_head(&ip->i_ipin_wait); | ||
832 | /* | ||
833 | * Because we want to use a counting completion, complete | ||
834 | * the flush completion once to allow a single access to | ||
835 | * the flush completion without blocking. | ||
836 | */ | ||
837 | init_completion(&ip->i_flush); | ||
838 | complete(&ip->i_flush); | ||
839 | 831 | ||
840 | mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, | 832 | mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, |
841 | "xfsino", ip->i_ino); | 833 | "xfsino", ip->i_ino); |
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 72c01a1c16e7..40b75eecd2b4 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c | |||
@@ -707,14 +707,13 @@ xfs_reclaim_inode_grab( | |||
707 | return 1; | 707 | return 1; |
708 | 708 | ||
709 | /* | 709 | /* |
710 | * do some unlocked checks first to avoid unnecessary lock traffic. | 710 | * If we are asked for non-blocking operation, do unlocked checks to |
711 | * The first is a flush lock check, the second is a already in reclaim | 711 | * see if the inode already is being flushed or in reclaim to avoid |
712 | * check. Only do these checks if we are not going to block on locks. | 712 | * lock traffic. |
713 | */ | 713 | */ |
714 | if ((flags & SYNC_TRYLOCK) && | 714 | if ((flags & SYNC_TRYLOCK) && |
715 | (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { | 715 | __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) |
716 | return 1; | 716 | return 1; |
717 | } | ||
718 | 717 | ||
719 | /* | 718 | /* |
720 | * The radix tree lock here protects a thread in xfs_iget from racing | 719 | * The radix tree lock here protects a thread in xfs_iget from racing |
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a9d5b1e06efe..6b6df5802e95 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h | |||
@@ -891,7 +891,6 @@ DECLARE_EVENT_CLASS(xfs_file_class, | |||
891 | __field(dev_t, dev) | 891 | __field(dev_t, dev) |
892 | __field(xfs_ino_t, ino) | 892 | __field(xfs_ino_t, ino) |
893 | __field(xfs_fsize_t, size) | 893 | __field(xfs_fsize_t, size) |
894 | __field(xfs_fsize_t, new_size) | ||
895 | __field(loff_t, offset) | 894 | __field(loff_t, offset) |
896 | __field(size_t, count) | 895 | __field(size_t, count) |
897 | __field(int, flags) | 896 | __field(int, flags) |
@@ -900,17 +899,15 @@ DECLARE_EVENT_CLASS(xfs_file_class, | |||
900 | __entry->dev = VFS_I(ip)->i_sb->s_dev; | 899 | __entry->dev = VFS_I(ip)->i_sb->s_dev; |
901 | __entry->ino = ip->i_ino; | 900 | __entry->ino = ip->i_ino; |
902 | __entry->size = ip->i_d.di_size; | 901 | __entry->size = ip->i_d.di_size; |
903 | __entry->new_size = ip->i_new_size; | ||
904 | __entry->offset = offset; | 902 | __entry->offset = offset; |
905 | __entry->count = count; | 903 | __entry->count = count; |
906 | __entry->flags = flags; | 904 | __entry->flags = flags; |
907 | ), | 905 | ), |
908 | TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " | 906 | TP_printk("dev %d:%d ino 0x%llx size 0x%llx " |
909 | "offset 0x%llx count 0x%zx ioflags %s", | 907 | "offset 0x%llx count 0x%zx ioflags %s", |
910 | MAJOR(__entry->dev), MINOR(__entry->dev), | 908 | MAJOR(__entry->dev), MINOR(__entry->dev), |
911 | __entry->ino, | 909 | __entry->ino, |
912 | __entry->size, | 910 | __entry->size, |
913 | __entry->new_size, | ||
914 | __entry->offset, | 911 | __entry->offset, |
915 | __entry->count, | 912 | __entry->count, |
916 | __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) | 913 | __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) |
@@ -978,7 +975,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class, | |||
978 | __field(dev_t, dev) | 975 | __field(dev_t, dev) |
979 | __field(xfs_ino_t, ino) | 976 | __field(xfs_ino_t, ino) |
980 | __field(loff_t, size) | 977 | __field(loff_t, size) |
981 | __field(loff_t, new_size) | ||
982 | __field(loff_t, offset) | 978 | __field(loff_t, offset) |
983 | __field(size_t, count) | 979 | __field(size_t, count) |
984 | __field(int, type) | 980 | __field(int, type) |
@@ -990,7 +986,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class, | |||
990 | __entry->dev = VFS_I(ip)->i_sb->s_dev; | 986 | __entry->dev = VFS_I(ip)->i_sb->s_dev; |
991 | __entry->ino = ip->i_ino; | 987 | __entry->ino = ip->i_ino; |
992 | __entry->size = ip->i_d.di_size; | 988 | __entry->size = ip->i_d.di_size; |
993 | __entry->new_size = ip->i_new_size; | ||
994 | __entry->offset = offset; | 989 | __entry->offset = offset; |
995 | __entry->count = count; | 990 | __entry->count = count; |
996 | __entry->type = type; | 991 | __entry->type = type; |
@@ -998,13 +993,11 @@ DECLARE_EVENT_CLASS(xfs_imap_class, | |||
998 | __entry->startblock = irec ? irec->br_startblock : 0; | 993 | __entry->startblock = irec ? irec->br_startblock : 0; |
999 | __entry->blockcount = irec ? irec->br_blockcount : 0; | 994 | __entry->blockcount = irec ? irec->br_blockcount : 0; |
1000 | ), | 995 | ), |
1001 | TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " | 996 | TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " |
1002 | "offset 0x%llx count %zd type %s " | 997 | "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", |
1003 | "startoff 0x%llx startblock %lld blockcount 0x%llx", | ||
1004 | MAJOR(__entry->dev), MINOR(__entry->dev), | 998 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1005 | __entry->ino, | 999 | __entry->ino, |
1006 | __entry->size, | 1000 | __entry->size, |
1007 | __entry->new_size, | ||
1008 | __entry->offset, | 1001 | __entry->offset, |
1009 | __entry->count, | 1002 | __entry->count, |
1010 | __print_symbolic(__entry->type, XFS_IO_TYPES), | 1003 | __print_symbolic(__entry->type, XFS_IO_TYPES), |
@@ -1031,26 +1024,23 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class, | |||
1031 | __field(xfs_ino_t, ino) | 1024 | __field(xfs_ino_t, ino) |
1032 | __field(loff_t, isize) | 1025 | __field(loff_t, isize) |
1033 | __field(loff_t, disize) | 1026 | __field(loff_t, disize) |
1034 | __field(loff_t, new_size) | ||
1035 | __field(loff_t, offset) | 1027 | __field(loff_t, offset) |
1036 | __field(size_t, count) | 1028 | __field(size_t, count) |
1037 | ), | 1029 | ), |
1038 | TP_fast_assign( | 1030 | TP_fast_assign( |
1039 | __entry->dev = VFS_I(ip)->i_sb->s_dev; | 1031 | __entry->dev = VFS_I(ip)->i_sb->s_dev; |
1040 | __entry->ino = ip->i_ino; | 1032 | __entry->ino = ip->i_ino; |
1041 | __entry->isize = ip->i_size; | 1033 | __entry->isize = VFS_I(ip)->i_size; |
1042 | __entry->disize = ip->i_d.di_size; | 1034 | __entry->disize = ip->i_d.di_size; |
1043 | __entry->new_size = ip->i_new_size; | ||
1044 | __entry->offset = offset; | 1035 | __entry->offset = offset; |
1045 | __entry->count = count; | 1036 | __entry->count = count; |
1046 | ), | 1037 | ), |
1047 | TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx " | 1038 | TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx " |
1048 | "offset 0x%llx count %zd", | 1039 | "offset 0x%llx count %zd", |
1049 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1040 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1050 | __entry->ino, | 1041 | __entry->ino, |
1051 | __entry->isize, | 1042 | __entry->isize, |
1052 | __entry->disize, | 1043 | __entry->disize, |
1053 | __entry->new_size, | ||
1054 | __entry->offset, | 1044 | __entry->offset, |
1055 | __entry->count) | 1045 | __entry->count) |
1056 | ); | 1046 | ); |
@@ -1090,8 +1080,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class, | |||
1090 | DEFINE_EVENT(xfs_itrunc_class, name, \ | 1080 | DEFINE_EVENT(xfs_itrunc_class, name, \ |
1091 | TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ | 1081 | TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ |
1092 | TP_ARGS(ip, new_size)) | 1082 | TP_ARGS(ip, new_size)) |
1093 | DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start); | 1083 | DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start); |
1094 | DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end); | 1084 | DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end); |
1095 | 1085 | ||
1096 | TRACE_EVENT(xfs_pagecache_inval, | 1086 | TRACE_EVENT(xfs_pagecache_inval, |
1097 | TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), | 1087 | TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), |
@@ -1568,7 +1558,6 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, | |||
1568 | __field(xfs_ino_t, ino) | 1558 | __field(xfs_ino_t, ino) |
1569 | __field(int, format) | 1559 | __field(int, format) |
1570 | __field(int, nex) | 1560 | __field(int, nex) |
1571 | __field(int, max_nex) | ||
1572 | __field(int, broot_size) | 1561 | __field(int, broot_size) |
1573 | __field(int, fork_off) | 1562 | __field(int, fork_off) |
1574 | ), | 1563 | ), |
@@ -1578,18 +1567,16 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, | |||
1578 | __entry->ino = ip->i_ino; | 1567 | __entry->ino = ip->i_ino; |
1579 | __entry->format = ip->i_d.di_format; | 1568 | __entry->format = ip->i_d.di_format; |
1580 | __entry->nex = ip->i_d.di_nextents; | 1569 | __entry->nex = ip->i_d.di_nextents; |
1581 | __entry->max_nex = ip->i_df.if_ext_max; | ||
1582 | __entry->broot_size = ip->i_df.if_broot_bytes; | 1570 | __entry->broot_size = ip->i_df.if_broot_bytes; |
1583 | __entry->fork_off = XFS_IFORK_BOFF(ip); | 1571 | __entry->fork_off = XFS_IFORK_BOFF(ip); |
1584 | ), | 1572 | ), |
1585 | TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " | 1573 | TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " |
1586 | "Max in-fork extents %d, broot size %d, fork offset %d", | 1574 | "broot size %d, fork offset %d", |
1587 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1575 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1588 | __entry->ino, | 1576 | __entry->ino, |
1589 | __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), | 1577 | __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), |
1590 | __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), | 1578 | __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), |
1591 | __entry->nex, | 1579 | __entry->nex, |
1592 | __entry->max_nex, | ||
1593 | __entry->broot_size, | 1580 | __entry->broot_size, |
1594 | __entry->fork_off) | 1581 | __entry->fork_off) |
1595 | ) | 1582 | ) |
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index f2fea868d4db..0cf52da9d246 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c | |||
@@ -175,7 +175,7 @@ xfs_free_eofblocks( | |||
175 | * Figure out if there are any blocks beyond the end | 175 | * Figure out if there are any blocks beyond the end |
176 | * of the file. If not, then there is nothing to do. | 176 | * of the file. If not, then there is nothing to do. |
177 | */ | 177 | */ |
178 | end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size)); | 178 | end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); |
179 | last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); | 179 | last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); |
180 | if (last_fsb <= end_fsb) | 180 | if (last_fsb <= end_fsb) |
181 | return 0; | 181 | return 0; |
@@ -226,7 +226,14 @@ xfs_free_eofblocks( | |||
226 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 226 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
227 | xfs_trans_ijoin(tp, ip, 0); | 227 | xfs_trans_ijoin(tp, ip, 0); |
228 | 228 | ||
229 | error = xfs_itruncate_data(&tp, ip, ip->i_size); | 229 | /* |
230 | * Do not update the on-disk file size. If we update the | ||
231 | * on-disk file size and then the system crashes before the | ||
232 | * contents of the file are flushed to disk then the files | ||
233 | * may be full of holes (ie NULL files bug). | ||
234 | */ | ||
235 | error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, | ||
236 | XFS_ISIZE(ip)); | ||
230 | if (error) { | 237 | if (error) { |
231 | /* | 238 | /* |
232 | * If we get an error at this point we simply don't | 239 | * If we get an error at this point we simply don't |
@@ -540,8 +547,8 @@ xfs_release( | |||
540 | return 0; | 547 | return 0; |
541 | 548 | ||
542 | if ((S_ISREG(ip->i_d.di_mode) && | 549 | if ((S_ISREG(ip->i_d.di_mode) && |
543 | ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || | 550 | (VFS_I(ip)->i_size > 0 || |
544 | ip->i_delayed_blks > 0)) && | 551 | (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && |
545 | (ip->i_df.if_flags & XFS_IFEXTENTS)) && | 552 | (ip->i_df.if_flags & XFS_IFEXTENTS)) && |
546 | (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { | 553 | (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { |
547 | 554 | ||
@@ -618,7 +625,7 @@ xfs_inactive( | |||
618 | * only one with a reference to the inode. | 625 | * only one with a reference to the inode. |
619 | */ | 626 | */ |
620 | truncate = ((ip->i_d.di_nlink == 0) && | 627 | truncate = ((ip->i_d.di_nlink == 0) && |
621 | ((ip->i_d.di_size != 0) || (ip->i_size != 0) || | 628 | ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 || |
622 | (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && | 629 | (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && |
623 | S_ISREG(ip->i_d.di_mode)); | 630 | S_ISREG(ip->i_d.di_mode)); |
624 | 631 | ||
@@ -632,12 +639,12 @@ xfs_inactive( | |||
632 | 639 | ||
633 | if (ip->i_d.di_nlink != 0) { | 640 | if (ip->i_d.di_nlink != 0) { |
634 | if ((S_ISREG(ip->i_d.di_mode) && | 641 | if ((S_ISREG(ip->i_d.di_mode) && |
635 | ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || | 642 | (VFS_I(ip)->i_size > 0 || |
636 | ip->i_delayed_blks > 0)) && | 643 | (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && |
637 | (ip->i_df.if_flags & XFS_IFEXTENTS) && | 644 | (ip->i_df.if_flags & XFS_IFEXTENTS) && |
638 | (!(ip->i_d.di_flags & | 645 | (!(ip->i_d.di_flags & |
639 | (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || | 646 | (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || |
640 | (ip->i_delayed_blks != 0)))) { | 647 | ip->i_delayed_blks != 0))) { |
641 | error = xfs_free_eofblocks(mp, ip, 0); | 648 | error = xfs_free_eofblocks(mp, ip, 0); |
642 | if (error) | 649 | if (error) |
643 | return VN_INACTIVE_CACHE; | 650 | return VN_INACTIVE_CACHE; |
@@ -670,13 +677,18 @@ xfs_inactive( | |||
670 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 677 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
671 | xfs_trans_ijoin(tp, ip, 0); | 678 | xfs_trans_ijoin(tp, ip, 0); |
672 | 679 | ||
673 | error = xfs_itruncate_data(&tp, ip, 0); | 680 | ip->i_d.di_size = 0; |
681 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | ||
682 | |||
683 | error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); | ||
674 | if (error) { | 684 | if (error) { |
675 | xfs_trans_cancel(tp, | 685 | xfs_trans_cancel(tp, |
676 | XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); | 686 | XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); |
677 | xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); | 687 | xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); |
678 | return VN_INACTIVE_CACHE; | 688 | return VN_INACTIVE_CACHE; |
679 | } | 689 | } |
690 | |||
691 | ASSERT(ip->i_d.di_nextents == 0); | ||
680 | } else if (S_ISLNK(ip->i_d.di_mode)) { | 692 | } else if (S_ISLNK(ip->i_d.di_mode)) { |
681 | 693 | ||
682 | /* | 694 | /* |
@@ -1961,11 +1973,11 @@ xfs_zero_remaining_bytes( | |||
1961 | * since nothing can read beyond eof. The space will | 1973 | * since nothing can read beyond eof. The space will |
1962 | * be zeroed when the file is extended anyway. | 1974 | * be zeroed when the file is extended anyway. |
1963 | */ | 1975 | */ |
1964 | if (startoff >= ip->i_size) | 1976 | if (startoff >= XFS_ISIZE(ip)) |
1965 | return 0; | 1977 | return 0; |
1966 | 1978 | ||
1967 | if (endoff > ip->i_size) | 1979 | if (endoff > XFS_ISIZE(ip)) |
1968 | endoff = ip->i_size; | 1980 | endoff = XFS_ISIZE(ip); |
1969 | 1981 | ||
1970 | bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? | 1982 | bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? |
1971 | mp->m_rtdev_targp : mp->m_ddev_targp, | 1983 | mp->m_rtdev_targp : mp->m_ddev_targp, |
@@ -2260,7 +2272,7 @@ xfs_change_file_space( | |||
2260 | bf->l_start += offset; | 2272 | bf->l_start += offset; |
2261 | break; | 2273 | break; |
2262 | case 2: /*SEEK_END*/ | 2274 | case 2: /*SEEK_END*/ |
2263 | bf->l_start += ip->i_size; | 2275 | bf->l_start += XFS_ISIZE(ip); |
2264 | break; | 2276 | break; |
2265 | default: | 2277 | default: |
2266 | return XFS_ERROR(EINVAL); | 2278 | return XFS_ERROR(EINVAL); |
@@ -2277,7 +2289,7 @@ xfs_change_file_space( | |||
2277 | bf->l_whence = 0; | 2289 | bf->l_whence = 0; |
2278 | 2290 | ||
2279 | startoffset = bf->l_start; | 2291 | startoffset = bf->l_start; |
2280 | fsize = ip->i_size; | 2292 | fsize = XFS_ISIZE(ip); |
2281 | 2293 | ||
2282 | /* | 2294 | /* |
2283 | * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve | 2295 | * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve |
diff --git a/include/linux/audit.h b/include/linux/audit.h index 426ab9f4dd85..9ff7a2c48b50 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h | |||
@@ -26,6 +26,7 @@ | |||
26 | 26 | ||
27 | #include <linux/types.h> | 27 | #include <linux/types.h> |
28 | #include <linux/elf-em.h> | 28 | #include <linux/elf-em.h> |
29 | #include <linux/ptrace.h> | ||
29 | 30 | ||
30 | /* The netlink messages for the audit system are divided into blocks: | 31 | /* The netlink messages for the audit system are divided into blocks: |
31 | * 1000 - 1099 are for commanding the audit system | 32 | * 1000 - 1099 are for commanding the audit system |
@@ -181,6 +182,40 @@ | |||
181 | * AUDIT_UNUSED_BITS is updated if need be. */ | 182 | * AUDIT_UNUSED_BITS is updated if need be. */ |
182 | #define AUDIT_UNUSED_BITS 0x07FFFC00 | 183 | #define AUDIT_UNUSED_BITS 0x07FFFC00 |
183 | 184 | ||
185 | /* AUDIT_FIELD_COMPARE rule list */ | ||
186 | #define AUDIT_COMPARE_UID_TO_OBJ_UID 1 | ||
187 | #define AUDIT_COMPARE_GID_TO_OBJ_GID 2 | ||
188 | #define AUDIT_COMPARE_EUID_TO_OBJ_UID 3 | ||
189 | #define AUDIT_COMPARE_EGID_TO_OBJ_GID 4 | ||
190 | #define AUDIT_COMPARE_AUID_TO_OBJ_UID 5 | ||
191 | #define AUDIT_COMPARE_SUID_TO_OBJ_UID 6 | ||
192 | #define AUDIT_COMPARE_SGID_TO_OBJ_GID 7 | ||
193 | #define AUDIT_COMPARE_FSUID_TO_OBJ_UID 8 | ||
194 | #define AUDIT_COMPARE_FSGID_TO_OBJ_GID 9 | ||
195 | |||
196 | #define AUDIT_COMPARE_UID_TO_AUID 10 | ||
197 | #define AUDIT_COMPARE_UID_TO_EUID 11 | ||
198 | #define AUDIT_COMPARE_UID_TO_FSUID 12 | ||
199 | #define AUDIT_COMPARE_UID_TO_SUID 13 | ||
200 | |||
201 | #define AUDIT_COMPARE_AUID_TO_FSUID 14 | ||
202 | #define AUDIT_COMPARE_AUID_TO_SUID 15 | ||
203 | #define AUDIT_COMPARE_AUID_TO_EUID 16 | ||
204 | |||
205 | #define AUDIT_COMPARE_EUID_TO_SUID 17 | ||
206 | #define AUDIT_COMPARE_EUID_TO_FSUID 18 | ||
207 | |||
208 | #define AUDIT_COMPARE_SUID_TO_FSUID 19 | ||
209 | |||
210 | #define AUDIT_COMPARE_GID_TO_EGID 20 | ||
211 | #define AUDIT_COMPARE_GID_TO_FSGID 21 | ||
212 | #define AUDIT_COMPARE_GID_TO_SGID 22 | ||
213 | |||
214 | #define AUDIT_COMPARE_EGID_TO_FSGID 23 | ||
215 | #define AUDIT_COMPARE_EGID_TO_SGID 24 | ||
216 | #define AUDIT_COMPARE_SGID_TO_FSGID 25 | ||
217 | |||
218 | #define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_SGID_TO_FSGID | ||
184 | 219 | ||
185 | /* Rule fields */ | 220 | /* Rule fields */ |
186 | /* These are useful when checking the | 221 | /* These are useful when checking the |
@@ -222,6 +257,9 @@ | |||
222 | #define AUDIT_PERM 106 | 257 | #define AUDIT_PERM 106 |
223 | #define AUDIT_DIR 107 | 258 | #define AUDIT_DIR 107 |
224 | #define AUDIT_FILETYPE 108 | 259 | #define AUDIT_FILETYPE 108 |
260 | #define AUDIT_OBJ_UID 109 | ||
261 | #define AUDIT_OBJ_GID 110 | ||
262 | #define AUDIT_FIELD_COMPARE 111 | ||
225 | 263 | ||
226 | #define AUDIT_ARG0 200 | 264 | #define AUDIT_ARG0 200 |
227 | #define AUDIT_ARG1 (AUDIT_ARG0+1) | 265 | #define AUDIT_ARG1 (AUDIT_ARG0+1) |
@@ -408,28 +446,24 @@ struct audit_field { | |||
408 | void *lsm_rule; | 446 | void *lsm_rule; |
409 | }; | 447 | }; |
410 | 448 | ||
411 | #define AUDITSC_INVALID 0 | ||
412 | #define AUDITSC_SUCCESS 1 | ||
413 | #define AUDITSC_FAILURE 2 | ||
414 | #define AUDITSC_RESULT(x) ( ((long)(x))<0?AUDITSC_FAILURE:AUDITSC_SUCCESS ) | ||
415 | extern int __init audit_register_class(int class, unsigned *list); | 449 | extern int __init audit_register_class(int class, unsigned *list); |
416 | extern int audit_classify_syscall(int abi, unsigned syscall); | 450 | extern int audit_classify_syscall(int abi, unsigned syscall); |
417 | extern int audit_classify_arch(int arch); | 451 | extern int audit_classify_arch(int arch); |
418 | #ifdef CONFIG_AUDITSYSCALL | 452 | #ifdef CONFIG_AUDITSYSCALL |
419 | /* These are defined in auditsc.c */ | 453 | /* These are defined in auditsc.c */ |
420 | /* Public API */ | 454 | /* Public API */ |
421 | extern void audit_finish_fork(struct task_struct *child); | ||
422 | extern int audit_alloc(struct task_struct *task); | 455 | extern int audit_alloc(struct task_struct *task); |
423 | extern void audit_free(struct task_struct *task); | 456 | extern void __audit_free(struct task_struct *task); |
424 | extern void audit_syscall_entry(int arch, | 457 | extern void __audit_syscall_entry(int arch, |
425 | int major, unsigned long a0, unsigned long a1, | 458 | int major, unsigned long a0, unsigned long a1, |
426 | unsigned long a2, unsigned long a3); | 459 | unsigned long a2, unsigned long a3); |
427 | extern void audit_syscall_exit(int failed, long return_code); | 460 | extern void __audit_syscall_exit(int ret_success, long ret_value); |
428 | extern void __audit_getname(const char *name); | 461 | extern void __audit_getname(const char *name); |
429 | extern void audit_putname(const char *name); | 462 | extern void audit_putname(const char *name); |
430 | extern void __audit_inode(const char *name, const struct dentry *dentry); | 463 | extern void __audit_inode(const char *name, const struct dentry *dentry); |
431 | extern void __audit_inode_child(const struct dentry *dentry, | 464 | extern void __audit_inode_child(const struct dentry *dentry, |
432 | const struct inode *parent); | 465 | const struct inode *parent); |
466 | extern void __audit_seccomp(unsigned long syscall); | ||
433 | extern void __audit_ptrace(struct task_struct *t); | 467 | extern void __audit_ptrace(struct task_struct *t); |
434 | 468 | ||
435 | static inline int audit_dummy_context(void) | 469 | static inline int audit_dummy_context(void) |
@@ -437,6 +471,27 @@ static inline int audit_dummy_context(void) | |||
437 | void *p = current->audit_context; | 471 | void *p = current->audit_context; |
438 | return !p || *(int *)p; | 472 | return !p || *(int *)p; |
439 | } | 473 | } |
474 | static inline void audit_free(struct task_struct *task) | ||
475 | { | ||
476 | if (unlikely(task->audit_context)) | ||
477 | __audit_free(task); | ||
478 | } | ||
479 | static inline void audit_syscall_entry(int arch, int major, unsigned long a0, | ||
480 | unsigned long a1, unsigned long a2, | ||
481 | unsigned long a3) | ||
482 | { | ||
483 | if (unlikely(!audit_dummy_context())) | ||
484 | __audit_syscall_entry(arch, major, a0, a1, a2, a3); | ||
485 | } | ||
486 | static inline void audit_syscall_exit(void *pt_regs) /* pass the finished syscall's success flag and return value to __audit_syscall_exit(); no-op when current has no audit context */ | |
487 | { | |
488 | if (unlikely(current->audit_context)) { | |
489 | int success = is_syscall_success(pt_regs); | |
490 | long return_code = regs_return_value(pt_regs); /* long, not int: __audit_syscall_exit() takes a long, so do not truncate wide return values */ | |
491 | |
492 | __audit_syscall_exit(success, return_code); | |
493 | } | |
494 | } | |
440 | static inline void audit_getname(const char *name) | 495 | static inline void audit_getname(const char *name) |
441 | { | 496 | { |
442 | if (unlikely(!audit_dummy_context())) | 497 | if (unlikely(!audit_dummy_context())) |
@@ -453,6 +508,12 @@ static inline void audit_inode_child(const struct dentry *dentry, | |||
453 | } | 508 | } |
454 | void audit_core_dumps(long signr); | 509 | void audit_core_dumps(long signr); |
455 | 510 | ||
511 | static inline void audit_seccomp(unsigned long syscall) | ||
512 | { | ||
513 | if (unlikely(!audit_dummy_context())) | ||
514 | __audit_seccomp(syscall); | ||
515 | } | ||
516 | |||
456 | static inline void audit_ptrace(struct task_struct *t) | 517 | static inline void audit_ptrace(struct task_struct *t) |
457 | { | 518 | { |
458 | if (unlikely(!audit_dummy_context())) | 519 | if (unlikely(!audit_dummy_context())) |
@@ -463,17 +524,16 @@ static inline void audit_ptrace(struct task_struct *t) | |||
463 | extern unsigned int audit_serial(void); | 524 | extern unsigned int audit_serial(void); |
464 | extern int auditsc_get_stamp(struct audit_context *ctx, | 525 | extern int auditsc_get_stamp(struct audit_context *ctx, |
465 | struct timespec *t, unsigned int *serial); | 526 | struct timespec *t, unsigned int *serial); |
466 | extern int audit_set_loginuid(struct task_struct *task, uid_t loginuid); | 527 | extern int audit_set_loginuid(uid_t loginuid); |
467 | #define audit_get_loginuid(t) ((t)->loginuid) | 528 | #define audit_get_loginuid(t) ((t)->loginuid) |
468 | #define audit_get_sessionid(t) ((t)->sessionid) | 529 | #define audit_get_sessionid(t) ((t)->sessionid) |
469 | extern void audit_log_task_context(struct audit_buffer *ab); | 530 | extern void audit_log_task_context(struct audit_buffer *ab); |
470 | extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); | 531 | extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); |
471 | extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); | 532 | extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); |
472 | extern int audit_bprm(struct linux_binprm *bprm); | 533 | extern int __audit_bprm(struct linux_binprm *bprm); |
473 | extern void audit_socketcall(int nargs, unsigned long *args); | 534 | extern void __audit_socketcall(int nargs, unsigned long *args); |
474 | extern int audit_sockaddr(int len, void *addr); | 535 | extern int __audit_sockaddr(int len, void *addr); |
475 | extern void __audit_fd_pair(int fd1, int fd2); | 536 | extern void __audit_fd_pair(int fd1, int fd2); |
476 | extern int audit_set_macxattr(const char *name); | ||
477 | extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr); | 537 | extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr); |
478 | extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout); | 538 | extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout); |
479 | extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification); | 539 | extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification); |
@@ -499,6 +559,23 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid | |||
499 | if (unlikely(!audit_dummy_context())) | 559 | if (unlikely(!audit_dummy_context())) |
500 | __audit_ipc_set_perm(qbytes, uid, gid, mode); | 560 | __audit_ipc_set_perm(qbytes, uid, gid, mode); |
501 | } | 561 | } |
562 | static inline int audit_bprm(struct linux_binprm *bprm) | ||
563 | { | ||
564 | if (unlikely(!audit_dummy_context())) | ||
565 | return __audit_bprm(bprm); | ||
566 | return 0; | ||
567 | } | ||
568 | static inline void audit_socketcall(int nargs, unsigned long *args) | ||
569 | { | ||
570 | if (unlikely(!audit_dummy_context())) | ||
571 | __audit_socketcall(nargs, args); | ||
572 | } | ||
573 | static inline int audit_sockaddr(int len, void *addr) | ||
574 | { | ||
575 | if (unlikely(!audit_dummy_context())) | ||
576 | return __audit_sockaddr(len, addr); | ||
577 | return 0; | ||
578 | } | ||
502 | static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) | 579 | static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) |
503 | { | 580 | { |
504 | if (unlikely(!audit_dummy_context())) | 581 | if (unlikely(!audit_dummy_context())) |
@@ -544,12 +621,11 @@ static inline void audit_mmap_fd(int fd, int flags) | |||
544 | 621 | ||
545 | extern int audit_n_rules; | 622 | extern int audit_n_rules; |
546 | extern int audit_signals; | 623 | extern int audit_signals; |
547 | #else | 624 | #else /* CONFIG_AUDITSYSCALL */ |
548 | #define audit_finish_fork(t) | ||
549 | #define audit_alloc(t) ({ 0; }) | 625 | #define audit_alloc(t) ({ 0; }) |
550 | #define audit_free(t) do { ; } while (0) | 626 | #define audit_free(t) do { ; } while (0) |
551 | #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0) | 627 | #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0) |
552 | #define audit_syscall_exit(f,r) do { ; } while (0) | 628 | #define audit_syscall_exit(r) do { ; } while (0) |
553 | #define audit_dummy_context() 1 | 629 | #define audit_dummy_context() 1 |
554 | #define audit_getname(n) do { ; } while (0) | 630 | #define audit_getname(n) do { ; } while (0) |
555 | #define audit_putname(n) do { ; } while (0) | 631 | #define audit_putname(n) do { ; } while (0) |
@@ -558,6 +634,7 @@ extern int audit_signals; | |||
558 | #define audit_inode(n,d) do { (void)(d); } while (0) | 634 | #define audit_inode(n,d) do { (void)(d); } while (0) |
559 | #define audit_inode_child(i,p) do { ; } while (0) | 635 | #define audit_inode_child(i,p) do { ; } while (0) |
560 | #define audit_core_dumps(i) do { ; } while (0) | 636 | #define audit_core_dumps(i) do { ; } while (0) |
637 | #define audit_seccomp(i) do { ; } while (0) | ||
561 | #define auditsc_get_stamp(c,t,s) (0) | 638 | #define auditsc_get_stamp(c,t,s) (0) |
562 | #define audit_get_loginuid(t) (-1) | 639 | #define audit_get_loginuid(t) (-1) |
563 | #define audit_get_sessionid(t) (-1) | 640 | #define audit_get_sessionid(t) (-1) |
@@ -568,7 +645,6 @@ extern int audit_signals; | |||
568 | #define audit_socketcall(n,a) ((void)0) | 645 | #define audit_socketcall(n,a) ((void)0) |
569 | #define audit_fd_pair(n,a) ((void)0) | 646 | #define audit_fd_pair(n,a) ((void)0) |
570 | #define audit_sockaddr(len, addr) ({ 0; }) | 647 | #define audit_sockaddr(len, addr) ({ 0; }) |
571 | #define audit_set_macxattr(n) do { ; } while (0) | ||
572 | #define audit_mq_open(o,m,a) ((void)0) | 648 | #define audit_mq_open(o,m,a) ((void)0) |
573 | #define audit_mq_sendrecv(d,l,p,t) ((void)0) | 649 | #define audit_mq_sendrecv(d,l,p,t) ((void)0) |
574 | #define audit_mq_notify(d,n) ((void)0) | 650 | #define audit_mq_notify(d,n) ((void)0) |
@@ -579,7 +655,7 @@ extern int audit_signals; | |||
579 | #define audit_ptrace(t) ((void)0) | 655 | #define audit_ptrace(t) ((void)0) |
580 | #define audit_n_rules 0 | 656 | #define audit_n_rules 0 |
581 | #define audit_signals 0 | 657 | #define audit_signals 0 |
582 | #endif | 658 | #endif /* CONFIG_AUDITSYSCALL */ |
583 | 659 | ||
584 | #ifdef CONFIG_AUDIT | 660 | #ifdef CONFIG_AUDIT |
585 | /* These are defined in audit.c */ | 661 | /* These are defined in audit.c */ |
diff --git a/include/linux/kref.h b/include/linux/kref.h index abc0120b09b7..9c07dcebded7 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h | |||
@@ -17,6 +17,7 @@ | |||
17 | 17 | ||
18 | #include <linux/bug.h> | 18 | #include <linux/bug.h> |
19 | #include <linux/atomic.h> | 19 | #include <linux/atomic.h> |
20 | #include <linux/kernel.h> | ||
20 | 21 | ||
21 | struct kref { | 22 | struct kref { |
22 | atomic_t refcount; | 23 | atomic_t refcount; |
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index a27e56ca41a4..c2f1f6a5fcb8 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h | |||
@@ -112,6 +112,7 @@ | |||
112 | 112 | ||
113 | #include <linux/compiler.h> /* For unlikely. */ | 113 | #include <linux/compiler.h> /* For unlikely. */ |
114 | #include <linux/sched.h> /* For struct task_struct. */ | 114 | #include <linux/sched.h> /* For struct task_struct. */ |
115 | #include <linux/err.h> /* for IS_ERR_VALUE */ | ||
115 | 116 | ||
116 | 117 | ||
117 | extern long arch_ptrace(struct task_struct *child, long request, | 118 | extern long arch_ptrace(struct task_struct *child, long request, |
@@ -266,6 +267,15 @@ static inline void ptrace_release_task(struct task_struct *task) | |||
266 | #define force_successful_syscall_return() do { } while (0) | 267 | #define force_successful_syscall_return() do { } while (0) |
267 | #endif | 268 | #endif |
268 | 269 | ||
270 | #ifndef is_syscall_success | ||
271 | /* | ||
272 | * On most systems a syscall succeeded if its return value is not an error | ||
273 | * value. Some systems, such as ia64 and powerpc, use different indicators | ||
274 | * of success/failure and must define their own is_syscall_success(). | ||
275 | */ | ||
276 | #define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs)))) | ||
277 | #endif | ||
278 | |||
269 | /* | 279 | /* |
270 | * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__. | 280 | * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__. |
271 | * | 281 | * |
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index ecdaeb98b293..5cf685086dd3 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h | |||
@@ -312,7 +312,6 @@ struct tty_driver { | |||
312 | */ | 312 | */ |
313 | struct tty_struct **ttys; | 313 | struct tty_struct **ttys; |
314 | struct ktermios **termios; | 314 | struct ktermios **termios; |
315 | struct ktermios **termios_locked; | ||
316 | void *driver_state; | 315 | void *driver_state; |
317 | 316 | ||
318 | /* | 317 | /* |
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index b31702ac15be..84f3001a568d 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h | |||
@@ -16,6 +16,8 @@ struct btrfs_delayed_ref_node; | |||
16 | struct btrfs_delayed_tree_ref; | 16 | struct btrfs_delayed_tree_ref; |
17 | struct btrfs_delayed_data_ref; | 17 | struct btrfs_delayed_data_ref; |
18 | struct btrfs_delayed_ref_head; | 18 | struct btrfs_delayed_ref_head; |
19 | struct btrfs_block_group_cache; | ||
20 | struct btrfs_free_cluster; | ||
19 | struct map_lookup; | 21 | struct map_lookup; |
20 | struct extent_buffer; | 22 | struct extent_buffer; |
21 | 23 | ||
@@ -44,6 +46,17 @@ struct extent_buffer; | |||
44 | obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ | 46 | obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ |
45 | (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" | 47 | (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" |
46 | 48 | ||
49 | #define BTRFS_GROUP_FLAGS \ | ||
50 | { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ | ||
51 | { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \ | ||
52 | { BTRFS_BLOCK_GROUP_METADATA, "METADATA"}, \ | ||
53 | { BTRFS_BLOCK_GROUP_RAID0, "RAID0"}, \ | ||
54 | { BTRFS_BLOCK_GROUP_RAID1, "RAID1"}, \ | ||
55 | { BTRFS_BLOCK_GROUP_DUP, "DUP"}, \ | ||
56 | { BTRFS_BLOCK_GROUP_RAID10, "RAID10"} | ||
57 | |||
58 | #define BTRFS_UUID_SIZE 16 | ||
59 | |||
47 | TRACE_EVENT(btrfs_transaction_commit, | 60 | TRACE_EVENT(btrfs_transaction_commit, |
48 | 61 | ||
49 | TP_PROTO(struct btrfs_root *root), | 62 | TP_PROTO(struct btrfs_root *root), |
@@ -621,6 +634,34 @@ TRACE_EVENT(btrfs_cow_block, | |||
621 | __entry->cow_level) | 634 | __entry->cow_level) |
622 | ); | 635 | ); |
623 | 636 | ||
637 | TRACE_EVENT(btrfs_space_reservation, | ||
638 | |||
639 | TP_PROTO(struct btrfs_fs_info *fs_info, char *type, u64 val, | ||
640 | u64 bytes, int reserve), | ||
641 | |||
642 | TP_ARGS(fs_info, type, val, bytes, reserve), | ||
643 | |||
644 | TP_STRUCT__entry( | ||
645 | __array( u8, fsid, BTRFS_UUID_SIZE ) | ||
646 | __string( type, type ) | ||
647 | __field( u64, val ) | ||
648 | __field( u64, bytes ) | ||
649 | __field( int, reserve ) | ||
650 | ), | ||
651 | |||
652 | TP_fast_assign( | ||
653 | memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE); | ||
654 | __assign_str(type, type); | ||
655 | __entry->val = val; | ||
656 | __entry->bytes = bytes; | ||
657 | __entry->reserve = reserve; | ||
658 | ), | ||
659 | |||
660 | TP_printk("%pU: %s: %Lu %s %Lu", __entry->fsid, __get_str(type), | ||
661 | __entry->val, __entry->reserve ? "reserve" : "release", | ||
662 | __entry->bytes) | ||
663 | ); | ||
664 | |||
624 | DECLARE_EVENT_CLASS(btrfs__reserved_extent, | 665 | DECLARE_EVENT_CLASS(btrfs__reserved_extent, |
625 | 666 | ||
626 | TP_PROTO(struct btrfs_root *root, u64 start, u64 len), | 667 | TP_PROTO(struct btrfs_root *root, u64 start, u64 len), |
@@ -659,6 +700,168 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, | |||
659 | TP_ARGS(root, start, len) | 700 | TP_ARGS(root, start, len) |
660 | ); | 701 | ); |
661 | 702 | ||
703 | TRACE_EVENT(find_free_extent, | ||
704 | |||
705 | TP_PROTO(struct btrfs_root *root, u64 num_bytes, u64 empty_size, | ||
706 | u64 data), | ||
707 | |||
708 | TP_ARGS(root, num_bytes, empty_size, data), | ||
709 | |||
710 | TP_STRUCT__entry( | ||
711 | __field( u64, root_objectid ) | ||
712 | __field( u64, num_bytes ) | ||
713 | __field( u64, empty_size ) | ||
714 | __field( u64, data ) | ||
715 | ), | ||
716 | |||
717 | TP_fast_assign( | ||
718 | __entry->root_objectid = root->root_key.objectid; | ||
719 | __entry->num_bytes = num_bytes; | ||
720 | __entry->empty_size = empty_size; | ||
721 | __entry->data = data; | ||
722 | ), | ||
723 | |||
724 | TP_printk("root = %Lu(%s), len = %Lu, empty_size = %Lu, " | ||
725 | "flags = %Lu(%s)", show_root_type(__entry->root_objectid), | ||
726 | __entry->num_bytes, __entry->empty_size, __entry->data, | ||
727 | __print_flags((unsigned long)__entry->data, "|", | ||
728 | BTRFS_GROUP_FLAGS)) | ||
729 | ); | ||
730 | |||
731 | DECLARE_EVENT_CLASS(btrfs__reserve_extent, | ||
732 | |||
733 | TP_PROTO(struct btrfs_root *root, | ||
734 | struct btrfs_block_group_cache *block_group, u64 start, | ||
735 | u64 len), | ||
736 | |||
737 | TP_ARGS(root, block_group, start, len), | ||
738 | |||
739 | TP_STRUCT__entry( | ||
740 | __field( u64, root_objectid ) | ||
741 | __field( u64, bg_objectid ) | ||
742 | __field( u64, flags ) | ||
743 | __field( u64, start ) | ||
744 | __field( u64, len ) | ||
745 | ), | ||
746 | |||
747 | TP_fast_assign( | ||
748 | __entry->root_objectid = root->root_key.objectid; | ||
749 | __entry->bg_objectid = block_group->key.objectid; | ||
750 | __entry->flags = block_group->flags; | ||
751 | __entry->start = start; | ||
752 | __entry->len = len; | ||
753 | ), | ||
754 | |||
755 | TP_printk("root = %Lu(%s), block_group = %Lu, flags = %Lu(%s), " | ||
756 | "start = %Lu, len = %Lu", | ||
757 | show_root_type(__entry->root_objectid), __entry->bg_objectid, | ||
758 | __entry->flags, __print_flags((unsigned long)__entry->flags, | ||
759 | "|", BTRFS_GROUP_FLAGS), | ||
760 | __entry->start, __entry->len) | ||
761 | ); | ||
762 | |||
763 | DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, | ||
764 | |||
765 | TP_PROTO(struct btrfs_root *root, | ||
766 | struct btrfs_block_group_cache *block_group, u64 start, | ||
767 | u64 len), | ||
768 | |||
769 | TP_ARGS(root, block_group, start, len) | ||
770 | ); | ||
771 | |||
772 | DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, | ||
773 | |||
774 | TP_PROTO(struct btrfs_root *root, | ||
775 | struct btrfs_block_group_cache *block_group, u64 start, | ||
776 | u64 len), | ||
777 | |||
778 | TP_ARGS(root, block_group, start, len) | ||
779 | ); | ||
780 | |||
781 | TRACE_EVENT(btrfs_find_cluster, | ||
782 | |||
783 | TP_PROTO(struct btrfs_block_group_cache *block_group, u64 start, | ||
784 | u64 bytes, u64 empty_size, u64 min_bytes), | ||
785 | |||
786 | TP_ARGS(block_group, start, bytes, empty_size, min_bytes), | ||
787 | |||
788 | TP_STRUCT__entry( | ||
789 | __field( u64, bg_objectid ) | ||
790 | __field( u64, flags ) | ||
791 | __field( u64, start ) | ||
792 | __field( u64, bytes ) | ||
793 | __field( u64, empty_size ) | ||
794 | __field( u64, min_bytes ) | ||
795 | ), | ||
796 | |||
797 | TP_fast_assign( | ||
798 | __entry->bg_objectid = block_group->key.objectid; | ||
799 | __entry->flags = block_group->flags; | ||
800 | __entry->start = start; | ||
801 | __entry->bytes = bytes; | ||
802 | __entry->empty_size = empty_size; | ||
803 | __entry->min_bytes = min_bytes; | ||
804 | ), | ||
805 | |||
806 | TP_printk("block_group = %Lu, flags = %Lu(%s), start = %Lu, len = %Lu," | ||
807 | " empty_size = %Lu, min_bytes = %Lu", __entry->bg_objectid, | ||
808 | __entry->flags, | ||
809 | __print_flags((unsigned long)__entry->flags, "|", | ||
810 | BTRFS_GROUP_FLAGS), __entry->start, | ||
811 | __entry->bytes, __entry->empty_size, __entry->min_bytes) | ||
812 | ); | ||
813 | |||
814 | TRACE_EVENT(btrfs_failed_cluster_setup, | ||
815 | |||
816 | TP_PROTO(struct btrfs_block_group_cache *block_group), | ||
817 | |||
818 | TP_ARGS(block_group), | ||
819 | |||
820 | TP_STRUCT__entry( | ||
821 | __field( u64, bg_objectid ) | ||
822 | ), | ||
823 | |||
824 | TP_fast_assign( | ||
825 | __entry->bg_objectid = block_group->key.objectid; | ||
826 | ), | ||
827 | |||
828 | TP_printk("block_group = %Lu", __entry->bg_objectid) | ||
829 | ); | ||
830 | |||
831 | TRACE_EVENT(btrfs_setup_cluster, | ||
832 | |||
833 | TP_PROTO(struct btrfs_block_group_cache *block_group, | ||
834 | struct btrfs_free_cluster *cluster, u64 size, int bitmap), | ||
835 | |||
836 | TP_ARGS(block_group, cluster, size, bitmap), | ||
837 | |||
838 | TP_STRUCT__entry( | ||
839 | __field( u64, bg_objectid ) | ||
840 | __field( u64, flags ) | ||
841 | __field( u64, start ) | ||
842 | __field( u64, max_size ) | ||
843 | __field( u64, size ) | ||
844 | __field( int, bitmap ) | ||
845 | ), | ||
846 | |||
847 | TP_fast_assign( | ||
848 | __entry->bg_objectid = block_group->key.objectid; | ||
849 | __entry->flags = block_group->flags; | ||
850 | __entry->start = cluster->window_start; | ||
851 | __entry->max_size = cluster->max_size; | ||
852 | __entry->size = size; | ||
853 | __entry->bitmap = bitmap; | ||
854 | ), | ||
855 | |||
856 | TP_printk("block_group = %Lu, flags = %Lu(%s), window_start = %Lu, " | ||
857 | "size = %Lu, max_size = %Lu, bitmap = %d", | ||
858 | __entry->bg_objectid, | ||
859 | __entry->flags, | ||
860 | __print_flags((unsigned long)__entry->flags, "|", | ||
861 | BTRFS_GROUP_FLAGS), __entry->start, | ||
862 | __entry->size, __entry->max_size, __entry->bitmap) | ||
863 | ); | ||
864 | |||
662 | #endif /* _TRACE_BTRFS_H */ | 865 | #endif /* _TRACE_BTRFS_H */ |
663 | 866 | ||
664 | /* This part must be outside protection */ | 867 | /* This part must be outside protection */ |
diff --git a/init/Kconfig b/init/Kconfig index 6ac2236244c3..3f42cd66f0f8 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -355,7 +355,7 @@ config AUDIT | |||
355 | 355 | ||
356 | config AUDITSYSCALL | 356 | config AUDITSYSCALL |
357 | bool "Enable system-call auditing support" | 357 | bool "Enable system-call auditing support" |
358 | depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH) | 358 | depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || ARM) |
359 | default y if SECURITY_SELINUX | 359 | default y if SECURITY_SELINUX |
360 | help | 360 | help |
361 | Enable low-overhead system-call auditing infrastructure that | 361 | Enable low-overhead system-call auditing infrastructure that |
@@ -372,6 +372,20 @@ config AUDIT_TREE | |||
372 | depends on AUDITSYSCALL | 372 | depends on AUDITSYSCALL |
373 | select FSNOTIFY | 373 | select FSNOTIFY |
374 | 374 | ||
375 | config AUDIT_LOGINUID_IMMUTABLE | ||
376 | bool "Make audit loginuid immutable" | ||
377 | depends on AUDIT | ||
378 | help | ||
379 | The config option toggles if a task setting its loginuid requires | ||
380 | CAP_AUDIT_CONTROL or if that task should require no special permissions | ||
381 | but should instead only allow setting its loginuid if it was never | ||
382 | previously set. On systems which use systemd or a similar central | ||
383 | process to restart login services this should be set to true. On older | ||
384 | systems in which an admin would typically have to directly stop and | ||
385 | start processes this should be set to false. Setting this to true allows | ||
386 | one to drop potentially dangerous capabilities from the login tasks, | ||
387 | but may not be backwards compatible with older init systems. | ||
388 | |||
375 | source "kernel/irq/Kconfig" | 389 | source "kernel/irq/Kconfig" |
376 | 390 | ||
377 | menu "RCU Subsystem" | 391 | menu "RCU Subsystem" |
diff --git a/kernel/audit.c b/kernel/audit.c index 57e3f5107937..bb0eb5bb9a0a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
@@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, | |||
631 | } | 631 | } |
632 | 632 | ||
633 | *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); | 633 | *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); |
634 | audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", | 634 | audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", |
635 | pid, uid, auid, ses); | 635 | pid, uid, auid, ses); |
636 | if (sid) { | 636 | if (sid) { |
637 | rc = security_secid_to_secctx(sid, &ctx, &len); | 637 | rc = security_secid_to_secctx(sid, &ctx, &len); |
@@ -1423,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, | |||
1423 | char *p, *pathname; | 1423 | char *p, *pathname; |
1424 | 1424 | ||
1425 | if (prefix) | 1425 | if (prefix) |
1426 | audit_log_format(ab, " %s", prefix); | 1426 | audit_log_format(ab, "%s", prefix); |
1427 | 1427 | ||
1428 | /* We will allow 11 spaces for ' (deleted)' to be appended */ | 1428 | /* We will allow 11 spaces for ' (deleted)' to be appended */ |
1429 | pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); | 1429 | pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); |
diff --git a/kernel/audit.h b/kernel/audit.h index 91e7071c4d2c..816766803371 100644 --- a/kernel/audit.h +++ b/kernel/audit.h | |||
@@ -36,12 +36,8 @@ enum audit_state { | |||
36 | AUDIT_DISABLED, /* Do not create per-task audit_context. | 36 | AUDIT_DISABLED, /* Do not create per-task audit_context. |
37 | * No syscall-specific audit records can | 37 | * No syscall-specific audit records can |
38 | * be generated. */ | 38 | * be generated. */ |
39 | AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, | ||
40 | * but don't necessarily fill it in at | ||
41 | * syscall entry time (i.e., filter | ||
42 | * instead). */ | ||
43 | AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, | 39 | AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, |
44 | * and always fill it in at syscall | 40 | * and fill it in at syscall |
45 | * entry time. This makes a full | 41 | * entry time. This makes a full |
46 | * syscall record available if some | 42 | * syscall record available if some |
47 | * other part of the kernel decides it | 43 | * other part of the kernel decides it |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f8277c80d678..a6c3f1abd206 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c | |||
@@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) | |||
235 | switch(listnr) { | 235 | switch(listnr) { |
236 | default: | 236 | default: |
237 | goto exit_err; | 237 | goto exit_err; |
238 | case AUDIT_FILTER_USER: | ||
239 | case AUDIT_FILTER_TYPE: | ||
240 | #ifdef CONFIG_AUDITSYSCALL | 238 | #ifdef CONFIG_AUDITSYSCALL |
241 | case AUDIT_FILTER_ENTRY: | 239 | case AUDIT_FILTER_ENTRY: |
240 | if (rule->action == AUDIT_ALWAYS) | ||
241 | goto exit_err; | ||
242 | case AUDIT_FILTER_EXIT: | 242 | case AUDIT_FILTER_EXIT: |
243 | case AUDIT_FILTER_TASK: | 243 | case AUDIT_FILTER_TASK: |
244 | #endif | 244 | #endif |
245 | case AUDIT_FILTER_USER: | ||
246 | case AUDIT_FILTER_TYPE: | ||
245 | ; | 247 | ; |
246 | } | 248 | } |
247 | if (unlikely(rule->action == AUDIT_POSSIBLE)) { | 249 | if (unlikely(rule->action == AUDIT_POSSIBLE)) { |
@@ -385,7 +387,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | |||
385 | goto exit_free; | 387 | goto exit_free; |
386 | break; | 388 | break; |
387 | case AUDIT_FILETYPE: | 389 | case AUDIT_FILETYPE: |
388 | if ((f->val & ~S_IFMT) > S_IFMT) | 390 | if (f->val & ~S_IFMT) |
389 | goto exit_free; | 391 | goto exit_free; |
390 | break; | 392 | break; |
391 | case AUDIT_INODE: | 393 | case AUDIT_INODE: |
@@ -459,6 +461,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
459 | case AUDIT_ARG1: | 461 | case AUDIT_ARG1: |
460 | case AUDIT_ARG2: | 462 | case AUDIT_ARG2: |
461 | case AUDIT_ARG3: | 463 | case AUDIT_ARG3: |
464 | case AUDIT_OBJ_UID: | ||
465 | case AUDIT_OBJ_GID: | ||
462 | break; | 466 | break; |
463 | case AUDIT_ARCH: | 467 | case AUDIT_ARCH: |
464 | entry->rule.arch_f = f; | 468 | entry->rule.arch_f = f; |
@@ -522,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
522 | goto exit_free; | 526 | goto exit_free; |
523 | break; | 527 | break; |
524 | case AUDIT_FILTERKEY: | 528 | case AUDIT_FILTERKEY: |
525 | err = -EINVAL; | ||
526 | if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) | 529 | if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) |
527 | goto exit_free; | 530 | goto exit_free; |
528 | str = audit_unpack_string(&bufp, &remain, f->val); | 531 | str = audit_unpack_string(&bufp, &remain, f->val); |
@@ -536,7 +539,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
536 | goto exit_free; | 539 | goto exit_free; |
537 | break; | 540 | break; |
538 | case AUDIT_FILETYPE: | 541 | case AUDIT_FILETYPE: |
539 | if ((f->val & ~S_IFMT) > S_IFMT) | 542 | if (f->val & ~S_IFMT) |
543 | goto exit_free; | ||
544 | break; | ||
545 | case AUDIT_FIELD_COMPARE: | ||
546 | if (f->val > AUDIT_MAX_FIELD_COMPARE) | ||
540 | goto exit_free; | 547 | goto exit_free; |
541 | break; | 548 | break; |
542 | default: | 549 | default: |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e7fe2b0d29b3..caaea6e944f8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -70,9 +70,15 @@ | |||
70 | 70 | ||
71 | #include "audit.h" | 71 | #include "audit.h" |
72 | 72 | ||
73 | /* flags stating the success for a syscall */ | ||
74 | #define AUDITSC_INVALID 0 | ||
75 | #define AUDITSC_SUCCESS 1 | ||
76 | #define AUDITSC_FAILURE 2 | ||
77 | |||
73 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context | 78 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context |
74 | * for saving names from getname(). */ | 79 | * for saving names from getname(). If we get more names we will allocate |
75 | #define AUDIT_NAMES 20 | 80 | * a name dynamically and also add those to the list anchored by names_list. */ |
81 | #define AUDIT_NAMES 5 | ||
76 | 82 | ||
77 | /* Indicates that audit should log the full pathname. */ | 83 | /* Indicates that audit should log the full pathname. */ |
78 | #define AUDIT_NAME_FULL -1 | 84 | #define AUDIT_NAME_FULL -1 |
@@ -101,9 +107,8 @@ struct audit_cap_data { | |||
101 | * | 107 | * |
102 | * Further, in fs/namei.c:path_lookup() we store the inode and device. */ | 108 | * Further, in fs/namei.c:path_lookup() we store the inode and device. */ |
103 | struct audit_names { | 109 | struct audit_names { |
110 | struct list_head list; /* audit_context->names_list */ | ||
104 | const char *name; | 111 | const char *name; |
105 | int name_len; /* number of name's characters to log */ | ||
106 | unsigned name_put; /* call __putname() for this name */ | ||
107 | unsigned long ino; | 112 | unsigned long ino; |
108 | dev_t dev; | 113 | dev_t dev; |
109 | umode_t mode; | 114 | umode_t mode; |
@@ -113,6 +118,14 @@ struct audit_names { | |||
113 | u32 osid; | 118 | u32 osid; |
114 | struct audit_cap_data fcap; | 119 | struct audit_cap_data fcap; |
115 | unsigned int fcap_ver; | 120 | unsigned int fcap_ver; |
121 | int name_len; /* number of name's characters to log */ | ||
122 | bool name_put; /* call __putname() for this name */ | ||
123 | /* | ||
124 | * This was an allocated audit_names and not from the array of | ||
125 | * names allocated in the task audit context. Thus this name | ||
126 | * should be freed on syscall exit | ||
127 | */ | ||
128 | bool should_free; | ||
116 | }; | 129 | }; |
117 | 130 | ||
118 | struct audit_aux_data { | 131 | struct audit_aux_data { |
@@ -174,8 +187,17 @@ struct audit_context { | |||
174 | long return_code;/* syscall return code */ | 187 | long return_code;/* syscall return code */ |
175 | u64 prio; | 188 | u64 prio; |
176 | int return_valid; /* return code is valid */ | 189 | int return_valid; /* return code is valid */ |
177 | int name_count; | 190 | /* |
178 | struct audit_names names[AUDIT_NAMES]; | 191 | * The names_list is the list of all audit_names collected during this |
192 | * syscall. The first AUDIT_NAMES entries in the names_list will | ||
193 | * actually be from the preallocated_names array for performance | ||
194 | * reasons. Except during allocation they should never be referenced | ||
195 | * through the preallocated_names array and should only be found/used | ||
196 | * by running the names_list. | ||
197 | */ | ||
198 | struct audit_names preallocated_names[AUDIT_NAMES]; | ||
199 | int name_count; /* total records in names_list */ | ||
200 | struct list_head names_list; /* anchor for struct audit_names->list */ | ||
179 | char * filterkey; /* key for rule that triggered record */ | 201 | char * filterkey; /* key for rule that triggered record */ |
180 | struct path pwd; | 202 | struct path pwd; |
181 | struct audit_context *previous; /* For nested syscalls */ | 203 | struct audit_context *previous; /* For nested syscalls */ |
@@ -305,21 +327,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask) | |||
305 | } | 327 | } |
306 | } | 328 | } |
307 | 329 | ||
308 | static int audit_match_filetype(struct audit_context *ctx, int which) | 330 | static int audit_match_filetype(struct audit_context *ctx, int val) |
309 | { | 331 | { |
310 | unsigned index = which & ~S_IFMT; | 332 | struct audit_names *n; |
311 | umode_t mode = which & S_IFMT; | 333 | umode_t mode = (umode_t)val; |
312 | 334 | ||
313 | if (unlikely(!ctx)) | 335 | if (unlikely(!ctx)) |
314 | return 0; | 336 | return 0; |
315 | 337 | ||
316 | if (index >= ctx->name_count) | 338 | list_for_each_entry(n, &ctx->names_list, list) { |
317 | return 0; | 339 | if ((n->ino != -1) && |
318 | if (ctx->names[index].ino == -1) | 340 | ((n->mode & S_IFMT) == mode)) |
319 | return 0; | 341 | return 1; |
320 | if ((ctx->names[index].mode ^ mode) & S_IFMT) | 342 | } |
321 | return 0; | 343 | |
322 | return 1; | 344 | return 0; |
323 | } | 345 | } |
324 | 346 | ||
325 | /* | 347 | /* |
@@ -441,6 +463,134 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) | |||
441 | return 0; | 463 | return 0; |
442 | } | 464 | } |
443 | 465 | ||
466 | static int audit_compare_id(uid_t uid1, | ||
467 | struct audit_names *name, | ||
468 | unsigned long name_offset, | ||
469 | struct audit_field *f, | ||
470 | struct audit_context *ctx) | ||
471 | { | ||
472 | struct audit_names *n; | ||
473 | unsigned long addr; | ||
474 | uid_t uid2; | ||
475 | int rc; | ||
476 | |||
477 | BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); | ||
478 | |||
479 | if (name) { | ||
480 | addr = (unsigned long)name; | ||
481 | addr += name_offset; | ||
482 | |||
483 | uid2 = *(uid_t *)addr; | ||
484 | rc = audit_comparator(uid1, f->op, uid2); | ||
485 | if (rc) | ||
486 | return rc; | ||
487 | } | ||
488 | |||
489 | if (ctx) { | ||
490 | list_for_each_entry(n, &ctx->names_list, list) { | ||
491 | addr = (unsigned long)n; | ||
492 | addr += name_offset; | ||
493 | |||
494 | uid2 = *(uid_t *)addr; | ||
495 | |||
496 | rc = audit_comparator(uid1, f->op, uid2); | ||
497 | if (rc) | ||
498 | return rc; | ||
499 | } | ||
500 | } | ||
501 | return 0; | ||
502 | } | ||
503 | |||
504 | static int audit_field_compare(struct task_struct *tsk, | ||
505 | const struct cred *cred, | ||
506 | struct audit_field *f, | ||
507 | struct audit_context *ctx, | ||
508 | struct audit_names *name) | ||
509 | { | ||
510 | switch (f->val) { | ||
511 | /* process to file object comparisons */ | ||
512 | case AUDIT_COMPARE_UID_TO_OBJ_UID: | ||
513 | return audit_compare_id(cred->uid, | ||
514 | name, offsetof(struct audit_names, uid), | ||
515 | f, ctx); | ||
516 | case AUDIT_COMPARE_GID_TO_OBJ_GID: | ||
517 | return audit_compare_id(cred->gid, | ||
518 | name, offsetof(struct audit_names, gid), | ||
519 | f, ctx); | ||
520 | case AUDIT_COMPARE_EUID_TO_OBJ_UID: | ||
521 | return audit_compare_id(cred->euid, | ||
522 | name, offsetof(struct audit_names, uid), | ||
523 | f, ctx); | ||
524 | case AUDIT_COMPARE_EGID_TO_OBJ_GID: | ||
525 | return audit_compare_id(cred->egid, | ||
526 | name, offsetof(struct audit_names, gid), | ||
527 | f, ctx); | ||
528 | case AUDIT_COMPARE_AUID_TO_OBJ_UID: | ||
529 | return audit_compare_id(tsk->loginuid, | ||
530 | name, offsetof(struct audit_names, uid), | ||
531 | f, ctx); | ||
532 | case AUDIT_COMPARE_SUID_TO_OBJ_UID: | ||
533 | return audit_compare_id(cred->suid, | ||
534 | name, offsetof(struct audit_names, uid), | ||
535 | f, ctx); | ||
536 | case AUDIT_COMPARE_SGID_TO_OBJ_GID: | ||
537 | return audit_compare_id(cred->sgid, | ||
538 | name, offsetof(struct audit_names, gid), | ||
539 | f, ctx); | ||
540 | case AUDIT_COMPARE_FSUID_TO_OBJ_UID: | ||
541 | return audit_compare_id(cred->fsuid, | ||
542 | name, offsetof(struct audit_names, uid), | ||
543 | f, ctx); | ||
544 | case AUDIT_COMPARE_FSGID_TO_OBJ_GID: | ||
545 | return audit_compare_id(cred->fsgid, | ||
546 | name, offsetof(struct audit_names, gid), | ||
547 | f, ctx); | ||
548 | /* uid comparisons */ | ||
549 | case AUDIT_COMPARE_UID_TO_AUID: | ||
550 | return audit_comparator(cred->uid, f->op, tsk->loginuid); | ||
551 | case AUDIT_COMPARE_UID_TO_EUID: | ||
552 | return audit_comparator(cred->uid, f->op, cred->euid); | ||
553 | case AUDIT_COMPARE_UID_TO_SUID: | ||
554 | return audit_comparator(cred->uid, f->op, cred->suid); | ||
555 | case AUDIT_COMPARE_UID_TO_FSUID: | ||
556 | return audit_comparator(cred->uid, f->op, cred->fsuid); | ||
557 | /* auid comparisons */ | ||
558 | case AUDIT_COMPARE_AUID_TO_EUID: | ||
559 | return audit_comparator(tsk->loginuid, f->op, cred->euid); | ||
560 | case AUDIT_COMPARE_AUID_TO_SUID: | ||
561 | return audit_comparator(tsk->loginuid, f->op, cred->suid); | ||
562 | case AUDIT_COMPARE_AUID_TO_FSUID: | ||
563 | return audit_comparator(tsk->loginuid, f->op, cred->fsuid); | ||
564 | /* euid comparisons */ | ||
565 | case AUDIT_COMPARE_EUID_TO_SUID: | ||
566 | return audit_comparator(cred->euid, f->op, cred->suid); | ||
567 | case AUDIT_COMPARE_EUID_TO_FSUID: | ||
568 | return audit_comparator(cred->euid, f->op, cred->fsuid); | ||
569 | /* suid comparisons */ | ||
570 | case AUDIT_COMPARE_SUID_TO_FSUID: | ||
571 | return audit_comparator(cred->suid, f->op, cred->fsuid); | ||
572 | /* gid comparisons */ | ||
573 | case AUDIT_COMPARE_GID_TO_EGID: | ||
574 | return audit_comparator(cred->gid, f->op, cred->egid); | ||
575 | case AUDIT_COMPARE_GID_TO_SGID: | ||
576 | return audit_comparator(cred->gid, f->op, cred->sgid); | ||
577 | case AUDIT_COMPARE_GID_TO_FSGID: | ||
578 | return audit_comparator(cred->gid, f->op, cred->fsgid); | ||
579 | /* egid comparisons */ | ||
580 | case AUDIT_COMPARE_EGID_TO_SGID: | ||
581 | return audit_comparator(cred->egid, f->op, cred->sgid); | ||
582 | case AUDIT_COMPARE_EGID_TO_FSGID: | ||
583 | return audit_comparator(cred->egid, f->op, cred->fsgid); | ||
584 | /* sgid comparison */ | ||
585 | case AUDIT_COMPARE_SGID_TO_FSGID: | ||
586 | return audit_comparator(cred->sgid, f->op, cred->fsgid); | ||
587 | default: | ||
588 | WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); | ||
589 | return 0; | ||
590 | } | ||
591 | return 0; | ||
592 | } | ||
593 | |||
444 | /* Determine if any context name data matches a rule's watch data */ | 594 | /* Determine if any context name data matches a rule's watch data */ |
445 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 | 595 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 |
446 | * otherwise. | 596 | * otherwise. |
@@ -457,13 +607,14 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
457 | bool task_creation) | 607 | bool task_creation) |
458 | { | 608 | { |
459 | const struct cred *cred; | 609 | const struct cred *cred; |
460 | int i, j, need_sid = 1; | 610 | int i, need_sid = 1; |
461 | u32 sid; | 611 | u32 sid; |
462 | 612 | ||
463 | cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); | 613 | cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); |
464 | 614 | ||
465 | for (i = 0; i < rule->field_count; i++) { | 615 | for (i = 0; i < rule->field_count; i++) { |
466 | struct audit_field *f = &rule->fields[i]; | 616 | struct audit_field *f = &rule->fields[i]; |
617 | struct audit_names *n; | ||
467 | int result = 0; | 618 | int result = 0; |
468 | 619 | ||
469 | switch (f->type) { | 620 | switch (f->type) { |
@@ -522,12 +673,14 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
522 | } | 673 | } |
523 | break; | 674 | break; |
524 | case AUDIT_DEVMAJOR: | 675 | case AUDIT_DEVMAJOR: |
525 | if (name) | 676 | if (name) { |
526 | result = audit_comparator(MAJOR(name->dev), | 677 | if (audit_comparator(MAJOR(name->dev), f->op, f->val) || |
527 | f->op, f->val); | 678 | audit_comparator(MAJOR(name->rdev), f->op, f->val)) |
528 | else if (ctx) { | 679 | ++result; |
529 | for (j = 0; j < ctx->name_count; j++) { | 680 | } else if (ctx) { |
530 | if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { | 681 | list_for_each_entry(n, &ctx->names_list, list) { |
682 | if (audit_comparator(MAJOR(n->dev), f->op, f->val) || | ||
683 | audit_comparator(MAJOR(n->rdev), f->op, f->val)) { | ||
531 | ++result; | 684 | ++result; |
532 | break; | 685 | break; |
533 | } | 686 | } |
@@ -535,12 +688,14 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
535 | } | 688 | } |
536 | break; | 689 | break; |
537 | case AUDIT_DEVMINOR: | 690 | case AUDIT_DEVMINOR: |
538 | if (name) | 691 | if (name) { |
539 | result = audit_comparator(MINOR(name->dev), | 692 | if (audit_comparator(MINOR(name->dev), f->op, f->val) || |
540 | f->op, f->val); | 693 | audit_comparator(MINOR(name->rdev), f->op, f->val)) |
541 | else if (ctx) { | 694 | ++result; |
542 | for (j = 0; j < ctx->name_count; j++) { | 695 | } else if (ctx) { |
543 | if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { | 696 | list_for_each_entry(n, &ctx->names_list, list) { |
697 | if (audit_comparator(MINOR(n->dev), f->op, f->val) || | ||
698 | audit_comparator(MINOR(n->rdev), f->op, f->val)) { | ||
544 | ++result; | 699 | ++result; |
545 | break; | 700 | break; |
546 | } | 701 | } |
@@ -551,8 +706,32 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
551 | if (name) | 706 | if (name) |
552 | result = (name->ino == f->val); | 707 | result = (name->ino == f->val); |
553 | else if (ctx) { | 708 | else if (ctx) { |
554 | for (j = 0; j < ctx->name_count; j++) { | 709 | list_for_each_entry(n, &ctx->names_list, list) { |
555 | if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { | 710 | if (audit_comparator(n->ino, f->op, f->val)) { |
711 | ++result; | ||
712 | break; | ||
713 | } | ||
714 | } | ||
715 | } | ||
716 | break; | ||
717 | case AUDIT_OBJ_UID: | ||
718 | if (name) { | ||
719 | result = audit_comparator(name->uid, f->op, f->val); | ||
720 | } else if (ctx) { | ||
721 | list_for_each_entry(n, &ctx->names_list, list) { | ||
722 | if (audit_comparator(n->uid, f->op, f->val)) { | ||
723 | ++result; | ||
724 | break; | ||
725 | } | ||
726 | } | ||
727 | } | ||
728 | break; | ||
729 | case AUDIT_OBJ_GID: | ||
730 | if (name) { | ||
731 | result = audit_comparator(name->gid, f->op, f->val); | ||
732 | } else if (ctx) { | ||
733 | list_for_each_entry(n, &ctx->names_list, list) { | ||
734 | if (audit_comparator(n->gid, f->op, f->val)) { | ||
556 | ++result; | 735 | ++result; |
557 | break; | 736 | break; |
558 | } | 737 | } |
@@ -607,11 +786,10 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
607 | name->osid, f->type, f->op, | 786 | name->osid, f->type, f->op, |
608 | f->lsm_rule, ctx); | 787 | f->lsm_rule, ctx); |
609 | } else if (ctx) { | 788 | } else if (ctx) { |
610 | for (j = 0; j < ctx->name_count; j++) { | 789 | list_for_each_entry(n, &ctx->names_list, list) { |
611 | if (security_audit_rule_match( | 790 | if (security_audit_rule_match(n->osid, f->type, |
612 | ctx->names[j].osid, | 791 | f->op, f->lsm_rule, |
613 | f->type, f->op, | 792 | ctx)) { |
614 | f->lsm_rule, ctx)) { | ||
615 | ++result; | 793 | ++result; |
616 | break; | 794 | break; |
617 | } | 795 | } |
@@ -643,8 +821,10 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
643 | case AUDIT_FILETYPE: | 821 | case AUDIT_FILETYPE: |
644 | result = audit_match_filetype(ctx, f->val); | 822 | result = audit_match_filetype(ctx, f->val); |
645 | break; | 823 | break; |
824 | case AUDIT_FIELD_COMPARE: | ||
825 | result = audit_field_compare(tsk, cred, f, ctx, name); | ||
826 | break; | ||
646 | } | 827 | } |
647 | |||
648 | if (!result) | 828 | if (!result) |
649 | return 0; | 829 | return 0; |
650 | } | 830 | } |
@@ -722,40 +902,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, | |||
722 | return AUDIT_BUILD_CONTEXT; | 902 | return AUDIT_BUILD_CONTEXT; |
723 | } | 903 | } |
724 | 904 | ||
725 | /* At syscall exit time, this filter is called if any audit_names[] have been | 905 | /* |
906 | * Given an audit_name check the inode hash table to see if they match. | ||
907 | * Called holding the rcu read lock to protect the use of audit_inode_hash | ||
908 | */ | ||
909 | static int audit_filter_inode_name(struct task_struct *tsk, | ||
910 | struct audit_names *n, | ||
911 | struct audit_context *ctx) { | ||
912 | int word, bit; | ||
913 | int h = audit_hash_ino((u32)n->ino); | ||
914 | struct list_head *list = &audit_inode_hash[h]; | ||
915 | struct audit_entry *e; | ||
916 | enum audit_state state; | ||
917 | |||
918 | word = AUDIT_WORD(ctx->major); | ||
919 | bit = AUDIT_BIT(ctx->major); | ||
920 | |||
921 | if (list_empty(list)) | ||
922 | return 0; | ||
923 | |||
924 | list_for_each_entry_rcu(e, list, list) { | ||
925 | if ((e->rule.mask[word] & bit) == bit && | ||
926 | audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { | ||
927 | ctx->current_state = state; | ||
928 | return 1; | ||
929 | } | ||
930 | } | ||
931 | |||
932 | return 0; | ||
933 | } | ||
934 | |||
935 | /* At syscall exit time, this filter is called if any audit_names have been | ||
726 | * collected during syscall processing. We only check rules in sublists at hash | 936 | * collected during syscall processing. We only check rules in sublists at hash |
727 | * buckets applicable to the inode numbers in audit_names[]. | 937 | * buckets applicable to the inode numbers in audit_names. |
728 | * Regarding audit_state, same rules apply as for audit_filter_syscall(). | 938 | * Regarding audit_state, same rules apply as for audit_filter_syscall(). |
729 | */ | 939 | */ |
730 | void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) | 940 | void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) |
731 | { | 941 | { |
732 | int i; | 942 | struct audit_names *n; |
733 | struct audit_entry *e; | ||
734 | enum audit_state state; | ||
735 | 943 | ||
736 | if (audit_pid && tsk->tgid == audit_pid) | 944 | if (audit_pid && tsk->tgid == audit_pid) |
737 | return; | 945 | return; |
738 | 946 | ||
739 | rcu_read_lock(); | 947 | rcu_read_lock(); |
740 | for (i = 0; i < ctx->name_count; i++) { | ||
741 | int word = AUDIT_WORD(ctx->major); | ||
742 | int bit = AUDIT_BIT(ctx->major); | ||
743 | struct audit_names *n = &ctx->names[i]; | ||
744 | int h = audit_hash_ino((u32)n->ino); | ||
745 | struct list_head *list = &audit_inode_hash[h]; | ||
746 | |||
747 | if (list_empty(list)) | ||
748 | continue; | ||
749 | 948 | ||
750 | list_for_each_entry_rcu(e, list, list) { | 949 | list_for_each_entry(n, &ctx->names_list, list) { |
751 | if ((e->rule.mask[word] & bit) == bit && | 950 | if (audit_filter_inode_name(tsk, n, ctx)) |
752 | audit_filter_rules(tsk, &e->rule, ctx, n, | 951 | break; |
753 | &state, false)) { | ||
754 | rcu_read_unlock(); | ||
755 | ctx->current_state = state; | ||
756 | return; | ||
757 | } | ||
758 | } | ||
759 | } | 952 | } |
760 | rcu_read_unlock(); | 953 | rcu_read_unlock(); |
761 | } | 954 | } |
@@ -766,7 +959,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, | |||
766 | { | 959 | { |
767 | struct audit_context *context = tsk->audit_context; | 960 | struct audit_context *context = tsk->audit_context; |
768 | 961 | ||
769 | if (likely(!context)) | 962 | if (!context) |
770 | return NULL; | 963 | return NULL; |
771 | context->return_valid = return_valid; | 964 | context->return_valid = return_valid; |
772 | 965 | ||
@@ -799,7 +992,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, | |||
799 | 992 | ||
800 | static inline void audit_free_names(struct audit_context *context) | 993 | static inline void audit_free_names(struct audit_context *context) |
801 | { | 994 | { |
802 | int i; | 995 | struct audit_names *n, *next; |
803 | 996 | ||
804 | #if AUDIT_DEBUG == 2 | 997 | #if AUDIT_DEBUG == 2 |
805 | if (context->put_count + context->ino_count != context->name_count) { | 998 | if (context->put_count + context->ino_count != context->name_count) { |
@@ -810,10 +1003,9 @@ static inline void audit_free_names(struct audit_context *context) | |||
810 | context->serial, context->major, context->in_syscall, | 1003 | context->serial, context->major, context->in_syscall, |
811 | context->name_count, context->put_count, | 1004 | context->name_count, context->put_count, |
812 | context->ino_count); | 1005 | context->ino_count); |
813 | for (i = 0; i < context->name_count; i++) { | 1006 | list_for_each_entry(n, &context->names_list, list) { |
814 | printk(KERN_ERR "names[%d] = %p = %s\n", i, | 1007 | printk(KERN_ERR "names[%d] = %p = %s\n", i, |
815 | context->names[i].name, | 1008 | n->name, n->name ?: "(null)"); |
816 | context->names[i].name ?: "(null)"); | ||
817 | } | 1009 | } |
818 | dump_stack(); | 1010 | dump_stack(); |
819 | return; | 1011 | return; |
@@ -824,9 +1016,12 @@ static inline void audit_free_names(struct audit_context *context) | |||
824 | context->ino_count = 0; | 1016 | context->ino_count = 0; |
825 | #endif | 1017 | #endif |
826 | 1018 | ||
827 | for (i = 0; i < context->name_count; i++) { | 1019 | list_for_each_entry_safe(n, next, &context->names_list, list) { |
828 | if (context->names[i].name && context->names[i].name_put) | 1020 | list_del(&n->list); |
829 | __putname(context->names[i].name); | 1021 | if (n->name && n->name_put) |
1022 | __putname(n->name); | ||
1023 | if (n->should_free) | ||
1024 | kfree(n); | ||
830 | } | 1025 | } |
831 | context->name_count = 0; | 1026 | context->name_count = 0; |
832 | path_put(&context->pwd); | 1027 | path_put(&context->pwd); |
@@ -864,6 +1059,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) | |||
864 | return NULL; | 1059 | return NULL; |
865 | audit_zero_context(context, state); | 1060 | audit_zero_context(context, state); |
866 | INIT_LIST_HEAD(&context->killed_trees); | 1061 | INIT_LIST_HEAD(&context->killed_trees); |
1062 | INIT_LIST_HEAD(&context->names_list); | ||
867 | return context; | 1063 | return context; |
868 | } | 1064 | } |
869 | 1065 | ||
@@ -886,7 +1082,7 @@ int audit_alloc(struct task_struct *tsk) | |||
886 | return 0; /* Return if not auditing. */ | 1082 | return 0; /* Return if not auditing. */ |
887 | 1083 | ||
888 | state = audit_filter_task(tsk, &key); | 1084 | state = audit_filter_task(tsk, &key); |
889 | if (likely(state == AUDIT_DISABLED)) | 1085 | if (state == AUDIT_DISABLED) |
890 | return 0; | 1086 | return 0; |
891 | 1087 | ||
892 | if (!(context = audit_alloc_context(state))) { | 1088 | if (!(context = audit_alloc_context(state))) { |
@@ -975,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk | |||
975 | while (vma) { | 1171 | while (vma) { |
976 | if ((vma->vm_flags & VM_EXECUTABLE) && | 1172 | if ((vma->vm_flags & VM_EXECUTABLE) && |
977 | vma->vm_file) { | 1173 | vma->vm_file) { |
978 | audit_log_d_path(ab, "exe=", | 1174 | audit_log_d_path(ab, " exe=", |
979 | &vma->vm_file->f_path); | 1175 | &vma->vm_file->f_path); |
980 | break; | 1176 | break; |
981 | } | 1177 | } |
@@ -1166,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context, | |||
1166 | struct audit_buffer **ab, | 1362 | struct audit_buffer **ab, |
1167 | struct audit_aux_data_execve *axi) | 1363 | struct audit_aux_data_execve *axi) |
1168 | { | 1364 | { |
1169 | int i; | 1365 | int i, len; |
1170 | size_t len, len_sent = 0; | 1366 | size_t len_sent = 0; |
1171 | const char __user *p; | 1367 | const char __user *p; |
1172 | char *buf; | 1368 | char *buf; |
1173 | 1369 | ||
@@ -1324,6 +1520,68 @@ static void show_special(struct audit_context *context, int *call_panic) | |||
1324 | audit_log_end(ab); | 1520 | audit_log_end(ab); |
1325 | } | 1521 | } |
1326 | 1522 | ||
1523 | static void audit_log_name(struct audit_context *context, struct audit_names *n, | ||
1524 | int record_num, int *call_panic) | ||
1525 | { | ||
1526 | struct audit_buffer *ab; | ||
1527 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); | ||
1528 | if (!ab) | ||
1529 | return; /* audit_panic has been called */ | ||
1530 | |||
1531 | audit_log_format(ab, "item=%d", record_num); | ||
1532 | |||
1533 | if (n->name) { | ||
1534 | switch (n->name_len) { | ||
1535 | case AUDIT_NAME_FULL: | ||
1536 | /* log the full path */ | ||
1537 | audit_log_format(ab, " name="); | ||
1538 | audit_log_untrustedstring(ab, n->name); | ||
1539 | break; | ||
1540 | case 0: | ||
1541 | /* name was specified as a relative path and the | ||
1542 | * directory component is the cwd */ | ||
1543 | audit_log_d_path(ab, " name=", &context->pwd); | ||
1544 | break; | ||
1545 | default: | ||
1546 | /* log the name's directory component */ | ||
1547 | audit_log_format(ab, " name="); | ||
1548 | audit_log_n_untrustedstring(ab, n->name, | ||
1549 | n->name_len); | ||
1550 | } | ||
1551 | } else | ||
1552 | audit_log_format(ab, " name=(null)"); | ||
1553 | |||
1554 | if (n->ino != (unsigned long)-1) { | ||
1555 | audit_log_format(ab, " inode=%lu" | ||
1556 | " dev=%02x:%02x mode=%#ho" | ||
1557 | " ouid=%u ogid=%u rdev=%02x:%02x", | ||
1558 | n->ino, | ||
1559 | MAJOR(n->dev), | ||
1560 | MINOR(n->dev), | ||
1561 | n->mode, | ||
1562 | n->uid, | ||
1563 | n->gid, | ||
1564 | MAJOR(n->rdev), | ||
1565 | MINOR(n->rdev)); | ||
1566 | } | ||
1567 | if (n->osid != 0) { | ||
1568 | char *ctx = NULL; | ||
1569 | u32 len; | ||
1570 | if (security_secid_to_secctx( | ||
1571 | n->osid, &ctx, &len)) { | ||
1572 | audit_log_format(ab, " osid=%u", n->osid); | ||
1573 | *call_panic = 2; | ||
1574 | } else { | ||
1575 | audit_log_format(ab, " obj=%s", ctx); | ||
1576 | security_release_secctx(ctx, len); | ||
1577 | } | ||
1578 | } | ||
1579 | |||
1580 | audit_log_fcaps(ab, n); | ||
1581 | |||
1582 | audit_log_end(ab); | ||
1583 | } | ||
1584 | |||
1327 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) | 1585 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) |
1328 | { | 1586 | { |
1329 | const struct cred *cred; | 1587 | const struct cred *cred; |
@@ -1331,6 +1589,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
1331 | struct audit_buffer *ab; | 1589 | struct audit_buffer *ab; |
1332 | struct audit_aux_data *aux; | 1590 | struct audit_aux_data *aux; |
1333 | const char *tty; | 1591 | const char *tty; |
1592 | struct audit_names *n; | ||
1334 | 1593 | ||
1335 | /* tsk == current */ | 1594 | /* tsk == current */ |
1336 | context->pid = tsk->pid; | 1595 | context->pid = tsk->pid; |
@@ -1466,70 +1725,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
1466 | if (context->pwd.dentry && context->pwd.mnt) { | 1725 | if (context->pwd.dentry && context->pwd.mnt) { |
1467 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); | 1726 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); |
1468 | if (ab) { | 1727 | if (ab) { |
1469 | audit_log_d_path(ab, "cwd=", &context->pwd); | 1728 | audit_log_d_path(ab, " cwd=", &context->pwd); |
1470 | audit_log_end(ab); | 1729 | audit_log_end(ab); |
1471 | } | 1730 | } |
1472 | } | 1731 | } |
1473 | for (i = 0; i < context->name_count; i++) { | ||
1474 | struct audit_names *n = &context->names[i]; | ||
1475 | 1732 | ||
1476 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); | 1733 | i = 0; |
1477 | if (!ab) | 1734 | list_for_each_entry(n, &context->names_list, list) |
1478 | continue; /* audit_panic has been called */ | 1735 | audit_log_name(context, n, i++, &call_panic); |
1479 | |||
1480 | audit_log_format(ab, "item=%d", i); | ||
1481 | |||
1482 | if (n->name) { | ||
1483 | switch(n->name_len) { | ||
1484 | case AUDIT_NAME_FULL: | ||
1485 | /* log the full path */ | ||
1486 | audit_log_format(ab, " name="); | ||
1487 | audit_log_untrustedstring(ab, n->name); | ||
1488 | break; | ||
1489 | case 0: | ||
1490 | /* name was specified as a relative path and the | ||
1491 | * directory component is the cwd */ | ||
1492 | audit_log_d_path(ab, "name=", &context->pwd); | ||
1493 | break; | ||
1494 | default: | ||
1495 | /* log the name's directory component */ | ||
1496 | audit_log_format(ab, " name="); | ||
1497 | audit_log_n_untrustedstring(ab, n->name, | ||
1498 | n->name_len); | ||
1499 | } | ||
1500 | } else | ||
1501 | audit_log_format(ab, " name=(null)"); | ||
1502 | |||
1503 | if (n->ino != (unsigned long)-1) { | ||
1504 | audit_log_format(ab, " inode=%lu" | ||
1505 | " dev=%02x:%02x mode=%#ho" | ||
1506 | " ouid=%u ogid=%u rdev=%02x:%02x", | ||
1507 | n->ino, | ||
1508 | MAJOR(n->dev), | ||
1509 | MINOR(n->dev), | ||
1510 | n->mode, | ||
1511 | n->uid, | ||
1512 | n->gid, | ||
1513 | MAJOR(n->rdev), | ||
1514 | MINOR(n->rdev)); | ||
1515 | } | ||
1516 | if (n->osid != 0) { | ||
1517 | char *ctx = NULL; | ||
1518 | u32 len; | ||
1519 | if (security_secid_to_secctx( | ||
1520 | n->osid, &ctx, &len)) { | ||
1521 | audit_log_format(ab, " osid=%u", n->osid); | ||
1522 | call_panic = 2; | ||
1523 | } else { | ||
1524 | audit_log_format(ab, " obj=%s", ctx); | ||
1525 | security_release_secctx(ctx, len); | ||
1526 | } | ||
1527 | } | ||
1528 | |||
1529 | audit_log_fcaps(ab, n); | ||
1530 | |||
1531 | audit_log_end(ab); | ||
1532 | } | ||
1533 | 1736 | ||
1534 | /* Send end of event record to help user space know we are finished */ | 1737 | /* Send end of event record to help user space know we are finished */ |
1535 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); | 1738 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); |
@@ -1545,12 +1748,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
1545 | * | 1748 | * |
1546 | * Called from copy_process and do_exit | 1749 | * Called from copy_process and do_exit |
1547 | */ | 1750 | */ |
1548 | void audit_free(struct task_struct *tsk) | 1751 | void __audit_free(struct task_struct *tsk) |
1549 | { | 1752 | { |
1550 | struct audit_context *context; | 1753 | struct audit_context *context; |
1551 | 1754 | ||
1552 | context = audit_get_context(tsk, 0, 0); | 1755 | context = audit_get_context(tsk, 0, 0); |
1553 | if (likely(!context)) | 1756 | if (!context) |
1554 | return; | 1757 | return; |
1555 | 1758 | ||
1556 | /* Check for system calls that do not go through the exit | 1759 | /* Check for system calls that do not go through the exit |
@@ -1583,7 +1786,7 @@ void audit_free(struct task_struct *tsk) | |||
1583 | * will only be written if another part of the kernel requests that it | 1786 | * will only be written if another part of the kernel requests that it |
1584 | * be written). | 1787 | * be written). |
1585 | */ | 1788 | */ |
1586 | void audit_syscall_entry(int arch, int major, | 1789 | void __audit_syscall_entry(int arch, int major, |
1587 | unsigned long a1, unsigned long a2, | 1790 | unsigned long a1, unsigned long a2, |
1588 | unsigned long a3, unsigned long a4) | 1791 | unsigned long a3, unsigned long a4) |
1589 | { | 1792 | { |
@@ -1591,7 +1794,7 @@ void audit_syscall_entry(int arch, int major, | |||
1591 | struct audit_context *context = tsk->audit_context; | 1794 | struct audit_context *context = tsk->audit_context; |
1592 | enum audit_state state; | 1795 | enum audit_state state; |
1593 | 1796 | ||
1594 | if (unlikely(!context)) | 1797 | if (!context) |
1595 | return; | 1798 | return; |
1596 | 1799 | ||
1597 | /* | 1800 | /* |
@@ -1648,7 +1851,7 @@ void audit_syscall_entry(int arch, int major, | |||
1648 | context->prio = 0; | 1851 | context->prio = 0; |
1649 | state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); | 1852 | state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); |
1650 | } | 1853 | } |
1651 | if (likely(state == AUDIT_DISABLED)) | 1854 | if (state == AUDIT_DISABLED) |
1652 | return; | 1855 | return; |
1653 | 1856 | ||
1654 | context->serial = 0; | 1857 | context->serial = 0; |
@@ -1658,30 +1861,9 @@ void audit_syscall_entry(int arch, int major, | |||
1658 | context->ppid = 0; | 1861 | context->ppid = 0; |
1659 | } | 1862 | } |
1660 | 1863 | ||
1661 | void audit_finish_fork(struct task_struct *child) | ||
1662 | { | ||
1663 | struct audit_context *ctx = current->audit_context; | ||
1664 | struct audit_context *p = child->audit_context; | ||
1665 | if (!p || !ctx) | ||
1666 | return; | ||
1667 | if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT) | ||
1668 | return; | ||
1669 | p->arch = ctx->arch; | ||
1670 | p->major = ctx->major; | ||
1671 | memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); | ||
1672 | p->ctime = ctx->ctime; | ||
1673 | p->dummy = ctx->dummy; | ||
1674 | p->in_syscall = ctx->in_syscall; | ||
1675 | p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); | ||
1676 | p->ppid = current->pid; | ||
1677 | p->prio = ctx->prio; | ||
1678 | p->current_state = ctx->current_state; | ||
1679 | } | ||
1680 | |||
1681 | /** | 1864 | /** |
1682 | * audit_syscall_exit - deallocate audit context after a system call | 1865 | * audit_syscall_exit - deallocate audit context after a system call |
1683 | * @valid: success/failure flag | 1866 | * @pt_regs: syscall registers |
1684 | * @return_code: syscall return value | ||
1685 | * | 1867 | * |
1686 | * Tear down after system call. If the audit context has been marked as | 1868 | * Tear down after system call. If the audit context has been marked as |
1687 | * auditable (either because of the AUDIT_RECORD_CONTEXT state from | 1869 | * auditable (either because of the AUDIT_RECORD_CONTEXT state from |
@@ -1689,14 +1871,18 @@ void audit_finish_fork(struct task_struct *child) | |||
1689 | * message), then write out the syscall information. In call cases, | 1871 | * message), then write out the syscall information. In call cases, |
1690 | * free the names stored from getname(). | 1872 | * free the names stored from getname(). |
1691 | */ | 1873 | */ |
1692 | void audit_syscall_exit(int valid, long return_code) | 1874 | void __audit_syscall_exit(int success, long return_code) |
1693 | { | 1875 | { |
1694 | struct task_struct *tsk = current; | 1876 | struct task_struct *tsk = current; |
1695 | struct audit_context *context; | 1877 | struct audit_context *context; |
1696 | 1878 | ||
1697 | context = audit_get_context(tsk, valid, return_code); | 1879 | if (success) |
1880 | success = AUDITSC_SUCCESS; | ||
1881 | else | ||
1882 | success = AUDITSC_FAILURE; | ||
1698 | 1883 | ||
1699 | if (likely(!context)) | 1884 | context = audit_get_context(tsk, success, return_code); |
1885 | if (!context) | ||
1700 | return; | 1886 | return; |
1701 | 1887 | ||
1702 | if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) | 1888 | if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) |
@@ -1821,6 +2007,30 @@ retry: | |||
1821 | #endif | 2007 | #endif |
1822 | } | 2008 | } |
1823 | 2009 | ||
2010 | static struct audit_names *audit_alloc_name(struct audit_context *context) | ||
2011 | { | ||
2012 | struct audit_names *aname; | ||
2013 | |||
2014 | if (context->name_count < AUDIT_NAMES) { | ||
2015 | aname = &context->preallocated_names[context->name_count]; | ||
2016 | memset(aname, 0, sizeof(*aname)); | ||
2017 | } else { | ||
2018 | aname = kzalloc(sizeof(*aname), GFP_NOFS); | ||
2019 | if (!aname) | ||
2020 | return NULL; | ||
2021 | aname->should_free = true; | ||
2022 | } | ||
2023 | |||
2024 | aname->ino = (unsigned long)-1; | ||
2025 | list_add_tail(&aname->list, &context->names_list); | ||
2026 | |||
2027 | context->name_count++; | ||
2028 | #if AUDIT_DEBUG | ||
2029 | context->ino_count++; | ||
2030 | #endif | ||
2031 | return aname; | ||
2032 | } | ||
2033 | |||
1824 | /** | 2034 | /** |
1825 | * audit_getname - add a name to the list | 2035 | * audit_getname - add a name to the list |
1826 | * @name: name to add | 2036 | * @name: name to add |
@@ -1831,9 +2041,7 @@ retry: | |||
1831 | void __audit_getname(const char *name) | 2041 | void __audit_getname(const char *name) |
1832 | { | 2042 | { |
1833 | struct audit_context *context = current->audit_context; | 2043 | struct audit_context *context = current->audit_context; |
1834 | 2044 | struct audit_names *n; | |
1835 | if (IS_ERR(name) || !name) | ||
1836 | return; | ||
1837 | 2045 | ||
1838 | if (!context->in_syscall) { | 2046 | if (!context->in_syscall) { |
1839 | #if AUDIT_DEBUG == 2 | 2047 | #if AUDIT_DEBUG == 2 |
@@ -1843,13 +2051,15 @@ void __audit_getname(const char *name) | |||
1843 | #endif | 2051 | #endif |
1844 | return; | 2052 | return; |
1845 | } | 2053 | } |
1846 | BUG_ON(context->name_count >= AUDIT_NAMES); | 2054 | |
1847 | context->names[context->name_count].name = name; | 2055 | n = audit_alloc_name(context); |
1848 | context->names[context->name_count].name_len = AUDIT_NAME_FULL; | 2056 | if (!n) |
1849 | context->names[context->name_count].name_put = 1; | 2057 | return; |
1850 | context->names[context->name_count].ino = (unsigned long)-1; | 2058 | |
1851 | context->names[context->name_count].osid = 0; | 2059 | n->name = name; |
1852 | ++context->name_count; | 2060 | n->name_len = AUDIT_NAME_FULL; |
2061 | n->name_put = true; | ||
2062 | |||
1853 | if (!context->pwd.dentry) | 2063 | if (!context->pwd.dentry) |
1854 | get_fs_pwd(current->fs, &context->pwd); | 2064 | get_fs_pwd(current->fs, &context->pwd); |
1855 | } | 2065 | } |
@@ -1871,12 +2081,13 @@ void audit_putname(const char *name) | |||
1871 | printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", | 2081 | printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", |
1872 | __FILE__, __LINE__, context->serial, name); | 2082 | __FILE__, __LINE__, context->serial, name); |
1873 | if (context->name_count) { | 2083 | if (context->name_count) { |
2084 | struct audit_names *n; | ||
1874 | int i; | 2085 | int i; |
1875 | for (i = 0; i < context->name_count; i++) | 2086 | |
2087 | list_for_each_entry(n, &context->names_list, list) | ||
1876 | printk(KERN_ERR "name[%d] = %p = %s\n", i, | 2088 | printk(KERN_ERR "name[%d] = %p = %s\n", i, |
1877 | context->names[i].name, | 2089 | n->name, n->name ?: "(null)"); |
1878 | context->names[i].name ?: "(null)"); | 2090 | } |
1879 | } | ||
1880 | #endif | 2091 | #endif |
1881 | __putname(name); | 2092 | __putname(name); |
1882 | } | 2093 | } |
@@ -1897,39 +2108,11 @@ void audit_putname(const char *name) | |||
1897 | #endif | 2108 | #endif |
1898 | } | 2109 | } |
1899 | 2110 | ||
1900 | static int audit_inc_name_count(struct audit_context *context, | ||
1901 | const struct inode *inode) | ||
1902 | { | ||
1903 | if (context->name_count >= AUDIT_NAMES) { | ||
1904 | if (inode) | ||
1905 | printk(KERN_DEBUG "audit: name_count maxed, losing inode data: " | ||
1906 | "dev=%02x:%02x, inode=%lu\n", | ||
1907 | MAJOR(inode->i_sb->s_dev), | ||
1908 | MINOR(inode->i_sb->s_dev), | ||
1909 | inode->i_ino); | ||
1910 | |||
1911 | else | ||
1912 | printk(KERN_DEBUG "name_count maxed, losing inode data\n"); | ||
1913 | return 1; | ||
1914 | } | ||
1915 | context->name_count++; | ||
1916 | #if AUDIT_DEBUG | ||
1917 | context->ino_count++; | ||
1918 | #endif | ||
1919 | return 0; | ||
1920 | } | ||
1921 | |||
1922 | |||
1923 | static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) | 2111 | static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) |
1924 | { | 2112 | { |
1925 | struct cpu_vfs_cap_data caps; | 2113 | struct cpu_vfs_cap_data caps; |
1926 | int rc; | 2114 | int rc; |
1927 | 2115 | ||
1928 | memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t)); | ||
1929 | memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t)); | ||
1930 | name->fcap.fE = 0; | ||
1931 | name->fcap_ver = 0; | ||
1932 | |||
1933 | if (!dentry) | 2116 | if (!dentry) |
1934 | return 0; | 2117 | return 0; |
1935 | 2118 | ||
@@ -1969,30 +2152,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent | |||
1969 | */ | 2152 | */ |
1970 | void __audit_inode(const char *name, const struct dentry *dentry) | 2153 | void __audit_inode(const char *name, const struct dentry *dentry) |
1971 | { | 2154 | { |
1972 | int idx; | ||
1973 | struct audit_context *context = current->audit_context; | 2155 | struct audit_context *context = current->audit_context; |
1974 | const struct inode *inode = dentry->d_inode; | 2156 | const struct inode *inode = dentry->d_inode; |
2157 | struct audit_names *n; | ||
1975 | 2158 | ||
1976 | if (!context->in_syscall) | 2159 | if (!context->in_syscall) |
1977 | return; | 2160 | return; |
1978 | if (context->name_count | 2161 | |
1979 | && context->names[context->name_count-1].name | 2162 | list_for_each_entry_reverse(n, &context->names_list, list) { |
1980 | && context->names[context->name_count-1].name == name) | 2163 | if (n->name && (n->name == name)) |
1981 | idx = context->name_count - 1; | 2164 | goto out; |
1982 | else if (context->name_count > 1 | ||
1983 | && context->names[context->name_count-2].name | ||
1984 | && context->names[context->name_count-2].name == name) | ||
1985 | idx = context->name_count - 2; | ||
1986 | else { | ||
1987 | /* FIXME: how much do we care about inodes that have no | ||
1988 | * associated name? */ | ||
1989 | if (audit_inc_name_count(context, inode)) | ||
1990 | return; | ||
1991 | idx = context->name_count - 1; | ||
1992 | context->names[idx].name = NULL; | ||
1993 | } | 2165 | } |
2166 | |||
2167 | /* unable to find the name from a previous getname() */ | ||
2168 | n = audit_alloc_name(context); | ||
2169 | if (!n) | ||
2170 | return; | ||
2171 | out: | ||
1994 | handle_path(dentry); | 2172 | handle_path(dentry); |
1995 | audit_copy_inode(&context->names[idx], dentry, inode); | 2173 | audit_copy_inode(n, dentry, inode); |
1996 | } | 2174 | } |
1997 | 2175 | ||
1998 | /** | 2176 | /** |
@@ -2011,11 +2189,11 @@ void __audit_inode(const char *name, const struct dentry *dentry) | |||
2011 | void __audit_inode_child(const struct dentry *dentry, | 2189 | void __audit_inode_child(const struct dentry *dentry, |
2012 | const struct inode *parent) | 2190 | const struct inode *parent) |
2013 | { | 2191 | { |
2014 | int idx; | ||
2015 | struct audit_context *context = current->audit_context; | 2192 | struct audit_context *context = current->audit_context; |
2016 | const char *found_parent = NULL, *found_child = NULL; | 2193 | const char *found_parent = NULL, *found_child = NULL; |
2017 | const struct inode *inode = dentry->d_inode; | 2194 | const struct inode *inode = dentry->d_inode; |
2018 | const char *dname = dentry->d_name.name; | 2195 | const char *dname = dentry->d_name.name; |
2196 | struct audit_names *n; | ||
2019 | int dirlen = 0; | 2197 | int dirlen = 0; |
2020 | 2198 | ||
2021 | if (!context->in_syscall) | 2199 | if (!context->in_syscall) |
@@ -2025,9 +2203,7 @@ void __audit_inode_child(const struct dentry *dentry, | |||
2025 | handle_one(inode); | 2203 | handle_one(inode); |
2026 | 2204 | ||
2027 | /* parent is more likely, look for it first */ | 2205 | /* parent is more likely, look for it first */ |
2028 | for (idx = 0; idx < context->name_count; idx++) { | 2206 | list_for_each_entry(n, &context->names_list, list) { |
2029 | struct audit_names *n = &context->names[idx]; | ||
2030 | |||
2031 | if (!n->name) | 2207 | if (!n->name) |
2032 | continue; | 2208 | continue; |
2033 | 2209 | ||
@@ -2040,9 +2216,7 @@ void __audit_inode_child(const struct dentry *dentry, | |||
2040 | } | 2216 | } |
2041 | 2217 | ||
2042 | /* no matching parent, look for matching child */ | 2218 | /* no matching parent, look for matching child */ |
2043 | for (idx = 0; idx < context->name_count; idx++) { | 2219 | list_for_each_entry(n, &context->names_list, list) { |
2044 | struct audit_names *n = &context->names[idx]; | ||
2045 | |||
2046 | if (!n->name) | 2220 | if (!n->name) |
2047 | continue; | 2221 | continue; |
2048 | 2222 | ||
@@ -2060,34 +2234,29 @@ void __audit_inode_child(const struct dentry *dentry, | |||
2060 | 2234 | ||
2061 | add_names: | 2235 | add_names: |
2062 | if (!found_parent) { | 2236 | if (!found_parent) { |
2063 | if (audit_inc_name_count(context, parent)) | 2237 | n = audit_alloc_name(context); |
2238 | if (!n) | ||
2064 | return; | 2239 | return; |
2065 | idx = context->name_count - 1; | 2240 | audit_copy_inode(n, NULL, parent); |
2066 | context->names[idx].name = NULL; | ||
2067 | audit_copy_inode(&context->names[idx], NULL, parent); | ||
2068 | } | 2241 | } |
2069 | 2242 | ||
2070 | if (!found_child) { | 2243 | if (!found_child) { |
2071 | if (audit_inc_name_count(context, inode)) | 2244 | n = audit_alloc_name(context); |
2245 | if (!n) | ||
2072 | return; | 2246 | return; |
2073 | idx = context->name_count - 1; | ||
2074 | 2247 | ||
2075 | /* Re-use the name belonging to the slot for a matching parent | 2248 | /* Re-use the name belonging to the slot for a matching parent |
2076 | * directory. All names for this context are relinquished in | 2249 | * directory. All names for this context are relinquished in |
2077 | * audit_free_names() */ | 2250 | * audit_free_names() */ |
2078 | if (found_parent) { | 2251 | if (found_parent) { |
2079 | context->names[idx].name = found_parent; | 2252 | n->name = found_parent; |
2080 | context->names[idx].name_len = AUDIT_NAME_FULL; | 2253 | n->name_len = AUDIT_NAME_FULL; |
2081 | /* don't call __putname() */ | 2254 | /* don't call __putname() */ |
2082 | context->names[idx].name_put = 0; | 2255 | n->name_put = false; |
2083 | } else { | ||
2084 | context->names[idx].name = NULL; | ||
2085 | } | 2256 | } |
2086 | 2257 | ||
2087 | if (inode) | 2258 | if (inode) |
2088 | audit_copy_inode(&context->names[idx], NULL, inode); | 2259 | audit_copy_inode(n, NULL, inode); |
2089 | else | ||
2090 | context->names[idx].ino = (unsigned long)-1; | ||
2091 | } | 2260 | } |
2092 | } | 2261 | } |
2093 | EXPORT_SYMBOL_GPL(__audit_inode_child); | 2262 | EXPORT_SYMBOL_GPL(__audit_inode_child); |
@@ -2121,19 +2290,28 @@ int auditsc_get_stamp(struct audit_context *ctx, | |||
2121 | static atomic_t session_id = ATOMIC_INIT(0); | 2290 | static atomic_t session_id = ATOMIC_INIT(0); |
2122 | 2291 | ||
2123 | /** | 2292 | /** |
2124 | * audit_set_loginuid - set a task's audit_context loginuid | 2293 | * audit_set_loginuid - set current task's audit_context loginuid |
2125 | * @task: task whose audit context is being modified | ||
2126 | * @loginuid: loginuid value | 2294 | * @loginuid: loginuid value |
2127 | * | 2295 | * |
2128 | * Returns 0. | 2296 | * Returns 0. |
2129 | * | 2297 | * |
2130 | * Called (set) from fs/proc/base.c::proc_loginuid_write(). | 2298 | * Called (set) from fs/proc/base.c::proc_loginuid_write(). |
2131 | */ | 2299 | */ |
2132 | int audit_set_loginuid(struct task_struct *task, uid_t loginuid) | 2300 | int audit_set_loginuid(uid_t loginuid) |
2133 | { | 2301 | { |
2134 | unsigned int sessionid = atomic_inc_return(&session_id); | 2302 | struct task_struct *task = current; |
2135 | struct audit_context *context = task->audit_context; | 2303 | struct audit_context *context = task->audit_context; |
2304 | unsigned int sessionid; | ||
2305 | |||
2306 | #ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE | ||
2307 | if (task->loginuid != -1) | ||
2308 | return -EPERM; | ||
2309 | #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ | ||
2310 | if (!capable(CAP_AUDIT_CONTROL)) | ||
2311 | return -EPERM; | ||
2312 | #endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ | ||
2136 | 2313 | ||
2314 | sessionid = atomic_inc_return(&session_id); | ||
2137 | if (context && context->in_syscall) { | 2315 | if (context && context->in_syscall) { |
2138 | struct audit_buffer *ab; | 2316 | struct audit_buffer *ab; |
2139 | 2317 | ||
@@ -2271,14 +2449,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo | |||
2271 | context->ipc.has_perm = 1; | 2449 | context->ipc.has_perm = 1; |
2272 | } | 2450 | } |
2273 | 2451 | ||
2274 | int audit_bprm(struct linux_binprm *bprm) | 2452 | int __audit_bprm(struct linux_binprm *bprm) |
2275 | { | 2453 | { |
2276 | struct audit_aux_data_execve *ax; | 2454 | struct audit_aux_data_execve *ax; |
2277 | struct audit_context *context = current->audit_context; | 2455 | struct audit_context *context = current->audit_context; |
2278 | 2456 | ||
2279 | if (likely(!audit_enabled || !context || context->dummy)) | ||
2280 | return 0; | ||
2281 | |||
2282 | ax = kmalloc(sizeof(*ax), GFP_KERNEL); | 2457 | ax = kmalloc(sizeof(*ax), GFP_KERNEL); |
2283 | if (!ax) | 2458 | if (!ax) |
2284 | return -ENOMEM; | 2459 | return -ENOMEM; |
@@ -2299,13 +2474,10 @@ int audit_bprm(struct linux_binprm *bprm) | |||
2299 | * @args: args array | 2474 | * @args: args array |
2300 | * | 2475 | * |
2301 | */ | 2476 | */ |
2302 | void audit_socketcall(int nargs, unsigned long *args) | 2477 | void __audit_socketcall(int nargs, unsigned long *args) |
2303 | { | 2478 | { |
2304 | struct audit_context *context = current->audit_context; | 2479 | struct audit_context *context = current->audit_context; |
2305 | 2480 | ||
2306 | if (likely(!context || context->dummy)) | ||
2307 | return; | ||
2308 | |||
2309 | context->type = AUDIT_SOCKETCALL; | 2481 | context->type = AUDIT_SOCKETCALL; |
2310 | context->socketcall.nargs = nargs; | 2482 | context->socketcall.nargs = nargs; |
2311 | memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); | 2483 | memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); |
@@ -2331,13 +2503,10 @@ void __audit_fd_pair(int fd1, int fd2) | |||
2331 | * | 2503 | * |
2332 | * Returns 0 for success or NULL context or < 0 on error. | 2504 | * Returns 0 for success or NULL context or < 0 on error. |
2333 | */ | 2505 | */ |
2334 | int audit_sockaddr(int len, void *a) | 2506 | int __audit_sockaddr(int len, void *a) |
2335 | { | 2507 | { |
2336 | struct audit_context *context = current->audit_context; | 2508 | struct audit_context *context = current->audit_context; |
2337 | 2509 | ||
2338 | if (likely(!context || context->dummy)) | ||
2339 | return 0; | ||
2340 | |||
2341 | if (!context->sockaddr) { | 2510 | if (!context->sockaddr) { |
2342 | void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); | 2511 | void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); |
2343 | if (!p) | 2512 | if (!p) |
@@ -2499,6 +2668,25 @@ void __audit_mmap_fd(int fd, int flags) | |||
2499 | context->type = AUDIT_MMAP; | 2668 | context->type = AUDIT_MMAP; |
2500 | } | 2669 | } |
2501 | 2670 | ||
2671 | static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) | ||
2672 | { | ||
2673 | uid_t auid, uid; | ||
2674 | gid_t gid; | ||
2675 | unsigned int sessionid; | ||
2676 | |||
2677 | auid = audit_get_loginuid(current); | ||
2678 | sessionid = audit_get_sessionid(current); | ||
2679 | current_uid_gid(&uid, &gid); | ||
2680 | |||
2681 | audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", | ||
2682 | auid, uid, gid, sessionid); | ||
2683 | audit_log_task_context(ab); | ||
2684 | audit_log_format(ab, " pid=%d comm=", current->pid); | ||
2685 | audit_log_untrustedstring(ab, current->comm); | ||
2686 | audit_log_format(ab, " reason="); | ||
2687 | audit_log_string(ab, reason); | ||
2688 | audit_log_format(ab, " sig=%ld", signr); | ||
2689 | } | ||
2502 | /** | 2690 | /** |
2503 | * audit_core_dumps - record information about processes that end abnormally | 2691 | * audit_core_dumps - record information about processes that end abnormally |
2504 | * @signr: signal value | 2692 | * @signr: signal value |
@@ -2509,10 +2697,6 @@ void __audit_mmap_fd(int fd, int flags) | |||
2509 | void audit_core_dumps(long signr) | 2697 | void audit_core_dumps(long signr) |
2510 | { | 2698 | { |
2511 | struct audit_buffer *ab; | 2699 | struct audit_buffer *ab; |
2512 | u32 sid; | ||
2513 | uid_t auid = audit_get_loginuid(current), uid; | ||
2514 | gid_t gid; | ||
2515 | unsigned int sessionid = audit_get_sessionid(current); | ||
2516 | 2700 | ||
2517 | if (!audit_enabled) | 2701 | if (!audit_enabled) |
2518 | return; | 2702 | return; |
@@ -2521,24 +2705,17 @@ void audit_core_dumps(long signr) | |||
2521 | return; | 2705 | return; |
2522 | 2706 | ||
2523 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); | 2707 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); |
2524 | current_uid_gid(&uid, &gid); | 2708 | audit_log_abend(ab, "memory violation", signr); |
2525 | audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", | 2709 | audit_log_end(ab); |
2526 | auid, uid, gid, sessionid); | 2710 | } |
2527 | security_task_getsecid(current, &sid); | ||
2528 | if (sid) { | ||
2529 | char *ctx = NULL; | ||
2530 | u32 len; | ||
2531 | 2711 | ||
2532 | if (security_secid_to_secctx(sid, &ctx, &len)) | 2712 | void __audit_seccomp(unsigned long syscall) |
2533 | audit_log_format(ab, " ssid=%u", sid); | 2713 | { |
2534 | else { | 2714 | struct audit_buffer *ab; |
2535 | audit_log_format(ab, " subj=%s", ctx); | 2715 | |
2536 | security_release_secctx(ctx, len); | 2716 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); |
2537 | } | 2717 | audit_log_abend(ab, "seccomp", SIGKILL); |
2538 | } | 2718 | audit_log_format(ab, " syscall=%ld", syscall); |
2539 | audit_log_format(ab, " pid=%d comm=", current->pid); | ||
2540 | audit_log_untrustedstring(ab, current->comm); | ||
2541 | audit_log_format(ab, " sig=%ld", signr); | ||
2542 | audit_log_end(ab); | 2719 | audit_log_end(ab); |
2543 | } | 2720 | } |
2544 | 2721 | ||
diff --git a/kernel/capability.c b/kernel/capability.c index 0fcf1c14a297..3f1adb6c6470 100644 --- a/kernel/capability.c +++ b/kernel/capability.c | |||
@@ -384,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap) | |||
384 | BUG(); | 384 | BUG(); |
385 | } | 385 | } |
386 | 386 | ||
387 | if (has_ns_capability(current, ns, cap)) { | 387 | if (security_capable(current_cred(), ns, cap) == 0) { |
388 | current->flags |= PF_SUPERPRIV; | 388 | current->flags |= PF_SUPERPRIV; |
389 | return true; | 389 | return true; |
390 | } | 390 | } |
diff --git a/kernel/exit.c b/kernel/exit.c index c44738267be7..294b1709170d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -964,8 +964,7 @@ void do_exit(long code) | |||
964 | acct_collect(code, group_dead); | 964 | acct_collect(code, group_dead); |
965 | if (group_dead) | 965 | if (group_dead) |
966 | tty_audit_exit(); | 966 | tty_audit_exit(); |
967 | if (unlikely(tsk->audit_context)) | 967 | audit_free(tsk); |
968 | audit_free(tsk); | ||
969 | 968 | ||
970 | tsk->exit_code = code; | 969 | tsk->exit_code = code; |
971 | taskstats_exit(tsk, group_dead); | 970 | taskstats_exit(tsk, group_dead); |
diff --git a/kernel/fork.c b/kernel/fork.c index f3fa18887cc9..051f090d40c1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1527,8 +1527,6 @@ long do_fork(unsigned long clone_flags, | |||
1527 | init_completion(&vfork); | 1527 | init_completion(&vfork); |
1528 | } | 1528 | } |
1529 | 1529 | ||
1530 | audit_finish_fork(p); | ||
1531 | |||
1532 | /* | 1530 | /* |
1533 | * We set PF_STARTING at creation in case tracing wants to | 1531 | * We set PF_STARTING at creation in case tracing wants to |
1534 | * use this to distinguish a fully live task from one that | 1532 | * use this to distinguish a fully live task from one that |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 57d4b13b631d..e8d76c5895ea 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -6,6 +6,7 @@ | |||
6 | * This defines a simple but solid secure-computing mode. | 6 | * This defines a simple but solid secure-computing mode. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/audit.h> | ||
9 | #include <linux/seccomp.h> | 10 | #include <linux/seccomp.h> |
10 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
11 | #include <linux/compat.h> | 12 | #include <linux/compat.h> |
@@ -54,6 +55,7 @@ void __secure_computing(int this_syscall) | |||
54 | #ifdef SECCOMP_DEBUG | 55 | #ifdef SECCOMP_DEBUG |
55 | dump_stack(); | 56 | dump_stack(); |
56 | #endif | 57 | #endif |
58 | audit_seccomp(this_syscall); | ||
57 | do_exit(SIGKILL); | 59 | do_exit(SIGKILL); |
58 | } | 60 | } |
59 | 61 | ||
diff --git a/security/integrity/ima/ima_audit.c b/security/integrity/ima/ima_audit.c index c5c5a72c30be..2ad942fb1e23 100644 --- a/security/integrity/ima/ima_audit.c +++ b/security/integrity/ima/ima_audit.c | |||
@@ -56,9 +56,11 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode, | |||
56 | audit_log_format(ab, " name="); | 56 | audit_log_format(ab, " name="); |
57 | audit_log_untrustedstring(ab, fname); | 57 | audit_log_untrustedstring(ab, fname); |
58 | } | 58 | } |
59 | if (inode) | 59 | if (inode) { |
60 | audit_log_format(ab, " dev=%s ino=%lu", | 60 | audit_log_format(ab, " dev="); |
61 | inode->i_sb->s_id, inode->i_ino); | 61 | audit_log_untrustedstring(ab, inode->i_sb->s_id); |
62 | audit_log_format(ab, " ino=%lu", inode->i_ino); | ||
63 | } | ||
62 | audit_log_format(ab, " res=%d", !result ? 0 : 1); | 64 | audit_log_format(ab, " res=%d", !result ? 0 : 1); |
63 | audit_log_end(ab); | 65 | audit_log_end(ab); |
64 | } | 66 | } |
diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 7bd6f138236b..293b8c45b1d1 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c | |||
@@ -232,13 +232,14 @@ static void dump_common_audit_data(struct audit_buffer *ab, | |||
232 | case LSM_AUDIT_DATA_PATH: { | 232 | case LSM_AUDIT_DATA_PATH: { |
233 | struct inode *inode; | 233 | struct inode *inode; |
234 | 234 | ||
235 | audit_log_d_path(ab, "path=", &a->u.path); | 235 | audit_log_d_path(ab, " path=", &a->u.path); |
236 | 236 | ||
237 | inode = a->u.path.dentry->d_inode; | 237 | inode = a->u.path.dentry->d_inode; |
238 | if (inode) | 238 | if (inode) { |
239 | audit_log_format(ab, " dev=%s ino=%lu", | 239 | audit_log_format(ab, " dev="); |
240 | inode->i_sb->s_id, | 240 | audit_log_untrustedstring(ab, inode->i_sb->s_id); |
241 | inode->i_ino); | 241 | audit_log_format(ab, " ino=%lu", inode->i_ino); |
242 | } | ||
242 | break; | 243 | break; |
243 | } | 244 | } |
244 | case LSM_AUDIT_DATA_DENTRY: { | 245 | case LSM_AUDIT_DATA_DENTRY: { |
@@ -248,10 +249,11 @@ static void dump_common_audit_data(struct audit_buffer *ab, | |||
248 | audit_log_untrustedstring(ab, a->u.dentry->d_name.name); | 249 | audit_log_untrustedstring(ab, a->u.dentry->d_name.name); |
249 | 250 | ||
250 | inode = a->u.dentry->d_inode; | 251 | inode = a->u.dentry->d_inode; |
251 | if (inode) | 252 | if (inode) { |
252 | audit_log_format(ab, " dev=%s ino=%lu", | 253 | audit_log_format(ab, " dev="); |
253 | inode->i_sb->s_id, | 254 | audit_log_untrustedstring(ab, inode->i_sb->s_id); |
254 | inode->i_ino); | 255 | audit_log_format(ab, " ino=%lu", inode->i_ino); |
256 | } | ||
255 | break; | 257 | break; |
256 | } | 258 | } |
257 | case LSM_AUDIT_DATA_INODE: { | 259 | case LSM_AUDIT_DATA_INODE: { |
@@ -266,8 +268,9 @@ static void dump_common_audit_data(struct audit_buffer *ab, | |||
266 | dentry->d_name.name); | 268 | dentry->d_name.name); |
267 | dput(dentry); | 269 | dput(dentry); |
268 | } | 270 | } |
269 | audit_log_format(ab, " dev=%s ino=%lu", inode->i_sb->s_id, | 271 | audit_log_format(ab, " dev="); |
270 | inode->i_ino); | 272 | audit_log_untrustedstring(ab, inode->i_sb->s_id); |
273 | audit_log_format(ab, " ino=%lu", inode->i_ino); | ||
271 | break; | 274 | break; |
272 | } | 275 | } |
273 | case LSM_AUDIT_DATA_TASK: | 276 | case LSM_AUDIT_DATA_TASK: |
@@ -315,7 +318,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, | |||
315 | .dentry = u->dentry, | 318 | .dentry = u->dentry, |
316 | .mnt = u->mnt | 319 | .mnt = u->mnt |
317 | }; | 320 | }; |
318 | audit_log_d_path(ab, "path=", &path); | 321 | audit_log_d_path(ab, " path=", &path); |
319 | break; | 322 | break; |
320 | } | 323 | } |
321 | if (!u->addr) | 324 | if (!u->addr) |
diff --git a/sound/core/Kconfig b/sound/core/Kconfig index ad409381f8cc..b413ed05e74d 100644 --- a/sound/core/Kconfig +++ b/sound/core/Kconfig | |||
@@ -12,6 +12,9 @@ config SND_HWDEP | |||
12 | config SND_RAWMIDI | 12 | config SND_RAWMIDI |
13 | tristate | 13 | tristate |
14 | 14 | ||
15 | config SND_COMPRESS_OFFLOAD | ||
16 | tristate | ||
17 | |||
15 | # To be effective this also requires INPUT - users should say: | 18 | # To be effective this also requires INPUT - users should say: |
16 | # select SND_JACK if INPUT=y || INPUT=SND | 19 | # select SND_JACK if INPUT=y || INPUT=SND |
17 | # to avoid having to force INPUT on. | 20 | # to avoid having to force INPUT on. |
@@ -154,16 +157,6 @@ config SND_DYNAMIC_MINORS | |||
154 | 157 | ||
155 | If you are unsure about this, say N here. | 158 | If you are unsure about this, say N here. |
156 | 159 | ||
157 | config SND_COMPRESS_OFFLOAD | ||
158 | tristate "ALSA Compressed audio offload support" | ||
159 | default n | ||
160 | help | ||
161 | If you want support for offloading compressed audio and have such | ||
162 | a hardware, then you should say Y here and also to the DSP driver | ||
163 | of your platform. | ||
164 | |||
165 | If you are unsure about this, say N here. | ||
166 | |||
167 | config SND_SUPPORT_OLD_API | 160 | config SND_SUPPORT_OLD_API |
168 | bool "Support old ALSA API" | 161 | bool "Support old ALSA API" |
169 | default y | 162 | default y |
diff --git a/sound/pci/au88x0/au88x0.c b/sound/pci/au88x0/au88x0.c index 762bb108c51c..f13ad536b2d5 100644 --- a/sound/pci/au88x0/au88x0.c +++ b/sound/pci/au88x0/au88x0.c | |||
@@ -268,8 +268,14 @@ snd_vortex_probe(struct pci_dev *pci, const struct pci_device_id *pci_id) | |||
268 | card->shortname, chip->io, chip->irq); | 268 | card->shortname, chip->io, chip->irq); |
269 | 269 | ||
270 | // (4) Alloc components. | 270 | // (4) Alloc components. |
271 | err = snd_vortex_mixer(chip); | ||
272 | if (err < 0) { | ||
273 | snd_card_free(card); | ||
274 | return err; | ||
275 | } | ||
271 | // ADB pcm. | 276 | // ADB pcm. |
272 | if ((err = snd_vortex_new_pcm(chip, VORTEX_PCM_ADB, NR_ADB)) < 0) { | 277 | err = snd_vortex_new_pcm(chip, VORTEX_PCM_ADB, NR_PCM); |
278 | if (err < 0) { | ||
273 | snd_card_free(card); | 279 | snd_card_free(card); |
274 | return err; | 280 | return err; |
275 | } | 281 | } |
@@ -299,11 +305,6 @@ snd_vortex_probe(struct pci_dev *pci, const struct pci_device_id *pci_id) | |||
299 | return err; | 305 | return err; |
300 | } | 306 | } |
301 | #endif | 307 | #endif |
302 | // snd_ac97_mixer and Vortex mixer. | ||
303 | if ((err = snd_vortex_mixer(chip)) < 0) { | ||
304 | snd_card_free(card); | ||
305 | return err; | ||
306 | } | ||
307 | if ((err = snd_vortex_midi(chip)) < 0) { | 308 | if ((err = snd_vortex_midi(chip)) < 0) { |
308 | snd_card_free(card); | 309 | snd_card_free(card); |
309 | return err; | 310 | return err; |
diff --git a/sound/pci/au88x0/au88x0.h b/sound/pci/au88x0/au88x0.h index 02f6e08f7592..bb938153a964 100644 --- a/sound/pci/au88x0/au88x0.h +++ b/sound/pci/au88x0/au88x0.h | |||
@@ -105,6 +105,7 @@ | |||
105 | #define MIX_SPDIF(x) (vortex->mixspdif[x]) | 105 | #define MIX_SPDIF(x) (vortex->mixspdif[x]) |
106 | 106 | ||
107 | #define NR_WTPB 0x20 /* WT channels per each bank. */ | 107 | #define NR_WTPB 0x20 /* WT channels per each bank. */ |
108 | #define NR_PCM 0x10 | ||
108 | 109 | ||
109 | /* Structs */ | 110 | /* Structs */ |
110 | typedef struct { | 111 | typedef struct { |
diff --git a/sound/pci/au88x0/au88x0_pcm.c b/sound/pci/au88x0/au88x0_pcm.c index 0488633ea874..0ef2f9712208 100644 --- a/sound/pci/au88x0/au88x0_pcm.c +++ b/sound/pci/au88x0/au88x0_pcm.c | |||
@@ -168,6 +168,7 @@ static int snd_vortex_pcm_open(struct snd_pcm_substream *substream) | |||
168 | runtime->hw = snd_vortex_playback_hw_adb; | 168 | runtime->hw = snd_vortex_playback_hw_adb; |
169 | #ifdef CHIP_AU8830 | 169 | #ifdef CHIP_AU8830 |
170 | if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK && | 170 | if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK && |
171 | VORTEX_IS_QUAD(vortex) && | ||
171 | VORTEX_PCM_TYPE(substream->pcm) == VORTEX_PCM_ADB) { | 172 | VORTEX_PCM_TYPE(substream->pcm) == VORTEX_PCM_ADB) { |
172 | runtime->hw.channels_max = 4; | 173 | runtime->hw.channels_max = 4; |
173 | snd_pcm_hw_constraint_list(runtime, 0, | 174 | snd_pcm_hw_constraint_list(runtime, 0, |
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 0852e204a4c8..fb35474c1203 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c | |||
@@ -2498,6 +2498,7 @@ static struct snd_pci_quirk position_fix_list[] __devinitdata = { | |||
2498 | SND_PCI_QUIRK(0x1043, 0x81b3, "ASUS", POS_FIX_LPIB), | 2498 | SND_PCI_QUIRK(0x1043, 0x81b3, "ASUS", POS_FIX_LPIB), |
2499 | SND_PCI_QUIRK(0x1043, 0x81e7, "ASUS M2V", POS_FIX_LPIB), | 2499 | SND_PCI_QUIRK(0x1043, 0x81e7, "ASUS M2V", POS_FIX_LPIB), |
2500 | SND_PCI_QUIRK(0x104d, 0x9069, "Sony VPCS11V9E", POS_FIX_LPIB), | 2500 | SND_PCI_QUIRK(0x104d, 0x9069, "Sony VPCS11V9E", POS_FIX_LPIB), |
2501 | SND_PCI_QUIRK(0x10de, 0xcb89, "Macbook Pro 7,1", POS_FIX_LPIB), | ||
2501 | SND_PCI_QUIRK(0x1297, 0x3166, "Shuttle", POS_FIX_LPIB), | 2502 | SND_PCI_QUIRK(0x1297, 0x3166, "Shuttle", POS_FIX_LPIB), |
2502 | SND_PCI_QUIRK(0x1458, 0xa022, "ga-ma770-ud3", POS_FIX_LPIB), | 2503 | SND_PCI_QUIRK(0x1458, 0xa022, "ga-ma770-ud3", POS_FIX_LPIB), |
2503 | SND_PCI_QUIRK(0x1462, 0x1002, "MSI Wind U115", POS_FIX_LPIB), | 2504 | SND_PCI_QUIRK(0x1462, 0x1002, "MSI Wind U115", POS_FIX_LPIB), |
diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c index 87e684fa830f..3556408d6ece 100644 --- a/sound/pci/hda/patch_sigmatel.c +++ b/sound/pci/hda/patch_sigmatel.c | |||
@@ -1596,7 +1596,7 @@ static const struct snd_pci_quirk stac92hd73xx_cfg_tbl[] = { | |||
1596 | SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x02bd, | 1596 | SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x02bd, |
1597 | "Dell Studio 1557", STAC_DELL_M6_DMIC), | 1597 | "Dell Studio 1557", STAC_DELL_M6_DMIC), |
1598 | SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x02fe, | 1598 | SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x02fe, |
1599 | "Dell Studio XPS 1645", STAC_DELL_M6_BOTH), | 1599 | "Dell Studio XPS 1645", STAC_DELL_M6_DMIC), |
1600 | SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x0413, | 1600 | SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x0413, |
1601 | "Dell Studio 1558", STAC_DELL_M6_DMIC), | 1601 | "Dell Studio 1558", STAC_DELL_M6_DMIC), |
1602 | {} /* terminator */ | 1602 | {} /* terminator */ |
diff --git a/sound/pci/oxygen/xonar_wm87x6.c b/sound/pci/oxygen/xonar_wm87x6.c index 478303e6c2b0..63cff90706bf 100644 --- a/sound/pci/oxygen/xonar_wm87x6.c +++ b/sound/pci/oxygen/xonar_wm87x6.c | |||
@@ -177,6 +177,7 @@ static void wm8776_registers_init(struct oxygen *chip) | |||
177 | struct xonar_wm87x6 *data = chip->model_data; | 177 | struct xonar_wm87x6 *data = chip->model_data; |
178 | 178 | ||
179 | wm8776_write(chip, WM8776_RESET, 0); | 179 | wm8776_write(chip, WM8776_RESET, 0); |
180 | wm8776_write(chip, WM8776_PHASESWAP, WM8776_PH_MASK); | ||
180 | wm8776_write(chip, WM8776_DACCTRL1, WM8776_DZCEN | | 181 | wm8776_write(chip, WM8776_DACCTRL1, WM8776_DZCEN | |
181 | WM8776_PL_LEFT_LEFT | WM8776_PL_RIGHT_RIGHT); | 182 | WM8776_PL_LEFT_LEFT | WM8776_PL_RIGHT_RIGHT); |
182 | wm8776_write(chip, WM8776_DACMUTE, chip->dac_mute ? WM8776_DMUTE : 0); | 183 | wm8776_write(chip, WM8776_DACMUTE, chip->dac_mute ? WM8776_DMUTE : 0); |