-rw-r--r--  arch/x86/Makefile               |    5
-rw-r--r--  arch/x86/include/asm/xor_32.h   |    6
-rw-r--r--  arch/x86/include/asm/xor_64.h   |    8
-rw-r--r--  arch/x86/include/asm/xor_avx.h  |  214
-rw-r--r--  crypto/xor.c                    |   13
-rw-r--r--  drivers/md/bitmap.c             | 1100
-rw-r--r--  drivers/md/bitmap.h             |   60
-rw-r--r--  drivers/md/dm-raid.c            |   22
-rw-r--r--  drivers/md/md.c                 |  370
-rw-r--r--  drivers/md/md.h                 |   12
-rw-r--r--  drivers/md/raid1.c              |   22
-rw-r--r--  drivers/md/raid10.c             | 1281
-rw-r--r--  drivers/md/raid10.h             |   34
-rw-r--r--  drivers/md/raid5.c              |  252
-rw-r--r--  drivers/md/raid5.h              |    7
-rw-r--r--  include/linux/raid/md_p.h       |   15
-rw-r--r--  include/linux/raid/pq.h         |   18
-rw-r--r--  lib/raid6/Makefile              |    2
-rw-r--r--  lib/raid6/algos.c               |  127
-rw-r--r--  lib/raid6/mktables.c            |   25
-rw-r--r--  lib/raid6/recov.c               |   15
-rw-r--r--  lib/raid6/recov_ssse3.c         |  335
-rw-r--r--  lib/raid6/test/Makefile         |    2
-rw-r--r--  lib/raid6/test/test.c           |   32
-rw-r--r--  lib/raid6/x86.h                 |   15
25 files changed, 3124 insertions(+), 868 deletions(-)
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index dc611a40a336..1f2521434554 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
| @@ -115,9 +115,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI | |||
| 115 | 115 | ||
| 116 | # does binutils support specific instructions? | 116 | # does binutils support specific instructions? |
| 117 | asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) | 117 | asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) |
| 118 | avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) | ||
| 118 | 119 | ||
| 119 | KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) | 120 | KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) |
| 120 | KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) | 121 | KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) |
| 121 | 122 | ||
| 122 | LDFLAGS := -m elf_$(UTS_MACHINE) | 123 | LDFLAGS := -m elf_$(UTS_MACHINE) |
| 123 | 124 | ||
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 133b40a0f495..454570891bdc 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
| @@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = { | |||
| 861 | .do_5 = xor_sse_5, | 861 | .do_5 = xor_sse_5, |
| 862 | }; | 862 | }; |
| 863 | 863 | ||
| 864 | /* Also try the AVX routines */ | ||
| 865 | #include "xor_avx.h" | ||
| 866 | |||
| 864 | /* Also try the generic routines. */ | 867 | /* Also try the generic routines. */ |
| 865 | #include <asm-generic/xor.h> | 868 | #include <asm-generic/xor.h> |
| 866 | 869 | ||
| @@ -871,6 +874,7 @@ do { \ | |||
| 871 | xor_speed(&xor_block_8regs_p); \ | 874 | xor_speed(&xor_block_8regs_p); \ |
| 872 | xor_speed(&xor_block_32regs); \ | 875 | xor_speed(&xor_block_32regs); \ |
| 873 | xor_speed(&xor_block_32regs_p); \ | 876 | xor_speed(&xor_block_32regs_p); \ |
| 877 | AVX_XOR_SPEED; \ | ||
| 874 | if (cpu_has_xmm) \ | 878 | if (cpu_has_xmm) \ |
| 875 | xor_speed(&xor_block_pIII_sse); \ | 879 | xor_speed(&xor_block_pIII_sse); \ |
| 876 | if (cpu_has_mmx) { \ | 880 | if (cpu_has_mmx) { \ |
| @@ -883,6 +887,6 @@ do { \ | |||
| 883 | We may also be able to load into the L1 only depending on how the cpu | 887 | We may also be able to load into the L1 only depending on how the cpu |
| 884 | deals with a load to a line that is being prefetched. */ | 888 | deals with a load to a line that is being prefetched. */ |
| 885 | #define XOR_SELECT_TEMPLATE(FASTEST) \ | 889 | #define XOR_SELECT_TEMPLATE(FASTEST) \ |
| 886 | (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) | 890 | AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) |
| 887 | 891 | ||
| 888 | #endif /* _ASM_X86_XOR_32_H */ | 892 | #endif /* _ASM_X86_XOR_32_H */ |
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1549b5e261f6..b9b2323e90fe 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
| @@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = { | |||
| 347 | .do_5 = xor_sse_5, | 347 | .do_5 = xor_sse_5, |
| 348 | }; | 348 | }; |
| 349 | 349 | ||
| 350 | |||
| 351 | /* Also try the AVX routines */ | ||
| 352 | #include "xor_avx.h" | ||
| 353 | |||
| 350 | #undef XOR_TRY_TEMPLATES | 354 | #undef XOR_TRY_TEMPLATES |
| 351 | #define XOR_TRY_TEMPLATES \ | 355 | #define XOR_TRY_TEMPLATES \ |
| 352 | do { \ | 356 | do { \ |
| 357 | AVX_XOR_SPEED; \ | ||
| 353 | xor_speed(&xor_block_sse); \ | 358 | xor_speed(&xor_block_sse); \ |
| 354 | } while (0) | 359 | } while (0) |
| 355 | 360 | ||
| 356 | /* We force the use of the SSE xor block because it can write around L2. | 361 | /* We force the use of the SSE xor block because it can write around L2. |
| 357 | We may also be able to load into the L1 only depending on how the cpu | 362 | We may also be able to load into the L1 only depending on how the cpu |
| 358 | deals with a load to a line that is being prefetched. */ | 363 | deals with a load to a line that is being prefetched. */ |
| 359 | #define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) | 364 | #define XOR_SELECT_TEMPLATE(FASTEST) \ |
| 365 | AVX_SELECT(&xor_block_sse) | ||
| 360 | 366 | ||
| 361 | #endif /* _ASM_X86_XOR_64_H */ | 367 | #endif /* _ASM_X86_XOR_64_H */ |
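
On x86-64 the template selection is no longer hard-wired to the SSE routines: XOR_SELECT_TEMPLATE now goes through AVX_SELECT(), defined in the new xor_avx.h below. Roughly, the selection reduces to the following sketch (an illustrative expansion, not literal kernel code; the helper name is hypothetical):

    /* Effective xor template choice on x86-64 after this patch. */
    static struct xor_block_template *pick_xor_template(void)
    {
    #ifdef CONFIG_AS_AVX
    	return cpu_has_avx ? &xor_block_avx : &xor_block_sse;
    #else
    	return &xor_block_sse;	/* AVX_SELECT(FASTEST) falls back to FASTEST */
    #endif
    }
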
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 000000000000..2510d35f480e
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
| @@ -0,0 +1,214 @@ | |||
| 1 | #ifndef _ASM_X86_XOR_AVX_H | ||
| 2 | #define _ASM_X86_XOR_AVX_H | ||
| 3 | |||
| 4 | /* | ||
| 5 | * Optimized RAID-5 checksumming functions for AVX | ||
| 6 | * | ||
| 7 | * Copyright (C) 2012 Intel Corporation | ||
| 8 | * Author: Jim Kukunas <james.t.kukunas@linux.intel.com> | ||
| 9 | * | ||
| 10 | * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines | ||
| 11 | * | ||
| 12 | * This program is free software; you can redistribute it and/or | ||
| 13 | * modify it under the terms of the GNU General Public License | ||
| 14 | * as published by the Free Software Foundation; version 2 | ||
| 15 | * of the License. | ||
| 16 | */ | ||
| 17 | |||
| 18 | #ifdef CONFIG_AS_AVX | ||
| 19 | |||
| 20 | #include <linux/compiler.h> | ||
| 21 | #include <asm/i387.h> | ||
| 22 | |||
| 23 | #define ALIGN32 __aligned(32) | ||
| 24 | |||
| 25 | #define YMM_SAVED_REGS 4 | ||
| 26 | |||
| 27 | #define YMMS_SAVE \ | ||
| 28 | do { \ | ||
| 29 | preempt_disable(); \ | ||
| 30 | cr0 = read_cr0(); \ | ||
| 31 | clts(); \ | ||
| 32 | asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \ | ||
| 33 | asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \ | ||
| 34 | asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \ | ||
| 35 | asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \ | ||
| 36 | } while (0); | ||
| 37 | |||
| 38 | #define YMMS_RESTORE \ | ||
| 39 | do { \ | ||
| 40 | asm volatile("sfence" : : : "memory"); \ | ||
| 41 | asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \ | ||
| 42 | asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \ | ||
| 43 | asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \ | ||
| 44 | asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \ | ||
| 45 | write_cr0(cr0); \ | ||
| 46 | preempt_enable(); \ | ||
| 47 | } while (0); | ||
| 48 | |||
| 49 | #define BLOCK4(i) \ | ||
| 50 | BLOCK(32 * i, 0) \ | ||
| 51 | BLOCK(32 * (i + 1), 1) \ | ||
| 52 | BLOCK(32 * (i + 2), 2) \ | ||
| 53 | BLOCK(32 * (i + 3), 3) | ||
| 54 | |||
| 55 | #define BLOCK16() \ | ||
| 56 | BLOCK4(0) \ | ||
| 57 | BLOCK4(4) \ | ||
| 58 | BLOCK4(8) \ | ||
| 59 | BLOCK4(12) | ||
| 60 | |||
| 61 | static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1) | ||
| 62 | { | ||
| 63 | unsigned long cr0, lines = bytes >> 9; | ||
| 64 | char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; | ||
| 65 | |||
| 66 | YMMS_SAVE | ||
| 67 | |||
| 68 | while (lines--) { | ||
| 69 | #undef BLOCK | ||
| 70 | #define BLOCK(i, reg) \ | ||
| 71 | do { \ | ||
| 72 | asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \ | ||
| 73 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | ||
| 74 | "m" (p0[i / sizeof(*p0)])); \ | ||
| 75 | asm volatile("vmovdqa %%ymm" #reg ", %0" : \ | ||
| 76 | "=m" (p0[i / sizeof(*p0)])); \ | ||
| 77 | } while (0); | ||
| 78 | |||
| 79 | BLOCK16() | ||
| 80 | |||
| 81 | p0 = (unsigned long *)((uintptr_t)p0 + 512); | ||
| 82 | p1 = (unsigned long *)((uintptr_t)p1 + 512); | ||
| 83 | } | ||
| 84 | |||
| 85 | YMMS_RESTORE | ||
| 86 | } | ||
| 87 | |||
| 88 | static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1, | ||
| 89 | unsigned long *p2) | ||
| 90 | { | ||
| 91 | unsigned long cr0, lines = bytes >> 9; | ||
| 92 | char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; | ||
| 93 | |||
| 94 | YMMS_SAVE | ||
| 95 | |||
| 96 | while (lines--) { | ||
| 97 | #undef BLOCK | ||
| 98 | #define BLOCK(i, reg) \ | ||
| 99 | do { \ | ||
| 100 | asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \ | ||
| 101 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | ||
| 102 | "m" (p1[i / sizeof(*p1)])); \ | ||
| 103 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | ||
| 104 | "m" (p0[i / sizeof(*p0)])); \ | ||
| 105 | asm volatile("vmovdqa %%ymm" #reg ", %0" : \ | ||
| 106 | "=m" (p0[i / sizeof(*p0)])); \ | ||
| 107 | } while (0); | ||
| 108 | |||
| 109 | BLOCK16() | ||
| 110 | |||
| 111 | p0 = (unsigned long *)((uintptr_t)p0 + 512); | ||
| 112 | p1 = (unsigned long *)((uintptr_t)p1 + 512); | ||
| 113 | p2 = (unsigned long *)((uintptr_t)p2 + 512); | ||
| 114 | } | ||
| 115 | |||
| 116 | YMMS_RESTORE | ||
| 117 | } | ||
| 118 | |||
| 119 | static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1, | ||
| 120 | unsigned long *p2, unsigned long *p3) | ||
| 121 | { | ||
| 122 | unsigned long cr0, lines = bytes >> 9; | ||
| 123 | char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; | ||
| 124 | |||
| 125 | YMMS_SAVE | ||
| 126 | |||
| 127 | while (lines--) { | ||
| 128 | #undef BLOCK | ||
| 129 | #define BLOCK(i, reg) \ | ||
| 130 | do { \ | ||
| 131 | asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \ | ||
| 132 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | ||
| 133 | "m" (p2[i / sizeof(*p2)])); \ | ||
| 134 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | ||
| 135 | "m" (p1[i / sizeof(*p1)])); \ | ||
| 136 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | ||
| 137 | "m" (p0[i / sizeof(*p0)])); \ | ||
| 138 | asm volatile("vmovdqa %%ymm" #reg ", %0" : \ | ||
| 139 | "=m" (p0[i / sizeof(*p0)])); \ | ||
| 140 | } while (0); | ||
| 141 | |||
| 142 | BLOCK16(); | ||
| 143 | |||
| 144 | p0 = (unsigned long *)((uintptr_t)p0 + 512); | ||
| 145 | p1 = (unsigned long *)((uintptr_t)p1 + 512); | ||
| 146 | p2 = (unsigned long *)((uintptr_t)p2 + 512); | ||
| 147 | p3 = (unsigned long *)((uintptr_t)p3 + 512); | ||
| 148 | } | ||
| 149 | |||
| 150 | YMMS_RESTORE | ||
| 151 | } | ||
| 152 | |||
| 153 | static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1, | ||
| 154 | unsigned long *p2, unsigned long *p3, unsigned long *p4) | ||
| 155 | { | ||
| 156 | unsigned long cr0, lines = bytes >> 9; | ||
| 157 | char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; | ||
| 158 | |||
| 159 | YMMS_SAVE | ||
| 160 | |||
| 161 | while (lines--) { | ||
| 162 | #undef BLOCK | ||
| 163 | #define BLOCK(i, reg) \ | ||
| 164 | do { \ | ||
| 165 | asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \ | ||
| 166 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | ||
| 167 | "m" (p3[i / sizeof(*p3)])); \ | ||
| 168 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | ||
| 169 | "m" (p2[i / sizeof(*p2)])); \ | ||
| 170 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | ||
| 171 | "m" (p1[i / sizeof(*p1)])); \ | ||
| 172 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | ||
| 173 | "m" (p0[i / sizeof(*p0)])); \ | ||
| 174 | asm volatile("vmovdqa %%ymm" #reg ", %0" : \ | ||
| 175 | "=m" (p0[i / sizeof(*p0)])); \ | ||
| 176 | } while (0); | ||
| 177 | |||
| 178 | BLOCK16() | ||
| 179 | |||
| 180 | p0 = (unsigned long *)((uintptr_t)p0 + 512); | ||
| 181 | p1 = (unsigned long *)((uintptr_t)p1 + 512); | ||
| 182 | p2 = (unsigned long *)((uintptr_t)p2 + 512); | ||
| 183 | p3 = (unsigned long *)((uintptr_t)p3 + 512); | ||
| 184 | p4 = (unsigned long *)((uintptr_t)p4 + 512); | ||
| 185 | } | ||
| 186 | |||
| 187 | YMMS_RESTORE | ||
| 188 | } | ||
| 189 | |||
| 190 | static struct xor_block_template xor_block_avx = { | ||
| 191 | .name = "avx", | ||
| 192 | .do_2 = xor_avx_2, | ||
| 193 | .do_3 = xor_avx_3, | ||
| 194 | .do_4 = xor_avx_4, | ||
| 195 | .do_5 = xor_avx_5, | ||
| 196 | }; | ||
| 197 | |||
| 198 | #define AVX_XOR_SPEED \ | ||
| 199 | do { \ | ||
| 200 | if (cpu_has_avx) \ | ||
| 201 | xor_speed(&xor_block_avx); \ | ||
| 202 | } while (0) | ||
| 203 | |||
| 204 | #define AVX_SELECT(FASTEST) \ | ||
| 205 | (cpu_has_avx ? &xor_block_avx : FASTEST) | ||
| 206 | |||
| 207 | #else | ||
| 208 | |||
| 209 | #define AVX_XOR_SPEED {} | ||
| 210 | |||
| 211 | #define AVX_SELECT(FASTEST) (FASTEST) | ||
| 212 | |||
| 213 | #endif | ||
| 214 | #endif | ||
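
Each do_N member of an xor_block_template XORs N source buffers into the first argument over 'bytes' bytes; the AVX routines above process 512 bytes per loop iteration (sixteen 32-byte ymm loads/stores) with the ymm registers saved and restored around the whole run. A minimal scalar sketch of the same do_2/do_3 semantics, for reference only (not kernel code; names are hypothetical):

    /* Scalar reference for what xor_avx_2()/xor_avx_3() compute: the result
     * is accumulated into p0; 'bytes' is a multiple of the word size. */
    static void xor_ref_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
    {
    	unsigned long n = bytes / sizeof(unsigned long);
    	while (n--)
    		*p0++ ^= *p1++;
    }

    static void xor_ref_3(unsigned long bytes, unsigned long *p0,
    		      unsigned long *p1, unsigned long *p2)
    {
    	unsigned long n = bytes / sizeof(unsigned long);
    	while (n--)
    		*p0++ ^= *p1++ ^ *p2++;
    }
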
diff --git a/crypto/xor.c b/crypto/xor.c
index 664b6dfa9e2c..65c7b416b4a3 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/gfp.h> | 21 | #include <linux/gfp.h> |
| 22 | #include <linux/raid/xor.h> | 22 | #include <linux/raid/xor.h> |
| 23 | #include <linux/jiffies.h> | 23 | #include <linux/jiffies.h> |
| 24 | #include <linux/preempt.h> | ||
| 24 | #include <asm/xor.h> | 25 | #include <asm/xor.h> |
| 25 | 26 | ||
| 26 | /* The xor routines to use. */ | 27 | /* The xor routines to use. */ |
| @@ -63,12 +64,14 @@ static void | |||
| 63 | do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) | 64 | do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) |
| 64 | { | 65 | { |
| 65 | int speed; | 66 | int speed; |
| 66 | unsigned long now; | 67 | unsigned long now, j; |
| 67 | int i, count, max; | 68 | int i, count, max; |
| 68 | 69 | ||
| 69 | tmpl->next = template_list; | 70 | tmpl->next = template_list; |
| 70 | template_list = tmpl; | 71 | template_list = tmpl; |
| 71 | 72 | ||
| 73 | preempt_disable(); | ||
| 74 | |||
| 72 | /* | 75 | /* |
| 73 | * Count the number of XORs done during a whole jiffy, and use | 76 | * Count the number of XORs done during a whole jiffy, and use |
| 74 | * this to calculate the speed of checksumming. We use a 2-page | 77 | * this to calculate the speed of checksumming. We use a 2-page |
| @@ -76,9 +79,11 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) | |||
| 76 | */ | 79 | */ |
| 77 | max = 0; | 80 | max = 0; |
| 78 | for (i = 0; i < 5; i++) { | 81 | for (i = 0; i < 5; i++) { |
| 79 | now = jiffies; | 82 | j = jiffies; |
| 80 | count = 0; | 83 | count = 0; |
| 81 | while (jiffies == now) { | 84 | while ((now = jiffies) == j) |
| 85 | cpu_relax(); | ||
| 86 | while (time_before(jiffies, now + 1)) { | ||
| 82 | mb(); /* prevent loop optimzation */ | 87 | mb(); /* prevent loop optimzation */ |
| 83 | tmpl->do_2(BENCH_SIZE, b1, b2); | 88 | tmpl->do_2(BENCH_SIZE, b1, b2); |
| 84 | mb(); | 89 | mb(); |
| @@ -89,6 +94,8 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) | |||
| 89 | max = count; | 94 | max = count; |
| 90 | } | 95 | } |
| 91 | 96 | ||
| 97 | preempt_enable(); | ||
| 98 | |||
| 92 | speed = max * (HZ * BENCH_SIZE / 1024); | 99 | speed = max * (HZ * BENCH_SIZE / 1024); |
| 93 | tmpl->speed = speed; | 100 | tmpl->speed = speed; |
| 94 | 101 | ||
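
The do_xor_speed() changes above tighten the benchmark: preemption is disabled for the whole measurement, the loop first spins with cpu_relax() until a new jiffy begins, and it then counts complete do_2() calls for exactly one jiffy via time_before(). In outline (simplified from the patched function; the five-pass maximum and the final speed calculation are elided):

    preempt_disable();
    j = jiffies;
    count = 0;
    while ((now = jiffies) == j)		/* sync to the start of a jiffy */
    	cpu_relax();
    while (time_before(jiffies, now + 1)) {	/* run for one full jiffy */
    	mb();
    	tmpl->do_2(BENCH_SIZE, b1, b2);
    	mb();
    	count++;
    }
    preempt_enable();
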
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 17e2b472e16d..15dbe03117e4 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
| @@ -45,7 +45,7 @@ static inline char *bmname(struct bitmap *bitmap) | |||
| 45 | * if we find our page, we increment the page's refcount so that it stays | 45 | * if we find our page, we increment the page's refcount so that it stays |
| 46 | * allocated while we're using it | 46 | * allocated while we're using it |
| 47 | */ | 47 | */ |
| 48 | static int bitmap_checkpage(struct bitmap *bitmap, | 48 | static int bitmap_checkpage(struct bitmap_counts *bitmap, |
| 49 | unsigned long page, int create) | 49 | unsigned long page, int create) |
| 50 | __releases(bitmap->lock) | 50 | __releases(bitmap->lock) |
| 51 | __acquires(bitmap->lock) | 51 | __acquires(bitmap->lock) |
| @@ -76,8 +76,7 @@ __acquires(bitmap->lock) | |||
| 76 | spin_lock_irq(&bitmap->lock); | 76 | spin_lock_irq(&bitmap->lock); |
| 77 | 77 | ||
| 78 | if (mappage == NULL) { | 78 | if (mappage == NULL) { |
| 79 | pr_debug("%s: bitmap map page allocation failed, hijacking\n", | 79 | pr_debug("md/bitmap: map page allocation failed, hijacking\n"); |
| 80 | bmname(bitmap)); | ||
| 81 | /* failed - set the hijacked flag so that we can use the | 80 | /* failed - set the hijacked flag so that we can use the |
| 82 | * pointer as a counter */ | 81 | * pointer as a counter */ |
| 83 | if (!bitmap->bp[page].map) | 82 | if (!bitmap->bp[page].map) |
| @@ -100,7 +99,7 @@ __acquires(bitmap->lock) | |||
| 100 | /* if page is completely empty, put it back on the free list, or dealloc it */ | 99 | /* if page is completely empty, put it back on the free list, or dealloc it */ |
| 101 | /* if page was hijacked, unmark the flag so it might get alloced next time */ | 100 | /* if page was hijacked, unmark the flag so it might get alloced next time */ |
| 102 | /* Note: lock should be held when calling this */ | 101 | /* Note: lock should be held when calling this */ |
| 103 | static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) | 102 | static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page) |
| 104 | { | 103 | { |
| 105 | char *ptr; | 104 | char *ptr; |
| 106 | 105 | ||
| @@ -130,22 +129,14 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) | |||
| 130 | */ | 129 | */ |
| 131 | 130 | ||
| 132 | /* IO operations when bitmap is stored near all superblocks */ | 131 | /* IO operations when bitmap is stored near all superblocks */ |
| 133 | static struct page *read_sb_page(struct mddev *mddev, loff_t offset, | 132 | static int read_sb_page(struct mddev *mddev, loff_t offset, |
| 134 | struct page *page, | 133 | struct page *page, |
| 135 | unsigned long index, int size) | 134 | unsigned long index, int size) |
| 136 | { | 135 | { |
| 137 | /* choose a good rdev and read the page from there */ | 136 | /* choose a good rdev and read the page from there */ |
| 138 | 137 | ||
| 139 | struct md_rdev *rdev; | 138 | struct md_rdev *rdev; |
| 140 | sector_t target; | 139 | sector_t target; |
| 141 | int did_alloc = 0; | ||
| 142 | |||
| 143 | if (!page) { | ||
| 144 | page = alloc_page(GFP_KERNEL); | ||
| 145 | if (!page) | ||
| 146 | return ERR_PTR(-ENOMEM); | ||
| 147 | did_alloc = 1; | ||
| 148 | } | ||
| 149 | 140 | ||
| 150 | rdev_for_each(rdev, mddev) { | 141 | rdev_for_each(rdev, mddev) { |
| 151 | if (! test_bit(In_sync, &rdev->flags) | 142 | if (! test_bit(In_sync, &rdev->flags) |
| @@ -158,15 +149,10 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset, | |||
| 158 | roundup(size, bdev_logical_block_size(rdev->bdev)), | 149 | roundup(size, bdev_logical_block_size(rdev->bdev)), |
| 159 | page, READ, true)) { | 150 | page, READ, true)) { |
| 160 | page->index = index; | 151 | page->index = index; |
| 161 | attach_page_buffers(page, NULL); /* so that free_buffer will | 152 | return 0; |
| 162 | * quietly no-op */ | ||
| 163 | return page; | ||
| 164 | } | 153 | } |
| 165 | } | 154 | } |
| 166 | if (did_alloc) | 155 | return -EIO; |
| 167 | put_page(page); | ||
| 168 | return ERR_PTR(-EIO); | ||
| 169 | |||
| 170 | } | 156 | } |
| 171 | 157 | ||
| 172 | static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev) | 158 | static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev) |
| @@ -208,6 +194,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
| 208 | struct md_rdev *rdev = NULL; | 194 | struct md_rdev *rdev = NULL; |
| 209 | struct block_device *bdev; | 195 | struct block_device *bdev; |
| 210 | struct mddev *mddev = bitmap->mddev; | 196 | struct mddev *mddev = bitmap->mddev; |
| 197 | struct bitmap_storage *store = &bitmap->storage; | ||
| 211 | 198 | ||
| 212 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { | 199 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { |
| 213 | int size = PAGE_SIZE; | 200 | int size = PAGE_SIZE; |
| @@ -215,9 +202,13 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
| 215 | 202 | ||
| 216 | bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; | 203 | bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; |
| 217 | 204 | ||
| 218 | if (page->index == bitmap->file_pages-1) | 205 | if (page->index == store->file_pages-1) { |
| 219 | size = roundup(bitmap->last_page_size, | 206 | int last_page_size = store->bytes & (PAGE_SIZE-1); |
| 207 | if (last_page_size == 0) | ||
| 208 | last_page_size = PAGE_SIZE; | ||
| 209 | size = roundup(last_page_size, | ||
| 220 | bdev_logical_block_size(bdev)); | 210 | bdev_logical_block_size(bdev)); |
| 211 | } | ||
| 221 | /* Just make sure we aren't corrupting data or | 212 | /* Just make sure we aren't corrupting data or |
| 222 | * metadata | 213 | * metadata |
| 223 | */ | 214 | */ |
| @@ -276,10 +267,10 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) | |||
| 276 | { | 267 | { |
| 277 | struct buffer_head *bh; | 268 | struct buffer_head *bh; |
| 278 | 269 | ||
| 279 | if (bitmap->file == NULL) { | 270 | if (bitmap->storage.file == NULL) { |
| 280 | switch (write_sb_page(bitmap, page, wait)) { | 271 | switch (write_sb_page(bitmap, page, wait)) { |
| 281 | case -EINVAL: | 272 | case -EINVAL: |
| 282 | bitmap->flags |= BITMAP_WRITE_ERROR; | 273 | set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); |
| 283 | } | 274 | } |
| 284 | } else { | 275 | } else { |
| 285 | 276 | ||
| @@ -297,20 +288,16 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) | |||
| 297 | wait_event(bitmap->write_wait, | 288 | wait_event(bitmap->write_wait, |
| 298 | atomic_read(&bitmap->pending_writes)==0); | 289 | atomic_read(&bitmap->pending_writes)==0); |
| 299 | } | 290 | } |
| 300 | if (bitmap->flags & BITMAP_WRITE_ERROR) | 291 | if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) |
| 301 | bitmap_file_kick(bitmap); | 292 | bitmap_file_kick(bitmap); |
| 302 | } | 293 | } |
| 303 | 294 | ||
| 304 | static void end_bitmap_write(struct buffer_head *bh, int uptodate) | 295 | static void end_bitmap_write(struct buffer_head *bh, int uptodate) |
| 305 | { | 296 | { |
| 306 | struct bitmap *bitmap = bh->b_private; | 297 | struct bitmap *bitmap = bh->b_private; |
| 307 | unsigned long flags; | ||
| 308 | 298 | ||
| 309 | if (!uptodate) { | 299 | if (!uptodate) |
| 310 | spin_lock_irqsave(&bitmap->lock, flags); | 300 | set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); |
| 311 | bitmap->flags |= BITMAP_WRITE_ERROR; | ||
| 312 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
| 313 | } | ||
| 314 | if (atomic_dec_and_test(&bitmap->pending_writes)) | 301 | if (atomic_dec_and_test(&bitmap->pending_writes)) |
| 315 | wake_up(&bitmap->write_wait); | 302 | wake_up(&bitmap->write_wait); |
| 316 | } | 303 | } |
| @@ -325,8 +312,12 @@ __clear_page_buffers(struct page *page) | |||
| 325 | } | 312 | } |
| 326 | static void free_buffers(struct page *page) | 313 | static void free_buffers(struct page *page) |
| 327 | { | 314 | { |
| 328 | struct buffer_head *bh = page_buffers(page); | 315 | struct buffer_head *bh; |
| 329 | 316 | ||
| 317 | if (!PagePrivate(page)) | ||
| 318 | return; | ||
| 319 | |||
| 320 | bh = page_buffers(page); | ||
| 330 | while (bh) { | 321 | while (bh) { |
| 331 | struct buffer_head *next = bh->b_this_page; | 322 | struct buffer_head *next = bh->b_this_page; |
| 332 | free_buffer_head(bh); | 323 | free_buffer_head(bh); |
| @@ -343,11 +334,12 @@ static void free_buffers(struct page *page) | |||
| 343 | * This usage is similar to how swap files are handled, and allows us | 334 | * This usage is similar to how swap files are handled, and allows us |
| 344 | * to write to a file with no concerns of memory allocation failing. | 335 | * to write to a file with no concerns of memory allocation failing. |
| 345 | */ | 336 | */ |
| 346 | static struct page *read_page(struct file *file, unsigned long index, | 337 | static int read_page(struct file *file, unsigned long index, |
| 347 | struct bitmap *bitmap, | 338 | struct bitmap *bitmap, |
| 348 | unsigned long count) | 339 | unsigned long count, |
| 340 | struct page *page) | ||
| 349 | { | 341 | { |
| 350 | struct page *page = NULL; | 342 | int ret = 0; |
| 351 | struct inode *inode = file->f_path.dentry->d_inode; | 343 | struct inode *inode = file->f_path.dentry->d_inode; |
| 352 | struct buffer_head *bh; | 344 | struct buffer_head *bh; |
| 353 | sector_t block; | 345 | sector_t block; |
| @@ -355,16 +347,9 @@ static struct page *read_page(struct file *file, unsigned long index, | |||
| 355 | pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, | 347 | pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, |
| 356 | (unsigned long long)index << PAGE_SHIFT); | 348 | (unsigned long long)index << PAGE_SHIFT); |
| 357 | 349 | ||
| 358 | page = alloc_page(GFP_KERNEL); | ||
| 359 | if (!page) | ||
| 360 | page = ERR_PTR(-ENOMEM); | ||
| 361 | if (IS_ERR(page)) | ||
| 362 | goto out; | ||
| 363 | |||
| 364 | bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); | 350 | bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); |
| 365 | if (!bh) { | 351 | if (!bh) { |
| 366 | put_page(page); | 352 | ret = -ENOMEM; |
| 367 | page = ERR_PTR(-ENOMEM); | ||
| 368 | goto out; | 353 | goto out; |
| 369 | } | 354 | } |
| 370 | attach_page_buffers(page, bh); | 355 | attach_page_buffers(page, bh); |
| @@ -376,8 +361,7 @@ static struct page *read_page(struct file *file, unsigned long index, | |||
| 376 | bh->b_blocknr = bmap(inode, block); | 361 | bh->b_blocknr = bmap(inode, block); |
| 377 | if (bh->b_blocknr == 0) { | 362 | if (bh->b_blocknr == 0) { |
| 378 | /* Cannot use this file! */ | 363 | /* Cannot use this file! */ |
| 379 | free_buffers(page); | 364 | ret = -EINVAL; |
| 380 | page = ERR_PTR(-EINVAL); | ||
| 381 | goto out; | 365 | goto out; |
| 382 | } | 366 | } |
| 383 | bh->b_bdev = inode->i_sb->s_bdev; | 367 | bh->b_bdev = inode->i_sb->s_bdev; |
| @@ -400,17 +384,15 @@ static struct page *read_page(struct file *file, unsigned long index, | |||
| 400 | 384 | ||
| 401 | wait_event(bitmap->write_wait, | 385 | wait_event(bitmap->write_wait, |
| 402 | atomic_read(&bitmap->pending_writes)==0); | 386 | atomic_read(&bitmap->pending_writes)==0); |
| 403 | if (bitmap->flags & BITMAP_WRITE_ERROR) { | 387 | if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) |
| 404 | free_buffers(page); | 388 | ret = -EIO; |
| 405 | page = ERR_PTR(-EIO); | ||
| 406 | } | ||
| 407 | out: | 389 | out: |
| 408 | if (IS_ERR(page)) | 390 | if (ret) |
| 409 | printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %ld\n", | 391 | printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n", |
| 410 | (int)PAGE_SIZE, | 392 | (int)PAGE_SIZE, |
| 411 | (unsigned long long)index << PAGE_SHIFT, | 393 | (unsigned long long)index << PAGE_SHIFT, |
| 412 | PTR_ERR(page)); | 394 | ret); |
| 413 | return page; | 395 | return ret; |
| 414 | } | 396 | } |
| 415 | 397 | ||
| 416 | /* | 398 | /* |
| @@ -426,9 +408,9 @@ void bitmap_update_sb(struct bitmap *bitmap) | |||
| 426 | return; | 408 | return; |
| 427 | if (bitmap->mddev->bitmap_info.external) | 409 | if (bitmap->mddev->bitmap_info.external) |
| 428 | return; | 410 | return; |
| 429 | if (!bitmap->sb_page) /* no superblock */ | 411 | if (!bitmap->storage.sb_page) /* no superblock */ |
| 430 | return; | 412 | return; |
| 431 | sb = kmap_atomic(bitmap->sb_page); | 413 | sb = kmap_atomic(bitmap->storage.sb_page); |
| 432 | sb->events = cpu_to_le64(bitmap->mddev->events); | 414 | sb->events = cpu_to_le64(bitmap->mddev->events); |
| 433 | if (bitmap->mddev->events < bitmap->events_cleared) | 415 | if (bitmap->mddev->events < bitmap->events_cleared) |
| 434 | /* rocking back to read-only */ | 416 | /* rocking back to read-only */ |
| @@ -438,8 +420,13 @@ void bitmap_update_sb(struct bitmap *bitmap) | |||
| 438 | /* Just in case these have been changed via sysfs: */ | 420 | /* Just in case these have been changed via sysfs: */ |
| 439 | sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); | 421 | sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); |
| 440 | sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); | 422 | sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); |
| 423 | /* This might have been changed by a reshape */ | ||
| 424 | sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); | ||
| 425 | sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize); | ||
| 426 | sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> | ||
| 427 | bitmap_info.space); | ||
| 441 | kunmap_atomic(sb); | 428 | kunmap_atomic(sb); |
| 442 | write_page(bitmap, bitmap->sb_page, 1); | 429 | write_page(bitmap, bitmap->storage.sb_page, 1); |
| 443 | } | 430 | } |
| 444 | 431 | ||
| 445 | /* print out the bitmap file superblock */ | 432 | /* print out the bitmap file superblock */ |
| @@ -447,9 +434,9 @@ void bitmap_print_sb(struct bitmap *bitmap) | |||
| 447 | { | 434 | { |
| 448 | bitmap_super_t *sb; | 435 | bitmap_super_t *sb; |
| 449 | 436 | ||
| 450 | if (!bitmap || !bitmap->sb_page) | 437 | if (!bitmap || !bitmap->storage.sb_page) |
| 451 | return; | 438 | return; |
| 452 | sb = kmap_atomic(bitmap->sb_page); | 439 | sb = kmap_atomic(bitmap->storage.sb_page); |
| 453 | printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); | 440 | printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); |
| 454 | printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); | 441 | printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); |
| 455 | printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); | 442 | printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); |
| @@ -488,15 +475,15 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) | |||
| 488 | unsigned long chunksize, daemon_sleep, write_behind; | 475 | unsigned long chunksize, daemon_sleep, write_behind; |
| 489 | int err = -EINVAL; | 476 | int err = -EINVAL; |
| 490 | 477 | ||
| 491 | bitmap->sb_page = alloc_page(GFP_KERNEL); | 478 | bitmap->storage.sb_page = alloc_page(GFP_KERNEL); |
| 492 | if (IS_ERR(bitmap->sb_page)) { | 479 | if (IS_ERR(bitmap->storage.sb_page)) { |
| 493 | err = PTR_ERR(bitmap->sb_page); | 480 | err = PTR_ERR(bitmap->storage.sb_page); |
| 494 | bitmap->sb_page = NULL; | 481 | bitmap->storage.sb_page = NULL; |
| 495 | return err; | 482 | return err; |
| 496 | } | 483 | } |
| 497 | bitmap->sb_page->index = 0; | 484 | bitmap->storage.sb_page->index = 0; |
| 498 | 485 | ||
| 499 | sb = kmap_atomic(bitmap->sb_page); | 486 | sb = kmap_atomic(bitmap->storage.sb_page); |
| 500 | 487 | ||
| 501 | sb->magic = cpu_to_le32(BITMAP_MAGIC); | 488 | sb->magic = cpu_to_le32(BITMAP_MAGIC); |
| 502 | sb->version = cpu_to_le32(BITMAP_MAJOR_HI); | 489 | sb->version = cpu_to_le32(BITMAP_MAJOR_HI); |
| @@ -534,8 +521,8 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) | |||
| 534 | 521 | ||
| 535 | memcpy(sb->uuid, bitmap->mddev->uuid, 16); | 522 | memcpy(sb->uuid, bitmap->mddev->uuid, 16); |
| 536 | 523 | ||
| 537 | bitmap->flags |= BITMAP_STALE; | 524 | set_bit(BITMAP_STALE, &bitmap->flags); |
| 538 | sb->state |= cpu_to_le32(BITMAP_STALE); | 525 | sb->state = cpu_to_le32(bitmap->flags); |
| 539 | bitmap->events_cleared = bitmap->mddev->events; | 526 | bitmap->events_cleared = bitmap->mddev->events; |
| 540 | sb->events_cleared = cpu_to_le64(bitmap->mddev->events); | 527 | sb->events_cleared = cpu_to_le64(bitmap->mddev->events); |
| 541 | 528 | ||
| @@ -551,31 +538,45 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
| 551 | bitmap_super_t *sb; | 538 | bitmap_super_t *sb; |
| 552 | unsigned long chunksize, daemon_sleep, write_behind; | 539 | unsigned long chunksize, daemon_sleep, write_behind; |
| 553 | unsigned long long events; | 540 | unsigned long long events; |
| 541 | unsigned long sectors_reserved = 0; | ||
| 554 | int err = -EINVAL; | 542 | int err = -EINVAL; |
| 543 | struct page *sb_page; | ||
| 555 | 544 | ||
| 545 | if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { | ||
| 546 | chunksize = 128 * 1024 * 1024; | ||
| 547 | daemon_sleep = 5 * HZ; | ||
| 548 | write_behind = 0; | ||
| 549 | set_bit(BITMAP_STALE, &bitmap->flags); | ||
| 550 | err = 0; | ||
| 551 | goto out_no_sb; | ||
| 552 | } | ||
| 556 | /* page 0 is the superblock, read it... */ | 553 | /* page 0 is the superblock, read it... */ |
| 557 | if (bitmap->file) { | 554 | sb_page = alloc_page(GFP_KERNEL); |
| 558 | loff_t isize = i_size_read(bitmap->file->f_mapping->host); | 555 | if (!sb_page) |
| 556 | return -ENOMEM; | ||
| 557 | bitmap->storage.sb_page = sb_page; | ||
| 558 | |||
| 559 | if (bitmap->storage.file) { | ||
| 560 | loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); | ||
| 559 | int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; | 561 | int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; |
| 560 | 562 | ||
| 561 | bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); | 563 | err = read_page(bitmap->storage.file, 0, |
| 564 | bitmap, bytes, sb_page); | ||
| 562 | } else { | 565 | } else { |
| 563 | bitmap->sb_page = read_sb_page(bitmap->mddev, | 566 | err = read_sb_page(bitmap->mddev, |
| 564 | bitmap->mddev->bitmap_info.offset, | 567 | bitmap->mddev->bitmap_info.offset, |
| 565 | NULL, | 568 | sb_page, |
| 566 | 0, sizeof(bitmap_super_t)); | 569 | 0, sizeof(bitmap_super_t)); |
| 567 | } | 570 | } |
| 568 | if (IS_ERR(bitmap->sb_page)) { | 571 | if (err) |
| 569 | err = PTR_ERR(bitmap->sb_page); | ||
| 570 | bitmap->sb_page = NULL; | ||
| 571 | return err; | 572 | return err; |
| 572 | } | ||
| 573 | 573 | ||
| 574 | sb = kmap_atomic(bitmap->sb_page); | 574 | sb = kmap_atomic(sb_page); |
| 575 | 575 | ||
| 576 | chunksize = le32_to_cpu(sb->chunksize); | 576 | chunksize = le32_to_cpu(sb->chunksize); |
| 577 | daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; | 577 | daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; |
| 578 | write_behind = le32_to_cpu(sb->write_behind); | 578 | write_behind = le32_to_cpu(sb->write_behind); |
| 579 | sectors_reserved = le32_to_cpu(sb->sectors_reserved); | ||
| 579 | 580 | ||
| 580 | /* verify that the bitmap-specific fields are valid */ | 581 | /* verify that the bitmap-specific fields are valid */ |
| 581 | if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) | 582 | if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) |
| @@ -618,60 +619,32 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
| 618 | "-- forcing full recovery\n", | 619 | "-- forcing full recovery\n", |
| 619 | bmname(bitmap), events, | 620 | bmname(bitmap), events, |
| 620 | (unsigned long long) bitmap->mddev->events); | 621 | (unsigned long long) bitmap->mddev->events); |
| 621 | sb->state |= cpu_to_le32(BITMAP_STALE); | 622 | set_bit(BITMAP_STALE, &bitmap->flags); |
| 622 | } | 623 | } |
| 623 | } | 624 | } |
| 624 | 625 | ||
| 625 | /* assign fields using values from superblock */ | 626 | /* assign fields using values from superblock */ |
| 626 | bitmap->mddev->bitmap_info.chunksize = chunksize; | ||
| 627 | bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; | ||
| 628 | bitmap->mddev->bitmap_info.max_write_behind = write_behind; | ||
| 629 | bitmap->flags |= le32_to_cpu(sb->state); | 627 | bitmap->flags |= le32_to_cpu(sb->state); |
| 630 | if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) | 628 | if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) |
| 631 | bitmap->flags |= BITMAP_HOSTENDIAN; | 629 | set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); |
| 632 | bitmap->events_cleared = le64_to_cpu(sb->events_cleared); | 630 | bitmap->events_cleared = le64_to_cpu(sb->events_cleared); |
| 633 | if (bitmap->flags & BITMAP_STALE) | ||
| 634 | bitmap->events_cleared = bitmap->mddev->events; | ||
| 635 | err = 0; | 631 | err = 0; |
| 636 | out: | 632 | out: |
| 637 | kunmap_atomic(sb); | 633 | kunmap_atomic(sb); |
| 634 | out_no_sb: | ||
| 635 | if (test_bit(BITMAP_STALE, &bitmap->flags)) | ||
| 636 | bitmap->events_cleared = bitmap->mddev->events; | ||
| 637 | bitmap->mddev->bitmap_info.chunksize = chunksize; | ||
| 638 | bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; | ||
| 639 | bitmap->mddev->bitmap_info.max_write_behind = write_behind; | ||
| 640 | if (bitmap->mddev->bitmap_info.space == 0 || | ||
| 641 | bitmap->mddev->bitmap_info.space > sectors_reserved) | ||
| 642 | bitmap->mddev->bitmap_info.space = sectors_reserved; | ||
| 638 | if (err) | 643 | if (err) |
| 639 | bitmap_print_sb(bitmap); | 644 | bitmap_print_sb(bitmap); |
| 640 | return err; | 645 | return err; |
| 641 | } | 646 | } |
| 642 | 647 | ||
| 643 | enum bitmap_mask_op { | ||
| 644 | MASK_SET, | ||
| 645 | MASK_UNSET | ||
| 646 | }; | ||
| 647 | |||
| 648 | /* record the state of the bitmap in the superblock. Return the old value */ | ||
| 649 | static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, | ||
| 650 | enum bitmap_mask_op op) | ||
| 651 | { | ||
| 652 | bitmap_super_t *sb; | ||
| 653 | int old; | ||
| 654 | |||
| 655 | if (!bitmap->sb_page) /* can't set the state */ | ||
| 656 | return 0; | ||
| 657 | sb = kmap_atomic(bitmap->sb_page); | ||
| 658 | old = le32_to_cpu(sb->state) & bits; | ||
| 659 | switch (op) { | ||
| 660 | case MASK_SET: | ||
| 661 | sb->state |= cpu_to_le32(bits); | ||
| 662 | bitmap->flags |= bits; | ||
| 663 | break; | ||
| 664 | case MASK_UNSET: | ||
| 665 | sb->state &= cpu_to_le32(~bits); | ||
| 666 | bitmap->flags &= ~bits; | ||
| 667 | break; | ||
| 668 | default: | ||
| 669 | BUG(); | ||
| 670 | } | ||
| 671 | kunmap_atomic(sb); | ||
| 672 | return old; | ||
| 673 | } | ||
| 674 | |||
| 675 | /* | 648 | /* |
| 676 | * general bitmap file operations | 649 | * general bitmap file operations |
| 677 | */ | 650 | */ |
| @@ -683,17 +656,19 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, | |||
| 683 | * file a page at a time. There's a superblock at the start of the file. | 656 | * file a page at a time. There's a superblock at the start of the file. |
| 684 | */ | 657 | */ |
| 685 | /* calculate the index of the page that contains this bit */ | 658 | /* calculate the index of the page that contains this bit */ |
| 686 | static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk) | 659 | static inline unsigned long file_page_index(struct bitmap_storage *store, |
| 660 | unsigned long chunk) | ||
| 687 | { | 661 | { |
| 688 | if (!bitmap->mddev->bitmap_info.external) | 662 | if (store->sb_page) |
| 689 | chunk += sizeof(bitmap_super_t) << 3; | 663 | chunk += sizeof(bitmap_super_t) << 3; |
| 690 | return chunk >> PAGE_BIT_SHIFT; | 664 | return chunk >> PAGE_BIT_SHIFT; |
| 691 | } | 665 | } |
| 692 | 666 | ||
| 693 | /* calculate the (bit) offset of this bit within a page */ | 667 | /* calculate the (bit) offset of this bit within a page */ |
| 694 | static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk) | 668 | static inline unsigned long file_page_offset(struct bitmap_storage *store, |
| 669 | unsigned long chunk) | ||
| 695 | { | 670 | { |
| 696 | if (!bitmap->mddev->bitmap_info.external) | 671 | if (store->sb_page) |
| 697 | chunk += sizeof(bitmap_super_t) << 3; | 672 | chunk += sizeof(bitmap_super_t) << 3; |
| 698 | return chunk & (PAGE_BITS - 1); | 673 | return chunk & (PAGE_BITS - 1); |
| 699 | } | 674 | } |
| @@ -705,57 +680,86 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon | |||
| 705 | * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page | 680 | * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page |
| 706 | * 0 or page 1 | 681 | * 0 or page 1 |
| 707 | */ | 682 | */ |
| 708 | static inline struct page *filemap_get_page(struct bitmap *bitmap, | 683 | static inline struct page *filemap_get_page(struct bitmap_storage *store, |
| 709 | unsigned long chunk) | 684 | unsigned long chunk) |
| 710 | { | 685 | { |
| 711 | if (file_page_index(bitmap, chunk) >= bitmap->file_pages) | 686 | if (file_page_index(store, chunk) >= store->file_pages) |
| 712 | return NULL; | 687 | return NULL; |
| 713 | return bitmap->filemap[file_page_index(bitmap, chunk) | 688 | return store->filemap[file_page_index(store, chunk) |
| 714 | - file_page_index(bitmap, 0)]; | 689 | - file_page_index(store, 0)]; |
| 715 | } | 690 | } |
| 716 | 691 | ||
| 717 | static void bitmap_file_unmap(struct bitmap *bitmap) | 692 | static int bitmap_storage_alloc(struct bitmap_storage *store, |
| 693 | unsigned long chunks, int with_super) | ||
| 694 | { | ||
| 695 | int pnum; | ||
| 696 | unsigned long num_pages; | ||
| 697 | unsigned long bytes; | ||
| 698 | |||
| 699 | bytes = DIV_ROUND_UP(chunks, 8); | ||
| 700 | if (with_super) | ||
| 701 | bytes += sizeof(bitmap_super_t); | ||
| 702 | |||
| 703 | num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); | ||
| 704 | |||
| 705 | store->filemap = kmalloc(sizeof(struct page *) | ||
| 706 | * num_pages, GFP_KERNEL); | ||
| 707 | if (!store->filemap) | ||
| 708 | return -ENOMEM; | ||
| 709 | |||
| 710 | if (with_super && !store->sb_page) { | ||
| 711 | store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO); | ||
| 712 | if (store->sb_page == NULL) | ||
| 713 | return -ENOMEM; | ||
| 714 | store->sb_page->index = 0; | ||
| 715 | } | ||
| 716 | pnum = 0; | ||
| 717 | if (store->sb_page) { | ||
| 718 | store->filemap[0] = store->sb_page; | ||
| 719 | pnum = 1; | ||
| 720 | } | ||
| 721 | for ( ; pnum < num_pages; pnum++) { | ||
| 722 | store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO); | ||
| 723 | if (!store->filemap[pnum]) { | ||
| 724 | store->file_pages = pnum; | ||
| 725 | return -ENOMEM; | ||
| 726 | } | ||
| 727 | store->filemap[pnum]->index = pnum; | ||
| 728 | } | ||
| 729 | store->file_pages = pnum; | ||
| 730 | |||
| 731 | /* We need 4 bits per page, rounded up to a multiple | ||
| 732 | * of sizeof(unsigned long) */ | ||
| 733 | store->filemap_attr = kzalloc( | ||
| 734 | roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), | ||
| 735 | GFP_KERNEL); | ||
| 736 | if (!store->filemap_attr) | ||
| 737 | return -ENOMEM; | ||
| 738 | |||
| 739 | store->bytes = bytes; | ||
| 740 | |||
| 741 | return 0; | ||
| 742 | } | ||
| 743 | |||
| 744 | static void bitmap_file_unmap(struct bitmap_storage *store) | ||
| 718 | { | 745 | { |
| 719 | struct page **map, *sb_page; | 746 | struct page **map, *sb_page; |
| 720 | unsigned long *attr; | ||
| 721 | int pages; | 747 | int pages; |
| 722 | unsigned long flags; | 748 | struct file *file; |
| 723 | 749 | ||
| 724 | spin_lock_irqsave(&bitmap->lock, flags); | 750 | file = store->file; |
| 725 | map = bitmap->filemap; | 751 | map = store->filemap; |
| 726 | bitmap->filemap = NULL; | 752 | pages = store->file_pages; |
| 727 | attr = bitmap->filemap_attr; | 753 | sb_page = store->sb_page; |
| 728 | bitmap->filemap_attr = NULL; | ||
| 729 | pages = bitmap->file_pages; | ||
| 730 | bitmap->file_pages = 0; | ||
| 731 | sb_page = bitmap->sb_page; | ||
| 732 | bitmap->sb_page = NULL; | ||
| 733 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
| 734 | 754 | ||
| 735 | while (pages--) | 755 | while (pages--) |
| 736 | if (map[pages] != sb_page) /* 0 is sb_page, release it below */ | 756 | if (map[pages] != sb_page) /* 0 is sb_page, release it below */ |
| 737 | free_buffers(map[pages]); | 757 | free_buffers(map[pages]); |
| 738 | kfree(map); | 758 | kfree(map); |
| 739 | kfree(attr); | 759 | kfree(store->filemap_attr); |
| 740 | 760 | ||
| 741 | if (sb_page) | 761 | if (sb_page) |
| 742 | free_buffers(sb_page); | 762 | free_buffers(sb_page); |
| 743 | } | ||
| 744 | |||
| 745 | static void bitmap_file_put(struct bitmap *bitmap) | ||
| 746 | { | ||
| 747 | struct file *file; | ||
| 748 | unsigned long flags; | ||
| 749 | |||
| 750 | spin_lock_irqsave(&bitmap->lock, flags); | ||
| 751 | file = bitmap->file; | ||
| 752 | bitmap->file = NULL; | ||
| 753 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
| 754 | |||
| 755 | if (file) | ||
| 756 | wait_event(bitmap->write_wait, | ||
| 757 | atomic_read(&bitmap->pending_writes)==0); | ||
| 758 | bitmap_file_unmap(bitmap); | ||
| 759 | 763 | ||
| 760 | if (file) { | 764 | if (file) { |
| 761 | struct inode *inode = file->f_path.dentry->d_inode; | 765 | struct inode *inode = file->f_path.dentry->d_inode; |
| @@ -773,14 +777,14 @@ static void bitmap_file_kick(struct bitmap *bitmap) | |||
| 773 | { | 777 | { |
| 774 | char *path, *ptr = NULL; | 778 | char *path, *ptr = NULL; |
| 775 | 779 | ||
| 776 | if (bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET) == 0) { | 780 | if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { |
| 777 | bitmap_update_sb(bitmap); | 781 | bitmap_update_sb(bitmap); |
| 778 | 782 | ||
| 779 | if (bitmap->file) { | 783 | if (bitmap->storage.file) { |
| 780 | path = kmalloc(PAGE_SIZE, GFP_KERNEL); | 784 | path = kmalloc(PAGE_SIZE, GFP_KERNEL); |
| 781 | if (path) | 785 | if (path) |
| 782 | ptr = d_path(&bitmap->file->f_path, path, | 786 | ptr = d_path(&bitmap->storage.file->f_path, |
| 783 | PAGE_SIZE); | 787 | path, PAGE_SIZE); |
| 784 | 788 | ||
| 785 | printk(KERN_ALERT | 789 | printk(KERN_ALERT |
| 786 | "%s: kicking failed bitmap file %s from array!\n", | 790 | "%s: kicking failed bitmap file %s from array!\n", |
| @@ -792,10 +796,6 @@ static void bitmap_file_kick(struct bitmap *bitmap) | |||
| 792 | "%s: disabling internal bitmap due to errors\n", | 796 | "%s: disabling internal bitmap due to errors\n", |
| 793 | bmname(bitmap)); | 797 | bmname(bitmap)); |
| 794 | } | 798 | } |
| 795 | |||
| 796 | bitmap_file_put(bitmap); | ||
| 797 | |||
| 798 | return; | ||
| 799 | } | 799 | } |
| 800 | 800 | ||
| 801 | enum bitmap_page_attr { | 801 | enum bitmap_page_attr { |
| @@ -805,24 +805,30 @@ enum bitmap_page_attr { | |||
| 805 | BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ | 805 | BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ |
| 806 | }; | 806 | }; |
| 807 | 807 | ||
| 808 | static inline void set_page_attr(struct bitmap *bitmap, struct page *page, | 808 | static inline void set_page_attr(struct bitmap *bitmap, int pnum, |
| 809 | enum bitmap_page_attr attr) | 809 | enum bitmap_page_attr attr) |
| 810 | { | 810 | { |
| 811 | __set_bit((page->index<<2) + attr, bitmap->filemap_attr); | 811 | set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); |
| 812 | } | 812 | } |
| 813 | 813 | ||
| 814 | static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, | 814 | static inline void clear_page_attr(struct bitmap *bitmap, int pnum, |
| 815 | enum bitmap_page_attr attr) | 815 | enum bitmap_page_attr attr) |
| 816 | { | 816 | { |
| 817 | __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); | 817 | clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); |
| 818 | } | 818 | } |
| 819 | 819 | ||
| 820 | static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, | 820 | static inline int test_page_attr(struct bitmap *bitmap, int pnum, |
| 821 | enum bitmap_page_attr attr) | 821 | enum bitmap_page_attr attr) |
| 822 | { | 822 | { |
| 823 | return test_bit((page->index<<2) + attr, bitmap->filemap_attr); | 823 | return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); |
| 824 | } | 824 | } |
| 825 | 825 | ||
| 826 | static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum, | ||
| 827 | enum bitmap_page_attr attr) | ||
| 828 | { | ||
| 829 | return test_and_clear_bit((pnum<<2) + attr, | ||
| 830 | bitmap->storage.filemap_attr); | ||
| 831 | } | ||
| 826 | /* | 832 | /* |
| 827 | * bitmap_file_set_bit -- called before performing a write to the md device | 833 | * bitmap_file_set_bit -- called before performing a write to the md device |
| 828 | * to set (and eventually sync) a particular bit in the bitmap file | 834 | * to set (and eventually sync) a particular bit in the bitmap file |
| @@ -835,26 +841,46 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) | |||
| 835 | unsigned long bit; | 841 | unsigned long bit; |
| 836 | struct page *page; | 842 | struct page *page; |
| 837 | void *kaddr; | 843 | void *kaddr; |
| 838 | unsigned long chunk = block >> bitmap->chunkshift; | 844 | unsigned long chunk = block >> bitmap->counts.chunkshift; |
| 839 | 845 | ||
| 840 | if (!bitmap->filemap) | 846 | page = filemap_get_page(&bitmap->storage, chunk); |
| 841 | return; | ||
| 842 | |||
| 843 | page = filemap_get_page(bitmap, chunk); | ||
| 844 | if (!page) | 847 | if (!page) |
| 845 | return; | 848 | return; |
| 846 | bit = file_page_offset(bitmap, chunk); | 849 | bit = file_page_offset(&bitmap->storage, chunk); |
| 847 | 850 | ||
| 848 | /* set the bit */ | 851 | /* set the bit */ |
| 849 | kaddr = kmap_atomic(page); | 852 | kaddr = kmap_atomic(page); |
| 850 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 853 | if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) |
| 851 | set_bit(bit, kaddr); | 854 | set_bit(bit, kaddr); |
| 852 | else | 855 | else |
| 853 | __set_bit_le(bit, kaddr); | 856 | test_and_set_bit_le(bit, kaddr); |
| 854 | kunmap_atomic(kaddr); | 857 | kunmap_atomic(kaddr); |
| 855 | pr_debug("set file bit %lu page %lu\n", bit, page->index); | 858 | pr_debug("set file bit %lu page %lu\n", bit, page->index); |
| 856 | /* record page number so it gets flushed to disk when unplug occurs */ | 859 | /* record page number so it gets flushed to disk when unplug occurs */ |
| 857 | set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); | 860 | set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY); |
| 861 | } | ||
| 862 | |||
| 863 | static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) | ||
| 864 | { | ||
| 865 | unsigned long bit; | ||
| 866 | struct page *page; | ||
| 867 | void *paddr; | ||
| 868 | unsigned long chunk = block >> bitmap->counts.chunkshift; | ||
| 869 | |||
| 870 | page = filemap_get_page(&bitmap->storage, chunk); | ||
| 871 | if (!page) | ||
| 872 | return; | ||
| 873 | bit = file_page_offset(&bitmap->storage, chunk); | ||
| 874 | paddr = kmap_atomic(page); | ||
| 875 | if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) | ||
| 876 | clear_bit(bit, paddr); | ||
| 877 | else | ||
| 878 | test_and_clear_bit_le(bit, paddr); | ||
| 879 | kunmap_atomic(paddr); | ||
| 880 | if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) { | ||
| 881 | set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING); | ||
| 882 | bitmap->allclean = 0; | ||
| 883 | } | ||
| 858 | } | 884 | } |
| 859 | 885 | ||
| 860 | /* this gets called when the md device is ready to unplug its underlying | 886 | /* this gets called when the md device is ready to unplug its underlying |
| @@ -862,42 +888,37 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) | |||
| 862 | * sync the dirty pages of the bitmap file to disk */ | 888 | * sync the dirty pages of the bitmap file to disk */ |
| 863 | void bitmap_unplug(struct bitmap *bitmap) | 889 | void bitmap_unplug(struct bitmap *bitmap) |
| 864 | { | 890 | { |
| 865 | unsigned long i, flags; | 891 | unsigned long i; |
| 866 | int dirty, need_write; | 892 | int dirty, need_write; |
| 867 | struct page *page; | ||
| 868 | int wait = 0; | 893 | int wait = 0; |
| 869 | 894 | ||
| 870 | if (!bitmap) | 895 | if (!bitmap || !bitmap->storage.filemap || |
| 896 | test_bit(BITMAP_STALE, &bitmap->flags)) | ||
| 871 | return; | 897 | return; |
| 872 | 898 | ||
| 873 | /* look at each page to see if there are any set bits that need to be | 899 | /* look at each page to see if there are any set bits that need to be |
| 874 | * flushed out to disk */ | 900 | * flushed out to disk */ |
| 875 | for (i = 0; i < bitmap->file_pages; i++) { | 901 | for (i = 0; i < bitmap->storage.file_pages; i++) { |
| 876 | spin_lock_irqsave(&bitmap->lock, flags); | 902 | if (!bitmap->storage.filemap) |
| 877 | if (!bitmap->filemap) { | ||
| 878 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
| 879 | return; | 903 | return; |
| 904 | dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); | ||
| 905 | need_write = test_and_clear_page_attr(bitmap, i, | ||
| 906 | BITMAP_PAGE_NEEDWRITE); | ||
| 907 | if (dirty || need_write) { | ||
| 908 | clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); | ||
| 909 | write_page(bitmap, bitmap->storage.filemap[i], 0); | ||
| 880 | } | 910 | } |
| 881 | page = bitmap->filemap[i]; | ||
| 882 | dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); | ||
| 883 | need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); | ||
| 884 | clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); | ||
| 885 | clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); | ||
| 886 | if (dirty) | 911 | if (dirty) |
| 887 | wait = 1; | 912 | wait = 1; |
| 888 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
| 889 | |||
| 890 | if (dirty || need_write) | ||
| 891 | write_page(bitmap, page, 0); | ||
| 892 | } | 913 | } |
| 893 | if (wait) { /* if any writes were performed, we need to wait on them */ | 914 | if (wait) { /* if any writes were performed, we need to wait on them */ |
| 894 | if (bitmap->file) | 915 | if (bitmap->storage.file) |
| 895 | wait_event(bitmap->write_wait, | 916 | wait_event(bitmap->write_wait, |
| 896 | atomic_read(&bitmap->pending_writes)==0); | 917 | atomic_read(&bitmap->pending_writes)==0); |
| 897 | else | 918 | else |
| 898 | md_super_wait(bitmap->mddev); | 919 | md_super_wait(bitmap->mddev); |
| 899 | } | 920 | } |
| 900 | if (bitmap->flags & BITMAP_WRITE_ERROR) | 921 | if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) |
| 901 | bitmap_file_kick(bitmap); | 922 | bitmap_file_kick(bitmap); |
| 902 | } | 923 | } |
| 903 | EXPORT_SYMBOL(bitmap_unplug); | 924 | EXPORT_SYMBOL(bitmap_unplug); |
| @@ -917,98 +938,77 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n | |||
| 917 | static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | 938 | static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) |
| 918 | { | 939 | { |
| 919 | unsigned long i, chunks, index, oldindex, bit; | 940 | unsigned long i, chunks, index, oldindex, bit; |
| 920 | struct page *page = NULL, *oldpage = NULL; | 941 | struct page *page = NULL; |
| 921 | unsigned long num_pages, bit_cnt = 0; | 942 | unsigned long bit_cnt = 0; |
| 922 | struct file *file; | 943 | struct file *file; |
| 923 | unsigned long bytes, offset; | 944 | unsigned long offset; |
| 924 | int outofdate; | 945 | int outofdate; |
| 925 | int ret = -ENOSPC; | 946 | int ret = -ENOSPC; |
| 926 | void *paddr; | 947 | void *paddr; |
| 948 | struct bitmap_storage *store = &bitmap->storage; | ||
| 927 | 949 | ||
| 928 | chunks = bitmap->chunks; | 950 | chunks = bitmap->counts.chunks; |
| 929 | file = bitmap->file; | 951 | file = store->file; |
| 930 | 952 | ||
| 931 | BUG_ON(!file && !bitmap->mddev->bitmap_info.offset); | 953 | if (!file && !bitmap->mddev->bitmap_info.offset) { |
| 954 | /* No permanent bitmap - fill with '1s'. */ | ||
| 955 | store->filemap = NULL; | ||
| 956 | store->file_pages = 0; | ||
| 957 | for (i = 0; i < chunks ; i++) { | ||
| 958 | /* if the disk bit is set, set the memory bit */ | ||
| 959 | int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift) | ||
| 960 | >= start); | ||
| 961 | bitmap_set_memory_bits(bitmap, | ||
| 962 | (sector_t)i << bitmap->counts.chunkshift, | ||
| 963 | needed); | ||
| 964 | } | ||
| 965 | return 0; | ||
| 966 | } | ||
| 932 | 967 | ||
| 933 | outofdate = bitmap->flags & BITMAP_STALE; | 968 | outofdate = test_bit(BITMAP_STALE, &bitmap->flags); |
| 934 | if (outofdate) | 969 | if (outofdate) |
| 935 | printk(KERN_INFO "%s: bitmap file is out of date, doing full " | 970 | printk(KERN_INFO "%s: bitmap file is out of date, doing full " |
| 936 | "recovery\n", bmname(bitmap)); | 971 | "recovery\n", bmname(bitmap)); |
| 937 | 972 | ||
| 938 | bytes = DIV_ROUND_UP(bitmap->chunks, 8); | 973 | if (file && i_size_read(file->f_mapping->host) < store->bytes) { |
| 939 | if (!bitmap->mddev->bitmap_info.external) | ||
| 940 | bytes += sizeof(bitmap_super_t); | ||
| 941 | |||
| 942 | num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); | ||
| 943 | |||
| 944 | if (file && i_size_read(file->f_mapping->host) < bytes) { | ||
| 945 | printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", | 974 | printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", |
| 946 | bmname(bitmap), | 975 | bmname(bitmap), |
| 947 | (unsigned long) i_size_read(file->f_mapping->host), | 976 | (unsigned long) i_size_read(file->f_mapping->host), |
| 948 | bytes); | 977 | store->bytes); |
| 949 | goto err; | 978 | goto err; |
| 950 | } | 979 | } |
| 951 | 980 | ||
| 952 | ret = -ENOMEM; | ||
| 953 | |||
| 954 | bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); | ||
| 955 | if (!bitmap->filemap) | ||
| 956 | goto err; | ||
| 957 | |||
| 958 | /* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */ | ||
| 959 | bitmap->filemap_attr = kzalloc( | ||
| 960 | roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), | ||
| 961 | GFP_KERNEL); | ||
| 962 | if (!bitmap->filemap_attr) | ||
| 963 | goto err; | ||
| 964 | |||
| 965 | oldindex = ~0L; | 981 | oldindex = ~0L; |
| 982 | offset = 0; | ||
| 983 | if (!bitmap->mddev->bitmap_info.external) | ||
| 984 | offset = sizeof(bitmap_super_t); | ||
| 966 | 985 | ||
| 967 | for (i = 0; i < chunks; i++) { | 986 | for (i = 0; i < chunks; i++) { |
| 968 | int b; | 987 | int b; |
| 969 | index = file_page_index(bitmap, i); | 988 | index = file_page_index(&bitmap->storage, i); |
| 970 | bit = file_page_offset(bitmap, i); | 989 | bit = file_page_offset(&bitmap->storage, i); |
| 971 | if (index != oldindex) { /* this is a new page, read it in */ | 990 | if (index != oldindex) { /* this is a new page, read it in */ |
| 972 | int count; | 991 | int count; |
| 973 | /* unmap the old page, we're done with it */ | 992 | /* unmap the old page, we're done with it */ |
| 974 | if (index == num_pages-1) | 993 | if (index == store->file_pages-1) |
| 975 | count = bytes - index * PAGE_SIZE; | 994 | count = store->bytes - index * PAGE_SIZE; |
| 976 | else | 995 | else |
| 977 | count = PAGE_SIZE; | 996 | count = PAGE_SIZE; |
| 978 | if (index == 0 && bitmap->sb_page) { | 997 | page = store->filemap[index]; |
| 979 | /* | 998 | if (file) |
| 980 | * if we're here then the superblock page | 999 | ret = read_page(file, index, bitmap, |
| 981 | * contains some bits (PAGE_SIZE != sizeof sb) | 1000 | count, page); |
| 982 | * we've already read it in, so just use it | 1001 | else |
| 983 | */ | 1002 | ret = read_sb_page( |
| 984 | page = bitmap->sb_page; | 1003 | bitmap->mddev, |
| 985 | offset = sizeof(bitmap_super_t); | 1004 | bitmap->mddev->bitmap_info.offset, |
| 986 | if (!file) | 1005 | page, |
| 987 | page = read_sb_page( | 1006 | index, count); |
| 988 | bitmap->mddev, | 1007 | |
| 989 | bitmap->mddev->bitmap_info.offset, | 1008 | if (ret) |
| 990 | page, | ||
| 991 | index, count); | ||
| 992 | } else if (file) { | ||
| 993 | page = read_page(file, index, bitmap, count); | ||
| 994 | offset = 0; | ||
| 995 | } else { | ||
| 996 | page = read_sb_page(bitmap->mddev, | ||
| 997 | bitmap->mddev->bitmap_info.offset, | ||
| 998 | NULL, | ||
| 999 | index, count); | ||
| 1000 | offset = 0; | ||
| 1001 | } | ||
| 1002 | if (IS_ERR(page)) { /* read error */ | ||
| 1003 | ret = PTR_ERR(page); | ||
| 1004 | goto err; | 1009 | goto err; |
| 1005 | } | ||
| 1006 | 1010 | ||
| 1007 | oldindex = index; | 1011 | oldindex = index; |
| 1008 | oldpage = page; | ||
| 1009 | |||
| 1010 | bitmap->filemap[bitmap->file_pages++] = page; | ||
| 1011 | bitmap->last_page_size = count; | ||
| 1012 | 1012 | ||
| 1013 | if (outofdate) { | 1013 | if (outofdate) { |
| 1014 | /* | 1014 | /* |
| @@ -1022,39 +1022,33 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
| 1022 | write_page(bitmap, page, 1); | 1022 | write_page(bitmap, page, 1); |
| 1023 | 1023 | ||
| 1024 | ret = -EIO; | 1024 | ret = -EIO; |
| 1025 | if (bitmap->flags & BITMAP_WRITE_ERROR) | 1025 | if (test_bit(BITMAP_WRITE_ERROR, |
| 1026 | &bitmap->flags)) | ||
| 1026 | goto err; | 1027 | goto err; |
| 1027 | } | 1028 | } |
| 1028 | } | 1029 | } |
| 1029 | paddr = kmap_atomic(page); | 1030 | paddr = kmap_atomic(page); |
| 1030 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 1031 | if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) |
| 1031 | b = test_bit(bit, paddr); | 1032 | b = test_bit(bit, paddr); |
| 1032 | else | 1033 | else |
| 1033 | b = test_bit_le(bit, paddr); | 1034 | b = test_bit_le(bit, paddr); |
| 1034 | kunmap_atomic(paddr); | 1035 | kunmap_atomic(paddr); |
| 1035 | if (b) { | 1036 | if (b) { |
| 1036 | /* if the disk bit is set, set the memory bit */ | 1037 | /* if the disk bit is set, set the memory bit */ |
| 1037 | int needed = ((sector_t)(i+1) << bitmap->chunkshift | 1038 | int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift |
| 1038 | >= start); | 1039 | >= start); |
| 1039 | bitmap_set_memory_bits(bitmap, | 1040 | bitmap_set_memory_bits(bitmap, |
| 1040 | (sector_t)i << bitmap->chunkshift, | 1041 | (sector_t)i << bitmap->counts.chunkshift, |
| 1041 | needed); | 1042 | needed); |
| 1042 | bit_cnt++; | 1043 | bit_cnt++; |
| 1043 | } | 1044 | } |
| 1044 | } | 1045 | offset = 0; |
| 1045 | |||
| 1046 | /* everything went OK */ | ||
| 1047 | ret = 0; | ||
| 1048 | bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET); | ||
| 1049 | |||
| 1050 | if (bit_cnt) { /* Kick recovery if any bits were set */ | ||
| 1051 | set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); | ||
| 1052 | md_wakeup_thread(bitmap->mddev->thread); | ||
| 1053 | } | 1046 | } |
| 1054 | 1047 | ||
| 1055 | printk(KERN_INFO "%s: bitmap initialized from disk: " | 1048 | printk(KERN_INFO "%s: bitmap initialized from disk: " |
| 1056 | "read %lu/%lu pages, set %lu of %lu bits\n", | 1049 | "read %lu pages, set %lu of %lu bits\n", |
| 1057 | bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, chunks); | 1050 | bmname(bitmap), store->file_pages, |
| 1051 | bit_cnt, chunks); | ||
| 1058 | 1052 | ||
| 1059 | return 0; | 1053 | return 0; |
| 1060 | 1054 | ||
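A minimal sketch of the chunk-to-file mapping used by the read loop above: one bit per chunk, optionally preceded by the 256-byte superblock when the bitmap is internal. The helper names and the standalone form are illustrative (the kernel's file_page_index()/file_page_offset() operate on struct bitmap_storage), but the arithmetic is the same.

#include <stdio.h>

#define PAGE_BYTES	4096UL
#define SB_BYTES	256UL	/* sizeof(bitmap_super_t) */

/* Illustrative only: map a chunk number to the file page and bit that hold it. */
static unsigned long chunk_to_page(unsigned long chunk, int internal_sb)
{
	unsigned long bit = chunk + (internal_sb ? SB_BYTES * 8 : 0);

	return bit / (PAGE_BYTES * 8);
}

static unsigned long chunk_to_bit(unsigned long chunk, int internal_sb)
{
	unsigned long bit = chunk + (internal_sb ? SB_BYTES * 8 : 0);

	return bit % (PAGE_BYTES * 8);
}

int main(void)
{
	/* chunk 40000 of an internal bitmap lands in page 1, bit 9280 */
	printf("page %lu bit %lu\n", chunk_to_page(40000, 1), chunk_to_bit(40000, 1));
	return 0;
}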
| @@ -1071,22 +1065,38 @@ void bitmap_write_all(struct bitmap *bitmap) | |||
| 1071 | */ | 1065 | */ |
| 1072 | int i; | 1066 | int i; |
| 1073 | 1067 | ||
| 1074 | spin_lock_irq(&bitmap->lock); | 1068 | if (!bitmap || !bitmap->storage.filemap) |
| 1075 | for (i = 0; i < bitmap->file_pages; i++) | 1069 | return; |
| 1076 | set_page_attr(bitmap, bitmap->filemap[i], | 1070 | if (bitmap->storage.file) |
| 1071 | /* Only one copy, so nothing needed */ | ||
| 1072 | return; | ||
| 1073 | |||
| 1074 | for (i = 0; i < bitmap->storage.file_pages; i++) | ||
| 1075 | set_page_attr(bitmap, i, | ||
| 1077 | BITMAP_PAGE_NEEDWRITE); | 1076 | BITMAP_PAGE_NEEDWRITE); |
| 1078 | bitmap->allclean = 0; | 1077 | bitmap->allclean = 0; |
| 1079 | spin_unlock_irq(&bitmap->lock); | ||
| 1080 | } | 1078 | } |
| 1081 | 1079 | ||
| 1082 | static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) | 1080 | static void bitmap_count_page(struct bitmap_counts *bitmap, |
| 1081 | sector_t offset, int inc) | ||
| 1083 | { | 1082 | { |
| 1084 | sector_t chunk = offset >> bitmap->chunkshift; | 1083 | sector_t chunk = offset >> bitmap->chunkshift; |
| 1085 | unsigned long page = chunk >> PAGE_COUNTER_SHIFT; | 1084 | unsigned long page = chunk >> PAGE_COUNTER_SHIFT; |
| 1086 | bitmap->bp[page].count += inc; | 1085 | bitmap->bp[page].count += inc; |
| 1087 | bitmap_checkfree(bitmap, page); | 1086 | bitmap_checkfree(bitmap, page); |
| 1088 | } | 1087 | } |
| 1089 | static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, | 1088 | |
| 1089 | static void bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset) | ||
| 1090 | { | ||
| 1091 | sector_t chunk = offset >> bitmap->chunkshift; | ||
| 1092 | unsigned long page = chunk >> PAGE_COUNTER_SHIFT; | ||
| 1093 | struct bitmap_page *bp = &bitmap->bp[page]; | ||
| 1094 | |||
| 1095 | if (!bp->pending) | ||
| 1096 | bp->pending = 1; | ||
| 1097 | } | ||
| 1098 | |||
| 1099 | static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, | ||
| 1090 | sector_t offset, sector_t *blocks, | 1100 | sector_t offset, sector_t *blocks, |
| 1091 | int create); | 1101 | int create); |
| 1092 | 1102 | ||
| @@ -1099,10 +1109,9 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
| 1099 | { | 1109 | { |
| 1100 | struct bitmap *bitmap; | 1110 | struct bitmap *bitmap; |
| 1101 | unsigned long j; | 1111 | unsigned long j; |
| 1102 | unsigned long flags; | 1112 | unsigned long nextpage; |
| 1103 | struct page *page = NULL, *lastpage = NULL; | ||
| 1104 | sector_t blocks; | 1113 | sector_t blocks; |
| 1105 | void *paddr; | 1114 | struct bitmap_counts *counts; |
| 1106 | 1115 | ||
| 1107 | /* Use a mutex to guard daemon_work against | 1116 | /* Use a mutex to guard daemon_work against |
| 1108 | * bitmap_destroy. | 1117 | * bitmap_destroy. |
| @@ -1124,112 +1133,90 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
| 1124 | } | 1133 | } |
| 1125 | bitmap->allclean = 1; | 1134 | bitmap->allclean = 1; |
| 1126 | 1135 | ||
| 1127 | spin_lock_irqsave(&bitmap->lock, flags); | 1136 | /* Any file-page which is PENDING now needs to be written. |
| 1128 | for (j = 0; j < bitmap->chunks; j++) { | 1137 | * So set NEEDWRITE now, then after we make any last-minute changes |
| 1138 | * we will write it. | ||
| 1139 | */ | ||
| 1140 | for (j = 0; j < bitmap->storage.file_pages; j++) | ||
| 1141 | if (test_and_clear_page_attr(bitmap, j, | ||
| 1142 | BITMAP_PAGE_PENDING)) | ||
| 1143 | set_page_attr(bitmap, j, | ||
| 1144 | BITMAP_PAGE_NEEDWRITE); | ||
| 1145 | |||
| 1146 | if (bitmap->need_sync && | ||
| 1147 | mddev->bitmap_info.external == 0) { | ||
| 1148 | /* Arrange for superblock update as well as | ||
| 1149 | * other changes */ | ||
| 1150 | bitmap_super_t *sb; | ||
| 1151 | bitmap->need_sync = 0; | ||
| 1152 | if (bitmap->storage.filemap) { | ||
| 1153 | sb = kmap_atomic(bitmap->storage.sb_page); | ||
| 1154 | sb->events_cleared = | ||
| 1155 | cpu_to_le64(bitmap->events_cleared); | ||
| 1156 | kunmap_atomic(sb); | ||
| 1157 | set_page_attr(bitmap, 0, | ||
| 1158 | BITMAP_PAGE_NEEDWRITE); | ||
| 1159 | } | ||
| 1160 | } | ||
| 1161 | /* Now look at the bitmap counters and if any are '2' or '1', | ||
| 1162 | * decrement and handle accordingly. | ||
| 1163 | */ | ||
| 1164 | counts = &bitmap->counts; | ||
| 1165 | spin_lock_irq(&counts->lock); | ||
| 1166 | nextpage = 0; | ||
| 1167 | for (j = 0; j < counts->chunks; j++) { | ||
| 1129 | bitmap_counter_t *bmc; | 1168 | bitmap_counter_t *bmc; |
| 1130 | if (!bitmap->filemap) | 1169 | sector_t block = (sector_t)j << counts->chunkshift; |
| 1131 | /* error or shutdown */ | ||
| 1132 | break; | ||
| 1133 | 1170 | ||
| 1134 | page = filemap_get_page(bitmap, j); | 1171 | if (j == nextpage) { |
| 1135 | 1172 | nextpage += PAGE_COUNTER_RATIO; | |
| 1136 | if (page != lastpage) { | 1173 | if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) { |
| 1137 | /* skip this page unless it's marked as needing cleaning */ | 1174 | j |= PAGE_COUNTER_MASK; |
| 1138 | if (!test_page_attr(bitmap, page, BITMAP_PAGE_PENDING)) { | ||
| 1139 | int need_write = test_page_attr(bitmap, page, | ||
| 1140 | BITMAP_PAGE_NEEDWRITE); | ||
| 1141 | if (need_write) | ||
| 1142 | clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); | ||
| 1143 | |||
| 1144 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
| 1145 | if (need_write) | ||
| 1146 | write_page(bitmap, page, 0); | ||
| 1147 | spin_lock_irqsave(&bitmap->lock, flags); | ||
| 1148 | j |= (PAGE_BITS - 1); | ||
| 1149 | continue; | 1175 | continue; |
| 1150 | } | 1176 | } |
| 1151 | 1177 | counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0; | |
| 1152 | /* grab the new page, sync and release the old */ | ||
| 1153 | if (lastpage != NULL) { | ||
| 1154 | if (test_page_attr(bitmap, lastpage, | ||
| 1155 | BITMAP_PAGE_NEEDWRITE)) { | ||
| 1156 | clear_page_attr(bitmap, lastpage, | ||
| 1157 | BITMAP_PAGE_NEEDWRITE); | ||
| 1158 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
| 1159 | write_page(bitmap, lastpage, 0); | ||
| 1160 | } else { | ||
| 1161 | set_page_attr(bitmap, lastpage, | ||
| 1162 | BITMAP_PAGE_NEEDWRITE); | ||
| 1163 | bitmap->allclean = 0; | ||
| 1164 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
| 1165 | } | ||
| 1166 | } else | ||
| 1167 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
| 1168 | lastpage = page; | ||
| 1169 | |||
| 1170 | /* We are possibly going to clear some bits, so make | ||
| 1171 | * sure that events_cleared is up-to-date. | ||
| 1172 | */ | ||
| 1173 | if (bitmap->need_sync && | ||
| 1174 | mddev->bitmap_info.external == 0) { | ||
| 1175 | bitmap_super_t *sb; | ||
| 1176 | bitmap->need_sync = 0; | ||
| 1177 | sb = kmap_atomic(bitmap->sb_page); | ||
| 1178 | sb->events_cleared = | ||
| 1179 | cpu_to_le64(bitmap->events_cleared); | ||
| 1180 | kunmap_atomic(sb); | ||
| 1181 | write_page(bitmap, bitmap->sb_page, 1); | ||
| 1182 | } | ||
| 1183 | spin_lock_irqsave(&bitmap->lock, flags); | ||
| 1184 | if (!bitmap->need_sync) | ||
| 1185 | clear_page_attr(bitmap, page, BITMAP_PAGE_PENDING); | ||
| 1186 | else | ||
| 1187 | bitmap->allclean = 0; | ||
| 1188 | } | 1178 | } |
| 1189 | bmc = bitmap_get_counter(bitmap, | 1179 | bmc = bitmap_get_counter(counts, |
| 1190 | (sector_t)j << bitmap->chunkshift, | 1180 | block, |
| 1191 | &blocks, 0); | 1181 | &blocks, 0); |
| 1192 | if (!bmc) | 1182 | |
| 1183 | if (!bmc) { | ||
| 1193 | j |= PAGE_COUNTER_MASK; | 1184 | j |= PAGE_COUNTER_MASK; |
| 1194 | else if (*bmc) { | 1185 | continue; |
| 1195 | if (*bmc == 1 && !bitmap->need_sync) { | ||
| 1196 | /* we can clear the bit */ | ||
| 1197 | *bmc = 0; | ||
| 1198 | bitmap_count_page(bitmap, | ||
| 1199 | (sector_t)j << bitmap->chunkshift, | ||
| 1200 | -1); | ||
| 1201 | |||
| 1202 | /* clear the bit */ | ||
| 1203 | paddr = kmap_atomic(page); | ||
| 1204 | if (bitmap->flags & BITMAP_HOSTENDIAN) | ||
| 1205 | clear_bit(file_page_offset(bitmap, j), | ||
| 1206 | paddr); | ||
| 1207 | else | ||
| 1208 | __clear_bit_le( | ||
| 1209 | file_page_offset(bitmap, | ||
| 1210 | j), | ||
| 1211 | paddr); | ||
| 1212 | kunmap_atomic(paddr); | ||
| 1213 | } else if (*bmc <= 2) { | ||
| 1214 | *bmc = 1; /* maybe clear the bit next time */ | ||
| 1215 | set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); | ||
| 1216 | bitmap->allclean = 0; | ||
| 1217 | } | ||
| 1218 | } | 1186 | } |
| 1219 | } | 1187 | if (*bmc == 1 && !bitmap->need_sync) { |
| 1220 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1188 | /* We can clear the bit */ |
| 1221 | 1189 | *bmc = 0; | |
| 1222 | /* now sync the final page */ | 1190 | bitmap_count_page(counts, block, -1); |
| 1223 | if (lastpage != NULL) { | 1191 | bitmap_file_clear_bit(bitmap, block); |
| 1224 | spin_lock_irqsave(&bitmap->lock, flags); | 1192 | } else if (*bmc && *bmc <= 2) { |
| 1225 | if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { | 1193 | *bmc = 1; |
| 1226 | clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); | 1194 | bitmap_set_pending(counts, block); |
| 1227 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
| 1228 | write_page(bitmap, lastpage, 0); | ||
| 1229 | } else { | ||
| 1230 | set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); | ||
| 1231 | bitmap->allclean = 0; | 1195 | bitmap->allclean = 0; |
| 1232 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1196 | } |
| 1197 | } | ||
| 1198 | spin_unlock_irq(&counts->lock); | ||
| 1199 | |||
| 1200 | /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. | ||
| 1201 | * DIRTY pages need to be written by bitmap_unplug so it can wait | ||
| 1202 | * for them. | ||
| 1203 | * If we find any DIRTY page we stop there and let bitmap_unplug | ||
| 1204 | * handle all the rest. This is important in the case where | ||
| 1205 | * the first block holds the superblock and it has been updated. | ||
| 1206 | * We mustn't write any other blocks before the superblock. | ||
| 1207 | */ | ||
| 1208 | for (j = 0; | ||
| 1209 | j < bitmap->storage.file_pages | ||
| 1210 | && !test_bit(BITMAP_STALE, &bitmap->flags); | ||
| 1211 | j++) { | ||
| 1212 | |||
| 1213 | if (test_page_attr(bitmap, j, | ||
| 1214 | BITMAP_PAGE_DIRTY)) | ||
| 1215 | /* bitmap_unplug will handle the rest */ | ||
| 1216 | break; | ||
| 1217 | if (test_and_clear_page_attr(bitmap, j, | ||
| 1218 | BITMAP_PAGE_NEEDWRITE)) { | ||
| 1219 | write_page(bitmap, bitmap->storage.filemap[j], 0); | ||
| 1233 | } | 1220 | } |
| 1234 | } | 1221 | } |
| 1235 | 1222 | ||
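The per-page 'pending' flag added in this series lets the loop above skip whole counter pages that cannot contain anything clearable; inside a pending page each counter then decays over two daemon passes before the on-disk bit is cleared. A userspace sketch of that decay, with made-up names and the same semantics as the branch on *bmc above (2 = recently written, 1 = clearable on the next pass, 0 = disk bit may be cleared):

#include <stdio.h>

/* Hypothetical model of one counter across successive daemon passes. */
static int daemon_pass(int counter, int need_sync)
{
	if (counter == 1 && !need_sync)
		return 0;	/* clear the counter and the on-disk bit */
	if (counter && counter <= 2)
		return 1;	/* mark it clearable for the next pass */
	return counter;		/* writes still in flight - leave it alone */
}

int main(void)
{
	int c = 2;		/* last write just completed */

	c = daemon_pass(c, 0);	/* -> 1 */
	c = daemon_pass(c, 0);	/* -> 0, on-disk bit cleared */
	printf("%d\n", c);
	return 0;
}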
| @@ -1240,7 +1227,7 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
| 1240 | mutex_unlock(&mddev->bitmap_info.mutex); | 1227 | mutex_unlock(&mddev->bitmap_info.mutex); |
| 1241 | } | 1228 | } |
| 1242 | 1229 | ||
| 1243 | static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, | 1230 | static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, |
| 1244 | sector_t offset, sector_t *blocks, | 1231 | sector_t offset, sector_t *blocks, |
| 1245 | int create) | 1232 | int create) |
| 1246 | __releases(bitmap->lock) | 1233 | __releases(bitmap->lock) |
| @@ -1302,10 +1289,10 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
| 1302 | sector_t blocks; | 1289 | sector_t blocks; |
| 1303 | bitmap_counter_t *bmc; | 1290 | bitmap_counter_t *bmc; |
| 1304 | 1291 | ||
| 1305 | spin_lock_irq(&bitmap->lock); | 1292 | spin_lock_irq(&bitmap->counts.lock); |
| 1306 | bmc = bitmap_get_counter(bitmap, offset, &blocks, 1); | 1293 | bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 1); |
| 1307 | if (!bmc) { | 1294 | if (!bmc) { |
| 1308 | spin_unlock_irq(&bitmap->lock); | 1295 | spin_unlock_irq(&bitmap->counts.lock); |
| 1309 | return 0; | 1296 | return 0; |
| 1310 | } | 1297 | } |
| 1311 | 1298 | ||
| @@ -1317,7 +1304,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
| 1317 | */ | 1304 | */ |
| 1318 | prepare_to_wait(&bitmap->overflow_wait, &__wait, | 1305 | prepare_to_wait(&bitmap->overflow_wait, &__wait, |
| 1319 | TASK_UNINTERRUPTIBLE); | 1306 | TASK_UNINTERRUPTIBLE); |
| 1320 | spin_unlock_irq(&bitmap->lock); | 1307 | spin_unlock_irq(&bitmap->counts.lock); |
| 1321 | io_schedule(); | 1308 | io_schedule(); |
| 1322 | finish_wait(&bitmap->overflow_wait, &__wait); | 1309 | finish_wait(&bitmap->overflow_wait, &__wait); |
| 1323 | continue; | 1310 | continue; |
| @@ -1326,7 +1313,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
| 1326 | switch (*bmc) { | 1313 | switch (*bmc) { |
| 1327 | case 0: | 1314 | case 0: |
| 1328 | bitmap_file_set_bit(bitmap, offset); | 1315 | bitmap_file_set_bit(bitmap, offset); |
| 1329 | bitmap_count_page(bitmap, offset, 1); | 1316 | bitmap_count_page(&bitmap->counts, offset, 1); |
| 1330 | /* fall through */ | 1317 | /* fall through */ |
| 1331 | case 1: | 1318 | case 1: |
| 1332 | *bmc = 2; | 1319 | *bmc = 2; |
| @@ -1334,7 +1321,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
| 1334 | 1321 | ||
| 1335 | (*bmc)++; | 1322 | (*bmc)++; |
| 1336 | 1323 | ||
| 1337 | spin_unlock_irq(&bitmap->lock); | 1324 | spin_unlock_irq(&bitmap->counts.lock); |
| 1338 | 1325 | ||
| 1339 | offset += blocks; | 1326 | offset += blocks; |
| 1340 | if (sectors > blocks) | 1327 | if (sectors > blocks) |
| @@ -1364,10 +1351,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
| 1364 | unsigned long flags; | 1351 | unsigned long flags; |
| 1365 | bitmap_counter_t *bmc; | 1352 | bitmap_counter_t *bmc; |
| 1366 | 1353 | ||
| 1367 | spin_lock_irqsave(&bitmap->lock, flags); | 1354 | spin_lock_irqsave(&bitmap->counts.lock, flags); |
| 1368 | bmc = bitmap_get_counter(bitmap, offset, &blocks, 0); | 1355 | bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 0); |
| 1369 | if (!bmc) { | 1356 | if (!bmc) { |
| 1370 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1357 | spin_unlock_irqrestore(&bitmap->counts.lock, flags); |
| 1371 | return; | 1358 | return; |
| 1372 | } | 1359 | } |
| 1373 | 1360 | ||
| @@ -1386,14 +1373,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
| 1386 | 1373 | ||
| 1387 | (*bmc)--; | 1374 | (*bmc)--; |
| 1388 | if (*bmc <= 2) { | 1375 | if (*bmc <= 2) { |
| 1389 | set_page_attr(bitmap, | 1376 | bitmap_set_pending(&bitmap->counts, offset); |
| 1390 | filemap_get_page( | ||
| 1391 | bitmap, | ||
| 1392 | offset >> bitmap->chunkshift), | ||
| 1393 | BITMAP_PAGE_PENDING); | ||
| 1394 | bitmap->allclean = 0; | 1377 | bitmap->allclean = 0; |
| 1395 | } | 1378 | } |
| 1396 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1379 | spin_unlock_irqrestore(&bitmap->counts.lock, flags); |
| 1397 | offset += blocks; | 1380 | offset += blocks; |
| 1398 | if (sectors > blocks) | 1381 | if (sectors > blocks) |
| 1399 | sectors -= blocks; | 1382 | sectors -= blocks; |
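For context on why counter values 1 and 2 get special treatment in bitmap_endwrite() and the daemon: each bitmap_counter_t is a 16-bit word whose two top bits flag 'resync needed' and 'resync active', leaving 14 bits that count writes in flight plus a bias of one while the on-disk bit remains set. The masks below follow the convention in bitmap.h but are reproduced from memory, so treat them as a hedged sketch rather than the authoritative definitions.

#include <stdint.h>
#include <stdio.h>

typedef uint16_t bitmap_counter_t;

#define NEEDED_MASK	((bitmap_counter_t)(1 << 15))	/* chunk needs resync */
#define RESYNC_MASK	((bitmap_counter_t)(1 << 14))	/* resync in progress */
#define COUNTER_MAX	((bitmap_counter_t)(RESYNC_MASK - 1))
#define COUNTER(x)	((bitmap_counter_t)(x) & COUNTER_MAX)

int main(void)
{
	bitmap_counter_t bmc = 2;	/* one write in flight + the "bit is set" bias */

	bmc--;				/* bitmap_endwrite(): the write completed */
	printf("count=%u needed=%d\n", (unsigned)COUNTER(bmc), !!(bmc & NEEDED_MASK));
	/* count == 1: nothing in flight, so the daemon may clear it over two passes */
	return 0;
}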
| @@ -1412,8 +1395,8 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t | |||
| 1412 | *blocks = 1024; | 1395 | *blocks = 1024; |
| 1413 | return 1; /* always resync if no bitmap */ | 1396 | return 1; /* always resync if no bitmap */ |
| 1414 | } | 1397 | } |
| 1415 | spin_lock_irq(&bitmap->lock); | 1398 | spin_lock_irq(&bitmap->counts.lock); |
| 1416 | bmc = bitmap_get_counter(bitmap, offset, blocks, 0); | 1399 | bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); |
| 1417 | rv = 0; | 1400 | rv = 0; |
| 1418 | if (bmc) { | 1401 | if (bmc) { |
| 1419 | /* locked */ | 1402 | /* locked */ |
| @@ -1427,7 +1410,7 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t | |||
| 1427 | } | 1410 | } |
| 1428 | } | 1411 | } |
| 1429 | } | 1412 | } |
| 1430 | spin_unlock_irq(&bitmap->lock); | 1413 | spin_unlock_irq(&bitmap->counts.lock); |
| 1431 | return rv; | 1414 | return rv; |
| 1432 | } | 1415 | } |
| 1433 | 1416 | ||
| @@ -1464,8 +1447,8 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i | |||
| 1464 | *blocks = 1024; | 1447 | *blocks = 1024; |
| 1465 | return; | 1448 | return; |
| 1466 | } | 1449 | } |
| 1467 | spin_lock_irqsave(&bitmap->lock, flags); | 1450 | spin_lock_irqsave(&bitmap->counts.lock, flags); |
| 1468 | bmc = bitmap_get_counter(bitmap, offset, blocks, 0); | 1451 | bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); |
| 1469 | if (bmc == NULL) | 1452 | if (bmc == NULL) |
| 1470 | goto unlock; | 1453 | goto unlock; |
| 1471 | /* locked */ | 1454 | /* locked */ |
| @@ -1476,15 +1459,13 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i | |||
| 1476 | *bmc |= NEEDED_MASK; | 1459 | *bmc |= NEEDED_MASK; |
| 1477 | else { | 1460 | else { |
| 1478 | if (*bmc <= 2) { | 1461 | if (*bmc <= 2) { |
| 1479 | set_page_attr(bitmap, | 1462 | bitmap_set_pending(&bitmap->counts, offset); |
| 1480 | filemap_get_page(bitmap, offset >> bitmap->chunkshift), | ||
| 1481 | BITMAP_PAGE_PENDING); | ||
| 1482 | bitmap->allclean = 0; | 1463 | bitmap->allclean = 0; |
| 1483 | } | 1464 | } |
| 1484 | } | 1465 | } |
| 1485 | } | 1466 | } |
| 1486 | unlock: | 1467 | unlock: |
| 1487 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1468 | spin_unlock_irqrestore(&bitmap->counts.lock, flags); |
| 1488 | } | 1469 | } |
| 1489 | EXPORT_SYMBOL(bitmap_end_sync); | 1470 | EXPORT_SYMBOL(bitmap_end_sync); |
| 1490 | 1471 | ||
| @@ -1524,7 +1505,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) | |||
| 1524 | 1505 | ||
| 1525 | bitmap->mddev->curr_resync_completed = sector; | 1506 | bitmap->mddev->curr_resync_completed = sector; |
| 1526 | set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); | 1507 | set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); |
| 1527 | sector &= ~((1ULL << bitmap->chunkshift) - 1); | 1508 | sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); |
| 1528 | s = 0; | 1509 | s = 0; |
| 1529 | while (s < sector && s < bitmap->mddev->resync_max_sectors) { | 1510 | while (s < sector && s < bitmap->mddev->resync_max_sectors) { |
| 1530 | bitmap_end_sync(bitmap, s, &blocks, 0); | 1511 | bitmap_end_sync(bitmap, s, &blocks, 0); |
| @@ -1538,27 +1519,25 @@ EXPORT_SYMBOL(bitmap_cond_end_sync); | |||
| 1538 | static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) | 1519 | static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) |
| 1539 | { | 1520 | { |
| 1540 | /* For each chunk covered by any of these sectors, set the | 1521 | /* For each chunk covered by any of these sectors, set the |
| 1541 | * counter to 1 and set resync_needed. They should all | 1522 | * counter to 2 and possibly set resync_needed. They should all |
| 1542 | * be 0 at this point | 1523 | * be 0 at this point |
| 1543 | */ | 1524 | */ |
| 1544 | 1525 | ||
| 1545 | sector_t secs; | 1526 | sector_t secs; |
| 1546 | bitmap_counter_t *bmc; | 1527 | bitmap_counter_t *bmc; |
| 1547 | spin_lock_irq(&bitmap->lock); | 1528 | spin_lock_irq(&bitmap->counts.lock); |
| 1548 | bmc = bitmap_get_counter(bitmap, offset, &secs, 1); | 1529 | bmc = bitmap_get_counter(&bitmap->counts, offset, &secs, 1); |
| 1549 | if (!bmc) { | 1530 | if (!bmc) { |
| 1550 | spin_unlock_irq(&bitmap->lock); | 1531 | spin_unlock_irq(&bitmap->counts.lock); |
| 1551 | return; | 1532 | return; |
| 1552 | } | 1533 | } |
| 1553 | if (!*bmc) { | 1534 | if (!*bmc) { |
| 1554 | struct page *page; | ||
| 1555 | *bmc = 2 | (needed ? NEEDED_MASK : 0); | 1535 | *bmc = 2 | (needed ? NEEDED_MASK : 0); |
| 1556 | bitmap_count_page(bitmap, offset, 1); | 1536 | bitmap_count_page(&bitmap->counts, offset, 1); |
| 1557 | page = filemap_get_page(bitmap, offset >> bitmap->chunkshift); | 1537 | bitmap_set_pending(&bitmap->counts, offset); |
| 1558 | set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); | ||
| 1559 | bitmap->allclean = 0; | 1538 | bitmap->allclean = 0; |
| 1560 | } | 1539 | } |
| 1561 | spin_unlock_irq(&bitmap->lock); | 1540 | spin_unlock_irq(&bitmap->counts.lock); |
| 1562 | } | 1541 | } |
| 1563 | 1542 | ||
| 1564 | /* dirty the memory and file bits for bitmap chunks "s" to "e" */ | 1543 | /* dirty the memory and file bits for bitmap chunks "s" to "e" */ |
| @@ -1567,11 +1546,9 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) | |||
| 1567 | unsigned long chunk; | 1546 | unsigned long chunk; |
| 1568 | 1547 | ||
| 1569 | for (chunk = s; chunk <= e; chunk++) { | 1548 | for (chunk = s; chunk <= e; chunk++) { |
| 1570 | sector_t sec = (sector_t)chunk << bitmap->chunkshift; | 1549 | sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift; |
| 1571 | bitmap_set_memory_bits(bitmap, sec, 1); | 1550 | bitmap_set_memory_bits(bitmap, sec, 1); |
| 1572 | spin_lock_irq(&bitmap->lock); | ||
| 1573 | bitmap_file_set_bit(bitmap, sec); | 1551 | bitmap_file_set_bit(bitmap, sec); |
| 1574 | spin_unlock_irq(&bitmap->lock); | ||
| 1575 | if (sec < bitmap->mddev->recovery_cp) | 1552 | if (sec < bitmap->mddev->recovery_cp) |
| 1576 | /* We are asserting that the array is dirty, | 1553 | /* We are asserting that the array is dirty, |
| 1577 | * so move the recovery_cp address back so | 1554 | * so move the recovery_cp address back so |
| @@ -1616,11 +1593,15 @@ static void bitmap_free(struct bitmap *bitmap) | |||
| 1616 | if (!bitmap) /* there was no bitmap */ | 1593 | if (!bitmap) /* there was no bitmap */ |
| 1617 | return; | 1594 | return; |
| 1618 | 1595 | ||
| 1619 | /* release the bitmap file and kill the daemon */ | 1596 | /* Shouldn't be needed - but just in case.... */ |
| 1620 | bitmap_file_put(bitmap); | 1597 | wait_event(bitmap->write_wait, |
| 1598 | atomic_read(&bitmap->pending_writes) == 0); | ||
| 1599 | |||
| 1600 | /* release the bitmap file */ | ||
| 1601 | bitmap_file_unmap(&bitmap->storage); | ||
| 1621 | 1602 | ||
| 1622 | bp = bitmap->bp; | 1603 | bp = bitmap->counts.bp; |
| 1623 | pages = bitmap->pages; | 1604 | pages = bitmap->counts.pages; |
| 1624 | 1605 | ||
| 1625 | /* free all allocated memory */ | 1606 | /* free all allocated memory */ |
| 1626 | 1607 | ||
| @@ -1659,25 +1640,19 @@ int bitmap_create(struct mddev *mddev) | |||
| 1659 | { | 1640 | { |
| 1660 | struct bitmap *bitmap; | 1641 | struct bitmap *bitmap; |
| 1661 | sector_t blocks = mddev->resync_max_sectors; | 1642 | sector_t blocks = mddev->resync_max_sectors; |
| 1662 | unsigned long chunks; | ||
| 1663 | unsigned long pages; | ||
| 1664 | struct file *file = mddev->bitmap_info.file; | 1643 | struct file *file = mddev->bitmap_info.file; |
| 1665 | int err; | 1644 | int err; |
| 1666 | struct sysfs_dirent *bm = NULL; | 1645 | struct sysfs_dirent *bm = NULL; |
| 1667 | 1646 | ||
| 1668 | BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); | 1647 | BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); |
| 1669 | 1648 | ||
| 1670 | if (!file | ||
| 1671 | && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ | ||
| 1672 | return 0; | ||
| 1673 | |||
| 1674 | BUG_ON(file && mddev->bitmap_info.offset); | 1649 | BUG_ON(file && mddev->bitmap_info.offset); |
| 1675 | 1650 | ||
| 1676 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); | 1651 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); |
| 1677 | if (!bitmap) | 1652 | if (!bitmap) |
| 1678 | return -ENOMEM; | 1653 | return -ENOMEM; |
| 1679 | 1654 | ||
| 1680 | spin_lock_init(&bitmap->lock); | 1655 | spin_lock_init(&bitmap->counts.lock); |
| 1681 | atomic_set(&bitmap->pending_writes, 0); | 1656 | atomic_set(&bitmap->pending_writes, 0); |
| 1682 | init_waitqueue_head(&bitmap->write_wait); | 1657 | init_waitqueue_head(&bitmap->write_wait); |
| 1683 | init_waitqueue_head(&bitmap->overflow_wait); | 1658 | init_waitqueue_head(&bitmap->overflow_wait); |
| @@ -1693,7 +1668,7 @@ int bitmap_create(struct mddev *mddev) | |||
| 1693 | } else | 1668 | } else |
| 1694 | bitmap->sysfs_can_clear = NULL; | 1669 | bitmap->sysfs_can_clear = NULL; |
| 1695 | 1670 | ||
| 1696 | bitmap->file = file; | 1671 | bitmap->storage.file = file; |
| 1697 | if (file) { | 1672 | if (file) { |
| 1698 | get_file(file); | 1673 | get_file(file); |
| 1699 | /* As future accesses to this file will use bmap, | 1674 | /* As future accesses to this file will use bmap, |
| @@ -1724,32 +1699,15 @@ int bitmap_create(struct mddev *mddev) | |||
| 1724 | goto error; | 1699 | goto error; |
| 1725 | 1700 | ||
| 1726 | bitmap->daemon_lastrun = jiffies; | 1701 | bitmap->daemon_lastrun = jiffies; |
| 1727 | bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize) | 1702 | err = bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1); |
| 1728 | - BITMAP_BLOCK_SHIFT); | 1703 | if (err) |
| 1729 | |||
| 1730 | chunks = (blocks + (1 << bitmap->chunkshift) - 1) >> | ||
| 1731 | bitmap->chunkshift; | ||
| 1732 | pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; | ||
| 1733 | |||
| 1734 | BUG_ON(!pages); | ||
| 1735 | |||
| 1736 | bitmap->chunks = chunks; | ||
| 1737 | bitmap->pages = pages; | ||
| 1738 | bitmap->missing_pages = pages; | ||
| 1739 | |||
| 1740 | bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL); | ||
| 1741 | |||
| 1742 | err = -ENOMEM; | ||
| 1743 | if (!bitmap->bp) | ||
| 1744 | goto error; | 1704 | goto error; |
| 1745 | 1705 | ||
| 1746 | printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", | 1706 | printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", |
| 1747 | pages, bmname(bitmap)); | 1707 | bitmap->counts.pages, bmname(bitmap)); |
| 1748 | 1708 | ||
| 1749 | mddev->bitmap = bitmap; | 1709 | mddev->bitmap = bitmap; |
| 1750 | 1710 | return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; | |
| 1751 | |||
| 1752 | return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0; | ||
| 1753 | 1711 | ||
| 1754 | error: | 1712 | error: |
| 1755 | bitmap_free(bitmap); | 1713 | bitmap_free(bitmap); |
| @@ -1790,13 +1748,17 @@ int bitmap_load(struct mddev *mddev) | |||
| 1790 | 1748 | ||
| 1791 | if (err) | 1749 | if (err) |
| 1792 | goto out; | 1750 | goto out; |
| 1751 | clear_bit(BITMAP_STALE, &bitmap->flags); | ||
| 1752 | |||
| 1753 | /* Kick recovery in case any bits were set */ | ||
| 1754 | set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); | ||
| 1793 | 1755 | ||
| 1794 | mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; | 1756 | mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; |
| 1795 | md_wakeup_thread(mddev->thread); | 1757 | md_wakeup_thread(mddev->thread); |
| 1796 | 1758 | ||
| 1797 | bitmap_update_sb(bitmap); | 1759 | bitmap_update_sb(bitmap); |
| 1798 | 1760 | ||
| 1799 | if (bitmap->flags & BITMAP_WRITE_ERROR) | 1761 | if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) |
| 1800 | err = -EIO; | 1762 | err = -EIO; |
| 1801 | out: | 1763 | out: |
| 1802 | return err; | 1764 | return err; |
| @@ -1806,30 +1768,194 @@ EXPORT_SYMBOL_GPL(bitmap_load); | |||
| 1806 | void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) | 1768 | void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) |
| 1807 | { | 1769 | { |
| 1808 | unsigned long chunk_kb; | 1770 | unsigned long chunk_kb; |
| 1809 | unsigned long flags; | 1771 | struct bitmap_counts *counts; |
| 1810 | 1772 | ||
| 1811 | if (!bitmap) | 1773 | if (!bitmap) |
| 1812 | return; | 1774 | return; |
| 1813 | 1775 | ||
| 1814 | spin_lock_irqsave(&bitmap->lock, flags); | 1776 | counts = &bitmap->counts; |
| 1777 | |||
| 1815 | chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; | 1778 | chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; |
| 1816 | seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " | 1779 | seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " |
| 1817 | "%lu%s chunk", | 1780 | "%lu%s chunk", |
| 1818 | bitmap->pages - bitmap->missing_pages, | 1781 | counts->pages - counts->missing_pages, |
| 1819 | bitmap->pages, | 1782 | counts->pages, |
| 1820 | (bitmap->pages - bitmap->missing_pages) | 1783 | (counts->pages - counts->missing_pages) |
| 1821 | << (PAGE_SHIFT - 10), | 1784 | << (PAGE_SHIFT - 10), |
| 1822 | chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, | 1785 | chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, |
| 1823 | chunk_kb ? "KB" : "B"); | 1786 | chunk_kb ? "KB" : "B"); |
| 1824 | if (bitmap->file) { | 1787 | if (bitmap->storage.file) { |
| 1825 | seq_printf(seq, ", file: "); | 1788 | seq_printf(seq, ", file: "); |
| 1826 | seq_path(seq, &bitmap->file->f_path, " \t\n"); | 1789 | seq_path(seq, &bitmap->storage.file->f_path, " \t\n"); |
| 1827 | } | 1790 | } |
| 1828 | 1791 | ||
| 1829 | seq_printf(seq, "\n"); | 1792 | seq_printf(seq, "\n"); |
| 1830 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
| 1831 | } | 1793 | } |
| 1832 | 1794 | ||
| 1795 | int bitmap_resize(struct bitmap *bitmap, sector_t blocks, | ||
| 1796 | int chunksize, int init) | ||
| 1797 | { | ||
| 1798 | /* If chunk_size is 0, choose an appropriate chunk size. | ||
| 1799 | * Then possibly allocate new storage space. | ||
| 1800 | * Then quiesce, copy bits, replace bitmap, and re-start | ||
| 1801 | * | ||
| 1802 | * This function is called both to set up the initial bitmap | ||
| 1803 | * and to resize the bitmap while the array is active. | ||
| 1804 | * If this happens as a result of the array being resized, | ||
| 1805 | * chunksize will be zero, and we need to choose a suitable | ||
| 1806 | * chunksize, otherwise we use what we are given. | ||
| 1807 | */ | ||
| 1808 | struct bitmap_storage store; | ||
| 1809 | struct bitmap_counts old_counts; | ||
| 1810 | unsigned long chunks; | ||
| 1811 | sector_t block; | ||
| 1812 | sector_t old_blocks, new_blocks; | ||
| 1813 | int chunkshift; | ||
| 1814 | int ret = 0; | ||
| 1815 | long pages; | ||
| 1816 | struct bitmap_page *new_bp; | ||
| 1817 | |||
| 1818 | if (chunksize == 0) { | ||
| 1819 | /* If there is enough space, leave the chunk size unchanged, | ||
| 1820 | * else increase by factor of two until there is enough space. | ||
| 1821 | */ | ||
| 1822 | long bytes; | ||
| 1823 | long space = bitmap->mddev->bitmap_info.space; | ||
| 1824 | |||
| 1825 | if (space == 0) { | ||
| 1826 | /* We don't know how much space there is, so limit | ||
| 1827 | * to current size - in sectors. | ||
| 1828 | */ | ||
| 1829 | bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8); | ||
| 1830 | if (!bitmap->mddev->bitmap_info.external) | ||
| 1831 | bytes += sizeof(bitmap_super_t); | ||
| 1832 | space = DIV_ROUND_UP(bytes, 512); | ||
| 1833 | bitmap->mddev->bitmap_info.space = space; | ||
| 1834 | } | ||
| 1835 | chunkshift = bitmap->counts.chunkshift; | ||
| 1836 | chunkshift--; | ||
| 1837 | do { | ||
| 1838 | /* 'chunkshift' is shift from block size to chunk size */ | ||
| 1839 | chunkshift++; | ||
| 1840 | chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); | ||
| 1841 | bytes = DIV_ROUND_UP(chunks, 8); | ||
| 1842 | if (!bitmap->mddev->bitmap_info.external) | ||
| 1843 | bytes += sizeof(bitmap_super_t); | ||
| 1844 | } while (bytes > (space << 9)); | ||
| 1845 | } else | ||
| 1846 | chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT; | ||
| 1847 | |||
| 1848 | chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); | ||
| 1849 | memset(&store, 0, sizeof(store)); | ||
| 1850 | if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) | ||
| 1851 | ret = bitmap_storage_alloc(&store, chunks, | ||
| 1852 | !bitmap->mddev->bitmap_info.external); | ||
| 1853 | if (ret) | ||
| 1854 | goto err; | ||
| 1855 | |||
| 1856 | pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO); | ||
| 1857 | |||
| 1858 | new_bp = kzalloc(pages * sizeof(*new_bp), GFP_KERNEL); | ||
| 1859 | ret = -ENOMEM; | ||
| 1860 | if (!new_bp) { | ||
| 1861 | bitmap_file_unmap(&store); | ||
| 1862 | goto err; | ||
| 1863 | } | ||
| 1864 | |||
| 1865 | if (!init) | ||
| 1866 | bitmap->mddev->pers->quiesce(bitmap->mddev, 1); | ||
| 1867 | |||
| 1868 | store.file = bitmap->storage.file; | ||
| 1869 | bitmap->storage.file = NULL; | ||
| 1870 | |||
| 1871 | if (store.sb_page && bitmap->storage.sb_page) | ||
| 1872 | memcpy(page_address(store.sb_page), | ||
| 1873 | page_address(bitmap->storage.sb_page), | ||
| 1874 | sizeof(bitmap_super_t)); | ||
| 1875 | bitmap_file_unmap(&bitmap->storage); | ||
| 1876 | bitmap->storage = store; | ||
| 1877 | |||
| 1878 | old_counts = bitmap->counts; | ||
| 1879 | bitmap->counts.bp = new_bp; | ||
| 1880 | bitmap->counts.pages = pages; | ||
| 1881 | bitmap->counts.missing_pages = pages; | ||
| 1882 | bitmap->counts.chunkshift = chunkshift; | ||
| 1883 | bitmap->counts.chunks = chunks; | ||
| 1884 | bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift + | ||
| 1885 | BITMAP_BLOCK_SHIFT); | ||
| 1886 | |||
| 1887 | blocks = min(old_counts.chunks << old_counts.chunkshift, | ||
| 1888 | chunks << chunkshift); | ||
| 1889 | |||
| 1890 | spin_lock_irq(&bitmap->counts.lock); | ||
| 1891 | for (block = 0; block < blocks; ) { | ||
| 1892 | bitmap_counter_t *bmc_old, *bmc_new; | ||
| 1893 | int set; | ||
| 1894 | |||
| 1895 | bmc_old = bitmap_get_counter(&old_counts, block, | ||
| 1896 | &old_blocks, 0); | ||
| 1897 | set = bmc_old && NEEDED(*bmc_old); | ||
| 1898 | |||
| 1899 | if (set) { | ||
| 1900 | bmc_new = bitmap_get_counter(&bitmap->counts, block, | ||
| 1901 | &new_blocks, 1); | ||
| 1902 | if (*bmc_new == 0) { | ||
| 1903 | /* need to set on-disk bits too. */ | ||
| 1904 | sector_t end = block + new_blocks; | ||
| 1905 | sector_t start = block >> chunkshift; | ||
| 1906 | start <<= chunkshift; | ||
| 1907 | while (start < end) { | ||
| 1908 | bitmap_file_set_bit(bitmap, block); | ||
| 1909 | start += 1 << chunkshift; | ||
| 1910 | } | ||
| 1911 | *bmc_new = 2; | ||
| 1912 | bitmap_count_page(&bitmap->counts, | ||
| 1913 | block, 1); | ||
| 1914 | bitmap_set_pending(&bitmap->counts, | ||
| 1915 | block); | ||
| 1916 | } | ||
| 1917 | *bmc_new |= NEEDED_MASK; | ||
| 1918 | if (new_blocks < old_blocks) | ||
| 1919 | old_blocks = new_blocks; | ||
| 1920 | } | ||
| 1921 | block += old_blocks; | ||
| 1922 | } | ||
| 1923 | |||
| 1924 | if (!init) { | ||
| 1925 | int i; | ||
| 1926 | while (block < (chunks << chunkshift)) { | ||
| 1927 | bitmap_counter_t *bmc; | ||
| 1928 | bmc = bitmap_get_counter(&bitmap->counts, block, | ||
| 1929 | &new_blocks, 1); | ||
| 1930 | if (bmc) { | ||
| 1931 | /* new space. It needs to be resynced, so | ||
| 1932 | * we set NEEDED_MASK. | ||
| 1933 | */ | ||
| 1934 | if (*bmc == 0) { | ||
| 1935 | *bmc = NEEDED_MASK | 2; | ||
| 1936 | bitmap_count_page(&bitmap->counts, | ||
| 1937 | block, 1); | ||
| 1938 | bitmap_set_pending(&bitmap->counts, | ||
| 1939 | block); | ||
| 1940 | } | ||
| 1941 | } | ||
| 1942 | block += new_blocks; | ||
| 1943 | } | ||
| 1944 | for (i = 0; i < bitmap->storage.file_pages; i++) | ||
| 1945 | set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); | ||
| 1946 | } | ||
| 1947 | spin_unlock_irq(&bitmap->counts.lock); | ||
| 1948 | |||
| 1949 | if (!init) { | ||
| 1950 | bitmap_unplug(bitmap); | ||
| 1951 | bitmap->mddev->pers->quiesce(bitmap->mddev, 0); | ||
| 1952 | } | ||
| 1953 | ret = 0; | ||
| 1954 | err: | ||
| 1955 | return ret; | ||
| 1956 | } | ||
| 1957 | EXPORT_SYMBOL_GPL(bitmap_resize); | ||
| 1958 | |||
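The chunk-size search at the top of bitmap_resize() can be read on its own: starting from the current chunkshift, keep doubling the chunk size until one bit per chunk (plus the 256-byte superblock for internal bitmaps) fits in the space reserved on disk, measured in 512-byte sectors. A standalone sketch under those assumptions, with hypothetical inputs:

#include <stdio.h>

#define SB_BYTES	256ULL	/* internal bitmap superblock */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Smallest chunkshift (chunk size = 1 << chunkshift sectors) for which one
 * bit per chunk, plus the superblock, fits in 'space' 512-byte sectors. */
static int pick_chunkshift(unsigned long long blocks, unsigned long long space,
			   int cur_shift, int internal)
{
	unsigned long long chunks, bytes;
	int shift = cur_shift - 1;

	do {
		shift++;
		chunks = DIV_ROUND_UP(blocks, 1ULL << shift);
		bytes = DIV_ROUND_UP(chunks, 8);
		if (internal)
			bytes += SB_BYTES;
	} while (bytes > space << 9);
	return shift;
}

int main(void)
{
	/* a 2TiB array (in sectors) with 8 sectors (4KB) of bitmap space:
	 * the search settles on chunkshift 18, i.e. 128MB chunks */
	printf("chunkshift %d\n", pick_chunkshift(4ULL << 30, 8, 7, 1));
	return 0;
}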
| 1833 | static ssize_t | 1959 | static ssize_t |
| 1834 | location_show(struct mddev *mddev, char *page) | 1960 | location_show(struct mddev *mddev, char *page) |
| 1835 | { | 1961 | { |
| @@ -1923,6 +2049,43 @@ location_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 1923 | static struct md_sysfs_entry bitmap_location = | 2049 | static struct md_sysfs_entry bitmap_location = |
| 1924 | __ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); | 2050 | __ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); |
| 1925 | 2051 | ||
| 2052 | /* 'bitmap/space' is the space available at 'location' for the | ||
| 2053 | * bitmap. This allows the kernel to know when it is safe to | ||
| 2054 | * resize the bitmap to match a resized array. | ||
| 2055 | */ | ||
| 2056 | static ssize_t | ||
| 2057 | space_show(struct mddev *mddev, char *page) | ||
| 2058 | { | ||
| 2059 | return sprintf(page, "%lu\n", mddev->bitmap_info.space); | ||
| 2060 | } | ||
| 2061 | |||
| 2062 | static ssize_t | ||
| 2063 | space_store(struct mddev *mddev, const char *buf, size_t len) | ||
| 2064 | { | ||
| 2065 | unsigned long sectors; | ||
| 2066 | int rv; | ||
| 2067 | |||
| 2068 | rv = kstrtoul(buf, 10, &sectors); | ||
| 2069 | if (rv) | ||
| 2070 | return rv; | ||
| 2071 | |||
| 2072 | if (sectors == 0) | ||
| 2073 | return -EINVAL; | ||
| 2074 | |||
| 2075 | if (mddev->bitmap && | ||
| 2076 | sectors < (mddev->bitmap->storage.bytes + 511) >> 9) | ||
| 2077 | return -EFBIG; /* Bitmap is too big for this small space */ | ||
| 2078 | |||
| 2079 | /* could make sure it isn't too big, but that isn't really | ||
| 2080 | * needed - user-space should be careful. | ||
| 2081 | */ | ||
| 2082 | mddev->bitmap_info.space = sectors; | ||
| 2083 | return len; | ||
| 2084 | } | ||
| 2085 | |||
| 2086 | static struct md_sysfs_entry bitmap_space = | ||
| 2087 | __ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store); | ||
| 2088 | |||
| 1926 | static ssize_t | 2089 | static ssize_t |
| 1927 | timeout_show(struct mddev *mddev, char *page) | 2090 | timeout_show(struct mddev *mddev, char *page) |
| 1928 | { | 2091 | { |
| @@ -2098,6 +2261,7 @@ __ATTR(max_backlog_used, S_IRUGO | S_IWUSR, | |||
| 2098 | 2261 | ||
| 2099 | static struct attribute *md_bitmap_attrs[] = { | 2262 | static struct attribute *md_bitmap_attrs[] = { |
| 2100 | &bitmap_location.attr, | 2263 | &bitmap_location.attr, |
| 2264 | &bitmap_space.attr, | ||
| 2101 | &bitmap_timeout.attr, | 2265 | &bitmap_timeout.attr, |
| 2102 | &bitmap_backlog.attr, | 2266 | &bitmap_backlog.attr, |
| 2103 | &bitmap_chunksize.attr, | 2267 | &bitmap_chunksize.attr, |
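The only validation space_store() performs above is rejecting a value smaller than what the current bitmap already occupies, with the byte count rounded up to whole 512-byte sectors; anything larger is accepted on trust, as the comment says. A small model of that check with hypothetical numbers:

#include <stdio.h>

/* space_store()-style check: 'sectors' must cover the existing bitmap. */
static int space_ok(unsigned long bitmap_bytes, unsigned long sectors)
{
	return sectors >= (bitmap_bytes + 511) >> 9;
}

int main(void)
{
	/* a 3000-byte bitmap needs 6 sectors, so offering 5 is rejected */
	printf("%d %d\n", space_ok(3000, 5), space_ok(3000, 6));
	return 0;
}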
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index b44b0aba2d47..df4aeb6ac6f0 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h | |||
| @@ -111,9 +111,9 @@ typedef __u16 bitmap_counter_t; | |||
| 111 | 111 | ||
| 112 | /* use these for bitmap->flags and bitmap->sb->state bit-fields */ | 112 | /* use these for bitmap->flags and bitmap->sb->state bit-fields */ |
| 113 | enum bitmap_state { | 113 | enum bitmap_state { |
| 114 | BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ | 114 | BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ |
| 115 | BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ | 115 | BITMAP_WRITE_ERROR = 2, /* A write error has occurred */ |
| 116 | BITMAP_HOSTENDIAN = 0x8000, | 116 | BITMAP_HOSTENDIAN =15, |
| 117 | }; | 117 | }; |
| 118 | 118 | ||
| 119 | /* the superblock at the front of the bitmap file -- little endian */ | 119 | /* the superblock at the front of the bitmap file -- little endian */ |
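The enum values change meaning in this hunk: they used to be masks OR-ed directly into bitmap->flags and are now bit numbers passed to set_bit()/test_bit()/clear_bit(). The stored layout is unchanged because bit number n corresponds to mask 1 << n (1 -> 0x002, 2 -> 0x004, 15 -> 0x8000). A quick plain-C illustration (ordinary operators rather than the kernel bitops):

#include <stdio.h>

enum bitmap_state {
	BITMAP_STALE = 1,
	BITMAP_WRITE_ERROR = 2,
	BITMAP_HOSTENDIAN = 15,
};

int main(void)
{
	unsigned long flags = 0;

	flags |= 1UL << BITMAP_STALE;			/* set_bit(BITMAP_STALE, &flags) */
	printf("%d\n", !!(flags & (1UL << BITMAP_STALE)));	/* test_bit-style check */
	printf("%#lx\n", flags);			/* 0x2 - same value the old mask produced */
	return 0;
}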
| @@ -128,8 +128,10 @@ typedef struct bitmap_super_s { | |||
| 128 | __le32 chunksize; /* 52 the bitmap chunk size in bytes */ | 128 | __le32 chunksize; /* 52 the bitmap chunk size in bytes */ |
| 129 | __le32 daemon_sleep; /* 56 seconds between disk flushes */ | 129 | __le32 daemon_sleep; /* 56 seconds between disk flushes */ |
| 130 | __le32 write_behind; /* 60 number of outstanding write-behind writes */ | 130 | __le32 write_behind; /* 60 number of outstanding write-behind writes */ |
| 131 | __le32 sectors_reserved; /* 64 number of 512-byte sectors that are | ||
| 132 | * reserved for the bitmap. */ | ||
| 131 | 133 | ||
| 132 | __u8 pad[256 - 64]; /* set to zero */ | 134 | __u8 pad[256 - 68]; /* set to zero */ |
| 133 | } bitmap_super_t; | 135 | } bitmap_super_t; |
| 134 | 136 | ||
| 135 | /* notes: | 137 | /* notes: |
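The new sectors_reserved field lands at byte offset 64 and the pad shrinks from 192 to 188 bytes, so the superblock stays exactly 256 bytes - the size that BUILD_BUG_ON(sizeof(bitmap_super_t) != 256) in bitmap_create() keeps enforcing. A compile-time sketch of that invariant (a stand-in struct, not the real definition; it assumes 4-byte __le32 and no padding, which holds for this layout):

#include <assert.h>
#include <stdint.h>

/* Stand-in layout: everything up to write_behind collapsed into fixed[64]. */
struct bitmap_super_sketch {
	uint8_t  fixed[64];		/* magic .. write_behind */
	uint32_t sectors_reserved;	/* new in this patch, offset 64 */
	uint8_t  pad[256 - 68];		/* shrinks by the 4 bytes just used */
};

static_assert(sizeof(struct bitmap_super_sketch) == 256,
	      "bitmap superblock must stay 256 bytes");

int main(void)
{
	return 0;
}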
| @@ -160,35 +162,48 @@ struct bitmap_page { | |||
| 160 | */ | 162 | */ |
| 161 | unsigned int hijacked:1; | 163 | unsigned int hijacked:1; |
| 162 | /* | 164 | /* |
| 165 | * If any counter in this page is '1' or '2' - and so could be | ||
| 166 | * cleared then that page is marked as 'pending' | ||
| 167 | */ | ||
| 168 | unsigned int pending:1; | ||
| 169 | /* | ||
| 163 | * count of dirty bits on the page | 170 | * count of dirty bits on the page |
| 164 | */ | 171 | */ |
| 165 | unsigned int count:31; | 172 | unsigned int count:30; |
| 166 | }; | 173 | }; |
| 167 | 174 | ||
| 168 | /* the main bitmap structure - one per mddev */ | 175 | /* the main bitmap structure - one per mddev */ |
| 169 | struct bitmap { | 176 | struct bitmap { |
| 170 | struct bitmap_page *bp; | ||
| 171 | unsigned long pages; /* total number of pages in the bitmap */ | ||
| 172 | unsigned long missing_pages; /* number of pages not yet allocated */ | ||
| 173 | 177 | ||
| 174 | struct mddev *mddev; /* the md device that the bitmap is for */ | 178 | struct bitmap_counts { |
| 179 | spinlock_t lock; | ||
| 180 | struct bitmap_page *bp; | ||
| 181 | unsigned long pages; /* total number of pages | ||
| 182 | * in the bitmap */ | ||
| 183 | unsigned long missing_pages; /* number of pages | ||
| 184 | * not yet allocated */ | ||
| 185 | unsigned long chunkshift; /* chunksize = 2^chunkshift | ||
| 186 | * (for bitops) */ | ||
| 187 | unsigned long chunks; /* Total number of data | ||
| 188 | * chunks for the array */ | ||
| 189 | } counts; | ||
| 175 | 190 | ||
| 176 | /* bitmap chunksize -- how much data does each bit represent? */ | 191 | struct mddev *mddev; /* the md device that the bitmap is for */ |
| 177 | unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */ | ||
| 178 | unsigned long chunks; /* total number of data chunks for the array */ | ||
| 179 | 192 | ||
| 180 | __u64 events_cleared; | 193 | __u64 events_cleared; |
| 181 | int need_sync; | 194 | int need_sync; |
| 182 | 195 | ||
| 183 | /* bitmap spinlock */ | 196 | struct bitmap_storage { |
| 184 | spinlock_t lock; | 197 | struct file *file; /* backing disk file */ |
| 185 | 198 | struct page *sb_page; /* cached copy of the bitmap | |
| 186 | struct file *file; /* backing disk file */ | 199 | * file superblock */ |
| 187 | struct page *sb_page; /* cached copy of the bitmap file superblock */ | 200 | struct page **filemap; /* list of cache pages for |
| 188 | struct page **filemap; /* list of cache pages for the file */ | 201 | * the file */ |
| 189 | unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ | 202 | unsigned long *filemap_attr; /* attributes associated |
| 190 | unsigned long file_pages; /* number of pages in the file */ | 203 | * w/ filemap pages */ |
| 191 | int last_page_size; /* bytes in the last page */ | 204 | unsigned long file_pages; /* number of pages in the file*/ |
| 205 | unsigned long bytes; /* total bytes in the bitmap */ | ||
| 206 | } storage; | ||
| 192 | 207 | ||
| 193 | unsigned long flags; | 208 | unsigned long flags; |
| 194 | 209 | ||
| @@ -242,6 +257,9 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); | |||
| 242 | 257 | ||
| 243 | void bitmap_unplug(struct bitmap *bitmap); | 258 | void bitmap_unplug(struct bitmap *bitmap); |
| 244 | void bitmap_daemon_work(struct mddev *mddev); | 259 | void bitmap_daemon_work(struct mddev *mddev); |
| 260 | |||
| 261 | int bitmap_resize(struct bitmap *bitmap, sector_t blocks, | ||
| 262 | int chunksize, int init); | ||
| 245 | #endif | 263 | #endif |
| 246 | 264 | ||
| 247 | #endif | 265 | #endif |
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 68965e663248..017c34d78d61 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c | |||
| @@ -155,10 +155,7 @@ static void context_free(struct raid_set *rs) | |||
| 155 | for (i = 0; i < rs->md.raid_disks; i++) { | 155 | for (i = 0; i < rs->md.raid_disks; i++) { |
| 156 | if (rs->dev[i].meta_dev) | 156 | if (rs->dev[i].meta_dev) |
| 157 | dm_put_device(rs->ti, rs->dev[i].meta_dev); | 157 | dm_put_device(rs->ti, rs->dev[i].meta_dev); |
| 158 | if (rs->dev[i].rdev.sb_page) | 158 | md_rdev_clear(&rs->dev[i].rdev); |
| 159 | put_page(rs->dev[i].rdev.sb_page); | ||
| 160 | rs->dev[i].rdev.sb_page = NULL; | ||
| 161 | rs->dev[i].rdev.sb_loaded = 0; | ||
| 162 | if (rs->dev[i].data_dev) | 159 | if (rs->dev[i].data_dev) |
| 163 | dm_put_device(rs->ti, rs->dev[i].data_dev); | 160 | dm_put_device(rs->ti, rs->dev[i].data_dev); |
| 164 | } | 161 | } |
| @@ -606,7 +603,7 @@ static int read_disk_sb(struct md_rdev *rdev, int size) | |||
| 606 | if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { | 603 | if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { |
| 607 | DMERR("Failed to read superblock of device at position %d", | 604 | DMERR("Failed to read superblock of device at position %d", |
| 608 | rdev->raid_disk); | 605 | rdev->raid_disk); |
| 609 | set_bit(Faulty, &rdev->flags); | 606 | md_error(rdev->mddev, rdev); |
| 610 | return -EINVAL; | 607 | return -EINVAL; |
| 611 | } | 608 | } |
| 612 | 609 | ||
| @@ -617,16 +614,18 @@ static int read_disk_sb(struct md_rdev *rdev, int size) | |||
| 617 | 614 | ||
| 618 | static void super_sync(struct mddev *mddev, struct md_rdev *rdev) | 615 | static void super_sync(struct mddev *mddev, struct md_rdev *rdev) |
| 619 | { | 616 | { |
| 620 | struct md_rdev *r; | 617 | int i; |
| 621 | uint64_t failed_devices; | 618 | uint64_t failed_devices; |
| 622 | struct dm_raid_superblock *sb; | 619 | struct dm_raid_superblock *sb; |
| 620 | struct raid_set *rs = container_of(mddev, struct raid_set, md); | ||
| 623 | 621 | ||
| 624 | sb = page_address(rdev->sb_page); | 622 | sb = page_address(rdev->sb_page); |
| 625 | failed_devices = le64_to_cpu(sb->failed_devices); | 623 | failed_devices = le64_to_cpu(sb->failed_devices); |
| 626 | 624 | ||
| 627 | rdev_for_each(r, mddev) | 625 | for (i = 0; i < mddev->raid_disks; i++) |
| 628 | if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) | 626 | if (!rs->dev[i].data_dev || |
| 629 | failed_devices |= (1ULL << r->raid_disk); | 627 | test_bit(Faulty, &(rs->dev[i].rdev.flags))) |
| 628 | failed_devices |= (1ULL << i); | ||
| 630 | 629 | ||
| 631 | memset(sb, 0, sizeof(*sb)); | 630 | memset(sb, 0, sizeof(*sb)); |
| 632 | 631 | ||
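The rewritten loop marks a slot failed both when its rdev is Faulty and when the slot has no data device at all - a case rdev_for_each() would silently skip. A minimal model of building that 64-bit failed_devices mask, with hypothetical per-slot state:

#include <stdint.h>
#include <stdio.h>

struct slot { int has_data_dev; int faulty; };

static uint64_t failed_mask(const struct slot *s, int n)
{
	uint64_t failed = 0;
	int i;

	for (i = 0; i < n; i++)
		if (!s[i].has_data_dev || s[i].faulty)
			failed |= 1ULL << i;
	return failed;
}

int main(void)
{
	/* slot 1 has no data device, slot 2 is faulty -> mask 0x6 */
	struct slot s[4] = { {1, 0}, {0, 0}, {1, 1}, {1, 0} };

	printf("%#llx\n", (unsigned long long)failed_mask(s, 4));
	return 0;
}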
| @@ -1252,12 +1251,13 @@ static void raid_resume(struct dm_target *ti) | |||
| 1252 | { | 1251 | { |
| 1253 | struct raid_set *rs = ti->private; | 1252 | struct raid_set *rs = ti->private; |
| 1254 | 1253 | ||
| 1254 | set_bit(MD_CHANGE_DEVS, &rs->md.flags); | ||
| 1255 | if (!rs->bitmap_loaded) { | 1255 | if (!rs->bitmap_loaded) { |
| 1256 | bitmap_load(&rs->md); | 1256 | bitmap_load(&rs->md); |
| 1257 | rs->bitmap_loaded = 1; | 1257 | rs->bitmap_loaded = 1; |
| 1258 | } else | 1258 | } |
| 1259 | md_wakeup_thread(rs->md.thread); | ||
| 1260 | 1259 | ||
| 1260 | clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); | ||
| 1261 | mddev_resume(&rs->md); | 1261 | mddev_resume(&rs->md); |
| 1262 | } | 1262 | } |
| 1263 | 1263 | ||
diff --git a/drivers/md/md.c b/drivers/md/md.c index 01233d855eb2..1c2f9048e1ae 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
| @@ -402,6 +402,7 @@ void mddev_resume(struct mddev *mddev) | |||
| 402 | wake_up(&mddev->sb_wait); | 402 | wake_up(&mddev->sb_wait); |
| 403 | mddev->pers->quiesce(mddev, 0); | 403 | mddev->pers->quiesce(mddev, 0); |
| 404 | 404 | ||
| 405 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
| 405 | md_wakeup_thread(mddev->thread); | 406 | md_wakeup_thread(mddev->thread); |
| 406 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ | 407 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ |
| 407 | } | 408 | } |
| @@ -452,7 +453,7 @@ static void submit_flushes(struct work_struct *ws) | |||
| 452 | atomic_inc(&rdev->nr_pending); | 453 | atomic_inc(&rdev->nr_pending); |
| 453 | atomic_inc(&rdev->nr_pending); | 454 | atomic_inc(&rdev->nr_pending); |
| 454 | rcu_read_unlock(); | 455 | rcu_read_unlock(); |
| 455 | bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev); | 456 | bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); |
| 456 | bi->bi_end_io = md_end_flush; | 457 | bi->bi_end_io = md_end_flush; |
| 457 | bi->bi_private = rdev; | 458 | bi->bi_private = rdev; |
| 458 | bi->bi_bdev = rdev->bdev; | 459 | bi->bi_bdev = rdev->bdev; |
| @@ -607,6 +608,7 @@ void mddev_init(struct mddev *mddev) | |||
| 607 | init_waitqueue_head(&mddev->sb_wait); | 608 | init_waitqueue_head(&mddev->sb_wait); |
| 608 | init_waitqueue_head(&mddev->recovery_wait); | 609 | init_waitqueue_head(&mddev->recovery_wait); |
| 609 | mddev->reshape_position = MaxSector; | 610 | mddev->reshape_position = MaxSector; |
| 611 | mddev->reshape_backwards = 0; | ||
| 610 | mddev->resync_min = 0; | 612 | mddev->resync_min = 0; |
| 611 | mddev->resync_max = MaxSector; | 613 | mddev->resync_max = MaxSector; |
| 612 | mddev->level = LEVEL_NONE; | 614 | mddev->level = LEVEL_NONE; |
| @@ -802,7 +804,7 @@ static int alloc_disk_sb(struct md_rdev * rdev) | |||
| 802 | return 0; | 804 | return 0; |
| 803 | } | 805 | } |
| 804 | 806 | ||
| 805 | static void free_disk_sb(struct md_rdev * rdev) | 807 | void md_rdev_clear(struct md_rdev *rdev) |
| 806 | { | 808 | { |
| 807 | if (rdev->sb_page) { | 809 | if (rdev->sb_page) { |
| 808 | put_page(rdev->sb_page); | 810 | put_page(rdev->sb_page); |
| @@ -815,8 +817,10 @@ static void free_disk_sb(struct md_rdev * rdev) | |||
| 815 | put_page(rdev->bb_page); | 817 | put_page(rdev->bb_page); |
| 816 | rdev->bb_page = NULL; | 818 | rdev->bb_page = NULL; |
| 817 | } | 819 | } |
| 820 | kfree(rdev->badblocks.page); | ||
| 821 | rdev->badblocks.page = NULL; | ||
| 818 | } | 822 | } |
| 819 | 823 | EXPORT_SYMBOL_GPL(md_rdev_clear); | |
| 820 | 824 | ||
| 821 | static void super_written(struct bio *bio, int error) | 825 | static void super_written(struct bio *bio, int error) |
| 822 | { | 826 | { |
| @@ -887,6 +891,10 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, | |||
| 887 | rdev->meta_bdev : rdev->bdev; | 891 | rdev->meta_bdev : rdev->bdev; |
| 888 | if (metadata_op) | 892 | if (metadata_op) |
| 889 | bio->bi_sector = sector + rdev->sb_start; | 893 | bio->bi_sector = sector + rdev->sb_start; |
| 894 | else if (rdev->mddev->reshape_position != MaxSector && | ||
| 895 | (rdev->mddev->reshape_backwards == | ||
| 896 | (sector >= rdev->mddev->reshape_position))) | ||
| 897 | bio->bi_sector = sector + rdev->new_data_offset; | ||
| 890 | else | 898 | else |
| 891 | bio->bi_sector = sector + rdev->data_offset; | 899 | bio->bi_sector = sector + rdev->data_offset; |
| 892 | bio_add_page(bio, page, size, 0); | 900 | bio_add_page(bio, page, size, 0); |
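The branch added to sync_page_io() picks which offset to apply while a reshape is in progress: sectors on the already-reshaped side of reshape_position use new_data_offset, where 'already reshaped' means below the position for a forward reshape and at-or-above it for a backward one. A small truth-table sketch of that predicate (the helper name is made up; the condition is the one from the hunk above):

#include <stdio.h>

/* 1 => use rdev->new_data_offset, 0 => use rdev->data_offset. */
static int use_new_offset(int reshape_backwards,
			  unsigned long long sector,
			  unsigned long long reshape_position)
{
	return reshape_backwards == (sector >= reshape_position);
}

int main(void)
{
	/* forward reshape at position 1000: sectors below it are reshaped */
	printf("%d %d\n", use_new_offset(0, 500, 1000), use_new_offset(0, 1500, 1000));
	/* backward reshape: the high end is converted first */
	printf("%d %d\n", use_new_offset(1, 500, 1000), use_new_offset(1, 1500, 1000));
	return 0;
}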
| @@ -1034,12 +1042,17 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) | |||
| 1034 | struct super_type { | 1042 | struct super_type { |
| 1035 | char *name; | 1043 | char *name; |
| 1036 | struct module *owner; | 1044 | struct module *owner; |
| 1037 | int (*load_super)(struct md_rdev *rdev, struct md_rdev *refdev, | 1045 | int (*load_super)(struct md_rdev *rdev, |
| 1046 | struct md_rdev *refdev, | ||
| 1038 | int minor_version); | 1047 | int minor_version); |
| 1039 | int (*validate_super)(struct mddev *mddev, struct md_rdev *rdev); | 1048 | int (*validate_super)(struct mddev *mddev, |
| 1040 | void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); | 1049 | struct md_rdev *rdev); |
| 1050 | void (*sync_super)(struct mddev *mddev, | ||
| 1051 | struct md_rdev *rdev); | ||
| 1041 | unsigned long long (*rdev_size_change)(struct md_rdev *rdev, | 1052 | unsigned long long (*rdev_size_change)(struct md_rdev *rdev, |
| 1042 | sector_t num_sectors); | 1053 | sector_t num_sectors); |
| 1054 | int (*allow_new_offset)(struct md_rdev *rdev, | ||
| 1055 | unsigned long long new_offset); | ||
| 1043 | }; | 1056 | }; |
| 1044 | 1057 | ||
| 1045 | /* | 1058 | /* |
| @@ -1111,6 +1124,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor | |||
| 1111 | 1124 | ||
| 1112 | rdev->preferred_minor = sb->md_minor; | 1125 | rdev->preferred_minor = sb->md_minor; |
| 1113 | rdev->data_offset = 0; | 1126 | rdev->data_offset = 0; |
| 1127 | rdev->new_data_offset = 0; | ||
| 1114 | rdev->sb_size = MD_SB_BYTES; | 1128 | rdev->sb_size = MD_SB_BYTES; |
| 1115 | rdev->badblocks.shift = -1; | 1129 | rdev->badblocks.shift = -1; |
| 1116 | 1130 | ||
| @@ -1184,7 +1198,11 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1184 | mddev->dev_sectors = ((sector_t)sb->size) * 2; | 1198 | mddev->dev_sectors = ((sector_t)sb->size) * 2; |
| 1185 | mddev->events = ev1; | 1199 | mddev->events = ev1; |
| 1186 | mddev->bitmap_info.offset = 0; | 1200 | mddev->bitmap_info.offset = 0; |
| 1201 | mddev->bitmap_info.space = 0; | ||
| 1202 | /* bitmap can use 60K after the 4K superblock */ | ||
| 1187 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; | 1203 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; |
| 1204 | mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); | ||
| 1205 | mddev->reshape_backwards = 0; | ||
| 1188 | 1206 | ||
| 1189 | if (mddev->minor_version >= 91) { | 1207 | if (mddev->minor_version >= 91) { |
| 1190 | mddev->reshape_position = sb->reshape_position; | 1208 | mddev->reshape_position = sb->reshape_position; |
| @@ -1192,6 +1210,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1192 | mddev->new_level = sb->new_level; | 1210 | mddev->new_level = sb->new_level; |
| 1193 | mddev->new_layout = sb->new_layout; | 1211 | mddev->new_layout = sb->new_layout; |
| 1194 | mddev->new_chunk_sectors = sb->new_chunk >> 9; | 1212 | mddev->new_chunk_sectors = sb->new_chunk >> 9; |
| 1213 | if (mddev->delta_disks < 0) | ||
| 1214 | mddev->reshape_backwards = 1; | ||
| 1195 | } else { | 1215 | } else { |
| 1196 | mddev->reshape_position = MaxSector; | 1216 | mddev->reshape_position = MaxSector; |
| 1197 | mddev->delta_disks = 0; | 1217 | mddev->delta_disks = 0; |
| @@ -1218,9 +1238,12 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1218 | mddev->max_disks = MD_SB_DISKS; | 1238 | mddev->max_disks = MD_SB_DISKS; |
| 1219 | 1239 | ||
| 1220 | if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && | 1240 | if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && |
| 1221 | mddev->bitmap_info.file == NULL) | 1241 | mddev->bitmap_info.file == NULL) { |
| 1222 | mddev->bitmap_info.offset = | 1242 | mddev->bitmap_info.offset = |
| 1223 | mddev->bitmap_info.default_offset; | 1243 | mddev->bitmap_info.default_offset; |
| 1244 | mddev->bitmap_info.space = | ||
| 1245 | mddev->bitmap_info.default_space; | ||
| 1246 | } | ||
| 1224 | 1247 | ||
| 1225 | } else if (mddev->pers == NULL) { | 1248 | } else if (mddev->pers == NULL) { |
| 1226 | /* Insist on good event counter while assembling, except | 1249 | /* Insist on good event counter while assembling, except |
| @@ -1434,6 +1457,12 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) | |||
| 1434 | return num_sectors; | 1457 | return num_sectors; |
| 1435 | } | 1458 | } |
| 1436 | 1459 | ||
| 1460 | static int | ||
| 1461 | super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) | ||
| 1462 | { | ||
| 1463 | /* non-zero offset changes not possible with v0.90 */ | ||
| 1464 | return new_offset == 0; | ||
| 1465 | } | ||
| 1437 | 1466 | ||
| 1438 | /* | 1467 | /* |
| 1439 | * version 1 superblock | 1468 | * version 1 superblock |
| @@ -1469,6 +1498,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ | |||
| 1469 | struct mdp_superblock_1 *sb; | 1498 | struct mdp_superblock_1 *sb; |
| 1470 | int ret; | 1499 | int ret; |
| 1471 | sector_t sb_start; | 1500 | sector_t sb_start; |
| 1501 | sector_t sectors; | ||
| 1472 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; | 1502 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; |
| 1473 | int bmask; | 1503 | int bmask; |
| 1474 | 1504 | ||
| @@ -1523,9 +1553,18 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ | |||
| 1523 | bdevname(rdev->bdev,b)); | 1553 | bdevname(rdev->bdev,b)); |
| 1524 | return -EINVAL; | 1554 | return -EINVAL; |
| 1525 | } | 1555 | } |
| 1556 | if (sb->pad0 || | ||
| 1557 | sb->pad3[0] || | ||
| 1558 | memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) | ||
| 1559 | /* Some padding is non-zero, might be a new feature */ | ||
| 1560 | return -EINVAL; | ||
| 1526 | 1561 | ||
| 1527 | rdev->preferred_minor = 0xffff; | 1562 | rdev->preferred_minor = 0xffff; |
| 1528 | rdev->data_offset = le64_to_cpu(sb->data_offset); | 1563 | rdev->data_offset = le64_to_cpu(sb->data_offset); |
| 1564 | rdev->new_data_offset = rdev->data_offset; | ||
| 1565 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && | ||
| 1566 | (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) | ||
| 1567 | rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); | ||
| 1529 | atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); | 1568 | atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); |
| 1530 | 1569 | ||
| 1531 | rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; | 1570 | rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; |
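
super_1_load() now derives new_data_offset from the on-disk new_offset field, a signed 512-byte-sector delta from data_offset that is only honoured while both the reshape-active and new-offset feature bits are set. A minimal sketch of that decoding; the feature-bit values below are assumptions for illustration, not taken from the md_p.h hunk.

    #include <stdint.h>

    /* Bit values assumed for illustration; the real ones live in md_p.h. */
    #define MD_FEATURE_RESHAPE_ACTIVE  (1 << 2)
    #define MD_FEATURE_NEW_OFFSET      (1 << 6)

    /* Sketch: new_offset is a signed sector delta from data_offset,
     * honoured only while both feature bits are set. */
    static uint64_t decode_new_data_offset(uint64_t data_offset,
                                           uint32_t feature_map,
                                           int32_t new_offset)
    {
            uint64_t new_data_offset = data_offset;

            if ((feature_map & MD_FEATURE_RESHAPE_ACTIVE) &&
                (feature_map & MD_FEATURE_NEW_OFFSET))
                    new_data_offset += new_offset;      /* delta may be negative */
            return new_data_offset;
    }
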
| @@ -1536,6 +1575,9 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ | |||
| 1536 | if (minor_version | 1575 | if (minor_version |
| 1537 | && rdev->data_offset < sb_start + (rdev->sb_size/512)) | 1576 | && rdev->data_offset < sb_start + (rdev->sb_size/512)) |
| 1538 | return -EINVAL; | 1577 | return -EINVAL; |
| 1578 | if (minor_version | ||
| 1579 | && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) | ||
| 1580 | return -EINVAL; | ||
| 1539 | 1581 | ||
| 1540 | if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) | 1582 | if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) |
| 1541 | rdev->desc_nr = -1; | 1583 | rdev->desc_nr = -1; |
| @@ -1607,16 +1649,14 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ | |||
| 1607 | else | 1649 | else |
| 1608 | ret = 0; | 1650 | ret = 0; |
| 1609 | } | 1651 | } |
| 1610 | if (minor_version) | 1652 | if (minor_version) { |
| 1611 | rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - | 1653 | sectors = (i_size_read(rdev->bdev->bd_inode) >> 9); |
| 1612 | le64_to_cpu(sb->data_offset); | 1654 | sectors -= rdev->data_offset; |
| 1613 | else | 1655 | } else |
| 1614 | rdev->sectors = rdev->sb_start; | 1656 | sectors = rdev->sb_start; |
| 1615 | if (rdev->sectors < le64_to_cpu(sb->data_size)) | 1657 | if (sectors < le64_to_cpu(sb->data_size)) |
| 1616 | return -EINVAL; | 1658 | return -EINVAL; |
| 1617 | rdev->sectors = le64_to_cpu(sb->data_size); | 1659 | rdev->sectors = le64_to_cpu(sb->data_size); |
| 1618 | if (le64_to_cpu(sb->size) > rdev->sectors) | ||
| 1619 | return -EINVAL; | ||
| 1620 | return ret; | 1660 | return ret; |
| 1621 | } | 1661 | } |
| 1622 | 1662 | ||
| @@ -1644,17 +1684,37 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1644 | mddev->dev_sectors = le64_to_cpu(sb->size); | 1684 | mddev->dev_sectors = le64_to_cpu(sb->size); |
| 1645 | mddev->events = ev1; | 1685 | mddev->events = ev1; |
| 1646 | mddev->bitmap_info.offset = 0; | 1686 | mddev->bitmap_info.offset = 0; |
| 1687 | mddev->bitmap_info.space = 0; | ||
| 1688 | /* Default location for bitmap is 1K after superblock | ||
| 1689 | * using 3K - total of 4K | ||
| 1690 | */ | ||
| 1647 | mddev->bitmap_info.default_offset = 1024 >> 9; | 1691 | mddev->bitmap_info.default_offset = 1024 >> 9; |
| 1648 | 1692 | mddev->bitmap_info.default_space = (4096-1024) >> 9; | |
| 1693 | mddev->reshape_backwards = 0; | ||
| 1694 | |||
| 1649 | mddev->recovery_cp = le64_to_cpu(sb->resync_offset); | 1695 | mddev->recovery_cp = le64_to_cpu(sb->resync_offset); |
| 1650 | memcpy(mddev->uuid, sb->set_uuid, 16); | 1696 | memcpy(mddev->uuid, sb->set_uuid, 16); |
| 1651 | 1697 | ||
| 1652 | mddev->max_disks = (4096-256)/2; | 1698 | mddev->max_disks = (4096-256)/2; |
| 1653 | 1699 | ||
| 1654 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && | 1700 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && |
| 1655 | mddev->bitmap_info.file == NULL ) | 1701 | mddev->bitmap_info.file == NULL) { |
| 1656 | mddev->bitmap_info.offset = | 1702 | mddev->bitmap_info.offset = |
| 1657 | (__s32)le32_to_cpu(sb->bitmap_offset); | 1703 | (__s32)le32_to_cpu(sb->bitmap_offset); |
| 1704 | /* Metadata doesn't record how much space is available. | ||
| 1705 | * For 1.0, we assume we can use up to the superblock | ||
| 1706 | * if before, else to 4K beyond superblock. | ||
| 1707 | * For others, assume no change is possible. | ||
| 1708 | */ | ||
| 1709 | if (mddev->minor_version > 0) | ||
| 1710 | mddev->bitmap_info.space = 0; | ||
| 1711 | else if (mddev->bitmap_info.offset > 0) | ||
| 1712 | mddev->bitmap_info.space = | ||
| 1713 | 8 - mddev->bitmap_info.offset; | ||
| 1714 | else | ||
| 1715 | mddev->bitmap_info.space = | ||
| 1716 | -mddev->bitmap_info.offset; | ||
| 1717 | } | ||
| 1658 | 1718 | ||
| 1659 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { | 1719 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { |
| 1660 | mddev->reshape_position = le64_to_cpu(sb->reshape_position); | 1720 | mddev->reshape_position = le64_to_cpu(sb->reshape_position); |
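
Because v1.x metadata does not record how much room the internal bitmap may occupy, super_1_validate() has to guess: 1.1/1.2 superblocks sit at the start of the device, so no growth is assumed, while a 1.0 superblock at the end leaves either the 4K region beyond it (positive bitmap offset) or everything up to it (negative offset). A small illustrative helper, not kernel code, with all sizes in 512-byte sectors:

    /* Illustrative helper: how much room the internal bitmap is assumed to
     * have when the metadata does not say.  bitmap_offset is relative to
     * the superblock. */
    static long bitmap_space_guess(int minor_version, long bitmap_offset)
    {
            if (minor_version > 0)
                    return 0;                   /* 1.1/1.2: assume no room to grow */
            if (bitmap_offset > 0)
                    return 8 - bitmap_offset;   /* after the 1.0 sb: up to 4K past it */
            if (bitmap_offset < 0)
                    return -bitmap_offset;      /* before the sb: everything up to it */
            return 0;
    }
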
| @@ -1662,6 +1722,11 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1662 | mddev->new_level = le32_to_cpu(sb->new_level); | 1722 | mddev->new_level = le32_to_cpu(sb->new_level); |
| 1663 | mddev->new_layout = le32_to_cpu(sb->new_layout); | 1723 | mddev->new_layout = le32_to_cpu(sb->new_layout); |
| 1664 | mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); | 1724 | mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); |
| 1725 | if (mddev->delta_disks < 0 || | ||
| 1726 | (mddev->delta_disks == 0 && | ||
| 1727 | (le32_to_cpu(sb->feature_map) | ||
| 1728 | & MD_FEATURE_RESHAPE_BACKWARDS))) | ||
| 1729 | mddev->reshape_backwards = 1; | ||
| 1665 | } else { | 1730 | } else { |
| 1666 | mddev->reshape_position = MaxSector; | 1731 | mddev->reshape_position = MaxSector; |
| 1667 | mddev->delta_disks = 0; | 1732 | mddev->delta_disks = 0; |
| @@ -1735,7 +1800,6 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1735 | sb->feature_map = 0; | 1800 | sb->feature_map = 0; |
| 1736 | sb->pad0 = 0; | 1801 | sb->pad0 = 0; |
| 1737 | sb->recovery_offset = cpu_to_le64(0); | 1802 | sb->recovery_offset = cpu_to_le64(0); |
| 1738 | memset(sb->pad1, 0, sizeof(sb->pad1)); | ||
| 1739 | memset(sb->pad3, 0, sizeof(sb->pad3)); | 1803 | memset(sb->pad3, 0, sizeof(sb->pad3)); |
| 1740 | 1804 | ||
| 1741 | sb->utime = cpu_to_le64((__u64)mddev->utime); | 1805 | sb->utime = cpu_to_le64((__u64)mddev->utime); |
| @@ -1757,6 +1821,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1757 | sb->devflags |= WriteMostly1; | 1821 | sb->devflags |= WriteMostly1; |
| 1758 | else | 1822 | else |
| 1759 | sb->devflags &= ~WriteMostly1; | 1823 | sb->devflags &= ~WriteMostly1; |
| 1824 | sb->data_offset = cpu_to_le64(rdev->data_offset); | ||
| 1825 | sb->data_size = cpu_to_le64(rdev->sectors); | ||
| 1760 | 1826 | ||
| 1761 | if (mddev->bitmap && mddev->bitmap_info.file == NULL) { | 1827 | if (mddev->bitmap && mddev->bitmap_info.file == NULL) { |
| 1762 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); | 1828 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); |
| @@ -1781,6 +1847,16 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1781 | sb->delta_disks = cpu_to_le32(mddev->delta_disks); | 1847 | sb->delta_disks = cpu_to_le32(mddev->delta_disks); |
| 1782 | sb->new_level = cpu_to_le32(mddev->new_level); | 1848 | sb->new_level = cpu_to_le32(mddev->new_level); |
| 1783 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); | 1849 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); |
| 1850 | if (mddev->delta_disks == 0 && | ||
| 1851 | mddev->reshape_backwards) | ||
| 1852 | sb->feature_map | ||
| 1853 | |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); | ||
| 1854 | if (rdev->new_data_offset != rdev->data_offset) { | ||
| 1855 | sb->feature_map | ||
| 1856 | |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); | ||
| 1857 | sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset | ||
| 1858 | - rdev->data_offset)); | ||
| 1859 | } | ||
| 1784 | } | 1860 | } |
| 1785 | 1861 | ||
| 1786 | if (rdev->badblocks.count == 0) | 1862 | if (rdev->badblocks.count == 0) |
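
The super_1_sync() hunk is the write-side mirror of the load path: the backwards flag is only persisted when delta_disks is zero (otherwise its sign already implies the direction), and new_offset is stored as a signed delta from data_offset. A sketch of that encoding; the feature-bit values are again assumed for illustration.

    #include <stdint.h>

    /* Bit values assumed for illustration; the real ones live in md_p.h. */
    #define MD_FEATURE_RESHAPE_BACKWARDS  (1 << 5)
    #define MD_FEATURE_NEW_OFFSET         (1 << 6)

    /* Record the direction only when delta_disks cannot imply it, and store
     * new_offset as a signed delta from data_offset. */
    static void encode_reshape_features(uint32_t *feature_map, int32_t *new_offset,
                                        int delta_disks, int reshape_backwards,
                                        uint64_t data_offset, uint64_t new_data_offset)
    {
            if (delta_disks == 0 && reshape_backwards)
                    *feature_map |= MD_FEATURE_RESHAPE_BACKWARDS;
            if (new_data_offset != data_offset) {
                    *feature_map |= MD_FEATURE_NEW_OFFSET;
                    *new_offset = (int32_t)(new_data_offset - data_offset);
            }
    }
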
| @@ -1857,6 +1933,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) | |||
| 1857 | sector_t max_sectors; | 1933 | sector_t max_sectors; |
| 1858 | if (num_sectors && num_sectors < rdev->mddev->dev_sectors) | 1934 | if (num_sectors && num_sectors < rdev->mddev->dev_sectors) |
| 1859 | return 0; /* component must fit device */ | 1935 | return 0; /* component must fit device */ |
| 1936 | if (rdev->data_offset != rdev->new_data_offset) | ||
| 1937 | return 0; /* too confusing */ | ||
| 1860 | if (rdev->sb_start < rdev->data_offset) { | 1938 | if (rdev->sb_start < rdev->data_offset) { |
| 1861 | /* minor versions 1 and 2; superblock before data */ | 1939 | /* minor versions 1 and 2; superblock before data */ |
| 1862 | max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; | 1940 | max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; |
| @@ -1884,6 +1962,40 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) | |||
| 1884 | rdev->sb_page); | 1962 | rdev->sb_page); |
| 1885 | md_super_wait(rdev->mddev); | 1963 | md_super_wait(rdev->mddev); |
| 1886 | return num_sectors; | 1964 | return num_sectors; |
| 1965 | |||
| 1966 | } | ||
| 1967 | |||
| 1968 | static int | ||
| 1969 | super_1_allow_new_offset(struct md_rdev *rdev, | ||
| 1970 | unsigned long long new_offset) | ||
| 1971 | { | ||
| 1972 | /* All necessary checks on new >= old have been done */ | ||
| 1973 | struct bitmap *bitmap; | ||
| 1974 | if (new_offset >= rdev->data_offset) | ||
| 1975 | return 1; | ||
| 1976 | |||
| 1977 | /* with 1.0 metadata, there is no metadata to tread on | ||
| 1978 | * so we can always move back */ | ||
| 1979 | if (rdev->mddev->minor_version == 0) | ||
| 1980 | return 1; | ||
| 1981 | |||
| 1982 | /* otherwise we must be sure not to step on | ||
| 1983 | * any metadata, so stay: | ||
| 1984 | * 36K beyond start of superblock | ||
| 1985 | * beyond end of badblocks | ||
| 1986 | * beyond write-intent bitmap | ||
| 1987 | */ | ||
| 1988 | if (rdev->sb_start + (32+4)*2 > new_offset) | ||
| 1989 | return 0; | ||
| 1990 | bitmap = rdev->mddev->bitmap; | ||
| 1991 | if (bitmap && !rdev->mddev->bitmap_info.file && | ||
| 1992 | rdev->sb_start + rdev->mddev->bitmap_info.offset + | ||
| 1993 | bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) | ||
| 1994 | return 0; | ||
| 1995 | if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) | ||
| 1996 | return 0; | ||
| 1997 | |||
| 1998 | return 1; | ||
| 1887 | } | 1999 | } |
| 1888 | 2000 | ||
| 1889 | static struct super_type super_types[] = { | 2001 | static struct super_type super_types[] = { |
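
super_1_allow_new_offset() only has to police moves toward the start of the device: with 1.0 metadata nothing lives there, but 1.1/1.2 must keep the data clear of 36K of superblock plus padding, the internal write-intent bitmap, and the bad-block log. An illustrative check, not the kernel function, assuming 4K pages (8 sectors per bitmap page) and sizes in 512-byte sectors:

    #include <stdbool.h>

    #define SECTORS_PER_PAGE 8      /* assumes 4K pages */

    /* bitmap_pages == 0 means there is no internal bitmap. */
    static bool new_offset_is_safe(long long new_offset, long long data_offset,
                                   int minor_version, long long sb_start,
                                   long long bitmap_offset, long long bitmap_pages,
                                   long long badblocks_end)
    {
            if (new_offset >= data_offset)
                    return true;                /* moving away from the metadata */
            if (minor_version == 0)
                    return true;                /* 1.0: metadata is at the end */
            if (sb_start + (32 + 4) * 2 > new_offset)
                    return false;               /* 36K of superblock + padding */
            if (bitmap_pages &&
                sb_start + bitmap_offset + bitmap_pages * SECTORS_PER_PAGE > new_offset)
                    return false;               /* internal write-intent bitmap */
            if (badblocks_end > new_offset)
                    return false;               /* bad-block log */
            return true;
    }
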
| @@ -1894,6 +2006,7 @@ static struct super_type super_types[] = { | |||
| 1894 | .validate_super = super_90_validate, | 2006 | .validate_super = super_90_validate, |
| 1895 | .sync_super = super_90_sync, | 2007 | .sync_super = super_90_sync, |
| 1896 | .rdev_size_change = super_90_rdev_size_change, | 2008 | .rdev_size_change = super_90_rdev_size_change, |
| 2009 | .allow_new_offset = super_90_allow_new_offset, | ||
| 1897 | }, | 2010 | }, |
| 1898 | [1] = { | 2011 | [1] = { |
| 1899 | .name = "md-1", | 2012 | .name = "md-1", |
| @@ -1902,6 +2015,7 @@ static struct super_type super_types[] = { | |||
| 1902 | .validate_super = super_1_validate, | 2015 | .validate_super = super_1_validate, |
| 1903 | .sync_super = super_1_sync, | 2016 | .sync_super = super_1_sync, |
| 1904 | .rdev_size_change = super_1_rdev_size_change, | 2017 | .rdev_size_change = super_1_rdev_size_change, |
| 2018 | .allow_new_offset = super_1_allow_new_offset, | ||
| 1905 | }, | 2019 | }, |
| 1906 | }; | 2020 | }; |
| 1907 | 2021 | ||
| @@ -2105,9 +2219,7 @@ static void unbind_rdev_from_array(struct md_rdev * rdev) | |||
| 2105 | sysfs_remove_link(&rdev->kobj, "block"); | 2219 | sysfs_remove_link(&rdev->kobj, "block"); |
| 2106 | sysfs_put(rdev->sysfs_state); | 2220 | sysfs_put(rdev->sysfs_state); |
| 2107 | rdev->sysfs_state = NULL; | 2221 | rdev->sysfs_state = NULL; |
| 2108 | kfree(rdev->badblocks.page); | ||
| 2109 | rdev->badblocks.count = 0; | 2222 | rdev->badblocks.count = 0; |
| 2110 | rdev->badblocks.page = NULL; | ||
| 2111 | /* We need to delay this, otherwise we can deadlock when | 2223 | /* We need to delay this, otherwise we can deadlock when |
| 2112 | * writing to 'remove' to "dev/state". We also need | 2224 | * writing to 'remove' to "dev/state". We also need |
| 2113 | * to delay it due to rcu usage. | 2225 | * to delay it due to rcu usage. |
| @@ -2158,7 +2270,7 @@ static void export_rdev(struct md_rdev * rdev) | |||
| 2158 | bdevname(rdev->bdev,b)); | 2270 | bdevname(rdev->bdev,b)); |
| 2159 | if (rdev->mddev) | 2271 | if (rdev->mddev) |
| 2160 | MD_BUG(); | 2272 | MD_BUG(); |
| 2161 | free_disk_sb(rdev); | 2273 | md_rdev_clear(rdev); |
| 2162 | #ifndef MODULE | 2274 | #ifndef MODULE |
| 2163 | if (test_bit(AutoDetected, &rdev->flags)) | 2275 | if (test_bit(AutoDetected, &rdev->flags)) |
| 2164 | md_autodetect_dev(rdev->bdev->bd_dev); | 2276 | md_autodetect_dev(rdev->bdev->bd_dev); |
| @@ -2809,9 +2921,8 @@ offset_show(struct md_rdev *rdev, char *page) | |||
| 2809 | static ssize_t | 2921 | static ssize_t |
| 2810 | offset_store(struct md_rdev *rdev, const char *buf, size_t len) | 2922 | offset_store(struct md_rdev *rdev, const char *buf, size_t len) |
| 2811 | { | 2923 | { |
| 2812 | char *e; | 2924 | unsigned long long offset; |
| 2813 | unsigned long long offset = simple_strtoull(buf, &e, 10); | 2925 | if (strict_strtoull(buf, 10, &offset) < 0) |
| 2814 | if (e==buf || (*e && *e != '\n')) | ||
| 2815 | return -EINVAL; | 2926 | return -EINVAL; |
| 2816 | if (rdev->mddev->pers && rdev->raid_disk >= 0) | 2927 | if (rdev->mddev->pers && rdev->raid_disk >= 0) |
| 2817 | return -EBUSY; | 2928 | return -EBUSY; |
| @@ -2826,6 +2937,63 @@ offset_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
| 2826 | static struct rdev_sysfs_entry rdev_offset = | 2937 | static struct rdev_sysfs_entry rdev_offset = |
| 2827 | __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); | 2938 | __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); |
| 2828 | 2939 | ||
| 2940 | static ssize_t new_offset_show(struct md_rdev *rdev, char *page) | ||
| 2941 | { | ||
| 2942 | return sprintf(page, "%llu\n", | ||
| 2943 | (unsigned long long)rdev->new_data_offset); | ||
| 2944 | } | ||
| 2945 | |||
| 2946 | static ssize_t new_offset_store(struct md_rdev *rdev, | ||
| 2947 | const char *buf, size_t len) | ||
| 2948 | { | ||
| 2949 | unsigned long long new_offset; | ||
| 2950 | struct mddev *mddev = rdev->mddev; | ||
| 2951 | |||
| 2952 | if (strict_strtoull(buf, 10, &new_offset) < 0) | ||
| 2953 | return -EINVAL; | ||
| 2954 | |||
| 2955 | if (mddev->sync_thread) | ||
| 2956 | return -EBUSY; | ||
| 2957 | if (new_offset == rdev->data_offset) | ||
| 2958 | /* reset is always permitted */ | ||
| 2959 | ; | ||
| 2960 | else if (new_offset > rdev->data_offset) { | ||
| 2961 | /* must not push array size beyond rdev_sectors */ | ||
| 2962 | if (new_offset - rdev->data_offset | ||
| 2963 | + mddev->dev_sectors > rdev->sectors) | ||
| 2964 | return -E2BIG; | ||
| 2965 | } | ||
| 2966 | /* Metadata worries about other space details. */ | ||
| 2967 | |||
| 2968 | /* decreasing the offset is inconsistent with a backwards | ||
| 2969 | * reshape. | ||
| 2970 | */ | ||
| 2971 | if (new_offset < rdev->data_offset && | ||
| 2972 | mddev->reshape_backwards) | ||
| 2973 | return -EINVAL; | ||
| 2974 | /* Increasing offset is inconsistent with forwards | ||
| 2975 | * reshape. reshape_direction should be set to | ||
| 2976 | * 'backwards' first. | ||
| 2977 | */ | ||
| 2978 | if (new_offset > rdev->data_offset && | ||
| 2979 | !mddev->reshape_backwards) | ||
| 2980 | return -EINVAL; | ||
| 2981 | |||
| 2982 | if (mddev->pers && mddev->persistent && | ||
| 2983 | !super_types[mddev->major_version] | ||
| 2984 | .allow_new_offset(rdev, new_offset)) | ||
| 2985 | return -E2BIG; | ||
| 2986 | rdev->new_data_offset = new_offset; | ||
| 2987 | if (new_offset > rdev->data_offset) | ||
| 2988 | mddev->reshape_backwards = 1; | ||
| 2989 | else if (new_offset < rdev->data_offset) | ||
| 2990 | mddev->reshape_backwards = 0; | ||
| 2991 | |||
| 2992 | return len; | ||
| 2993 | } | ||
| 2994 | static struct rdev_sysfs_entry rdev_new_offset = | ||
| 2995 | __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); | ||
| 2996 | |||
| 2829 | static ssize_t | 2997 | static ssize_t |
| 2830 | rdev_size_show(struct md_rdev *rdev, char *page) | 2998 | rdev_size_show(struct md_rdev *rdev, char *page) |
| 2831 | { | 2999 | { |
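
The new sysfs attribute accepts an offset change only when it is consistent with the array state: resetting to data_offset is always allowed, growing the offset must still leave room for dev_sectors and requires the reshape direction to already be 'backwards', shrinking it requires 'forwards', and the metadata handler gets the final say. A condensed model of those rules; metadata_ok folds in the per-format allow_new_offset() callback purely for illustration.

    #include <errno.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Returns 0 if the requested offset is acceptable and records which way
     * the data will move. */
    static int check_new_offset(uint64_t new_offset, uint64_t data_offset,
                                uint64_t dev_sectors, uint64_t rdev_sectors,
                                bool metadata_ok, int *reshape_backwards)
    {
            if (new_offset > data_offset &&
                new_offset - data_offset + dev_sectors > rdev_sectors)
                    return -E2BIG;      /* array data would run off the device */
            if (new_offset < data_offset && *reshape_backwards)
                    return -EINVAL;     /* shrinking needs a forwards reshape */
            if (new_offset > data_offset && !*reshape_backwards)
                    return -EINVAL;     /* growing needs 'backwards' set first */
            if (!metadata_ok)
                    return -E2BIG;
            if (new_offset > data_offset)
                    *reshape_backwards = 1;
            else if (new_offset < data_offset)
                    *reshape_backwards = 0;
            return 0;
    }
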
| @@ -2870,6 +3038,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
| 2870 | 3038 | ||
| 2871 | if (strict_blocks_to_sectors(buf, §ors) < 0) | 3039 | if (strict_blocks_to_sectors(buf, §ors) < 0) |
| 2872 | return -EINVAL; | 3040 | return -EINVAL; |
| 3041 | if (rdev->data_offset != rdev->new_data_offset) | ||
| 3042 | return -EINVAL; /* too confusing */ | ||
| 2873 | if (my_mddev->pers && rdev->raid_disk >= 0) { | 3043 | if (my_mddev->pers && rdev->raid_disk >= 0) { |
| 2874 | if (my_mddev->persistent) { | 3044 | if (my_mddev->persistent) { |
| 2875 | sectors = super_types[my_mddev->major_version]. | 3045 | sectors = super_types[my_mddev->major_version]. |
| @@ -3006,6 +3176,7 @@ static struct attribute *rdev_default_attrs[] = { | |||
| 3006 | &rdev_errors.attr, | 3176 | &rdev_errors.attr, |
| 3007 | &rdev_slot.attr, | 3177 | &rdev_slot.attr, |
| 3008 | &rdev_offset.attr, | 3178 | &rdev_offset.attr, |
| 3179 | &rdev_new_offset.attr, | ||
| 3009 | &rdev_size.attr, | 3180 | &rdev_size.attr, |
| 3010 | &rdev_recovery_start.attr, | 3181 | &rdev_recovery_start.attr, |
| 3011 | &rdev_bad_blocks.attr, | 3182 | &rdev_bad_blocks.attr, |
| @@ -3080,6 +3251,7 @@ int md_rdev_init(struct md_rdev *rdev) | |||
| 3080 | rdev->raid_disk = -1; | 3251 | rdev->raid_disk = -1; |
| 3081 | rdev->flags = 0; | 3252 | rdev->flags = 0; |
| 3082 | rdev->data_offset = 0; | 3253 | rdev->data_offset = 0; |
| 3254 | rdev->new_data_offset = 0; | ||
| 3083 | rdev->sb_events = 0; | 3255 | rdev->sb_events = 0; |
| 3084 | rdev->last_read_error.tv_sec = 0; | 3256 | rdev->last_read_error.tv_sec = 0; |
| 3085 | rdev->last_read_error.tv_nsec = 0; | 3257 | rdev->last_read_error.tv_nsec = 0; |
| @@ -3178,8 +3350,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe | |||
| 3178 | abort_free: | 3350 | abort_free: |
| 3179 | if (rdev->bdev) | 3351 | if (rdev->bdev) |
| 3180 | unlock_rdev(rdev); | 3352 | unlock_rdev(rdev); |
| 3181 | free_disk_sb(rdev); | 3353 | md_rdev_clear(rdev); |
| 3182 | kfree(rdev->badblocks.page); | ||
| 3183 | kfree(rdev); | 3354 | kfree(rdev); |
| 3184 | return ERR_PTR(err); | 3355 | return ERR_PTR(err); |
| 3185 | } | 3356 | } |
| @@ -3419,6 +3590,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 3419 | mddev->new_chunk_sectors = mddev->chunk_sectors; | 3590 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 3420 | mddev->raid_disks -= mddev->delta_disks; | 3591 | mddev->raid_disks -= mddev->delta_disks; |
| 3421 | mddev->delta_disks = 0; | 3592 | mddev->delta_disks = 0; |
| 3593 | mddev->reshape_backwards = 0; | ||
| 3422 | module_put(pers->owner); | 3594 | module_put(pers->owner); |
| 3423 | printk(KERN_WARNING "md: %s: %s would not accept array\n", | 3595 | printk(KERN_WARNING "md: %s: %s would not accept array\n", |
| 3424 | mdname(mddev), clevel); | 3596 | mdname(mddev), clevel); |
| @@ -3492,6 +3664,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 3492 | mddev->layout = mddev->new_layout; | 3664 | mddev->layout = mddev->new_layout; |
| 3493 | mddev->chunk_sectors = mddev->new_chunk_sectors; | 3665 | mddev->chunk_sectors = mddev->new_chunk_sectors; |
| 3494 | mddev->delta_disks = 0; | 3666 | mddev->delta_disks = 0; |
| 3667 | mddev->reshape_backwards = 0; | ||
| 3495 | mddev->degraded = 0; | 3668 | mddev->degraded = 0; |
| 3496 | if (mddev->pers->sync_request == NULL) { | 3669 | if (mddev->pers->sync_request == NULL) { |
| 3497 | /* this is now an array without redundancy, so | 3670 | /* this is now an array without redundancy, so |
| @@ -3501,10 +3674,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 3501 | del_timer_sync(&mddev->safemode_timer); | 3674 | del_timer_sync(&mddev->safemode_timer); |
| 3502 | } | 3675 | } |
| 3503 | pers->run(mddev); | 3676 | pers->run(mddev); |
| 3504 | mddev_resume(mddev); | ||
| 3505 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 3677 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
| 3506 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 3678 | mddev_resume(mddev); |
| 3507 | md_wakeup_thread(mddev->thread); | ||
| 3508 | sysfs_notify(&mddev->kobj, NULL, "level"); | 3679 | sysfs_notify(&mddev->kobj, NULL, "level"); |
| 3509 | md_new_event(mddev); | 3680 | md_new_event(mddev); |
| 3510 | return rv; | 3681 | return rv; |
| @@ -3582,9 +3753,20 @@ raid_disks_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 3582 | if (mddev->pers) | 3753 | if (mddev->pers) |
| 3583 | rv = update_raid_disks(mddev, n); | 3754 | rv = update_raid_disks(mddev, n); |
| 3584 | else if (mddev->reshape_position != MaxSector) { | 3755 | else if (mddev->reshape_position != MaxSector) { |
| 3756 | struct md_rdev *rdev; | ||
| 3585 | int olddisks = mddev->raid_disks - mddev->delta_disks; | 3757 | int olddisks = mddev->raid_disks - mddev->delta_disks; |
| 3758 | |||
| 3759 | rdev_for_each(rdev, mddev) { | ||
| 3760 | if (olddisks < n && | ||
| 3761 | rdev->data_offset < rdev->new_data_offset) | ||
| 3762 | return -EINVAL; | ||
| 3763 | if (olddisks > n && | ||
| 3764 | rdev->data_offset > rdev->new_data_offset) | ||
| 3765 | return -EINVAL; | ||
| 3766 | } | ||
| 3586 | mddev->delta_disks = n - olddisks; | 3767 | mddev->delta_disks = n - olddisks; |
| 3587 | mddev->raid_disks = n; | 3768 | mddev->raid_disks = n; |
| 3769 | mddev->reshape_backwards = (mddev->delta_disks < 0); | ||
| 3588 | } else | 3770 | } else |
| 3589 | mddev->raid_disks = n; | 3771 | mddev->raid_disks = n; |
| 3590 | return rv ? rv : len; | 3772 | return rv ? rv : len; |
| @@ -4266,7 +4448,8 @@ sync_completed_show(struct mddev *mddev, char *page) | |||
| 4266 | if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 4448 | if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
| 4267 | return sprintf(page, "none\n"); | 4449 | return sprintf(page, "none\n"); |
| 4268 | 4450 | ||
| 4269 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 4451 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || |
| 4452 | test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) | ||
| 4270 | max_sectors = mddev->resync_max_sectors; | 4453 | max_sectors = mddev->resync_max_sectors; |
| 4271 | else | 4454 | else |
| 4272 | max_sectors = mddev->dev_sectors; | 4455 | max_sectors = mddev->dev_sectors; |
| @@ -4428,6 +4611,7 @@ reshape_position_show(struct mddev *mddev, char *page) | |||
| 4428 | static ssize_t | 4611 | static ssize_t |
| 4429 | reshape_position_store(struct mddev *mddev, const char *buf, size_t len) | 4612 | reshape_position_store(struct mddev *mddev, const char *buf, size_t len) |
| 4430 | { | 4613 | { |
| 4614 | struct md_rdev *rdev; | ||
| 4431 | char *e; | 4615 | char *e; |
| 4432 | unsigned long long new = simple_strtoull(buf, &e, 10); | 4616 | unsigned long long new = simple_strtoull(buf, &e, 10); |
| 4433 | if (mddev->pers) | 4617 | if (mddev->pers) |
| @@ -4436,9 +4620,12 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 4436 | return -EINVAL; | 4620 | return -EINVAL; |
| 4437 | mddev->reshape_position = new; | 4621 | mddev->reshape_position = new; |
| 4438 | mddev->delta_disks = 0; | 4622 | mddev->delta_disks = 0; |
| 4623 | mddev->reshape_backwards = 0; | ||
| 4439 | mddev->new_level = mddev->level; | 4624 | mddev->new_level = mddev->level; |
| 4440 | mddev->new_layout = mddev->layout; | 4625 | mddev->new_layout = mddev->layout; |
| 4441 | mddev->new_chunk_sectors = mddev->chunk_sectors; | 4626 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 4627 | rdev_for_each(rdev, mddev) | ||
| 4628 | rdev->new_data_offset = rdev->data_offset; | ||
| 4442 | return len; | 4629 | return len; |
| 4443 | } | 4630 | } |
| 4444 | 4631 | ||
| @@ -4447,6 +4634,42 @@ __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, | |||
| 4447 | reshape_position_store); | 4634 | reshape_position_store); |
| 4448 | 4635 | ||
| 4449 | static ssize_t | 4636 | static ssize_t |
| 4637 | reshape_direction_show(struct mddev *mddev, char *page) | ||
| 4638 | { | ||
| 4639 | return sprintf(page, "%s\n", | ||
| 4640 | mddev->reshape_backwards ? "backwards" : "forwards"); | ||
| 4641 | } | ||
| 4642 | |||
| 4643 | static ssize_t | ||
| 4644 | reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) | ||
| 4645 | { | ||
| 4646 | int backwards = 0; | ||
| 4647 | if (cmd_match(buf, "forwards")) | ||
| 4648 | backwards = 0; | ||
| 4649 | else if (cmd_match(buf, "backwards")) | ||
| 4650 | backwards = 1; | ||
| 4651 | else | ||
| 4652 | return -EINVAL; | ||
| 4653 | if (mddev->reshape_backwards == backwards) | ||
| 4654 | return len; | ||
| 4655 | |||
| 4656 | /* check if we are allowed to change */ | ||
| 4657 | if (mddev->delta_disks) | ||
| 4658 | return -EBUSY; | ||
| 4659 | |||
| 4660 | if (mddev->persistent && | ||
| 4661 | mddev->major_version == 0) | ||
| 4662 | return -EINVAL; | ||
| 4663 | |||
| 4664 | mddev->reshape_backwards = backwards; | ||
| 4665 | return len; | ||
| 4666 | } | ||
| 4667 | |||
| 4668 | static struct md_sysfs_entry md_reshape_direction = | ||
| 4669 | __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, | ||
| 4670 | reshape_direction_store); | ||
| 4671 | |||
| 4672 | static ssize_t | ||
| 4450 | array_size_show(struct mddev *mddev, char *page) | 4673 | array_size_show(struct mddev *mddev, char *page) |
| 4451 | { | 4674 | { |
| 4452 | if (mddev->external_size) | 4675 | if (mddev->external_size) |
| @@ -4501,6 +4724,7 @@ static struct attribute *md_default_attrs[] = { | |||
| 4501 | &md_safe_delay.attr, | 4724 | &md_safe_delay.attr, |
| 4502 | &md_array_state.attr, | 4725 | &md_array_state.attr, |
| 4503 | &md_reshape_position.attr, | 4726 | &md_reshape_position.attr, |
| 4727 | &md_reshape_direction.attr, | ||
| 4504 | &md_array_size.attr, | 4728 | &md_array_size.attr, |
| 4505 | &max_corr_read_errors.attr, | 4729 | &max_corr_read_errors.attr, |
| 4506 | NULL, | 4730 | NULL, |
| @@ -4914,7 +5138,8 @@ int md_run(struct mddev *mddev) | |||
| 4914 | err = -EINVAL; | 5138 | err = -EINVAL; |
| 4915 | mddev->pers->stop(mddev); | 5139 | mddev->pers->stop(mddev); |
| 4916 | } | 5140 | } |
| 4917 | if (err == 0 && mddev->pers->sync_request) { | 5141 | if (err == 0 && mddev->pers->sync_request && |
| 5142 | (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { | ||
| 4918 | err = bitmap_create(mddev); | 5143 | err = bitmap_create(mddev); |
| 4919 | if (err) { | 5144 | if (err) { |
| 4920 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", | 5145 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", |
| @@ -5064,6 +5289,7 @@ static void md_clean(struct mddev *mddev) | |||
| 5064 | mddev->events = 0; | 5289 | mddev->events = 0; |
| 5065 | mddev->can_decrease_events = 0; | 5290 | mddev->can_decrease_events = 0; |
| 5066 | mddev->delta_disks = 0; | 5291 | mddev->delta_disks = 0; |
| 5292 | mddev->reshape_backwards = 0; | ||
| 5067 | mddev->new_level = LEVEL_NONE; | 5293 | mddev->new_level = LEVEL_NONE; |
| 5068 | mddev->new_layout = 0; | 5294 | mddev->new_layout = 0; |
| 5069 | mddev->new_chunk_sectors = 0; | 5295 | mddev->new_chunk_sectors = 0; |
| @@ -5079,6 +5305,7 @@ static void md_clean(struct mddev *mddev) | |||
| 5079 | mddev->merge_check_needed = 0; | 5305 | mddev->merge_check_needed = 0; |
| 5080 | mddev->bitmap_info.offset = 0; | 5306 | mddev->bitmap_info.offset = 0; |
| 5081 | mddev->bitmap_info.default_offset = 0; | 5307 | mddev->bitmap_info.default_offset = 0; |
| 5308 | mddev->bitmap_info.default_space = 0; | ||
| 5082 | mddev->bitmap_info.chunksize = 0; | 5309 | mddev->bitmap_info.chunksize = 0; |
| 5083 | mddev->bitmap_info.daemon_sleep = 0; | 5310 | mddev->bitmap_info.daemon_sleep = 0; |
| 5084 | mddev->bitmap_info.max_write_behind = 0; | 5311 | mddev->bitmap_info.max_write_behind = 0; |
| @@ -5421,7 +5648,7 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg) | |||
| 5421 | goto out; | 5648 | goto out; |
| 5422 | 5649 | ||
| 5423 | /* bitmap disabled, zero the first byte and copy out */ | 5650 | /* bitmap disabled, zero the first byte and copy out */ |
| 5424 | if (!mddev->bitmap || !mddev->bitmap->file) { | 5651 | if (!mddev->bitmap || !mddev->bitmap->storage.file) { |
| 5425 | file->pathname[0] = '\0'; | 5652 | file->pathname[0] = '\0'; |
| 5426 | goto copy_out; | 5653 | goto copy_out; |
| 5427 | } | 5654 | } |
| @@ -5430,7 +5657,8 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg) | |||
| 5430 | if (!buf) | 5657 | if (!buf) |
| 5431 | goto out; | 5658 | goto out; |
| 5432 | 5659 | ||
| 5433 | ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); | 5660 | ptr = d_path(&mddev->bitmap->storage.file->f_path, |
| 5661 | buf, sizeof(file->pathname)); | ||
| 5434 | if (IS_ERR(ptr)) | 5662 | if (IS_ERR(ptr)) |
| 5435 | goto out; | 5663 | goto out; |
| 5436 | 5664 | ||
| @@ -5875,6 +6103,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) | |||
| 5875 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 6103 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
| 5876 | 6104 | ||
| 5877 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; | 6105 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; |
| 6106 | mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); | ||
| 5878 | mddev->bitmap_info.offset = 0; | 6107 | mddev->bitmap_info.offset = 0; |
| 5879 | 6108 | ||
| 5880 | mddev->reshape_position = MaxSector; | 6109 | mddev->reshape_position = MaxSector; |
| @@ -5888,6 +6117,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) | |||
| 5888 | mddev->new_chunk_sectors = mddev->chunk_sectors; | 6117 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 5889 | mddev->new_layout = mddev->layout; | 6118 | mddev->new_layout = mddev->layout; |
| 5890 | mddev->delta_disks = 0; | 6119 | mddev->delta_disks = 0; |
| 6120 | mddev->reshape_backwards = 0; | ||
| 5891 | 6121 | ||
| 5892 | return 0; | 6122 | return 0; |
| 5893 | } | 6123 | } |
| @@ -5922,11 +6152,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) | |||
| 5922 | */ | 6152 | */ |
| 5923 | if (mddev->sync_thread) | 6153 | if (mddev->sync_thread) |
| 5924 | return -EBUSY; | 6154 | return -EBUSY; |
| 5925 | if (mddev->bitmap) | 6155 | |
| 5926 | /* Sorry, cannot grow a bitmap yet, just remove it, | ||
| 5927 | * grow, and re-add. | ||
| 5928 | */ | ||
| 5929 | return -EBUSY; | ||
| 5930 | rdev_for_each(rdev, mddev) { | 6156 | rdev_for_each(rdev, mddev) { |
| 5931 | sector_t avail = rdev->sectors; | 6157 | sector_t avail = rdev->sectors; |
| 5932 | 6158 | ||
| @@ -5944,6 +6170,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) | |||
| 5944 | static int update_raid_disks(struct mddev *mddev, int raid_disks) | 6170 | static int update_raid_disks(struct mddev *mddev, int raid_disks) |
| 5945 | { | 6171 | { |
| 5946 | int rv; | 6172 | int rv; |
| 6173 | struct md_rdev *rdev; | ||
| 5947 | /* change the number of raid disks */ | 6174 | /* change the number of raid disks */ |
| 5948 | if (mddev->pers->check_reshape == NULL) | 6175 | if (mddev->pers->check_reshape == NULL) |
| 5949 | return -EINVAL; | 6176 | return -EINVAL; |
| @@ -5952,11 +6179,27 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) | |||
| 5952 | return -EINVAL; | 6179 | return -EINVAL; |
| 5953 | if (mddev->sync_thread || mddev->reshape_position != MaxSector) | 6180 | if (mddev->sync_thread || mddev->reshape_position != MaxSector) |
| 5954 | return -EBUSY; | 6181 | return -EBUSY; |
| 6182 | |||
| 6183 | rdev_for_each(rdev, mddev) { | ||
| 6184 | if (mddev->raid_disks < raid_disks && | ||
| 6185 | rdev->data_offset < rdev->new_data_offset) | ||
| 6186 | return -EINVAL; | ||
| 6187 | if (mddev->raid_disks > raid_disks && | ||
| 6188 | rdev->data_offset > rdev->new_data_offset) | ||
| 6189 | return -EINVAL; | ||
| 6190 | } | ||
| 6191 | |||
| 5955 | mddev->delta_disks = raid_disks - mddev->raid_disks; | 6192 | mddev->delta_disks = raid_disks - mddev->raid_disks; |
| 6193 | if (mddev->delta_disks < 0) | ||
| 6194 | mddev->reshape_backwards = 1; | ||
| 6195 | else if (mddev->delta_disks > 0) | ||
| 6196 | mddev->reshape_backwards = 0; | ||
| 5956 | 6197 | ||
| 5957 | rv = mddev->pers->check_reshape(mddev); | 6198 | rv = mddev->pers->check_reshape(mddev); |
| 5958 | if (rv < 0) | 6199 | if (rv < 0) { |
| 5959 | mddev->delta_disks = 0; | 6200 | mddev->delta_disks = 0; |
| 6201 | mddev->reshape_backwards = 0; | ||
| 6202 | } | ||
| 5960 | return rv; | 6203 | return rv; |
| 5961 | } | 6204 | } |
| 5962 | 6205 | ||
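
Both raid_disks_store() and update_raid_disks() now enforce the same invariant before accepting a change in disk count: adding disks may only move the per-device data start earlier (or leave it alone), removing disks may only move it later, and removing disks forces the reshape to run backwards from the end of the array. A toy model of that check, with illustrative names:

    #include <errno.h>
    #include <stdint.h>

    struct dev_offsets {
            uint64_t data_offset;
            uint64_t new_data_offset;
    };

    static int check_disk_count_change(int old_disks, int new_disks,
                                       const struct dev_offsets *devs, int ndevs,
                                       int *reshape_backwards)
    {
            for (int i = 0; i < ndevs; i++) {
                    if (old_disks < new_disks &&
                        devs[i].data_offset < devs[i].new_data_offset)
                            return -EINVAL;     /* adding disks: new <= old only */
                    if (old_disks > new_disks &&
                        devs[i].data_offset > devs[i].new_data_offset)
                            return -EINVAL;     /* removing disks: new >= old only */
            }
            if (new_disks < old_disks)
                    *reshape_backwards = 1;
            else if (new_disks > old_disks)
                    *reshape_backwards = 0;
            return 0;
    }
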
| @@ -6039,6 +6282,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) | |||
| 6039 | return -EINVAL; | 6282 | return -EINVAL; |
| 6040 | mddev->bitmap_info.offset = | 6283 | mddev->bitmap_info.offset = |
| 6041 | mddev->bitmap_info.default_offset; | 6284 | mddev->bitmap_info.default_offset; |
| 6285 | mddev->bitmap_info.space = | ||
| 6286 | mddev->bitmap_info.default_space; | ||
| 6042 | mddev->pers->quiesce(mddev, 1); | 6287 | mddev->pers->quiesce(mddev, 1); |
| 6043 | rv = bitmap_create(mddev); | 6288 | rv = bitmap_create(mddev); |
| 6044 | if (!rv) | 6289 | if (!rv) |
| @@ -6050,7 +6295,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) | |||
| 6050 | /* remove the bitmap */ | 6295 | /* remove the bitmap */ |
| 6051 | if (!mddev->bitmap) | 6296 | if (!mddev->bitmap) |
| 6052 | return -ENOENT; | 6297 | return -ENOENT; |
| 6053 | if (mddev->bitmap->file) | 6298 | if (mddev->bitmap->storage.file) |
| 6054 | return -EINVAL; | 6299 | return -EINVAL; |
| 6055 | mddev->pers->quiesce(mddev, 1); | 6300 | mddev->pers->quiesce(mddev, 1); |
| 6056 | bitmap_destroy(mddev); | 6301 | bitmap_destroy(mddev); |
| @@ -6373,6 +6618,9 @@ static int md_open(struct block_device *bdev, fmode_t mode) | |||
| 6373 | struct mddev *mddev = mddev_find(bdev->bd_dev); | 6618 | struct mddev *mddev = mddev_find(bdev->bd_dev); |
| 6374 | int err; | 6619 | int err; |
| 6375 | 6620 | ||
| 6621 | if (!mddev) | ||
| 6622 | return -ENODEV; | ||
| 6623 | |||
| 6376 | if (mddev->gendisk != bdev->bd_disk) { | 6624 | if (mddev->gendisk != bdev->bd_disk) { |
| 6377 | /* we are racing with mddev_put which is discarding this | 6625 | /* we are racing with mddev_put which is discarding this |
| 6378 | * bd_disk. | 6626 | * bd_disk. |
| @@ -6584,7 +6832,8 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev) | |||
| 6584 | 6832 | ||
| 6585 | resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); | 6833 | resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); |
| 6586 | 6834 | ||
| 6587 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 6835 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || |
| 6836 | test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) | ||
| 6588 | max_sectors = mddev->resync_max_sectors; | 6837 | max_sectors = mddev->resync_max_sectors; |
| 6589 | else | 6838 | else |
| 6590 | max_sectors = mddev->dev_sectors; | 6839 | max_sectors = mddev->dev_sectors; |
| @@ -7147,7 +7396,7 @@ void md_do_sync(struct mddev *mddev) | |||
| 7147 | j = mddev->recovery_cp; | 7396 | j = mddev->recovery_cp; |
| 7148 | 7397 | ||
| 7149 | } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) | 7398 | } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) |
| 7150 | max_sectors = mddev->dev_sectors; | 7399 | max_sectors = mddev->resync_max_sectors; |
| 7151 | else { | 7400 | else { |
| 7152 | /* recovery follows the physical size of devices */ | 7401 | /* recovery follows the physical size of devices */ |
| 7153 | max_sectors = mddev->dev_sectors; | 7402 | max_sectors = mddev->dev_sectors; |
| @@ -7598,7 +7847,7 @@ void md_check_recovery(struct mddev *mddev) | |||
| 7598 | goto unlock; | 7847 | goto unlock; |
| 7599 | 7848 | ||
| 7600 | if (mddev->pers->sync_request) { | 7849 | if (mddev->pers->sync_request) { |
| 7601 | if (spares && mddev->bitmap && ! mddev->bitmap->file) { | 7850 | if (spares) { |
| 7602 | /* We are adding a device or devices to an array | 7851 | /* We are adding a device or devices to an array |
| 7603 | * which has the bitmap stored on all devices. | 7852 | * which has the bitmap stored on all devices. |
| 7604 | * So make sure all bitmap pages get written | 7853 | * So make sure all bitmap pages get written |
| @@ -7646,6 +7895,20 @@ void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) | |||
| 7646 | } | 7895 | } |
| 7647 | EXPORT_SYMBOL(md_wait_for_blocked_rdev); | 7896 | EXPORT_SYMBOL(md_wait_for_blocked_rdev); |
| 7648 | 7897 | ||
| 7898 | void md_finish_reshape(struct mddev *mddev) | ||
| 7899 | { | ||
| 7900 | /* called by the personality module when reshape completes. */ | ||
| 7901 | struct md_rdev *rdev; | ||
| 7902 | |||
| 7903 | rdev_for_each(rdev, mddev) { | ||
| 7904 | if (rdev->data_offset > rdev->new_data_offset) | ||
| 7905 | rdev->sectors += rdev->data_offset - rdev->new_data_offset; | ||
| 7906 | else | ||
| 7907 | rdev->sectors -= rdev->new_data_offset - rdev->data_offset; | ||
| 7908 | rdev->data_offset = rdev->new_data_offset; | ||
| 7909 | } | ||
| 7910 | } | ||
| 7911 | EXPORT_SYMBOL(md_finish_reshape); | ||
| 7649 | 7912 | ||
| 7650 | /* Bad block management. | 7913 | /* Bad block management. |
| 7651 | * We can record which blocks on each device are 'bad' and so just | 7914 | * We can record which blocks on each device are 'bad' and so just |
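
md_finish_reshape() is where the temporary new_data_offset becomes permanent: each device's usable size is adjusted by however far its data start moved, and the old offset is overwritten. A standalone sketch of that per-device bookkeeping:

    #include <stdint.h>

    /* The usable size changes by however far the data start moved, then the
     * old offset is replaced by the new one. */
    static void finish_reshape_one(uint64_t *data_offset, uint64_t *new_data_offset,
                                   uint64_t *sectors)
    {
            if (*data_offset > *new_data_offset)
                    *sectors += *data_offset - *new_data_offset;
            else
                    *sectors -= *new_data_offset - *data_offset;
            *data_offset = *new_data_offset;
    }
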
| @@ -7894,10 +8157,15 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, | |||
| 7894 | } | 8157 | } |
| 7895 | 8158 | ||
| 7896 | int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, | 8159 | int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, |
| 7897 | int acknowledged) | 8160 | int is_new) |
| 7898 | { | 8161 | { |
| 7899 | int rv = md_set_badblocks(&rdev->badblocks, | 8162 | int rv; |
| 7900 | s + rdev->data_offset, sectors, acknowledged); | 8163 | if (is_new) |
| 8164 | s += rdev->new_data_offset; | ||
| 8165 | else | ||
| 8166 | s += rdev->data_offset; | ||
| 8167 | rv = md_set_badblocks(&rdev->badblocks, | ||
| 8168 | s, sectors, 0); | ||
| 7901 | if (rv) { | 8169 | if (rv) { |
| 7902 | /* Make sure they get written out promptly */ | 8170 | /* Make sure they get written out promptly */ |
| 7903 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 8171 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
| @@ -8003,11 +8271,15 @@ out: | |||
| 8003 | return rv; | 8271 | return rv; |
| 8004 | } | 8272 | } |
| 8005 | 8273 | ||
| 8006 | int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors) | 8274 | int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, |
| 8275 | int is_new) | ||
| 8007 | { | 8276 | { |
| 8277 | if (is_new) | ||
| 8278 | s += rdev->new_data_offset; | ||
| 8279 | else | ||
| 8280 | s += rdev->data_offset; | ||
| 8008 | return md_clear_badblocks(&rdev->badblocks, | 8281 | return md_clear_badblocks(&rdev->badblocks, |
| 8009 | s + rdev->data_offset, | 8282 | s, sectors); |
| 8010 | sectors); | ||
| 8011 | } | 8283 | } |
| 8012 | EXPORT_SYMBOL_GPL(rdev_clear_badblocks); | 8284 | EXPORT_SYMBOL_GPL(rdev_clear_badblocks); |
| 8013 | 8285 | ||
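
The bad-block helpers' old 'acknowledged' argument becomes 'is_new': callers now say whether the sector was addressed in the old or the post-reshape layout, and the helpers translate it to a device-relative sector with the matching offset before touching the table. A minimal illustration of that translation:

    #include <stdbool.h>
    #include <stdint.h>

    /* Pick the offset that matches the layout the I/O was addressed in. */
    static uint64_t badblock_device_sector(uint64_t s, bool is_new,
                                           uint64_t data_offset,
                                           uint64_t new_data_offset)
    {
            return s + (is_new ? new_data_offset : data_offset);
    }
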
diff --git a/drivers/md/md.h b/drivers/md/md.h index 1c2063ccf48e..7b4a3c318cae 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
| @@ -55,6 +55,7 @@ struct md_rdev { | |||
| 55 | int sb_loaded; | 55 | int sb_loaded; |
| 56 | __u64 sb_events; | 56 | __u64 sb_events; |
| 57 | sector_t data_offset; /* start of data in array */ | 57 | sector_t data_offset; /* start of data in array */ |
| 58 | sector_t new_data_offset;/* only relevant while reshaping */ | ||
| 58 | sector_t sb_start; /* offset of the super block (in 512byte sectors) */ | 59 | sector_t sb_start; /* offset of the super block (in 512byte sectors) */ |
| 59 | int sb_size; /* bytes in the superblock */ | 60 | int sb_size; /* bytes in the superblock */ |
| 60 | int preferred_minor; /* autorun support */ | 61 | int preferred_minor; /* autorun support */ |
| @@ -193,8 +194,9 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, | |||
| 193 | return 0; | 194 | return 0; |
| 194 | } | 195 | } |
| 195 | extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, | 196 | extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, |
| 196 | int acknowledged); | 197 | int is_new); |
| 197 | extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors); | 198 | extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, |
| 199 | int is_new); | ||
| 198 | extern void md_ack_all_badblocks(struct badblocks *bb); | 200 | extern void md_ack_all_badblocks(struct badblocks *bb); |
| 199 | 201 | ||
| 200 | struct mddev { | 202 | struct mddev { |
| @@ -262,6 +264,7 @@ struct mddev { | |||
| 262 | sector_t reshape_position; | 264 | sector_t reshape_position; |
| 263 | int delta_disks, new_level, new_layout; | 265 | int delta_disks, new_level, new_layout; |
| 264 | int new_chunk_sectors; | 266 | int new_chunk_sectors; |
| 267 | int reshape_backwards; | ||
| 265 | 268 | ||
| 266 | atomic_t plug_cnt; /* If device is expecting | 269 | atomic_t plug_cnt; /* If device is expecting |
| 267 | * more bios soon. | 270 | * more bios soon. |
| @@ -390,10 +393,13 @@ struct mddev { | |||
| 390 | * For external metadata, offset | 393 | * For external metadata, offset |
| 391 | * from start of device. | 394 | * from start of device. |
| 392 | */ | 395 | */ |
| 396 | unsigned long space; /* space available at this offset */ | ||
| 393 | loff_t default_offset; /* this is the offset to use when | 397 | loff_t default_offset; /* this is the offset to use when |
| 394 | * hot-adding a bitmap. It should | 398 | * hot-adding a bitmap. It should |
| 395 | * eventually be settable by sysfs. | 399 | * eventually be settable by sysfs. |
| 396 | */ | 400 | */ |
| 401 | unsigned long default_space; /* space available at | ||
| 402 | * default offset */ | ||
| 397 | struct mutex mutex; | 403 | struct mutex mutex; |
| 398 | unsigned long chunksize; | 404 | unsigned long chunksize; |
| 399 | unsigned long daemon_sleep; /* how many jiffies between updates? */ | 405 | unsigned long daemon_sleep; /* how many jiffies between updates? */ |
| @@ -591,6 +597,7 @@ extern void md_write_start(struct mddev *mddev, struct bio *bi); | |||
| 591 | extern void md_write_end(struct mddev *mddev); | 597 | extern void md_write_end(struct mddev *mddev); |
| 592 | extern void md_done_sync(struct mddev *mddev, int blocks, int ok); | 598 | extern void md_done_sync(struct mddev *mddev, int blocks, int ok); |
| 593 | extern void md_error(struct mddev *mddev, struct md_rdev *rdev); | 599 | extern void md_error(struct mddev *mddev, struct md_rdev *rdev); |
| 600 | extern void md_finish_reshape(struct mddev *mddev); | ||
| 594 | 601 | ||
| 595 | extern int mddev_congested(struct mddev *mddev, int bits); | 602 | extern int mddev_congested(struct mddev *mddev, int bits); |
| 596 | extern void md_flush_request(struct mddev *mddev, struct bio *bio); | 603 | extern void md_flush_request(struct mddev *mddev, struct bio *bio); |
| @@ -615,6 +622,7 @@ extern int md_run(struct mddev *mddev); | |||
| 615 | extern void md_stop(struct mddev *mddev); | 622 | extern void md_stop(struct mddev *mddev); |
| 616 | extern void md_stop_writes(struct mddev *mddev); | 623 | extern void md_stop_writes(struct mddev *mddev); |
| 617 | extern int md_rdev_init(struct md_rdev *rdev); | 624 | extern int md_rdev_init(struct md_rdev *rdev); |
| 625 | extern void md_rdev_clear(struct md_rdev *rdev); | ||
| 618 | 626 | ||
| 619 | extern void mddev_suspend(struct mddev *mddev); | 627 | extern void mddev_suspend(struct mddev *mddev); |
| 620 | extern void mddev_resume(struct mddev *mddev); | 628 | extern void mddev_resume(struct mddev *mddev); |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 15dd59b84e94..835de7168cd3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
| @@ -1859,7 +1859,9 @@ static void fix_read_error(struct r1conf *conf, int read_disk, | |||
| 1859 | 1859 | ||
| 1860 | rdev = conf->mirrors[d].rdev; | 1860 | rdev = conf->mirrors[d].rdev; |
| 1861 | if (rdev && | 1861 | if (rdev && |
| 1862 | test_bit(In_sync, &rdev->flags) && | 1862 | (test_bit(In_sync, &rdev->flags) || |
| 1863 | (!test_bit(Faulty, &rdev->flags) && | ||
| 1864 | rdev->recovery_offset >= sect + s)) && | ||
| 1863 | is_badblock(rdev, sect, s, | 1865 | is_badblock(rdev, sect, s, |
| 1864 | &first_bad, &bad_sectors) == 0 && | 1866 | &first_bad, &bad_sectors) == 0 && |
| 1865 | sync_page_io(rdev, sect, s<<9, | 1867 | sync_page_io(rdev, sect, s<<9, |
| @@ -2024,7 +2026,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio | |||
| 2024 | continue; | 2026 | continue; |
| 2025 | if (test_bit(BIO_UPTODATE, &bio->bi_flags) && | 2027 | if (test_bit(BIO_UPTODATE, &bio->bi_flags) && |
| 2026 | test_bit(R1BIO_MadeGood, &r1_bio->state)) { | 2028 | test_bit(R1BIO_MadeGood, &r1_bio->state)) { |
| 2027 | rdev_clear_badblocks(rdev, r1_bio->sector, s); | 2029 | rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); |
| 2028 | } | 2030 | } |
| 2029 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && | 2031 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && |
| 2030 | test_bit(R1BIO_WriteError, &r1_bio->state)) { | 2032 | test_bit(R1BIO_WriteError, &r1_bio->state)) { |
| @@ -2044,7 +2046,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) | |||
| 2044 | struct md_rdev *rdev = conf->mirrors[m].rdev; | 2046 | struct md_rdev *rdev = conf->mirrors[m].rdev; |
| 2045 | rdev_clear_badblocks(rdev, | 2047 | rdev_clear_badblocks(rdev, |
| 2046 | r1_bio->sector, | 2048 | r1_bio->sector, |
| 2047 | r1_bio->sectors); | 2049 | r1_bio->sectors, 0); |
| 2048 | rdev_dec_pending(rdev, conf->mddev); | 2050 | rdev_dec_pending(rdev, conf->mddev); |
| 2049 | } else if (r1_bio->bios[m] != NULL) { | 2051 | } else if (r1_bio->bios[m] != NULL) { |
| 2050 | /* This drive got a write error. We need to | 2052 | /* This drive got a write error. We need to |
| @@ -2598,7 +2600,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2598 | if (!disk->rdev || | 2600 | if (!disk->rdev || |
| 2599 | !test_bit(In_sync, &disk->rdev->flags)) { | 2601 | !test_bit(In_sync, &disk->rdev->flags)) { |
| 2600 | disk->head_position = 0; | 2602 | disk->head_position = 0; |
| 2601 | if (disk->rdev) | 2603 | if (disk->rdev && |
| 2604 | (disk->rdev->saved_raid_disk < 0)) | ||
| 2602 | conf->fullsync = 1; | 2605 | conf->fullsync = 1; |
| 2603 | } else if (conf->last_used < 0) | 2606 | } else if (conf->last_used < 0) |
| 2604 | /* | 2607 | /* |
| @@ -2750,9 +2753,16 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) | |||
| 2750 | * any io in the removed space completes, but it hardly seems | 2753 | * any io in the removed space completes, but it hardly seems |
| 2751 | * worth it. | 2754 | * worth it. |
| 2752 | */ | 2755 | */ |
| 2753 | md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0)); | 2756 | sector_t newsize = raid1_size(mddev, sectors, 0); |
| 2754 | if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) | 2757 | if (mddev->external_size && |
| 2758 | mddev->array_sectors > newsize) | ||
| 2755 | return -EINVAL; | 2759 | return -EINVAL; |
| 2760 | if (mddev->bitmap) { | ||
| 2761 | int ret = bitmap_resize(mddev->bitmap, newsize, 0, 0); | ||
| 2762 | if (ret) | ||
| 2763 | return ret; | ||
| 2764 | } | ||
| 2765 | md_set_array_sectors(mddev, newsize); | ||
| 2756 | set_capacity(mddev->gendisk, mddev->array_sectors); | 2766 | set_capacity(mddev->gendisk, mddev->array_sectors); |
| 2757 | revalidate_disk(mddev->gendisk); | 2767 | revalidate_disk(mddev->gendisk); |
| 2758 | if (sectors > mddev->dev_sectors && | 2768 | if (sectors > mddev->dev_sectors && |
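
Together with the update_size() hunk that drops the old "cannot grow a bitmap" refusal, raid1_resize() now resizes the bitmap first and only commits the new array size once that has succeeded, so the bitmap always covers the whole array. A rough outline of that ordering; bitmap_resize_stub() is a placeholder for the kernel's bitmap_resize().

    #include <errno.h>
    #include <stdint.h>

    static int bitmap_resize_stub(uint64_t new_sectors)
    {
            (void)new_sectors;
            return 0;
    }

    /* Grow (or shrink) the bitmap first, then record the new array size. */
    static int resize_array(uint64_t *array_sectors, uint64_t newsize,
                            int external_size, int have_bitmap)
    {
            if (external_size && *array_sectors > newsize)
                    return -EINVAL;     /* may not shrink below the external size */
            if (have_bitmap) {
                    int ret = bitmap_resize_stub(newsize);
                    if (ret)
                            return ret;
            }
            *array_sectors = newsize;
            return 0;
    }
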
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3f91c2e1dfe7..987db37cb875 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
| @@ -24,6 +24,7 @@ | |||
| 24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
| 25 | #include <linux/seq_file.h> | 25 | #include <linux/seq_file.h> |
| 26 | #include <linux/ratelimit.h> | 26 | #include <linux/ratelimit.h> |
| 27 | #include <linux/kthread.h> | ||
| 27 | #include "md.h" | 28 | #include "md.h" |
| 28 | #include "raid10.h" | 29 | #include "raid10.h" |
| 29 | #include "raid0.h" | 30 | #include "raid0.h" |
| @@ -68,6 +69,11 @@ static int max_queued_requests = 1024; | |||
| 68 | static void allow_barrier(struct r10conf *conf); | 69 | static void allow_barrier(struct r10conf *conf); |
| 69 | static void lower_barrier(struct r10conf *conf); | 70 | static void lower_barrier(struct r10conf *conf); |
| 70 | static int enough(struct r10conf *conf, int ignore); | 71 | static int enough(struct r10conf *conf, int ignore); |
| 72 | static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, | ||
| 73 | int *skipped); | ||
| 74 | static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); | ||
| 75 | static void end_reshape_write(struct bio *bio, int error); | ||
| 76 | static void end_reshape(struct r10conf *conf); | ||
| 71 | 77 | ||
| 72 | static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) | 78 | static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) |
| 73 | { | 79 | { |
| @@ -112,7 +118,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
| 112 | if (!r10_bio) | 118 | if (!r10_bio) |
| 113 | return NULL; | 119 | return NULL; |
| 114 | 120 | ||
| 115 | if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) | 121 | if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || |
| 122 | test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) | ||
| 116 | nalloc = conf->copies; /* resync */ | 123 | nalloc = conf->copies; /* resync */ |
| 117 | else | 124 | else |
| 118 | nalloc = 2; /* recovery */ | 125 | nalloc = 2; /* recovery */ |
| @@ -140,9 +147,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
| 140 | struct bio *rbio = r10_bio->devs[j].repl_bio; | 147 | struct bio *rbio = r10_bio->devs[j].repl_bio; |
| 141 | bio = r10_bio->devs[j].bio; | 148 | bio = r10_bio->devs[j].bio; |
| 142 | for (i = 0; i < RESYNC_PAGES; i++) { | 149 | for (i = 0; i < RESYNC_PAGES; i++) { |
| 143 | if (j == 1 && !test_bit(MD_RECOVERY_SYNC, | 150 | if (j > 0 && !test_bit(MD_RECOVERY_SYNC, |
| 144 | &conf->mddev->recovery)) { | 151 | &conf->mddev->recovery)) { |
| 145 | /* we can share bv_page's during recovery */ | 152 | /* we can share bv_page's during recovery |
| 153 | * and reshape */ | ||
| 146 | struct bio *rbio = r10_bio->devs[0].bio; | 154 | struct bio *rbio = r10_bio->devs[0].bio; |
| 147 | page = rbio->bi_io_vec[i].bv_page; | 155 | page = rbio->bi_io_vec[i].bv_page; |
| 148 | get_page(page); | 156 | get_page(page); |
| @@ -165,10 +173,11 @@ out_free_pages: | |||
| 165 | while (j--) | 173 | while (j--) |
| 166 | for (i = 0; i < RESYNC_PAGES ; i++) | 174 | for (i = 0; i < RESYNC_PAGES ; i++) |
| 167 | safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); | 175 | safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); |
| 168 | j = -1; | 176 | j = 0; |
| 169 | out_free_bio: | 177 | out_free_bio: |
| 170 | while (++j < nalloc) { | 178 | for ( ; j < nalloc; j++) { |
| 171 | bio_put(r10_bio->devs[j].bio); | 179 | if (r10_bio->devs[j].bio) |
| 180 | bio_put(r10_bio->devs[j].bio); | ||
| 172 | if (r10_bio->devs[j].repl_bio) | 181 | if (r10_bio->devs[j].repl_bio) |
| 173 | bio_put(r10_bio->devs[j].repl_bio); | 182 | bio_put(r10_bio->devs[j].repl_bio); |
| 174 | } | 183 | } |
| @@ -504,79 +513,96 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
| 504 | * sector offset to a virtual address | 513 | * sector offset to a virtual address |
| 505 | */ | 514 | */ |
| 506 | 515 | ||
| 507 | static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) | 516 | static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) |
| 508 | { | 517 | { |
| 509 | int n,f; | 518 | int n,f; |
| 510 | sector_t sector; | 519 | sector_t sector; |
| 511 | sector_t chunk; | 520 | sector_t chunk; |
| 512 | sector_t stripe; | 521 | sector_t stripe; |
| 513 | int dev; | 522 | int dev; |
| 514 | |||
| 515 | int slot = 0; | 523 | int slot = 0; |
| 516 | 524 | ||
| 517 | /* now calculate first sector/dev */ | 525 | /* now calculate first sector/dev */ |
| 518 | chunk = r10bio->sector >> conf->chunk_shift; | 526 | chunk = r10bio->sector >> geo->chunk_shift; |
| 519 | sector = r10bio->sector & conf->chunk_mask; | 527 | sector = r10bio->sector & geo->chunk_mask; |
| 520 | 528 | ||
| 521 | chunk *= conf->near_copies; | 529 | chunk *= geo->near_copies; |
| 522 | stripe = chunk; | 530 | stripe = chunk; |
| 523 | dev = sector_div(stripe, conf->raid_disks); | 531 | dev = sector_div(stripe, geo->raid_disks); |
| 524 | if (conf->far_offset) | 532 | if (geo->far_offset) |
| 525 | stripe *= conf->far_copies; | 533 | stripe *= geo->far_copies; |
| 526 | 534 | ||
| 527 | sector += stripe << conf->chunk_shift; | 535 | sector += stripe << geo->chunk_shift; |
| 528 | 536 | ||
| 529 | /* and calculate all the others */ | 537 | /* and calculate all the others */ |
| 530 | for (n=0; n < conf->near_copies; n++) { | 538 | for (n = 0; n < geo->near_copies; n++) { |
| 531 | int d = dev; | 539 | int d = dev; |
| 532 | sector_t s = sector; | 540 | sector_t s = sector; |
| 533 | r10bio->devs[slot].addr = sector; | 541 | r10bio->devs[slot].addr = sector; |
| 534 | r10bio->devs[slot].devnum = d; | 542 | r10bio->devs[slot].devnum = d; |
| 535 | slot++; | 543 | slot++; |
| 536 | 544 | ||
| 537 | for (f = 1; f < conf->far_copies; f++) { | 545 | for (f = 1; f < geo->far_copies; f++) { |
| 538 | d += conf->near_copies; | 546 | d += geo->near_copies; |
| 539 | if (d >= conf->raid_disks) | 547 | if (d >= geo->raid_disks) |
| 540 | d -= conf->raid_disks; | 548 | d -= geo->raid_disks; |
| 541 | s += conf->stride; | 549 | s += geo->stride; |
| 542 | r10bio->devs[slot].devnum = d; | 550 | r10bio->devs[slot].devnum = d; |
| 543 | r10bio->devs[slot].addr = s; | 551 | r10bio->devs[slot].addr = s; |
| 544 | slot++; | 552 | slot++; |
| 545 | } | 553 | } |
| 546 | dev++; | 554 | dev++; |
| 547 | if (dev >= conf->raid_disks) { | 555 | if (dev >= geo->raid_disks) { |
| 548 | dev = 0; | 556 | dev = 0; |
| 549 | sector += (conf->chunk_mask + 1); | 557 | sector += (geo->chunk_mask + 1); |
| 550 | } | 558 | } |
| 551 | } | 559 | } |
| 552 | BUG_ON(slot != conf->copies); | 560 | } |
| 561 | |||
| 562 | static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) | ||
| 563 | { | ||
| 564 | struct geom *geo = &conf->geo; | ||
| 565 | |||
| 566 | if (conf->reshape_progress != MaxSector && | ||
| 567 | ((r10bio->sector >= conf->reshape_progress) != | ||
| 568 | conf->mddev->reshape_backwards)) { | ||
| 569 | set_bit(R10BIO_Previous, &r10bio->state); | ||
| 570 | geo = &conf->prev; | ||
| 571 | } else | ||
| 572 | clear_bit(R10BIO_Previous, &r10bio->state); | ||
| 573 | |||
| 574 | __raid10_find_phys(geo, r10bio); | ||
| 553 | } | 575 | } |
| 554 | 576 | ||
| 555 | static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) | 577 | static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) |
| 556 | { | 578 | { |
| 557 | sector_t offset, chunk, vchunk; | 579 | sector_t offset, chunk, vchunk; |
| 580 | /* Never use conf->prev as this is only called during resync | ||
| 581 | * or recovery, so reshape isn't happening | ||
| 582 | */ | ||
| 583 | struct geom *geo = &conf->geo; | ||
| 558 | 584 | ||
| 559 | offset = sector & conf->chunk_mask; | 585 | offset = sector & geo->chunk_mask; |
| 560 | if (conf->far_offset) { | 586 | if (geo->far_offset) { |
| 561 | int fc; | 587 | int fc; |
| 562 | chunk = sector >> conf->chunk_shift; | 588 | chunk = sector >> geo->chunk_shift; |
| 563 | fc = sector_div(chunk, conf->far_copies); | 589 | fc = sector_div(chunk, geo->far_copies); |
| 564 | dev -= fc * conf->near_copies; | 590 | dev -= fc * geo->near_copies; |
| 565 | if (dev < 0) | 591 | if (dev < 0) |
| 566 | dev += conf->raid_disks; | 592 | dev += geo->raid_disks; |
| 567 | } else { | 593 | } else { |
| 568 | while (sector >= conf->stride) { | 594 | while (sector >= geo->stride) { |
| 569 | sector -= conf->stride; | 595 | sector -= geo->stride; |
| 570 | if (dev < conf->near_copies) | 596 | if (dev < geo->near_copies) |
| 571 | dev += conf->raid_disks - conf->near_copies; | 597 | dev += geo->raid_disks - geo->near_copies; |
| 572 | else | 598 | else |
| 573 | dev -= conf->near_copies; | 599 | dev -= geo->near_copies; |
| 574 | } | 600 | } |
| 575 | chunk = sector >> conf->chunk_shift; | 601 | chunk = sector >> geo->chunk_shift; |
| 576 | } | 602 | } |
| 577 | vchunk = chunk * conf->raid_disks + dev; | 603 | vchunk = chunk * geo->raid_disks + dev; |
| 578 | sector_div(vchunk, conf->near_copies); | 604 | sector_div(vchunk, geo->near_copies); |
| 579 | return (vchunk << conf->chunk_shift) + offset; | 605 | return (vchunk << geo->chunk_shift) + offset; |
| 580 | } | 606 | } |
| 581 | 607 | ||
| 582 | /** | 608 | /** |
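The hunk above splits the layout parameters out of r10conf into a struct geom so the mapping code can work against either the old or the new geometry while a reshape is in flight. As a rough, userspace-only sketch of the sector-to-device mapping that __raid10_find_phys() performs, restricted to the simple near-copies case (far_copies == 1), the arithmetic looks like the following; struct geom_sketch and all other names here are invented for the example, not taken from the patch.

#include <stdio.h>

struct geom_sketch {
        int raid_disks;
        int near_copies;
        unsigned long long chunk_sectors;       /* must be a power of two */
};

/* Print the (disk, device-sector) pair for each near copy of 'sector'. */
static void find_phys_sketch(const struct geom_sketch *geo,
                             unsigned long long sector)
{
        unsigned long long chunk = sector / geo->chunk_sectors;
        unsigned long long offset = sector % geo->chunk_sectors;
        unsigned long long stripe = chunk * geo->near_copies;
        int dev = (int)(stripe % geo->raid_disks);
        unsigned long long dev_sector =
                (stripe / geo->raid_disks) * geo->chunk_sectors + offset;
        int n;

        for (n = 0; n < geo->near_copies; n++) {
                printf("copy %d -> disk %d, sector %llu\n", n, dev, dev_sector);
                if (++dev == geo->raid_disks) { /* wrapped onto the next stripe */
                        dev = 0;
                        dev_sector += geo->chunk_sectors;
                }
        }
}

int main(void)
{
        struct geom_sketch geo = { .raid_disks = 4, .near_copies = 2,
                                   .chunk_sectors = 128 };

        find_phys_sketch(&geo, 1000);
        return 0;
}

For a 4-disk layout with two near copies and 128-sector chunks, logical sector 1000 lands on disks 2 and 3 at device sector 488, which matches walking the kernel loop above by hand.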
| @@ -597,10 +623,17 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
| 597 | struct r10conf *conf = mddev->private; | 623 | struct r10conf *conf = mddev->private; |
| 598 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 624 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
| 599 | int max; | 625 | int max; |
| 600 | unsigned int chunk_sectors = mddev->chunk_sectors; | 626 | unsigned int chunk_sectors; |
| 601 | unsigned int bio_sectors = bvm->bi_size >> 9; | 627 | unsigned int bio_sectors = bvm->bi_size >> 9; |
| 628 | struct geom *geo = &conf->geo; | ||
| 629 | |||
| 630 | chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1; | ||
| 631 | if (conf->reshape_progress != MaxSector && | ||
| 632 | ((sector >= conf->reshape_progress) != | ||
| 633 | conf->mddev->reshape_backwards)) | ||
| 634 | geo = &conf->prev; | ||
| 602 | 635 | ||
| 603 | if (conf->near_copies < conf->raid_disks) { | 636 | if (geo->near_copies < geo->raid_disks) { |
| 604 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) | 637 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) |
| 605 | + bio_sectors)) << 9; | 638 | + bio_sectors)) << 9; |
| 606 | if (max < 0) | 639 | if (max < 0) |
| @@ -614,6 +647,12 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
| 614 | if (mddev->merge_check_needed) { | 647 | if (mddev->merge_check_needed) { |
| 615 | struct r10bio r10_bio; | 648 | struct r10bio r10_bio; |
| 616 | int s; | 649 | int s; |
| 650 | if (conf->reshape_progress != MaxSector) { | ||
| 651 | /* Cannot give any guidance during reshape */ | ||
| 652 | if (max <= biovec->bv_len && bio_sectors == 0) | ||
| 653 | return biovec->bv_len; | ||
| 654 | return 0; | ||
| 655 | } | ||
| 617 | r10_bio.sector = sector; | 656 | r10_bio.sector = sector; |
| 618 | raid10_find_phys(conf, &r10_bio); | 657 | raid10_find_phys(conf, &r10_bio); |
| 619 | rcu_read_lock(); | 658 | rcu_read_lock(); |
| @@ -681,6 +720,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, | |||
| 681 | struct md_rdev *rdev, *best_rdev; | 720 | struct md_rdev *rdev, *best_rdev; |
| 682 | int do_balance; | 721 | int do_balance; |
| 683 | int best_slot; | 722 | int best_slot; |
| 723 | struct geom *geo = &conf->geo; | ||
| 684 | 724 | ||
| 685 | raid10_find_phys(conf, r10_bio); | 725 | raid10_find_phys(conf, r10_bio); |
| 686 | rcu_read_lock(); | 726 | rcu_read_lock(); |
| @@ -761,11 +801,11 @@ retry: | |||
| 761 | * sequential read speed for 'far copies' arrays. So only | 801 | * sequential read speed for 'far copies' arrays. So only |
| 762 | * keep it for 'near' arrays, and review those later. | 802 | * keep it for 'near' arrays, and review those later. |
| 763 | */ | 803 | */ |
| 764 | if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) | 804 | if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) |
| 765 | break; | 805 | break; |
| 766 | 806 | ||
| 767 | /* for far > 1 always use the lowest address */ | 807 | /* for far > 1 always use the lowest address */ |
| 768 | if (conf->far_copies > 1) | 808 | if (geo->far_copies > 1) |
| 769 | new_distance = r10_bio->devs[slot].addr; | 809 | new_distance = r10_bio->devs[slot].addr; |
| 770 | else | 810 | else |
| 771 | new_distance = abs(r10_bio->devs[slot].addr - | 811 | new_distance = abs(r10_bio->devs[slot].addr - |
| @@ -812,7 +852,10 @@ static int raid10_congested(void *data, int bits) | |||
| 812 | if (mddev_congested(mddev, bits)) | 852 | if (mddev_congested(mddev, bits)) |
| 813 | return 1; | 853 | return 1; |
| 814 | rcu_read_lock(); | 854 | rcu_read_lock(); |
| 815 | for (i = 0; i < conf->raid_disks && ret == 0; i++) { | 855 | for (i = 0; |
| 856 | (i < conf->geo.raid_disks || i < conf->prev.raid_disks) | ||
| 857 | && ret == 0; | ||
| 858 | i++) { | ||
| 816 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); | 859 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); |
| 817 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | 860 | if (rdev && !test_bit(Faulty, &rdev->flags)) { |
| 818 | struct request_queue *q = bdev_get_queue(rdev->bdev); | 861 | struct request_queue *q = bdev_get_queue(rdev->bdev); |
| @@ -973,13 +1016,24 @@ static void unfreeze_array(struct r10conf *conf) | |||
| 973 | spin_unlock_irq(&conf->resync_lock); | 1016 | spin_unlock_irq(&conf->resync_lock); |
| 974 | } | 1017 | } |
| 975 | 1018 | ||
| 1019 | static sector_t choose_data_offset(struct r10bio *r10_bio, | ||
| 1020 | struct md_rdev *rdev) | ||
| 1021 | { | ||
| 1022 | if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) || | ||
| 1023 | test_bit(R10BIO_Previous, &r10_bio->state)) | ||
| 1024 | return rdev->data_offset; | ||
| 1025 | else | ||
| 1026 | return rdev->new_data_offset; | ||
| 1027 | } | ||
| 1028 | |||
| 976 | static void make_request(struct mddev *mddev, struct bio * bio) | 1029 | static void make_request(struct mddev *mddev, struct bio * bio) |
| 977 | { | 1030 | { |
| 978 | struct r10conf *conf = mddev->private; | 1031 | struct r10conf *conf = mddev->private; |
| 979 | struct r10bio *r10_bio; | 1032 | struct r10bio *r10_bio; |
| 980 | struct bio *read_bio; | 1033 | struct bio *read_bio; |
| 981 | int i; | 1034 | int i; |
| 982 | int chunk_sects = conf->chunk_mask + 1; | 1035 | sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); |
| 1036 | int chunk_sects = chunk_mask + 1; | ||
| 983 | const int rw = bio_data_dir(bio); | 1037 | const int rw = bio_data_dir(bio); |
| 984 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); | 1038 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); |
| 985 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); | 1039 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); |
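choose_data_offset(), added above, is what lets a reshape relocate data to a different offset on each member device: a bio that is still addressed through the previous geometry (R10BIO_Previous set, or no reshape running) keeps using rdev->data_offset, everything else switches to rdev->new_data_offset. A minimal restatement of that decision, with stand-in types and names rather than the patch's own, might look like:

typedef unsigned long long sector_sketch_t;

struct rdev_sketch {
        sector_sketch_t data_offset;            /* start of data in the old layout */
        sector_sketch_t new_data_offset;        /* start of data in the new layout */
};

/* Old offset while the bio is addressed through the previous geometry
 * (or no reshape is running), new offset otherwise.
 */
static sector_sketch_t pick_data_offset(const struct rdev_sketch *rdev,
                                        int reshape_running,
                                        int uses_prev_geometry)
{
        if (!reshape_running || uses_prev_geometry)
                return rdev->data_offset;
        return rdev->new_data_offset;
}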
| @@ -988,6 +1042,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
| 988 | int plugged; | 1042 | int plugged; |
| 989 | int sectors_handled; | 1043 | int sectors_handled; |
| 990 | int max_sectors; | 1044 | int max_sectors; |
| 1045 | int sectors; | ||
| 991 | 1046 | ||
| 992 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { | 1047 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
| 993 | md_flush_request(mddev, bio); | 1048 | md_flush_request(mddev, bio); |
| @@ -997,9 +1052,10 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
| 997 | /* If this request crosses a chunk boundary, we need to | 1052 | /* If this request crosses a chunk boundary, we need to |
| 998 | * split it. This will only happen for 1 PAGE (or less) requests. | 1053 | * split it. This will only happen for 1 PAGE (or less) requests. |
| 999 | */ | 1054 | */ |
| 1000 | if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) | 1055 | if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9) |
| 1001 | > chunk_sects && | 1056 | > chunk_sects |
| 1002 | conf->near_copies < conf->raid_disks)) { | 1057 | && (conf->geo.near_copies < conf->geo.raid_disks |
| 1058 | || conf->prev.near_copies < conf->prev.raid_disks))) { | ||
| 1003 | struct bio_pair *bp; | 1059 | struct bio_pair *bp; |
| 1004 | /* Sanity check -- queue functions should prevent this happening */ | 1060 | /* Sanity check -- queue functions should prevent this happening */ |
| 1005 | if (bio->bi_vcnt != 1 || | 1061 | if (bio->bi_vcnt != 1 || |
| @@ -1051,10 +1107,41 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
| 1051 | */ | 1107 | */ |
| 1052 | wait_barrier(conf); | 1108 | wait_barrier(conf); |
| 1053 | 1109 | ||
| 1110 | sectors = bio->bi_size >> 9; | ||
| 1111 | while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | ||
| 1112 | bio->bi_sector < conf->reshape_progress && | ||
| 1113 | bio->bi_sector + sectors > conf->reshape_progress) { | ||
| 1114 | /* IO spans the reshape position. Need to wait for | ||
| 1115 | * reshape to pass | ||
| 1116 | */ | ||
| 1117 | allow_barrier(conf); | ||
| 1118 | wait_event(conf->wait_barrier, | ||
| 1119 | conf->reshape_progress <= bio->bi_sector || | ||
| 1120 | conf->reshape_progress >= bio->bi_sector + sectors); | ||
| 1121 | wait_barrier(conf); | ||
| 1122 | } | ||
| 1123 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | ||
| 1124 | bio_data_dir(bio) == WRITE && | ||
| 1125 | (mddev->reshape_backwards | ||
| 1126 | ? (bio->bi_sector < conf->reshape_safe && | ||
| 1127 | bio->bi_sector + sectors > conf->reshape_progress) | ||
| 1128 | : (bio->bi_sector + sectors > conf->reshape_safe && | ||
| 1129 | bio->bi_sector < conf->reshape_progress))) { | ||
| 1130 | /* Need to update reshape_position in metadata */ | ||
| 1131 | mddev->reshape_position = conf->reshape_progress; | ||
| 1132 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
| 1133 | set_bit(MD_CHANGE_PENDING, &mddev->flags); | ||
| 1134 | md_wakeup_thread(mddev->thread); | ||
| 1135 | wait_event(mddev->sb_wait, | ||
| 1136 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | ||
| 1137 | |||
| 1138 | conf->reshape_safe = mddev->reshape_position; | ||
| 1139 | } | ||
| 1140 | |||
| 1054 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); | 1141 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); |
| 1055 | 1142 | ||
| 1056 | r10_bio->master_bio = bio; | 1143 | r10_bio->master_bio = bio; |
| 1057 | r10_bio->sectors = bio->bi_size >> 9; | 1144 | r10_bio->sectors = sectors; |
| 1058 | 1145 | ||
| 1059 | r10_bio->mddev = mddev; | 1146 | r10_bio->mddev = mddev; |
| 1060 | r10_bio->sector = bio->bi_sector; | 1147 | r10_bio->sector = bio->bi_sector; |
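The code added to make_request() above keeps regular I/O from racing with the moving reshape front: a request whose range straddles conf->reshape_progress waits until the front has moved past it, and a write into the window between reshape_safe and reshape_progress first forces reshape_position out to the superblock. The illustrative helpers below restate those two range tests; the names are stand-ins rather than code from the patch.

typedef unsigned long long sec_t;

/* A request must not straddle the moving reshape_progress point. */
static int straddles_reshape_point(sec_t start, sec_t len, sec_t progress)
{
        return start < progress && start + len > progress;
}

/* Writes landing between reshape_safe and reshape_progress (direction-
 * dependent) require the metadata to be updated before they proceed.
 */
static int needs_metadata_update(sec_t start, sec_t len, int is_write,
                                 int backwards, sec_t safe, sec_t progress)
{
        if (!is_write)
                return 0;
        return backwards ? (start < safe && start + len > progress)
                         : (start + len > safe && start < progress);
}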
| @@ -1093,7 +1180,7 @@ read_again: | |||
| 1093 | r10_bio->devs[slot].rdev = rdev; | 1180 | r10_bio->devs[slot].rdev = rdev; |
| 1094 | 1181 | ||
| 1095 | read_bio->bi_sector = r10_bio->devs[slot].addr + | 1182 | read_bio->bi_sector = r10_bio->devs[slot].addr + |
| 1096 | rdev->data_offset; | 1183 | choose_data_offset(r10_bio, rdev); |
| 1097 | read_bio->bi_bdev = rdev->bdev; | 1184 | read_bio->bi_bdev = rdev->bdev; |
| 1098 | read_bio->bi_end_io = raid10_end_read_request; | 1185 | read_bio->bi_end_io = raid10_end_read_request; |
| 1099 | read_bio->bi_rw = READ | do_sync; | 1186 | read_bio->bi_rw = READ | do_sync; |
| @@ -1297,7 +1384,8 @@ retry_write: | |||
| 1297 | r10_bio->devs[i].bio = mbio; | 1384 | r10_bio->devs[i].bio = mbio; |
| 1298 | 1385 | ||
| 1299 | mbio->bi_sector = (r10_bio->devs[i].addr+ | 1386 | mbio->bi_sector = (r10_bio->devs[i].addr+ |
| 1300 | conf->mirrors[d].rdev->data_offset); | 1387 | choose_data_offset(r10_bio, |
| 1388 | conf->mirrors[d].rdev)); | ||
| 1301 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; | 1389 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; |
| 1302 | mbio->bi_end_io = raid10_end_write_request; | 1390 | mbio->bi_end_io = raid10_end_write_request; |
| 1303 | mbio->bi_rw = WRITE | do_sync | do_fua; | 1391 | mbio->bi_rw = WRITE | do_sync | do_fua; |
| @@ -1321,8 +1409,10 @@ retry_write: | |||
| 1321 | * so it cannot disappear, so the replacement cannot | 1409 | * so it cannot disappear, so the replacement cannot |
| 1322 | * become NULL here | 1410 | * become NULL here |
| 1323 | */ | 1411 | */ |
| 1324 | mbio->bi_sector = (r10_bio->devs[i].addr+ | 1412 | mbio->bi_sector = (r10_bio->devs[i].addr + |
| 1325 | conf->mirrors[d].replacement->data_offset); | 1413 | choose_data_offset( |
| 1414 | r10_bio, | ||
| 1415 | conf->mirrors[d].replacement)); | ||
| 1326 | mbio->bi_bdev = conf->mirrors[d].replacement->bdev; | 1416 | mbio->bi_bdev = conf->mirrors[d].replacement->bdev; |
| 1327 | mbio->bi_end_io = raid10_end_write_request; | 1417 | mbio->bi_end_io = raid10_end_write_request; |
| 1328 | mbio->bi_rw = WRITE | do_sync | do_fua; | 1418 | mbio->bi_rw = WRITE | do_sync | do_fua; |
| @@ -1368,19 +1458,19 @@ static void status(struct seq_file *seq, struct mddev *mddev) | |||
| 1368 | struct r10conf *conf = mddev->private; | 1458 | struct r10conf *conf = mddev->private; |
| 1369 | int i; | 1459 | int i; |
| 1370 | 1460 | ||
| 1371 | if (conf->near_copies < conf->raid_disks) | 1461 | if (conf->geo.near_copies < conf->geo.raid_disks) |
| 1372 | seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); | 1462 | seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); |
| 1373 | if (conf->near_copies > 1) | 1463 | if (conf->geo.near_copies > 1) |
| 1374 | seq_printf(seq, " %d near-copies", conf->near_copies); | 1464 | seq_printf(seq, " %d near-copies", conf->geo.near_copies); |
| 1375 | if (conf->far_copies > 1) { | 1465 | if (conf->geo.far_copies > 1) { |
| 1376 | if (conf->far_offset) | 1466 | if (conf->geo.far_offset) |
| 1377 | seq_printf(seq, " %d offset-copies", conf->far_copies); | 1467 | seq_printf(seq, " %d offset-copies", conf->geo.far_copies); |
| 1378 | else | 1468 | else |
| 1379 | seq_printf(seq, " %d far-copies", conf->far_copies); | 1469 | seq_printf(seq, " %d far-copies", conf->geo.far_copies); |
| 1380 | } | 1470 | } |
| 1381 | seq_printf(seq, " [%d/%d] [", conf->raid_disks, | 1471 | seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, |
| 1382 | conf->raid_disks - mddev->degraded); | 1472 | conf->geo.raid_disks - mddev->degraded); |
| 1383 | for (i = 0; i < conf->raid_disks; i++) | 1473 | for (i = 0; i < conf->geo.raid_disks; i++) |
| 1384 | seq_printf(seq, "%s", | 1474 | seq_printf(seq, "%s", |
| 1385 | conf->mirrors[i].rdev && | 1475 | conf->mirrors[i].rdev && |
| 1386 | test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); | 1476 | test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); |
| @@ -1392,7 +1482,7 @@ static void status(struct seq_file *seq, struct mddev *mddev) | |||
| 1392 | * Don't consider the device numbered 'ignore' | 1482 | * Don't consider the device numbered 'ignore' |
| 1393 | * as we might be about to remove it. | 1483 | * as we might be about to remove it. |
| 1394 | */ | 1484 | */ |
| 1395 | static int enough(struct r10conf *conf, int ignore) | 1485 | static int _enough(struct r10conf *conf, struct geom *geo, int ignore) |
| 1396 | { | 1486 | { |
| 1397 | int first = 0; | 1487 | int first = 0; |
| 1398 | 1488 | ||
| @@ -1403,7 +1493,7 @@ static int enough(struct r10conf *conf, int ignore) | |||
| 1403 | if (conf->mirrors[first].rdev && | 1493 | if (conf->mirrors[first].rdev && |
| 1404 | first != ignore) | 1494 | first != ignore) |
| 1405 | cnt++; | 1495 | cnt++; |
| 1406 | first = (first+1) % conf->raid_disks; | 1496 | first = (first+1) % geo->raid_disks; |
| 1407 | } | 1497 | } |
| 1408 | if (cnt == 0) | 1498 | if (cnt == 0) |
| 1409 | return 0; | 1499 | return 0; |
| @@ -1411,6 +1501,12 @@ static int enough(struct r10conf *conf, int ignore) | |||
| 1411 | return 1; | 1501 | return 1; |
| 1412 | } | 1502 | } |
| 1413 | 1503 | ||
| 1504 | static int enough(struct r10conf *conf, int ignore) | ||
| 1505 | { | ||
| 1506 | return _enough(conf, &conf->geo, ignore) && | ||
| 1507 | _enough(conf, &conf->prev, ignore); | ||
| 1508 | } | ||
| 1509 | |||
| 1414 | static void error(struct mddev *mddev, struct md_rdev *rdev) | 1510 | static void error(struct mddev *mddev, struct md_rdev *rdev) |
| 1415 | { | 1511 | { |
| 1416 | char b[BDEVNAME_SIZE]; | 1512 | char b[BDEVNAME_SIZE]; |
| @@ -1445,7 +1541,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1445 | "md/raid10:%s: Disk failure on %s, disabling device.\n" | 1541 | "md/raid10:%s: Disk failure on %s, disabling device.\n" |
| 1446 | "md/raid10:%s: Operation continuing on %d devices.\n", | 1542 | "md/raid10:%s: Operation continuing on %d devices.\n", |
| 1447 | mdname(mddev), bdevname(rdev->bdev, b), | 1543 | mdname(mddev), bdevname(rdev->bdev, b), |
| 1448 | mdname(mddev), conf->raid_disks - mddev->degraded); | 1544 | mdname(mddev), conf->geo.raid_disks - mddev->degraded); |
| 1449 | } | 1545 | } |
| 1450 | 1546 | ||
| 1451 | static void print_conf(struct r10conf *conf) | 1547 | static void print_conf(struct r10conf *conf) |
| @@ -1458,10 +1554,10 @@ static void print_conf(struct r10conf *conf) | |||
| 1458 | printk(KERN_DEBUG "(!conf)\n"); | 1554 | printk(KERN_DEBUG "(!conf)\n"); |
| 1459 | return; | 1555 | return; |
| 1460 | } | 1556 | } |
| 1461 | printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, | 1557 | printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, |
| 1462 | conf->raid_disks); | 1558 | conf->geo.raid_disks); |
| 1463 | 1559 | ||
| 1464 | for (i = 0; i < conf->raid_disks; i++) { | 1560 | for (i = 0; i < conf->geo.raid_disks; i++) { |
| 1465 | char b[BDEVNAME_SIZE]; | 1561 | char b[BDEVNAME_SIZE]; |
| 1466 | tmp = conf->mirrors + i; | 1562 | tmp = conf->mirrors + i; |
| 1467 | if (tmp->rdev) | 1563 | if (tmp->rdev) |
| @@ -1493,7 +1589,7 @@ static int raid10_spare_active(struct mddev *mddev) | |||
| 1493 | * Find all non-in_sync disks within the RAID10 configuration | 1589 | * Find all non-in_sync disks within the RAID10 configuration |
| 1494 | * and mark them in_sync | 1590 | * and mark them in_sync |
| 1495 | */ | 1591 | */ |
| 1496 | for (i = 0; i < conf->raid_disks; i++) { | 1592 | for (i = 0; i < conf->geo.raid_disks; i++) { |
| 1497 | tmp = conf->mirrors + i; | 1593 | tmp = conf->mirrors + i; |
| 1498 | if (tmp->replacement | 1594 | if (tmp->replacement |
| 1499 | && tmp->replacement->recovery_offset == MaxSector | 1595 | && tmp->replacement->recovery_offset == MaxSector |
| @@ -1535,7 +1631,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1535 | int err = -EEXIST; | 1631 | int err = -EEXIST; |
| 1536 | int mirror; | 1632 | int mirror; |
| 1537 | int first = 0; | 1633 | int first = 0; |
| 1538 | int last = conf->raid_disks - 1; | 1634 | int last = conf->geo.raid_disks - 1; |
| 1539 | struct request_queue *q = bdev_get_queue(rdev->bdev); | 1635 | struct request_queue *q = bdev_get_queue(rdev->bdev); |
| 1540 | 1636 | ||
| 1541 | if (mddev->recovery_cp < MaxSector) | 1637 | if (mddev->recovery_cp < MaxSector) |
| @@ -1543,7 +1639,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1543 | * very different from resync | 1639 | * very different from resync |
| 1544 | */ | 1640 | */ |
| 1545 | return -EBUSY; | 1641 | return -EBUSY; |
| 1546 | if (rdev->saved_raid_disk < 0 && !enough(conf, -1)) | 1642 | if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1)) |
| 1547 | return -EINVAL; | 1643 | return -EINVAL; |
| 1548 | 1644 | ||
| 1549 | if (rdev->raid_disk >= 0) | 1645 | if (rdev->raid_disk >= 0) |
| @@ -1635,6 +1731,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1635 | if (!test_bit(Faulty, &rdev->flags) && | 1731 | if (!test_bit(Faulty, &rdev->flags) && |
| 1636 | mddev->recovery_disabled != p->recovery_disabled && | 1732 | mddev->recovery_disabled != p->recovery_disabled && |
| 1637 | (!p->replacement || p->replacement == rdev) && | 1733 | (!p->replacement || p->replacement == rdev) && |
| 1734 | number < conf->geo.raid_disks && | ||
| 1638 | enough(conf, -1)) { | 1735 | enough(conf, -1)) { |
| 1639 | err = -EBUSY; | 1736 | err = -EBUSY; |
| 1640 | goto abort; | 1737 | goto abort; |
| @@ -1676,7 +1773,11 @@ static void end_sync_read(struct bio *bio, int error) | |||
| 1676 | struct r10conf *conf = r10_bio->mddev->private; | 1773 | struct r10conf *conf = r10_bio->mddev->private; |
| 1677 | int d; | 1774 | int d; |
| 1678 | 1775 | ||
| 1679 | d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); | 1776 | if (bio == r10_bio->master_bio) { |
| 1777 | /* this is a reshape read */ | ||
| 1778 | d = r10_bio->read_slot; /* really the read dev */ | ||
| 1779 | } else | ||
| 1780 | d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); | ||
| 1680 | 1781 | ||
| 1681 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | 1782 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) |
| 1682 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 1783 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
| @@ -2218,7 +2319,9 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
| 2218 | " (%d sectors at %llu on %s)\n", | 2319 | " (%d sectors at %llu on %s)\n", |
| 2219 | mdname(mddev), s, | 2320 | mdname(mddev), s, |
| 2220 | (unsigned long long)( | 2321 | (unsigned long long)( |
| 2221 | sect + rdev->data_offset), | 2322 | sect + |
| 2323 | choose_data_offset(r10_bio, | ||
| 2324 | rdev)), | ||
| 2222 | bdevname(rdev->bdev, b)); | 2325 | bdevname(rdev->bdev, b)); |
| 2223 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | 2326 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " |
| 2224 | "drive\n", | 2327 | "drive\n", |
| @@ -2256,7 +2359,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
| 2256 | " (%d sectors at %llu on %s)\n", | 2359 | " (%d sectors at %llu on %s)\n", |
| 2257 | mdname(mddev), s, | 2360 | mdname(mddev), s, |
| 2258 | (unsigned long long)( | 2361 | (unsigned long long)( |
| 2259 | sect + rdev->data_offset), | 2362 | sect + |
| 2363 | choose_data_offset(r10_bio, rdev)), | ||
| 2260 | bdevname(rdev->bdev, b)); | 2364 | bdevname(rdev->bdev, b)); |
| 2261 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | 2365 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " |
| 2262 | "drive\n", | 2366 | "drive\n", |
| @@ -2269,7 +2373,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
| 2269 | " (%d sectors at %llu on %s)\n", | 2373 | " (%d sectors at %llu on %s)\n", |
| 2270 | mdname(mddev), s, | 2374 | mdname(mddev), s, |
| 2271 | (unsigned long long)( | 2375 | (unsigned long long)( |
| 2272 | sect + rdev->data_offset), | 2376 | sect + |
| 2377 | choose_data_offset(r10_bio, rdev)), | ||
| 2273 | bdevname(rdev->bdev, b)); | 2378 | bdevname(rdev->bdev, b)); |
| 2274 | atomic_add(s, &rdev->corrected_errors); | 2379 | atomic_add(s, &rdev->corrected_errors); |
| 2275 | } | 2380 | } |
| @@ -2343,7 +2448,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) | |||
| 2343 | wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 2448 | wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
| 2344 | md_trim_bio(wbio, sector - bio->bi_sector, sectors); | 2449 | md_trim_bio(wbio, sector - bio->bi_sector, sectors); |
| 2345 | wbio->bi_sector = (r10_bio->devs[i].addr+ | 2450 | wbio->bi_sector = (r10_bio->devs[i].addr+ |
| 2346 | rdev->data_offset+ | 2451 | choose_data_offset(r10_bio, rdev) + |
| 2347 | (sector - r10_bio->sector)); | 2452 | (sector - r10_bio->sector)); |
| 2348 | wbio->bi_bdev = rdev->bdev; | 2453 | wbio->bi_bdev = rdev->bdev; |
| 2349 | if (submit_bio_wait(WRITE, wbio) == 0) | 2454 | if (submit_bio_wait(WRITE, wbio) == 0) |
| @@ -2420,7 +2525,7 @@ read_more: | |||
| 2420 | r10_bio->devs[slot].bio = bio; | 2525 | r10_bio->devs[slot].bio = bio; |
| 2421 | r10_bio->devs[slot].rdev = rdev; | 2526 | r10_bio->devs[slot].rdev = rdev; |
| 2422 | bio->bi_sector = r10_bio->devs[slot].addr | 2527 | bio->bi_sector = r10_bio->devs[slot].addr |
| 2423 | + rdev->data_offset; | 2528 | + choose_data_offset(r10_bio, rdev); |
| 2424 | bio->bi_bdev = rdev->bdev; | 2529 | bio->bi_bdev = rdev->bdev; |
| 2425 | bio->bi_rw = READ | do_sync; | 2530 | bio->bi_rw = READ | do_sync; |
| 2426 | bio->bi_private = r10_bio; | 2531 | bio->bi_private = r10_bio; |
| @@ -2480,7 +2585,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) | |||
| 2480 | rdev_clear_badblocks( | 2585 | rdev_clear_badblocks( |
| 2481 | rdev, | 2586 | rdev, |
| 2482 | r10_bio->devs[m].addr, | 2587 | r10_bio->devs[m].addr, |
| 2483 | r10_bio->sectors); | 2588 | r10_bio->sectors, 0); |
| 2484 | } else { | 2589 | } else { |
| 2485 | if (!rdev_set_badblocks( | 2590 | if (!rdev_set_badblocks( |
| 2486 | rdev, | 2591 | rdev, |
| @@ -2496,7 +2601,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) | |||
| 2496 | rdev_clear_badblocks( | 2601 | rdev_clear_badblocks( |
| 2497 | rdev, | 2602 | rdev, |
| 2498 | r10_bio->devs[m].addr, | 2603 | r10_bio->devs[m].addr, |
| 2499 | r10_bio->sectors); | 2604 | r10_bio->sectors, 0); |
| 2500 | } else { | 2605 | } else { |
| 2501 | if (!rdev_set_badblocks( | 2606 | if (!rdev_set_badblocks( |
| 2502 | rdev, | 2607 | rdev, |
| @@ -2515,7 +2620,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) | |||
| 2515 | rdev_clear_badblocks( | 2620 | rdev_clear_badblocks( |
| 2516 | rdev, | 2621 | rdev, |
| 2517 | r10_bio->devs[m].addr, | 2622 | r10_bio->devs[m].addr, |
| 2518 | r10_bio->sectors); | 2623 | r10_bio->sectors, 0); |
| 2519 | rdev_dec_pending(rdev, conf->mddev); | 2624 | rdev_dec_pending(rdev, conf->mddev); |
| 2520 | } else if (bio != NULL && | 2625 | } else if (bio != NULL && |
| 2521 | !test_bit(BIO_UPTODATE, &bio->bi_flags)) { | 2626 | !test_bit(BIO_UPTODATE, &bio->bi_flags)) { |
| @@ -2532,7 +2637,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) | |||
| 2532 | rdev_clear_badblocks( | 2637 | rdev_clear_badblocks( |
| 2533 | rdev, | 2638 | rdev, |
| 2534 | r10_bio->devs[m].addr, | 2639 | r10_bio->devs[m].addr, |
| 2535 | r10_bio->sectors); | 2640 | r10_bio->sectors, 0); |
| 2536 | rdev_dec_pending(rdev, conf->mddev); | 2641 | rdev_dec_pending(rdev, conf->mddev); |
| 2537 | } | 2642 | } |
| 2538 | } | 2643 | } |
| @@ -2573,6 +2678,8 @@ static void raid10d(struct mddev *mddev) | |||
| 2573 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || | 2678 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || |
| 2574 | test_bit(R10BIO_WriteError, &r10_bio->state)) | 2679 | test_bit(R10BIO_WriteError, &r10_bio->state)) |
| 2575 | handle_write_completed(conf, r10_bio); | 2680 | handle_write_completed(conf, r10_bio); |
| 2681 | else if (test_bit(R10BIO_IsReshape, &r10_bio->state)) | ||
| 2682 | reshape_request_write(mddev, r10_bio); | ||
| 2576 | else if (test_bit(R10BIO_IsSync, &r10_bio->state)) | 2683 | else if (test_bit(R10BIO_IsSync, &r10_bio->state)) |
| 2577 | sync_request_write(mddev, r10_bio); | 2684 | sync_request_write(mddev, r10_bio); |
| 2578 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) | 2685 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) |
| @@ -2603,7 +2710,7 @@ static int init_resync(struct r10conf *conf) | |||
| 2603 | buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; | 2710 | buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; |
| 2604 | BUG_ON(conf->r10buf_pool); | 2711 | BUG_ON(conf->r10buf_pool); |
| 2605 | conf->have_replacement = 0; | 2712 | conf->have_replacement = 0; |
| 2606 | for (i = 0; i < conf->raid_disks; i++) | 2713 | for (i = 0; i < conf->geo.raid_disks; i++) |
| 2607 | if (conf->mirrors[i].replacement) | 2714 | if (conf->mirrors[i].replacement) |
| 2608 | conf->have_replacement = 1; | 2715 | conf->have_replacement = 1; |
| 2609 | conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); | 2716 | conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); |
| @@ -2657,6 +2764,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2657 | sector_t sync_blocks; | 2764 | sector_t sync_blocks; |
| 2658 | sector_t sectors_skipped = 0; | 2765 | sector_t sectors_skipped = 0; |
| 2659 | int chunks_skipped = 0; | 2766 | int chunks_skipped = 0; |
| 2767 | sector_t chunk_mask = conf->geo.chunk_mask; | ||
| 2660 | 2768 | ||
| 2661 | if (!conf->r10buf_pool) | 2769 | if (!conf->r10buf_pool) |
| 2662 | if (init_resync(conf)) | 2770 | if (init_resync(conf)) |
| @@ -2664,7 +2772,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2664 | 2772 | ||
| 2665 | skipped: | 2773 | skipped: |
| 2666 | max_sector = mddev->dev_sectors; | 2774 | max_sector = mddev->dev_sectors; |
| 2667 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 2775 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || |
| 2776 | test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) | ||
| 2668 | max_sector = mddev->resync_max_sectors; | 2777 | max_sector = mddev->resync_max_sectors; |
| 2669 | if (sector_nr >= max_sector) { | 2778 | if (sector_nr >= max_sector) { |
| 2670 | /* If we aborted, we need to abort the | 2779 | /* If we aborted, we need to abort the |
| @@ -2676,11 +2785,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2676 | * we need to convert that to several | 2785 | * we need to convert that to several |
| 2677 | * virtual addresses. | 2786 | * virtual addresses. |
| 2678 | */ | 2787 | */ |
| 2788 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { | ||
| 2789 | end_reshape(conf); | ||
| 2790 | return 0; | ||
| 2791 | } | ||
| 2792 | |||
| 2679 | if (mddev->curr_resync < max_sector) { /* aborted */ | 2793 | if (mddev->curr_resync < max_sector) { /* aborted */ |
| 2680 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 2794 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) |
| 2681 | bitmap_end_sync(mddev->bitmap, mddev->curr_resync, | 2795 | bitmap_end_sync(mddev->bitmap, mddev->curr_resync, |
| 2682 | &sync_blocks, 1); | 2796 | &sync_blocks, 1); |
| 2683 | else for (i=0; i<conf->raid_disks; i++) { | 2797 | else for (i = 0; i < conf->geo.raid_disks; i++) { |
| 2684 | sector_t sect = | 2798 | sector_t sect = |
| 2685 | raid10_find_virt(conf, mddev->curr_resync, i); | 2799 | raid10_find_virt(conf, mddev->curr_resync, i); |
| 2686 | bitmap_end_sync(mddev->bitmap, sect, | 2800 | bitmap_end_sync(mddev->bitmap, sect, |
| @@ -2694,7 +2808,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2694 | /* Completed a full sync so the replacements | 2808 | /* Completed a full sync so the replacements |
| 2695 | * are now fully recovered. | 2809 | * are now fully recovered. |
| 2696 | */ | 2810 | */ |
| 2697 | for (i = 0; i < conf->raid_disks; i++) | 2811 | for (i = 0; i < conf->geo.raid_disks; i++) |
| 2698 | if (conf->mirrors[i].replacement) | 2812 | if (conf->mirrors[i].replacement) |
| 2699 | conf->mirrors[i].replacement | 2813 | conf->mirrors[i].replacement |
| 2700 | ->recovery_offset | 2814 | ->recovery_offset |
| @@ -2707,7 +2821,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2707 | *skipped = 1; | 2821 | *skipped = 1; |
| 2708 | return sectors_skipped; | 2822 | return sectors_skipped; |
| 2709 | } | 2823 | } |
| 2710 | if (chunks_skipped >= conf->raid_disks) { | 2824 | |
| 2825 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) | ||
| 2826 | return reshape_request(mddev, sector_nr, skipped); | ||
| 2827 | |||
| 2828 | if (chunks_skipped >= conf->geo.raid_disks) { | ||
| 2711 | /* if there has been nothing to do on any drive, | 2829 | /* if there has been nothing to do on any drive, |
| 2712 | * then there is nothing to do at all.. | 2830 | * then there is nothing to do at all.. |
| 2713 | */ | 2831 | */ |
| @@ -2721,9 +2839,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2721 | /* make sure whole request will fit in a chunk - if chunks | 2839 | /* make sure whole request will fit in a chunk - if chunks |
| 2722 | * are meaningful | 2840 | * are meaningful |
| 2723 | */ | 2841 | */ |
| 2724 | if (conf->near_copies < conf->raid_disks && | 2842 | if (conf->geo.near_copies < conf->geo.raid_disks && |
| 2725 | max_sector > (sector_nr | conf->chunk_mask)) | 2843 | max_sector > (sector_nr | chunk_mask)) |
| 2726 | max_sector = (sector_nr | conf->chunk_mask) + 1; | 2844 | max_sector = (sector_nr | chunk_mask) + 1; |
| 2727 | /* | 2845 | /* |
| 2728 | * If there is non-resync activity waiting for us then | 2846 | * If there is non-resync activity waiting for us then |
| 2729 | * put in a delay to throttle resync. | 2847 | * put in a delay to throttle resync. |
| @@ -2752,7 +2870,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2752 | int j; | 2870 | int j; |
| 2753 | r10_bio = NULL; | 2871 | r10_bio = NULL; |
| 2754 | 2872 | ||
| 2755 | for (i=0 ; i<conf->raid_disks; i++) { | 2873 | for (i = 0 ; i < conf->geo.raid_disks; i++) { |
| 2756 | int still_degraded; | 2874 | int still_degraded; |
| 2757 | struct r10bio *rb2; | 2875 | struct r10bio *rb2; |
| 2758 | sector_t sect; | 2876 | sector_t sect; |
| @@ -2806,7 +2924,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2806 | /* Need to check if the array will still be | 2924 | /* Need to check if the array will still be |
| 2807 | * degraded | 2925 | * degraded |
| 2808 | */ | 2926 | */ |
| 2809 | for (j=0; j<conf->raid_disks; j++) | 2927 | for (j = 0; j < conf->geo.raid_disks; j++) |
| 2810 | if (conf->mirrors[j].rdev == NULL || | 2928 | if (conf->mirrors[j].rdev == NULL || |
| 2811 | test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { | 2929 | test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { |
| 2812 | still_degraded = 1; | 2930 | still_degraded = 1; |
| @@ -2984,9 +3102,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2984 | r10_bio->sector = sector_nr; | 3102 | r10_bio->sector = sector_nr; |
| 2985 | set_bit(R10BIO_IsSync, &r10_bio->state); | 3103 | set_bit(R10BIO_IsSync, &r10_bio->state); |
| 2986 | raid10_find_phys(conf, r10_bio); | 3104 | raid10_find_phys(conf, r10_bio); |
| 2987 | r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; | 3105 | r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1; |
| 2988 | 3106 | ||
| 2989 | for (i=0; i<conf->copies; i++) { | 3107 | for (i = 0; i < conf->copies; i++) { |
| 2990 | int d = r10_bio->devs[i].devnum; | 3108 | int d = r10_bio->devs[i].devnum; |
| 2991 | sector_t first_bad, sector; | 3109 | sector_t first_bad, sector; |
| 2992 | int bad_sectors; | 3110 | int bad_sectors; |
| @@ -3152,16 +3270,17 @@ raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) | |||
| 3152 | struct r10conf *conf = mddev->private; | 3270 | struct r10conf *conf = mddev->private; |
| 3153 | 3271 | ||
| 3154 | if (!raid_disks) | 3272 | if (!raid_disks) |
| 3155 | raid_disks = conf->raid_disks; | 3273 | raid_disks = min(conf->geo.raid_disks, |
| 3274 | conf->prev.raid_disks); | ||
| 3156 | if (!sectors) | 3275 | if (!sectors) |
| 3157 | sectors = conf->dev_sectors; | 3276 | sectors = conf->dev_sectors; |
| 3158 | 3277 | ||
| 3159 | size = sectors >> conf->chunk_shift; | 3278 | size = sectors >> conf->geo.chunk_shift; |
| 3160 | sector_div(size, conf->far_copies); | 3279 | sector_div(size, conf->geo.far_copies); |
| 3161 | size = size * raid_disks; | 3280 | size = size * raid_disks; |
| 3162 | sector_div(size, conf->near_copies); | 3281 | sector_div(size, conf->geo.near_copies); |
| 3163 | 3282 | ||
| 3164 | return size << conf->chunk_shift; | 3283 | return size << conf->geo.chunk_shift; |
| 3165 | } | 3284 | } |
| 3166 | 3285 | ||
| 3167 | static void calc_sectors(struct r10conf *conf, sector_t size) | 3286 | static void calc_sectors(struct r10conf *conf, sector_t size) |
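raid10_size() now reads everything from conf->geo, but the capacity arithmetic is unchanged: reduce the per-device size to whole chunks, divide by far copies, multiply by the number of disks, divide by near copies, and convert back to sectors. A worked example with made-up numbers:

#include <stdio.h>

int main(void)
{
        unsigned long long dev_sectors = 1ULL << 20;    /* 512 MiB per device */
        int chunk_shift = 10;                           /* 1024-sector (512 KiB) chunks */
        int raid_disks = 4, near_copies = 2, far_copies = 1;
        unsigned long long size;

        size = dev_sectors >> chunk_shift;      /* whole chunks per device */
        size /= far_copies;
        size *= raid_disks;
        size /= near_copies;

        printf("array capacity: %llu sectors\n", size << chunk_shift);
        /* 4 x 512 MiB with two near copies -> 2097152 sectors (1 GiB usable) */
        return 0;
}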
| @@ -3171,10 +3290,10 @@ static void calc_sectors(struct r10conf *conf, sector_t size) | |||
| 3171 | * conf->stride | 3290 | * conf->stride |
| 3172 | */ | 3291 | */ |
| 3173 | 3292 | ||
| 3174 | size = size >> conf->chunk_shift; | 3293 | size = size >> conf->geo.chunk_shift; |
| 3175 | sector_div(size, conf->far_copies); | 3294 | sector_div(size, conf->geo.far_copies); |
| 3176 | size = size * conf->raid_disks; | 3295 | size = size * conf->geo.raid_disks; |
| 3177 | sector_div(size, conf->near_copies); | 3296 | sector_div(size, conf->geo.near_copies); |
| 3178 | /* 'size' is now the number of chunks in the array */ | 3297 | /* 'size' is now the number of chunks in the array */ |
| 3179 | /* calculate "used chunks per device" */ | 3298 | /* calculate "used chunks per device" */ |
| 3180 | size = size * conf->copies; | 3299 | size = size * conf->copies; |
| @@ -3182,38 +3301,76 @@ static void calc_sectors(struct r10conf *conf, sector_t size) | |||
| 3182 | /* We need to round up when dividing by raid_disks to | 3301 | /* We need to round up when dividing by raid_disks to |
| 3183 | * get the stride size. | 3302 | * get the stride size. |
| 3184 | */ | 3303 | */ |
| 3185 | size = DIV_ROUND_UP_SECTOR_T(size, conf->raid_disks); | 3304 | size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks); |
| 3186 | 3305 | ||
| 3187 | conf->dev_sectors = size << conf->chunk_shift; | 3306 | conf->dev_sectors = size << conf->geo.chunk_shift; |
| 3188 | 3307 | ||
| 3189 | if (conf->far_offset) | 3308 | if (conf->geo.far_offset) |
| 3190 | conf->stride = 1 << conf->chunk_shift; | 3309 | conf->geo.stride = 1 << conf->geo.chunk_shift; |
| 3191 | else { | 3310 | else { |
| 3192 | sector_div(size, conf->far_copies); | 3311 | sector_div(size, conf->geo.far_copies); |
| 3193 | conf->stride = size << conf->chunk_shift; | 3312 | conf->geo.stride = size << conf->geo.chunk_shift; |
| 3194 | } | 3313 | } |
| 3195 | } | 3314 | } |
| 3196 | 3315 | ||
| 3316 | enum geo_type {geo_new, geo_old, geo_start}; | ||
| 3317 | static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) | ||
| 3318 | { | ||
| 3319 | int nc, fc, fo; | ||
| 3320 | int layout, chunk, disks; | ||
| 3321 | switch (new) { | ||
| 3322 | case geo_old: | ||
| 3323 | layout = mddev->layout; | ||
| 3324 | chunk = mddev->chunk_sectors; | ||
| 3325 | disks = mddev->raid_disks - mddev->delta_disks; | ||
| 3326 | break; | ||
| 3327 | case geo_new: | ||
| 3328 | layout = mddev->new_layout; | ||
| 3329 | chunk = mddev->new_chunk_sectors; | ||
| 3330 | disks = mddev->raid_disks; | ||
| 3331 | break; | ||
| 3332 | default: /* avoid 'may be unused' warnings */ | ||
| 3333 | case geo_start: /* new when starting reshape - raid_disks not | ||
| 3334 | * updated yet. */ | ||
| 3335 | layout = mddev->new_layout; | ||
| 3336 | chunk = mddev->new_chunk_sectors; | ||
| 3337 | disks = mddev->raid_disks + mddev->delta_disks; | ||
| 3338 | break; | ||
| 3339 | } | ||
| 3340 | if (layout >> 17) | ||
| 3341 | return -1; | ||
| 3342 | if (chunk < (PAGE_SIZE >> 9) || | ||
| 3343 | !is_power_of_2(chunk)) | ||
| 3344 | return -2; | ||
| 3345 | nc = layout & 255; | ||
| 3346 | fc = (layout >> 8) & 255; | ||
| 3347 | fo = layout & (1<<16); | ||
| 3348 | geo->raid_disks = disks; | ||
| 3349 | geo->near_copies = nc; | ||
| 3350 | geo->far_copies = fc; | ||
| 3351 | geo->far_offset = fo; | ||
| 3352 | geo->chunk_mask = chunk - 1; | ||
| 3353 | geo->chunk_shift = ffz(~chunk); | ||
| 3354 | return nc*fc; | ||
| 3355 | } | ||
| 3356 | |||
| 3197 | static struct r10conf *setup_conf(struct mddev *mddev) | 3357 | static struct r10conf *setup_conf(struct mddev *mddev) |
| 3198 | { | 3358 | { |
| 3199 | struct r10conf *conf = NULL; | 3359 | struct r10conf *conf = NULL; |
| 3200 | int nc, fc, fo; | ||
| 3201 | int err = -EINVAL; | 3360 | int err = -EINVAL; |
| 3361 | struct geom geo; | ||
| 3362 | int copies; | ||
| 3363 | |||
| 3364 | copies = setup_geo(&geo, mddev, geo_new); | ||
| 3202 | 3365 | ||
| 3203 | if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || | 3366 | if (copies == -2) { |
| 3204 | !is_power_of_2(mddev->new_chunk_sectors)) { | ||
| 3205 | printk(KERN_ERR "md/raid10:%s: chunk size must be " | 3367 | printk(KERN_ERR "md/raid10:%s: chunk size must be " |
| 3206 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", | 3368 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", |
| 3207 | mdname(mddev), PAGE_SIZE); | 3369 | mdname(mddev), PAGE_SIZE); |
| 3208 | goto out; | 3370 | goto out; |
| 3209 | } | 3371 | } |
| 3210 | 3372 | ||
| 3211 | nc = mddev->new_layout & 255; | 3373 | if (copies < 2 || copies > mddev->raid_disks) { |
| 3212 | fc = (mddev->new_layout >> 8) & 255; | ||
| 3213 | fo = mddev->new_layout & (1<<16); | ||
| 3214 | |||
| 3215 | if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || | ||
| 3216 | (mddev->new_layout >> 17)) { | ||
| 3217 | printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", | 3374 | printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", |
| 3218 | mdname(mddev), mddev->new_layout); | 3375 | mdname(mddev), mddev->new_layout); |
| 3219 | goto out; | 3376 | goto out; |
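setup_geo() centralises the parsing of the raid10 layout word that setup_conf() used to open-code: the low byte holds the near-copy count, the next byte the far-copy count, bit 16 selects the offset variant of 'far', and anything above bit 16 is rejected. A throwaway decode of the common 'n2' layout (0x102 is only an example value, not taken from the patch):

#include <stdio.h>

int main(void)
{
        int layout = 0x102;             /* example only: the classic 'n2' layout */
        int nc = layout & 255;          /* near copies */
        int fc = (layout >> 8) & 255;   /* far copies */
        int fo = layout & (1 << 16);    /* far-offset flag */

        if (layout >> 17) {
                printf("unsupported layout 0x%x\n", (unsigned)layout);
                return 1;
        }
        printf("near=%d far=%d offset=%s total copies=%d\n",
               nc, fc, fo ? "yes" : "no", nc * fc);
        return 0;
}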
| @@ -3224,7 +3381,9 @@ static struct r10conf *setup_conf(struct mddev *mddev) | |||
| 3224 | if (!conf) | 3381 | if (!conf) |
| 3225 | goto out; | 3382 | goto out; |
| 3226 | 3383 | ||
| 3227 | conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, | 3384 | /* FIXME calc properly */ |
| 3385 | conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks + | ||
| 3386 | max(0,mddev->delta_disks)), | ||
| 3228 | GFP_KERNEL); | 3387 | GFP_KERNEL); |
| 3229 | if (!conf->mirrors) | 3388 | if (!conf->mirrors) |
| 3230 | goto out; | 3389 | goto out; |
| @@ -3233,22 +3392,29 @@ static struct r10conf *setup_conf(struct mddev *mddev) | |||
| 3233 | if (!conf->tmppage) | 3392 | if (!conf->tmppage) |
| 3234 | goto out; | 3393 | goto out; |
| 3235 | 3394 | ||
| 3236 | 3395 | conf->geo = geo; | |
| 3237 | conf->raid_disks = mddev->raid_disks; | 3396 | conf->copies = copies; |
| 3238 | conf->near_copies = nc; | ||
| 3239 | conf->far_copies = fc; | ||
| 3240 | conf->copies = nc*fc; | ||
| 3241 | conf->far_offset = fo; | ||
| 3242 | conf->chunk_mask = mddev->new_chunk_sectors - 1; | ||
| 3243 | conf->chunk_shift = ffz(~mddev->new_chunk_sectors); | ||
| 3244 | |||
| 3245 | conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, | 3397 | conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, |
| 3246 | r10bio_pool_free, conf); | 3398 | r10bio_pool_free, conf); |
| 3247 | if (!conf->r10bio_pool) | 3399 | if (!conf->r10bio_pool) |
| 3248 | goto out; | 3400 | goto out; |
| 3249 | 3401 | ||
| 3250 | calc_sectors(conf, mddev->dev_sectors); | 3402 | calc_sectors(conf, mddev->dev_sectors); |
| 3251 | 3403 | if (mddev->reshape_position == MaxSector) { | |
| 3404 | conf->prev = conf->geo; | ||
| 3405 | conf->reshape_progress = MaxSector; | ||
| 3406 | } else { | ||
| 3407 | if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) { | ||
| 3408 | err = -EINVAL; | ||
| 3409 | goto out; | ||
| 3410 | } | ||
| 3411 | conf->reshape_progress = mddev->reshape_position; | ||
| 3412 | if (conf->prev.far_offset) | ||
| 3413 | conf->prev.stride = 1 << conf->prev.chunk_shift; | ||
| 3414 | else | ||
| 3415 | /* far_copies must be 1 */ | ||
| 3416 | conf->prev.stride = conf->dev_sectors; | ||
| 3417 | } | ||
| 3252 | spin_lock_init(&conf->device_lock); | 3418 | spin_lock_init(&conf->device_lock); |
| 3253 | INIT_LIST_HEAD(&conf->retry_list); | 3419 | INIT_LIST_HEAD(&conf->retry_list); |
| 3254 | 3420 | ||
| @@ -3263,8 +3429,9 @@ static struct r10conf *setup_conf(struct mddev *mddev) | |||
| 3263 | return conf; | 3429 | return conf; |
| 3264 | 3430 | ||
| 3265 | out: | 3431 | out: |
| 3266 | printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", | 3432 | if (err == -ENOMEM) |
| 3267 | mdname(mddev)); | 3433 | printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", |
| 3434 | mdname(mddev)); | ||
| 3268 | if (conf) { | 3435 | if (conf) { |
| 3269 | if (conf->r10bio_pool) | 3436 | if (conf->r10bio_pool) |
| 3270 | mempool_destroy(conf->r10bio_pool); | 3437 | mempool_destroy(conf->r10bio_pool); |
| @@ -3282,12 +3449,8 @@ static int run(struct mddev *mddev) | |||
| 3282 | struct mirror_info *disk; | 3449 | struct mirror_info *disk; |
| 3283 | struct md_rdev *rdev; | 3450 | struct md_rdev *rdev; |
| 3284 | sector_t size; | 3451 | sector_t size; |
| 3285 | 3452 | sector_t min_offset_diff = 0; | |
| 3286 | /* | 3453 | int first = 1; |
| 3287 | * copy the already verified devices into our private RAID10 | ||
| 3288 | * bookkeeping area. [whatever we allocate in run(), | ||
| 3289 | * should be freed in stop()] | ||
| 3290 | */ | ||
| 3291 | 3454 | ||
| 3292 | if (mddev->private == NULL) { | 3455 | if (mddev->private == NULL) { |
| 3293 | conf = setup_conf(mddev); | 3456 | conf = setup_conf(mddev); |
| @@ -3304,17 +3467,20 @@ static int run(struct mddev *mddev) | |||
| 3304 | 3467 | ||
| 3305 | chunk_size = mddev->chunk_sectors << 9; | 3468 | chunk_size = mddev->chunk_sectors << 9; |
| 3306 | blk_queue_io_min(mddev->queue, chunk_size); | 3469 | blk_queue_io_min(mddev->queue, chunk_size); |
| 3307 | if (conf->raid_disks % conf->near_copies) | 3470 | if (conf->geo.raid_disks % conf->geo.near_copies) |
| 3308 | blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks); | 3471 | blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); |
| 3309 | else | 3472 | else |
| 3310 | blk_queue_io_opt(mddev->queue, chunk_size * | 3473 | blk_queue_io_opt(mddev->queue, chunk_size * |
| 3311 | (conf->raid_disks / conf->near_copies)); | 3474 | (conf->geo.raid_disks / conf->geo.near_copies)); |
| 3312 | 3475 | ||
| 3313 | rdev_for_each(rdev, mddev) { | 3476 | rdev_for_each(rdev, mddev) { |
| 3477 | long long diff; | ||
| 3314 | 3478 | ||
| 3315 | disk_idx = rdev->raid_disk; | 3479 | disk_idx = rdev->raid_disk; |
| 3316 | if (disk_idx >= conf->raid_disks | 3480 | if (disk_idx < 0) |
| 3317 | || disk_idx < 0) | 3481 | continue; |
| 3482 | if (disk_idx >= conf->geo.raid_disks && | ||
| 3483 | disk_idx >= conf->prev.raid_disks) | ||
| 3318 | continue; | 3484 | continue; |
| 3319 | disk = conf->mirrors + disk_idx; | 3485 | disk = conf->mirrors + disk_idx; |
| 3320 | 3486 | ||
| @@ -3327,12 +3493,20 @@ static int run(struct mddev *mddev) | |||
| 3327 | goto out_free_conf; | 3493 | goto out_free_conf; |
| 3328 | disk->rdev = rdev; | 3494 | disk->rdev = rdev; |
| 3329 | } | 3495 | } |
| 3496 | diff = (rdev->new_data_offset - rdev->data_offset); | ||
| 3497 | if (!mddev->reshape_backwards) | ||
| 3498 | diff = -diff; | ||
| 3499 | if (diff < 0) | ||
| 3500 | diff = 0; | ||
| 3501 | if (first || diff < min_offset_diff) | ||
| 3502 | min_offset_diff = diff; | ||
| 3330 | 3503 | ||
| 3331 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 3504 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
| 3332 | rdev->data_offset << 9); | 3505 | rdev->data_offset << 9); |
| 3333 | 3506 | ||
| 3334 | disk->head_position = 0; | 3507 | disk->head_position = 0; |
| 3335 | } | 3508 | } |
| 3509 | |||
| 3336 | /* need to check that every block has at least one working mirror */ | 3510 | /* need to check that every block has at least one working mirror */ |
| 3337 | if (!enough(conf, -1)) { | 3511 | if (!enough(conf, -1)) { |
| 3338 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", | 3512 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", |
| @@ -3340,8 +3514,21 @@ static int run(struct mddev *mddev) | |||
| 3340 | goto out_free_conf; | 3514 | goto out_free_conf; |
| 3341 | } | 3515 | } |
| 3342 | 3516 | ||
| 3517 | if (conf->reshape_progress != MaxSector) { | ||
| 3518 | /* must ensure that shape change is supported */ | ||
| 3519 | if (conf->geo.far_copies != 1 && | ||
| 3520 | conf->geo.far_offset == 0) | ||
| 3521 | goto out_free_conf; | ||
| 3522 | if (conf->prev.far_copies != 1 && | ||
| 3523 | conf->geo.far_offset == 0) | ||
| 3524 | goto out_free_conf; | ||
| 3525 | } | ||
| 3526 | |||
| 3343 | mddev->degraded = 0; | 3527 | mddev->degraded = 0; |
| 3344 | for (i = 0; i < conf->raid_disks; i++) { | 3528 | for (i = 0; |
| 3529 | i < conf->geo.raid_disks | ||
| 3530 | || i < conf->prev.raid_disks; | ||
| 3531 | i++) { | ||
| 3345 | 3532 | ||
| 3346 | disk = conf->mirrors + i; | 3533 | disk = conf->mirrors + i; |
| 3347 | 3534 | ||
| @@ -3368,8 +3555,8 @@ static int run(struct mddev *mddev) | |||
| 3368 | mdname(mddev)); | 3555 | mdname(mddev)); |
| 3369 | printk(KERN_INFO | 3556 | printk(KERN_INFO |
| 3370 | "md/raid10:%s: active with %d out of %d devices\n", | 3557 | "md/raid10:%s: active with %d out of %d devices\n", |
| 3371 | mdname(mddev), conf->raid_disks - mddev->degraded, | 3558 | mdname(mddev), conf->geo.raid_disks - mddev->degraded, |
| 3372 | conf->raid_disks); | 3559 | conf->geo.raid_disks); |
| 3373 | /* | 3560 | /* |
| 3374 | * Ok, everything is just fine now | 3561 | * Ok, everything is just fine now |
| 3375 | */ | 3562 | */ |
| @@ -3386,11 +3573,11 @@ static int run(struct mddev *mddev) | |||
| 3386 | * maybe... | 3573 | * maybe... |
| 3387 | */ | 3574 | */ |
| 3388 | { | 3575 | { |
| 3389 | int stripe = conf->raid_disks * | 3576 | int stripe = conf->geo.raid_disks * |
| 3390 | ((mddev->chunk_sectors << 9) / PAGE_SIZE); | 3577 | ((mddev->chunk_sectors << 9) / PAGE_SIZE); |
| 3391 | stripe /= conf->near_copies; | 3578 | stripe /= conf->geo.near_copies; |
| 3392 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) | 3579 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) |
| 3393 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; | 3580 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; |
| 3394 | } | 3581 | } |
| 3395 | 3582 | ||
| 3396 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); | 3583 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); |
| @@ -3398,6 +3585,30 @@ static int run(struct mddev *mddev) | |||
| 3398 | if (md_integrity_register(mddev)) | 3585 | if (md_integrity_register(mddev)) |
| 3399 | goto out_free_conf; | 3586 | goto out_free_conf; |
| 3400 | 3587 | ||
| 3588 | if (conf->reshape_progress != MaxSector) { | ||
| 3589 | unsigned long before_length, after_length; | ||
| 3590 | |||
| 3591 | before_length = ((1 << conf->prev.chunk_shift) * | ||
| 3592 | conf->prev.far_copies); | ||
| 3593 | after_length = ((1 << conf->geo.chunk_shift) * | ||
| 3594 | conf->geo.far_copies); | ||
| 3595 | |||
| 3596 | if (max(before_length, after_length) > min_offset_diff) { | ||
| 3597 | /* This cannot work */ | ||
| 3598 | printk("md/raid10: offset difference not enough to continue reshape\n"); | ||
| 3599 | goto out_free_conf; | ||
| 3600 | } | ||
| 3601 | conf->offset_diff = min_offset_diff; | ||
| 3602 | |||
| 3603 | conf->reshape_safe = conf->reshape_progress; | ||
| 3604 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | ||
| 3605 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | ||
| 3606 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | ||
| 3607 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
| 3608 | mddev->sync_thread = md_register_thread(md_do_sync, mddev, | ||
| 3609 | "reshape"); | ||
| 3610 | } | ||
| 3611 | |||
| 3401 | return 0; | 3612 | return 0; |
| 3402 | 3613 | ||
| 3403 | out_free_conf: | 3614 | out_free_conf: |
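When run() finds a reshape recorded in the metadata it restarts the reshape thread, but only after checking that the smallest gap between old and new data offsets across the members (min_offset_diff, gathered in the rdev loop earlier) is large enough; as I read the check, the gap must be at least the chunk span of either geometry, i.e. chunk size times far copies. A small sketch of that headroom test with made-up numbers:

#include <stdio.h>

int main(void)
{
        /* all values below are invented for illustration */
        unsigned long prev_chunk = 1024, prev_far_copies = 1;   /* old geometry */
        unsigned long new_chunk = 1024, new_far_copies = 1;     /* new geometry */
        unsigned long min_offset_diff = 2048;   /* smallest new-vs-old offset gap */

        unsigned long before = prev_chunk * prev_far_copies;
        unsigned long after = new_chunk * new_far_copies;
        unsigned long need = before > after ? before : after;

        if (need > min_offset_diff)
                printf("offset difference not enough to continue reshape\n");
        else
                printf("reshape can proceed\n");
        return 0;
}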
| @@ -3460,14 +3671,23 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) | |||
| 3460 | struct r10conf *conf = mddev->private; | 3671 | struct r10conf *conf = mddev->private; |
| 3461 | sector_t oldsize, size; | 3672 | sector_t oldsize, size; |
| 3462 | 3673 | ||
| 3463 | if (conf->far_copies > 1 && !conf->far_offset) | 3674 | if (mddev->reshape_position != MaxSector) |
| 3675 | return -EBUSY; | ||
| 3676 | |||
| 3677 | if (conf->geo.far_copies > 1 && !conf->geo.far_offset) | ||
| 3464 | return -EINVAL; | 3678 | return -EINVAL; |
| 3465 | 3679 | ||
| 3466 | oldsize = raid10_size(mddev, 0, 0); | 3680 | oldsize = raid10_size(mddev, 0, 0); |
| 3467 | size = raid10_size(mddev, sectors, 0); | 3681 | size = raid10_size(mddev, sectors, 0); |
| 3468 | md_set_array_sectors(mddev, size); | 3682 | if (mddev->external_size && |
| 3469 | if (mddev->array_sectors > size) | 3683 | mddev->array_sectors > size) |
| 3470 | return -EINVAL; | 3684 | return -EINVAL; |
| 3685 | if (mddev->bitmap) { | ||
| 3686 | int ret = bitmap_resize(mddev->bitmap, size, 0, 0); | ||
| 3687 | if (ret) | ||
| 3688 | return ret; | ||
| 3689 | } | ||
| 3690 | md_set_array_sectors(mddev, size); | ||
| 3471 | set_capacity(mddev->gendisk, mddev->array_sectors); | 3691 | set_capacity(mddev->gendisk, mddev->array_sectors); |
| 3472 | revalidate_disk(mddev->gendisk); | 3692 | revalidate_disk(mddev->gendisk); |
| 3473 | if (sectors > mddev->dev_sectors && | 3693 | if (sectors > mddev->dev_sectors && |
| @@ -3534,6 +3754,758 @@ static void *raid10_takeover(struct mddev *mddev) | |||
| 3534 | return ERR_PTR(-EINVAL); | 3754 | return ERR_PTR(-EINVAL); |
| 3535 | } | 3755 | } |
| 3536 | 3756 | ||
| 3757 | static int raid10_check_reshape(struct mddev *mddev) | ||
| 3758 | { | ||
| 3759 | /* Called when there is a request to change | ||
| 3760 | * - layout (to ->new_layout) | ||
| 3761 | * - chunk size (to ->new_chunk_sectors) | ||
| 3762 | * - raid_disks (by delta_disks) | ||
| 3763 | * or when trying to restart a reshape that was ongoing. | ||
| 3764 | * | ||
| 3765 | * We need to validate the request and possibly allocate | ||
| 3766 | * space if that might be an issue later. | ||
| 3767 | * | ||
| 3768 | * Currently we reject any reshape of a 'far' mode array, | ||
| 3769 | * allow chunk size to change if new is generally acceptable, | ||
| 3770 | * allow raid_disks to increase, and allow | ||
| 3771 | * a switch between 'near' mode and 'offset' mode. | ||
| 3772 | */ | ||
| 3773 | struct r10conf *conf = mddev->private; | ||
| 3774 | struct geom geo; | ||
| 3775 | |||
| 3776 | if (conf->geo.far_copies != 1 && !conf->geo.far_offset) | ||
| 3777 | return -EINVAL; | ||
| 3778 | |||
| 3779 | if (setup_geo(&geo, mddev, geo_start) != conf->copies) | ||
| 3780 | /* mustn't change number of copies */ | ||
| 3781 | return -EINVAL; | ||
| 3782 | if (geo.far_copies > 1 && !geo.far_offset) | ||
| 3783 | /* Cannot switch to 'far' mode */ | ||
| 3784 | return -EINVAL; | ||
| 3785 | |||
| 3786 | if (mddev->array_sectors & geo.chunk_mask) | ||
| 3787 | /* not factor of array size */ | ||
| 3788 | return -EINVAL; | ||
| 3789 | |||
| 3790 | if (!enough(conf, -1)) | ||
| 3791 | return -EINVAL; | ||
| 3792 | |||
| 3793 | kfree(conf->mirrors_new); | ||
| 3794 | conf->mirrors_new = NULL; | ||
| 3795 | if (mddev->delta_disks > 0) { | ||
| 3796 | /* allocate new 'mirrors' list */ | ||
| 3797 | conf->mirrors_new = kzalloc( | ||
| 3798 | sizeof(struct mirror_info) | ||
| 3799 | *(mddev->raid_disks + | ||
| 3800 | mddev->delta_disks), | ||
| 3801 | GFP_KERNEL); | ||
| 3802 | if (!conf->mirrors_new) | ||
| 3803 | return -ENOMEM; | ||
| 3804 | } | ||
| 3805 | return 0; | ||
| 3806 | } | ||
| 3807 | |||
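As an illustration of the rules enforced above (example values invented, not from the patch): a 4-device near-2 array may grow to 6 devices or change its chunk size, because setup_geo() still reports 2 copies, the array size is a multiple of the new chunk size, and enough() still holds; a request that would change the copy count (say near-2 to near-3) or move the array into 'far' mode is rejected with -EINVAL. From userspace such a request would typically originate from something like `mdadm --grow /dev/md0 --raid-devices=6` (command shown only as an illustration of where the request comes from).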
| 3808 | /* | ||
| 3809 | * Need to check if array has failed when deciding whether to: | ||
| 3810 | * - start an array | ||
| 3811 | * - remove non-faulty devices | ||
| 3812 | * - add a spare | ||
| 3813 | * - allow a reshape | ||
| 3814 | * This determination is simple when no reshape is happening. | ||
| 3815 | * However if there is a reshape, we need to carefully check | ||
| 3816 | * both the before and after sections. | ||
| 3817 | * This is because some failed devices may only affect one | ||
| 3818 | * of the two sections, and some non-in_sync devices may | ||
| 3819 | * be insync in the section most affected by failed devices. | ||
| 3820 | */ | ||
| 3821 | static int calc_degraded(struct r10conf *conf) | ||
| 3822 | { | ||
| 3823 | int degraded, degraded2; | ||
| 3824 | int i; | ||
| 3825 | |||
| 3826 | rcu_read_lock(); | ||
| 3827 | degraded = 0; | ||
| 3828 | /* 'prev' section first */ | ||
| 3829 | for (i = 0; i < conf->prev.raid_disks; i++) { | ||
| 3830 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); | ||
| 3831 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
| 3832 | degraded++; | ||
| 3833 | else if (!test_bit(In_sync, &rdev->flags)) | ||
| 3834 | /* When we can reduce the number of devices in | ||
| 3835 | * an array, this might not contribute to | ||
| 3836 | * 'degraded'. It does now. | ||
| 3837 | */ | ||
| 3838 | degraded++; | ||
| 3839 | } | ||
| 3840 | rcu_read_unlock(); | ||
| 3841 | if (conf->geo.raid_disks == conf->prev.raid_disks) | ||
| 3842 | return degraded; | ||
| 3843 | rcu_read_lock(); | ||
| 3844 | degraded2 = 0; | ||
| 3845 | for (i = 0; i < conf->geo.raid_disks; i++) { | ||
| 3846 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); | ||
| 3847 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
| 3848 | degraded2++; | ||
| 3849 | else if (!test_bit(In_sync, &rdev->flags)) { | ||
| 3850 | /* If reshape is increasing the number of devices, | ||
| 3851 | * this section has already been recovered, so | ||
| 3852 | * it doesn't contribute to degraded. | ||
| 3853 | * else it does. | ||
| 3854 | */ | ||
| 3855 | if (conf->geo.raid_disks <= conf->prev.raid_disks) | ||
| 3856 | degraded2++; | ||
| 3857 | } | ||
| 3858 | } | ||
| 3859 | rcu_read_unlock(); | ||
| 3860 | if (degraded2 > degraded) | ||
| 3861 | return degraded2; | ||
| 3862 | return degraded; | ||
| 3863 | } | ||
| 3864 | |||
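A worked (invented) example of the two-pass count: reshaping from prev.raid_disks = 4 to geo.raid_disks = 6 with one Faulty original member. The first pass over the 4 'prev' slots counts the Faulty device, so degraded = 1. The second pass over the 6 'geo' slots counts it again (degraded2 = 1); a device in slots 4-5 that is present but not yet In_sync is not counted, because with geo.raid_disks > prev.raid_disks that region is populated by the reshape itself. The function returns max(1, 1) = 1.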
| 3865 | static int raid10_start_reshape(struct mddev *mddev) | ||
| 3866 | { | ||
| 3867 | /* A 'reshape' has been requested. This commits | ||
| 3868 | * the various 'new' fields and sets MD_RECOVER_RESHAPE | ||
| 3869 | * This also checks if there are enough spares and adds them | ||
| 3870 | * to the array. | ||
| 3871 | * We currently require enough spares to make the final | ||
| 3872 | * array non-degraded. We also require that the difference | ||
| 3873 | * between old and new data_offset - on each device - is | ||
| 3874 | * enough that we never risk over-writing. | ||
| 3875 | */ | ||
| 3876 | |||
| 3877 | unsigned long before_length, after_length; | ||
| 3878 | sector_t min_offset_diff = 0; | ||
| 3879 | int first = 1; | ||
| 3880 | struct geom new; | ||
| 3881 | struct r10conf *conf = mddev->private; | ||
| 3882 | struct md_rdev *rdev; | ||
| 3883 | int spares = 0; | ||
| 3884 | int ret; | ||
| 3885 | |||
| 3886 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | ||
| 3887 | return -EBUSY; | ||
| 3888 | |||
| 3889 | if (setup_geo(&new, mddev, geo_start) != conf->copies) | ||
| 3890 | return -EINVAL; | ||
| 3891 | |||
| 3892 | before_length = ((1 << conf->prev.chunk_shift) * | ||
| 3893 | conf->prev.far_copies); | ||
| 3894 | after_length = ((1 << conf->geo.chunk_shift) * | ||
| 3895 | conf->geo.far_copies); | ||
| 3896 | |||
| 3897 | rdev_for_each(rdev, mddev) { | ||
| 3898 | if (!test_bit(In_sync, &rdev->flags) | ||
| 3899 | && !test_bit(Faulty, &rdev->flags)) | ||
| 3900 | spares++; | ||
| 3901 | if (rdev->raid_disk >= 0) { | ||
| 3902 | long long diff = (rdev->new_data_offset | ||
| 3903 | - rdev->data_offset); | ||
| 3904 | if (!mddev->reshape_backwards) | ||
| 3905 | diff = -diff; | ||
| 3906 | if (diff < 0) | ||
| 3907 | diff = 0; | ||
| 3908 | if (first || diff < min_offset_diff) | ||
| 3909 | min_offset_diff = diff; | ||
| 3910 | } | ||
| 3911 | } | ||
| 3912 | |||
| 3913 | if (max(before_length, after_length) > min_offset_diff) | ||
| 3914 | return -EINVAL; | ||
| 3915 | |||
| 3916 | if (spares < mddev->delta_disks) | ||
| 3917 | return -EINVAL; | ||
| 3918 | |||
| 3919 | conf->offset_diff = min_offset_diff; | ||
| 3920 | spin_lock_irq(&conf->device_lock); | ||
| 3921 | if (conf->mirrors_new) { | ||
| 3922 | memcpy(conf->mirrors_new, conf->mirrors, | ||
| 3923 | sizeof(struct mirror_info)*conf->prev.raid_disks); | ||
| 3924 | smp_mb(); | ||
| 3925 | kfree(conf->mirrors_old); /* FIXME and elsewhere */ | ||
| 3926 | conf->mirrors_old = conf->mirrors; | ||
| 3927 | conf->mirrors = conf->mirrors_new; | ||
| 3928 | conf->mirrors_new = NULL; | ||
| 3929 | } | ||
| 3930 | setup_geo(&conf->geo, mddev, geo_start); | ||
| 3931 | smp_mb(); | ||
| 3932 | if (mddev->reshape_backwards) { | ||
| 3933 | sector_t size = raid10_size(mddev, 0, 0); | ||
| 3934 | if (size < mddev->array_sectors) { | ||
| 3935 | spin_unlock_irq(&conf->device_lock); | ||
| 3936 | printk(KERN_ERR "md/raid10:%s: array size must be reduced before number of disks\n", | ||
| 3937 | mdname(mddev)); | ||
| 3938 | return -EINVAL; | ||
| 3939 | } | ||
| 3940 | mddev->resync_max_sectors = size; | ||
| 3941 | conf->reshape_progress = size; | ||
| 3942 | } else | ||
| 3943 | conf->reshape_progress = 0; | ||
| 3944 | spin_unlock_irq(&conf->device_lock); | ||
| 3945 | |||
| 3946 | if (mddev->delta_disks && mddev->bitmap) { | ||
| 3947 | ret = bitmap_resize(mddev->bitmap, | ||
| 3948 | raid10_size(mddev, 0, | ||
| 3949 | conf->geo.raid_disks), | ||
| 3950 | 0, 0); | ||
| 3951 | if (ret) | ||
| 3952 | goto abort; | ||
| 3953 | } | ||
| 3954 | if (mddev->delta_disks > 0) { | ||
| 3955 | rdev_for_each(rdev, mddev) | ||
| 3956 | if (rdev->raid_disk < 0 && | ||
| 3957 | !test_bit(Faulty, &rdev->flags)) { | ||
| 3958 | if (raid10_add_disk(mddev, rdev) == 0) { | ||
| 3959 | if (rdev->raid_disk >= | ||
| 3960 | conf->prev.raid_disks) | ||
| 3961 | set_bit(In_sync, &rdev->flags); | ||
| 3962 | else | ||
| 3963 | rdev->recovery_offset = 0; | ||
| 3964 | |||
| 3965 | if (sysfs_link_rdev(mddev, rdev)) | ||
| 3966 | /* Failure here is OK */; | ||
| 3967 | } | ||
| 3968 | } else if (rdev->raid_disk >= conf->prev.raid_disks | ||
| 3969 | && !test_bit(Faulty, &rdev->flags)) { | ||
| 3970 | /* This is a spare that was manually added */ | ||
| 3971 | set_bit(In_sync, &rdev->flags); | ||
| 3972 | } | ||
| 3973 | } | ||
| 3974 | /* When a reshape changes the number of devices, | ||
| 3975 | * ->degraded is measured against the larger of the | ||
| 3976 | * pre and post numbers. | ||
| 3977 | */ | ||
| 3978 | spin_lock_irq(&conf->device_lock); | ||
| 3979 | mddev->degraded = calc_degraded(conf); | ||
| 3980 | spin_unlock_irq(&conf->device_lock); | ||
| 3981 | mddev->raid_disks = conf->geo.raid_disks; | ||
| 3982 | mddev->reshape_position = conf->reshape_progress; | ||
| 3983 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
| 3984 | |||
| 3985 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | ||
| 3986 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | ||
| 3987 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | ||
| 3988 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
| 3989 | |||
| 3990 | mddev->sync_thread = md_register_thread(md_do_sync, mddev, | ||
| 3991 | "reshape"); | ||
| 3992 | if (!mddev->sync_thread) { | ||
| 3993 | ret = -EAGAIN; | ||
| 3994 | goto abort; | ||
| 3995 | } | ||
| 3996 | conf->reshape_checkpoint = jiffies; | ||
| 3997 | md_wakeup_thread(mddev->sync_thread); | ||
| 3998 | md_new_event(mddev); | ||
| 3999 | return 0; | ||
| 4000 | |||
| 4001 | abort: | ||
| 4002 | mddev->recovery = 0; | ||
| 4003 | spin_lock_irq(&conf->device_lock); | ||
| 4004 | conf->geo = conf->prev; | ||
| 4005 | mddev->raid_disks = conf->geo.raid_disks; | ||
| 4006 | rdev_for_each(rdev, mddev) | ||
| 4007 | rdev->new_data_offset = rdev->data_offset; | ||
| 4008 | smp_wmb(); | ||
| 4009 | conf->reshape_progress = MaxSector; | ||
| 4010 | mddev->reshape_position = MaxSector; | ||
| 4011 | spin_unlock_irq(&conf->device_lock); | ||
| 4012 | return ret; | ||
| 4013 | } | ||
| 4014 | |||
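To make the headroom test above concrete (numbers invented): with 512KiB chunks (1024 sectors, chunk_shift = 10) and far_copies = 1 both before and after, before_length = after_length = 1024 sectors, so every member must have been given at least 512KiB of difference between data_offset and new_data_offset. If any device reports a smaller min_offset_diff, the reshape is refused with -EINVAL rather than risking the copy overwriting data it has not read yet.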
| 4015 | /* Calculate the last device-address that could contain | ||
| 4016 | * any block from the chunk that includes the array-address 's' | ||
| 4017 | * and report the next address. | ||
| 4018 | * i.e. the address returned will be chunk-aligned and after | ||
| 4019 | * any data that is in the chunk containing 's'. | ||
| 4020 | */ | ||
| 4021 | static sector_t last_dev_address(sector_t s, struct geom *geo) | ||
| 4022 | { | ||
| 4023 | s = (s | geo->chunk_mask) + 1; | ||
| 4024 | s >>= geo->chunk_shift; | ||
| 4025 | s *= geo->near_copies; | ||
| 4026 | s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks); | ||
| 4027 | s *= geo->far_copies; | ||
| 4028 | s <<= geo->chunk_shift; | ||
| 4029 | return s; | ||
| 4030 | } | ||
| 4031 | |||
| 4032 | /* Calculate the first device-address that could contain | ||
| 4033 | * any block from the chunk that includes the array-address 's'. | ||
| 4034 | * This too will be the start of a chunk. | ||
| 4035 | */ | ||
| 4036 | static sector_t first_dev_address(sector_t s, struct geom *geo) | ||
| 4037 | { | ||
| 4038 | s >>= geo->chunk_shift; | ||
| 4039 | s *= geo->near_copies; | ||
| 4040 | sector_div(s, geo->raid_disks); | ||
| 4041 | s *= geo->far_copies; | ||
| 4042 | s <<= geo->chunk_shift; | ||
| 4043 | return s; | ||
| 4044 | } | ||
| 4045 | |||
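For a concrete feel of the two helpers, here is a minimal userspace sketch of the same arithmetic (the struct, the renamed functions and the geometry numbers are illustrative only, not taken from the patch):

#include <stdio.h>
#include <stdint.h>

struct geom_example {
	int raid_disks, near_copies, far_copies, chunk_shift;
	uint64_t chunk_mask;
};

/* Mirror of last_dev_address(): first device sector *after* any copy of
 * the chunk holding array sector s.
 */
static uint64_t last_dev(uint64_t s, const struct geom_example *g)
{
	s = (s | g->chunk_mask) + 1;			/* next chunk boundary */
	s >>= g->chunk_shift;				/* array chunks */
	s *= g->near_copies;				/* chunk copies, near layout */
	s = (s + g->raid_disks - 1) / g->raid_disks;	/* DIV_ROUND_UP over disks */
	s *= g->far_copies;
	return s << g->chunk_shift;			/* back to device sectors */
}

/* Mirror of first_dev_address(): first device sector that may hold a copy. */
static uint64_t first_dev(uint64_t s, const struct geom_example *g)
{
	s >>= g->chunk_shift;
	s *= g->near_copies;
	s /= g->raid_disks;				/* rounds down */
	s *= g->far_copies;
	return s << g->chunk_shift;
}

int main(void)
{
	/* 4 disks, near_copies=2, far_copies=1, 64KiB chunks (128 sectors). */
	struct geom_example g = { 4, 2, 1, 7, 127 };

	/* Array sector 1000 sits in array chunk 7; both of its copies live
	 * between device sectors 384 and 512, so this prints "384 512".
	 */
	printf("%llu %llu\n",
	       (unsigned long long)first_dev(1000, &g),
	       (unsigned long long)last_dev(1000, &g));
	return 0;
}

This is the bound that reshape_request() below relies on when deciding how far the old-layout reads and new-layout writes for one array address can reach on each member device.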
| 4046 | static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, | ||
| 4047 | int *skipped) | ||
| 4048 | { | ||
| 4049 | /* We simply copy at most one chunk (smallest of old and new) | ||
| 4050 | * at a time, possibly less if that exceeds RESYNC_PAGES, | ||
| 4051 | * or we hit a bad block or something. | ||
| 4052 | * This might mean we pause for normal IO in the middle of | ||
| 4053 | * a chunk, but that is not a problem as mddev->reshape_position | ||
| 4054 | * can record any location. | ||
| 4055 | * | ||
| 4056 | * If we will want to write to a location that isn't | ||
| 4057 | * yet recorded as 'safe' (i.e. in metadata on disk) then | ||
| 4058 | * we need to flush all reshape requests and update the metadata. | ||
| 4059 | * | ||
| 4060 | * When reshaping forwards (e.g. to more devices), we interpret | ||
| 4061 | * 'safe' as the earliest block which might not have been copied | ||
| 4062 | * down yet. We divide this by previous stripe size and multiply | ||
| 4063 | * by previous stripe length to get lowest device offset that we | ||
| 4064 | * cannot write to yet. | ||
| 4065 | * We interpret 'sector_nr' as an address that we want to write to. | ||
| 4066 | * From this we use last_dev_address() to find where we might | ||
| 4067 | * write to, and first_dev_address() on the 'safe' position. | ||
| 4068 | * If this 'next' write position is after the 'safe' position, | ||
| 4069 | * we must update the metadata to increase the 'safe' position. | ||
| 4070 | * | ||
| 4071 | * When reshaping backwards, we round in the opposite direction | ||
| 4072 | * and perform the reverse test: next write position must not be | ||
| 4073 | * less than current safe position. | ||
| 4074 | * | ||
| 4075 | * In all this the minimum difference in data offsets | ||
| 4076 | * (conf->offset_diff - always positive) allows a bit of slack, | ||
| 4077 | * so next can be after 'safe', but not by more than offset_diff | ||
| 4078 | * | ||
| 4079 | * We need to prepare all the bios here before we start any IO | ||
| 4080 | * to ensure the size we choose is acceptable to all devices. | ||
| 4081 | * That means one for each copy for write-out and an extra one for | ||
| 4082 | * read-in. | ||
| 4083 | * We store the read-in bio in ->master_bio and the others in | ||
| 4084 | * ->devs[x].bio and ->devs[x].repl_bio. | ||
| 4085 | */ | ||
| 4086 | struct r10conf *conf = mddev->private; | ||
| 4087 | struct r10bio *r10_bio; | ||
| 4088 | sector_t next, safe, last; | ||
| 4089 | int max_sectors; | ||
| 4090 | int nr_sectors; | ||
| 4091 | int s; | ||
| 4092 | struct md_rdev *rdev; | ||
| 4093 | int need_flush = 0; | ||
| 4094 | struct bio *blist; | ||
| 4095 | struct bio *bio, *read_bio; | ||
| 4096 | int sectors_done = 0; | ||
| 4097 | |||
| 4098 | if (sector_nr == 0) { | ||
| 4099 | /* If restarting in the middle, skip the initial sectors */ | ||
| 4100 | if (mddev->reshape_backwards && | ||
| 4101 | conf->reshape_progress < raid10_size(mddev, 0, 0)) { | ||
| 4102 | sector_nr = (raid10_size(mddev, 0, 0) | ||
| 4103 | - conf->reshape_progress); | ||
| 4104 | } else if (!mddev->reshape_backwards && | ||
| 4105 | conf->reshape_progress > 0) | ||
| 4106 | sector_nr = conf->reshape_progress; | ||
| 4107 | if (sector_nr) { | ||
| 4108 | mddev->curr_resync_completed = sector_nr; | ||
| 4109 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | ||
| 4110 | *skipped = 1; | ||
| 4111 | return sector_nr; | ||
| 4112 | } | ||
| 4113 | } | ||
| 4114 | |||
| 4115 | /* We don't use sector_nr to track where we are up to | ||
| 4116 | * as that doesn't work well for ->reshape_backwards. | ||
| 4117 | * So just use ->reshape_progress. | ||
| 4118 | */ | ||
| 4119 | if (mddev->reshape_backwards) { | ||
| 4120 | /* 'next' is the earliest device address that we might | ||
| 4121 | * write to for this chunk in the new layout | ||
| 4122 | */ | ||
| 4123 | next = first_dev_address(conf->reshape_progress - 1, | ||
| 4124 | &conf->geo); | ||
| 4125 | |||
| 4126 | /* 'safe' is the last device address that we might read from | ||
| 4127 | * in the old layout after a restart | ||
| 4128 | */ | ||
| 4129 | safe = last_dev_address(conf->reshape_safe - 1, | ||
| 4130 | &conf->prev); | ||
| 4131 | |||
| 4132 | if (next + conf->offset_diff < safe) | ||
| 4133 | need_flush = 1; | ||
| 4134 | |||
| 4135 | last = conf->reshape_progress - 1; | ||
| 4136 | sector_nr = last & ~(sector_t)(conf->geo.chunk_mask | ||
| 4137 | & conf->prev.chunk_mask); | ||
| 4138 | if (sector_nr + RESYNC_BLOCK_SIZE/512 < last) | ||
| 4139 | sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512; | ||
| 4140 | } else { | ||
| 4141 | /* 'next' is after the last device address that we | ||
| 4142 | * might write to for this chunk in the new layout | ||
| 4143 | */ | ||
| 4144 | next = last_dev_address(conf->reshape_progress, &conf->geo); | ||
| 4145 | |||
| 4146 | /* 'safe' is the earliest device address that we might | ||
| 4147 | * read from in the old layout after a restart | ||
| 4148 | */ | ||
| 4149 | safe = first_dev_address(conf->reshape_safe, &conf->prev); | ||
| 4150 | |||
| 4151 | /* Need to update metadata if 'next' might be beyond 'safe' | ||
| 4152 | * as that would possibly corrupt data | ||
| 4153 | */ | ||
| 4154 | if (next > safe + conf->offset_diff) | ||
| 4155 | need_flush = 1; | ||
| 4156 | |||
| 4157 | sector_nr = conf->reshape_progress; | ||
| 4158 | last = sector_nr | (conf->geo.chunk_mask | ||
| 4159 | & conf->prev.chunk_mask); | ||
| 4160 | |||
| 4161 | if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last) | ||
| 4162 | last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1; | ||
| 4163 | } | ||
| 4164 | |||
| 4165 | if (need_flush || | ||
| 4166 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { | ||
| 4167 | /* Need to update reshape_position in metadata */ | ||
| 4168 | wait_barrier(conf); | ||
| 4169 | mddev->reshape_position = conf->reshape_progress; | ||
| 4170 | if (mddev->reshape_backwards) | ||
| 4171 | mddev->curr_resync_completed = raid10_size(mddev, 0, 0) | ||
| 4172 | - conf->reshape_progress; | ||
| 4173 | else | ||
| 4174 | mddev->curr_resync_completed = conf->reshape_progress; | ||
| 4175 | conf->reshape_checkpoint = jiffies; | ||
| 4176 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
| 4177 | md_wakeup_thread(mddev->thread); | ||
| 4178 | wait_event(mddev->sb_wait, mddev->flags == 0 || | ||
| 4179 | kthread_should_stop()); | ||
| 4180 | conf->reshape_safe = mddev->reshape_position; | ||
| 4181 | allow_barrier(conf); | ||
| 4182 | } | ||
| 4183 | |||
| 4184 | read_more: | ||
| 4185 | /* Now schedule reads for blocks from sector_nr to last */ | ||
| 4186 | r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); | ||
| 4187 | raise_barrier(conf, sectors_done != 0); | ||
| 4188 | atomic_set(&r10_bio->remaining, 0); | ||
| 4189 | r10_bio->mddev = mddev; | ||
| 4190 | r10_bio->sector = sector_nr; | ||
| 4191 | set_bit(R10BIO_IsReshape, &r10_bio->state); | ||
| 4192 | r10_bio->sectors = last - sector_nr + 1; | ||
| 4193 | rdev = read_balance(conf, r10_bio, &max_sectors); | ||
| 4194 | BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state)); | ||
| 4195 | |||
| 4196 | if (!rdev) { | ||
| 4197 | /* Cannot read from here, so need to record bad blocks | ||
| 4198 | * on all the target devices. | ||
| 4199 | */ | ||
| 4200 | // FIXME | ||
| 4201 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
| 4202 | return sectors_done; | ||
| 4203 | } | ||
| 4204 | |||
| 4205 | read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); | ||
| 4206 | |||
| 4207 | read_bio->bi_bdev = rdev->bdev; | ||
| 4208 | read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr | ||
| 4209 | + rdev->data_offset); | ||
| 4210 | read_bio->bi_private = r10_bio; | ||
| 4211 | read_bio->bi_end_io = end_sync_read; | ||
| 4212 | read_bio->bi_rw = READ; | ||
| 4213 | read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
| 4214 | read_bio->bi_flags |= 1 << BIO_UPTODATE; | ||
| 4215 | read_bio->bi_vcnt = 0; | ||
| 4216 | read_bio->bi_idx = 0; | ||
| 4217 | read_bio->bi_size = 0; | ||
| 4218 | r10_bio->master_bio = read_bio; | ||
| 4219 | r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; | ||
| 4220 | |||
| 4221 | /* Now find the locations in the new layout */ | ||
| 4222 | __raid10_find_phys(&conf->geo, r10_bio); | ||
| 4223 | |||
| 4224 | blist = read_bio; | ||
| 4225 | read_bio->bi_next = NULL; | ||
| 4226 | |||
| 4227 | for (s = 0; s < conf->copies*2; s++) { | ||
| 4228 | struct bio *b; | ||
| 4229 | int d = r10_bio->devs[s/2].devnum; | ||
| 4230 | struct md_rdev *rdev2; | ||
| 4231 | if (s&1) { | ||
| 4232 | rdev2 = conf->mirrors[d].replacement; | ||
| 4233 | b = r10_bio->devs[s/2].repl_bio; | ||
| 4234 | } else { | ||
| 4235 | rdev2 = conf->mirrors[d].rdev; | ||
| 4236 | b = r10_bio->devs[s/2].bio; | ||
| 4237 | } | ||
| 4238 | if (!rdev2 || test_bit(Faulty, &rdev2->flags)) | ||
| 4239 | continue; | ||
| 4240 | b->bi_bdev = rdev2->bdev; | ||
| 4241 | b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; | ||
| 4242 | b->bi_private = r10_bio; | ||
| 4243 | b->bi_end_io = end_reshape_write; | ||
| 4244 | b->bi_rw = WRITE; | ||
| 4245 | b->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
| 4246 | b->bi_flags |= 1 << BIO_UPTODATE; | ||
| 4247 | b->bi_next = blist; | ||
| 4248 | b->bi_vcnt = 0; | ||
| 4249 | b->bi_idx = 0; | ||
| 4250 | b->bi_size = 0; | ||
| 4251 | blist = b; | ||
| 4252 | } | ||
| 4253 | |||
| 4254 | /* Now add as many pages as possible to all of these bios. */ | ||
| 4255 | |||
| 4256 | nr_sectors = 0; | ||
| 4257 | for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { | ||
| 4258 | struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page; | ||
| 4259 | int len = (max_sectors - s) << 9; | ||
| 4260 | if (len > PAGE_SIZE) | ||
| 4261 | len = PAGE_SIZE; | ||
| 4262 | for (bio = blist; bio ; bio = bio->bi_next) { | ||
| 4263 | struct bio *bio2; | ||
| 4264 | if (bio_add_page(bio, page, len, 0)) | ||
| 4265 | continue; | ||
| 4266 | |||
| 4267 | /* Didn't fit, must stop */ | ||
| 4268 | for (bio2 = blist; | ||
| 4269 | bio2 && bio2 != bio; | ||
| 4270 | bio2 = bio2->bi_next) { | ||
| 4271 | /* Remove last page from this bio */ | ||
| 4272 | bio2->bi_vcnt--; | ||
| 4273 | bio2->bi_size -= len; | ||
| 4274 | bio2->bi_flags &= ~(1<<BIO_SEG_VALID); | ||
| 4275 | } | ||
| 4276 | goto bio_full; | ||
| 4277 | } | ||
| 4278 | sector_nr += len >> 9; | ||
| 4279 | nr_sectors += len >> 9; | ||
| 4280 | } | ||
| 4281 | bio_full: | ||
| 4282 | r10_bio->sectors = nr_sectors; | ||
| 4283 | |||
| 4284 | /* Now submit the read */ | ||
| 4285 | md_sync_acct(read_bio->bi_bdev, r10_bio->sectors); | ||
| 4286 | atomic_inc(&r10_bio->remaining); | ||
| 4287 | read_bio->bi_next = NULL; | ||
| 4288 | generic_make_request(read_bio); | ||
| 4289 | sector_nr += nr_sectors; | ||
| 4290 | sectors_done += nr_sectors; | ||
| 4291 | if (sector_nr <= last) | ||
| 4292 | goto read_more; | ||
| 4293 | |||
| 4294 | /* Now that we have done the whole section we can | ||
| 4295 | * update reshape_progress | ||
| 4296 | */ | ||
| 4297 | if (mddev->reshape_backwards) | ||
| 4298 | conf->reshape_progress -= sectors_done; | ||
| 4299 | else | ||
| 4300 | conf->reshape_progress += sectors_done; | ||
| 4301 | |||
| 4302 | return sectors_done; | ||
| 4303 | } | ||
| 4304 | |||
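A small worked example of the window chosen above (sizes invented): reshaping forward from 64KiB chunks (prev.chunk_mask = 127) to 512KiB chunks (geo.chunk_mask = 1023), with sector_nr = 0 the code computes last = 0 | (1023 & 127) = 127, so one pass copies at most 128 sectors - the whole of the smaller (old) chunk - before looping back via read_more, exactly as the 'at most one chunk (smallest of old and new)' comment promises; larger windows are additionally capped to RESYNC_BLOCK_SIZE worth of data.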
| 4305 | static void end_reshape_request(struct r10bio *r10_bio); | ||
| 4306 | static int handle_reshape_read_error(struct mddev *mddev, | ||
| 4307 | struct r10bio *r10_bio); | ||
| 4308 | static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) | ||
| 4309 | { | ||
| 4310 | /* Reshape read completed. Hopefully we have a block | ||
| 4311 | * to write out. | ||
| 4312 | * If we got a read error then we do sync 1-page reads from | ||
| 4313 | * elsewhere until we find the data - or give up. | ||
| 4314 | */ | ||
| 4315 | struct r10conf *conf = mddev->private; | ||
| 4316 | int s; | ||
| 4317 | |||
| 4318 | if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) | ||
| 4319 | if (handle_reshape_read_error(mddev, r10_bio) < 0) { | ||
| 4320 | /* Reshape has been aborted */ | ||
| 4321 | md_done_sync(mddev, r10_bio->sectors, 0); | ||
| 4322 | return; | ||
| 4323 | } | ||
| 4324 | |||
| 4325 | /* We definitely have the data in the pages, schedule the | ||
| 4326 | * writes. | ||
| 4327 | */ | ||
| 4328 | atomic_set(&r10_bio->remaining, 1); | ||
| 4329 | for (s = 0; s < conf->copies*2; s++) { | ||
| 4330 | struct bio *b; | ||
| 4331 | int d = r10_bio->devs[s/2].devnum; | ||
| 4332 | struct md_rdev *rdev; | ||
| 4333 | if (s&1) { | ||
| 4334 | rdev = conf->mirrors[d].replacement; | ||
| 4335 | b = r10_bio->devs[s/2].repl_bio; | ||
| 4336 | } else { | ||
| 4337 | rdev = conf->mirrors[d].rdev; | ||
| 4338 | b = r10_bio->devs[s/2].bio; | ||
| 4339 | } | ||
| 4340 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
| 4341 | continue; | ||
| 4342 | atomic_inc(&rdev->nr_pending); | ||
| 4343 | md_sync_acct(b->bi_bdev, r10_bio->sectors); | ||
| 4344 | atomic_inc(&r10_bio->remaining); | ||
| 4345 | b->bi_next = NULL; | ||
| 4346 | generic_make_request(b); | ||
| 4347 | } | ||
| 4348 | end_reshape_request(r10_bio); | ||
| 4349 | } | ||
| 4350 | |||
| 4351 | static void end_reshape(struct r10conf *conf) | ||
| 4352 | { | ||
| 4353 | if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) | ||
| 4354 | return; | ||
| 4355 | |||
| 4356 | spin_lock_irq(&conf->device_lock); | ||
| 4357 | conf->prev = conf->geo; | ||
| 4358 | md_finish_reshape(conf->mddev); | ||
| 4359 | smp_wmb(); | ||
| 4360 | conf->reshape_progress = MaxSector; | ||
| 4361 | spin_unlock_irq(&conf->device_lock); | ||
| 4362 | |||
| 4363 | /* read-ahead size must cover two whole stripes, which is | ||
| 4364 | * 2 * (number of data disks) * chunk size | ||
| 4365 | */ | ||
| 4366 | if (conf->mddev->queue) { | ||
| 4367 | int stripe = conf->geo.raid_disks * | ||
| 4368 | ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE); | ||
| 4369 | stripe /= conf->geo.near_copies; | ||
| 4370 | if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | ||
| 4371 | conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | ||
| 4372 | } | ||
| 4373 | conf->fullsync = 0; | ||
| 4374 | } | ||
| 4375 | |||
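Putting numbers to the read-ahead rule (illustrative only): with geo.raid_disks = 6, near_copies = 2 and 512KiB chunks on 4KiB pages, one 'stripe' is 6 * (524288 / 4096) / 2 = 384 pages, so ra_pages is raised to at least 768 pages (3MiB) once the new geometry takes effect; run() applies the same computation when the array starts.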
| 4376 | |||
| 4377 | static int handle_reshape_read_error(struct mddev *mddev, | ||
| 4378 | struct r10bio *r10_bio) | ||
| 4379 | { | ||
| 4380 | /* Use sync reads to get the blocks from somewhere else */ | ||
| 4381 | int sectors = r10_bio->sectors; | ||
| 4382 | struct r10bio r10b; | ||
| 4383 | struct r10conf *conf = mddev->private; | ||
| 4384 | int slot = 0; | ||
| 4385 | int idx = 0; | ||
| 4386 | struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec; | ||
| 4387 | |||
| 4388 | r10b.sector = r10_bio->sector; | ||
| 4389 | __raid10_find_phys(&conf->prev, &r10b); | ||
| 4390 | |||
| 4391 | while (sectors) { | ||
| 4392 | int s = sectors; | ||
| 4393 | int success = 0; | ||
| 4394 | int first_slot = slot; | ||
| 4395 | |||
| 4396 | if (s > (PAGE_SIZE >> 9)) | ||
| 4397 | s = PAGE_SIZE >> 9; | ||
| 4398 | |||
| 4399 | while (!success) { | ||
| 4400 | int d = r10b.devs[slot].devnum; | ||
| 4401 | struct md_rdev *rdev = conf->mirrors[d].rdev; | ||
| 4402 | sector_t addr; | ||
| 4403 | if (rdev == NULL || | ||
| 4404 | test_bit(Faulty, &rdev->flags) || | ||
| 4405 | !test_bit(In_sync, &rdev->flags)) | ||
| 4406 | goto failed; | ||
| 4407 | |||
| 4408 | addr = r10b.devs[slot].addr + idx * PAGE_SIZE; | ||
| 4409 | success = sync_page_io(rdev, | ||
| 4410 | addr, | ||
| 4411 | s << 9, | ||
| 4412 | bvec[idx].bv_page, | ||
| 4413 | READ, false); | ||
| 4414 | if (success) | ||
| 4415 | break; | ||
| 4416 | failed: | ||
| 4417 | slot++; | ||
| 4418 | if (slot >= conf->copies) | ||
| 4419 | slot = 0; | ||
| 4420 | if (slot == first_slot) | ||
| 4421 | break; | ||
| 4422 | } | ||
| 4423 | if (!success) { | ||
| 4424 | /* couldn't read this block, must give up */ | ||
| 4425 | set_bit(MD_RECOVERY_INTR, | ||
| 4426 | &mddev->recovery); | ||
| 4427 | return -EIO; | ||
| 4428 | } | ||
| 4429 | sectors -= s; | ||
| 4430 | idx++; | ||
| 4431 | } | ||
| 4432 | return 0; | ||
| 4433 | } | ||
| 4434 | |||
| 4435 | static void end_reshape_write(struct bio *bio, int error) | ||
| 4436 | { | ||
| 4437 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
| 4438 | struct r10bio *r10_bio = bio->bi_private; | ||
| 4439 | struct mddev *mddev = r10_bio->mddev; | ||
| 4440 | struct r10conf *conf = mddev->private; | ||
| 4441 | int d; | ||
| 4442 | int slot; | ||
| 4443 | int repl; | ||
| 4444 | struct md_rdev *rdev = NULL; | ||
| 4445 | |||
| 4446 | d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); | ||
| 4447 | if (repl) | ||
| 4448 | rdev = conf->mirrors[d].replacement; | ||
| 4449 | if (!rdev) { | ||
| 4450 | smp_mb(); | ||
| 4451 | rdev = conf->mirrors[d].rdev; | ||
| 4452 | } | ||
| 4453 | |||
| 4454 | if (!uptodate) { | ||
| 4455 | /* FIXME should record badblock */ | ||
| 4456 | md_error(mddev, rdev); | ||
| 4457 | } | ||
| 4458 | |||
| 4459 | rdev_dec_pending(rdev, mddev); | ||
| 4460 | end_reshape_request(r10_bio); | ||
| 4461 | } | ||
| 4462 | |||
| 4463 | static void end_reshape_request(struct r10bio *r10_bio) | ||
| 4464 | { | ||
| 4465 | if (!atomic_dec_and_test(&r10_bio->remaining)) | ||
| 4466 | return; | ||
| 4467 | md_done_sync(r10_bio->mddev, r10_bio->sectors, 1); | ||
| 4468 | bio_put(r10_bio->master_bio); | ||
| 4469 | put_buf(r10_bio); | ||
| 4470 | } | ||
| 4471 | |||
| 4472 | static void raid10_finish_reshape(struct mddev *mddev) | ||
| 4473 | { | ||
| 4474 | struct r10conf *conf = mddev->private; | ||
| 4475 | |||
| 4476 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
| 4477 | return; | ||
| 4478 | |||
| 4479 | if (mddev->delta_disks > 0) { | ||
| 4480 | sector_t size = raid10_size(mddev, 0, 0); | ||
| 4481 | md_set_array_sectors(mddev, size); | ||
| 4482 | if (mddev->recovery_cp > mddev->resync_max_sectors) { | ||
| 4483 | mddev->recovery_cp = mddev->resync_max_sectors; | ||
| 4484 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
| 4485 | } | ||
| 4486 | mddev->resync_max_sectors = size; | ||
| 4487 | set_capacity(mddev->gendisk, mddev->array_sectors); | ||
| 4488 | revalidate_disk(mddev->gendisk); | ||
| 4489 | } else { | ||
| 4490 | int d; | ||
| 4491 | for (d = conf->geo.raid_disks ; | ||
| 4492 | d < conf->geo.raid_disks - mddev->delta_disks; | ||
| 4493 | d++) { | ||
| 4494 | struct md_rdev *rdev = conf->mirrors[d].rdev; | ||
| 4495 | if (rdev) | ||
| 4496 | clear_bit(In_sync, &rdev->flags); | ||
| 4497 | rdev = conf->mirrors[d].replacement; | ||
| 4498 | if (rdev) | ||
| 4499 | clear_bit(In_sync, &rdev->flags); | ||
| 4500 | } | ||
| 4501 | } | ||
| 4502 | mddev->layout = mddev->new_layout; | ||
| 4503 | mddev->chunk_sectors = 1 << conf->geo.chunk_shift; | ||
| 4504 | mddev->reshape_position = MaxSector; | ||
| 4505 | mddev->delta_disks = 0; | ||
| 4506 | mddev->reshape_backwards = 0; | ||
| 4507 | } | ||
| 4508 | |||
| 3537 | static struct md_personality raid10_personality = | 4509 | static struct md_personality raid10_personality = |
| 3538 | { | 4510 | { |
| 3539 | .name = "raid10", | 4511 | .name = "raid10", |
| @@ -3552,6 +4524,9 @@ static struct md_personality raid10_personality = | |||
| 3552 | .size = raid10_size, | 4524 | .size = raid10_size, |
| 3553 | .resize = raid10_resize, | 4525 | .resize = raid10_resize, |
| 3554 | .takeover = raid10_takeover, | 4526 | .takeover = raid10_takeover, |
| 4527 | .check_reshape = raid10_check_reshape, | ||
| 4528 | .start_reshape = raid10_start_reshape, | ||
| 4529 | .finish_reshape = raid10_finish_reshape, | ||
| 3555 | }; | 4530 | }; |
| 3556 | 4531 | ||
| 3557 | static int __init raid_init(void) | 4532 | static int __init raid_init(void) |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 7c615613c381..135b1b0a1554 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
| @@ -14,32 +14,38 @@ struct mirror_info { | |||
| 14 | struct r10conf { | 14 | struct r10conf { |
| 15 | struct mddev *mddev; | 15 | struct mddev *mddev; |
| 16 | struct mirror_info *mirrors; | 16 | struct mirror_info *mirrors; |
| 17 | int raid_disks; | 17 | struct mirror_info *mirrors_new, *mirrors_old; |
| 18 | spinlock_t device_lock; | 18 | spinlock_t device_lock; |
| 19 | 19 | ||
| 20 | /* geometry */ | 20 | /* geometry */ |
| 21 | int near_copies; /* number of copies laid out | 21 | struct geom { |
| 22 | int raid_disks; | ||
| 23 | int near_copies; /* number of copies laid out | ||
| 22 | * raid0 style */ | 24 | * raid0 style */ |
| 23 | int far_copies; /* number of copies laid out | 25 | int far_copies; /* number of copies laid out |
| 24 | * at large strides across drives | 26 | * at large strides across drives |
| 25 | */ | 27 | */ |
| 26 | int far_offset; /* far_copies are offset by 1 | 28 | int far_offset; /* far_copies are offset by 1 |
| 27 | * stripe instead of many | 29 | * stripe instead of many |
| 28 | */ | 30 | */ |
| 29 | int copies; /* near_copies * far_copies. | 31 | sector_t stride; /* distance between far copies. |
| 30 | * must be <= raid_disks | ||
| 31 | */ | ||
| 32 | sector_t stride; /* distance between far copies. | ||
| 33 | * This is size / far_copies unless | 32 | * This is size / far_copies unless |
| 34 | * far_offset, in which case it is | 33 | * far_offset, in which case it is |
| 35 | * 1 stripe. | 34 | * 1 stripe. |
| 36 | */ | 35 | */ |
| 36 | int chunk_shift; /* shift from chunks to sectors */ | ||
| 37 | sector_t chunk_mask; | ||
| 38 | } prev, geo; | ||
| 39 | int copies; /* near_copies * far_copies. | ||
| 40 | * must be <= raid_disks | ||
| 41 | */ | ||
| 37 | 42 | ||
| 38 | sector_t dev_sectors; /* temp copy of | 43 | sector_t dev_sectors; /* temp copy of |
| 39 | * mddev->dev_sectors */ | 44 | * mddev->dev_sectors */ |
| 40 | 45 | sector_t reshape_progress; | |
| 41 | int chunk_shift; /* shift from chunks to sectors */ | 46 | sector_t reshape_safe; |
| 42 | sector_t chunk_mask; | 47 | unsigned long reshape_checkpoint; |
| 48 | sector_t offset_diff; | ||
| 43 | 49 | ||
| 44 | struct list_head retry_list; | 50 | struct list_head retry_list; |
| 45 | /* queue pending writes and submit them on unplug */ | 51 | /* queue pending writes and submit them on unplug */ |
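To put the embedded geometry in concrete terms (example values invented): with raid_disks = 4, near_copies = 2 and far_copies = 1, array chunk 0 is stored on devices 0 and 1 and chunk 1 on devices 2 and 3, raid0-style; raising far_copies to 2 adds a second copy of every chunk a 'stride' further into each device (per the comments above, stride is size / far_copies, or a single stripe when far_offset is set). A reshape needs two such descriptions at once - 'prev' for the part of the array not yet converted and 'geo' for the part already laid out the new way - which is why these fields move from r10conf into the embedded struct geom.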
| @@ -136,6 +142,7 @@ enum r10bio_state { | |||
| 136 | R10BIO_Uptodate, | 142 | R10BIO_Uptodate, |
| 137 | R10BIO_IsSync, | 143 | R10BIO_IsSync, |
| 138 | R10BIO_IsRecover, | 144 | R10BIO_IsRecover, |
| 145 | R10BIO_IsReshape, | ||
| 139 | R10BIO_Degraded, | 146 | R10BIO_Degraded, |
| 140 | /* Set ReadError on bios that experience a read error | 147 | /* Set ReadError on bios that experience a read error |
| 141 | * so that raid10d knows what to do with them. | 148 | * so that raid10d knows what to do with them. |
| @@ -146,5 +153,10 @@ enum r10bio_state { | |||
| 146 | */ | 153 | */ |
| 147 | R10BIO_MadeGood, | 154 | R10BIO_MadeGood, |
| 148 | R10BIO_WriteError, | 155 | R10BIO_WriteError, |
| 156 | /* During a reshape we might be performing IO on the | ||
| 157 | * 'previous' part of the array, in which case this | ||
| 158 | * flag is set | ||
| 159 | */ | ||
| 160 | R10BIO_Previous, | ||
| 149 | }; | 161 | }; |
| 150 | #endif | 162 | #endif |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f351422938e0..d26767246d26 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -488,6 +488,27 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
| 488 | return sh; | 488 | return sh; |
| 489 | } | 489 | } |
| 490 | 490 | ||
| 491 | /* Determine if 'data_offset' or 'new_data_offset' should be used | ||
| 492 | * in this stripe_head. | ||
| 493 | */ | ||
| 494 | static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) | ||
| 495 | { | ||
| 496 | sector_t progress = conf->reshape_progress; | ||
| 497 | /* Need a memory barrier to make sure we see the value | ||
| 498 | * of conf->generation, or ->data_offset that was set before | ||
| 499 | * reshape_progress was updated. | ||
| 500 | */ | ||
| 501 | smp_rmb(); | ||
| 502 | if (progress == MaxSector) | ||
| 503 | return 0; | ||
| 504 | if (sh->generation == conf->generation - 1) | ||
| 505 | return 0; | ||
| 506 | /* We are in a reshape, and this is a new-generation stripe, | ||
| 507 | * so use new_data_offset. | ||
| 508 | */ | ||
| 509 | return 1; | ||
| 510 | } | ||
| 511 | |||
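A hypothetical illustration of the choice above: suppose a reshape has moved each member's data area so that new_data_offset differs from data_offset by 2048 sectors. A stripe_head created under the old geometry carries generation == conf->generation - 1 and keeps issuing I/O at sh->sector + data_offset, while a stripe created after the geometry switch uses sh->sector + new_data_offset; the smp_rmb() ensures that once reshape_progress is seen as active, the matching generation and offset values are seen as well.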
| 491 | static void | 512 | static void |
| 492 | raid5_end_read_request(struct bio *bi, int error); | 513 | raid5_end_read_request(struct bio *bi, int error); |
| 493 | static void | 514 | static void |
| @@ -518,6 +539,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 518 | replace_only = 1; | 539 | replace_only = 1; |
| 519 | } else | 540 | } else |
| 520 | continue; | 541 | continue; |
| 542 | if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) | ||
| 543 | rw |= REQ_SYNC; | ||
| 521 | 544 | ||
| 522 | bi = &sh->dev[i].req; | 545 | bi = &sh->dev[i].req; |
| 523 | rbi = &sh->dev[i].rreq; /* For writing to replacement */ | 546 | rbi = &sh->dev[i].rreq; /* For writing to replacement */ |
| @@ -603,7 +626,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 603 | __func__, (unsigned long long)sh->sector, | 626 | __func__, (unsigned long long)sh->sector, |
| 604 | bi->bi_rw, i); | 627 | bi->bi_rw, i); |
| 605 | atomic_inc(&sh->count); | 628 | atomic_inc(&sh->count); |
| 606 | bi->bi_sector = sh->sector + rdev->data_offset; | 629 | if (use_new_offset(conf, sh)) |
| 630 | bi->bi_sector = (sh->sector | ||
| 631 | + rdev->new_data_offset); | ||
| 632 | else | ||
| 633 | bi->bi_sector = (sh->sector | ||
| 634 | + rdev->data_offset); | ||
| 607 | bi->bi_flags = 1 << BIO_UPTODATE; | 635 | bi->bi_flags = 1 << BIO_UPTODATE; |
| 608 | bi->bi_idx = 0; | 636 | bi->bi_idx = 0; |
| 609 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 637 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
| @@ -627,7 +655,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 627 | __func__, (unsigned long long)sh->sector, | 655 | __func__, (unsigned long long)sh->sector, |
| 628 | rbi->bi_rw, i); | 656 | rbi->bi_rw, i); |
| 629 | atomic_inc(&sh->count); | 657 | atomic_inc(&sh->count); |
| 630 | rbi->bi_sector = sh->sector + rrdev->data_offset; | 658 | if (use_new_offset(conf, sh)) |
| 659 | rbi->bi_sector = (sh->sector | ||
| 660 | + rrdev->new_data_offset); | ||
| 661 | else | ||
| 662 | rbi->bi_sector = (sh->sector | ||
| 663 | + rrdev->data_offset); | ||
| 631 | rbi->bi_flags = 1 << BIO_UPTODATE; | 664 | rbi->bi_flags = 1 << BIO_UPTODATE; |
| 632 | rbi->bi_idx = 0; | 665 | rbi->bi_idx = 0; |
| 633 | rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 666 | rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
| @@ -1114,6 +1147,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
| 1114 | dev->sector + STRIPE_SECTORS) { | 1147 | dev->sector + STRIPE_SECTORS) { |
| 1115 | if (wbi->bi_rw & REQ_FUA) | 1148 | if (wbi->bi_rw & REQ_FUA) |
| 1116 | set_bit(R5_WantFUA, &dev->flags); | 1149 | set_bit(R5_WantFUA, &dev->flags); |
| 1150 | if (wbi->bi_rw & REQ_SYNC) | ||
| 1151 | set_bit(R5_SyncIO, &dev->flags); | ||
| 1117 | tx = async_copy_data(1, wbi, dev->page, | 1152 | tx = async_copy_data(1, wbi, dev->page, |
| 1118 | dev->sector, tx); | 1153 | dev->sector, tx); |
| 1119 | wbi = r5_next_bio(wbi, dev->sector); | 1154 | wbi = r5_next_bio(wbi, dev->sector); |
| @@ -1131,13 +1166,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref) | |||
| 1131 | int pd_idx = sh->pd_idx; | 1166 | int pd_idx = sh->pd_idx; |
| 1132 | int qd_idx = sh->qd_idx; | 1167 | int qd_idx = sh->qd_idx; |
| 1133 | int i; | 1168 | int i; |
| 1134 | bool fua = false; | 1169 | bool fua = false, sync = false; |
| 1135 | 1170 | ||
| 1136 | pr_debug("%s: stripe %llu\n", __func__, | 1171 | pr_debug("%s: stripe %llu\n", __func__, |
| 1137 | (unsigned long long)sh->sector); | 1172 | (unsigned long long)sh->sector); |
| 1138 | 1173 | ||
| 1139 | for (i = disks; i--; ) | 1174 | for (i = disks; i--; ) { |
| 1140 | fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); | 1175 | fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); |
| 1176 | sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); | ||
| 1177 | } | ||
| 1141 | 1178 | ||
| 1142 | for (i = disks; i--; ) { | 1179 | for (i = disks; i--; ) { |
| 1143 | struct r5dev *dev = &sh->dev[i]; | 1180 | struct r5dev *dev = &sh->dev[i]; |
| @@ -1146,6 +1183,8 @@ static void ops_complete_reconstruct(void *stripe_head_ref) | |||
| 1146 | set_bit(R5_UPTODATE, &dev->flags); | 1183 | set_bit(R5_UPTODATE, &dev->flags); |
| 1147 | if (fua) | 1184 | if (fua) |
| 1148 | set_bit(R5_WantFUA, &dev->flags); | 1185 | set_bit(R5_WantFUA, &dev->flags); |
| 1186 | if (sync) | ||
| 1187 | set_bit(R5_SyncIO, &dev->flags); | ||
| 1149 | } | 1188 | } |
| 1150 | } | 1189 | } |
| 1151 | 1190 | ||
| @@ -1648,7 +1687,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
| 1648 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | 1687 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); |
| 1649 | char b[BDEVNAME_SIZE]; | 1688 | char b[BDEVNAME_SIZE]; |
| 1650 | struct md_rdev *rdev = NULL; | 1689 | struct md_rdev *rdev = NULL; |
| 1651 | 1690 | sector_t s; | |
| 1652 | 1691 | ||
| 1653 | for (i=0 ; i<disks; i++) | 1692 | for (i=0 ; i<disks; i++) |
| 1654 | if (bi == &sh->dev[i].req) | 1693 | if (bi == &sh->dev[i].req) |
| @@ -1671,6 +1710,10 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
| 1671 | if (!rdev) | 1710 | if (!rdev) |
| 1672 | rdev = conf->disks[i].rdev; | 1711 | rdev = conf->disks[i].rdev; |
| 1673 | 1712 | ||
| 1713 | if (use_new_offset(conf, sh)) | ||
| 1714 | s = sh->sector + rdev->new_data_offset; | ||
| 1715 | else | ||
| 1716 | s = sh->sector + rdev->data_offset; | ||
| 1674 | if (uptodate) { | 1717 | if (uptodate) { |
| 1675 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | 1718 | set_bit(R5_UPTODATE, &sh->dev[i].flags); |
| 1676 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { | 1719 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { |
| @@ -1683,8 +1726,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
| 1683 | "md/raid:%s: read error corrected" | 1726 | "md/raid:%s: read error corrected" |
| 1684 | " (%lu sectors at %llu on %s)\n", | 1727 | " (%lu sectors at %llu on %s)\n", |
| 1685 | mdname(conf->mddev), STRIPE_SECTORS, | 1728 | mdname(conf->mddev), STRIPE_SECTORS, |
| 1686 | (unsigned long long)(sh->sector | 1729 | (unsigned long long)s, |
| 1687 | + rdev->data_offset), | ||
| 1688 | bdevname(rdev->bdev, b)); | 1730 | bdevname(rdev->bdev, b)); |
| 1689 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); | 1731 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); |
| 1690 | clear_bit(R5_ReadError, &sh->dev[i].flags); | 1732 | clear_bit(R5_ReadError, &sh->dev[i].flags); |
| @@ -1704,8 +1746,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
| 1704 | "md/raid:%s: read error on replacement device " | 1746 | "md/raid:%s: read error on replacement device " |
| 1705 | "(sector %llu on %s).\n", | 1747 | "(sector %llu on %s).\n", |
| 1706 | mdname(conf->mddev), | 1748 | mdname(conf->mddev), |
| 1707 | (unsigned long long)(sh->sector | 1749 | (unsigned long long)s, |
| 1708 | + rdev->data_offset), | ||
| 1709 | bdn); | 1750 | bdn); |
| 1710 | else if (conf->mddev->degraded >= conf->max_degraded) | 1751 | else if (conf->mddev->degraded >= conf->max_degraded) |
| 1711 | printk_ratelimited( | 1752 | printk_ratelimited( |
| @@ -1713,8 +1754,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
| 1713 | "md/raid:%s: read error not correctable " | 1754 | "md/raid:%s: read error not correctable " |
| 1714 | "(sector %llu on %s).\n", | 1755 | "(sector %llu on %s).\n", |
| 1715 | mdname(conf->mddev), | 1756 | mdname(conf->mddev), |
| 1716 | (unsigned long long)(sh->sector | 1757 | (unsigned long long)s, |
| 1717 | + rdev->data_offset), | ||
| 1718 | bdn); | 1758 | bdn); |
| 1719 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) | 1759 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) |
| 1720 | /* Oh, no!!! */ | 1760 | /* Oh, no!!! */ |
| @@ -1723,8 +1763,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
| 1723 | "md/raid:%s: read error NOT corrected!! " | 1763 | "md/raid:%s: read error NOT corrected!! " |
| 1724 | "(sector %llu on %s).\n", | 1764 | "(sector %llu on %s).\n", |
| 1725 | mdname(conf->mddev), | 1765 | mdname(conf->mddev), |
| 1726 | (unsigned long long)(sh->sector | 1766 | (unsigned long long)s, |
| 1727 | + rdev->data_offset), | ||
| 1728 | bdn); | 1767 | bdn); |
| 1729 | else if (atomic_read(&rdev->read_errors) | 1768 | else if (atomic_read(&rdev->read_errors) |
| 1730 | > conf->max_nr_stripes) | 1769 | > conf->max_nr_stripes) |
| @@ -3561,7 +3600,7 @@ finish: | |||
| 3561 | if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { | 3600 | if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { |
| 3562 | rdev = conf->disks[i].rdev; | 3601 | rdev = conf->disks[i].rdev; |
| 3563 | rdev_clear_badblocks(rdev, sh->sector, | 3602 | rdev_clear_badblocks(rdev, sh->sector, |
| 3564 | STRIPE_SECTORS); | 3603 | STRIPE_SECTORS, 0); |
| 3565 | rdev_dec_pending(rdev, conf->mddev); | 3604 | rdev_dec_pending(rdev, conf->mddev); |
| 3566 | } | 3605 | } |
| 3567 | if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { | 3606 | if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { |
| @@ -3570,7 +3609,7 @@ finish: | |||
| 3570 | /* rdev have been moved down */ | 3609 | /* rdev have been moved down */ |
| 3571 | rdev = conf->disks[i].rdev; | 3610 | rdev = conf->disks[i].rdev; |
| 3572 | rdev_clear_badblocks(rdev, sh->sector, | 3611 | rdev_clear_badblocks(rdev, sh->sector, |
| 3573 | STRIPE_SECTORS); | 3612 | STRIPE_SECTORS, 0); |
| 3574 | rdev_dec_pending(rdev, conf->mddev); | 3613 | rdev_dec_pending(rdev, conf->mddev); |
| 3575 | } | 3614 | } |
| 3576 | } | 3615 | } |
| @@ -3842,6 +3881,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) | |||
| 3842 | raid_bio->bi_next = (void*)rdev; | 3881 | raid_bio->bi_next = (void*)rdev; |
| 3843 | align_bi->bi_bdev = rdev->bdev; | 3882 | align_bi->bi_bdev = rdev->bdev; |
| 3844 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); | 3883 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); |
| 3884 | /* No reshape active, so we can trust rdev->data_offset */ | ||
| 3845 | align_bi->bi_sector += rdev->data_offset; | 3885 | align_bi->bi_sector += rdev->data_offset; |
| 3846 | 3886 | ||
| 3847 | if (!bio_fits_rdev(align_bi) || | 3887 | if (!bio_fits_rdev(align_bi) || |
| @@ -3953,12 +3993,10 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
| 3953 | plugged = mddev_check_plugged(mddev); | 3993 | plugged = mddev_check_plugged(mddev); |
| 3954 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { | 3994 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { |
| 3955 | DEFINE_WAIT(w); | 3995 | DEFINE_WAIT(w); |
| 3956 | int disks, data_disks; | ||
| 3957 | int previous; | 3996 | int previous; |
| 3958 | 3997 | ||
| 3959 | retry: | 3998 | retry: |
| 3960 | previous = 0; | 3999 | previous = 0; |
| 3961 | disks = conf->raid_disks; | ||
| 3962 | prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); | 4000 | prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); |
| 3963 | if (unlikely(conf->reshape_progress != MaxSector)) { | 4001 | if (unlikely(conf->reshape_progress != MaxSector)) { |
| 3964 | /* spinlock is needed as reshape_progress may be | 4002 | /* spinlock is needed as reshape_progress may be |
| @@ -3970,13 +4008,12 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
| 3970 | * to check again. | 4008 | * to check again. |
| 3971 | */ | 4009 | */ |
| 3972 | spin_lock_irq(&conf->device_lock); | 4010 | spin_lock_irq(&conf->device_lock); |
| 3973 | if (mddev->delta_disks < 0 | 4011 | if (mddev->reshape_backwards |
| 3974 | ? logical_sector < conf->reshape_progress | 4012 | ? logical_sector < conf->reshape_progress |
| 3975 | : logical_sector >= conf->reshape_progress) { | 4013 | : logical_sector >= conf->reshape_progress) { |
| 3976 | disks = conf->previous_raid_disks; | ||
| 3977 | previous = 1; | 4014 | previous = 1; |
| 3978 | } else { | 4015 | } else { |
| 3979 | if (mddev->delta_disks < 0 | 4016 | if (mddev->reshape_backwards |
| 3980 | ? logical_sector < conf->reshape_safe | 4017 | ? logical_sector < conf->reshape_safe |
| 3981 | : logical_sector >= conf->reshape_safe) { | 4018 | : logical_sector >= conf->reshape_safe) { |
| 3982 | spin_unlock_irq(&conf->device_lock); | 4019 | spin_unlock_irq(&conf->device_lock); |
| @@ -3986,7 +4023,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
| 3986 | } | 4023 | } |
| 3987 | spin_unlock_irq(&conf->device_lock); | 4024 | spin_unlock_irq(&conf->device_lock); |
| 3988 | } | 4025 | } |
| 3989 | data_disks = disks - conf->max_degraded; | ||
| 3990 | 4026 | ||
| 3991 | new_sector = raid5_compute_sector(conf, logical_sector, | 4027 | new_sector = raid5_compute_sector(conf, logical_sector, |
| 3992 | previous, | 4028 | previous, |
| @@ -4009,7 +4045,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
| 4009 | */ | 4045 | */ |
| 4010 | int must_retry = 0; | 4046 | int must_retry = 0; |
| 4011 | spin_lock_irq(&conf->device_lock); | 4047 | spin_lock_irq(&conf->device_lock); |
| 4012 | if (mddev->delta_disks < 0 | 4048 | if (mddev->reshape_backwards |
| 4013 | ? logical_sector >= conf->reshape_progress | 4049 | ? logical_sector >= conf->reshape_progress |
| 4014 | : logical_sector < conf->reshape_progress) | 4050 | : logical_sector < conf->reshape_progress) |
| 4015 | /* mismatch, need to try again */ | 4051 | /* mismatch, need to try again */ |
| @@ -4108,11 +4144,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
| 4108 | 4144 | ||
| 4109 | if (sector_nr == 0) { | 4145 | if (sector_nr == 0) { |
| 4110 | /* If restarting in the middle, skip the initial sectors */ | 4146 | /* If restarting in the middle, skip the initial sectors */ |
| 4111 | if (mddev->delta_disks < 0 && | 4147 | if (mddev->reshape_backwards && |
| 4112 | conf->reshape_progress < raid5_size(mddev, 0, 0)) { | 4148 | conf->reshape_progress < raid5_size(mddev, 0, 0)) { |
| 4113 | sector_nr = raid5_size(mddev, 0, 0) | 4149 | sector_nr = raid5_size(mddev, 0, 0) |
| 4114 | - conf->reshape_progress; | 4150 | - conf->reshape_progress; |
| 4115 | } else if (mddev->delta_disks >= 0 && | 4151 | } else if (!mddev->reshape_backwards && |
| 4116 | conf->reshape_progress > 0) | 4152 | conf->reshape_progress > 0) |
| 4117 | sector_nr = conf->reshape_progress; | 4153 | sector_nr = conf->reshape_progress; |
| 4118 | sector_div(sector_nr, new_data_disks); | 4154 | sector_div(sector_nr, new_data_disks); |
| @@ -4133,13 +4169,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
| 4133 | else | 4169 | else |
| 4134 | reshape_sectors = mddev->chunk_sectors; | 4170 | reshape_sectors = mddev->chunk_sectors; |
| 4135 | 4171 | ||
| 4136 | /* we update the metadata when there is more than 3Meg | 4172 | /* We update the metadata at least every 10 seconds, or when |
| 4137 | * in the block range (that is rather arbitrary, should | 4173 | * the data about to be copied would over-write the source of |
| 4138 | * probably be time based) or when the data about to be | 4174 | * the data at the front of the range. i.e. one new_stripe |
| 4139 | * copied would over-write the source of the data at | 4175 | * along from reshape_progress new_maps to after where |
| 4140 | * the front of the range. | 4176 | * reshape_safe old_maps to |
| 4141 | * i.e. one new_stripe along from reshape_progress new_maps | ||
| 4142 | * to after where reshape_safe old_maps to | ||
| 4143 | */ | 4177 | */ |
| 4144 | writepos = conf->reshape_progress; | 4178 | writepos = conf->reshape_progress; |
| 4145 | sector_div(writepos, new_data_disks); | 4179 | sector_div(writepos, new_data_disks); |
| @@ -4147,7 +4181,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
| 4147 | sector_div(readpos, data_disks); | 4181 | sector_div(readpos, data_disks); |
| 4148 | safepos = conf->reshape_safe; | 4182 | safepos = conf->reshape_safe; |
| 4149 | sector_div(safepos, data_disks); | 4183 | sector_div(safepos, data_disks); |
| 4150 | if (mddev->delta_disks < 0) { | 4184 | if (mddev->reshape_backwards) { |
| 4151 | writepos -= min_t(sector_t, reshape_sectors, writepos); | 4185 | writepos -= min_t(sector_t, reshape_sectors, writepos); |
| 4152 | readpos += reshape_sectors; | 4186 | readpos += reshape_sectors; |
| 4153 | safepos += reshape_sectors; | 4187 | safepos += reshape_sectors; |
| @@ -4157,11 +4191,29 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
| 4157 | safepos -= min_t(sector_t, reshape_sectors, safepos); | 4191 | safepos -= min_t(sector_t, reshape_sectors, safepos); |
| 4158 | } | 4192 | } |
| 4159 | 4193 | ||
| 4194 | /* Having calculated the 'writepos' possibly use it | ||
| 4195 | * to set 'stripe_addr' which is where we will write to. | ||
| 4196 | */ | ||
| 4197 | if (mddev->reshape_backwards) { | ||
| 4198 | BUG_ON(conf->reshape_progress == 0); | ||
| 4199 | stripe_addr = writepos; | ||
| 4200 | BUG_ON((mddev->dev_sectors & | ||
| 4201 | ~((sector_t)reshape_sectors - 1)) | ||
| 4202 | - reshape_sectors - stripe_addr | ||
| 4203 | != sector_nr); | ||
| 4204 | } else { | ||
| 4205 | BUG_ON(writepos != sector_nr + reshape_sectors); | ||
| 4206 | stripe_addr = sector_nr; | ||
| 4207 | } | ||
| 4208 | |||
| 4160 | /* 'writepos' is the most advanced device address we might write. | 4209 | /* 'writepos' is the most advanced device address we might write. |
| 4161 | * 'readpos' is the least advanced device address we might read. | 4210 | * 'readpos' is the least advanced device address we might read. |
| 4162 | * 'safepos' is the least address recorded in the metadata as having | 4211 | * 'safepos' is the least address recorded in the metadata as having |
| 4163 | * been reshaped. | 4212 | * been reshaped. |
| 4164 | * If 'readpos' is behind 'writepos', then there is no way that we can | 4213 | * If there is a min_offset_diff, these are adjusted either by |
| 4214 | * increasing the safepos/readpos if diff is negative, or | ||
| 4215 | * increasing writepos if diff is positive. | ||
| 4216 | * If 'readpos' is then behind 'writepos', there is no way that we can | ||
| 4165 | * ensure safety in the face of a crash - that must be done by userspace | 4217 | * ensure safety in the face of a crash - that must be done by userspace |
| 4166 | * making a backup of the data. So in that case there is no particular | 4218 | * making a backup of the data. So in that case there is no particular |
| 4167 | * rush to update metadata. | 4219 | * rush to update metadata. |
| @@ -4174,7 +4226,13 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
| 4174 | * Maybe that number should be configurable, but I'm not sure it is | 4226 | * Maybe that number should be configurable, but I'm not sure it is |
| 4175 | * worth it.... maybe it could be a multiple of safemode_delay??? | 4227 | * worth it.... maybe it could be a multiple of safemode_delay??? |
| 4176 | */ | 4228 | */ |
| 4177 | if ((mddev->delta_disks < 0 | 4229 | if (conf->min_offset_diff < 0) { |
| 4230 | safepos += -conf->min_offset_diff; | ||
| 4231 | readpos += -conf->min_offset_diff; | ||
| 4232 | } else | ||
| 4233 | writepos += conf->min_offset_diff; | ||
| 4234 | |||
| 4235 | if ((mddev->reshape_backwards | ||
| 4178 | ? (safepos > writepos && readpos < writepos) | 4236 | ? (safepos > writepos && readpos < writepos) |
| 4179 | : (safepos < writepos && readpos > writepos)) || | 4237 | : (safepos < writepos && readpos > writepos)) || |
| 4180 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { | 4238 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { |
| @@ -4195,17 +4253,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
| 4195 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 4253 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
| 4196 | } | 4254 | } |
| 4197 | 4255 | ||
| 4198 | if (mddev->delta_disks < 0) { | ||
| 4199 | BUG_ON(conf->reshape_progress == 0); | ||
| 4200 | stripe_addr = writepos; | ||
| 4201 | BUG_ON((mddev->dev_sectors & | ||
| 4202 | ~((sector_t)reshape_sectors - 1)) | ||
| 4203 | - reshape_sectors - stripe_addr | ||
| 4204 | != sector_nr); | ||
| 4205 | } else { | ||
| 4206 | BUG_ON(writepos != sector_nr + reshape_sectors); | ||
| 4207 | stripe_addr = sector_nr; | ||
| 4208 | } | ||
| 4209 | INIT_LIST_HEAD(&stripes); | 4256 | INIT_LIST_HEAD(&stripes); |
| 4210 | for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { | 4257 | for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { |
| 4211 | int j; | 4258 | int j; |
| @@ -4239,7 +4286,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
| 4239 | list_add(&sh->lru, &stripes); | 4286 | list_add(&sh->lru, &stripes); |
| 4240 | } | 4287 | } |
| 4241 | spin_lock_irq(&conf->device_lock); | 4288 | spin_lock_irq(&conf->device_lock); |
| 4242 | if (mddev->delta_disks < 0) | 4289 | if (mddev->reshape_backwards) |
| 4243 | conf->reshape_progress -= reshape_sectors * new_data_disks; | 4290 | conf->reshape_progress -= reshape_sectors * new_data_disks; |
| 4244 | else | 4291 | else |
| 4245 | conf->reshape_progress += reshape_sectors * new_data_disks; | 4292 | conf->reshape_progress += reshape_sectors * new_data_disks; |
| @@ -4952,16 +4999,42 @@ static int run(struct mddev *mddev) | |||
| 4952 | struct md_rdev *rdev; | 4999 | struct md_rdev *rdev; |
| 4953 | sector_t reshape_offset = 0; | 5000 | sector_t reshape_offset = 0; |
| 4954 | int i; | 5001 | int i; |
| 5002 | long long min_offset_diff = 0; | ||
| 5003 | int first = 1; | ||
| 4955 | 5004 | ||
| 4956 | if (mddev->recovery_cp != MaxSector) | 5005 | if (mddev->recovery_cp != MaxSector) |
| 4957 | printk(KERN_NOTICE "md/raid:%s: not clean" | 5006 | printk(KERN_NOTICE "md/raid:%s: not clean" |
| 4958 | " -- starting background reconstruction\n", | 5007 | " -- starting background reconstruction\n", |
| 4959 | mdname(mddev)); | 5008 | mdname(mddev)); |
| 5009 | |||
| 5010 | rdev_for_each(rdev, mddev) { | ||
| 5011 | long long diff; | ||
| 5012 | if (rdev->raid_disk < 0) | ||
| 5013 | continue; | ||
| 5014 | diff = (rdev->new_data_offset - rdev->data_offset); | ||
| 5015 | if (first) { | ||
| 5016 | min_offset_diff = diff; | ||
| 5017 | first = 0; | ||
| 5018 | } else if (mddev->reshape_backwards && | ||
| 5019 | diff < min_offset_diff) | ||
| 5020 | min_offset_diff = diff; | ||
| 5021 | else if (!mddev->reshape_backwards && | ||
| 5022 | diff > min_offset_diff) | ||
| 5023 | min_offset_diff = diff; | ||
| 5024 | } | ||
| 5025 | |||
| 4960 | if (mddev->reshape_position != MaxSector) { | 5026 | if (mddev->reshape_position != MaxSector) { |
| 4961 | /* Check that we can continue the reshape. | 5027 | /* Check that we can continue the reshape. |
| 4962 | * Currently only disks can change, it must | 5028 | * Difficulties arise if the stripe we would write to |
| 4963 | * increase, and we must be past the point where | 5029 | * next is at or after the stripe we would read from next. |
| 4964 | * a stripe over-writes itself | 5030 | * For a reshape that changes the number of devices, this |
| 5031 | * is only possible for a very short time, and mdadm makes | ||
| 5032 | * sure that time appears to have passed before assembling | ||
| 5033 | * the array. So we fail if that time hasn't passed. | ||
| 5034 | * For a reshape that keeps the number of devices the same | ||
| 5035 | * mdadm must be monitoring the reshape and keeping the | ||
| 5036 | * critical areas read-only and backed up. It will start | ||
| 5037 | * the array in read-only mode, so we check for that. | ||
| 4965 | */ | 5038 | */ |
| 4966 | sector_t here_new, here_old; | 5039 | sector_t here_new, here_old; |
| 4967 | int old_disks; | 5040 | int old_disks; |
| @@ -4993,26 +5066,34 @@ static int run(struct mddev *mddev) | |||
| 4993 | /* here_old is the first stripe that we might need to read | 5066 | /* here_old is the first stripe that we might need to read |
| 4994 | * from */ | 5067 | * from */ |
| 4995 | if (mddev->delta_disks == 0) { | 5068 | if (mddev->delta_disks == 0) { |
| 5069 | if ((here_new * mddev->new_chunk_sectors != | ||
| 5070 | here_old * mddev->chunk_sectors)) { | ||
| 5071 | printk(KERN_ERR "md/raid:%s: reshape position is" | ||
| 5072 | " confused - aborting\n", mdname(mddev)); | ||
| 5073 | return -EINVAL; | ||
| 5074 | } | ||
| 4996 | /* We cannot be sure it is safe to start an in-place | 5075 | /* We cannot be sure it is safe to start an in-place |
| 4997 | * reshape. It is only safe if user-space if monitoring | 5076 | * reshape. It is only safe if user-space is monitoring |
| 4998 | * and taking constant backups. | 5077 | * and taking constant backups. |
| 4999 | * mdadm always starts a situation like this in | 5078 | * mdadm always starts a situation like this in |
| 5000 | * readonly mode so it can take control before | 5079 | * readonly mode so it can take control before |
| 5001 | * allowing any writes. So just check for that. | 5080 | * allowing any writes. So just check for that. |
| 5002 | */ | 5081 | */ |
| 5003 | if ((here_new * mddev->new_chunk_sectors != | 5082 | if (abs(min_offset_diff) >= mddev->chunk_sectors && |
| 5004 | here_old * mddev->chunk_sectors) || | 5083 | abs(min_offset_diff) >= mddev->new_chunk_sectors) |
| 5005 | mddev->ro == 0) { | 5084 | /* not really in-place - so OK */; |
| 5006 | printk(KERN_ERR "md/raid:%s: in-place reshape must be started" | 5085 | else if (mddev->ro == 0) { |
| 5007 | " in read-only mode - aborting\n", | 5086 | printk(KERN_ERR "md/raid:%s: in-place reshape " |
| 5087 | "must be started in read-only mode " | ||
| 5088 | "- aborting\n", | ||
| 5008 | mdname(mddev)); | 5089 | mdname(mddev)); |
| 5009 | return -EINVAL; | 5090 | return -EINVAL; |
| 5010 | } | 5091 | } |
| 5011 | } else if (mddev->delta_disks < 0 | 5092 | } else if (mddev->reshape_backwards |
| 5012 | ? (here_new * mddev->new_chunk_sectors <= | 5093 | ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= |
| 5013 | here_old * mddev->chunk_sectors) | 5094 | here_old * mddev->chunk_sectors) |
| 5014 | : (here_new * mddev->new_chunk_sectors >= | 5095 | : (here_new * mddev->new_chunk_sectors >= |
| 5015 | here_old * mddev->chunk_sectors)) { | 5096 | here_old * mddev->chunk_sectors + (-min_offset_diff))) { |
| 5016 | /* Reading from the same stripe as writing to - bad */ | 5097 | /* Reading from the same stripe as writing to - bad */ |
| 5017 | printk(KERN_ERR "md/raid:%s: reshape_position too early for " | 5098 | printk(KERN_ERR "md/raid:%s: reshape_position too early for " |
| 5018 | "auto-recovery - aborting.\n", | 5099 | "auto-recovery - aborting.\n", |
| @@ -5037,6 +5118,7 @@ static int run(struct mddev *mddev) | |||
| 5037 | if (IS_ERR(conf)) | 5118 | if (IS_ERR(conf)) |
| 5038 | return PTR_ERR(conf); | 5119 | return PTR_ERR(conf); |
| 5039 | 5120 | ||
| 5121 | conf->min_offset_diff = min_offset_diff; | ||
| 5040 | mddev->thread = conf->thread; | 5122 | mddev->thread = conf->thread; |
| 5041 | conf->thread = NULL; | 5123 | conf->thread = NULL; |
| 5042 | mddev->private = conf; | 5124 | mddev->private = conf; |
| @@ -5182,9 +5264,12 @@ static int run(struct mddev *mddev) | |||
| 5182 | blk_queue_io_opt(mddev->queue, chunk_size * | 5264 | blk_queue_io_opt(mddev->queue, chunk_size * |
| 5183 | (conf->raid_disks - conf->max_degraded)); | 5265 | (conf->raid_disks - conf->max_degraded)); |
| 5184 | 5266 | ||
| 5185 | rdev_for_each(rdev, mddev) | 5267 | rdev_for_each(rdev, mddev) { |
| 5186 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 5268 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
| 5187 | rdev->data_offset << 9); | 5269 | rdev->data_offset << 9); |
| 5270 | disk_stack_limits(mddev->gendisk, rdev->bdev, | ||
| 5271 | rdev->new_data_offset << 9); | ||
| 5272 | } | ||
| 5188 | } | 5273 | } |
| 5189 | 5274 | ||
| 5190 | return 0; | 5275 | return 0; |
| @@ -5418,12 +5503,18 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) | |||
| 5418 | * any io in the removed space completes, but it hardly seems | 5503 | * any io in the removed space completes, but it hardly seems |
| 5419 | * worth it. | 5504 | * worth it. |
| 5420 | */ | 5505 | */ |
| 5506 | sector_t newsize; | ||
| 5421 | sectors &= ~((sector_t)mddev->chunk_sectors - 1); | 5507 | sectors &= ~((sector_t)mddev->chunk_sectors - 1); |
| 5422 | md_set_array_sectors(mddev, raid5_size(mddev, sectors, | 5508 | newsize = raid5_size(mddev, sectors, mddev->raid_disks); |
| 5423 | mddev->raid_disks)); | 5509 | if (mddev->external_size && |
| 5424 | if (mddev->array_sectors > | 5510 | mddev->array_sectors > newsize) |
| 5425 | raid5_size(mddev, sectors, mddev->raid_disks)) | ||
| 5426 | return -EINVAL; | 5511 | return -EINVAL; |
| 5512 | if (mddev->bitmap) { | ||
| 5513 | int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); | ||
| 5514 | if (ret) | ||
| 5515 | return ret; | ||
| 5516 | } | ||
| 5517 | md_set_array_sectors(mddev, newsize); | ||
| 5427 | set_capacity(mddev->gendisk, mddev->array_sectors); | 5518 | set_capacity(mddev->gendisk, mddev->array_sectors); |
| 5428 | revalidate_disk(mddev->gendisk); | 5519 | revalidate_disk(mddev->gendisk); |
| 5429 | if (sectors > mddev->dev_sectors && | 5520 | if (sectors > mddev->dev_sectors && |
| @@ -5468,9 +5559,6 @@ static int check_reshape(struct mddev *mddev) | |||
| 5468 | mddev->new_layout == mddev->layout && | 5559 | mddev->new_layout == mddev->layout && |
| 5469 | mddev->new_chunk_sectors == mddev->chunk_sectors) | 5560 | mddev->new_chunk_sectors == mddev->chunk_sectors) |
| 5470 | return 0; /* nothing to do */ | 5561 | return 0; /* nothing to do */ |
| 5471 | if (mddev->bitmap) | ||
| 5472 | /* Cannot grow a bitmap yet */ | ||
| 5473 | return -EBUSY; | ||
| 5474 | if (has_failed(conf)) | 5562 | if (has_failed(conf)) |
| 5475 | return -EINVAL; | 5563 | return -EINVAL; |
| 5476 | if (mddev->delta_disks < 0) { | 5564 | if (mddev->delta_disks < 0) { |
| @@ -5505,10 +5593,14 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
| 5505 | if (!check_stripe_cache(mddev)) | 5593 | if (!check_stripe_cache(mddev)) |
| 5506 | return -ENOSPC; | 5594 | return -ENOSPC; |
| 5507 | 5595 | ||
| 5508 | rdev_for_each(rdev, mddev) | 5596 | if (has_failed(conf)) |
| 5597 | return -EINVAL; | ||
| 5598 | |||
| 5599 | rdev_for_each(rdev, mddev) { | ||
| 5509 | if (!test_bit(In_sync, &rdev->flags) | 5600 | if (!test_bit(In_sync, &rdev->flags) |
| 5510 | && !test_bit(Faulty, &rdev->flags)) | 5601 | && !test_bit(Faulty, &rdev->flags)) |
| 5511 | spares++; | 5602 | spares++; |
| 5603 | } | ||
| 5512 | 5604 | ||
| 5513 | if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) | 5605 | if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) |
| 5514 | /* Not enough devices even to make a degraded array | 5606 | /* Not enough devices even to make a degraded array |
| @@ -5535,12 +5627,16 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
| 5535 | conf->chunk_sectors = mddev->new_chunk_sectors; | 5627 | conf->chunk_sectors = mddev->new_chunk_sectors; |
| 5536 | conf->prev_algo = conf->algorithm; | 5628 | conf->prev_algo = conf->algorithm; |
| 5537 | conf->algorithm = mddev->new_layout; | 5629 | conf->algorithm = mddev->new_layout; |
| 5538 | if (mddev->delta_disks < 0) | 5630 | conf->generation++; |
| 5631 | /* Code that selects data_offset needs to see the generation update | ||
| 5632 | * if reshape_progress has been set - so a memory barrier is needed. | ||
| 5633 | */ | ||
| 5634 | smp_mb(); | ||
| 5635 | if (mddev->reshape_backwards) | ||
| 5539 | conf->reshape_progress = raid5_size(mddev, 0, 0); | 5636 | conf->reshape_progress = raid5_size(mddev, 0, 0); |
| 5540 | else | 5637 | else |
| 5541 | conf->reshape_progress = 0; | 5638 | conf->reshape_progress = 0; |
| 5542 | conf->reshape_safe = conf->reshape_progress; | 5639 | conf->reshape_safe = conf->reshape_progress; |
| 5543 | conf->generation++; | ||
| 5544 | spin_unlock_irq(&conf->device_lock); | 5640 | spin_unlock_irq(&conf->device_lock); |
| 5545 | 5641 | ||
| 5546 | /* Add some new drives, as many as will fit. | 5642 | /* Add some new drives, as many as will fit. |
| @@ -5592,6 +5688,9 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
| 5592 | mddev->recovery = 0; | 5688 | mddev->recovery = 0; |
| 5593 | spin_lock_irq(&conf->device_lock); | 5689 | spin_lock_irq(&conf->device_lock); |
| 5594 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; | 5690 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; |
| 5691 | rdev_for_each(rdev, mddev) | ||
| 5692 | rdev->new_data_offset = rdev->data_offset; | ||
| 5693 | smp_wmb(); | ||
| 5595 | conf->reshape_progress = MaxSector; | 5694 | conf->reshape_progress = MaxSector; |
| 5596 | mddev->reshape_position = MaxSector; | 5695 | mddev->reshape_position = MaxSector; |
| 5597 | spin_unlock_irq(&conf->device_lock); | 5696 | spin_unlock_irq(&conf->device_lock); |
| @@ -5610,9 +5709,13 @@ static void end_reshape(struct r5conf *conf) | |||
| 5610 | { | 5709 | { |
| 5611 | 5710 | ||
| 5612 | if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { | 5711 | if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { |
| 5712 | struct md_rdev *rdev; | ||
| 5613 | 5713 | ||
| 5614 | spin_lock_irq(&conf->device_lock); | 5714 | spin_lock_irq(&conf->device_lock); |
| 5615 | conf->previous_raid_disks = conf->raid_disks; | 5715 | conf->previous_raid_disks = conf->raid_disks; |
| 5716 | rdev_for_each(rdev, conf->mddev) | ||
| 5717 | rdev->data_offset = rdev->new_data_offset; | ||
| 5718 | smp_wmb(); | ||
| 5616 | conf->reshape_progress = MaxSector; | 5719 | conf->reshape_progress = MaxSector; |
| 5617 | spin_unlock_irq(&conf->device_lock); | 5720 | spin_unlock_irq(&conf->device_lock); |
| 5618 | wake_up(&conf->wait_for_overlap); | 5721 | wake_up(&conf->wait_for_overlap); |
| @@ -5652,17 +5755,18 @@ static void raid5_finish_reshape(struct mddev *mddev) | |||
| 5652 | d < conf->raid_disks - mddev->delta_disks; | 5755 | d < conf->raid_disks - mddev->delta_disks; |
| 5653 | d++) { | 5756 | d++) { |
| 5654 | struct md_rdev *rdev = conf->disks[d].rdev; | 5757 | struct md_rdev *rdev = conf->disks[d].rdev; |
| 5655 | if (rdev && | 5758 | if (rdev) |
| 5656 | raid5_remove_disk(mddev, rdev) == 0) { | 5759 | clear_bit(In_sync, &rdev->flags); |
| 5657 | sysfs_unlink_rdev(mddev, rdev); | 5760 | rdev = conf->disks[d].replacement; |
| 5658 | rdev->raid_disk = -1; | 5761 | if (rdev) |
| 5659 | } | 5762 | clear_bit(In_sync, &rdev->flags); |
| 5660 | } | 5763 | } |
| 5661 | } | 5764 | } |
| 5662 | mddev->layout = conf->algorithm; | 5765 | mddev->layout = conf->algorithm; |
| 5663 | mddev->chunk_sectors = conf->chunk_sectors; | 5766 | mddev->chunk_sectors = conf->chunk_sectors; |
| 5664 | mddev->reshape_position = MaxSector; | 5767 | mddev->reshape_position = MaxSector; |
| 5665 | mddev->delta_disks = 0; | 5768 | mddev->delta_disks = 0; |
| 5769 | mddev->reshape_backwards = 0; | ||
| 5666 | } | 5770 | } |
| 5667 | } | 5771 | } |
| 5668 | 5772 | ||
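
The raid5.c hunks above fold a per-device data_offset shift into the reshape bookkeeping: run() condenses the gaps between new_data_offset and data_offset into conf->min_offset_diff, and reshape_request() skews safepos/readpos (negative diff) or writepos (positive diff) by that amount before testing whether a metadata checkpoint must happen. Below is a minimal stand-alone model of that test, not kernel code; the function name and the sample numbers are made up, and the periodic 10-second checkpoint is left out.

#include <stdio.h>

/* Stand-alone model of the checkpoint test in reshape_request().
 * All positions are device addresses in sectors.  A negative
 * min_offset_diff enlarges safepos/readpos, a positive one enlarges
 * writepos, exactly as in the hunk above; the return value mirrors
 * the kernel's "must update metadata and wait" condition for the
 * forward and backward cases.
 */
static int needs_checkpoint(long long writepos, long long readpos,
			    long long safepos, long long min_offset_diff,
			    int reshape_backwards)
{
	if (min_offset_diff < 0) {
		safepos += -min_offset_diff;
		readpos += -min_offset_diff;
	} else {
		writepos += min_offset_diff;
	}

	return reshape_backwards ? (safepos > writepos && readpos < writepos)
				 : (safepos < writepos && readpos > writepos);
}

int main(void)
{
	/* forward reshape: writes are past the last recorded safe
	 * position, so metadata must be updated before continuing */
	printf("%d\n", needs_checkpoint(2048, 4096, 1024, 0, 0));

	/* same positions, but a 1024-sector data_offset shift supplies
	 * enough slack that no checkpoint is required yet */
	printf("%d\n", needs_checkpoint(2048, 4096, 1024, -1024, 0));
	return 0;
}
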
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 8d8e13934a48..2164021f3b5f 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
| @@ -285,6 +285,7 @@ enum r5dev_flags { | |||
| 285 | */ | 285 | */ |
| 286 | R5_Wantdrain, /* dev->towrite needs to be drained */ | 286 | R5_Wantdrain, /* dev->towrite needs to be drained */ |
| 287 | R5_WantFUA, /* Write should be FUA */ | 287 | R5_WantFUA, /* Write should be FUA */ |
| 288 | R5_SyncIO, /* The IO is sync */ | ||
| 288 | R5_WriteError, /* got a write error - need to record it */ | 289 | R5_WriteError, /* got a write error - need to record it */ |
| 289 | R5_MadeGood, /* A bad block has been fixed by writing to it */ | 290 | R5_MadeGood, /* A bad block has been fixed by writing to it */ |
| 290 | R5_ReadRepl, /* Will/did read from replacement rather than orig */ | 291 | R5_ReadRepl, /* Will/did read from replacement rather than orig */ |
| @@ -385,6 +386,12 @@ struct r5conf { | |||
| 385 | short generation; /* increments with every reshape */ | 386 | short generation; /* increments with every reshape */ |
| 386 | unsigned long reshape_checkpoint; /* Time we last updated | 387 | unsigned long reshape_checkpoint; /* Time we last updated |
| 387 | * metadata */ | 388 | * metadata */ |
| 389 | long long min_offset_diff; /* minimum difference between | ||
| 390 | * data_offset and | ||
| 391 | * new_data_offset across all | ||
| 392 | * devices. May be negative, | ||
| 393 | * but is closest to zero. | ||
| 394 | */ | ||
| 388 | 395 | ||
| 389 | struct list_head handle_list; /* stripes needing handling */ | 396 | struct list_head handle_list; /* stripes needing handling */ |
| 390 | struct list_head hold_list; /* preread ready stripes */ | 397 | struct list_head hold_list; /* preread ready stripes */ |
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 8c0a3adc5df5..ee753536ab70 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h | |||
| @@ -233,7 +233,10 @@ struct mdp_superblock_1 { | |||
| 233 | __le32 delta_disks; /* change in number of raid_disks */ | 233 | __le32 delta_disks; /* change in number of raid_disks */ |
| 234 | __le32 new_layout; /* new layout */ | 234 | __le32 new_layout; /* new layout */ |
| 235 | __le32 new_chunk; /* new chunk size (512byte sectors) */ | 235 | __le32 new_chunk; /* new chunk size (512byte sectors) */ |
| 236 | __u8 pad1[128-124]; /* set to 0 when written */ | 236 | __le32 new_offset; /* signed number to add to data_offset in new |
| 237 | * layout. 0 == no-change. This can be | ||
| 238 | * different on each device in the array. | ||
| 239 | */ | ||
| 237 | 240 | ||
| 238 | /* constant this-device information - 64 bytes */ | 241 | /* constant this-device information - 64 bytes */ |
| 239 | __le64 data_offset; /* sector start of data, often 0 */ | 242 | __le64 data_offset; /* sector start of data, often 0 */ |
| @@ -281,10 +284,18 @@ struct mdp_superblock_1 { | |||
| 281 | * active device with same 'role'. | 284 | * active device with same 'role'. |
| 282 | * 'recovery_offset' is also set. | 285 | * 'recovery_offset' is also set. |
| 283 | */ | 286 | */ |
| 287 | #define MD_FEATURE_RESHAPE_BACKWARDS 32 /* Reshape doesn't change number | ||
| 288 | * of devices, but is going | ||
| 289 | * backwards anyway. | ||
| 290 | */ | ||
| 291 | #define MD_FEATURE_NEW_OFFSET 64 /* new_offset must be honoured */ | ||
| 284 | #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ | 292 | #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |
| 285 | |MD_FEATURE_RECOVERY_OFFSET \ | 293 | |MD_FEATURE_RECOVERY_OFFSET \ |
| 286 | |MD_FEATURE_RESHAPE_ACTIVE \ | 294 | |MD_FEATURE_RESHAPE_ACTIVE \ |
| 287 | |MD_FEATURE_BAD_BLOCKS \ | 295 | |MD_FEATURE_BAD_BLOCKS \ |
| 288 | |MD_FEATURE_REPLACEMENT) | 296 | |MD_FEATURE_REPLACEMENT \ |
| 297 | |MD_FEATURE_RESHAPE_BACKWARDS \ | ||
| 298 | |MD_FEATURE_NEW_OFFSET \ | ||
| 299 | ) | ||
| 289 | 300 | ||
| 290 | #endif | 301 | #endif |
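
The new superblock fields above are what let a reshape move each device's data area: new_offset is a signed per-device delta that is added to data_offset once the reshape has passed, and the two feature bits tell older code to keep its hands off. A userspace-style sketch of how a tool might honour them follows; the function names and the FEATURES_I_UNDERSTAND mask are illustrative, only the two new bit values are taken from the header.

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch, not mdadm or kernel code: refuse any superblock
 * whose feature_map carries bits we do not know, and compute the
 * per-device data offset that applies after the reshape.
 */
#define MD_FEATURE_RESHAPE_BACKWARDS	32
#define MD_FEATURE_NEW_OFFSET		64
#define FEATURES_I_UNDERSTAND		(MD_FEATURE_RESHAPE_BACKWARDS | \
					 MD_FEATURE_NEW_OFFSET /* | ... */)

static int features_ok(uint32_t feature_map)
{
	return (feature_map & ~(uint32_t)FEATURES_I_UNDERSTAND) == 0;
}

static uint64_t effective_new_data_offset(uint64_t data_offset,
					  int32_t new_offset)
{
	/* new_offset is signed and may differ per device; 0 == no change */
	return data_offset + (int64_t)new_offset;
}

int main(void)
{
	printf("%d\n", features_ok(MD_FEATURE_NEW_OFFSET));
	printf("%d\n", features_ok(1u << 12));	/* unknown bit: reject */
	printf("%llu\n", (unsigned long long)
	       effective_new_data_offset(2048, -1024));
	return 0;
}
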
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 53272e9860a7..640c69ceec96 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h | |||
| @@ -99,8 +99,20 @@ extern const struct raid6_calls raid6_altivec2; | |||
| 99 | extern const struct raid6_calls raid6_altivec4; | 99 | extern const struct raid6_calls raid6_altivec4; |
| 100 | extern const struct raid6_calls raid6_altivec8; | 100 | extern const struct raid6_calls raid6_altivec8; |
| 101 | 101 | ||
| 102 | struct raid6_recov_calls { | ||
| 103 | void (*data2)(int, size_t, int, int, void **); | ||
| 104 | void (*datap)(int, size_t, int, void **); | ||
| 105 | int (*valid)(void); | ||
| 106 | const char *name; | ||
| 107 | int priority; | ||
| 108 | }; | ||
| 109 | |||
| 110 | extern const struct raid6_recov_calls raid6_recov_intx1; | ||
| 111 | extern const struct raid6_recov_calls raid6_recov_ssse3; | ||
| 112 | |||
| 102 | /* Algorithm list */ | 113 | /* Algorithm list */ |
| 103 | extern const struct raid6_calls * const raid6_algos[]; | 114 | extern const struct raid6_calls * const raid6_algos[]; |
| 115 | extern const struct raid6_recov_calls *const raid6_recov_algos[]; | ||
| 104 | int raid6_select_algo(void); | 116 | int raid6_select_algo(void); |
| 105 | 117 | ||
| 106 | /* Return values from chk_syndrome */ | 118 | /* Return values from chk_syndrome */ |
| @@ -111,14 +123,16 @@ int raid6_select_algo(void); | |||
| 111 | 123 | ||
| 112 | /* Galois field tables */ | 124 | /* Galois field tables */ |
| 113 | extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); | 125 | extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); |
| 126 | extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256))); | ||
| 114 | extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); | 127 | extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); |
| 115 | extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); | 128 | extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); |
| 116 | extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); | 129 | extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); |
| 117 | 130 | ||
| 118 | /* Recovery routines */ | 131 | /* Recovery routines */ |
| 119 | void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, | 132 | extern void (*raid6_2data_recov)(int disks, size_t bytes, int faila, int failb, |
| 120 | void **ptrs); | 133 | void **ptrs); |
| 121 | void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); | 134 | extern void (*raid6_datap_recov)(int disks, size_t bytes, int faila, |
| 135 | void **ptrs); | ||
| 122 | void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, | 136 | void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, |
| 123 | void **ptrs); | 137 | void **ptrs); |
| 124 | 138 | ||
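
With raid6_2data_recov and raid6_datap_recov turned into function pointers, recovery implementations plug in through the new raid6_recov_calls table rather than by providing the symbols directly. The sketch below shows the shape of a backend entry; the struct is copied from the header above, while everything named "example" is a placeholder rather than real kernel code.

#include <stdio.h>
#include <stddef.h>

/* What a recovery backend has to supply to appear in raid6_recov_algos[]:
 * two recovery entry points, an optional validity check, a name and a
 * priority. */
struct raid6_recov_calls {
	void (*data2)(int, size_t, int, int, void **);
	void (*datap)(int, size_t, int, void **);
	int (*valid)(void);
	const char *name;
	int priority;
};

static void example_2data(int disks, size_t bytes, int faila, int failb,
			  void **ptrs)
{
	/* recover two lost data blocks from P and Q */
	(void)disks; (void)bytes; (void)faila; (void)failb; (void)ptrs;
}

static void example_datap(int disks, size_t bytes, int faila, void **ptrs)
{
	/* recover one lost data block plus P from Q */
	(void)disks; (void)bytes; (void)faila; (void)ptrs;
}

static int example_valid(void)
{
	return 1;	/* a real backend would test CPU features here */
}

static const struct raid6_recov_calls raid6_recov_example = {
	.data2	  = example_2data,
	.datap	  = example_datap,
	.valid	  = example_valid,
	.name	  = "example",
	.priority = 0,	/* intx1 uses 0, the SSSE3 backend uses 1 */
};

int main(void)
{
	printf("%s usable: %d\n", raid6_recov_example.name,
	       raid6_recov_example.valid ? raid6_recov_example.valid() : 1);
	return 0;
}
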
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 8a38102770f3..de06dfe165b8 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | obj-$(CONFIG_RAID6_PQ) += raid6_pq.o | 1 | obj-$(CONFIG_RAID6_PQ) += raid6_pq.o |
| 2 | 2 | ||
| 3 | raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ | 3 | raid6_pq-y += algos.o recov.o recov_ssse3.o tables.o int1.o int2.o int4.o \ |
| 4 | int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ | 4 | int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ |
| 5 | altivec8.o mmx.o sse1.o sse2.o | 5 | altivec8.o mmx.o sse1.o sse2.o |
| 6 | hostprogs-y += mktables | 6 | hostprogs-y += mktables |
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 8b02f60ffc86..589f5f50ad2e 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c | |||
| @@ -17,11 +17,11 @@ | |||
| 17 | */ | 17 | */ |
| 18 | 18 | ||
| 19 | #include <linux/raid/pq.h> | 19 | #include <linux/raid/pq.h> |
| 20 | #include <linux/module.h> | ||
| 21 | #ifndef __KERNEL__ | 20 | #ifndef __KERNEL__ |
| 22 | #include <sys/mman.h> | 21 | #include <sys/mman.h> |
| 23 | #include <stdio.h> | 22 | #include <stdio.h> |
| 24 | #else | 23 | #else |
| 24 | #include <linux/module.h> | ||
| 25 | #include <linux/gfp.h> | 25 | #include <linux/gfp.h> |
| 26 | #if !RAID6_USE_EMPTY_ZERO_PAGE | 26 | #if !RAID6_USE_EMPTY_ZERO_PAGE |
| 27 | /* In .bss so it's zeroed */ | 27 | /* In .bss so it's zeroed */ |
| @@ -34,10 +34,6 @@ struct raid6_calls raid6_call; | |||
| 34 | EXPORT_SYMBOL_GPL(raid6_call); | 34 | EXPORT_SYMBOL_GPL(raid6_call); |
| 35 | 35 | ||
| 36 | const struct raid6_calls * const raid6_algos[] = { | 36 | const struct raid6_calls * const raid6_algos[] = { |
| 37 | &raid6_intx1, | ||
| 38 | &raid6_intx2, | ||
| 39 | &raid6_intx4, | ||
| 40 | &raid6_intx8, | ||
| 41 | #if defined(__ia64__) | 37 | #if defined(__ia64__) |
| 42 | &raid6_intx16, | 38 | &raid6_intx16, |
| 43 | &raid6_intx32, | 39 | &raid6_intx32, |
| @@ -61,6 +57,24 @@ const struct raid6_calls * const raid6_algos[] = { | |||
| 61 | &raid6_altivec4, | 57 | &raid6_altivec4, |
| 62 | &raid6_altivec8, | 58 | &raid6_altivec8, |
| 63 | #endif | 59 | #endif |
| 60 | &raid6_intx1, | ||
| 61 | &raid6_intx2, | ||
| 62 | &raid6_intx4, | ||
| 63 | &raid6_intx8, | ||
| 64 | NULL | ||
| 65 | }; | ||
| 66 | |||
| 67 | void (*raid6_2data_recov)(int, size_t, int, int, void **); | ||
| 68 | EXPORT_SYMBOL_GPL(raid6_2data_recov); | ||
| 69 | |||
| 70 | void (*raid6_datap_recov)(int, size_t, int, void **); | ||
| 71 | EXPORT_SYMBOL_GPL(raid6_datap_recov); | ||
| 72 | |||
| 73 | const struct raid6_recov_calls *const raid6_recov_algos[] = { | ||
| 74 | #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) | ||
| 75 | &raid6_recov_ssse3, | ||
| 76 | #endif | ||
| 77 | &raid6_recov_intx1, | ||
| 64 | NULL | 78 | NULL |
| 65 | }; | 79 | }; |
| 66 | 80 | ||
| @@ -72,59 +86,55 @@ const struct raid6_calls * const raid6_algos[] = { | |||
| 72 | #define time_before(x, y) ((x) < (y)) | 86 | #define time_before(x, y) ((x) < (y)) |
| 73 | #endif | 87 | #endif |
| 74 | 88 | ||
| 75 | /* Try to pick the best algorithm */ | 89 | static inline const struct raid6_recov_calls *raid6_choose_recov(void) |
| 76 | /* This code uses the gfmul table as convenient data set to abuse */ | ||
| 77 | |||
| 78 | int __init raid6_select_algo(void) | ||
| 79 | { | 90 | { |
| 80 | const struct raid6_calls * const * algo; | 91 | const struct raid6_recov_calls *const *algo; |
| 81 | const struct raid6_calls * best; | 92 | const struct raid6_recov_calls *best; |
| 82 | char *syndromes; | ||
| 83 | void *dptrs[(65536/PAGE_SIZE)+2]; | ||
| 84 | int i, disks; | ||
| 85 | unsigned long perf, bestperf; | ||
| 86 | int bestprefer; | ||
| 87 | unsigned long j0, j1; | ||
| 88 | 93 | ||
| 89 | disks = (65536/PAGE_SIZE)+2; | 94 | for (best = NULL, algo = raid6_recov_algos; *algo; algo++) |
| 90 | for ( i = 0 ; i < disks-2 ; i++ ) { | 95 | if (!best || (*algo)->priority > best->priority) |
| 91 | dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; | 96 | if (!(*algo)->valid || (*algo)->valid()) |
| 92 | } | 97 | best = *algo; |
| 93 | 98 | ||
| 94 | /* Normal code - use a 2-page allocation to avoid D$ conflict */ | 99 | if (best) { |
| 95 | syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); | 100 | raid6_2data_recov = best->data2; |
| 101 | raid6_datap_recov = best->datap; | ||
| 96 | 102 | ||
| 97 | if ( !syndromes ) { | 103 | printk("raid6: using %s recovery algorithm\n", best->name); |
| 98 | printk("raid6: Yikes! No memory available.\n"); | 104 | } else |
| 99 | return -ENOMEM; | 105 | printk("raid6: Yikes! No recovery algorithm found!\n"); |
| 100 | } | ||
| 101 | 106 | ||
| 102 | dptrs[disks-2] = syndromes; | 107 | return best; |
| 103 | dptrs[disks-1] = syndromes + PAGE_SIZE; | 108 | } |
| 109 | |||
| 110 | static inline const struct raid6_calls *raid6_choose_gen( | ||
| 111 | void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks) | ||
| 112 | { | ||
| 113 | unsigned long perf, bestperf, j0, j1; | ||
| 114 | const struct raid6_calls *const *algo; | ||
| 115 | const struct raid6_calls *best; | ||
| 104 | 116 | ||
| 105 | bestperf = 0; bestprefer = 0; best = NULL; | 117 | for (bestperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { |
| 118 | if (!best || (*algo)->prefer >= best->prefer) { | ||
| 119 | if ((*algo)->valid && !(*algo)->valid()) | ||
| 120 | continue; | ||
| 106 | 121 | ||
| 107 | for ( algo = raid6_algos ; *algo ; algo++ ) { | ||
| 108 | if ( !(*algo)->valid || (*algo)->valid() ) { | ||
| 109 | perf = 0; | 122 | perf = 0; |
| 110 | 123 | ||
| 111 | preempt_disable(); | 124 | preempt_disable(); |
| 112 | j0 = jiffies; | 125 | j0 = jiffies; |
| 113 | while ( (j1 = jiffies) == j0 ) | 126 | while ((j1 = jiffies) == j0) |
| 114 | cpu_relax(); | 127 | cpu_relax(); |
| 115 | while (time_before(jiffies, | 128 | while (time_before(jiffies, |
| 116 | j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { | 129 | j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { |
| 117 | (*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs); | 130 | (*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs); |
| 118 | perf++; | 131 | perf++; |
| 119 | } | 132 | } |
| 120 | preempt_enable(); | 133 | preempt_enable(); |
| 121 | 134 | ||
| 122 | if ( (*algo)->prefer > bestprefer || | 135 | if (perf > bestperf) { |
| 123 | ((*algo)->prefer == bestprefer && | ||
| 124 | perf > bestperf) ) { | ||
| 125 | best = *algo; | ||
| 126 | bestprefer = best->prefer; | ||
| 127 | bestperf = perf; | 136 | bestperf = perf; |
| 137 | best = *algo; | ||
| 128 | } | 138 | } |
| 129 | printk("raid6: %-8s %5ld MB/s\n", (*algo)->name, | 139 | printk("raid6: %-8s %5ld MB/s\n", (*algo)->name, |
| 130 | (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); | 140 | (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); |
| @@ -139,9 +149,46 @@ int __init raid6_select_algo(void) | |||
| 139 | } else | 149 | } else |
| 140 | printk("raid6: Yikes! No algorithm found!\n"); | 150 | printk("raid6: Yikes! No algorithm found!\n"); |
| 141 | 151 | ||
| 152 | return best; | ||
| 153 | } | ||
| 154 | |||
| 155 | |||
| 156 | /* Try to pick the best algorithm */ | ||
| 157 | /* This code uses the gfmul table as convenient data set to abuse */ | ||
| 158 | |||
| 159 | int __init raid6_select_algo(void) | ||
| 160 | { | ||
| 161 | const int disks = (65536/PAGE_SIZE)+2; | ||
| 162 | |||
| 163 | const struct raid6_calls *gen_best; | ||
| 164 | const struct raid6_recov_calls *rec_best; | ||
| 165 | char *syndromes; | ||
| 166 | void *dptrs[(65536/PAGE_SIZE)+2]; | ||
| 167 | int i; | ||
| 168 | |||
| 169 | for (i = 0; i < disks-2; i++) | ||
| 170 | dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; | ||
| 171 | |||
| 172 | /* Normal code - use a 2-page allocation to avoid D$ conflict */ | ||
| 173 | syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); | ||
| 174 | |||
| 175 | if (!syndromes) { | ||
| 176 | printk("raid6: Yikes! No memory available.\n"); | ||
| 177 | return -ENOMEM; | ||
| 178 | } | ||
| 179 | |||
| 180 | dptrs[disks-2] = syndromes; | ||
| 181 | dptrs[disks-1] = syndromes + PAGE_SIZE; | ||
| 182 | |||
| 183 | /* select raid gen_syndrome function */ | ||
| 184 | gen_best = raid6_choose_gen(&dptrs, disks); | ||
| 185 | |||
| 186 | /* select raid recover functions */ | ||
| 187 | rec_best = raid6_choose_recov(); | ||
| 188 | |||
| 142 | free_pages((unsigned long)syndromes, 1); | 189 | free_pages((unsigned long)syndromes, 1); |
| 143 | 190 | ||
| 144 | return best ? 0 : -EINVAL; | 191 | return gen_best && rec_best ? 0 : -EINVAL; |
| 145 | } | 192 | } |
| 146 | 193 | ||
| 147 | static void raid6_exit(void) | 194 | static void raid6_exit(void) |
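
raid6_select_algo() now performs two independent selections: the benchmark loop in raid6_choose_gen() still picks the fastest gen_syndrome(), while raid6_choose_recov() simply takes the highest-priority recovery backend whose valid() check passes. Here is a stand-alone model of that second loop with simplified types; the backend entries and their valid() results are invented for the example.

#include <stdio.h>
#include <stddef.h>

/* Simplified model of raid6_choose_recov(): walk a NULL-terminated table,
 * skip entries whose valid() check fails, keep the highest priority. */
struct recov_calls {
	const char *name;
	int priority;
	int (*valid)(void);
};

static int never_ok(void) { return 0; }	/* e.g. the CPU lacks SSSE3 */

static const struct recov_calls ssse3 = { "ssse3", 1, never_ok };
static const struct recov_calls intx1 = { "intx1", 0, NULL };	/* always usable */
static const struct recov_calls *const table[] = { &ssse3, &intx1, NULL };

static const struct recov_calls *choose_recov(void)
{
	const struct recov_calls *const *algo, *best = NULL;

	for (algo = table; *algo; algo++)
		if (!best || (*algo)->priority > best->priority)
			if (!(*algo)->valid || (*algo)->valid())
				best = *algo;
	return best;
}

int main(void)
{
	const struct recov_calls *best = choose_recov();

	printf("picked %s\n", best ? best->name : "nothing");
	return 0;
}
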
diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c index 8a3780902cec..39787db588b0 100644 --- a/lib/raid6/mktables.c +++ b/lib/raid6/mktables.c | |||
| @@ -81,6 +81,31 @@ int main(int argc, char *argv[]) | |||
| 81 | printf("EXPORT_SYMBOL(raid6_gfmul);\n"); | 81 | printf("EXPORT_SYMBOL(raid6_gfmul);\n"); |
| 82 | printf("#endif\n"); | 82 | printf("#endif\n"); |
| 83 | 83 | ||
| 84 | /* Compute vector multiplication table */ | ||
| 85 | printf("\nconst u8 __attribute__((aligned(256)))\n" | ||
| 86 | "raid6_vgfmul[256][32] =\n" | ||
| 87 | "{\n"); | ||
| 88 | for (i = 0; i < 256; i++) { | ||
| 89 | printf("\t{\n"); | ||
| 90 | for (j = 0; j < 16; j += 8) { | ||
| 91 | printf("\t\t"); | ||
| 92 | for (k = 0; k < 8; k++) | ||
| 93 | printf("0x%02x,%c", gfmul(i, j + k), | ||
| 94 | (k == 7) ? '\n' : ' '); | ||
| 95 | } | ||
| 96 | for (j = 0; j < 16; j += 8) { | ||
| 97 | printf("\t\t"); | ||
| 98 | for (k = 0; k < 8; k++) | ||
| 99 | printf("0x%02x,%c", gfmul(i, (j + k) << 4), | ||
| 100 | (k == 7) ? '\n' : ' '); | ||
| 101 | } | ||
| 102 | printf("\t},\n"); | ||
| 103 | } | ||
| 104 | printf("};\n"); | ||
| 105 | printf("#ifdef __KERNEL__\n"); | ||
| 106 | printf("EXPORT_SYMBOL(raid6_vgfmul);\n"); | ||
| 107 | printf("#endif\n"); | ||
| 108 | |||
| 84 | /* Compute power-of-2 table (exponent) */ | 109 | /* Compute power-of-2 table (exponent) */ |
| 85 | v = 1; | 110 | v = 1; |
| 86 | printf("\nconst u8 __attribute__((aligned(256)))\n" | 111 | printf("\nconst u8 __attribute__((aligned(256)))\n" |
diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c index fe275d7b6b36..1805a5cc5daa 100644 --- a/lib/raid6/recov.c +++ b/lib/raid6/recov.c | |||
| @@ -22,7 +22,7 @@ | |||
| 22 | #include <linux/raid/pq.h> | 22 | #include <linux/raid/pq.h> |
| 23 | 23 | ||
| 24 | /* Recover two failed data blocks. */ | 24 | /* Recover two failed data blocks. */ |
| 25 | void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, | 25 | void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, int failb, |
| 26 | void **ptrs) | 26 | void **ptrs) |
| 27 | { | 27 | { |
| 28 | u8 *p, *q, *dp, *dq; | 28 | u8 *p, *q, *dp, *dq; |
| @@ -64,10 +64,9 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, | |||
| 64 | p++; q++; | 64 | p++; q++; |
| 65 | } | 65 | } |
| 66 | } | 66 | } |
| 67 | EXPORT_SYMBOL_GPL(raid6_2data_recov); | ||
| 68 | 67 | ||
| 69 | /* Recover failure of one data block plus the P block */ | 68 | /* Recover failure of one data block plus the P block */ |
| 70 | void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) | 69 | void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, void **ptrs) |
| 71 | { | 70 | { |
| 72 | u8 *p, *q, *dq; | 71 | u8 *p, *q, *dq; |
| 73 | const u8 *qmul; /* Q multiplier table */ | 72 | const u8 *qmul; /* Q multiplier table */ |
| @@ -96,7 +95,15 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) | |||
| 96 | q++; dq++; | 95 | q++; dq++; |
| 97 | } | 96 | } |
| 98 | } | 97 | } |
| 99 | EXPORT_SYMBOL_GPL(raid6_datap_recov); | 98 | |
| 99 | |||
| 100 | const struct raid6_recov_calls raid6_recov_intx1 = { | ||
| 101 | .data2 = raid6_2data_recov_intx1, | ||
| 102 | .datap = raid6_datap_recov_intx1, | ||
| 103 | .valid = NULL, | ||
| 104 | .name = "intx1", | ||
| 105 | .priority = 0, | ||
| 106 | }; | ||
| 100 | 107 | ||
| 101 | #ifndef __KERNEL__ | 108 | #ifndef __KERNEL__ |
| 102 | /* Testing only */ | 109 | /* Testing only */ |
diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c new file mode 100644 index 000000000000..37ae61930559 --- /dev/null +++ b/lib/raid6/recov_ssse3.c | |||
| @@ -0,0 +1,335 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Intel Corporation | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of the GNU General Public License | ||
| 6 | * as published by the Free Software Foundation; version 2 | ||
| 7 | * of the License. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) | ||
| 11 | |||
| 12 | #include <linux/raid/pq.h> | ||
| 13 | #include "x86.h" | ||
| 14 | |||
| 15 | static int raid6_has_ssse3(void) | ||
| 16 | { | ||
| 17 | return boot_cpu_has(X86_FEATURE_XMM) && | ||
| 18 | boot_cpu_has(X86_FEATURE_XMM2) && | ||
| 19 | boot_cpu_has(X86_FEATURE_SSSE3); | ||
| 20 | } | ||
| 21 | |||
| 22 | void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, int failb, | ||
| 23 | void **ptrs) | ||
| 24 | { | ||
| 25 | u8 *p, *q, *dp, *dq; | ||
| 26 | const u8 *pbmul; /* P multiplier table for B data */ | ||
| 27 | const u8 *qmul; /* Q multiplier table (for both) */ | ||
| 28 | static const u8 __aligned(16) x0f[16] = { | ||
| 29 | 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, | ||
| 30 | 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f}; | ||
| 31 | |||
| 32 | p = (u8 *)ptrs[disks-2]; | ||
| 33 | q = (u8 *)ptrs[disks-1]; | ||
| 34 | |||
| 35 | /* Compute syndrome with zero for the missing data pages | ||
| 36 | Use the dead data pages as temporary storage for | ||
| 37 | delta p and delta q */ | ||
| 38 | dp = (u8 *)ptrs[faila]; | ||
| 39 | ptrs[faila] = (void *)raid6_empty_zero_page; | ||
| 40 | ptrs[disks-2] = dp; | ||
| 41 | dq = (u8 *)ptrs[failb]; | ||
| 42 | ptrs[failb] = (void *)raid6_empty_zero_page; | ||
| 43 | ptrs[disks-1] = dq; | ||
| 44 | |||
| 45 | raid6_call.gen_syndrome(disks, bytes, ptrs); | ||
| 46 | |||
| 47 | /* Restore pointer table */ | ||
| 48 | ptrs[faila] = dp; | ||
| 49 | ptrs[failb] = dq; | ||
| 50 | ptrs[disks-2] = p; | ||
| 51 | ptrs[disks-1] = q; | ||
| 52 | |||
| 53 | /* Now, pick the proper data tables */ | ||
| 54 | pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; | ||
| 55 | qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ | ||
| 56 | raid6_gfexp[failb]]]; | ||
| 57 | |||
| 58 | kernel_fpu_begin(); | ||
| 59 | |||
| 60 | asm volatile("movdqa %0,%%xmm7" : : "m" (x0f[0])); | ||
| 61 | |||
| 62 | #ifdef CONFIG_X86_64 | ||
| 63 | asm volatile("movdqa %0,%%xmm6" : : "m" (qmul[0])); | ||
| 64 | asm volatile("movdqa %0,%%xmm14" : : "m" (pbmul[0])); | ||
| 65 | asm volatile("movdqa %0,%%xmm15" : : "m" (pbmul[16])); | ||
| 66 | #endif | ||
| 67 | |||
| 68 | /* Now do it... */ | ||
| 69 | while (bytes) { | ||
| 70 | #ifdef CONFIG_X86_64 | ||
| 71 | /* xmm6, xmm14, xmm15 */ | ||
| 72 | |||
| 73 | asm volatile("movdqa %0,%%xmm1" : : "m" (q[0])); | ||
| 74 | asm volatile("movdqa %0,%%xmm9" : : "m" (q[16])); | ||
| 75 | asm volatile("movdqa %0,%%xmm0" : : "m" (p[0])); | ||
| 76 | asm volatile("movdqa %0,%%xmm8" : : "m" (p[16])); | ||
| 77 | asm volatile("pxor %0,%%xmm1" : : "m" (dq[0])); | ||
| 78 | asm volatile("pxor %0,%%xmm9" : : "m" (dq[16])); | ||
| 79 | asm volatile("pxor %0,%%xmm0" : : "m" (dp[0])); | ||
| 80 | asm volatile("pxor %0,%%xmm8" : : "m" (dp[16])); | ||
| 81 | |||
| 82 | /* xmm0/8 = px */ | ||
| 83 | |||
| 84 | asm volatile("movdqa %xmm6,%xmm4"); | ||
| 85 | asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16])); | ||
| 86 | asm volatile("movdqa %xmm6,%xmm12"); | ||
| 87 | asm volatile("movdqa %xmm5,%xmm13"); | ||
| 88 | asm volatile("movdqa %xmm1,%xmm3"); | ||
| 89 | asm volatile("movdqa %xmm9,%xmm11"); | ||
| 90 | asm volatile("movdqa %xmm0,%xmm2"); /* xmm2/10 = px */ | ||
| 91 | asm volatile("movdqa %xmm8,%xmm10"); | ||
| 92 | asm volatile("psraw $4,%xmm1"); | ||
| 93 | asm volatile("psraw $4,%xmm9"); | ||
| 94 | asm volatile("pand %xmm7,%xmm3"); | ||
| 95 | asm volatile("pand %xmm7,%xmm11"); | ||
| 96 | asm volatile("pand %xmm7,%xmm1"); | ||
| 97 | asm volatile("pand %xmm7,%xmm9"); | ||
| 98 | asm volatile("pshufb %xmm3,%xmm4"); | ||
| 99 | asm volatile("pshufb %xmm11,%xmm12"); | ||
| 100 | asm volatile("pshufb %xmm1,%xmm5"); | ||
| 101 | asm volatile("pshufb %xmm9,%xmm13"); | ||
| 102 | asm volatile("pxor %xmm4,%xmm5"); | ||
| 103 | asm volatile("pxor %xmm12,%xmm13"); | ||
| 104 | |||
| 105 | /* xmm5/13 = qx */ | ||
| 106 | |||
| 107 | asm volatile("movdqa %xmm14,%xmm4"); | ||
| 108 | asm volatile("movdqa %xmm15,%xmm1"); | ||
| 109 | asm volatile("movdqa %xmm14,%xmm12"); | ||
| 110 | asm volatile("movdqa %xmm15,%xmm9"); | ||
| 111 | asm volatile("movdqa %xmm2,%xmm3"); | ||
| 112 | asm volatile("movdqa %xmm10,%xmm11"); | ||
| 113 | asm volatile("psraw $4,%xmm2"); | ||
| 114 | asm volatile("psraw $4,%xmm10"); | ||
| 115 | asm volatile("pand %xmm7,%xmm3"); | ||
| 116 | asm volatile("pand %xmm7,%xmm11"); | ||
| 117 | asm volatile("pand %xmm7,%xmm2"); | ||
| 118 | asm volatile("pand %xmm7,%xmm10"); | ||
| 119 | asm volatile("pshufb %xmm3,%xmm4"); | ||
| 120 | asm volatile("pshufb %xmm11,%xmm12"); | ||
| 121 | asm volatile("pshufb %xmm2,%xmm1"); | ||
| 122 | asm volatile("pshufb %xmm10,%xmm9"); | ||
| 123 | asm volatile("pxor %xmm4,%xmm1"); | ||
| 124 | asm volatile("pxor %xmm12,%xmm9"); | ||
| 125 | |||
| 126 | /* xmm1/9 = pbmul[px] */ | ||
| 127 | asm volatile("pxor %xmm5,%xmm1"); | ||
| 128 | asm volatile("pxor %xmm13,%xmm9"); | ||
| 129 | /* xmm1/9 = db = DQ */ | ||
| 130 | asm volatile("movdqa %%xmm1,%0" : "=m" (dq[0])); | ||
| 131 | asm volatile("movdqa %%xmm9,%0" : "=m" (dq[16])); | ||
| 132 | |||
| 133 | asm volatile("pxor %xmm1,%xmm0"); | ||
| 134 | asm volatile("pxor %xmm9,%xmm8"); | ||
| 135 | asm volatile("movdqa %%xmm0,%0" : "=m" (dp[0])); | ||
| 136 | asm volatile("movdqa %%xmm8,%0" : "=m" (dp[16])); | ||
| 137 | |||
| 138 | bytes -= 32; | ||
| 139 | p += 32; | ||
| 140 | q += 32; | ||
| 141 | dp += 32; | ||
| 142 | dq += 32; | ||
| 143 | #else | ||
| 144 | asm volatile("movdqa %0,%%xmm1" : : "m" (*q)); | ||
| 145 | asm volatile("movdqa %0,%%xmm0" : : "m" (*p)); | ||
| 146 | asm volatile("pxor %0,%%xmm1" : : "m" (*dq)); | ||
| 147 | asm volatile("pxor %0,%%xmm0" : : "m" (*dp)); | ||
| 148 | |||
| 149 | /* 1 = dq ^ q | ||
| 150 | * 0 = dp ^ p | ||
| 151 | */ | ||
| 152 | asm volatile("movdqa %0,%%xmm4" : : "m" (qmul[0])); | ||
| 153 | asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16])); | ||
| 154 | |||
| 155 | asm volatile("movdqa %xmm1,%xmm3"); | ||
| 156 | asm volatile("psraw $4,%xmm1"); | ||
| 157 | asm volatile("pand %xmm7,%xmm3"); | ||
| 158 | asm volatile("pand %xmm7,%xmm1"); | ||
| 159 | asm volatile("pshufb %xmm3,%xmm4"); | ||
| 160 | asm volatile("pshufb %xmm1,%xmm5"); | ||
| 161 | asm volatile("pxor %xmm4,%xmm5"); | ||
| 162 | |||
| 163 | asm volatile("movdqa %xmm0,%xmm2"); /* xmm2 = px */ | ||
| 164 | |||
| 165 | /* xmm5 = qx */ | ||
| 166 | |||
| 167 | asm volatile("movdqa %0,%%xmm4" : : "m" (pbmul[0])); | ||
| 168 | asm volatile("movdqa %0,%%xmm1" : : "m" (pbmul[16])); | ||
| 169 | asm volatile("movdqa %xmm2,%xmm3"); | ||
| 170 | asm volatile("psraw $4,%xmm2"); | ||
| 171 | asm volatile("pand %xmm7,%xmm3"); | ||
| 172 | asm volatile("pand %xmm7,%xmm2"); | ||
| 173 | asm volatile("pshufb %xmm3,%xmm4"); | ||
| 174 | asm volatile("pshufb %xmm2,%xmm1"); | ||
| 175 | asm volatile("pxor %xmm4,%xmm1"); | ||
| 176 | |||
| 177 | /* xmm1 = pbmul[px] */ | ||
| 178 | asm volatile("pxor %xmm5,%xmm1"); | ||
| 179 | /* xmm1 = db = DQ */ | ||
| 180 | asm volatile("movdqa %%xmm1,%0" : "=m" (*dq)); | ||
| 181 | |||
| 182 | asm volatile("pxor %xmm1,%xmm0"); | ||
| 183 | asm volatile("movdqa %%xmm0,%0" : "=m" (*dp)); | ||
| 184 | |||
| 185 | bytes -= 16; | ||
| 186 | p += 16; | ||
| 187 | q += 16; | ||
| 188 | dp += 16; | ||
| 189 | dq += 16; | ||
| 190 | #endif | ||
| 191 | } | ||
| 192 | |||
| 193 | kernel_fpu_end(); | ||
| 194 | } | ||
| 195 | |||
| 196 | |||
| 197 | void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, void **ptrs) | ||
| 198 | { | ||
| 199 | u8 *p, *q, *dq; | ||
| 200 | const u8 *qmul; /* Q multiplier table */ | ||
| 201 | static const u8 __aligned(16) x0f[16] = { | ||
| 202 | 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, | ||
| 203 | 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f}; | ||
| 204 | |||
| 205 | p = (u8 *)ptrs[disks-2]; | ||
| 206 | q = (u8 *)ptrs[disks-1]; | ||
| 207 | |||
| 208 | /* Compute syndrome with zero for the missing data page | ||
| 209 | Use the dead data page as temporary storage for delta q */ | ||
| 210 | dq = (u8 *)ptrs[faila]; | ||
| 211 | ptrs[faila] = (void *)raid6_empty_zero_page; | ||
| 212 | ptrs[disks-1] = dq; | ||
| 213 | |||
| 214 | raid6_call.gen_syndrome(disks, bytes, ptrs); | ||
| 215 | |||
| 216 | /* Restore pointer table */ | ||
| 217 | ptrs[faila] = dq; | ||
| 218 | ptrs[disks-1] = q; | ||
| 219 | |||
| 220 | /* Now, pick the proper data tables */ | ||
| 221 | qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; | ||
| 222 | |||
| 223 | kernel_fpu_begin(); | ||
| 224 | |||
| 225 | asm volatile("movdqa %0, %%xmm7" : : "m" (x0f[0])); | ||
| 226 | |||
| 227 | while (bytes) { | ||
| 228 | #ifdef CONFIG_X86_64 | ||
| 229 | asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0])); | ||
| 230 | asm volatile("movdqa %0, %%xmm4" : : "m" (dq[16])); | ||
| 231 | asm volatile("pxor %0, %%xmm3" : : "m" (q[0])); | ||
| 232 | asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0])); | ||
| 233 | |||
| 234 | /* xmm3 = q[0] ^ dq[0] */ | ||
| 235 | |||
| 236 | asm volatile("pxor %0, %%xmm4" : : "m" (q[16])); | ||
| 237 | asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16])); | ||
| 238 | |||
| 239 | /* xmm4 = q[16] ^ dq[16] */ | ||
| 240 | |||
| 241 | asm volatile("movdqa %xmm3, %xmm6"); | ||
| 242 | asm volatile("movdqa %xmm4, %xmm8"); | ||
| 243 | |||
| 244 | /* xmm4 = xmm8 = q[16] ^ dq[16] */ | ||
| 245 | |||
| 246 | asm volatile("psraw $4, %xmm3"); | ||
| 247 | asm volatile("pand %xmm7, %xmm6"); | ||
| 248 | asm volatile("pand %xmm7, %xmm3"); | ||
| 249 | asm volatile("pshufb %xmm6, %xmm0"); | ||
| 250 | asm volatile("pshufb %xmm3, %xmm1"); | ||
| 251 | asm volatile("movdqa %0, %%xmm10" : : "m" (qmul[0])); | ||
| 252 | asm volatile("pxor %xmm0, %xmm1"); | ||
| 253 | asm volatile("movdqa %0, %%xmm11" : : "m" (qmul[16])); | ||
| 254 | |||
| 255 | /* xmm1 = qmul[q[0] ^ dq[0]] */ | ||
| 256 | |||
| 257 | asm volatile("psraw $4, %xmm4"); | ||
| 258 | asm volatile("pand %xmm7, %xmm8"); | ||
| 259 | asm volatile("pand %xmm7, %xmm4"); | ||
| 260 | asm volatile("pshufb %xmm8, %xmm10"); | ||
| 261 | asm volatile("pshufb %xmm4, %xmm11"); | ||
| 262 | asm volatile("movdqa %0, %%xmm2" : : "m" (p[0])); | ||
| 263 | asm volatile("pxor %xmm10, %xmm11"); | ||
| 264 | asm volatile("movdqa %0, %%xmm12" : : "m" (p[16])); | ||
| 265 | |||
| 266 | /* xmm11 = qmul[q[16] ^ dq[16]] */ | ||
| 267 | |||
| 268 | asm volatile("pxor %xmm1, %xmm2"); | ||
| 269 | |||
| 270 | /* xmm2 = p[0] ^ qmul[q[0] ^ dq[0]] */ | ||
| 271 | |||
| 272 | asm volatile("pxor %xmm11, %xmm12"); | ||
| 273 | |||
| 274 | /* xmm12 = p[16] ^ qmul[q[16] ^ dq[16]] */ | ||
| 275 | |||
| 276 | asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); | ||
| 277 | asm volatile("movdqa %%xmm11, %0" : "=m" (dq[16])); | ||
| 278 | |||
| 279 | asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); | ||
| 280 | asm volatile("movdqa %%xmm12, %0" : "=m" (p[16])); | ||
| 281 | |||
| 282 | bytes -= 32; | ||
| 283 | p += 32; | ||
| 284 | q += 32; | ||
| 285 | dq += 32; | ||
| 286 | |||
| 287 | #else | ||
| 288 | asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0])); | ||
| 289 | asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0])); | ||
| 290 | asm volatile("pxor %0, %%xmm3" : : "m" (q[0])); | ||
| 291 | asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16])); | ||
| 292 | |||
| 293 | /* xmm3 = *q ^ *dq */ | ||
| 294 | |||
| 295 | asm volatile("movdqa %xmm3, %xmm6"); | ||
| 296 | asm volatile("movdqa %0, %%xmm2" : : "m" (p[0])); | ||
| 297 | asm volatile("psraw $4, %xmm3"); | ||
| 298 | asm volatile("pand %xmm7, %xmm6"); | ||
| 299 | asm volatile("pand %xmm7, %xmm3"); | ||
| 300 | asm volatile("pshufb %xmm6, %xmm0"); | ||
| 301 | asm volatile("pshufb %xmm3, %xmm1"); | ||
| 302 | asm volatile("pxor %xmm0, %xmm1"); | ||
| 303 | |||
| 304 | /* xmm1 = qmul[*q ^ *dq] */ | ||
| 305 | |||
| 306 | asm volatile("pxor %xmm1, %xmm2"); | ||
| 307 | |||
| 308 | /* xmm2 = *p ^ qmul[*q ^ *dq] */ | ||
| 309 | |||
| 310 | asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); | ||
| 311 | asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); | ||
| 312 | |||
| 313 | bytes -= 16; | ||
| 314 | p += 16; | ||
| 315 | q += 16; | ||
| 316 | dq += 16; | ||
| 317 | #endif | ||
| 318 | } | ||
| 319 | |||
| 320 | kernel_fpu_end(); | ||
| 321 | } | ||
| 322 | |||
| 323 | const struct raid6_recov_calls raid6_recov_ssse3 = { | ||
| 324 | .data2 = raid6_2data_recov_ssse3, | ||
| 325 | .datap = raid6_datap_recov_ssse3, | ||
| 326 | .valid = raid6_has_ssse3, | ||
| 327 | #ifdef CONFIG_X86_64 | ||
| 328 | .name = "ssse3x2", | ||
| 329 | #else | ||
| 330 | .name = "ssse3x1", | ||
| 331 | #endif | ||
| 332 | .priority = 1, | ||
| 333 | }; | ||
| 334 | |||
| 335 | #endif | ||
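
The inner loops above are built around one trick: a GF(256) multiply of sixteen bytes at a time is done as two PSHUFB table lookups (one per nibble, against the two halves of a raid6_vgfmul row) followed by a PXOR. The following is an intrinsics sketch of that pattern for anyone reading the inline asm, not kernel code; it needs SSSE3 (build with -mssse3), and the table construction mirrors what mktables.c emits.

#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>		/* SSSE3 intrinsics; build with -mssse3 */

/* gfmul() as in mktables.c (0x1d reduction), used here both to build a
 * vgfmul-style row and as the scalar reference. */
static uint8_t gfmul(uint8_t a, uint8_t b)
{
	uint8_t v = 0;

	while (b) {
		if (b & 1)
			v ^= a;
		a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
		b >>= 1;
	}
	return v;
}

/* Multiply 16 bytes by one constant: PSHUFB on the low nibbles against
 * the first half of the row, PSHUFB on the high nibbles against the
 * second half, XOR the partial products.  The asm above does the same
 * thing, using PSRAW $4 + PAND where this sketch shifts 64-bit lanes. */
static __m128i gf_mul16(__m128i x, __m128i vlo, __m128i vhi, __m128i x0f)
{
	__m128i lo = _mm_and_si128(x, x0f);
	__m128i hi = _mm_and_si128(_mm_srli_epi64(x, 4), x0f);

	return _mm_xor_si128(_mm_shuffle_epi8(vlo, lo),
			     _mm_shuffle_epi8(vhi, hi));
}

int main(void)
{
	uint8_t coef = 0x8e, tbl[32], in[16], out[16];
	int i;

	for (i = 0; i < 16; i++) {
		tbl[i]      = gfmul(coef, i);		/* vgfmul[coef][0..15]  */
		tbl[16 + i] = gfmul(coef, i << 4);	/* vgfmul[coef][16..31] */
		in[i] = 17 * i + 3;			/* arbitrary test bytes */
	}

	__m128i x0f = _mm_set1_epi8(0x0f);
	__m128i vlo = _mm_loadu_si128((const __m128i *)&tbl[0]);
	__m128i vhi = _mm_loadu_si128((const __m128i *)&tbl[16]);
	__m128i r = gf_mul16(_mm_loadu_si128((const __m128i *)in), vlo, vhi, x0f);

	_mm_storeu_si128((__m128i *)out, r);
	for (i = 0; i < 16; i++)
		if (out[i] != gfmul(coef, in[i]))
			return 1;
	printf("vector multiply matches scalar gfmul\n");
	return 0;
}
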
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index aa651697b6dc..c76151d94764 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile | |||
| @@ -23,7 +23,7 @@ RANLIB = ranlib | |||
| 23 | all: raid6.a raid6test | 23 | all: raid6.a raid6test |
| 24 | 24 | ||
| 25 | raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \ | 25 | raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \ |
| 26 | altivec1.o altivec2.o altivec4.o altivec8.o recov.o algos.o \ | 26 | altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o algos.o \ |
| 27 | tables.o | 27 | tables.o |
| 28 | rm -f $@ | 28 | rm -f $@ |
| 29 | $(AR) cq $@ $^ | 29 | $(AR) cq $@ $^ |
diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c index 7a930318b17d..5a485b7a7d3c 100644 --- a/lib/raid6/test/test.c +++ b/lib/raid6/test/test.c | |||
| @@ -90,25 +90,35 @@ static int test_disks(int i, int j) | |||
| 90 | int main(int argc, char *argv[]) | 90 | int main(int argc, char *argv[]) |
| 91 | { | 91 | { |
| 92 | const struct raid6_calls *const *algo; | 92 | const struct raid6_calls *const *algo; |
| 93 | const struct raid6_recov_calls *const *ra; | ||
| 93 | int i, j; | 94 | int i, j; |
| 94 | int err = 0; | 95 | int err = 0; |
| 95 | 96 | ||
| 96 | makedata(); | 97 | makedata(); |
| 97 | 98 | ||
| 98 | for (algo = raid6_algos; *algo; algo++) { | 99 | for (ra = raid6_recov_algos; *ra; ra++) { |
| 99 | if (!(*algo)->valid || (*algo)->valid()) { | 100 | if ((*ra)->valid && !(*ra)->valid()) |
| 100 | raid6_call = **algo; | 101 | continue; |
| 102 | raid6_2data_recov = (*ra)->data2; | ||
| 103 | raid6_datap_recov = (*ra)->datap; | ||
| 101 | 104 | ||
| 102 | /* Nuke syndromes */ | 105 | printf("using recovery %s\n", (*ra)->name); |
| 103 | memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); | ||
| 104 | 106 | ||
| 105 | /* Generate assumed good syndrome */ | 107 | for (algo = raid6_algos; *algo; algo++) { |
| 106 | raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, | 108 | if (!(*algo)->valid || (*algo)->valid()) { |
| 107 | (void **)&dataptrs); | 109 | raid6_call = **algo; |
| 108 | 110 | ||
| 109 | for (i = 0; i < NDISKS-1; i++) | 111 | /* Nuke syndromes */ |
| 110 | for (j = i+1; j < NDISKS; j++) | 112 | memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); |
| 111 | err += test_disks(i, j); | 113 | |
| 114 | /* Generate assumed good syndrome */ | ||
| 115 | raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, | ||
| 116 | (void **)&dataptrs); | ||
| 117 | |||
| 118 | for (i = 0; i < NDISKS-1; i++) | ||
| 119 | for (j = i+1; j < NDISKS; j++) | ||
| 120 | err += test_disks(i, j); | ||
| 121 | } | ||
| 112 | } | 122 | } |
| 113 | printf("\n"); | 123 | printf("\n"); |
| 114 | } | 124 | } |
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h index cb2a8c91c886..d55d63232c55 100644 --- a/lib/raid6/x86.h +++ b/lib/raid6/x86.h | |||
| @@ -35,24 +35,29 @@ static inline void kernel_fpu_end(void) | |||
| 35 | { | 35 | { |
| 36 | } | 36 | } |
| 37 | 37 | ||
| 38 | #define __aligned(x) __attribute__((aligned(x))) | ||
| 39 | |||
| 38 | #define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ | 40 | #define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ |
| 39 | #define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions | 41 | #define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions |
| 40 | * (fast save and restore) */ | 42 | * (fast save and restore) */ |
| 41 | #define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */ | 43 | #define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */ |
| 42 | #define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */ | 44 | #define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */ |
| 45 | #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ | ||
| 46 | #define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ | ||
| 47 | #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ | ||
| 43 | #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ | 48 | #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ |
| 44 | 49 | ||
| 45 | /* Should work well enough on modern CPUs for testing */ | 50 | /* Should work well enough on modern CPUs for testing */ |
| 46 | static inline int boot_cpu_has(int flag) | 51 | static inline int boot_cpu_has(int flag) |
| 47 | { | 52 | { |
| 48 | u32 eax = (flag >> 5) ? 0x80000001 : 1; | 53 | u32 eax = (flag & 0x20) ? 0x80000001 : 1; |
| 49 | u32 edx; | 54 | u32 ecx, edx; |
| 50 | 55 | ||
| 51 | asm volatile("cpuid" | 56 | asm volatile("cpuid" |
| 52 | : "+a" (eax), "=d" (edx) | 57 | : "+a" (eax), "=d" (edx), "=c" (ecx) |
| 53 | : : "ecx", "ebx"); | 58 | : : "ebx"); |
| 54 | 59 | ||
| 55 | return (edx >> (flag & 31)) & 1; | 60 | return ((flag & 0x80 ? ecx : edx) >> (flag & 31)) & 1; |
| 56 | } | 61 | } |
| 57 | 62 | ||
| 58 | #endif /* ndef __KERNEL__ */ | 63 | #endif /* ndef __KERNEL__ */ |
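
The userspace boot_cpu_has() shim now reads both EDX and ECX so that SSSE3 and AVX, which live in CPUID leaf 1 ECX, can be probed by the test harness. The encoding in the header makes this work: the flag's word number picks the leaf and register, and because only words 0, 1 and 4 are defined here, testing bit 0x20 selects the 0x80000001 leaf and bit 0x80 selects ECX. A small stand-alone decoder, purely illustrative, shows the mapping:

#include <stdio.h>

/* Not kernel or test-harness code: word 0 (flags 0x00-0x1f) and word 4
 * (flags 0x80-0x9f) both come from CPUID leaf 1, in EDX and ECX
 * respectively, while word 1 (flags 0x20-0x3f) comes from leaf
 * 0x80000001 EDX. */
#define X86_FEATURE_XMM		(0*32+25)	/* leaf 1, EDX */
#define X86_FEATURE_MMXEXT	(1*32+22)	/* leaf 0x80000001, EDX */
#define X86_FEATURE_SSSE3	(4*32+ 9)	/* leaf 1, ECX */

static void decode(int flag)
{
	unsigned int leaf = (flag & 0x20) ? 0x80000001u : 1u;
	const char *reg = (flag & 0x80) ? "ecx" : "edx";

	printf("flag %3d -> cpuid leaf 0x%08x, %s bit %d\n",
	       flag, leaf, reg, flag & 31);
}

int main(void)
{
	decode(X86_FEATURE_XMM);
	decode(X86_FEATURE_MMXEXT);
	decode(X86_FEATURE_SSSE3);
	return 0;
}
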
