diff options
Diffstat (limited to 'drivers/block/drbd/drbd_vli.h')
| -rw-r--r-- | drivers/block/drbd/drbd_vli.h | 351 |
1 files changed, 351 insertions, 0 deletions
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h new file mode 100644 index 000000000000..fc824006e721 --- /dev/null +++ b/drivers/block/drbd/drbd_vli.h | |||
| @@ -0,0 +1,351 @@ | |||
| 1 | /* | ||
| 2 | -*- linux-c -*- | ||
| 3 | drbd_receiver.c | ||
| 4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
| 5 | |||
| 6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
| 7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
| 8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
| 9 | |||
| 10 | drbd is free software; you can redistribute it and/or modify | ||
| 11 | it under the terms of the GNU General Public License as published by | ||
| 12 | the Free Software Foundation; either version 2, or (at your option) | ||
| 13 | any later version. | ||
| 14 | |||
| 15 | drbd is distributed in the hope that it will be useful, | ||
| 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 18 | GNU General Public License for more details. | ||
| 19 | |||
| 20 | You should have received a copy of the GNU General Public License | ||
| 21 | along with drbd; see the file COPYING. If not, write to | ||
| 22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
| 23 | */ | ||
| 24 | |||
| 25 | #ifndef _DRBD_VLI_H | ||
| 26 | #define _DRBD_VLI_H | ||
| 27 | |||
| 28 | /* | ||
| 29 | * At a granularity of 4KiB storage represented per bit, | ||
| 30 | * and stroage sizes of several TiB, | ||
| 31 | * and possibly small-bandwidth replication, | ||
| 32 | * the bitmap transfer time can take much too long, | ||
| 33 | * if transmitted in plain text. | ||
| 34 | * | ||
| 35 | * We try to reduce the transfered bitmap information | ||
| 36 | * by encoding runlengths of bit polarity. | ||
| 37 | * | ||
| 38 | * We never actually need to encode a "zero" (runlengths are positive). | ||
| 39 | * But then we have to store the value of the first bit. | ||
| 40 | * The first bit of information thus shall encode if the first runlength | ||
| 41 | * gives the number of set or unset bits. | ||
| 42 | * | ||
| 43 | * We assume that large areas are either completely set or unset, | ||
| 44 | * which gives good compression with any runlength method, | ||
| 45 | * even when encoding the runlength as fixed size 32bit/64bit integers. | ||
| 46 | * | ||
| 47 | * Still, there may be areas where the polarity flips every few bits, | ||
| 48 | * and encoding the runlength sequence of those areas with fix size | ||
| 49 | * integers would be much worse than plaintext. | ||
| 50 | * | ||
| 51 | * We want to encode small runlength values with minimum code length, | ||
| 52 | * while still being able to encode a Huge run of all zeros. | ||
| 53 | * | ||
| 54 | * Thus we need a Variable Length Integer encoding, VLI. | ||
| 55 | * | ||
| 56 | * For some cases, we produce more code bits than plaintext input. | ||
| 57 | * We need to send incompressible chunks as plaintext, skip over them | ||
| 58 | * and then see if the next chunk compresses better. | ||
| 59 | * | ||
| 60 | * We don't care too much about "excellent" compression ratio for large | ||
| 61 | * runlengths (all set/all clear): whether we achieve a factor of 100 | ||
| 62 | * or 1000 is not that much of an issue. | ||
| 63 | * We do not want to waste too much on short runlengths in the "noisy" | ||
| 64 | * parts of the bitmap, though. | ||
| 65 | * | ||
| 66 | * There are endless variants of VLI, we experimented with: | ||
| 67 | * * simple byte-based | ||
| 68 | * * various bit based with different code word length. | ||
| 69 | * | ||
| 70 | * To avoid yet an other configuration parameter (choice of bitmap compression | ||
| 71 | * algorithm) which was difficult to explain and tune, we just chose the one | ||
| 72 | * variant that turned out best in all test cases. | ||
| 73 | * Based on real world usage patterns, with device sizes ranging from a few GiB | ||
| 74 | * to several TiB, file server/mailserver/webserver/mysql/postgress, | ||
| 75 | * mostly idle to really busy, the all time winner (though sometimes only | ||
| 76 | * marginally better) is: | ||
| 77 | */ | ||
| 78 | |||
| 79 | /* | ||
| 80 | * encoding is "visualised" as | ||
| 81 | * __little endian__ bitstream, least significant bit first (left most) | ||
| 82 | * | ||
| 83 | * this particular encoding is chosen so that the prefix code | ||
| 84 | * starts as unary encoding the level, then modified so that | ||
| 85 | * 10 levels can be described in 8bit, with minimal overhead | ||
| 86 | * for the smaller levels. | ||
| 87 | * | ||
| 88 | * Number of data bits follow fibonacci sequence, with the exception of the | ||
| 89 | * last level (+1 data bit, so it makes 64bit total). The only worse code when | ||
| 90 | * encoding bit polarity runlength is 1 plain bits => 2 code bits. | ||
| 91 | prefix data bits max val NÂș data bits | ||
| 92 | 0 x 0x2 1 | ||
| 93 | 10 x 0x4 1 | ||
| 94 | 110 xx 0x8 2 | ||
| 95 | 1110 xxx 0x10 3 | ||
| 96 | 11110 xxx xx 0x30 5 | ||
| 97 | 111110 xx xxxxxx 0x130 8 | ||
| 98 | 11111100 xxxxxxxx xxxxx 0x2130 13 | ||
| 99 | 11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 | ||
| 100 | 11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 | ||
| 101 | 11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 | ||
| 102 | * maximum encodable value: 0x100000400202130 == 2**56 + some */ | ||
| 103 | |||
| 104 | /* compression "table": | ||
| 105 | transmitted x 0.29 | ||
| 106 | as plaintext x ........................ | ||
| 107 | x ........................ | ||
| 108 | x ........................ | ||
| 109 | x 0.59 0.21........................ | ||
| 110 | x ........................................................ | ||
| 111 | x .. c ................................................... | ||
| 112 | x 0.44.. o ................................................... | ||
| 113 | x .......... d ................................................... | ||
| 114 | x .......... e ................................................... | ||
| 115 | X............. ................................................... | ||
| 116 | x.............. b ................................................... | ||
| 117 | 2.0x............... i ................................................... | ||
| 118 | #X................ t ................................................... | ||
| 119 | #................. s ........................... plain bits .......... | ||
| 120 | -+----------------------------------------------------------------------- | ||
| 121 | 1 16 32 64 | ||
| 122 | */ | ||
| 123 | |||
| 124 | /* LEVEL: (total bits, prefix bits, prefix value), | ||
| 125 | * sorted ascending by number of total bits. | ||
| 126 | * The rest of the code table is calculated at compiletime from this. */ | ||
| 127 | |||
| 128 | /* fibonacci data 1, 1, ... */ | ||
| 129 | #define VLI_L_1_1() do { \ | ||
| 130 | LEVEL( 2, 1, 0x00); \ | ||
| 131 | LEVEL( 3, 2, 0x01); \ | ||
| 132 | LEVEL( 5, 3, 0x03); \ | ||
| 133 | LEVEL( 7, 4, 0x07); \ | ||
| 134 | LEVEL(10, 5, 0x0f); \ | ||
| 135 | LEVEL(14, 6, 0x1f); \ | ||
| 136 | LEVEL(21, 8, 0x3f); \ | ||
| 137 | LEVEL(29, 8, 0x7f); \ | ||
| 138 | LEVEL(42, 8, 0xbf); \ | ||
| 139 | LEVEL(64, 8, 0xff); \ | ||
| 140 | } while (0) | ||
| 141 | |||
| 142 | /* finds a suitable level to decode the least significant part of in. | ||
| 143 | * returns number of bits consumed. | ||
| 144 | * | ||
| 145 | * BUG() for bad input, as that would mean a buggy code table. */ | ||
| 146 | static inline int vli_decode_bits(u64 *out, const u64 in) | ||
| 147 | { | ||
| 148 | u64 adj = 1; | ||
| 149 | |||
| 150 | #define LEVEL(t,b,v) \ | ||
| 151 | do { \ | ||
| 152 | if ((in & ((1 << b) -1)) == v) { \ | ||
| 153 | *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ | ||
| 154 | return t; \ | ||
| 155 | } \ | ||
| 156 | adj += 1ULL << (t - b); \ | ||
| 157 | } while (0) | ||
| 158 | |||
| 159 | VLI_L_1_1(); | ||
| 160 | |||
| 161 | /* NOT REACHED, if VLI_LEVELS code table is defined properly */ | ||
| 162 | BUG(); | ||
| 163 | #undef LEVEL | ||
| 164 | } | ||
| 165 | |||
| 166 | /* return number of code bits needed, | ||
| 167 | * or negative error number */ | ||
| 168 | static inline int __vli_encode_bits(u64 *out, const u64 in) | ||
| 169 | { | ||
| 170 | u64 max = 0; | ||
| 171 | u64 adj = 1; | ||
| 172 | |||
| 173 | if (in == 0) | ||
| 174 | return -EINVAL; | ||
| 175 | |||
| 176 | #define LEVEL(t,b,v) do { \ | ||
| 177 | max += 1ULL << (t - b); \ | ||
| 178 | if (in <= max) { \ | ||
| 179 | if (out) \ | ||
| 180 | *out = ((in - adj) << b) | v; \ | ||
| 181 | return t; \ | ||
| 182 | } \ | ||
| 183 | adj = max + 1; \ | ||
| 184 | } while (0) | ||
| 185 | |||
| 186 | VLI_L_1_1(); | ||
| 187 | |||
| 188 | return -EOVERFLOW; | ||
| 189 | #undef LEVEL | ||
| 190 | } | ||
| 191 | |||
| 192 | #undef VLI_L_1_1 | ||
| 193 | |||
| 194 | /* code from here down is independend of actually used bit code */ | ||
| 195 | |||
| 196 | /* | ||
| 197 | * Code length is determined by some unique (e.g. unary) prefix. | ||
| 198 | * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, | ||
| 199 | * not a byte stream. | ||
| 200 | */ | ||
| 201 | |||
| 202 | /* for the bitstream, we need a cursor */ | ||
| 203 | struct bitstream_cursor { | ||
| 204 | /* the current byte */ | ||
| 205 | u8 *b; | ||
| 206 | /* the current bit within *b, nomalized: 0..7 */ | ||
| 207 | unsigned int bit; | ||
| 208 | }; | ||
| 209 | |||
| 210 | /* initialize cursor to point to first bit of stream */ | ||
| 211 | static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) | ||
| 212 | { | ||
| 213 | cur->b = s; | ||
| 214 | cur->bit = 0; | ||
| 215 | } | ||
| 216 | |||
| 217 | /* advance cursor by that many bits; maximum expected input value: 64, | ||
| 218 | * but depending on VLI implementation, it may be more. */ | ||
| 219 | static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) | ||
| 220 | { | ||
| 221 | bits += cur->bit; | ||
| 222 | cur->b = cur->b + (bits >> 3); | ||
| 223 | cur->bit = bits & 7; | ||
| 224 | } | ||
| 225 | |||
| 226 | /* the bitstream itself knows its length */ | ||
| 227 | struct bitstream { | ||
| 228 | struct bitstream_cursor cur; | ||
| 229 | unsigned char *buf; | ||
| 230 | size_t buf_len; /* in bytes */ | ||
| 231 | |||
| 232 | /* for input stream: | ||
| 233 | * number of trailing 0 bits for padding | ||
| 234 | * total number of valid bits in stream: buf_len * 8 - pad_bits */ | ||
| 235 | unsigned int pad_bits; | ||
| 236 | }; | ||
| 237 | |||
| 238 | static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) | ||
| 239 | { | ||
| 240 | bs->buf = s; | ||
| 241 | bs->buf_len = len; | ||
| 242 | bs->pad_bits = pad_bits; | ||
| 243 | bitstream_cursor_reset(&bs->cur, bs->buf); | ||
| 244 | } | ||
| 245 | |||
| 246 | static inline void bitstream_rewind(struct bitstream *bs) | ||
| 247 | { | ||
| 248 | bitstream_cursor_reset(&bs->cur, bs->buf); | ||
| 249 | memset(bs->buf, 0, bs->buf_len); | ||
| 250 | } | ||
| 251 | |||
| 252 | /* Put (at most 64) least significant bits of val into bitstream, and advance cursor. | ||
| 253 | * Ignores "pad_bits". | ||
| 254 | * Returns zero if bits == 0 (nothing to do). | ||
| 255 | * Returns number of bits used if successful. | ||
| 256 | * | ||
| 257 | * If there is not enough room left in bitstream, | ||
| 258 | * leaves bitstream unchanged and returns -ENOBUFS. | ||
| 259 | */ | ||
| 260 | static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) | ||
| 261 | { | ||
| 262 | unsigned char *b = bs->cur.b; | ||
| 263 | unsigned int tmp; | ||
| 264 | |||
| 265 | if (bits == 0) | ||
| 266 | return 0; | ||
| 267 | |||
| 268 | if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) | ||
| 269 | return -ENOBUFS; | ||
| 270 | |||
| 271 | /* paranoia: strip off hi bits; they should not be set anyways. */ | ||
| 272 | if (bits < 64) | ||
| 273 | val &= ~0ULL >> (64 - bits); | ||
| 274 | |||
| 275 | *b++ |= (val & 0xff) << bs->cur.bit; | ||
| 276 | |||
| 277 | for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) | ||
| 278 | *b++ |= (val >> tmp) & 0xff; | ||
| 279 | |||
| 280 | bitstream_cursor_advance(&bs->cur, bits); | ||
| 281 | return bits; | ||
| 282 | } | ||
| 283 | |||
| 284 | /* Fetch (at most 64) bits from bitstream into *out, and advance cursor. | ||
| 285 | * | ||
| 286 | * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged. | ||
| 287 | * | ||
| 288 | * If there are less than the requested number of valid bits left in the | ||
| 289 | * bitstream, still fetches all available bits. | ||
| 290 | * | ||
| 291 | * Returns number of actually fetched bits. | ||
| 292 | */ | ||
| 293 | static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) | ||
| 294 | { | ||
| 295 | u64 val; | ||
| 296 | unsigned int n; | ||
| 297 | |||
| 298 | if (bits > 64) | ||
| 299 | return -EINVAL; | ||
| 300 | |||
| 301 | if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) | ||
| 302 | bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) | ||
| 303 | - bs->cur.bit - bs->pad_bits; | ||
| 304 | |||
| 305 | if (bits == 0) { | ||
| 306 | *out = 0; | ||
| 307 | return 0; | ||
| 308 | } | ||
| 309 | |||
| 310 | /* get the high bits */ | ||
| 311 | val = 0; | ||
| 312 | n = (bs->cur.bit + bits + 7) >> 3; | ||
| 313 | /* n may be at most 9, if cur.bit + bits > 64 */ | ||
| 314 | /* which means this copies at most 8 byte */ | ||
| 315 | if (n) { | ||
| 316 | memcpy(&val, bs->cur.b+1, n - 1); | ||
| 317 | val = le64_to_cpu(val) << (8 - bs->cur.bit); | ||
| 318 | } | ||
| 319 | |||
| 320 | /* we still need the low bits */ | ||
| 321 | val |= bs->cur.b[0] >> bs->cur.bit; | ||
| 322 | |||
| 323 | /* and mask out bits we don't want */ | ||
| 324 | val &= ~0ULL >> (64 - bits); | ||
| 325 | |||
| 326 | bitstream_cursor_advance(&bs->cur, bits); | ||
| 327 | *out = val; | ||
| 328 | |||
| 329 | return bits; | ||
| 330 | } | ||
| 331 | |||
| 332 | /* encodes @in as vli into @bs; | ||
| 333 | |||
| 334 | * return values | ||
| 335 | * > 0: number of bits successfully stored in bitstream | ||
| 336 | * -ENOBUFS @bs is full | ||
| 337 | * -EINVAL input zero (invalid) | ||
| 338 | * -EOVERFLOW input too large for this vli code (invalid) | ||
| 339 | */ | ||
| 340 | static inline int vli_encode_bits(struct bitstream *bs, u64 in) | ||
| 341 | { | ||
| 342 | u64 code = code; | ||
| 343 | int bits = __vli_encode_bits(&code, in); | ||
| 344 | |||
| 345 | if (bits <= 0) | ||
| 346 | return bits; | ||
| 347 | |||
| 348 | return bitstream_put_bits(bs, code, bits); | ||
| 349 | } | ||
| 350 | |||
| 351 | #endif | ||
