20 files changed, 1430 insertions, 45 deletions
diff --git a/lib/842/842.h b/lib/842/842.h
new file mode 100644
index 000000000000..7c200030acf7
--- /dev/null
+++ b/lib/842/842.h
@@ -0,0 +1,127 @@
+#ifndef __842_H__
+#define __842_H__
+/* The 842 compressed format is made up of multiple blocks, each of
+ * which have the format:
+ *
+ * <template>[arg1][arg2][arg3][arg4]
+ *
+ * where there are between 0 and 4 template args, depending on the specific
+ * template operation.  For normal operations, each arg is either a specific
+ * number of data bytes to add to the output buffer, or an index pointing
+ * to a previously-written number of data bytes to copy to the output buffer.
+ *
+ * The template code is a 5-bit value.  This code indicates what to do with
+ * the following data.  Template codes from 0 to 0x19 should use the template
+ * table, the static "decomp_ops" table used in decompress.  For each template
+ * (table row), there are between 1 and 4 actions; each action corresponds to
+ * an arg following the template code bits.  Each action is either a "data"
+ * type action, or a "index" type action, and each action results in 2, 4, or 8
+ * bytes being written to the output buffer.  Each template (i.e. all actions
+ * in the table row) will add up to 8 bytes being written to the output buffer.
+ * Any row with less than 4 actions is padded with noop actions, indicated by
+ * N0 (for which there is no corresponding arg in the compressed data buffer).
+ *
+ * "Data" actions, indicated in the table by D2, D4, and D8, mean that the
+ * corresponding arg is 2, 4, or 8 bytes, respectively, in the compressed data
+ * buffer should be copied directly to the output buffer.
+ *
+ * "Index" actions, indicated in the table by I2, I4, and I8, mean the
+ * corresponding arg is an index parameter that points to, respectively, a 2,
+ * 4, or 8 byte value already in the output buffer, that should be copied to
+ * the end of the output buffer.  Essentially, the index points to a position
+ * in a ring buffer that contains the last N bytes of output buffer data.
+ * The number of bits for each index's arg are: 8 bits for I2, 9 bits for I4,
+ * and 8 bits for I8.  Since each index points to a 2, 4, or 8 byte section,
+ * this means that I2 can reference 512 bytes ((2^8 bits = 256) * 2 bytes), I4
+ * can reference 2048 bytes ((2^9 = 512) * 4 bytes), and I8 can reference 2048
+ * bytes ((2^8 = 256) * 8 bytes).  Think of it as a kind-of ring buffer for
+ * each of I2, I4, and I8 that are updated for each byte written to the output
+ * buffer.  In this implementation, the output buffer is directly used for each
+ * index; there is no additional memory required.  Note that the index is into
+ * a ring buffer, not a sliding window; for example, if there have been 260
+ * bytes written to the output buffer, an I2 index of 0 would index to byte 256
+ * in the output buffer, while an I2 index of 16 would index to byte 16 in the
+ * output buffer.
+ *
+ * There are also 3 special template codes; 0x1b for "repeat", 0x1c for
+ * "zeros", and 0x1e for "end".  The "repeat" operation is followed by a 6 bit
+ * arg N indicating how many times to repeat.  The last 8 bytes written to the
+ * output buffer are written again to the output buffer, N + 1 times.  The
+ * "zeros" operation, which has no arg bits, writes 8 zeros to the output
+ * buffer.  The "end" operation, which also has no arg bits, signals the end
+ * of the compressed data.  There may be some number of padding (don't care,
+ * but usually 0) bits after the "end" operation bits, to fill the buffer
+ * length to a specific byte multiple (usually a multiple of 8, 16, or 32
+ * bytes).
+ *
+ * This software implementation also uses one of the undefined template values,
+ * 0x1d as a special "short data" template code, to represent less than 8 bytes
+ * of uncompressed data.  It is followed by a 3 bit arg N indicating how many
+ * data bytes will follow, and then N bytes of data, which should be copied to
+ * the output buffer.  This allows the software 842 compressor to accept input
+ * buffers that are not an exact multiple of 8 bytes long.  However, those
+ * compressed buffers containing this sw-only template will be rejected by
+ * the 842 hardware decompressor, and must be decompressed with this software
+ * library.  The 842 software compression module includes a parameter to
+ * disable using this sw-only "short data" template, and instead simply
+ * reject any input buffer that is not a multiple of 8 bytes long.
+ *
+ * After all actions for each operation code are processed, another template
+ * code is in the next 5 bits.  The decompression ends once the "end" template
+ * code is detected.
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <asm/unaligned.h>
+#include <linux/sw842.h>
+/* special templates */
+#define OP_REPEAT       (0x1B)
+#define OP_ZEROS        (0x1C)
+#define OP_END          (0x1E)
+/* sw only template - this is not in the hw design; it's used only by this
+ * software compressor and decompressor, to allow input buffers that aren't
+ * a multiple of 8.
+ */
+#define OP_SHORT_DATA   (0x1D)
+/* additional bits of each op param */
+#define OP_BITS         (5)
+#define REPEAT_BITS     (6)
+#define SHORT_DATA_BITS (3)
+#define I2_BITS         (8)
+#define I4_BITS         (9)
+#define I8_BITS         (8)
+#define REPEAT_BITS_MAX         (0x3f)
+#define SHORT_DATA_BITS_MAX     (0x7)
+/* Arbitrary values used to indicate action */
+#define OP_ACTION       (0x70)
+#define OP_ACTION_INDEX (0x10)
+#define OP_ACTION_DATA  (0x20)
+#define OP_ACTION_NOOP  (0x40)
+#define OP_AMOUNT       (0x0f)
+#define OP_AMOUNT_0     (0x00)
+#define OP_AMOUNT_2     (0x02)
+#define OP_AMOUNT_4     (0x04)
+#define OP_AMOUNT_8     (0x08)
+#define D2              (OP_ACTION_DATA  | OP_AMOUNT_2)
+#define D4              (OP_ACTION_DATA  | OP_AMOUNT_4)
+#define D8              (OP_ACTION_DATA  | OP_AMOUNT_8)
+#define I2              (OP_ACTION_INDEX | OP_AMOUNT_2)
+#define I4              (OP_ACTION_INDEX | OP_AMOUNT_4)
+#define I8              (OP_ACTION_INDEX | OP_AMOUNT_8)
+#define N0              (OP_ACTION_NOOP  | OP_AMOUNT_0)
+/* the max of the regular templates - not including the special templates */
+#define OPS_MAX         (0x1a)
+#endif
diff --git a/lib/842/842_compress.c b/lib/842/842_compress.c
new file mode 100644
index 000000000000..7ce68948e68c
--- /dev/null
+++ b/lib/842/842_compress.c
@@ -0,0 +1,626 @@
+/*
+ * 842 Software Compression
+ *
+ * Copyright (C) 2015 Dan Streetman, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * See 842.h for details of the 842 compressed format.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define MODULE_NAME "842_compress"
+#include <linux/hashtable.h>
+#include "842.h"
+#include "842_debugfs.h"
+#define SW842_HASHTABLE8_BITS   (10)
+#define SW842_HASHTABLE4_BITS   (11)
+#define SW842_HASHTABLE2_BITS   (10)
+/* By default, we allow compressing input buffers of any length, but we must
+ * use the non-standard "short data" template so the decompressor can correctly
+ * reproduce the uncompressed data buffer at the right length.  However the
+ * hardware 842 compressor will not recognize the "short data" template, and
+ * will fail to decompress any compressed buffer containing it (I have no idea
+ * why anyone would want to use software to compress and hardware to decompress
+ * but that's beside the point).  This parameter forces the compression
+ * function to simply reject any input buffer that isn't a multiple of 8 bytes
+ * long, instead of using the "short data" template, so that all compressed
+ * buffers produced by this function will be decompressable by the 842 hardware
+ * decompressor.  Unless you have a specific need for that, leave this disabled
+ * so that any length buffer can be compressed.
+ */
+static bool sw842_strict;
+module_param_named(strict, sw842_strict, bool, 0644);
+static u8 comp_ops[OPS_MAX][5] = { /* params size in bits */
+        { I8, N0, N0, N0, 0x19 }, /* 8 */
+        { I4, I4, N0, N0, 0x18 }, /* 18 */
+        { I4, I2, I2, N0, 0x17 }, /* 25 */
+        { I2, I2, I4, N0, 0x13 }, /* 25 */
+        { I2, I2, I2, I2, 0x12 }, /* 32 */
+        { I4, I2, D2, N0, 0x16 }, /* 33 */
+        { I4, D2, I2, N0, 0x15 }, /* 33 */
+        { I2, D2, I4, N0, 0x0e }, /* 33 */
+        { D2, I2, I4, N0, 0x09 }, /* 33 */
+        { I2, I2, I2, D2, 0x11 }, /* 40 */
+        { I2, I2, D2, I2, 0x10 }, /* 40 */
+        { I2, D2, I2, I2, 0x0d }, /* 40 */
+        { D2, I2, I2, I2, 0x08 }, /* 40 */
+        { I4, D4, N0, N0, 0x14 }, /* 41 */
+        { D4, I4, N0, N0, 0x04 }, /* 41 */
+        { I2, I2, D4, N0, 0x0f }, /* 48 */
+        { I2, D2, I2, D2, 0x0c }, /* 48 */
+        { I2, D4, I2, N0, 0x0b }, /* 48 */
+        { D2, I2, I2, D2, 0x07 }, /* 48 */
+        { D2, I2, D2, I2, 0x06 }, /* 48 */
+        { D4, I2, I2, N0, 0x03 }, /* 48 */
+        { I2, D2, D4, N0, 0x0a }, /* 56 */
+        { D2, I2, D4, N0, 0x05 }, /* 56 */
+        { D4, I2, D2, N0, 0x02 }, /* 56 */
+        { D4, D2, I2, N0, 0x01 }, /* 56 */
+        { D8, N0, N0, N0, 0x00 }, /* 64 */
+};
+struct sw842_hlist_node8 {
+        struct hlist_node node;
+        u64 data;
+        u8 index;
+};
+struct sw842_hlist_node4 {
+        struct hlist_node node;
+        u32 data;
+        u16 index;
+};
+struct sw842_hlist_node2 {
+        struct hlist_node node;
+        u16 data;
+        u8 index;
+};
+#define INDEX_NOT_FOUND         (-1)
+#define INDEX_NOT_CHECKED       (-2)
+struct sw842_param {
+        u8 *in;
+        u8 *instart;
+        u64 ilen;
+        u8 *out;
+        u64 olen;
+        u8 bit;
+        u64 data8[1];
+        u32 data4[2];
+        u16 data2[4];
+        int index8[1];
+        int index4[2];
+        int index2[4];
+        DECLARE_HASHTABLE(htable8, SW842_HASHTABLE8_BITS);
+        DECLARE_HASHTABLE(htable4, SW842_HASHTABLE4_BITS);
+        DECLARE_HASHTABLE(htable2, SW842_HASHTABLE2_BITS);
+        struct sw842_hlist_node8 node8[1 << I8_BITS];
+        struct sw842_hlist_node4 node4[1 << I4_BITS];
+        struct sw842_hlist_node2 node2[1 << I2_BITS];
+};
+#define get_input_data(p, o, b)                                         \
+        be##b##_to_cpu(get_unaligned((__be##b *)((p)->in + (o))))
+#define init_hashtable_nodes(p, b)      do {                    \
+        int _i;                                                 \
+        hash_init((p)->htable##b);                              \
+        for (_i = 0; _i < ARRAY_SIZE((p)->node##b); _i++) {     \
+                (p)->node##b[_i].index = _i;                    \
+                (p)->node##b[_i].data = 0;                      \
+                INIT_HLIST_NODE(&(p)->node##b[_i].node);        \
+        }                                                       \
+} while (0)
+#define find_index(p, b, n)     ({                                      \
+        struct sw842_hlist_node##b *_n;                                 \
+        p->index##b[n] = INDEX_NOT_FOUND;                               \
+        hash_for_each_possible(p->htable##b, _n, node, p->data##b[n]) { \
+                if (p->data##b[n] == _n->data) {                        \
+                        p->index##b[n] = _n->index;                     \
+                        break;                                          \
+                }                                                       \
+        }                                                               \
+        p->index##b[n] >= 0;                                            \
+})
+#define check_index(p, b, n)                    \
+        ((p)->index##b[n] == INDEX_NOT_CHECKED  \
+         ? find_index(p, b, n)                  \
+         : (p)->index##b[n] >= 0)
+#define replace_hash(p, b, i, d)        do {                            \
+        struct sw842_hlist_node##b *_n = &(p)->node##b[(i)+(d)];        \
+        hash_del(&_n->node);                                            \
+        _n->data = (p)->data##b[d];                                     \
+        pr_debug("add hash index%x %x pos %x data %lx\n", b,            \
+                 (unsigned int)_n->index,                               \
+                 (unsigned int)((p)->in - (p)->instart),                \
+                 (unsigned long)_n->data);                              \
+        hash_add((p)->htable##b, &_n->node, _n->data);                  \
+} while (0)
+static u8 bmask[8] = { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe };
+static int add_bits(struct sw842_param *p, u64 d, u8 n);
+static int __split_add_bits(struct sw842_param *p, u64 d, u8 n, u8 s)
+{
+        int ret;
+        if (n <= s)
+                return -EINVAL;
+        ret = add_bits(p, d >> s, n - s);
+        if (ret)
+                return ret;
+        return add_bits(p, d & GENMASK_ULL(s - 1, 0), s);
+}
+static int add_bits(struct sw842_param *p, u64 d, u8 n)
+{
+        int b = p->bit, bits = b + n, s = round_up(bits, 8) - bits;
+        u64 o;
+        u8 *out = p->out;
+        pr_debug("add %u bits %lx\n", (unsigned char)n, (unsigned long)d);
+        if (n > 64)
+                return -EINVAL;
+        /* split this up if writing to > 8 bytes (i.e. n == 64 && p->bit > 0),
+         * or if we're at the end of the output buffer and would write past end
+         */
+        if (bits > 64)
+                return __split_add_bits(p, d, n, 32);
+        else if (p->olen < 8 && bits > 32 && bits <= 56)
+                return __split_add_bits(p, d, n, 16);
+        else if (p->olen < 4 && bits > 16 && bits <= 24)
+                return __split_add_bits(p, d, n, 8);
+        if (DIV_ROUND_UP(bits, 8) > p->olen)
+                return -ENOSPC;
+        o = *out & bmask[b];
+        d <<= s;
+        if (bits <= 8)
+                *out = o | d;
+        else if (bits <= 16)
+                put_unaligned(cpu_to_be16(o << 8 | d), (__be16 *)out);
+        else if (bits <= 24)
+                put_unaligned(cpu_to_be32(o << 24 | d << 8), (__be32 *)out);
+        else if (bits <= 32)
+                put_unaligned(cpu_to_be32(o << 24 | d), (__be32 *)out);
+        else if (bits <= 40)
+                put_unaligned(cpu_to_be64(o << 56 | d << 24), (__be64 *)out);
+        else if (bits <= 48)
+                put_unaligned(cpu_to_be64(o << 56 | d << 16), (__be64 *)out);
+        else if (bits <= 56)
+                put_unaligned(cpu_to_be64(o << 56 | d << 8), (__be64 *)out);
+        else
+                put_unaligned(cpu_to_be64(o << 56 | d), (__be64 *)out);
+        p->bit += n;
+        if (p->bit > 7) {
+                p->out += p->bit / 8;
+                p->olen -= p->bit / 8;
+                p->bit %= 8;
+        }
+        return 0;
+}
+static int add_template(struct sw842_param *p, u8 c)
+{
+        int ret, i, b = 0;
+        u8 *t = comp_ops[c];
+        bool inv = false;
+        if (c >= OPS_MAX)
+                return -EINVAL;
+        pr_debug("template %x\n", t[4]);
+        ret = add_bits(p, t[4], OP_BITS);
+        if (ret)
+                return ret;
+        for (i = 0; i < 4; i++) {
+                pr_debug("op %x\n", t[i]);
+                switch (t[i] & OP_AMOUNT) {
+                case OP_AMOUNT_8:
+                        if (b)
+                                inv = true;
+                        else if (t[i] & OP_ACTION_INDEX)
+                                ret = add_bits(p, p->index8[0], I8_BITS);
+                        else if (t[i] & OP_ACTION_DATA)
+                                ret = add_bits(p, p->data8[0], 64);
+                        else
+                                inv = true;
+                        break;
+                case OP_AMOUNT_4:
+                        if (b == 2 && t[i] & OP_ACTION_DATA)
+                                ret = add_bits(p, get_input_data(p, 2, 32), 32);
+                        else if (b != 0 && b != 4)
+                                inv = true;
+                        else if (t[i] & OP_ACTION_INDEX)
+                                ret = add_bits(p, p->index4[b >> 2], I4_BITS);
+                        else if (t[i] & OP_ACTION_DATA)
+                                ret = add_bits(p, p->data4[b >> 2], 32);
+                        else
+                                inv = true;
+                        break;
+                case OP_AMOUNT_2:
+                        if (b != 0 && b != 2 && b != 4 && b != 6)
+                                inv = true;
+                        if (t[i] & OP_ACTION_INDEX)
+                                ret = add_bits(p, p->index2[b >> 1], I2_BITS);
+                        else if (t[i] & OP_ACTION_DATA)
+                                ret = add_bits(p, p->data2[b >> 1], 16);
+                        else
+                                inv = true;
+                        break;
+                case OP_AMOUNT_0:
+                        inv = (b != 8) || !(t[i] & OP_ACTION_NOOP);
+                        break;
+                default:
+                        inv = true;
+                        break;
+                }
+                if (ret)
+                        return ret;
+                if (inv) {
+                        pr_err("Invalid templ %x op %d : %x %x %x %x\n",
+                               c, i, t[0], t[1], t[2], t[3]);
+                        return -EINVAL;
+                }
+                b += t[i] & OP_AMOUNT;
+        }
+        if (b != 8) {
+                pr_err("Invalid template %x len %x : %x %x %x %x\n",
+                       c, b, t[0], t[1], t[2], t[3]);
+                return -EINVAL;
+        }
+        if (sw842_template_counts)
+                atomic_inc(&template_count[t[4]]);
+        return 0;
+}
+static int add_repeat_template(struct sw842_param *p, u8 r)
+{
+        int ret;
+        /* repeat param is 0-based */
+        if (!r || --r > REPEAT_BITS_MAX)
+                return -EINVAL;
+        ret = add_bits(p, OP_REPEAT, OP_BITS);
+        if (ret)
+                return ret;
+        ret = add_bits(p, r, REPEAT_BITS);
+        if (ret)
+                return ret;
+        if (sw842_template_counts)
+                atomic_inc(&template_repeat_count);
+        return 0;
+}
+static int add_short_data_template(struct sw842_param *p, u8 b)
+{
+        int ret, i;
+        if (!b || b > SHORT_DATA_BITS_MAX)
+                return -EINVAL;
+        ret = add_bits(p, OP_SHORT_DATA, OP_BITS);
+        if (ret)
+                return ret;
+        ret = add_bits(p, b, SHORT_DATA_BITS);
+        if (ret)
+                return ret;
+        for (i = 0; i < b; i++) {
+                ret = add_bits(p, p->in[i], 8);
+                if (ret)
+                        return ret;
+        }
+        if (sw842_template_counts)
+                atomic_inc(&template_short_data_count);
+        return 0;
+}
+static int add_zeros_template(struct sw842_param *p)
+{
+        int ret = add_bits(p, OP_ZEROS, OP_BITS);
+        if (ret)
+                return ret;
+        if (sw842_template_counts)
+                atomic_inc(&template_zeros_count);
+        return 0;
+}
+static int add_end_template(struct sw842_param *p)
+{
+        int ret = add_bits(p, OP_END, OP_BITS);
+        if (ret)
+                return ret;
+        if (sw842_template_counts)
+                atomic_inc(&template_end_count);
+        return 0;
+}
+static bool check_template(struct sw842_param *p, u8 c)
+{
+        u8 *t = comp_ops[c];
+        int i, match, b = 0;
+        if (c >= OPS_MAX)
+                return false;
+        for (i = 0; i < 4; i++) {
+                if (t[i] & OP_ACTION_INDEX) {
+                        if (t[i] & OP_AMOUNT_2)
+                                match = check_index(p, 2, b >> 1);
+                        else if (t[i] & OP_AMOUNT_4)
+                                match = check_index(p, 4, b >> 2);
+                        else if (t[i] & OP_AMOUNT_8)
+                                match = check_index(p, 8, 0);
+                        else
+                                return false;
+                        if (!match)
+                                return false;
+                }
+                b += t[i] & OP_AMOUNT;
+        }
+        return true;
+}
+static void get_next_data(struct sw842_param *p)
+{
+        p->data8[0] = get_input_data(p, 0, 64);
+        p->data4[0] = get_input_data(p, 0, 32);
+        p->data4[1] = get_input_data(p, 4, 32);
+        p->data2[0] = get_input_data(p, 0, 16);
+        p->data2[1] = get_input_data(p, 2, 16);
+        p->data2[2] = get_input_data(p, 4, 16);
+        p->data2[3] = get_input_data(p, 6, 16);
+}
+/* update the hashtable entries.
+ * only call this after finding/adding the current template
+ * the dataN fields for the current 8 byte block must be already updated
+ */
+static void update_hashtables(struct sw842_param *p)
+{
+        u64 pos = p->in - p->instart;
+        u64 n8 = (pos >> 3) % (1 << I8_BITS);
+        u64 n4 = (pos >> 2) % (1 << I4_BITS);
+        u64 n2 = (pos >> 1) % (1 << I2_BITS);
+        replace_hash(p, 8, n8, 0);
+        replace_hash(p, 4, n4, 0);
+        replace_hash(p, 4, n4, 1);
+        replace_hash(p, 2, n2, 0);
+        replace_hash(p, 2, n2, 1);
+        replace_hash(p, 2, n2, 2);
+        replace_hash(p, 2, n2, 3);
+}
+/* find the next template to use, and add it
+ * the p->dataN fields must already be set for the current 8 byte block
+ */
+static int process_next(struct sw842_param *p)
+{
+        int ret, i;
+        p->index8[0] = INDEX_NOT_CHECKED;
+        p->index4[0] = INDEX_NOT_CHECKED;
+        p->index4[1] = INDEX_NOT_CHECKED;
+        p->index2[0] = INDEX_NOT_CHECKED;
+        p->index2[1] = INDEX_NOT_CHECKED;
+        p->index2[2] = INDEX_NOT_CHECKED;
+        p->index2[3] = INDEX_NOT_CHECKED;
+        /* check up to OPS_MAX - 1; last op is our fallback */
+        for (i = 0; i < OPS_MAX - 1; i++) {
+                if (check_template(p, i))
+                        break;
+        }
+        ret = add_template(p, i);
+        if (ret)
+                return ret;
+        return 0;
+}
+/**
+ * sw842_compress
+ *
+ * Compress the uncompressed buffer of length @ilen at @in to the output buffer
+ * @out, using no more than @olen bytes, using the 842 compression format.
+ *
+ * Returns: 0 on success, error on failure.  The @olen parameter
+ * will contain the number of output bytes written on success, or
+ * 0 on error.
+ */
+int sw842_compress(const u8 *in, unsigned int ilen,
+                   u8 *out, unsigned int *olen, void *wmem)
+{
+        struct sw842_param *p = (struct sw842_param *)wmem;
+        int ret;
+        u64 last, next, pad, total;
+        u8 repeat_count = 0;
+        BUILD_BUG_ON(sizeof(*p) > SW842_MEM_COMPRESS);
+        init_hashtable_nodes(p, 8);
+        init_hashtable_nodes(p, 4);
+        init_hashtable_nodes(p, 2);
+        p->in = (u8 *)in;
+        p->instart = p->in;
+        p->ilen = ilen;
+        p->out = out;
+        p->olen = *olen;
+        p->bit = 0;
+        total = p->olen;
+        *olen = 0;
+        /* if using strict mode, we can only compress a multiple of 8 */
+        if (sw842_strict && (ilen % 8)) {
+                pr_err("Using strict mode, can't compress len %d\n", ilen);
+                return -EINVAL;
+        }
+        /* let's compress at least 8 bytes, mkay? */
+        if (unlikely(ilen < 8))
+                goto skip_comp;
+        /* make initial 'last' different so we don't match the first time */
+        last = ~get_unaligned((u64 *)p->in);
+        while (p->ilen > 7) {
+                next = get_unaligned((u64 *)p->in);
+                /* must get the next data, as we need to update the hashtable
+                 * entries with the new data every time
+                 */
+                get_next_data(p);
+                /* we don't care about endianness in last or next;
+                 * we're just comparing 8 bytes to another 8 bytes,
+                 * they're both the same endianness
+                 */
+                if (next == last) {
+                        /* repeat count bits are 0-based, so we stop at +1 */
+                        if (++repeat_count <= REPEAT_BITS_MAX)
+                                goto repeat;
+                }
+                if (repeat_count) {
+                        ret = add_repeat_template(p, repeat_count);
+                        repeat_count = 0;
+                        if (next == last) /* reached max repeat bits */
+                                goto repeat;
+                }
+                if (next == 0)
+                        ret = add_zeros_template(p);
+                else
+                        ret = process_next(p);
+                if (ret)
+                        return ret;
+repeat:
+                last = next;
+                update_hashtables(p);
+                p->in += 8;
+                p->ilen -= 8;
+        }
+        if (repeat_count) {
+                ret = add_repeat_template(p, repeat_count);
+                if (ret)
+                        return ret;
+        }
+skip_comp:
+        if (p->ilen > 0) {
+                ret = add_short_data_template(p, p->ilen);
+                if (ret)
+                        return ret;
+                p->in += p->ilen;
+                p->ilen = 0;
+        }
+        ret = add_end_template(p);
+        if (ret)
+                return ret;
+        if (p->bit) {
+                p->out++;
+                p->olen--;
+                p->bit = 0;
+        }
+        /* pad compressed length to multiple of 8 */
+        pad = (8 - ((total - p->olen) % 8)) % 8;
+        if (pad) {
+                if (pad > p->olen) /* we were so close! */
+                        return -ENOSPC;
+                memset(p->out, 0, pad);
+                p->out += pad;
+                p->olen -= pad;
+        }
+        if (unlikely((total - p->olen) > UINT_MAX))
+                return -ENOSPC;
+        *olen = total - p->olen;
+        return 0;
+}
+EXPORT_SYMBOL_GPL(sw842_compress);
+static int __init sw842_init(void)
+{
+        if (sw842_template_counts)
+                sw842_debugfs_create();
+        return 0;
+}
+module_init(sw842_init);
+static void __exit sw842_exit(void)
+{
+        if (sw842_template_counts)
+                sw842_debugfs_remove();
+}
+module_exit(sw842_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Software 842 Compressor");
+MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
diff --git a/lib/842/842_debugfs.h b/lib/842/842_debugfs.h
new file mode 100644
index 000000000000..e7f3bffaf255
--- /dev/null
+++ b/lib/842/842_debugfs.h
@@ -0,0 +1,52 @@
+#ifndef __842_DEBUGFS_H__
+#define __842_DEBUGFS_H__
+#include <linux/debugfs.h>
+static bool sw842_template_counts;
+module_param_named(template_counts, sw842_template_counts, bool, 0444);
+static atomic_t template_count[OPS_MAX], template_repeat_count,
+        template_zeros_count, template_short_data_count, template_end_count;
+static struct dentry *sw842_debugfs_root;
+static int __init sw842_debugfs_create(void)
+{
+        umode_t m = S_IRUGO | S_IWUSR;
+        int i;
+        if (!debugfs_initialized())
+                return -ENODEV;
+        sw842_debugfs_root = debugfs_create_dir(MODULE_NAME, NULL);
+        if (IS_ERR(sw842_debugfs_root))
+                return PTR_ERR(sw842_debugfs_root);
+        for (i = 0; i < ARRAY_SIZE(template_count); i++) {
+                char name[32];
+                snprintf(name, 32, "template_%02x", i);
+                debugfs_create_atomic_t(name, m, sw842_debugfs_root,
+                                        &template_count[i]);
+        }
+        debugfs_create_atomic_t("template_repeat", m, sw842_debugfs_root,
+                                &template_repeat_count);
+        debugfs_create_atomic_t("template_zeros", m, sw842_debugfs_root,
+                                &template_zeros_count);
+        debugfs_create_atomic_t("template_short_data", m, sw842_debugfs_root,
+                                &template_short_data_count);
+        debugfs_create_atomic_t("template_end", m, sw842_debugfs_root,
+                                &template_end_count);
+        return 0;
+}
+static void __exit sw842_debugfs_remove(void)
+{
+        if (sw842_debugfs_root && !IS_ERR(sw842_debugfs_root))
+                debugfs_remove_recursive(sw842_debugfs_root);
+}
+#endif
diff --git a/lib/842/842_decompress.c b/lib/842/842_decompress.c
new file mode 100644
index 000000000000..5446ff0c9ba0
--- /dev/null
+++ b/lib/842/842_decompress.c
@@ -0,0 +1,405 @@
+/*
+ * 842 Software Decompression
+ *
+ * Copyright (C) 2015 Dan Streetman, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * See 842.h for details of the 842 compressed format.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define MODULE_NAME "842_decompress"
+#include "842.h"
+#include "842_debugfs.h"
+/* rolling fifo sizes */
+#define I2_FIFO_SIZE    (2 * (1 << I2_BITS))
+#define I4_FIFO_SIZE    (4 * (1 << I4_BITS))
+#define I8_FIFO_SIZE    (8 * (1 << I8_BITS))
+static u8 decomp_ops[OPS_MAX][4] = {
+        { D8, N0, N0, N0 },
+        { D4, D2, I2, N0 },
+        { D4, I2, D2, N0 },
+        { D4, I2, I2, N0 },
+        { D4, I4, N0, N0 },
+        { D2, I2, D4, N0 },
+        { D2, I2, D2, I2 },
+        { D2, I2, I2, D2 },
+        { D2, I2, I2, I2 },
+        { D2, I2, I4, N0 },
+        { I2, D2, D4, N0 },
+        { I2, D4, I2, N0 },
+        { I2, D2, I2, D2 },
+        { I2, D2, I2, I2 },
+        { I2, D2, I4, N0 },
+        { I2, I2, D4, N0 },
+        { I2, I2, D2, I2 },
+        { I2, I2, I2, D2 },
+        { I2, I2, I2, I2 },
+        { I2, I2, I4, N0 },
+        { I4, D4, N0, N0 },
+        { I4, D2, I2, N0 },
+        { I4, I2, D2, N0 },
+        { I4, I2, I2, N0 },
+        { I4, I4, N0, N0 },
+        { I8, N0, N0, N0 }
+};
+struct sw842_param {
+        u8 *in;
+        u8 bit;
+        u64 ilen;
+        u8 *out;
+        u8 *ostart;
+        u64 olen;
+};
+#define beN_to_cpu(d, s)                                        \
+        ((s) == 2 ? be16_to_cpu(get_unaligned((__be16 *)d)) :   \
+         (s) == 4 ? be32_to_cpu(get_unaligned((__be32 *)d)) :   \
+         (s) == 8 ? be64_to_cpu(get_unaligned((__be64 *)d)) :   \
+         WARN(1, "pr_debug param err invalid size %x\n", s))
+static int next_bits(struct sw842_param *p, u64 *d, u8 n);
+static int __split_next_bits(struct sw842_param *p, u64 *d, u8 n, u8 s)
+{
+        u64 tmp = 0;
+        int ret;
+        if (n <= s) {
+                pr_debug("split_next_bits invalid n %u s %u\n", n, s);
+                return -EINVAL;
+        }
+        ret = next_bits(p, &tmp, n - s);
+        if (ret)
+                return ret;
+        ret = next_bits(p, d, s);
+        if (ret)
+                return ret;
+        *d |= tmp << s;
+        return 0;
+}
+static int next_bits(struct sw842_param *p, u64 *d, u8 n)
+{
+        u8 *in = p->in, b = p->bit, bits = b + n;
+        if (n > 64) {
+                pr_debug("next_bits invalid n %u\n", n);
+                return -EINVAL;
+        }
+        /* split this up if reading > 8 bytes, or if we're at the end of
+         * the input buffer and would read past the end
+         */
+        if (bits > 64)
+                return __split_next_bits(p, d, n, 32);
+        else if (p->ilen < 8 && bits > 32 && bits <= 56)
+                return __split_next_bits(p, d, n, 16);
+        else if (p->ilen < 4 && bits > 16 && bits <= 24)
+                return __split_next_bits(p, d, n, 8);
+        if (DIV_ROUND_UP(bits, 8) > p->ilen)
+                return -EOVERFLOW;
+        if (bits <= 8)
+                *d = *in >> (8 - bits);
+        else if (bits <= 16)
+                *d = be16_to_cpu(get_unaligned((__be16 *)in)) >> (16 - bits);
+        else if (bits <= 32)
+                *d = be32_to_cpu(get_unaligned((__be32 *)in)) >> (32 - bits);
+        else
+                *d = be64_to_cpu(get_unaligned((__be64 *)in)) >> (64 - bits);
+        *d &= GENMASK_ULL(n - 1, 0);
+        p->bit += n;
+        if (p->bit > 7) {
+                p->in += p->bit / 8;
+                p->ilen -= p->bit / 8;
+                p->bit %= 8;
+        }
+        return 0;
+}
+static int do_data(struct sw842_param *p, u8 n)
+{
+        u64 v;
+        int ret;
+        if (n > p->olen)
+                return -ENOSPC;
+        ret = next_bits(p, &v, n * 8);
+        if (ret)
+                return ret;
+        switch (n) {
+        case 2:
+                put_unaligned(cpu_to_be16((u16)v), (__be16 *)p->out);
+                break;
+        case 4:
+                put_unaligned(cpu_to_be32((u32)v), (__be32 *)p->out);
+                break;
+        case 8:
+                put_unaligned(cpu_to_be64((u64)v), (__be64 *)p->out);
+                break;
+        default:
+                return -EINVAL;
+        }
+        p->out += n;
+        p->olen -= n;
+        return 0;
+}
+static int __do_index(struct sw842_param *p, u8 size, u8 bits, u64 fsize)
+{
+        u64 index, offset, total = round_down(p->out - p->ostart, 8);
+        int ret;
+        ret = next_bits(p, &index, bits);
+        if (ret)
+                return ret;
+        offset = index * size;
+        /* a ring buffer of fsize is used; correct the offset */
+        if (total > fsize) {
+                /* this is where the current fifo is */
+                u64 section = round_down(total, fsize);
+                /* the current pos in the fifo */
+                u64 pos = total - section;
+                /* if the offset is past/at the pos, we need to
+                 * go back to the last fifo section
+                 */
+                if (offset >= pos)
+                        section -= fsize;
+                offset += section;
+        }
+        if (offset + size > total) {
+                pr_debug("index%x %lx points past end %lx\n", size,
+                         (unsigned long)offset, (unsigned long)total);
+                return -EINVAL;
+        }
+        pr_debug("index%x to %lx off %lx adjoff %lx tot %lx data %lx\n",
+                 size, (unsigned long)index, (unsigned long)(index * size),
+                 (unsigned long)offset, (unsigned long)total,
+                 (unsigned long)beN_to_cpu(&p->ostart[offset], size));
+        memcpy(p->out, &p->ostart[offset], size);
+        p->out += size;
+        p->olen -= size;
+        return 0;
+}
+static int do_index(struct sw842_param *p, u8 n)
+{
+        switch (n) {
+        case 2:
+                return __do_index(p, 2, I2_BITS, I2_FIFO_SIZE);
+        case 4:
+                return __do_index(p, 4, I4_BITS, I4_FIFO_SIZE);
+        case 8:
+                return __do_index(p, 8, I8_BITS, I8_FIFO_SIZE);
+        default:
+                return -EINVAL;
+        }
+}
+static int do_op(struct sw842_param *p, u8 o)
+{
+        int i, ret = 0;
+        if (o >= OPS_MAX)
+                return -EINVAL;
+        for (i = 0; i < 4; i++) {
+                u8 op = decomp_ops[o][i];
+                pr_debug("op is %x\n", op);
+                switch (op & OP_ACTION) {
+                case OP_ACTION_DATA:
+                        ret = do_data(p, op & OP_AMOUNT);
+                        break;
+                case OP_ACTION_INDEX:
+                        ret = do_index(p, op & OP_AMOUNT);
+                        break;
+                case OP_ACTION_NOOP:
+                        break;
+                default:
+                        pr_err("Interal error, invalid op %x\n", op);
+                        return -EINVAL;
+                }
+                if (ret)
+                        return ret;
+        }
+        if (sw842_template_counts)
+                atomic_inc(&template_count[o]);
+        return 0;
+}
+/**
+ * sw842_decompress
+ *
+ * Decompress the 842-compressed buffer of length @ilen at @in
+ * to the output buffer @out, using no more than @olen bytes.
+ *
+ * The compressed buffer must be only a single 842-compressed buffer,
+ * with the standard format described in the comments in 842.h
+ * Processing will stop when the 842 "END" template is detected,
+ * not the end of the buffer.
+ *
+ * Returns: 0 on success, error on failure.  The @olen parameter
+ * will contain the number of output bytes written on success, or
+ * 0 on error.
+ */
+int sw842_decompress(const u8 *in, unsigned int ilen,
+                     u8 *out, unsigned int *olen)
+{
+        struct sw842_param p;
+        int ret;
+        u64 op, rep, tmp, bytes, total;
+        p.in = (u8 *)in;
+        p.bit = 0;
+        p.ilen = ilen;
+        p.out = out;
+        p.ostart = out;
+        p.olen = *olen;
+        total = p.olen;
+        *olen = 0;
+        do {
+                ret = next_bits(&p, &op, OP_BITS);
+                if (ret)
+                        return ret;
+                pr_debug("template is %lx\n", (unsigned long)op);
+                switch (op) {
+                case OP_REPEAT:
+                        ret = next_bits(&p, &rep, REPEAT_BITS);
+                        if (ret)
+                                return ret;
+                        if (p.out == out) /* no previous bytes */
+                                return -EINVAL;
+                        /* copy rep + 1 */
+                        rep++;
+                        if (rep * 8 > p.olen)
+                                return -ENOSPC;
+                        while (rep-- > 0) {
+                                memcpy(p.out, p.out - 8, 8);
+                                p.out += 8;
+                                p.olen -= 8;
+                        }
+                        if (sw842_template_counts)
+                                atomic_inc(&template_repeat_count);
+                        break;
+                case OP_ZEROS:
+                        if (8 > p.olen)
+                                return -ENOSPC;
+                        memset(p.out, 0, 8);
+                        p.out += 8;
+                        p.olen -= 8;
+                        if (sw842_template_counts)
+                                atomic_inc(&template_zeros_count);
+                        break;
+                case OP_SHORT_DATA:
+                        ret = next_bits(&p, &bytes, SHORT_DATA_BITS);
+                        if (ret)
+                                return ret;
+                        if (!bytes || bytes > SHORT_DATA_BITS_MAX)
+                                return -EINVAL;
+                        while (bytes-- > 0) {
+                                ret = next_bits(&p, &tmp, 8);
+                                if (ret)
+                                        return ret;
+                                *p.out = (u8)tmp;
+                                p.out++;
+                                p.olen--;
+                        }
+                        if (sw842_template_counts)
+                                atomic_inc(&template_short_data_count);
+                        break;
+                case OP_END:
+                        if (sw842_template_counts)
+                                atomic_inc(&template_end_count);
+                        break;
+                default: /* use template */
+                        ret = do_op(&p, op);
+                        if (ret)
+                                return ret;
+                        break;
+                }
+        } while (op != OP_END);
+        if (unlikely((total - p.olen) > UINT_MAX))
+                return -ENOSPC;
+        *olen = total - p.olen;
+        return 0;
+}
+EXPORT_SYMBOL_GPL(sw842_decompress);
+static int __init sw842_init(void)
+{
+        if (sw842_template_counts)
+                sw842_debugfs_create();
+        return 0;
+}
+module_init(sw842_init);
+static void __exit sw842_exit(void)
+{
+        if (sw842_template_counts)
+                sw842_debugfs_remove();
+}
+module_exit(sw842_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Software 842 Decompressor");
+MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
diff --git a/lib/842/Makefile b/lib/842/Makefile
new file mode 100644
index 000000000000..5d24c0baff2e
--- /dev/null
+++ b/lib/842/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_842_COMPRESS) += 842_compress.o
+obj-$(CONFIG_842_DECOMPRESS) += 842_decompress.o
diff --git a/lib/Kconfig b/lib/Kconfig
index 601965a948e8..34e332b8d326 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -212,6 +212,12 @@ config RANDOM32_SELFTEST
 #
 # compression support is select'ed if needed
 #
+config 842_COMPRESS
+        tristate
+config 842_DECOMPRESS
+        tristate
 config ZLIB_INFLATE
        tristate
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ba2b0c87e65b..b908048f8d6a 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1233,6 +1233,7 @@ config RCU_TORTURE_TEST
        depends on DEBUG_KERNEL
        select TORTURE_TEST
        select SRCU
+        select TASKS_RCU
        default n
        help
          This option provides a kernel module that runs torture tests
@@ -1261,12 +1262,38 @@ config RCU_TORTURE_TEST_RUNNABLE
          Say N here if you want the RCU torture tests to start only
          after being manually enabled via /proc.
+config RCU_TORTURE_TEST_SLOW_PREINIT
+        bool "Slow down RCU grace-period pre-initialization to expose races"
+        depends on RCU_TORTURE_TEST
+        help
+          This option delays grace-period pre-initialization (the
+          propagation of CPU-hotplug changes up the rcu_node combining
+          tree) for a few jiffies between initializing each pair of
+          consecutive rcu_node structures.  This helps to expose races
+          involving grace-period pre-initialization, in other words, it
+          makes your kernel less stable.  It can also greatly increase
+          grace-period latency, especially on systems with large numbers
+          of CPUs.  This is useful when torture-testing RCU, but in
+          almost no other circumstance.
+          Say Y here if you want your system to crash and hang more often.
+          Say N if you want a sane system.
+config RCU_TORTURE_TEST_SLOW_PREINIT_DELAY
+        int "How much to slow down RCU grace-period pre-initialization"
+        range 0 5
+        default 3
+        depends on RCU_TORTURE_TEST_SLOW_PREINIT
+        help
+          This option specifies the number of jiffies to wait between
+          each rcu_node structure pre-initialization step.
 config RCU_TORTURE_TEST_SLOW_INIT
        bool "Slow down RCU grace-period initialization to expose races"
        depends on RCU_TORTURE_TEST
        help
-          This option makes grace-period initialization block for a
+          This option delays grace-period initialization for a few
-          few jiffies between initializing each pair of consecutive
+          jiffies between initializing each pair of consecutive
          rcu_node structures.  This helps to expose races involving
          grace-period initialization, in other words, it makes your
          kernel less stable.  It can also greatly increase grace-period
@@ -1286,6 +1313,30 @@ config RCU_TORTURE_TEST_SLOW_INIT_DELAY
          This option specifies the number of jiffies to wait between
          each rcu_node structure initialization.
+config RCU_TORTURE_TEST_SLOW_CLEANUP
+        bool "Slow down RCU grace-period cleanup to expose races"
+        depends on RCU_TORTURE_TEST
+        help
+          This option delays grace-period cleanup for a few jiffies
+          between cleaning up each pair of consecutive rcu_node
+          structures.  This helps to expose races involving grace-period
+          cleanup, in other words, it makes your kernel less stable.
+          It can also greatly increase grace-period latency, especially
+          on systems with large numbers of CPUs.  This is useful when
+          torture-testing RCU, but in almost no other circumstance.
+          Say Y here if you want your system to crash and hang more often.
+          Say N if you want a sane system.
+config RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY
+        int "How much to slow down RCU grace-period cleanup"
+        range 0 5
+        default 3
+        depends on RCU_TORTURE_TEST_SLOW_CLEANUP
+        help
+          This option specifies the number of jiffies to wait between
+          each rcu_node structure cleanup operation.
 config RCU_CPU_STALL_TIMEOUT
        int "RCU CPU stall timeout in seconds"
        depends on RCU_STALL_COMMON
@@ -1322,6 +1373,17 @@ config RCU_TRACE
          Say Y here if you want to enable RCU tracing
          Say N if you are unsure.
+config RCU_EQS_DEBUG
+        bool "Use this when adding any sort of NO_HZ support to your arch"
+        depends on DEBUG_KERNEL
+        help
+          This option provides consistency checks in RCU's handling of
+          NO_HZ.  These checks have proven quite helpful in detecting
+          bugs in arch-specific NO_HZ code.
+          Say N here if you need ultimate kernel/user switch latencies
+          Say Y if you are unsure
 endmenu # "RCU Debugging"
 config DEBUG_BLOCK_EXT_DEVT
diff --git a/lib/Makefile b/lib/Makefile
index 6c37933336a0..ff37c8c2f7b2 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -78,6 +78,8 @@ obj-$(CONFIG_LIBCRC32C)	+= libcrc32c.o
 obj-$(CONFIG_CRC8)      += crc8.o
 obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
+obj-$(CONFIG_842_COMPRESS) += 842/
+obj-$(CONFIG_842_DECOMPRESS) += 842/
 obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
 obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/
 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c
index 4f134d8907a7..f610b2a10b3e 100644
--- a/lib/cpu_rmap.c
+++ b/lib/cpu_rmap.c
@@ -191,7 +191,7 @@ int cpu_rmap_update(struct cpu_rmap *rmap, u16 index,
        /* Update distances based on topology */
        for_each_cpu(cpu, update_mask) {
                if (cpu_rmap_copy_neigh(rmap, cpu,
-                                        topology_thread_cpumask(cpu), 1))
+                                        topology_sibling_cpumask(cpu), 1))
                        continue;
                if (cpu_rmap_copy_neigh(rmap, cpu,
                                        topology_core_cpumask(cpu), 2))
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 5f627084f2e9..5a70f6196f57 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -16,11 +16,10 @@
 int cpumask_next_and(int n, const struct cpumask *src1p,
                     const struct cpumask *src2p)
 {
-        struct cpumask tmp;
+        while ((n = cpumask_next(n, src1p)) < nr_cpu_ids)
+                if (cpumask_test_cpu(n, src2p))
-        if (cpumask_and(&tmp, src1p, src2p))
+                        break;
-                return cpumask_next(n, &tmp);
+        return n;
-        return nr_cpu_ids;
 }
 EXPORT_SYMBOL(cpumask_next_and);
diff --git a/lib/mpi/longlong.h b/lib/mpi/longlong.h
index aac511417ad1..a89d041592c8 100644
--- a/lib/mpi/longlong.h
+++ b/lib/mpi/longlong.h
@@ -639,7 +639,7 @@ do { \
        **************  MIPS  *****************
        ***************************************/
 #if defined(__mips__) && W_TYPE_SIZE == 32
-#if __GNUC__ >= 4 && __GNUC_MINOR__ >= 4
+#if (__GNUC__ >= 5) || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 4)
 #define umul_ppmm(w1, w0, u, v)                 \
 do {                                            \
        UDItype __ll = (UDItype)(u) * (v);      \
@@ -671,7 +671,7 @@ do {						\
        **************  MIPS/64  **************
        ***************************************/
 #if (defined(__mips) && __mips >= 3) && W_TYPE_SIZE == 64
-#if __GNUC__ >= 4 && __GNUC_MINOR__ >= 4
+#if (__GNUC__ >= 5) || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 4)
 #define umul_ppmm(w1, w0, u, v) \
 do {                                                                    \
        typedef unsigned int __ll_UTItype __attribute__((mode(TI)));    \
diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c
index 4cc6442733f4..bc0a1da8afba 100644
--- a/lib/mpi/mpicoder.c
+++ b/lib/mpi/mpicoder.c
@@ -128,28 +128,36 @@ leave:
 }
 EXPORT_SYMBOL_GPL(mpi_read_from_buffer);
-/****************
+/**
- * Return an allocated buffer with the MPI (msb first).
+ * mpi_read_buffer() - read MPI to a bufer provided by user (msb first)
- * NBYTES receives the length of this buffer. Caller must free the
+ *
- * return string (This function does return a 0 byte buffer with NBYTES
+ * @a:          a multi precision integer
- * set to zero if the value of A is zero. If sign is not NULL, it will
+ * @buf:        bufer to which the output will be written to. Needs to be at
- * be set to the sign of the A.
+ *              leaset mpi_get_size(a) long.
+ * @buf_len:    size of the buf.
+ * @nbytes:     receives the actual length of the data written.
+ * @sign:       if not NULL, it will be set to the sign of a.
+ *
+ * Return:      0 on success or error code in case of error
 */
-void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign)
+int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes,
+                    int *sign)
 {
-        uint8_t *p, *buffer;
+        uint8_t *p;
        mpi_limb_t alimb;
+        unsigned int n = mpi_get_size(a);
        int i;
-        unsigned int n;
+        if (buf_len < n || !buf)
+                return -EINVAL;
        if (sign)
                *sign = a->sign;
-        *nbytes = n = a->nlimbs * BYTES_PER_MPI_LIMB;
-        if (!n)
+        if (nbytes)
-                n++;            /* avoid zero length allocation */
+                *nbytes = n;
-        p = buffer = kmalloc(n, GFP_KERNEL);
-        if (!p)
+        p = buf;
-                return NULL;
        for (i = a->nlimbs - 1; i >= 0; i--) {
                alimb = a->d[i];
@@ -171,15 +179,56 @@ void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign)
 #error please implement for this limb size.
 #endif
        }
+        return 0;
+}
+EXPORT_SYMBOL_GPL(mpi_read_buffer);
+/*
+ * mpi_get_buffer() - Returns an allocated buffer with the MPI (msb first).
+ * Caller must free the return string.
+ * This function does return a 0 byte buffer with nbytes set to zero if the
+ * value of A is zero.
+ *
+ * @a:          a multi precision integer.
+ * @nbytes:     receives the length of this buffer.
+ * @sign:       if not NULL, it will be set to the sign of the a.
+ *
+ * Return:      Pointer to MPI buffer or NULL on error
+ */
+void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign)
+{
+        uint8_t *buf, *p;
+        unsigned int n;
+        int ret;
+        if (!nbytes)
+                return NULL;
+        n = mpi_get_size(a);
+        if (!n)
+                n++;
+        buf = kmalloc(n, GFP_KERNEL);
+        if (!buf)
+                return NULL;
+        ret = mpi_read_buffer(a, buf, n, nbytes, sign);
+        if (ret) {
+                kfree(buf);
+                return NULL;
+        }
        /* this is sub-optimal but we need to do the shift operation
         * because the caller has to free the returned buffer */
-        for (p = buffer; !*p && *nbytes; p++, --*nbytes)
+        for (p = buf; !*p && *nbytes; p++, --*nbytes)
                ;
-        if (p != buffer)
+        if (p != buf)
-                memmove(buffer, p, *nbytes);
+                memmove(buf, p, *nbytes);
-        return buffer;
+        return buf;
 }
 EXPORT_SYMBOL_GPL(mpi_get_buffer);
diff --git a/lib/mpi/mpiutil.c b/lib/mpi/mpiutil.c
index bf076d281d40..314f4dfa603e 100644
--- a/lib/mpi/mpiutil.c
+++ b/lib/mpi/mpiutil.c
@@ -69,7 +69,7 @@ void mpi_free_limb_space(mpi_ptr_t a)
        if (!a)
                return;
-        kfree(a);
+        kzfree(a);
 }
 void mpi_assign_limb_space(MPI a, mpi_ptr_t ap, unsigned nlimbs)
@@ -95,7 +95,7 @@ int mpi_resize(MPI a, unsigned nlimbs)
                if (!p)
                        return -ENOMEM;
                memcpy(p, a->d, a->alloced * sizeof(mpi_limb_t));
-                kfree(a->d);
+                kzfree(a->d);
                a->d = p;
        } else {
                a->d = kzalloc(nlimbs * sizeof(mpi_limb_t), GFP_KERNEL);
@@ -112,7 +112,7 @@ void mpi_free(MPI a)
                return;
        if (a->flags & 4)
-                kfree(a->d);
+                kzfree(a->d);
        else
                mpi_free_limb_space(a->d);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 3d2aa27b845b..061550de77bc 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -33,7 +33,7 @@
 #include <linux/string.h>
 #include <linux/bitops.h>
 #include <linux/rcupdate.h>
-#include <linux/preempt_mask.h>         /* in_interrupt() */
+#include <linux/preempt.h>              /* in_interrupt() */
 /*
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h
index b7595484a815..8fe9d9662abb 100644
--- a/lib/raid6/x86.h
+++ b/lib/raid6/x86.h
@@ -23,7 +23,7 @@
 #ifdef __KERNEL__ /* Real code */
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 #else /* Dummy code for user space testing */
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 4396434e4715..8609378e6505 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -26,6 +26,7 @@
 #include <linux/random.h>
 #include <linux/rhashtable.h>
 #include <linux/err.h>
+#include <linux/export.h>
 #define HASH_DEFAULT_SIZE       64UL
 #define HASH_MIN_SIZE           4U
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index c9f2e8c6ccc9..99fbc2f238c4 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -56,6 +56,38 @@ int sg_nents(struct scatterlist *sg)
 }
 EXPORT_SYMBOL(sg_nents);
+/**
+ * sg_nents_for_len - return total count of entries in scatterlist
+ *                    needed to satisfy the supplied length
+ * @sg:         The scatterlist
+ * @len:        The total required length
+ *
+ * Description:
+ * Determines the number of entries in sg that are required to meet
+ * the supplied length, taking into acount chaining as well
+ *
+ * Returns:
+ *   the number of sg entries needed, negative error on failure
+ *
+ **/
+int sg_nents_for_len(struct scatterlist *sg, u64 len)
+{
+        int nents;
+        u64 total;
+        if (!len)
+                return 0;
+        for (nents = 0, total = 0; sg; sg = sg_next(sg)) {
+                nents++;
+                total += sg->length;
+                if (total >= len)
+                        return nents;
+        }
+        return -EINVAL;
+}
+EXPORT_SYMBOL(sg_nents_for_len);
 /**
 * sg_last - return the last scatterlist entry in a list
diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c
index a28df5206d95..3a5f2b366d84 100644
--- a/lib/strnlen_user.c
+++ b/lib/strnlen_user.c
@@ -57,7 +57,8 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
                        return res + find_zero(data) + 1 - align;
                }
                res += sizeof(unsigned long);
-                if (unlikely(max < sizeof(unsigned long)))
+                /* We already handled 'unsigned long' bytes. Did we do it all ? */
+                if (unlikely(max <= sizeof(unsigned long)))
                        break;
                max -= sizeof(unsigned long);
                if (unlikely(__get_user(c,(unsigned long __user *)(src+res))))
@@ -84,13 +85,21 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
 * @str: The string to measure.
 * @count: Maximum count (including NUL character)
 *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
 *
 * Get the size of a NUL-terminated string in user space.
 *
 * Returns the size of the string INCLUDING the terminating NUL.
- * If the string is too long, returns 'count+1'.
+ * If the string is too long, returns a number larger than @count. User
+ * has to check the return value against "> count".
 * On exception (or invalid count), returns 0.
+ *
+ * NOTE! You should basically never use this function. There is
+ * almost never any valid case for using the length of a user space
+ * string, since the string can be changed at any time by other
+ * threads. Use "strncpy_from_user()" instead to get a stable copy
+ * of the string.
 */
 long strnlen_user(const char __user *str, long count)
 {
@@ -113,7 +122,8 @@ EXPORT_SYMBOL(strnlen_user);
 * strlen_user: - Get the size of a user string INCLUDING final NUL.
 * @str: The string to measure.
 *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
 *
 * Get the size of a NUL-terminated string in user space.
 *
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 4abda074ea45..42e192decbfd 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -537,8 +537,9 @@ EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single);
 * Allocates bounce buffer and returns its kernel virtual address.
 */
-phys_addr_t map_single(struct device *hwdev, phys_addr_t phys, size_t size,
+static phys_addr_t
-                       enum dma_data_direction dir)
+map_single(struct device *hwdev, phys_addr_t phys, size_t size,
+           enum dma_data_direction dir)
 {
        dma_addr_t start_dma_addr = phys_to_dma(hwdev, io_tlb_start);
@@ -655,7 +656,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
                 */
                phys_addr_t paddr = map_single(hwdev, 0, size, DMA_FROM_DEVICE);
                if (paddr == SWIOTLB_MAP_ERROR)
-                        return NULL;
+                        goto err_warn;
                ret = phys_to_virt(paddr);
                dev_addr = phys_to_dma(hwdev, paddr);
@@ -669,7 +670,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
                        /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
                        swiotlb_tbl_unmap_single(hwdev, paddr,
                                                 size, DMA_TO_DEVICE);
-                        return NULL;
+                        goto err_warn;
                }
        }
@@ -677,6 +678,13 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
        memset(ret, 0, size);
        return ret;
+err_warn:
+        pr_warn("swiotlb: coherent allocation failed for device %s size=%zu\n",
+                dev_name(hwdev), size);
+        dump_stack();
+        return NULL;
 }
 EXPORT_SYMBOL(swiotlb_alloc_coherent);
diff --git a/lib/timerqueue.c b/lib/timerqueue.c
index a382e4a32609..782ae8ca2c06 100644
--- a/lib/timerqueue.c
+++ b/lib/timerqueue.c
@@ -36,7 +36,7 @@
 * Adds the timer node to the timerqueue, sorted by the
 * node's expires value.
 */
-void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
+bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
 {
        struct rb_node **p = &head->head.rb_node;
        struct rb_node *parent = NULL;
@@ -56,8 +56,11 @@ void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
        rb_link_node(&node->node, parent, p);
        rb_insert_color(&node->node, &head->head);
-        if (!head->next || node->expires.tv64 < head->next->expires.tv64)
+        if (!head->next || node->expires.tv64 < head->next->expires.tv64) {
                head->next = node;
+                return true;
+        }
+        return false;
 }
 EXPORT_SYMBOL_GPL(timerqueue_add);
@@ -69,7 +72,7 @@ EXPORT_SYMBOL_GPL(timerqueue_add);
 *
 * Removes the timer node from the timerqueue.
 */
-void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
+bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
 {
        WARN_ON_ONCE(RB_EMPTY_NODE(&node->node));
@@ -82,6 +85,7 @@ void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
        }
        rb_erase(&node->node, &head->head);
        RB_CLEAR_NODE(&node->node);
+        return head->next != NULL;
 }
 EXPORT_SYMBOL_GPL(timerqueue_del);