Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 2
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 2
-rw-r--r--  arch/x86/crypto/aegis128-aesni-glue.c | 38
-rw-r--r--  arch/x86/crypto/aegis128l-aesni-glue.c | 38
-rw-r--r--  arch/x86/crypto/aegis256-aesni-glue.c | 38
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c | 47
-rw-r--r--  arch/x86/crypto/crct10dif-pcl-asm_64.S | 782
-rw-r--r--  arch/x86/crypto/crct10dif-pclmul_glue.c | 12
-rw-r--r--  arch/x86/crypto/morus1280_glue.c | 40
-rw-r--r--  arch/x86/crypto/morus640_glue.c | 39
-rw-r--r--  arch/x86/crypto/poly1305-sse2-x86_64.S | 4
-rw-r--r--  arch/x86/events/core.c | 14
-rw-r--r--  arch/x86/events/intel/core.c | 25
-rw-r--r--  arch/x86/events/intel/uncore_snbep.c | 4
-rw-r--r--  arch/x86/events/perf_event.h | 16
-rw-r--r--  arch/x86/ia32/ia32_aout.c | 157
-rw-r--r--  arch/x86/include/asm/a.out-core.h | 67
-rw-r--r--  arch/x86/include/asm/hyperv-tlfs.h | 2
-rw-r--r--  arch/x86/include/asm/intel-family.h | 2
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable.h | 2
-rw-r--r--  arch/x86/include/asm/uaccess.h | 7
-rw-r--r--  arch/x86/include/asm/uv/bios.h | 8
-rw-r--r--  arch/x86/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/x86/include/uapi/asm/socket.h | 1
-rw-r--r--  arch/x86/kernel/cpu/mce/core.c | 1
-rw-r--r--  arch/x86/kvm/cpuid.c | 4
-rw-r--r--  arch/x86/kvm/mmu.c | 18
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 12
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 29
-rw-r--r--  arch/x86/kvm/vmx/vmx.h | 10
-rw-r--r--  arch/x86/kvm/x86.c | 2
-rw-r--r--  arch/x86/mm/extable.c | 59
-rw-r--r--  arch/x86/mm/pageattr.c | 50
-rw-r--r--  arch/x86/net/bpf_jit_comp.c | 46
-rw-r--r--  arch/x86/net/bpf_jit_comp32.c | 121
-rw-r--r--  arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c | 1
-rw-r--r--  arch/x86/platform/uv/bios_uv.c | 23
-rw-r--r--  arch/x86/um/Kconfig | 1
39 files changed, 627 insertions, 1100 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 68261430fe6e..ade12ec4224b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -14,7 +14,6 @@ config X86_32
14 select ARCH_WANT_IPC_PARSE_VERSION 14 select ARCH_WANT_IPC_PARSE_VERSION
15 select CLKSRC_I8253 15 select CLKSRC_I8253
16 select CLONE_BACKWARDS 16 select CLONE_BACKWARDS
17 select HAVE_AOUT
18 select HAVE_GENERIC_DMA_COHERENT 17 select HAVE_GENERIC_DMA_COHERENT
19 select MODULES_USE_ELF_REL 18 select MODULES_USE_ELF_REL
20 select OLD_SIGACTION 19 select OLD_SIGACTION
@@ -2843,6 +2842,7 @@ config IA32_EMULATION
2843config IA32_AOUT 2842config IA32_AOUT
2844 tristate "IA32 a.out support" 2843 tristate "IA32 a.out support"
2845 depends on IA32_EMULATION 2844 depends on IA32_EMULATION
2845 depends on BROKEN
2846 ---help--- 2846 ---help---
2847 Support old a.out binaries in the 32bit emulation. 2847 Support old a.out binaries in the 32bit emulation.
2848 2848
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index f105ae8651c9..f62e347862cc 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -602,10 +602,12 @@ ENTRY(trampoline_32bit_src)
6023: 6023:
603 /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */ 603 /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */
604 pushl %ecx 604 pushl %ecx
605 pushl %edx
605 movl $MSR_EFER, %ecx 606 movl $MSR_EFER, %ecx
606 rdmsr 607 rdmsr
607 btsl $_EFER_LME, %eax 608 btsl $_EFER_LME, %eax
608 wrmsr 609 wrmsr
610 popl %edx
609 popl %ecx 611 popl %ecx
610 612
611 /* Enable PAE and LA57 (if required) paging modes */ 613 /* Enable PAE and LA57 (if required) paging modes */
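The head_64.S hunk above saves and restores %edx around the EFER update: RDMSR and WRMSR operate on the EDX:EAX register pair, so %edx is clobbered even though only the low dword in %eax is modified here. A minimal C illustration of that register contract follows; it is a hedged sketch, not part of the patch (the helper name is made up, and actually executing RDMSR requires ring 0).

#include <stdint.h>

/* Sketch only: RDMSR returns bits 63:32 of the MSR in EDX and bits 31:0 in
 * EAX, which is why the patch wraps the rdmsr/wrmsr sequence in
 * pushl/popl %edx to keep the caller's %edx intact. */
static inline uint64_t rdmsr_sketch(uint32_t msr)
{
        uint32_t lo, hi;

        asm volatile("rdmsr" : "=a" (lo), "=d" (hi) : "c" (msr));
        return ((uint64_t)hi << 32) | lo;
}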
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 2a356b948720..3ea71b871813 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -119,31 +119,20 @@ static void crypto_aegis128_aesni_process_ad(
119} 119}
120 120
121static void crypto_aegis128_aesni_process_crypt( 121static void crypto_aegis128_aesni_process_crypt(
122 struct aegis_state *state, struct aead_request *req, 122 struct aegis_state *state, struct skcipher_walk *walk,
123 const struct aegis_crypt_ops *ops) 123 const struct aegis_crypt_ops *ops)
124{ 124{
125 struct skcipher_walk walk; 125 while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
126 u8 *src, *dst; 126 ops->crypt_blocks(state,
127 unsigned int chunksize, base; 127 round_down(walk->nbytes, AEGIS128_BLOCK_SIZE),
128 128 walk->src.virt.addr, walk->dst.virt.addr);
129 ops->skcipher_walk_init(&walk, req, false); 129 skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE);
130 130 }
131 while (walk.nbytes) {
132 src = walk.src.virt.addr;
133 dst = walk.dst.virt.addr;
134 chunksize = walk.nbytes;
135
136 ops->crypt_blocks(state, chunksize, src, dst);
137
138 base = chunksize & ~(AEGIS128_BLOCK_SIZE - 1);
139 src += base;
140 dst += base;
141 chunksize &= AEGIS128_BLOCK_SIZE - 1;
142
143 if (chunksize > 0)
144 ops->crypt_tail(state, chunksize, src, dst);
145 131
146 skcipher_walk_done(&walk, 0); 132 if (walk->nbytes) {
133 ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
134 walk->dst.virt.addr);
135 skcipher_walk_done(walk, 0);
147 } 136 }
148} 137}
149 138
@@ -186,13 +175,16 @@ static void crypto_aegis128_aesni_crypt(struct aead_request *req,
186{ 175{
187 struct crypto_aead *tfm = crypto_aead_reqtfm(req); 176 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
188 struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm); 177 struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm);
178 struct skcipher_walk walk;
189 struct aegis_state state; 179 struct aegis_state state;
190 180
181 ops->skcipher_walk_init(&walk, req, true);
182
191 kernel_fpu_begin(); 183 kernel_fpu_begin();
192 184
193 crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv); 185 crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv);
194 crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen); 186 crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
195 crypto_aegis128_aesni_process_crypt(&state, req, ops); 187 crypto_aegis128_aesni_process_crypt(&state, &walk, ops);
196 crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); 188 crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
197 189
198 kernel_fpu_end(); 190 kernel_fpu_end();
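The same restructuring is applied to all three AEGIS glue files (and, further below, to the two MORUS glue files): the skcipher walk is now set up by the caller before kernel_fpu_begin(), presumably with atomic=true because skcipher_walk_done() is afterwards invoked while preemption is disabled for the FPU section, and each walk step processes only its block-aligned prefix, handing the remainder back to the walk. A consolidated, hedged sketch of that loop is below; it mirrors the hunk above, relies on the kernel crypto types plus the file-local aegis definitions, and therefore only builds inside that source file's context.

#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>

/* Hedged consolidation of the new per-walk loop (struct aegis_state,
 * struct aegis_crypt_ops and AEGIS128_BLOCK_SIZE are the file-local
 * definitions from aegis128-aesni-glue.c). */
static void aegis_walk_sketch(struct aegis_state *state,
                              struct skcipher_walk *walk,
                              const struct aegis_crypt_ops *ops)
{
        while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
                unsigned int blocks = round_down(walk->nbytes,
                                                 AEGIS128_BLOCK_SIZE);

                /* Crypt the block-aligned prefix of this walk step... */
                ops->crypt_blocks(state, blocks, walk->src.virt.addr,
                                  walk->dst.virt.addr);
                /* ...and report the unprocessed remainder to the walk. */
                skcipher_walk_done(walk, walk->nbytes - blocks);
        }

        if (walk->nbytes) {
                /* Final partial block, if any. */
                ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
                                walk->dst.virt.addr);
                skcipher_walk_done(walk, 0);
        }
}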
diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c
index dbe8bb980da1..1b1b39c66c5e 100644
--- a/arch/x86/crypto/aegis128l-aesni-glue.c
+++ b/arch/x86/crypto/aegis128l-aesni-glue.c
@@ -119,31 +119,20 @@ static void crypto_aegis128l_aesni_process_ad(
119} 119}
120 120
121static void crypto_aegis128l_aesni_process_crypt( 121static void crypto_aegis128l_aesni_process_crypt(
122 struct aegis_state *state, struct aead_request *req, 122 struct aegis_state *state, struct skcipher_walk *walk,
123 const struct aegis_crypt_ops *ops) 123 const struct aegis_crypt_ops *ops)
124{ 124{
125 struct skcipher_walk walk; 125 while (walk->nbytes >= AEGIS128L_BLOCK_SIZE) {
126 u8 *src, *dst; 126 ops->crypt_blocks(state, round_down(walk->nbytes,
127 unsigned int chunksize, base; 127 AEGIS128L_BLOCK_SIZE),
128 128 walk->src.virt.addr, walk->dst.virt.addr);
129 ops->skcipher_walk_init(&walk, req, false); 129 skcipher_walk_done(walk, walk->nbytes % AEGIS128L_BLOCK_SIZE);
130 130 }
131 while (walk.nbytes) {
132 src = walk.src.virt.addr;
133 dst = walk.dst.virt.addr;
134 chunksize = walk.nbytes;
135
136 ops->crypt_blocks(state, chunksize, src, dst);
137
138 base = chunksize & ~(AEGIS128L_BLOCK_SIZE - 1);
139 src += base;
140 dst += base;
141 chunksize &= AEGIS128L_BLOCK_SIZE - 1;
142
143 if (chunksize > 0)
144 ops->crypt_tail(state, chunksize, src, dst);
145 131
146 skcipher_walk_done(&walk, 0); 132 if (walk->nbytes) {
133 ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
134 walk->dst.virt.addr);
135 skcipher_walk_done(walk, 0);
147 } 136 }
148} 137}
149 138
@@ -186,13 +175,16 @@ static void crypto_aegis128l_aesni_crypt(struct aead_request *req,
186{ 175{
187 struct crypto_aead *tfm = crypto_aead_reqtfm(req); 176 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
188 struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(tfm); 177 struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(tfm);
178 struct skcipher_walk walk;
189 struct aegis_state state; 179 struct aegis_state state;
190 180
181 ops->skcipher_walk_init(&walk, req, true);
182
191 kernel_fpu_begin(); 183 kernel_fpu_begin();
192 184
193 crypto_aegis128l_aesni_init(&state, ctx->key.bytes, req->iv); 185 crypto_aegis128l_aesni_init(&state, ctx->key.bytes, req->iv);
194 crypto_aegis128l_aesni_process_ad(&state, req->src, req->assoclen); 186 crypto_aegis128l_aesni_process_ad(&state, req->src, req->assoclen);
195 crypto_aegis128l_aesni_process_crypt(&state, req, ops); 187 crypto_aegis128l_aesni_process_crypt(&state, &walk, ops);
196 crypto_aegis128l_aesni_final(&state, tag_xor, req->assoclen, cryptlen); 188 crypto_aegis128l_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
197 189
198 kernel_fpu_end(); 190 kernel_fpu_end();
diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c
index 8bebda2de92f..6227ca3220a0 100644
--- a/arch/x86/crypto/aegis256-aesni-glue.c
+++ b/arch/x86/crypto/aegis256-aesni-glue.c
@@ -119,31 +119,20 @@ static void crypto_aegis256_aesni_process_ad(
119} 119}
120 120
121static void crypto_aegis256_aesni_process_crypt( 121static void crypto_aegis256_aesni_process_crypt(
122 struct aegis_state *state, struct aead_request *req, 122 struct aegis_state *state, struct skcipher_walk *walk,
123 const struct aegis_crypt_ops *ops) 123 const struct aegis_crypt_ops *ops)
124{ 124{
125 struct skcipher_walk walk; 125 while (walk->nbytes >= AEGIS256_BLOCK_SIZE) {
126 u8 *src, *dst; 126 ops->crypt_blocks(state,
127 unsigned int chunksize, base; 127 round_down(walk->nbytes, AEGIS256_BLOCK_SIZE),
128 128 walk->src.virt.addr, walk->dst.virt.addr);
129 ops->skcipher_walk_init(&walk, req, false); 129 skcipher_walk_done(walk, walk->nbytes % AEGIS256_BLOCK_SIZE);
130 130 }
131 while (walk.nbytes) {
132 src = walk.src.virt.addr;
133 dst = walk.dst.virt.addr;
134 chunksize = walk.nbytes;
135
136 ops->crypt_blocks(state, chunksize, src, dst);
137
138 base = chunksize & ~(AEGIS256_BLOCK_SIZE - 1);
139 src += base;
140 dst += base;
141 chunksize &= AEGIS256_BLOCK_SIZE - 1;
142
143 if (chunksize > 0)
144 ops->crypt_tail(state, chunksize, src, dst);
145 131
146 skcipher_walk_done(&walk, 0); 132 if (walk->nbytes) {
133 ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
134 walk->dst.virt.addr);
135 skcipher_walk_done(walk, 0);
147 } 136 }
148} 137}
149 138
@@ -186,13 +175,16 @@ static void crypto_aegis256_aesni_crypt(struct aead_request *req,
186{ 175{
187 struct crypto_aead *tfm = crypto_aead_reqtfm(req); 176 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
188 struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(tfm); 177 struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(tfm);
178 struct skcipher_walk walk;
189 struct aegis_state state; 179 struct aegis_state state;
190 180
181 ops->skcipher_walk_init(&walk, req, true);
182
191 kernel_fpu_begin(); 183 kernel_fpu_begin();
192 184
193 crypto_aegis256_aesni_init(&state, ctx->key, req->iv); 185 crypto_aegis256_aesni_init(&state, ctx->key, req->iv);
194 crypto_aegis256_aesni_process_ad(&state, req->src, req->assoclen); 186 crypto_aegis256_aesni_process_ad(&state, req->src, req->assoclen);
195 crypto_aegis256_aesni_process_crypt(&state, req, ops); 187 crypto_aegis256_aesni_process_crypt(&state, &walk, ops);
196 crypto_aegis256_aesni_final(&state, tag_xor, req->assoclen, cryptlen); 188 crypto_aegis256_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
197 189
198 kernel_fpu_end(); 190 kernel_fpu_end();
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 1321700d6647..1e3d2102033a 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -175,26 +175,18 @@ asmlinkage void aesni_gcm_finalize(void *ctx,
175 struct gcm_context_data *gdata, 175 struct gcm_context_data *gdata,
176 u8 *auth_tag, unsigned long auth_tag_len); 176 u8 *auth_tag, unsigned long auth_tag_len);
177 177
178static struct aesni_gcm_tfm_s { 178static const struct aesni_gcm_tfm_s {
179void (*init)(void *ctx, 179 void (*init)(void *ctx, struct gcm_context_data *gdata, u8 *iv,
180 struct gcm_context_data *gdata, 180 u8 *hash_subkey, const u8 *aad, unsigned long aad_len);
181 u8 *iv, 181 void (*enc_update)(void *ctx, struct gcm_context_data *gdata, u8 *out,
182 u8 *hash_subkey, const u8 *aad, 182 const u8 *in, unsigned long plaintext_len);
183 unsigned long aad_len); 183 void (*dec_update)(void *ctx, struct gcm_context_data *gdata, u8 *out,
184void (*enc_update)(void *ctx, 184 const u8 *in, unsigned long ciphertext_len);
185 struct gcm_context_data *gdata, u8 *out, 185 void (*finalize)(void *ctx, struct gcm_context_data *gdata,
186 const u8 *in, 186 u8 *auth_tag, unsigned long auth_tag_len);
187 unsigned long plaintext_len);
188void (*dec_update)(void *ctx,
189 struct gcm_context_data *gdata, u8 *out,
190 const u8 *in,
191 unsigned long ciphertext_len);
192void (*finalize)(void *ctx,
193 struct gcm_context_data *gdata,
194 u8 *auth_tag, unsigned long auth_tag_len);
195} *aesni_gcm_tfm; 187} *aesni_gcm_tfm;
196 188
197struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = { 189static const struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = {
198 .init = &aesni_gcm_init, 190 .init = &aesni_gcm_init,
199 .enc_update = &aesni_gcm_enc_update, 191 .enc_update = &aesni_gcm_enc_update,
200 .dec_update = &aesni_gcm_dec_update, 192 .dec_update = &aesni_gcm_dec_update,
@@ -243,7 +235,7 @@ asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx,
243 const u8 *aad, unsigned long aad_len, 235 const u8 *aad, unsigned long aad_len,
244 u8 *auth_tag, unsigned long auth_tag_len); 236 u8 *auth_tag, unsigned long auth_tag_len);
245 237
246struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = { 238static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = {
247 .init = &aesni_gcm_init_avx_gen2, 239 .init = &aesni_gcm_init_avx_gen2,
248 .enc_update = &aesni_gcm_enc_update_avx_gen2, 240 .enc_update = &aesni_gcm_enc_update_avx_gen2,
249 .dec_update = &aesni_gcm_dec_update_avx_gen2, 241 .dec_update = &aesni_gcm_dec_update_avx_gen2,
@@ -288,7 +280,7 @@ asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx,
288 const u8 *aad, unsigned long aad_len, 280 const u8 *aad, unsigned long aad_len,
289 u8 *auth_tag, unsigned long auth_tag_len); 281 u8 *auth_tag, unsigned long auth_tag_len);
290 282
291struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = { 283static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = {
292 .init = &aesni_gcm_init_avx_gen4, 284 .init = &aesni_gcm_init_avx_gen4,
293 .enc_update = &aesni_gcm_enc_update_avx_gen4, 285 .enc_update = &aesni_gcm_enc_update_avx_gen4,
294 .dec_update = &aesni_gcm_dec_update_avx_gen4, 286 .dec_update = &aesni_gcm_dec_update_avx_gen4,
@@ -778,7 +770,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
778{ 770{
779 struct crypto_aead *tfm = crypto_aead_reqtfm(req); 771 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
780 unsigned long auth_tag_len = crypto_aead_authsize(tfm); 772 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
781 struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm; 773 const struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm;
782 struct gcm_context_data data AESNI_ALIGN_ATTR; 774 struct gcm_context_data data AESNI_ALIGN_ATTR;
783 struct scatter_walk dst_sg_walk = {}; 775 struct scatter_walk dst_sg_walk = {};
784 unsigned long left = req->cryptlen; 776 unsigned long left = req->cryptlen;
@@ -821,11 +813,14 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
821 scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0); 813 scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
822 } 814 }
823 815
824 src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen); 816 if (left) {
825 scatterwalk_start(&src_sg_walk, src_sg); 817 src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen);
826 if (req->src != req->dst) { 818 scatterwalk_start(&src_sg_walk, src_sg);
827 dst_sg = scatterwalk_ffwd(dst_start, req->dst, req->assoclen); 819 if (req->src != req->dst) {
828 scatterwalk_start(&dst_sg_walk, dst_sg); 820 dst_sg = scatterwalk_ffwd(dst_start, req->dst,
821 req->assoclen);
822 scatterwalk_start(&dst_sg_walk, dst_sg);
823 }
829 } 824 }
830 825
831 kernel_fpu_begin(); 826 kernel_fpu_begin();
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S
index de04d3e98d8d..3d873e67749d 100644
--- a/arch/x86/crypto/crct10dif-pcl-asm_64.S
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -43,609 +43,291 @@
43# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 43# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
44# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 44# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
45# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 45# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46########################################################################
47# Function API:
48# UINT16 crc_t10dif_pcl(
49# UINT16 init_crc, //initial CRC value, 16 bits
50# const unsigned char *buf, //buffer pointer to calculate CRC on
51# UINT64 len //buffer length in bytes (64-bit data)
52# );
53# 46#
54# Reference paper titled "Fast CRC Computation for Generic 47# Reference paper titled "Fast CRC Computation for Generic
55# Polynomials Using PCLMULQDQ Instruction" 48# Polynomials Using PCLMULQDQ Instruction"
56# URL: http://www.intel.com/content/dam/www/public/us/en/documents 49# URL: http://www.intel.com/content/dam/www/public/us/en/documents
57# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf 50# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
58# 51#
59#
60 52
61#include <linux/linkage.h> 53#include <linux/linkage.h>
62 54
63.text 55.text
64 56
65#define arg1 %rdi 57#define init_crc %edi
66#define arg2 %rsi 58#define buf %rsi
67#define arg3 %rdx 59#define len %rdx
68 60
69#define arg1_low32 %edi 61#define FOLD_CONSTS %xmm10
62#define BSWAP_MASK %xmm11
63
64# Fold reg1, reg2 into the next 32 data bytes, storing the result back into
65# reg1, reg2.
66.macro fold_32_bytes offset, reg1, reg2
67 movdqu \offset(buf), %xmm9
68 movdqu \offset+16(buf), %xmm12
69 pshufb BSWAP_MASK, %xmm9
70 pshufb BSWAP_MASK, %xmm12
71 movdqa \reg1, %xmm8
72 movdqa \reg2, %xmm13
73 pclmulqdq $0x00, FOLD_CONSTS, \reg1
74 pclmulqdq $0x11, FOLD_CONSTS, %xmm8
75 pclmulqdq $0x00, FOLD_CONSTS, \reg2
76 pclmulqdq $0x11, FOLD_CONSTS, %xmm13
77 pxor %xmm9 , \reg1
78 xorps %xmm8 , \reg1
79 pxor %xmm12, \reg2
80 xorps %xmm13, \reg2
81.endm
82
83# Fold src_reg into dst_reg.
84.macro fold_16_bytes src_reg, dst_reg
85 movdqa \src_reg, %xmm8
86 pclmulqdq $0x11, FOLD_CONSTS, \src_reg
87 pclmulqdq $0x00, FOLD_CONSTS, %xmm8
88 pxor %xmm8, \dst_reg
89 xorps \src_reg, \dst_reg
90.endm
70 91
71ENTRY(crc_t10dif_pcl) 92#
93# u16 crc_t10dif_pcl(u16 init_crc, const *u8 buf, size_t len);
94#
95# Assumes len >= 16.
96#
72.align 16 97.align 16
98ENTRY(crc_t10dif_pcl)
73 99
74 # adjust the 16-bit initial_crc value, scale it to 32 bits 100 movdqa .Lbswap_mask(%rip), BSWAP_MASK
75 shl $16, arg1_low32 101
76 102 # For sizes less than 256 bytes, we can't fold 128 bytes at a time.
77 # Allocate Stack Space 103 cmp $256, len
78 mov %rsp, %rcx 104 jl .Lless_than_256_bytes
79 sub $16*2, %rsp 105
80 # align stack to 16 byte boundary 106 # Load the first 128 data bytes. Byte swapping is necessary to make the
81 and $~(0x10 - 1), %rsp 107 # bit order match the polynomial coefficient order.
82 108 movdqu 16*0(buf), %xmm0
83 # check if smaller than 256 109 movdqu 16*1(buf), %xmm1
84 cmp $256, arg3 110 movdqu 16*2(buf), %xmm2
85 111 movdqu 16*3(buf), %xmm3
86 # for sizes less than 128, we can't fold 64B at a time... 112 movdqu 16*4(buf), %xmm4
87 jl _less_than_128 113 movdqu 16*5(buf), %xmm5
88 114 movdqu 16*6(buf), %xmm6
89 115 movdqu 16*7(buf), %xmm7
90 # load the initial crc value 116 add $128, buf
91 movd arg1_low32, %xmm10 # initial crc 117 pshufb BSWAP_MASK, %xmm0
92 118 pshufb BSWAP_MASK, %xmm1
93 # crc value does not need to be byte-reflected, but it needs 119 pshufb BSWAP_MASK, %xmm2
94 # to be moved to the high part of the register. 120 pshufb BSWAP_MASK, %xmm3
95 # because data will be byte-reflected and will align with 121 pshufb BSWAP_MASK, %xmm4
96 # initial crc at correct place. 122 pshufb BSWAP_MASK, %xmm5
97 pslldq $12, %xmm10 123 pshufb BSWAP_MASK, %xmm6
98 124 pshufb BSWAP_MASK, %xmm7
99 movdqa SHUF_MASK(%rip), %xmm11 125
100 # receive the initial 64B data, xor the initial crc value 126 # XOR the first 16 data *bits* with the initial CRC value.
101 movdqu 16*0(arg2), %xmm0 127 pxor %xmm8, %xmm8
102 movdqu 16*1(arg2), %xmm1 128 pinsrw $7, init_crc, %xmm8
103 movdqu 16*2(arg2), %xmm2 129 pxor %xmm8, %xmm0
104 movdqu 16*3(arg2), %xmm3 130
105 movdqu 16*4(arg2), %xmm4 131 movdqa .Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS
106 movdqu 16*5(arg2), %xmm5 132
107 movdqu 16*6(arg2), %xmm6 133 # Subtract 128 for the 128 data bytes just consumed. Subtract another
108 movdqu 16*7(arg2), %xmm7 134 # 128 to simplify the termination condition of the following loop.
109 135 sub $256, len
110 pshufb %xmm11, %xmm0 136
111 # XOR the initial_crc value 137 # While >= 128 data bytes remain (not counting xmm0-7), fold the 128
112 pxor %xmm10, %xmm0 138 # bytes xmm0-7 into them, storing the result back into xmm0-7.
113 pshufb %xmm11, %xmm1 139.Lfold_128_bytes_loop:
114 pshufb %xmm11, %xmm2 140 fold_32_bytes 0, %xmm0, %xmm1
115 pshufb %xmm11, %xmm3 141 fold_32_bytes 32, %xmm2, %xmm3
116 pshufb %xmm11, %xmm4 142 fold_32_bytes 64, %xmm4, %xmm5
117 pshufb %xmm11, %xmm5 143 fold_32_bytes 96, %xmm6, %xmm7
118 pshufb %xmm11, %xmm6 144 add $128, buf
119 pshufb %xmm11, %xmm7 145 sub $128, len
120 146 jge .Lfold_128_bytes_loop
121 movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4 147
122 #imm value of pclmulqdq instruction 148 # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
123 #will determine which constant to use 149
124 150 # Fold across 64 bytes.
125 ################################################################# 151 movdqa .Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS
126 # we subtract 256 instead of 128 to save one instruction from the loop 152 fold_16_bytes %xmm0, %xmm4
127 sub $256, arg3 153 fold_16_bytes %xmm1, %xmm5
128 154 fold_16_bytes %xmm2, %xmm6
129 # at this section of the code, there is 64*x+y (0<=y<64) bytes of 155 fold_16_bytes %xmm3, %xmm7
130 # buffer. The _fold_64_B_loop will fold 64B at a time 156 # Fold across 32 bytes.
131 # until we have 64+y Bytes of buffer 157 movdqa .Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS
132 158 fold_16_bytes %xmm4, %xmm6
133 159 fold_16_bytes %xmm5, %xmm7
134 # fold 64B at a time. This section of the code folds 4 xmm 160 # Fold across 16 bytes.
135 # registers in parallel 161 movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
136_fold_64_B_loop: 162 fold_16_bytes %xmm6, %xmm7
137 163
138 # update the buffer pointer 164 # Add 128 to get the correct number of data bytes remaining in 0...127
139 add $128, arg2 # buf += 64# 165 # (not counting xmm7), following the previous extra subtraction by 128.
140 166 # Then subtract 16 to simplify the termination condition of the
141 movdqu 16*0(arg2), %xmm9 167 # following loop.
142 movdqu 16*1(arg2), %xmm12 168 add $128-16, len
143 pshufb %xmm11, %xmm9 169
144 pshufb %xmm11, %xmm12 170 # While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
145 movdqa %xmm0, %xmm8 171 # xmm7 into them, storing the result back into xmm7.
146 movdqa %xmm1, %xmm13 172 jl .Lfold_16_bytes_loop_done
147 pclmulqdq $0x0 , %xmm10, %xmm0 173.Lfold_16_bytes_loop:
148 pclmulqdq $0x11, %xmm10, %xmm8
149 pclmulqdq $0x0 , %xmm10, %xmm1
150 pclmulqdq $0x11, %xmm10, %xmm13
151 pxor %xmm9 , %xmm0
152 xorps %xmm8 , %xmm0
153 pxor %xmm12, %xmm1
154 xorps %xmm13, %xmm1
155
156 movdqu 16*2(arg2), %xmm9
157 movdqu 16*3(arg2), %xmm12
158 pshufb %xmm11, %xmm9
159 pshufb %xmm11, %xmm12
160 movdqa %xmm2, %xmm8
161 movdqa %xmm3, %xmm13
162 pclmulqdq $0x0, %xmm10, %xmm2
163 pclmulqdq $0x11, %xmm10, %xmm8
164 pclmulqdq $0x0, %xmm10, %xmm3
165 pclmulqdq $0x11, %xmm10, %xmm13
166 pxor %xmm9 , %xmm2
167 xorps %xmm8 , %xmm2
168 pxor %xmm12, %xmm3
169 xorps %xmm13, %xmm3
170
171 movdqu 16*4(arg2), %xmm9
172 movdqu 16*5(arg2), %xmm12
173 pshufb %xmm11, %xmm9
174 pshufb %xmm11, %xmm12
175 movdqa %xmm4, %xmm8
176 movdqa %xmm5, %xmm13
177 pclmulqdq $0x0, %xmm10, %xmm4
178 pclmulqdq $0x11, %xmm10, %xmm8
179 pclmulqdq $0x0, %xmm10, %xmm5
180 pclmulqdq $0x11, %xmm10, %xmm13
181 pxor %xmm9 , %xmm4
182 xorps %xmm8 , %xmm4
183 pxor %xmm12, %xmm5
184 xorps %xmm13, %xmm5
185
186 movdqu 16*6(arg2), %xmm9
187 movdqu 16*7(arg2), %xmm12
188 pshufb %xmm11, %xmm9
189 pshufb %xmm11, %xmm12
190 movdqa %xmm6 , %xmm8
191 movdqa %xmm7 , %xmm13
192 pclmulqdq $0x0 , %xmm10, %xmm6
193 pclmulqdq $0x11, %xmm10, %xmm8
194 pclmulqdq $0x0 , %xmm10, %xmm7
195 pclmulqdq $0x11, %xmm10, %xmm13
196 pxor %xmm9 , %xmm6
197 xorps %xmm8 , %xmm6
198 pxor %xmm12, %xmm7
199 xorps %xmm13, %xmm7
200
201 sub $128, arg3
202
203 # check if there is another 64B in the buffer to be able to fold
204 jge _fold_64_B_loop
205 ##################################################################
206
207
208 add $128, arg2
209 # at this point, the buffer pointer is pointing at the last y Bytes
210 # of the buffer the 64B of folded data is in 4 of the xmm
211 # registers: xmm0, xmm1, xmm2, xmm3
212
213
214 # fold the 8 xmm registers to 1 xmm register with different constants
215
216 movdqa rk9(%rip), %xmm10
217 movdqa %xmm0, %xmm8
218 pclmulqdq $0x11, %xmm10, %xmm0
219 pclmulqdq $0x0 , %xmm10, %xmm8
220 pxor %xmm8, %xmm7
221 xorps %xmm0, %xmm7
222
223 movdqa rk11(%rip), %xmm10
224 movdqa %xmm1, %xmm8
225 pclmulqdq $0x11, %xmm10, %xmm1
226 pclmulqdq $0x0 , %xmm10, %xmm8
227 pxor %xmm8, %xmm7
228 xorps %xmm1, %xmm7
229
230 movdqa rk13(%rip), %xmm10
231 movdqa %xmm2, %xmm8
232 pclmulqdq $0x11, %xmm10, %xmm2
233 pclmulqdq $0x0 , %xmm10, %xmm8
234 pxor %xmm8, %xmm7
235 pxor %xmm2, %xmm7
236
237 movdqa rk15(%rip), %xmm10
238 movdqa %xmm3, %xmm8
239 pclmulqdq $0x11, %xmm10, %xmm3
240 pclmulqdq $0x0 , %xmm10, %xmm8
241 pxor %xmm8, %xmm7
242 xorps %xmm3, %xmm7
243
244 movdqa rk17(%rip), %xmm10
245 movdqa %xmm4, %xmm8
246 pclmulqdq $0x11, %xmm10, %xmm4
247 pclmulqdq $0x0 , %xmm10, %xmm8
248 pxor %xmm8, %xmm7
249 pxor %xmm4, %xmm7
250
251 movdqa rk19(%rip), %xmm10
252 movdqa %xmm5, %xmm8
253 pclmulqdq $0x11, %xmm10, %xmm5
254 pclmulqdq $0x0 , %xmm10, %xmm8
255 pxor %xmm8, %xmm7
256 xorps %xmm5, %xmm7
257
258 movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2
259 #imm value of pclmulqdq instruction
260 #will determine which constant to use
261 movdqa %xmm6, %xmm8
262 pclmulqdq $0x11, %xmm10, %xmm6
263 pclmulqdq $0x0 , %xmm10, %xmm8
264 pxor %xmm8, %xmm7
265 pxor %xmm6, %xmm7
266
267
268 # instead of 64, we add 48 to the loop counter to save 1 instruction
269 # from the loop instead of a cmp instruction, we use the negative
270 # flag with the jl instruction
271 add $128-16, arg3
272 jl _final_reduction_for_128
273
274 # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
275 # and the rest is in memory. We can fold 16 bytes at a time if y>=16
276 # continue folding 16B at a time
277
278_16B_reduction_loop:
279 movdqa %xmm7, %xmm8 174 movdqa %xmm7, %xmm8
280 pclmulqdq $0x11, %xmm10, %xmm7 175 pclmulqdq $0x11, FOLD_CONSTS, %xmm7
281 pclmulqdq $0x0 , %xmm10, %xmm8 176 pclmulqdq $0x00, FOLD_CONSTS, %xmm8
282 pxor %xmm8, %xmm7 177 pxor %xmm8, %xmm7
283 movdqu (arg2), %xmm0 178 movdqu (buf), %xmm0
284 pshufb %xmm11, %xmm0 179 pshufb BSWAP_MASK, %xmm0
285 pxor %xmm0 , %xmm7 180 pxor %xmm0 , %xmm7
286 add $16, arg2 181 add $16, buf
287 sub $16, arg3 182 sub $16, len
288 # instead of a cmp instruction, we utilize the flags with the 183 jge .Lfold_16_bytes_loop
289 # jge instruction equivalent of: cmp arg3, 16-16 184
290 # check if there is any more 16B in the buffer to be able to fold 185.Lfold_16_bytes_loop_done:
291 jge _16B_reduction_loop 186 # Add 16 to get the correct number of data bytes remaining in 0...15
292 187 # (not counting xmm7), following the previous extra subtraction by 16.
293 #now we have 16+z bytes left to reduce, where 0<= z < 16. 188 add $16, len
294 #first, we reduce the data in the xmm7 register 189 je .Lreduce_final_16_bytes
295 190
296 191.Lhandle_partial_segment:
297_final_reduction_for_128: 192 # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
298 # check if any more data to fold. If not, compute the CRC of 193 # bytes are in xmm7 and the rest are the remaining data in 'buf'. To do
299 # the final 128 bits 194 # this without needing a fold constant for each possible 'len', redivide
300 add $16, arg3 195 # the bytes into a first chunk of 'len' bytes and a second chunk of 16
301 je _128_done 196 # bytes, then fold the first chunk into the second.
302 197
303 # here we are getting data that is less than 16 bytes.
304 # since we know that there was data before the pointer, we can
305 # offset the input pointer before the actual point, to receive
306 # exactly 16 bytes. after that the registers need to be adjusted.
307_get_last_two_xmms:
308 movdqa %xmm7, %xmm2 198 movdqa %xmm7, %xmm2
309 199
310 movdqu -16(arg2, arg3), %xmm1 200 # xmm1 = last 16 original data bytes
311 pshufb %xmm11, %xmm1 201 movdqu -16(buf, len), %xmm1
202 pshufb BSWAP_MASK, %xmm1
312 203
313 # get rid of the extra data that was loaded before 204 # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
314 # load the shift constant 205 lea .Lbyteshift_table+16(%rip), %rax
315 lea pshufb_shf_table+16(%rip), %rax 206 sub len, %rax
316 sub arg3, %rax
317 movdqu (%rax), %xmm0 207 movdqu (%rax), %xmm0
318
319 # shift xmm2 to the left by arg3 bytes
320 pshufb %xmm0, %xmm2 208 pshufb %xmm0, %xmm2
321 209
322 # shift xmm7 to the right by 16-arg3 bytes 210 # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
323 pxor mask1(%rip), %xmm0 211 pxor .Lmask1(%rip), %xmm0
324 pshufb %xmm0, %xmm7 212 pshufb %xmm0, %xmm7
213
214 # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
215 # then '16-len' bytes from xmm2 (high-order bytes).
325 pblendvb %xmm2, %xmm1 #xmm0 is implicit 216 pblendvb %xmm2, %xmm1 #xmm0 is implicit
326 217
327 # fold 16 Bytes 218 # Fold the first chunk into the second chunk, storing the result in xmm7.
328 movdqa %xmm1, %xmm2
329 movdqa %xmm7, %xmm8 219 movdqa %xmm7, %xmm8
330 pclmulqdq $0x11, %xmm10, %xmm7 220 pclmulqdq $0x11, FOLD_CONSTS, %xmm7
331 pclmulqdq $0x0 , %xmm10, %xmm8 221 pclmulqdq $0x00, FOLD_CONSTS, %xmm8
332 pxor %xmm8, %xmm7 222 pxor %xmm8, %xmm7
333 pxor %xmm2, %xmm7 223 pxor %xmm1, %xmm7
334 224
335_128_done: 225.Lreduce_final_16_bytes:
336 # compute crc of a 128-bit value 226 # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
337 movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10
338 movdqa %xmm7, %xmm0
339 227
340 #64b fold 228 # Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
341 pclmulqdq $0x1, %xmm10, %xmm7 229 movdqa .Lfinal_fold_consts(%rip), FOLD_CONSTS
342 pslldq $8 , %xmm0
343 pxor %xmm0, %xmm7
344 230
345 #32b fold 231 # Fold the high 64 bits into the low 64 bits, while also multiplying by
232 # x^64. This produces a 128-bit value congruent to x^64 * M(x) and
233 # whose low 48 bits are 0.
346 movdqa %xmm7, %xmm0 234 movdqa %xmm7, %xmm0
235 pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
236 pslldq $8, %xmm0
237 pxor %xmm0, %xmm7 # + low bits * x^64
347 238
348 pand mask2(%rip), %xmm0 239 # Fold the high 32 bits into the low 96 bits. This produces a 96-bit
349 240 # value congruent to x^64 * M(x) and whose low 48 bits are 0.
350 psrldq $12, %xmm7
351 pclmulqdq $0x10, %xmm10, %xmm7
352 pxor %xmm0, %xmm7
353
354 #barrett reduction
355_barrett:
356 movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10
357 movdqa %xmm7, %xmm0 241 movdqa %xmm7, %xmm0
358 pclmulqdq $0x01, %xmm10, %xmm7 242 pand .Lmask2(%rip), %xmm0 # zero high 32 bits
359 pslldq $4, %xmm7 243 psrldq $12, %xmm7 # extract high 32 bits
360 pclmulqdq $0x11, %xmm10, %xmm7 244 pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
245 pxor %xmm0, %xmm7 # + low bits
361 246
362 pslldq $4, %xmm7 247 # Load G(x) and floor(x^48 / G(x)).
363 pxor %xmm0, %xmm7 248 movdqa .Lbarrett_reduction_consts(%rip), FOLD_CONSTS
364 pextrd $1, %xmm7, %eax
365 249
366_cleanup: 250 # Use Barrett reduction to compute the final CRC value.
367 # scale the result back to 16 bits 251 movdqa %xmm7, %xmm0
368 shr $16, %eax 252 pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
369 mov %rcx, %rsp 253 psrlq $32, %xmm7 # /= x^32
254 pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # *= G(x)
255 psrlq $48, %xmm0
256 pxor %xmm7, %xmm0 # + low 16 nonzero bits
257 # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
258
259 pextrw $0, %xmm0, %eax
370 ret 260 ret
371 261
372########################################################################
373
374.align 16 262.align 16
375_less_than_128: 263.Lless_than_256_bytes:
376 264 # Checksumming a buffer of length 16...255 bytes
377 # check if there is enough buffer to be able to fold 16B at a time
378 cmp $32, arg3
379 jl _less_than_32
380 movdqa SHUF_MASK(%rip), %xmm11
381 265
382 # now if there is, load the constants 266 # Load the first 16 data bytes.
383 movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 267 movdqu (buf), %xmm7
268 pshufb BSWAP_MASK, %xmm7
269 add $16, buf
384 270
385 movd arg1_low32, %xmm0 # get the initial crc value 271 # XOR the first 16 data *bits* with the initial CRC value.
386 pslldq $12, %xmm0 # align it to its correct place 272 pxor %xmm0, %xmm0
387 movdqu (arg2), %xmm7 # load the plaintext 273 pinsrw $7, init_crc, %xmm0
388 pshufb %xmm11, %xmm7 # byte-reflect the plaintext
389 pxor %xmm0, %xmm7 274 pxor %xmm0, %xmm7
390 275
391 276 movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
392 # update the buffer pointer 277 cmp $16, len
393 add $16, arg2 278 je .Lreduce_final_16_bytes # len == 16
394 279 sub $32, len
395 # update the counter. subtract 32 instead of 16 to save one 280 jge .Lfold_16_bytes_loop # 32 <= len <= 255
396 # instruction from the loop 281 add $16, len
397 sub $32, arg3 282 jmp .Lhandle_partial_segment # 17 <= len <= 31
398
399 jmp _16B_reduction_loop
400
401
402.align 16
403_less_than_32:
404 # mov initial crc to the return value. this is necessary for
405 # zero-length buffers.
406 mov arg1_low32, %eax
407 test arg3, arg3
408 je _cleanup
409
410 movdqa SHUF_MASK(%rip), %xmm11
411
412 movd arg1_low32, %xmm0 # get the initial crc value
413 pslldq $12, %xmm0 # align it to its correct place
414
415 cmp $16, arg3
416 je _exact_16_left
417 jl _less_than_16_left
418
419 movdqu (arg2), %xmm7 # load the plaintext
420 pshufb %xmm11, %xmm7 # byte-reflect the plaintext
421 pxor %xmm0 , %xmm7 # xor the initial crc value
422 add $16, arg2
423 sub $16, arg3
424 movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
425 jmp _get_last_two_xmms
426
427
428.align 16
429_less_than_16_left:
430 # use stack space to load data less than 16 bytes, zero-out
431 # the 16B in memory first.
432
433 pxor %xmm1, %xmm1
434 mov %rsp, %r11
435 movdqa %xmm1, (%r11)
436
437 cmp $4, arg3
438 jl _only_less_than_4
439
440 # backup the counter value
441 mov arg3, %r9
442 cmp $8, arg3
443 jl _less_than_8_left
444
445 # load 8 Bytes
446 mov (arg2), %rax
447 mov %rax, (%r11)
448 add $8, %r11
449 sub $8, arg3
450 add $8, arg2
451_less_than_8_left:
452
453 cmp $4, arg3
454 jl _less_than_4_left
455
456 # load 4 Bytes
457 mov (arg2), %eax
458 mov %eax, (%r11)
459 add $4, %r11
460 sub $4, arg3
461 add $4, arg2
462_less_than_4_left:
463
464 cmp $2, arg3
465 jl _less_than_2_left
466
467 # load 2 Bytes
468 mov (arg2), %ax
469 mov %ax, (%r11)
470 add $2, %r11
471 sub $2, arg3
472 add $2, arg2
473_less_than_2_left:
474 cmp $1, arg3
475 jl _zero_left
476
477 # load 1 Byte
478 mov (arg2), %al
479 mov %al, (%r11)
480_zero_left:
481 movdqa (%rsp), %xmm7
482 pshufb %xmm11, %xmm7
483 pxor %xmm0 , %xmm7 # xor the initial crc value
484
485 # shl r9, 4
486 lea pshufb_shf_table+16(%rip), %rax
487 sub %r9, %rax
488 movdqu (%rax), %xmm0
489 pxor mask1(%rip), %xmm0
490
491 pshufb %xmm0, %xmm7
492 jmp _128_done
493
494.align 16
495_exact_16_left:
496 movdqu (arg2), %xmm7
497 pshufb %xmm11, %xmm7
498 pxor %xmm0 , %xmm7 # xor the initial crc value
499
500 jmp _128_done
501
502_only_less_than_4:
503 cmp $3, arg3
504 jl _only_less_than_3
505
506 # load 3 Bytes
507 mov (arg2), %al
508 mov %al, (%r11)
509
510 mov 1(arg2), %al
511 mov %al, 1(%r11)
512
513 mov 2(arg2), %al
514 mov %al, 2(%r11)
515
516 movdqa (%rsp), %xmm7
517 pshufb %xmm11, %xmm7
518 pxor %xmm0 , %xmm7 # xor the initial crc value
519
520 psrldq $5, %xmm7
521
522 jmp _barrett
523_only_less_than_3:
524 cmp $2, arg3
525 jl _only_less_than_2
526
527 # load 2 Bytes
528 mov (arg2), %al
529 mov %al, (%r11)
530
531 mov 1(arg2), %al
532 mov %al, 1(%r11)
533
534 movdqa (%rsp), %xmm7
535 pshufb %xmm11, %xmm7
536 pxor %xmm0 , %xmm7 # xor the initial crc value
537
538 psrldq $6, %xmm7
539
540 jmp _barrett
541_only_less_than_2:
542
543 # load 1 Byte
544 mov (arg2), %al
545 mov %al, (%r11)
546
547 movdqa (%rsp), %xmm7
548 pshufb %xmm11, %xmm7
549 pxor %xmm0 , %xmm7 # xor the initial crc value
550
551 psrldq $7, %xmm7
552
553 jmp _barrett
554
555ENDPROC(crc_t10dif_pcl) 283ENDPROC(crc_t10dif_pcl)
556 284
557.section .rodata, "a", @progbits 285.section .rodata, "a", @progbits
558.align 16 286.align 16
559# precomputed constants
560# these constants are precomputed from the poly:
561# 0x8bb70000 (0x8bb7 scaled to 32 bits)
562# Q = 0x18BB70000
563# rk1 = 2^(32*3) mod Q << 32
564# rk2 = 2^(32*5) mod Q << 32
565# rk3 = 2^(32*15) mod Q << 32
566# rk4 = 2^(32*17) mod Q << 32
567# rk5 = 2^(32*3) mod Q << 32
568# rk6 = 2^(32*2) mod Q << 32
569# rk7 = floor(2^64/Q)
570# rk8 = Q
571rk1:
572.quad 0x2d56000000000000
573rk2:
574.quad 0x06df000000000000
575rk3:
576.quad 0x9d9d000000000000
577rk4:
578.quad 0x7cf5000000000000
579rk5:
580.quad 0x2d56000000000000
581rk6:
582.quad 0x1368000000000000
583rk7:
584.quad 0x00000001f65a57f8
585rk8:
586.quad 0x000000018bb70000
587
588rk9:
589.quad 0xceae000000000000
590rk10:
591.quad 0xbfd6000000000000
592rk11:
593.quad 0x1e16000000000000
594rk12:
595.quad 0x713c000000000000
596rk13:
597.quad 0xf7f9000000000000
598rk14:
599.quad 0x80a6000000000000
600rk15:
601.quad 0x044c000000000000
602rk16:
603.quad 0xe658000000000000
604rk17:
605.quad 0xad18000000000000
606rk18:
607.quad 0xa497000000000000
608rk19:
609.quad 0x6ee3000000000000
610rk20:
611.quad 0xe7b5000000000000
612
613 287
288# Fold constants precomputed from the polynomial 0x18bb7
289# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
290.Lfold_across_128_bytes_consts:
291 .quad 0x0000000000006123 # x^(8*128) mod G(x)
292 .quad 0x0000000000002295 # x^(8*128+64) mod G(x)
293.Lfold_across_64_bytes_consts:
294 .quad 0x0000000000001069 # x^(4*128) mod G(x)
295 .quad 0x000000000000dd31 # x^(4*128+64) mod G(x)
296.Lfold_across_32_bytes_consts:
297 .quad 0x000000000000857d # x^(2*128) mod G(x)
298 .quad 0x0000000000007acc # x^(2*128+64) mod G(x)
299.Lfold_across_16_bytes_consts:
300 .quad 0x000000000000a010 # x^(1*128) mod G(x)
301 .quad 0x0000000000001faa # x^(1*128+64) mod G(x)
302.Lfinal_fold_consts:
303 .quad 0x1368000000000000 # x^48 * (x^48 mod G(x))
304 .quad 0x2d56000000000000 # x^48 * (x^80 mod G(x))
305.Lbarrett_reduction_consts:
306 .quad 0x0000000000018bb7 # G(x)
307 .quad 0x00000001f65a57f8 # floor(x^48 / G(x))
614 308
615.section .rodata.cst16.mask1, "aM", @progbits, 16 309.section .rodata.cst16.mask1, "aM", @progbits, 16
616.align 16 310.align 16
617mask1: 311.Lmask1:
618.octa 0x80808080808080808080808080808080 312 .octa 0x80808080808080808080808080808080
619 313
620.section .rodata.cst16.mask2, "aM", @progbits, 16 314.section .rodata.cst16.mask2, "aM", @progbits, 16
621.align 16 315.align 16
622mask2: 316.Lmask2:
623.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF 317 .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
318
319.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
320.align 16
321.Lbswap_mask:
322 .octa 0x000102030405060708090A0B0C0D0E0F
624 323
625.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 324.section .rodata.cst32.byteshift_table, "aM", @progbits, 32
626.align 16 325.align 16
627SHUF_MASK: 326# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
628.octa 0x000102030405060708090A0B0C0D0E0F 327# is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
629 328# 0x80} XOR the index vector to shift right by '16 - len' bytes.
630.section .rodata.cst32.pshufb_shf_table, "aM", @progbits, 32 329.Lbyteshift_table:
631.align 32 330 .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
632pshufb_shf_table: 331 .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
633# use these values for shift constants for the pshufb instruction 332 .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
634# different alignments result in values as shown: 333 .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
635# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
636# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
637# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
638# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
639# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
640# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
641# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
642# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
643# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
644# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
645# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
646# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
647# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
648# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
649# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
650.octa 0x8f8e8d8c8b8a89888786858483828100
651.octa 0x000e0d0c0b0a09080706050403020100
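All of the new fold constants above are values of the form x^N mod G(x) for G(x) = 0x18bb7, and the Barrett constant is floor(x^48 / G(x)), as the comments state. The small, self-contained C program below is not part of the patch (names are illustrative); it recomputes such constants directly from the polynomial, which can be useful when auditing the table.

#include <stdio.h>
#include <stdint.h>

/* G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1 */
#define G 0x18bb7u

/* x^n mod G(x) over GF(2): shift in one zero bit at a time, reducing
 * whenever the running remainder reaches degree 16. */
static uint32_t xpow_mod_g(unsigned int n)
{
        uint32_t r = 1;

        while (n--) {
                r <<= 1;
                if (r & 0x10000)
                        r ^= G;
        }
        return r;
}

/* floor(x^48 / G(x)) over GF(2): plain polynomial long division,
 * keeping only the quotient bits. */
static uint64_t xdiv48_g(void)
{
        uint64_t rem = 1ULL << 48, q = 0;
        int i;

        for (i = 48 - 16; i >= 0; i--) {
                if (rem & (1ULL << (i + 16))) {
                        rem ^= (uint64_t)G << i;
                        q |= 1ULL << i;
                }
        }
        return q;
}

int main(void)
{
        static const unsigned int exps[] = {
                8 * 128, 8 * 128 + 64,  /* .Lfold_across_128_bytes_consts */
                4 * 128, 4 * 128 + 64,  /* .Lfold_across_64_bytes_consts  */
                2 * 128, 2 * 128 + 64,  /* .Lfold_across_32_bytes_consts  */
                1 * 128, 1 * 128 + 64,  /* .Lfold_across_16_bytes_consts  */
                48, 80,                 /* .Lfinal_fold_consts, before the
                                         * extra x^48 scaling in the table */
        };
        unsigned int i;

        for (i = 0; i < sizeof(exps) / sizeof(exps[0]); i++)
                printf("x^%-4u mod G(x) = 0x%04x\n", exps[i],
                       xpow_mod_g(exps[i]));
        printf("floor(x^48 / G(x)) = 0x%llx\n",
               (unsigned long long)xdiv48_g());
        return 0;
}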
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
index cd4df9322501..0e785c0b2354 100644
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -33,18 +33,12 @@
33#include <asm/cpufeatures.h> 33#include <asm/cpufeatures.h>
34#include <asm/cpu_device_id.h> 34#include <asm/cpu_device_id.h>
35 35
36asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf, 36asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
37 size_t len);
38 37
39struct chksum_desc_ctx { 38struct chksum_desc_ctx {
40 __u16 crc; 39 __u16 crc;
41}; 40};
42 41
43/*
44 * Steps through buffer one byte at at time, calculates reflected
45 * crc using table.
46 */
47
48static int chksum_init(struct shash_desc *desc) 42static int chksum_init(struct shash_desc *desc)
49{ 43{
50 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); 44 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
@@ -59,7 +53,7 @@ static int chksum_update(struct shash_desc *desc, const u8 *data,
59{ 53{
60 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); 54 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
61 55
62 if (irq_fpu_usable()) { 56 if (length >= 16 && irq_fpu_usable()) {
63 kernel_fpu_begin(); 57 kernel_fpu_begin();
64 ctx->crc = crc_t10dif_pcl(ctx->crc, data, length); 58 ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
65 kernel_fpu_end(); 59 kernel_fpu_end();
@@ -79,7 +73,7 @@ static int chksum_final(struct shash_desc *desc, u8 *out)
79static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, 73static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len,
80 u8 *out) 74 u8 *out)
81{ 75{
82 if (irq_fpu_usable()) { 76 if (len >= 16 && irq_fpu_usable()) {
83 kernel_fpu_begin(); 77 kernel_fpu_begin();
84 *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len); 78 *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len);
85 kernel_fpu_end(); 79 kernel_fpu_end();
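The new "length >= 16" checks exist because the rewritten assembly above assumes at least one full 16-byte block (see its "Assumes len >= 16." comment); shorter updates stay on the non-SIMD path handled outside this hunk. For cross-checking the accelerated path, here is a hedged, self-contained bit-at-a-time reference for the same CRC (polynomial 0x18bb7, MSB-first, zero initial value, no final XOR); it is not kernel code.

#include <stdint.h>
#include <stddef.h>

/* Reference CRC-T10DIF, one bit at a time; intended only as an
 * independent cross-check for crc_t10dif_pcl() results. */
static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf, size_t len)
{
        while (len--) {
                unsigned int i;

                crc ^= (uint16_t)(*buf++) << 8;
                for (i = 0; i < 8; i++)
                        crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
                                             : (crc << 1);
        }
        return crc;
}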
diff --git a/arch/x86/crypto/morus1280_glue.c b/arch/x86/crypto/morus1280_glue.c
index 0dccdda1eb3a..7e600f8bcdad 100644
--- a/arch/x86/crypto/morus1280_glue.c
+++ b/arch/x86/crypto/morus1280_glue.c
@@ -85,31 +85,20 @@ static void crypto_morus1280_glue_process_ad(
85 85
86static void crypto_morus1280_glue_process_crypt(struct morus1280_state *state, 86static void crypto_morus1280_glue_process_crypt(struct morus1280_state *state,
87 struct morus1280_ops ops, 87 struct morus1280_ops ops,
88 struct aead_request *req) 88 struct skcipher_walk *walk)
89{ 89{
90 struct skcipher_walk walk; 90 while (walk->nbytes >= MORUS1280_BLOCK_SIZE) {
91 u8 *cursor_src, *cursor_dst; 91 ops.crypt_blocks(state, walk->src.virt.addr,
92 unsigned int chunksize, base; 92 walk->dst.virt.addr,
93 93 round_down(walk->nbytes,
94 ops.skcipher_walk_init(&walk, req, false); 94 MORUS1280_BLOCK_SIZE));
95 95 skcipher_walk_done(walk, walk->nbytes % MORUS1280_BLOCK_SIZE);
96 while (walk.nbytes) { 96 }
97 cursor_src = walk.src.virt.addr;
98 cursor_dst = walk.dst.virt.addr;
99 chunksize = walk.nbytes;
100
101 ops.crypt_blocks(state, cursor_src, cursor_dst, chunksize);
102
103 base = chunksize & ~(MORUS1280_BLOCK_SIZE - 1);
104 cursor_src += base;
105 cursor_dst += base;
106 chunksize &= MORUS1280_BLOCK_SIZE - 1;
107
108 if (chunksize > 0)
109 ops.crypt_tail(state, cursor_src, cursor_dst,
110 chunksize);
111 97
112 skcipher_walk_done(&walk, 0); 98 if (walk->nbytes) {
99 ops.crypt_tail(state, walk->src.virt.addr, walk->dst.virt.addr,
100 walk->nbytes);
101 skcipher_walk_done(walk, 0);
113 } 102 }
114} 103}
115 104
@@ -147,12 +136,15 @@ static void crypto_morus1280_glue_crypt(struct aead_request *req,
147 struct crypto_aead *tfm = crypto_aead_reqtfm(req); 136 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
148 struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); 137 struct morus1280_ctx *ctx = crypto_aead_ctx(tfm);
149 struct morus1280_state state; 138 struct morus1280_state state;
139 struct skcipher_walk walk;
140
141 ops.skcipher_walk_init(&walk, req, true);
150 142
151 kernel_fpu_begin(); 143 kernel_fpu_begin();
152 144
153 ctx->ops->init(&state, &ctx->key, req->iv); 145 ctx->ops->init(&state, &ctx->key, req->iv);
154 crypto_morus1280_glue_process_ad(&state, ctx->ops, req->src, req->assoclen); 146 crypto_morus1280_glue_process_ad(&state, ctx->ops, req->src, req->assoclen);
155 crypto_morus1280_glue_process_crypt(&state, ops, req); 147 crypto_morus1280_glue_process_crypt(&state, ops, &walk);
156 ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen); 148 ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen);
157 149
158 kernel_fpu_end(); 150 kernel_fpu_end();
diff --git a/arch/x86/crypto/morus640_glue.c b/arch/x86/crypto/morus640_glue.c
index 7b58fe4d9bd1..cb3a81732016 100644
--- a/arch/x86/crypto/morus640_glue.c
+++ b/arch/x86/crypto/morus640_glue.c
@@ -85,31 +85,19 @@ static void crypto_morus640_glue_process_ad(
85 85
86static void crypto_morus640_glue_process_crypt(struct morus640_state *state, 86static void crypto_morus640_glue_process_crypt(struct morus640_state *state,
87 struct morus640_ops ops, 87 struct morus640_ops ops,
88 struct aead_request *req) 88 struct skcipher_walk *walk)
89{ 89{
90 struct skcipher_walk walk; 90 while (walk->nbytes >= MORUS640_BLOCK_SIZE) {
91 u8 *cursor_src, *cursor_dst; 91 ops.crypt_blocks(state, walk->src.virt.addr,
92 unsigned int chunksize, base; 92 walk->dst.virt.addr,
93 93 round_down(walk->nbytes, MORUS640_BLOCK_SIZE));
94 ops.skcipher_walk_init(&walk, req, false); 94 skcipher_walk_done(walk, walk->nbytes % MORUS640_BLOCK_SIZE);
95 95 }
96 while (walk.nbytes) {
97 cursor_src = walk.src.virt.addr;
98 cursor_dst = walk.dst.virt.addr;
99 chunksize = walk.nbytes;
100
101 ops.crypt_blocks(state, cursor_src, cursor_dst, chunksize);
102
103 base = chunksize & ~(MORUS640_BLOCK_SIZE - 1);
104 cursor_src += base;
105 cursor_dst += base;
106 chunksize &= MORUS640_BLOCK_SIZE - 1;
107
108 if (chunksize > 0)
109 ops.crypt_tail(state, cursor_src, cursor_dst,
110 chunksize);
111 96
112 skcipher_walk_done(&walk, 0); 97 if (walk->nbytes) {
98 ops.crypt_tail(state, walk->src.virt.addr, walk->dst.virt.addr,
99 walk->nbytes);
100 skcipher_walk_done(walk, 0);
113 } 101 }
114} 102}
115 103
@@ -143,12 +131,15 @@ static void crypto_morus640_glue_crypt(struct aead_request *req,
143 struct crypto_aead *tfm = crypto_aead_reqtfm(req); 131 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
144 struct morus640_ctx *ctx = crypto_aead_ctx(tfm); 132 struct morus640_ctx *ctx = crypto_aead_ctx(tfm);
145 struct morus640_state state; 133 struct morus640_state state;
134 struct skcipher_walk walk;
135
136 ops.skcipher_walk_init(&walk, req, true);
146 137
147 kernel_fpu_begin(); 138 kernel_fpu_begin();
148 139
149 ctx->ops->init(&state, &ctx->key, req->iv); 140 ctx->ops->init(&state, &ctx->key, req->iv);
150 crypto_morus640_glue_process_ad(&state, ctx->ops, req->src, req->assoclen); 141 crypto_morus640_glue_process_ad(&state, ctx->ops, req->src, req->assoclen);
151 crypto_morus640_glue_process_crypt(&state, ops, req); 142 crypto_morus640_glue_process_crypt(&state, ops, &walk);
152 ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen); 143 ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen);
153 144
154 kernel_fpu_end(); 145 kernel_fpu_end();
diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S
index c88c670cb5fc..e6add74d78a5 100644
--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -272,6 +272,10 @@ ENTRY(poly1305_block_sse2)
272 dec %rcx 272 dec %rcx
273 jnz .Ldoblock 273 jnz .Ldoblock
274 274
275 # Zeroing of key material
276 mov %rcx,0x00(%rsp)
277 mov %rcx,0x08(%rsp)
278
275 add $0x10,%rsp 279 add $0x10,%rsp
276 pop %r12 280 pop %r12
277 pop %rbx 281 pop %rbx
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 374a19712e20..b684f0294f35 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2278,6 +2278,19 @@ void perf_check_microcode(void)
2278 x86_pmu.check_microcode(); 2278 x86_pmu.check_microcode();
2279} 2279}
2280 2280
2281static int x86_pmu_check_period(struct perf_event *event, u64 value)
2282{
2283 if (x86_pmu.check_period && x86_pmu.check_period(event, value))
2284 return -EINVAL;
2285
2286 if (value && x86_pmu.limit_period) {
2287 if (x86_pmu.limit_period(event, value) > value)
2288 return -EINVAL;
2289 }
2290
2291 return 0;
2292}
2293
2281static struct pmu pmu = { 2294static struct pmu pmu = {
2282 .pmu_enable = x86_pmu_enable, 2295 .pmu_enable = x86_pmu_enable,
2283 .pmu_disable = x86_pmu_disable, 2296 .pmu_disable = x86_pmu_disable,
@@ -2302,6 +2315,7 @@ static struct pmu pmu = {
2302 .event_idx = x86_pmu_event_idx, 2315 .event_idx = x86_pmu_event_idx,
2303 .sched_task = x86_pmu_sched_task, 2316 .sched_task = x86_pmu_sched_task,
2304 .task_ctx_size = sizeof(struct x86_perf_task_context), 2317 .task_ctx_size = sizeof(struct x86_perf_task_context),
2318 .check_period = x86_pmu_check_period,
2305}; 2319};
2306 2320
2307void arch_perf_update_userpage(struct perf_event *event, 2321void arch_perf_update_userpage(struct perf_event *event,
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 40e12cfc87f6..730978dff63f 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3559,6 +3559,14 @@ static void free_excl_cntrs(int cpu)
3559 3559
3560static void intel_pmu_cpu_dying(int cpu) 3560static void intel_pmu_cpu_dying(int cpu)
3561{ 3561{
3562 fini_debug_store_on_cpu(cpu);
3563
3564 if (x86_pmu.counter_freezing)
3565 disable_counter_freeze();
3566}
3567
3568static void intel_pmu_cpu_dead(int cpu)
3569{
3562 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 3570 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
3563 struct intel_shared_regs *pc; 3571 struct intel_shared_regs *pc;
3564 3572
@@ -3570,11 +3578,6 @@ static void intel_pmu_cpu_dying(int cpu)
3570 } 3578 }
3571 3579
3572 free_excl_cntrs(cpu); 3580 free_excl_cntrs(cpu);
3573
3574 fini_debug_store_on_cpu(cpu);
3575
3576 if (x86_pmu.counter_freezing)
3577 disable_counter_freeze();
3578} 3581}
3579 3582
3580static void intel_pmu_sched_task(struct perf_event_context *ctx, 3583static void intel_pmu_sched_task(struct perf_event_context *ctx,
@@ -3584,6 +3587,11 @@ static void intel_pmu_sched_task(struct perf_event_context *ctx,
3584 intel_pmu_lbr_sched_task(ctx, sched_in); 3587 intel_pmu_lbr_sched_task(ctx, sched_in);
3585} 3588}
3586 3589
3590static int intel_pmu_check_period(struct perf_event *event, u64 value)
3591{
3592 return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0;
3593}
3594
3587PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); 3595PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
3588 3596
3589PMU_FORMAT_ATTR(ldlat, "config1:0-15"); 3597PMU_FORMAT_ATTR(ldlat, "config1:0-15");
@@ -3663,6 +3671,9 @@ static __initconst const struct x86_pmu core_pmu = {
3663 .cpu_prepare = intel_pmu_cpu_prepare, 3671 .cpu_prepare = intel_pmu_cpu_prepare,
3664 .cpu_starting = intel_pmu_cpu_starting, 3672 .cpu_starting = intel_pmu_cpu_starting,
3665 .cpu_dying = intel_pmu_cpu_dying, 3673 .cpu_dying = intel_pmu_cpu_dying,
3674 .cpu_dead = intel_pmu_cpu_dead,
3675
3676 .check_period = intel_pmu_check_period,
3666}; 3677};
3667 3678
3668static struct attribute *intel_pmu_attrs[]; 3679static struct attribute *intel_pmu_attrs[];
@@ -3703,8 +3714,12 @@ static __initconst const struct x86_pmu intel_pmu = {
3703 .cpu_prepare = intel_pmu_cpu_prepare, 3714 .cpu_prepare = intel_pmu_cpu_prepare,
3704 .cpu_starting = intel_pmu_cpu_starting, 3715 .cpu_starting = intel_pmu_cpu_starting,
3705 .cpu_dying = intel_pmu_cpu_dying, 3716 .cpu_dying = intel_pmu_cpu_dying,
3717 .cpu_dead = intel_pmu_cpu_dead,
3718
3706 .guest_get_msrs = intel_guest_get_msrs, 3719 .guest_get_msrs = intel_guest_get_msrs,
3707 .sched_task = intel_pmu_sched_task, 3720 .sched_task = intel_pmu_sched_task,
3721
3722 .check_period = intel_pmu_check_period,
3708}; 3723};
3709 3724
3710static __init void intel_clovertown_quirk(void) 3725static __init void intel_clovertown_quirk(void)
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index c07bee31abe8..b10e04387f38 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1222,6 +1222,8 @@ static struct pci_driver snbep_uncore_pci_driver = {
1222 .id_table = snbep_uncore_pci_ids, 1222 .id_table = snbep_uncore_pci_ids,
1223}; 1223};
1224 1224
1225#define NODE_ID_MASK 0x7
1226
1225/* 1227/*
1226 * build pci bus to socket mapping 1228 * build pci bus to socket mapping
1227 */ 1229 */
@@ -1243,7 +1245,7 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
1243 err = pci_read_config_dword(ubox_dev, nodeid_loc, &config); 1245 err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
1244 if (err) 1246 if (err)
1245 break; 1247 break;
1246 nodeid = config; 1248 nodeid = config & NODE_ID_MASK;
1247 /* get the Node ID mapping */ 1249 /* get the Node ID mapping */
1248 err = pci_read_config_dword(ubox_dev, idmap_loc, &config); 1250 err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
1249 if (err) 1251 if (err)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 78d7b7031bfc..d46fd6754d92 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -646,6 +646,11 @@ struct x86_pmu {
646 * Intel host/guest support (KVM) 646 * Intel host/guest support (KVM)
647 */ 647 */
648 struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); 648 struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
649
650 /*
651 * Check period value for PERF_EVENT_IOC_PERIOD ioctl.
652 */
653 int (*check_period) (struct perf_event *event, u64 period);
649}; 654};
650 655
651struct x86_perf_task_context { 656struct x86_perf_task_context {
@@ -857,7 +862,7 @@ static inline int amd_pmu_init(void)
857 862
858#ifdef CONFIG_CPU_SUP_INTEL 863#ifdef CONFIG_CPU_SUP_INTEL
859 864
860static inline bool intel_pmu_has_bts(struct perf_event *event) 865static inline bool intel_pmu_has_bts_period(struct perf_event *event, u64 period)
861{ 866{
862 struct hw_perf_event *hwc = &event->hw; 867 struct hw_perf_event *hwc = &event->hw;
863 unsigned int hw_event, bts_event; 868 unsigned int hw_event, bts_event;
@@ -868,7 +873,14 @@ static inline bool intel_pmu_has_bts(struct perf_event *event)
868 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; 873 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
869 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); 874 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
870 875
871 return hw_event == bts_event && hwc->sample_period == 1; 876 return hw_event == bts_event && period == 1;
877}
878
879static inline bool intel_pmu_has_bts(struct perf_event *event)
880{
881 struct hw_perf_event *hwc = &event->hw;
882
883 return intel_pmu_has_bts_period(event, hwc->sample_period);
872} 884}
873 885
874int intel_pmu_save_and_restart(struct perf_event *event); 886int intel_pmu_save_and_restart(struct perf_event *event);
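
The perf_event.h hunk above splits the BTS predicate so a candidate period can be tested, and the new x86_pmu::check_period hook (wired up for both core_pmu and intel_pmu earlier in this diff) uses it to validate PERF_EVENT_IOC_PERIOD requests before they take effect. A minimal standalone sketch of that helper-plus-wrapper pattern, with a made-up event encoding rather than the real one:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BTS_EVENT 0xc4			/* hypothetical branch-instructions encoding */

struct fake_event {
	unsigned int	config;
	uint64_t	sample_period;
};

/* Takes the period explicitly so callers can test a proposed value,
 * not just the one already stored in the event. */
static bool has_bts_period(const struct fake_event *ev, uint64_t period)
{
	return ev->config == BTS_EVENT && period == 1;
}

static bool has_bts(const struct fake_event *ev)
{
	return has_bts_period(ev, ev->sample_period);
}

/* check_period-style hook: refuse a new period that would silently
 * move the event into or out of the BTS special case. */
static int check_period(const struct fake_event *ev, uint64_t new_period)
{
	return has_bts(ev) != has_bts_period(ev, new_period) ? -1 : 0;
}

int main(void)
{
	struct fake_event ev = { .config = BTS_EVENT, .sample_period = 1 };

	printf("switch period to 2: %s\n", check_period(&ev, 2) ? "rejected" : "ok");
	return 0;
}
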
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index f65b78d32f5e..3c135084e1eb 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -39,82 +39,10 @@
39static int load_aout_binary(struct linux_binprm *); 39static int load_aout_binary(struct linux_binprm *);
40static int load_aout_library(struct file *); 40static int load_aout_library(struct file *);
41 41
42#ifdef CONFIG_COREDUMP
43static int aout_core_dump(struct coredump_params *);
44
45static unsigned long get_dr(int n)
46{
47 struct perf_event *bp = current->thread.ptrace_bps[n];
48 return bp ? bp->hw.info.address : 0;
49}
50
51/*
52 * fill in the user structure for a core dump..
53 */
54static void dump_thread32(struct pt_regs *regs, struct user32 *dump)
55{
56 u32 fs, gs;
57 memset(dump, 0, sizeof(*dump));
58
59/* changed the size calculations - should hopefully work better. lbt */
60 dump->magic = CMAGIC;
61 dump->start_code = 0;
62 dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
63 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
64 dump->u_dsize = ((unsigned long)
65 (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
66 dump->u_dsize -= dump->u_tsize;
67 dump->u_debugreg[0] = get_dr(0);
68 dump->u_debugreg[1] = get_dr(1);
69 dump->u_debugreg[2] = get_dr(2);
70 dump->u_debugreg[3] = get_dr(3);
71 dump->u_debugreg[6] = current->thread.debugreg6;
72 dump->u_debugreg[7] = current->thread.ptrace_dr7;
73
74 if (dump->start_stack < 0xc0000000) {
75 unsigned long tmp;
76
77 tmp = (unsigned long) (0xc0000000 - dump->start_stack);
78 dump->u_ssize = tmp >> PAGE_SHIFT;
79 }
80
81 dump->regs.ebx = regs->bx;
82 dump->regs.ecx = regs->cx;
83 dump->regs.edx = regs->dx;
84 dump->regs.esi = regs->si;
85 dump->regs.edi = regs->di;
86 dump->regs.ebp = regs->bp;
87 dump->regs.eax = regs->ax;
88 dump->regs.ds = current->thread.ds;
89 dump->regs.es = current->thread.es;
90 savesegment(fs, fs);
91 dump->regs.fs = fs;
92 savesegment(gs, gs);
93 dump->regs.gs = gs;
94 dump->regs.orig_eax = regs->orig_ax;
95 dump->regs.eip = regs->ip;
96 dump->regs.cs = regs->cs;
97 dump->regs.eflags = regs->flags;
98 dump->regs.esp = regs->sp;
99 dump->regs.ss = regs->ss;
100
101#if 1 /* FIXME */
102 dump->u_fpvalid = 0;
103#else
104 dump->u_fpvalid = dump_fpu(regs, &dump->i387);
105#endif
106}
107
108#endif
109
110static struct linux_binfmt aout_format = { 42static struct linux_binfmt aout_format = {
111 .module = THIS_MODULE, 43 .module = THIS_MODULE,
112 .load_binary = load_aout_binary, 44 .load_binary = load_aout_binary,
113 .load_shlib = load_aout_library, 45 .load_shlib = load_aout_library,
114#ifdef CONFIG_COREDUMP
115 .core_dump = aout_core_dump,
116#endif
117 .min_coredump = PAGE_SIZE
118}; 46};
119 47
120static int set_brk(unsigned long start, unsigned long end) 48static int set_brk(unsigned long start, unsigned long end)
@@ -126,91 +54,6 @@ static int set_brk(unsigned long start, unsigned long end)
126 return vm_brk(start, end - start); 54 return vm_brk(start, end - start);
127} 55}
128 56
129#ifdef CONFIG_COREDUMP
130/*
131 * These are the only things you should do on a core-file: use only these
132 * macros to write out all the necessary info.
133 */
134
135#include <linux/coredump.h>
136
137#define START_DATA(u) (u.u_tsize << PAGE_SHIFT)
138#define START_STACK(u) (u.start_stack)
139
140/*
141 * Routine writes a core dump image in the current directory.
142 * Currently only a stub-function.
143 *
144 * Note that setuid/setgid files won't make a core-dump if the uid/gid
145 * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
146 * field, which also makes sure the core-dumps won't be recursive if the
147 * dumping of the process results in another error..
148 */
149
150static int aout_core_dump(struct coredump_params *cprm)
151{
152 mm_segment_t fs;
153 int has_dumped = 0;
154 unsigned long dump_start, dump_size;
155 struct user32 dump;
156
157 fs = get_fs();
158 set_fs(KERNEL_DS);
159 has_dumped = 1;
160 strncpy(dump.u_comm, current->comm, sizeof(current->comm));
161 dump.u_ar0 = offsetof(struct user32, regs);
162 dump.signal = cprm->siginfo->si_signo;
163 dump_thread32(cprm->regs, &dump);
164
165 /*
166 * If the size of the dump file exceeds the rlimit, then see
167 * what would happen if we wrote the stack, but not the data
168 * area.
169 */
170 if ((dump.u_dsize + dump.u_ssize + 1) * PAGE_SIZE > cprm->limit)
171 dump.u_dsize = 0;
172
173 /* Make sure we have enough room to write the stack and data areas. */
174 if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit)
175 dump.u_ssize = 0;
176
177 /* make sure we actually have a data and stack area to dump */
178 set_fs(USER_DS);
179 if (!access_ok((void *) (unsigned long)START_DATA(dump),
180 dump.u_dsize << PAGE_SHIFT))
181 dump.u_dsize = 0;
182 if (!access_ok((void *) (unsigned long)START_STACK(dump),
183 dump.u_ssize << PAGE_SHIFT))
184 dump.u_ssize = 0;
185
186 set_fs(KERNEL_DS);
187 /* struct user */
188 if (!dump_emit(cprm, &dump, sizeof(dump)))
189 goto end_coredump;
190 /* Now dump all of the user data. Include malloced stuff as well */
191 if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump)))
192 goto end_coredump;
193 /* now we start writing out the user space info */
194 set_fs(USER_DS);
195 /* Dump the data area */
196 if (dump.u_dsize != 0) {
197 dump_start = START_DATA(dump);
198 dump_size = dump.u_dsize << PAGE_SHIFT;
199 if (!dump_emit(cprm, (void *)dump_start, dump_size))
200 goto end_coredump;
201 }
202 /* Now prepare to dump the stack area */
203 if (dump.u_ssize != 0) {
204 dump_start = START_STACK(dump);
205 dump_size = dump.u_ssize << PAGE_SHIFT;
206 if (!dump_emit(cprm, (void *)dump_start, dump_size))
207 goto end_coredump;
208 }
209end_coredump:
210 set_fs(fs);
211 return has_dumped;
212}
213#endif
214 57
215/* 58/*
216 * create_aout_tables() parses the env- and arg-strings in new user 59 * create_aout_tables() parses the env- and arg-strings in new user
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
deleted file mode 100644
index 7d3ece8bfb61..000000000000
--- a/arch/x86/include/asm/a.out-core.h
+++ /dev/null
@@ -1,67 +0,0 @@
1/* a.out coredump register dumper
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#ifndef _ASM_X86_A_OUT_CORE_H
13#define _ASM_X86_A_OUT_CORE_H
14
15#ifdef __KERNEL__
16#ifdef CONFIG_X86_32
17
18#include <linux/user.h>
19#include <linux/elfcore.h>
20#include <linux/mm_types.h>
21
22#include <asm/debugreg.h>
23
24/*
25 * fill in the user structure for an a.out core dump
26 */
27static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
28{
29/* changed the size calculations - should hopefully work better. lbt */
30 dump->magic = CMAGIC;
31 dump->start_code = 0;
32 dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
33 dump->u_tsize = ((unsigned long)current->mm->end_code) >> PAGE_SHIFT;
34 dump->u_dsize = ((unsigned long)(current->mm->brk + (PAGE_SIZE - 1)))
35 >> PAGE_SHIFT;
36 dump->u_dsize -= dump->u_tsize;
37 dump->u_ssize = 0;
38 aout_dump_debugregs(dump);
39
40 if (dump->start_stack < TASK_SIZE)
41 dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
42 >> PAGE_SHIFT;
43
44 dump->regs.bx = regs->bx;
45 dump->regs.cx = regs->cx;
46 dump->regs.dx = regs->dx;
47 dump->regs.si = regs->si;
48 dump->regs.di = regs->di;
49 dump->regs.bp = regs->bp;
50 dump->regs.ax = regs->ax;
51 dump->regs.ds = (u16)regs->ds;
52 dump->regs.es = (u16)regs->es;
53 dump->regs.fs = (u16)regs->fs;
54 dump->regs.gs = get_user_gs(regs);
55 dump->regs.orig_ax = regs->orig_ax;
56 dump->regs.ip = regs->ip;
57 dump->regs.cs = (u16)regs->cs;
58 dump->regs.flags = regs->flags;
59 dump->regs.sp = regs->sp;
60 dump->regs.ss = (u16)regs->ss;
61
62 dump->u_fpvalid = dump_fpu(regs, &dump->i387);
63}
64
65#endif /* CONFIG_X86_32 */
66#endif /* __KERNEL__ */
67#endif /* _ASM_X86_A_OUT_CORE_H */
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 705dafc2d11a..2bdbbbcfa393 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -841,7 +841,7 @@ union hv_gpa_page_range {
841 * count is equal with how many entries of union hv_gpa_page_range can 841 * count is equal with how many entries of union hv_gpa_page_range can
842 * be populated into the input parameter page. 842 * be populated into the input parameter page.
843 */ 843 */
844#define HV_MAX_FLUSH_REP_COUNT (PAGE_SIZE - 2 * sizeof(u64) / \ 844#define HV_MAX_FLUSH_REP_COUNT ((PAGE_SIZE - 2 * sizeof(u64)) / \
845 sizeof(union hv_gpa_page_range)) 845 sizeof(union hv_gpa_page_range))
846 846
847struct hv_guest_mapping_flush_list { 847struct hv_guest_mapping_flush_list {
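
The HV_MAX_FLUSH_REP_COUNT change above is a pure operator-precedence fix: without the extra parentheses, only 2 * sizeof(u64) was divided by the entry size, so the macro evaluated to roughly PAGE_SIZE instead of the number of entries that fit after the two u64 header fields. A standalone check with stand-in sizes (the entry size here is an assumption, not the real sizeof(union hv_gpa_page_range)):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define U64_SIZE	8UL
#define ENTRY_SIZE	8UL	/* stand-in for sizeof(union hv_gpa_page_range) */

int main(void)
{
	unsigned long broken = PAGE_SIZE - 2 * U64_SIZE / ENTRY_SIZE;	/* 4096 - 2 = 4094 */
	unsigned long fixed  = (PAGE_SIZE - 2 * U64_SIZE) / ENTRY_SIZE;	/* 4080 / 8 = 510 */

	printf("broken=%lu fixed=%lu\n", broken, fixed);
	return 0;
}
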
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index d9a9993af882..9f15384c504a 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -52,6 +52,8 @@
52 52
53#define INTEL_FAM6_CANNONLAKE_MOBILE 0x66 53#define INTEL_FAM6_CANNONLAKE_MOBILE 0x66
54 54
55#define INTEL_FAM6_ICELAKE_MOBILE 0x7E
56
55/* "Small Core" Processors (Atom) */ 57/* "Small Core" Processors (Atom) */
56 58
57#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ 59#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4660ce90de7f..180373360e34 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -299,6 +299,7 @@ union kvm_mmu_extended_role {
299 unsigned int cr4_smap:1; 299 unsigned int cr4_smap:1;
300 unsigned int cr4_smep:1; 300 unsigned int cr4_smep:1;
301 unsigned int cr4_la57:1; 301 unsigned int cr4_la57:1;
302 unsigned int maxphyaddr:6;
302 }; 303 };
303}; 304};
304 305
@@ -397,6 +398,7 @@ struct kvm_mmu {
397 void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 398 void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
398 u64 *spte, const void *pte); 399 u64 *spte, const void *pte);
399 hpa_t root_hpa; 400 hpa_t root_hpa;
401 gpa_t root_cr3;
400 union kvm_mmu_role mmu_role; 402 union kvm_mmu_role mmu_role;
401 u8 root_level; 403 u8 root_level;
402 u8 shadow_root_level; 404 u8 shadow_root_level;
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 40616e805292..2779ace16d23 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1065,7 +1065,7 @@ static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
1065static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, 1065static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
1066 pmd_t *pmdp, pmd_t pmd) 1066 pmd_t *pmdp, pmd_t pmd)
1067{ 1067{
1068 native_set_pmd(pmdp, pmd); 1068 set_pmd(pmdp, pmd);
1069} 1069}
1070 1070
1071static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, 1071static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 780f2b42c8ef..5e49a0acb5ee 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -25,7 +25,6 @@
25#define KERNEL_DS MAKE_MM_SEG(-1UL) 25#define KERNEL_DS MAKE_MM_SEG(-1UL)
26#define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX) 26#define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX)
27 27
28#define get_ds() (KERNEL_DS)
29#define get_fs() (current->thread.addr_limit) 28#define get_fs() (current->thread.addr_limit)
30static inline void set_fs(mm_segment_t fs) 29static inline void set_fs(mm_segment_t fs)
31{ 30{
@@ -284,7 +283,7 @@ do { \
284 __put_user_goto(x, ptr, "l", "k", "ir", label); \ 283 __put_user_goto(x, ptr, "l", "k", "ir", label); \
285 break; \ 284 break; \
286 case 8: \ 285 case 8: \
287 __put_user_goto_u64((__typeof__(*ptr))(x), ptr, label); \ 286 __put_user_goto_u64(x, ptr, label); \
288 break; \ 287 break; \
289 default: \ 288 default: \
290 __put_user_bad(); \ 289 __put_user_bad(); \
@@ -431,8 +430,10 @@ do { \
431({ \ 430({ \
432 __label__ __pu_label; \ 431 __label__ __pu_label; \
433 int __pu_err = -EFAULT; \ 432 int __pu_err = -EFAULT; \
433 __typeof__(*(ptr)) __pu_val; \
434 __pu_val = x; \
434 __uaccess_begin(); \ 435 __uaccess_begin(); \
435 __put_user_size((x), (ptr), (size), __pu_label); \ 436 __put_user_size(__pu_val, (ptr), (size), __pu_label); \
436 __pu_err = 0; \ 437 __pu_err = 0; \
437__pu_label: \ 438__pu_label: \
438 __uaccess_end(); \ 439 __uaccess_end(); \
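
In the __put_user_size()/put_user path above, the user-supplied expression is now evaluated exactly once into a temporary of the destination's type before __uaccess_begin() opens the user-access window, and the dropped (__typeof__(*ptr)) cast in the 8-byte case is replaced by that single implicit conversion. A minimal userspace sketch of the pattern (plain C with the GCC __typeof__ extension, not the kernel macros):

#include <stdio.h>

static int calls;

static int next_value(void)
{
	calls++;			/* visible side effect */
	return 42;
}

/* Capture 'x' once, with the destination's type, before doing anything
 * else - the shape of the new __pu_val temporary. */
#define PUT_ONCE(x, ptr)				\
do {							\
	__typeof__(*(ptr)) __val = (x);			\
	*(ptr) = __val;					\
} while (0)

int main(void)
{
	long dst = 0;

	PUT_ONCE(next_value(), &dst);	/* int result converted to long once */
	printf("dst=%ld calls=%d\n", dst, calls);
	return 0;
}
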
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index e652a7cc6186..3f697a9e3f59 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -48,7 +48,8 @@ enum {
48 BIOS_STATUS_SUCCESS = 0, 48 BIOS_STATUS_SUCCESS = 0,
49 BIOS_STATUS_UNIMPLEMENTED = -ENOSYS, 49 BIOS_STATUS_UNIMPLEMENTED = -ENOSYS,
50 BIOS_STATUS_EINVAL = -EINVAL, 50 BIOS_STATUS_EINVAL = -EINVAL,
51 BIOS_STATUS_UNAVAIL = -EBUSY 51 BIOS_STATUS_UNAVAIL = -EBUSY,
52 BIOS_STATUS_ABORT = -EINTR,
52}; 53};
53 54
54/* Address map parameters */ 55/* Address map parameters */
@@ -167,4 +168,9 @@ extern long system_serial_number;
167 168
168extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ 169extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
169 170
171/*
172 * EFI runtime lock; cf. firmware/efi/runtime-wrappers.c for details
173 */
174extern struct semaphore __efi_uv_runtime_lock;
175
170#endif /* _ASM_X86_UV_BIOS_H */ 176#endif /* _ASM_X86_UV_BIOS_H */
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
index f6648e9928b3..efe701b7c6ce 100644
--- a/arch/x86/include/uapi/asm/Kbuild
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -3,3 +3,4 @@ include include/uapi/asm-generic/Kbuild.asm
3generated-y += unistd_32.h 3generated-y += unistd_32.h
4generated-y += unistd_64.h 4generated-y += unistd_64.h
5generated-y += unistd_x32.h 5generated-y += unistd_x32.h
6generic-y += socket.h
diff --git a/arch/x86/include/uapi/asm/socket.h b/arch/x86/include/uapi/asm/socket.h
deleted file mode 100644
index 6b71384b9d8b..000000000000
--- a/arch/x86/include/uapi/asm/socket.h
+++ /dev/null
@@ -1 +0,0 @@
1#include <asm-generic/socket.h>
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 672c7225cb1b..6ce290c506d9 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -784,6 +784,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
784 quirk_no_way_out(i, m, regs); 784 quirk_no_way_out(i, m, regs);
785 785
786 if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { 786 if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
787 m->bank = i;
787 mce_read_aux(m, i); 788 mce_read_aux(m, i);
788 *msg = tmp; 789 *msg = tmp;
789 return 1; 790 return 1;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index bbffa6c54697..c07958b59f50 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -335,6 +335,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
335 unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; 335 unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
336 unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; 336 unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
337 unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; 337 unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
338 unsigned f_la57 = 0;
338 339
339 /* cpuid 1.edx */ 340 /* cpuid 1.edx */
340 const u32 kvm_cpuid_1_edx_x86_features = 341 const u32 kvm_cpuid_1_edx_x86_features =
@@ -489,7 +490,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
489 // TSC_ADJUST is emulated 490 // TSC_ADJUST is emulated
490 entry->ebx |= F(TSC_ADJUST); 491 entry->ebx |= F(TSC_ADJUST);
491 entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; 492 entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
493 f_la57 = entry->ecx & F(LA57);
492 cpuid_mask(&entry->ecx, CPUID_7_ECX); 494 cpuid_mask(&entry->ecx, CPUID_7_ECX);
495 /* Set LA57 based on hardware capability. */
496 entry->ecx |= f_la57;
493 entry->ecx |= f_umip; 497 entry->ecx |= f_umip;
494 /* PKU is not yet implemented for shadow paging. */ 498 /* PKU is not yet implemented for shadow paging. */
495 if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) 499 if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index da9c42349b1f..f2d1d230d5b8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3555,6 +3555,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3555 &invalid_list); 3555 &invalid_list);
3556 mmu->root_hpa = INVALID_PAGE; 3556 mmu->root_hpa = INVALID_PAGE;
3557 } 3557 }
3558 mmu->root_cr3 = 0;
3558 } 3559 }
3559 3560
3560 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3561 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
@@ -3610,6 +3611,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3610 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); 3611 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3611 } else 3612 } else
3612 BUG(); 3613 BUG();
3614 vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3613 3615
3614 return 0; 3616 return 0;
3615} 3617}
@@ -3618,10 +3620,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3618{ 3620{
3619 struct kvm_mmu_page *sp; 3621 struct kvm_mmu_page *sp;
3620 u64 pdptr, pm_mask; 3622 u64 pdptr, pm_mask;
3621 gfn_t root_gfn; 3623 gfn_t root_gfn, root_cr3;
3622 int i; 3624 int i;
3623 3625
3624 root_gfn = vcpu->arch.mmu->get_cr3(vcpu) >> PAGE_SHIFT; 3626 root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3627 root_gfn = root_cr3 >> PAGE_SHIFT;
3625 3628
3626 if (mmu_check_root(vcpu, root_gfn)) 3629 if (mmu_check_root(vcpu, root_gfn))
3627 return 1; 3630 return 1;
@@ -3646,7 +3649,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3646 ++sp->root_count; 3649 ++sp->root_count;
3647 spin_unlock(&vcpu->kvm->mmu_lock); 3650 spin_unlock(&vcpu->kvm->mmu_lock);
3648 vcpu->arch.mmu->root_hpa = root; 3651 vcpu->arch.mmu->root_hpa = root;
3649 return 0; 3652 goto set_root_cr3;
3650 } 3653 }
3651 3654
3652 /* 3655 /*
@@ -3712,6 +3715,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3712 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root); 3715 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
3713 } 3716 }
3714 3717
3718set_root_cr3:
3719 vcpu->arch.mmu->root_cr3 = root_cr3;
3720
3715 return 0; 3721 return 0;
3716} 3722}
3717 3723
@@ -4163,7 +4169,7 @@ static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4163 struct kvm_mmu_root_info root; 4169 struct kvm_mmu_root_info root;
4164 struct kvm_mmu *mmu = vcpu->arch.mmu; 4170 struct kvm_mmu *mmu = vcpu->arch.mmu;
4165 4171
4166 root.cr3 = mmu->get_cr3(vcpu); 4172 root.cr3 = mmu->root_cr3;
4167 root.hpa = mmu->root_hpa; 4173 root.hpa = mmu->root_hpa;
4168 4174
4169 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 4175 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
@@ -4176,6 +4182,7 @@ static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4176 } 4182 }
4177 4183
4178 mmu->root_hpa = root.hpa; 4184 mmu->root_hpa = root.hpa;
4185 mmu->root_cr3 = root.cr3;
4179 4186
4180 return i < KVM_MMU_NUM_PREV_ROOTS; 4187 return i < KVM_MMU_NUM_PREV_ROOTS;
4181} 4188}
@@ -4770,6 +4777,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
4770 ext.cr4_pse = !!is_pse(vcpu); 4777 ext.cr4_pse = !!is_pse(vcpu);
4771 ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE); 4778 ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
4772 ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57); 4779 ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
4780 ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
4773 4781
4774 ext.valid = 1; 4782 ext.valid = 1;
4775 4783
@@ -5516,11 +5524,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
5516 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 5524 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5517 5525
5518 vcpu->arch.root_mmu.root_hpa = INVALID_PAGE; 5526 vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
5527 vcpu->arch.root_mmu.root_cr3 = 0;
5519 vcpu->arch.root_mmu.translate_gpa = translate_gpa; 5528 vcpu->arch.root_mmu.translate_gpa = translate_gpa;
5520 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 5529 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5521 vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; 5530 vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5522 5531
5523 vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE; 5532 vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
5533 vcpu->arch.guest_mmu.root_cr3 = 0;
5524 vcpu->arch.guest_mmu.translate_gpa = translate_gpa; 5534 vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
5525 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 5535 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5526 vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; 5536 vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
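
The mmu.c changes above make each root (current and prev_roots[]) remember the guest CR3 it was built for, so cached_root_available() compares the new CR3 against that recorded value rather than re-reading mmu->get_cr3() at lookup time. A toy model of keying a cached root by its CR3 (simplified bookkeeping, not KVM's actual list handling):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_PREV_ROOTS 3

struct root_info {
	uint64_t cr3;	/* guest CR3 this root was built for */
	uint64_t hpa;	/* root table address */
};

struct toy_mmu {
	struct root_info current;
	struct root_info prev[NUM_PREV_ROOTS];
};

/* Reuse a cached root if one was built for new_cr3; otherwise demote the
 * current root into the cache (crude eviction, for illustration only). */
static bool cached_root_available(struct toy_mmu *mmu, uint64_t new_cr3)
{
	int i;

	for (i = 0; i < NUM_PREV_ROOTS; i++) {
		if (mmu->prev[i].cr3 == new_cr3) {
			struct root_info hit = mmu->prev[i];

			mmu->prev[i] = mmu->current;
			mmu->current = hit;
			return true;
		}
	}
	mmu->prev[0] = mmu->current;
	return false;
}

int main(void)
{
	struct toy_mmu mmu = {
		.current = { .cr3 = 0x1000, .hpa = 0xa000 },
		.prev	 = { { .cr3 = 0x2000, .hpa = 0xb000 } },
	};

	printf("hit=%d hpa=%#llx\n", cached_root_available(&mmu, 0x2000),
	       (unsigned long long)mmu.current.hpa);
	return 0;
}
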
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d8ea4ebd79e7..d737a51a53ca 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2473,6 +2473,10 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2473 (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2473 (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2474 return -EINVAL; 2474 return -EINVAL;
2475 2475
2476 if (!nested_cpu_has_preemption_timer(vmcs12) &&
2477 nested_cpu_has_save_preemption_timer(vmcs12))
2478 return -EINVAL;
2479
2476 if (nested_cpu_has_ept(vmcs12) && 2480 if (nested_cpu_has_ept(vmcs12) &&
2477 !valid_ept_address(vcpu, vmcs12->ept_pointer)) 2481 !valid_ept_address(vcpu, vmcs12->ept_pointer))
2478 return -EINVAL; 2482 return -EINVAL;
@@ -5557,9 +5561,11 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
5557 * secondary cpu-based controls. Do not include those that 5561 * secondary cpu-based controls. Do not include those that
5558 * depend on CPUID bits, they are added later by vmx_cpuid_update. 5562 * depend on CPUID bits, they are added later by vmx_cpuid_update.
5559 */ 5563 */
5560 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 5564 if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
5561 msrs->secondary_ctls_low, 5565 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
5562 msrs->secondary_ctls_high); 5566 msrs->secondary_ctls_low,
5567 msrs->secondary_ctls_high);
5568
5563 msrs->secondary_ctls_low = 0; 5569 msrs->secondary_ctls_low = 0;
5564 msrs->secondary_ctls_high &= 5570 msrs->secondary_ctls_high &=
5565 SECONDARY_EXEC_DESC | 5571 SECONDARY_EXEC_DESC |
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 95d618045001..30a6bcd735ec 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -863,7 +863,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
863 if (!entry_only) 863 if (!entry_only)
864 j = find_msr(&m->host, msr); 864 j = find_msr(&m->host, msr);
865 865
866 if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) { 866 if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) ||
867 (j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) {
867 printk_once(KERN_WARNING "Not enough msr switch entries. " 868 printk_once(KERN_WARNING "Not enough msr switch entries. "
868 "Can't add msr %x\n", msr); 869 "Can't add msr %x\n", msr);
869 return; 870 return;
@@ -1193,21 +1194,6 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
1193 if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) 1194 if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
1194 return; 1195 return;
1195 1196
1196 /*
1197 * First handle the simple case where no cmpxchg is necessary; just
1198 * allow posting non-urgent interrupts.
1199 *
1200 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
1201 * PI.NDST: pi_post_block will do it for us and the wakeup_handler
1202 * expects the VCPU to be on the blocked_vcpu_list that matches
1203 * PI.NDST.
1204 */
1205 if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
1206 vcpu->cpu == cpu) {
1207 pi_clear_sn(pi_desc);
1208 return;
1209 }
1210
1211 /* The full case. */ 1197 /* The full case. */
1212 do { 1198 do {
1213 old.control = new.control = pi_desc->control; 1199 old.control = new.control = pi_desc->control;
@@ -1222,6 +1208,17 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
1222 new.sn = 0; 1208 new.sn = 0;
1223 } while (cmpxchg64(&pi_desc->control, old.control, 1209 } while (cmpxchg64(&pi_desc->control, old.control,
1224 new.control) != old.control); 1210 new.control) != old.control);
1211
1212 /*
1213 * Clear SN before reading the bitmap. The VT-d firmware
1214 * writes the bitmap and reads SN atomically (5.2.3 in the
1215 * spec), so it doesn't really have a memory barrier that
1216 * pairs with this, but we cannot do that and we need one.
1217 */
1218 smp_mb__after_atomic();
1219
1220 if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS))
1221 pi_set_on(pi_desc);
1225} 1222}
1226 1223
1227/* 1224/*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 99328954c2fc..0ac0a64c7790 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -337,16 +337,16 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
337 return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); 337 return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
338} 338}
339 339
340static inline void pi_clear_sn(struct pi_desc *pi_desc) 340static inline void pi_set_sn(struct pi_desc *pi_desc)
341{ 341{
342 return clear_bit(POSTED_INTR_SN, 342 return set_bit(POSTED_INTR_SN,
343 (unsigned long *)&pi_desc->control); 343 (unsigned long *)&pi_desc->control);
344} 344}
345 345
346static inline void pi_set_sn(struct pi_desc *pi_desc) 346static inline void pi_set_on(struct pi_desc *pi_desc)
347{ 347{
348 return set_bit(POSTED_INTR_SN, 348 set_bit(POSTED_INTR_ON,
349 (unsigned long *)&pi_desc->control); 349 (unsigned long *)&pi_desc->control);
350} 350}
351 351
352static inline void pi_clear_on(struct pi_desc *pi_desc) 352static inline void pi_clear_on(struct pi_desc *pi_desc)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e67ecf25e690..941f932373d0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7801,7 +7801,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
7801 * 1) We should set ->mode before checking ->requests. Please see 7801 * 1) We should set ->mode before checking ->requests. Please see
7802 * the comment in kvm_vcpu_exiting_guest_mode(). 7802 * the comment in kvm_vcpu_exiting_guest_mode().
7803 * 7803 *
7804 * 2) For APICv, we should set ->mode before checking PIR.ON. This 7804 * 2) For APICv, we should set ->mode before checking PID.ON. This
7805 * pairs with the memory barrier implicit in pi_test_and_set_on 7805 * pairs with the memory barrier implicit in pi_test_and_set_on
7806 * (see vmx_deliver_posted_interrupt). 7806 * (see vmx_deliver_posted_interrupt).
7807 * 7807 *
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 6521134057e8..3c4568f8fb28 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -117,67 +117,12 @@ __visible bool ex_handler_fprestore(const struct exception_table_entry *fixup,
117} 117}
118EXPORT_SYMBOL_GPL(ex_handler_fprestore); 118EXPORT_SYMBOL_GPL(ex_handler_fprestore);
119 119
120/* Helper to check whether a uaccess fault indicates a kernel bug. */
121static bool bogus_uaccess(struct pt_regs *regs, int trapnr,
122 unsigned long fault_addr)
123{
124 /* This is the normal case: #PF with a fault address in userspace. */
125 if (trapnr == X86_TRAP_PF && fault_addr < TASK_SIZE_MAX)
126 return false;
127
128 /*
129 * This code can be reached for machine checks, but only if the #MC
130 * handler has already decided that it looks like a candidate for fixup.
131 * This e.g. happens when attempting to access userspace memory which
132 * the CPU can't access because of uncorrectable bad memory.
133 */
134 if (trapnr == X86_TRAP_MC)
135 return false;
136
137 /*
138 * There are two remaining exception types we might encounter here:
139 * - #PF for faulting accesses to kernel addresses
140 * - #GP for faulting accesses to noncanonical addresses
141 * Complain about anything else.
142 */
143 if (trapnr != X86_TRAP_PF && trapnr != X86_TRAP_GP) {
144 WARN(1, "unexpected trap %d in uaccess\n", trapnr);
145 return false;
146 }
147
148 /*
149 * This is a faulting memory access in kernel space, on a kernel
150 * address, in a usercopy function. This can e.g. be caused by improper
151 * use of helpers like __put_user and by improper attempts to access
152 * userspace addresses in KERNEL_DS regions.
153 * The one (semi-)legitimate exception are probe_kernel_{read,write}(),
154 * which can be invoked from places like kgdb, /dev/mem (for reading)
155 * and privileged BPF code (for reading).
156 * The probe_kernel_*() functions set the kernel_uaccess_faults_ok flag
157 * to tell us that faulting on kernel addresses, and even noncanonical
158 * addresses, in a userspace accessor does not necessarily imply a
159 * kernel bug, root might just be doing weird stuff.
160 */
161 if (current->kernel_uaccess_faults_ok)
162 return false;
163
164 /* This is bad. Refuse the fixup so that we go into die(). */
165 if (trapnr == X86_TRAP_PF) {
166 pr_emerg("BUG: pagefault on kernel address 0x%lx in non-whitelisted uaccess\n",
167 fault_addr);
168 } else {
169 pr_emerg("BUG: GPF in non-whitelisted uaccess (non-canonical address?)\n");
170 }
171 return true;
172}
173
174__visible bool ex_handler_uaccess(const struct exception_table_entry *fixup, 120__visible bool ex_handler_uaccess(const struct exception_table_entry *fixup,
175 struct pt_regs *regs, int trapnr, 121 struct pt_regs *regs, int trapnr,
176 unsigned long error_code, 122 unsigned long error_code,
177 unsigned long fault_addr) 123 unsigned long fault_addr)
178{ 124{
179 if (bogus_uaccess(regs, trapnr, fault_addr)) 125 WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
180 return false;
181 regs->ip = ex_fixup_addr(fixup); 126 regs->ip = ex_fixup_addr(fixup);
182 return true; 127 return true;
183} 128}
@@ -188,8 +133,6 @@ __visible bool ex_handler_ext(const struct exception_table_entry *fixup,
188 unsigned long error_code, 133 unsigned long error_code,
189 unsigned long fault_addr) 134 unsigned long fault_addr)
190{ 135{
191 if (bogus_uaccess(regs, trapnr, fault_addr))
192 return false;
193 /* Special hack for uaccess_err */ 136 /* Special hack for uaccess_err */
194 current->thread.uaccess_err = 1; 137 current->thread.uaccess_err = 1;
195 regs->ip = ex_fixup_addr(fixup); 138 regs->ip = ex_fixup_addr(fixup);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 4f8972311a77..14e6119838a6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -230,6 +230,29 @@ static bool __cpa_pfn_in_highmap(unsigned long pfn)
230 230
231#endif 231#endif
232 232
233/*
234 * See set_mce_nospec().
235 *
236 * Machine check recovery code needs to change cache mode of poisoned pages to
237 * UC to avoid speculative access logging another error. But passing the
238 * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a
239 * speculative access. So we cheat and flip the top bit of the address. This
240 * works fine for the code that updates the page tables. But at the end of the
241 * process we need to flush the TLB and cache and the non-canonical address
242 * causes a #GP fault when used by the INVLPG and CLFLUSH instructions.
243 *
244 * But in the common case we already have a canonical address. This code
245 * will fix the top bit if needed and is a no-op otherwise.
246 */
247static inline unsigned long fix_addr(unsigned long addr)
248{
249#ifdef CONFIG_X86_64
250 return (long)(addr << 1) >> 1;
251#else
252 return addr;
253#endif
254}
255
233static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx) 256static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx)
234{ 257{
235 if (cpa->flags & CPA_PAGES_ARRAY) { 258 if (cpa->flags & CPA_PAGES_ARRAY) {
@@ -313,7 +336,7 @@ void __cpa_flush_tlb(void *data)
313 unsigned int i; 336 unsigned int i;
314 337
315 for (i = 0; i < cpa->numpages; i++) 338 for (i = 0; i < cpa->numpages; i++)
316 __flush_tlb_one_kernel(__cpa_addr(cpa, i)); 339 __flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
317} 340}
318 341
319static void cpa_flush(struct cpa_data *data, int cache) 342static void cpa_flush(struct cpa_data *data, int cache)
@@ -347,7 +370,7 @@ static void cpa_flush(struct cpa_data *data, int cache)
347 * Only flush present addresses: 370 * Only flush present addresses:
348 */ 371 */
349 if (pte && (pte_val(*pte) & _PAGE_PRESENT)) 372 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
350 clflush_cache_range_opt((void *)addr, PAGE_SIZE); 373 clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE);
351 } 374 }
352 mb(); 375 mb();
353} 376}
@@ -1627,29 +1650,6 @@ out:
1627 return ret; 1650 return ret;
1628} 1651}
1629 1652
1630/*
1631 * Machine check recovery code needs to change cache mode of poisoned
1632 * pages to UC to avoid speculative access logging another error. But
1633 * passing the address of the 1:1 mapping to set_memory_uc() is a fine
1634 * way to encourage a speculative access. So we cheat and flip the top
1635 * bit of the address. This works fine for the code that updates the
1636 * page tables. But at the end of the process we need to flush the cache
1637 * and the non-canonical address causes a #GP fault when used by the
1638 * CLFLUSH instruction.
1639 *
1640 * But in the common case we already have a canonical address. This code
1641 * will fix the top bit if needed and is a no-op otherwise.
1642 */
1643static inline unsigned long make_addr_canonical_again(unsigned long addr)
1644{
1645#ifdef CONFIG_X86_64
1646 return (long)(addr << 1) >> 1;
1647#else
1648 return addr;
1649#endif
1650}
1651
1652
1653static int change_page_attr_set_clr(unsigned long *addr, int numpages, 1653static int change_page_attr_set_clr(unsigned long *addr, int numpages,
1654 pgprot_t mask_set, pgprot_t mask_clr, 1654 pgprot_t mask_set, pgprot_t mask_clr,
1655 int force_split, int in_flag, 1655 int force_split, int in_flag,
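
fix_addr() above (relocated from the old make_addr_canonical_again() and now also applied on the TLB-flush path) undoes the deliberate top-bit flip used to hide poisoned 1:1-mapping addresses: shifting left by one and arithmetic-shifting back copies bit 62 into bit 63, which restores a flipped address and leaves an already-canonical one untouched. A standalone 64-bit demo (the addresses are made up):

#include <stdio.h>

static unsigned long fix_addr(unsigned long addr)
{
	return (long)(addr << 1) >> 1;	/* sign-extend bit 62 into bit 63 */
}

int main(void)
{
	unsigned long canonical = 0xffff888012345000UL;		/* typical direct-map address */
	unsigned long decoy	= canonical & ~(1UL << 63);	/* top bit flipped on purpose */

	printf("canonical: %#lx -> %#lx\n", canonical, fix_addr(canonical));
	printf("decoy:     %#lx -> %#lx\n", decoy, fix_addr(decoy));
	return 0;
}
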
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 5542303c43d9..afabf597c855 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -881,20 +881,41 @@ xadd: if (is_imm8(insn->off))
881 case BPF_JMP | BPF_JSLT | BPF_X: 881 case BPF_JMP | BPF_JSLT | BPF_X:
882 case BPF_JMP | BPF_JSGE | BPF_X: 882 case BPF_JMP | BPF_JSGE | BPF_X:
883 case BPF_JMP | BPF_JSLE | BPF_X: 883 case BPF_JMP | BPF_JSLE | BPF_X:
884 case BPF_JMP32 | BPF_JEQ | BPF_X:
885 case BPF_JMP32 | BPF_JNE | BPF_X:
886 case BPF_JMP32 | BPF_JGT | BPF_X:
887 case BPF_JMP32 | BPF_JLT | BPF_X:
888 case BPF_JMP32 | BPF_JGE | BPF_X:
889 case BPF_JMP32 | BPF_JLE | BPF_X:
890 case BPF_JMP32 | BPF_JSGT | BPF_X:
891 case BPF_JMP32 | BPF_JSLT | BPF_X:
892 case BPF_JMP32 | BPF_JSGE | BPF_X:
893 case BPF_JMP32 | BPF_JSLE | BPF_X:
884 /* cmp dst_reg, src_reg */ 894 /* cmp dst_reg, src_reg */
885 EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x39, 895 if (BPF_CLASS(insn->code) == BPF_JMP)
886 add_2reg(0xC0, dst_reg, src_reg)); 896 EMIT1(add_2mod(0x48, dst_reg, src_reg));
897 else if (is_ereg(dst_reg) || is_ereg(src_reg))
898 EMIT1(add_2mod(0x40, dst_reg, src_reg));
899 EMIT2(0x39, add_2reg(0xC0, dst_reg, src_reg));
887 goto emit_cond_jmp; 900 goto emit_cond_jmp;
888 901
889 case BPF_JMP | BPF_JSET | BPF_X: 902 case BPF_JMP | BPF_JSET | BPF_X:
903 case BPF_JMP32 | BPF_JSET | BPF_X:
890 /* test dst_reg, src_reg */ 904 /* test dst_reg, src_reg */
891 EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x85, 905 if (BPF_CLASS(insn->code) == BPF_JMP)
892 add_2reg(0xC0, dst_reg, src_reg)); 906 EMIT1(add_2mod(0x48, dst_reg, src_reg));
907 else if (is_ereg(dst_reg) || is_ereg(src_reg))
908 EMIT1(add_2mod(0x40, dst_reg, src_reg));
909 EMIT2(0x85, add_2reg(0xC0, dst_reg, src_reg));
893 goto emit_cond_jmp; 910 goto emit_cond_jmp;
894 911
895 case BPF_JMP | BPF_JSET | BPF_K: 912 case BPF_JMP | BPF_JSET | BPF_K:
913 case BPF_JMP32 | BPF_JSET | BPF_K:
896 /* test dst_reg, imm32 */ 914 /* test dst_reg, imm32 */
897 EMIT1(add_1mod(0x48, dst_reg)); 915 if (BPF_CLASS(insn->code) == BPF_JMP)
916 EMIT1(add_1mod(0x48, dst_reg));
917 else if (is_ereg(dst_reg))
918 EMIT1(add_1mod(0x40, dst_reg));
898 EMIT2_off32(0xF7, add_1reg(0xC0, dst_reg), imm32); 919 EMIT2_off32(0xF7, add_1reg(0xC0, dst_reg), imm32);
899 goto emit_cond_jmp; 920 goto emit_cond_jmp;
900 921
@@ -908,8 +929,21 @@ xadd: if (is_imm8(insn->off))
908 case BPF_JMP | BPF_JSLT | BPF_K: 929 case BPF_JMP | BPF_JSLT | BPF_K:
909 case BPF_JMP | BPF_JSGE | BPF_K: 930 case BPF_JMP | BPF_JSGE | BPF_K:
910 case BPF_JMP | BPF_JSLE | BPF_K: 931 case BPF_JMP | BPF_JSLE | BPF_K:
932 case BPF_JMP32 | BPF_JEQ | BPF_K:
933 case BPF_JMP32 | BPF_JNE | BPF_K:
934 case BPF_JMP32 | BPF_JGT | BPF_K:
935 case BPF_JMP32 | BPF_JLT | BPF_K:
936 case BPF_JMP32 | BPF_JGE | BPF_K:
937 case BPF_JMP32 | BPF_JLE | BPF_K:
938 case BPF_JMP32 | BPF_JSGT | BPF_K:
939 case BPF_JMP32 | BPF_JSLT | BPF_K:
940 case BPF_JMP32 | BPF_JSGE | BPF_K:
941 case BPF_JMP32 | BPF_JSLE | BPF_K:
911 /* cmp dst_reg, imm8/32 */ 942 /* cmp dst_reg, imm8/32 */
912 EMIT1(add_1mod(0x48, dst_reg)); 943 if (BPF_CLASS(insn->code) == BPF_JMP)
944 EMIT1(add_1mod(0x48, dst_reg));
945 else if (is_ereg(dst_reg))
946 EMIT1(add_1mod(0x40, dst_reg));
913 947
914 if (is_imm8(imm32)) 948 if (is_imm8(imm32))
915 EMIT3(0x83, add_1reg(0xF8, dst_reg), imm32); 949 EMIT3(0x83, add_1reg(0xF8, dst_reg), imm32);
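
The new BPF_JMP32 cases above reuse the existing compare/test opcodes; what changes is the REX prefix: BPF_JMP keeps the unconditional REX.W (the 0x48-based add_*mod() byte) for 64-bit operands, while BPF_JMP32 emits a 0x40-based REX only when an extended register (r8-r15) forces one, and nothing otherwise. A simplified standalone sketch of that prefix decision (not the real emitter, which builds the byte via add_2mod()):

#include <stdbool.h>
#include <stdio.h>

/* Returns the REX prefix byte to emit before "cmp reg,reg", or 0 when
 * no prefix is needed (32-bit compare on legacy registers only). */
static unsigned char rex_for_cmp(bool is_jmp64, bool dst_ext, bool src_ext)
{
	unsigned char rex = 0x40;		/* base REX */

	if (is_jmp64)
		rex |= 0x08;			/* REX.W: 64-bit operand size */
	if (src_ext)
		rex |= 0x04;			/* REX.R: reg field uses r8-r15 */
	if (dst_ext)
		rex |= 0x01;			/* REX.B: r/m field uses r8-r15 */

	if (!is_jmp64 && !dst_ext && !src_ext)
		return 0;			/* BPF_JMP32 on legacy regs: no prefix */
	return rex;
}

int main(void)
{
	printf("jmp64 rax,rbx -> %#x\n", rex_for_cmp(true,  false, false));	/* 0x48 */
	printf("jmp32 rax,rbx -> %#x\n", rex_for_cmp(false, false, false));	/* 0 = no prefix */
	printf("jmp32 r9,rbx  -> %#x\n", rex_for_cmp(false, true,  false));	/* 0x41 */
	return 0;
}
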
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 8f6cc71e0848..0d9cdffce6ac 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -2072,7 +2072,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
2072 case BPF_JMP | BPF_JSGT | BPF_X: 2072 case BPF_JMP | BPF_JSGT | BPF_X:
2073 case BPF_JMP | BPF_JSLE | BPF_X: 2073 case BPF_JMP | BPF_JSLE | BPF_X:
2074 case BPF_JMP | BPF_JSLT | BPF_X: 2074 case BPF_JMP | BPF_JSLT | BPF_X:
2075 case BPF_JMP | BPF_JSGE | BPF_X: { 2075 case BPF_JMP | BPF_JSGE | BPF_X:
2076 case BPF_JMP32 | BPF_JEQ | BPF_X:
2077 case BPF_JMP32 | BPF_JNE | BPF_X:
2078 case BPF_JMP32 | BPF_JGT | BPF_X:
2079 case BPF_JMP32 | BPF_JLT | BPF_X:
2080 case BPF_JMP32 | BPF_JGE | BPF_X:
2081 case BPF_JMP32 | BPF_JLE | BPF_X:
2082 case BPF_JMP32 | BPF_JSGT | BPF_X:
2083 case BPF_JMP32 | BPF_JSLE | BPF_X:
2084 case BPF_JMP32 | BPF_JSLT | BPF_X:
2085 case BPF_JMP32 | BPF_JSGE | BPF_X: {
2086 bool is_jmp64 = BPF_CLASS(insn->code) == BPF_JMP;
2076 u8 dreg_lo = dstk ? IA32_EAX : dst_lo; 2087 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
2077 u8 dreg_hi = dstk ? IA32_EDX : dst_hi; 2088 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
2078 u8 sreg_lo = sstk ? IA32_ECX : src_lo; 2089 u8 sreg_lo = sstk ? IA32_ECX : src_lo;
@@ -2081,25 +2092,35 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
2081 if (dstk) { 2092 if (dstk) {
2082 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), 2093 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
2083 STACK_VAR(dst_lo)); 2094 STACK_VAR(dst_lo));
2084 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), 2095 if (is_jmp64)
2085 STACK_VAR(dst_hi)); 2096 EMIT3(0x8B,
2097 add_2reg(0x40, IA32_EBP,
2098 IA32_EDX),
2099 STACK_VAR(dst_hi));
2086 } 2100 }
2087 2101
2088 if (sstk) { 2102 if (sstk) {
2089 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), 2103 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
2090 STACK_VAR(src_lo)); 2104 STACK_VAR(src_lo));
2091 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), 2105 if (is_jmp64)
2092 STACK_VAR(src_hi)); 2106 EMIT3(0x8B,
2107 add_2reg(0x40, IA32_EBP,
2108 IA32_EBX),
2109 STACK_VAR(src_hi));
2093 } 2110 }
2094 2111
2095 /* cmp dreg_hi,sreg_hi */ 2112 if (is_jmp64) {
2096 EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi)); 2113 /* cmp dreg_hi,sreg_hi */
2097 EMIT2(IA32_JNE, 2); 2114 EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
2115 EMIT2(IA32_JNE, 2);
2116 }
2098 /* cmp dreg_lo,sreg_lo */ 2117 /* cmp dreg_lo,sreg_lo */
2099 EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo)); 2118 EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
2100 goto emit_cond_jmp; 2119 goto emit_cond_jmp;
2101 } 2120 }
2102 case BPF_JMP | BPF_JSET | BPF_X: { 2121 case BPF_JMP | BPF_JSET | BPF_X:
2122 case BPF_JMP32 | BPF_JSET | BPF_X: {
2123 bool is_jmp64 = BPF_CLASS(insn->code) == BPF_JMP;
2103 u8 dreg_lo = dstk ? IA32_EAX : dst_lo; 2124 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
2104 u8 dreg_hi = dstk ? IA32_EDX : dst_hi; 2125 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
2105 u8 sreg_lo = sstk ? IA32_ECX : src_lo; 2126 u8 sreg_lo = sstk ? IA32_ECX : src_lo;
@@ -2108,15 +2129,21 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
2108 if (dstk) { 2129 if (dstk) {
2109 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), 2130 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
2110 STACK_VAR(dst_lo)); 2131 STACK_VAR(dst_lo));
2111 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), 2132 if (is_jmp64)
2112 STACK_VAR(dst_hi)); 2133 EMIT3(0x8B,
2134 add_2reg(0x40, IA32_EBP,
2135 IA32_EDX),
2136 STACK_VAR(dst_hi));
2113 } 2137 }
2114 2138
2115 if (sstk) { 2139 if (sstk) {
2116 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), 2140 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
2117 STACK_VAR(src_lo)); 2141 STACK_VAR(src_lo));
2118 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), 2142 if (is_jmp64)
2119 STACK_VAR(src_hi)); 2143 EMIT3(0x8B,
2144 add_2reg(0x40, IA32_EBP,
2145 IA32_EBX),
2146 STACK_VAR(src_hi));
2120 } 2147 }
2121 /* and dreg_lo,sreg_lo */ 2148 /* and dreg_lo,sreg_lo */
2122 EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo)); 2149 EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
@@ -2126,32 +2153,39 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
2126 EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi)); 2153 EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
2127 goto emit_cond_jmp; 2154 goto emit_cond_jmp;
2128 } 2155 }
2129 case BPF_JMP | BPF_JSET | BPF_K: { 2156 case BPF_JMP | BPF_JSET | BPF_K:
2130 u32 hi; 2157 case BPF_JMP32 | BPF_JSET | BPF_K: {
2158 bool is_jmp64 = BPF_CLASS(insn->code) == BPF_JMP;
2131 u8 dreg_lo = dstk ? IA32_EAX : dst_lo; 2159 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
2132 u8 dreg_hi = dstk ? IA32_EDX : dst_hi; 2160 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
2133 u8 sreg_lo = IA32_ECX; 2161 u8 sreg_lo = IA32_ECX;
2134 u8 sreg_hi = IA32_EBX; 2162 u8 sreg_hi = IA32_EBX;
2163 u32 hi;
2135 2164
2136 if (dstk) { 2165 if (dstk) {
2137 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), 2166 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
2138 STACK_VAR(dst_lo)); 2167 STACK_VAR(dst_lo));
2139 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), 2168 if (is_jmp64)
2140 STACK_VAR(dst_hi)); 2169 EMIT3(0x8B,
2170 add_2reg(0x40, IA32_EBP,
2171 IA32_EDX),
2172 STACK_VAR(dst_hi));
2141 } 2173 }
2142 hi = imm32 & (1<<31) ? (u32)~0 : 0;
2143 2174
2144 /* mov ecx,imm32 */ 2175 /* mov ecx,imm32 */
2145 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32); 2176 EMIT2_off32(0xC7, add_1reg(0xC0, sreg_lo), imm32);
2146 /* mov ebx,imm32 */
2147 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
2148 2177
2149 /* and dreg_lo,sreg_lo */ 2178 /* and dreg_lo,sreg_lo */
2150 EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo)); 2179 EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
2151 /* and dreg_hi,sreg_hi */ 2180 if (is_jmp64) {
2152 EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi)); 2181 hi = imm32 & (1 << 31) ? (u32)~0 : 0;
2153 /* or dreg_lo,dreg_hi */ 2182 /* mov ebx,imm32 */
2154 EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi)); 2183 EMIT2_off32(0xC7, add_1reg(0xC0, sreg_hi), hi);
2184 /* and dreg_hi,sreg_hi */
2185 EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
2186 /* or dreg_lo,dreg_hi */
2187 EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
2188 }
2155 goto emit_cond_jmp; 2189 goto emit_cond_jmp;
2156 } 2190 }
2157 case BPF_JMP | BPF_JEQ | BPF_K: 2191 case BPF_JMP | BPF_JEQ | BPF_K:
@@ -2163,29 +2197,44 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
2163 case BPF_JMP | BPF_JSGT | BPF_K: 2197 case BPF_JMP | BPF_JSGT | BPF_K:
2164 case BPF_JMP | BPF_JSLE | BPF_K: 2198 case BPF_JMP | BPF_JSLE | BPF_K:
2165 case BPF_JMP | BPF_JSLT | BPF_K: 2199 case BPF_JMP | BPF_JSLT | BPF_K:
2166 case BPF_JMP | BPF_JSGE | BPF_K: { 2200 case BPF_JMP | BPF_JSGE | BPF_K:
2167 u32 hi; 2201 case BPF_JMP32 | BPF_JEQ | BPF_K:
2202 case BPF_JMP32 | BPF_JNE | BPF_K:
2203 case BPF_JMP32 | BPF_JGT | BPF_K:
2204 case BPF_JMP32 | BPF_JLT | BPF_K:
2205 case BPF_JMP32 | BPF_JGE | BPF_K:
2206 case BPF_JMP32 | BPF_JLE | BPF_K:
2207 case BPF_JMP32 | BPF_JSGT | BPF_K:
2208 case BPF_JMP32 | BPF_JSLE | BPF_K:
2209 case BPF_JMP32 | BPF_JSLT | BPF_K:
2210 case BPF_JMP32 | BPF_JSGE | BPF_K: {
2211 bool is_jmp64 = BPF_CLASS(insn->code) == BPF_JMP;
2168 u8 dreg_lo = dstk ? IA32_EAX : dst_lo; 2212 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
2169 u8 dreg_hi = dstk ? IA32_EDX : dst_hi; 2213 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
2170 u8 sreg_lo = IA32_ECX; 2214 u8 sreg_lo = IA32_ECX;
2171 u8 sreg_hi = IA32_EBX; 2215 u8 sreg_hi = IA32_EBX;
2216 u32 hi;
2172 2217
2173 if (dstk) { 2218 if (dstk) {
2174 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), 2219 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
2175 STACK_VAR(dst_lo)); 2220 STACK_VAR(dst_lo));
2176 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), 2221 if (is_jmp64)
2177 STACK_VAR(dst_hi)); 2222 EMIT3(0x8B,
2223 add_2reg(0x40, IA32_EBP,
2224 IA32_EDX),
2225 STACK_VAR(dst_hi));
2178 } 2226 }
2179 2227
2180 hi = imm32 & (1<<31) ? (u32)~0 : 0;
2181 /* mov ecx,imm32 */ 2228 /* mov ecx,imm32 */
2182 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32); 2229 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
2183 /* mov ebx,imm32 */ 2230 if (is_jmp64) {
2184 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi); 2231 hi = imm32 & (1 << 31) ? (u32)~0 : 0;
2185 2232 /* mov ebx,imm32 */
2186 /* cmp dreg_hi,sreg_hi */ 2233 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
2187 EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi)); 2234 /* cmp dreg_hi,sreg_hi */
2188 EMIT2(IA32_JNE, 2); 2235 EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
2236 EMIT2(IA32_JNE, 2);
2237 }
2189 /* cmp dreg_lo,sreg_lo */ 2238 /* cmp dreg_lo,sreg_lo */
2190 EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo)); 2239 EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
2191 2240
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
index 96f438d4b026..1421d5330b2c 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
@@ -44,7 +44,6 @@ static struct fixed_voltage_config bcm43xx_vmmc = {
44 */ 44 */
45 .microvolts = 2000000, /* 1.8V */ 45 .microvolts = 2000000, /* 1.8V */
46 .startup_delay = 250 * 1000, /* 250ms */ 46 .startup_delay = 250 * 1000, /* 250ms */
47 .enable_high = 1, /* active high */
48 .enabled_at_boot = 0, /* disabled at boot */ 47 .enabled_at_boot = 0, /* disabled at boot */
49 .init_data = &bcm43xx_vmmc_data, 48 .init_data = &bcm43xx_vmmc_data,
50}; 49};
diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index 4a6a5a26c582..eb33432f2f24 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -29,7 +29,8 @@
29 29
30struct uv_systab *uv_systab; 30struct uv_systab *uv_systab;
31 31
32s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) 32static s64 __uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
33 u64 a4, u64 a5)
33{ 34{
34 struct uv_systab *tab = uv_systab; 35 struct uv_systab *tab = uv_systab;
35 s64 ret; 36 s64 ret;
@@ -51,6 +52,19 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
51 52
52 return ret; 53 return ret;
53} 54}
55
56s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
57{
58 s64 ret;
59
60 if (down_interruptible(&__efi_uv_runtime_lock))
61 return BIOS_STATUS_ABORT;
62
63 ret = __uv_bios_call(which, a1, a2, a3, a4, a5);
64 up(&__efi_uv_runtime_lock);
65
66 return ret;
67}
54EXPORT_SYMBOL_GPL(uv_bios_call); 68EXPORT_SYMBOL_GPL(uv_bios_call);
55 69
56s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, 70s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
@@ -59,10 +73,15 @@ s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
59 unsigned long bios_flags; 73 unsigned long bios_flags;
60 s64 ret; 74 s64 ret;
61 75
76 if (down_interruptible(&__efi_uv_runtime_lock))
77 return BIOS_STATUS_ABORT;
78
62 local_irq_save(bios_flags); 79 local_irq_save(bios_flags);
63 ret = uv_bios_call(which, a1, a2, a3, a4, a5); 80 ret = __uv_bios_call(which, a1, a2, a3, a4, a5);
64 local_irq_restore(bios_flags); 81 local_irq_restore(bios_flags);
65 82
83 up(&__efi_uv_runtime_lock);
84
66 return ret; 85 return ret;
67} 86}
68 87
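
uv_bios_call() and uv_bios_call_irqsave() above are now serialized against the EFI runtime wrappers through the shared __efi_uv_runtime_lock semaphore (declared in the uv/bios.h hunk earlier), and an interrupted wait is reported as the new BIOS_STATUS_ABORT rather than entering the firmware concurrently. A userspace analog of that wrapper shape using a POSIX semaphore (stand-ins throughout, not the kernel API):

#include <semaphore.h>
#include <stdio.h>

#define STATUS_ABORT (-4)	/* stand-in for BIOS_STATUS_ABORT (-EINTR) */

static sem_t runtime_lock;

static long firmware_call_locked(int which)
{
	return which;		/* pretend to enter the firmware here */
}

static long firmware_call(int which)
{
	long ret;

	if (sem_wait(&runtime_lock) != 0)	/* e.g. interrupted by a signal */
		return STATUS_ABORT;

	ret = firmware_call_locked(which);
	sem_post(&runtime_lock);
	return ret;
}

int main(void)
{
	sem_init(&runtime_lock, 0, 1);
	printf("ret=%ld\n", firmware_call(7));
	sem_destroy(&runtime_lock);
	return 0;
}
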
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index f518b4744ff8..494eeb51e4e1 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -16,7 +16,6 @@ config 64BIT
16 16
17config X86_32 17config X86_32
18 def_bool !64BIT 18 def_bool !64BIT
19 select HAVE_AOUT
20 select ARCH_WANT_IPC_PARSE_VERSION 19 select ARCH_WANT_IPC_PARSE_VERSION
21 select MODULES_USE_ELF_REL 20 select MODULES_USE_ELF_REL
22 select CLONE_BACKWARDS 21 select CLONE_BACKWARDS