Diffstat (limited to 'arch/x86')
136 files changed, 6820 insertions(+), 3339 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 68f5578fe38e..cf42fc305419 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -46,6 +46,7 @@ config X86
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
+	select HAVE_ARCH_KMEMCHECK
 
 config OUTPUT_FORMAT
 	string
@@ -789,10 +790,26 @@ config X86_MCE
 	  to disable it. MCE support simply ignores non-MCE processors like
 	  the 386 and 486, so nearly everyone can say Y here.
 
+config X86_OLD_MCE
+	depends on X86_32 && X86_MCE
+	bool "Use legacy machine check code (will go away)"
+	default n
+	select X86_ANCIENT_MCE
+	---help---
+	  Use the old i386 machine check code. This is merely intended for
+	  testing in a transition period. Try this if you run into any machine
+	  check related software problems, but report the problem to
+	  linux-kernel. When in doubt say no.
+
+config X86_NEW_MCE
+	depends on X86_MCE
+	bool
+	default y if (!X86_OLD_MCE && X86_32) || X86_64
+
 config X86_MCE_INTEL
 	def_bool y
 	prompt "Intel MCE features"
-	depends on X86_64 && X86_MCE && X86_LOCAL_APIC
+	depends on X86_NEW_MCE && X86_LOCAL_APIC
 	---help---
 	  Additional support for intel specific MCE features such as
 	  the thermal monitor.
@@ -800,19 +817,36 @@ config X86_MCE_INTEL
 config X86_MCE_AMD
 	def_bool y
 	prompt "AMD MCE features"
-	depends on X86_64 && X86_MCE && X86_LOCAL_APIC
+	depends on X86_NEW_MCE && X86_LOCAL_APIC
 	---help---
 	  Additional support for AMD specific MCE features such as
 	  the DRAM Error Threshold.
 
+config X86_ANCIENT_MCE
+	def_bool n
+	depends on X86_32
+	prompt "Support for old Pentium 5 / WinChip machine checks"
+	---help---
+	  Include support for machine check handling on old Pentium 5 or WinChip
+	  systems. These typically need to be enabled explicitely on the command
+	  line.
+
 config X86_MCE_THRESHOLD
 	depends on X86_MCE_AMD || X86_MCE_INTEL
 	bool
 	default y
 
+config X86_MCE_INJECT
+	depends on X86_NEW_MCE
+	tristate "Machine check injector support"
+	---help---
+	  Provide support for injecting machine checks for testing purposes.
+	  If you don't know what a machine check is and you don't do kernel
+	  QA it is safe to say n.
+
 config X86_MCE_NONFATAL
 	tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
-	depends on X86_32 && X86_MCE
+	depends on X86_OLD_MCE
 	---help---
 	  Enabling this feature starts a timer that triggers every 5 seconds which
 	  will look at the machine check registers to see if anything happened.
@@ -825,11 +859,15 @@ config X86_MCE_NONFATAL
 
 config X86_MCE_P4THERMAL
 	bool "check for P4 thermal throttling interrupt."
-	depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP)
+	depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP)
 	---help---
 	  Enabling this feature will cause a message to be printed when the P4
 	  enters thermal throttling.
 
+config X86_THERMAL_VECTOR
+	def_bool y
+	depends on X86_MCE_P4THERMAL || X86_MCE_INTEL
+
 config VM86
 	bool "Enable VM86 support" if EMBEDDED
 	default y
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index edbd0ca62067..1b68659c41b4 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -81,6 +81,11 @@ ifdef CONFIG_CC_STACKPROTECTOR
         endif
 endif
 
+# Don't unroll struct assignments with kmemcheck enabled
+ifeq ($(CONFIG_KMEMCHECK),y)
+	KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
+endif
+
 # Stackpointer is addressed different for 32 bit and 64 bit x86
 sp-$(CONFIG_X86_32) := esp
 sp-$(CONFIG_X86_64) := rsp
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index ebe7deedd5b4..cfb0010fa940 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,8 @@
 # Arch-specific CryptoAPI modules.
 #
 
+obj-$(CONFIG_CRYPTO_FPU) += fpu.o
+
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
 obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 02af0af65497..4e663398f77f 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -21,6 +21,22 @@
 #include <asm/i387.h>
 #include <asm/aes.h>
 
+#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
+#define HAS_CTR
+#endif
+
+#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
+#define HAS_LRW
+#endif
+
+#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
+#define HAS_PCBC
+#endif
+
+#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
+#define HAS_XTS
+#endif
+
 struct async_aes_ctx {
 	struct cryptd_ablkcipher *cryptd_tfm;
 };
@@ -137,6 +153,41 @@ static struct crypto_alg aesni_alg = {
 	}
 };
 
+static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
+
+	aesni_enc(ctx, dst, src);
+}
+
+static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
+
+	aesni_dec(ctx, dst, src);
+}
+
+static struct crypto_alg __aesni_alg = {
+	.cra_name = "__aes-aesni",
+	.cra_driver_name = "__driver-aes-aesni",
+	.cra_priority = 0,
+	.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize = AES_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
+	.cra_alignmask = 0,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(__aesni_alg.cra_list),
+	.cra_u = {
+		.cipher = {
+			.cia_min_keysize = AES_MIN_KEY_SIZE,
+			.cia_max_keysize = AES_MAX_KEY_SIZE,
+			.cia_setkey = aes_set_key,
+			.cia_encrypt = __aes_encrypt,
+			.cia_decrypt = __aes_decrypt
+		}
+	}
+};
+
 static int ecb_encrypt(struct blkcipher_desc *desc,
 		       struct scatterlist *dst, struct scatterlist *src,
 		       unsigned int nbytes)
@@ -277,8 +328,16 @@ static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int key_len)
 {
 	struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
+	int err;
 
-	return crypto_ablkcipher_setkey(&ctx->cryptd_tfm->base, key, key_len);
+	crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
+				    & CRYPTO_TFM_REQ_MASK);
+	err = crypto_ablkcipher_setkey(child, key, key_len);
+	crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
+				    & CRYPTO_TFM_RES_MASK);
+	return err;
 }
 
 static int ablk_encrypt(struct ablkcipher_request *req)
@@ -411,6 +470,163 @@ static struct crypto_alg ablk_cbc_alg = {
 	},
 };
 
+#ifdef HAS_CTR
+static int ablk_ctr_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))",
+					     0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_ctr_alg = {
+	.cra_name = "ctr(aes)",
+	.cra_driver_name = "ctr-aes-aesni",
+	.cra_priority = 400,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize = 1,
+	.cra_ctxsize = sizeof(struct async_aes_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
+	.cra_init = ablk_ctr_init,
+	.cra_exit = ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = AES_MIN_KEY_SIZE,
+			.max_keysize = AES_MAX_KEY_SIZE,
+			.ivsize = AES_BLOCK_SIZE,
+			.setkey = ablk_set_key,
+			.encrypt = ablk_encrypt,
+			.decrypt = ablk_decrypt,
+			.geniv = "chainiv",
+		},
+	},
+};
+#endif
+
+#ifdef HAS_LRW
+static int ablk_lrw_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(lrw(__driver-aes-aesni))",
+					     0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_lrw_alg = {
+	.cra_name = "lrw(aes)",
+	.cra_driver_name = "lrw-aes-aesni",
+	.cra_priority = 400,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize = AES_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct async_aes_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list),
+	.cra_init = ablk_lrw_init,
+	.cra_exit = ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE,
+			.max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE,
+			.ivsize = AES_BLOCK_SIZE,
+			.setkey = ablk_set_key,
+			.encrypt = ablk_encrypt,
+			.decrypt = ablk_decrypt,
+		},
+	},
+};
+#endif
+
+#ifdef HAS_PCBC
+static int ablk_pcbc_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(pcbc(__driver-aes-aesni))",
+					     0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_pcbc_alg = {
+	.cra_name = "pcbc(aes)",
+	.cra_driver_name = "pcbc-aes-aesni",
+	.cra_priority = 400,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize = AES_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct async_aes_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(ablk_pcbc_alg.cra_list),
+	.cra_init = ablk_pcbc_init,
+	.cra_exit = ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = AES_MIN_KEY_SIZE,
+			.max_keysize = AES_MAX_KEY_SIZE,
+			.ivsize = AES_BLOCK_SIZE,
+			.setkey = ablk_set_key,
+			.encrypt = ablk_encrypt,
+			.decrypt = ablk_decrypt,
+		},
+	},
+};
+#endif
+
+#ifdef HAS_XTS
+static int ablk_xts_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(xts(__driver-aes-aesni))",
+					     0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_xts_alg = {
+	.cra_name = "xts(aes)",
+	.cra_driver_name = "xts-aes-aesni",
+	.cra_priority = 400,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize = AES_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct async_aes_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list),
+	.cra_init = ablk_xts_init,
+	.cra_exit = ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = 2 * AES_MIN_KEY_SIZE,
+			.max_keysize = 2 * AES_MAX_KEY_SIZE,
+			.ivsize = AES_BLOCK_SIZE,
+			.setkey = ablk_set_key,
+			.encrypt = ablk_encrypt,
+			.decrypt = ablk_decrypt,
+		},
+	},
+};
+#endif
+
 static int __init aesni_init(void)
 {
 	int err;
@@ -421,6 +637,8 @@ static int __init aesni_init(void)
 	}
 	if ((err = crypto_register_alg(&aesni_alg)))
 		goto aes_err;
+	if ((err = crypto_register_alg(&__aesni_alg)))
+		goto __aes_err;
 	if ((err = crypto_register_alg(&blk_ecb_alg)))
 		goto blk_ecb_err;
 	if ((err = crypto_register_alg(&blk_cbc_alg)))
@@ -429,9 +647,41 @@
 		goto ablk_ecb_err;
 	if ((err = crypto_register_alg(&ablk_cbc_alg)))
 		goto ablk_cbc_err;
+#ifdef HAS_CTR
+	if ((err = crypto_register_alg(&ablk_ctr_alg)))
+		goto ablk_ctr_err;
+#endif
+#ifdef HAS_LRW
+	if ((err = crypto_register_alg(&ablk_lrw_alg)))
+		goto ablk_lrw_err;
+#endif
+#ifdef HAS_PCBC
+	if ((err = crypto_register_alg(&ablk_pcbc_alg)))
+		goto ablk_pcbc_err;
+#endif
+#ifdef HAS_XTS
+	if ((err = crypto_register_alg(&ablk_xts_alg)))
+		goto ablk_xts_err;
+#endif
 
 	return err;
 
+#ifdef HAS_XTS
+ablk_xts_err:
+#endif
+#ifdef HAS_PCBC
+	crypto_unregister_alg(&ablk_pcbc_alg);
+ablk_pcbc_err:
+#endif
+#ifdef HAS_LRW
+	crypto_unregister_alg(&ablk_lrw_alg);
+ablk_lrw_err:
+#endif
+#ifdef HAS_CTR
+	crypto_unregister_alg(&ablk_ctr_alg);
+ablk_ctr_err:
+#endif
+	crypto_unregister_alg(&ablk_cbc_alg);
 ablk_cbc_err:
 	crypto_unregister_alg(&ablk_ecb_alg);
 ablk_ecb_err:
@@ -439,6 +689,8 @@ ablk_ecb_err:
 blk_cbc_err:
 	crypto_unregister_alg(&blk_ecb_alg);
 blk_ecb_err:
+	crypto_unregister_alg(&__aesni_alg);
+__aes_err:
 	crypto_unregister_alg(&aesni_alg);
 aes_err:
 	return err;
@@ -446,10 +698,23 @@ aes_err:
 
 static void __exit aesni_exit(void)
 {
+#ifdef HAS_XTS
+	crypto_unregister_alg(&ablk_xts_alg);
+#endif
+#ifdef HAS_PCBC
+	crypto_unregister_alg(&ablk_pcbc_alg);
+#endif
+#ifdef HAS_LRW
+	crypto_unregister_alg(&ablk_lrw_alg);
+#endif
+#ifdef HAS_CTR
+	crypto_unregister_alg(&ablk_ctr_alg);
+#endif
 	crypto_unregister_alg(&ablk_cbc_alg);
 	crypto_unregister_alg(&ablk_ecb_alg);
 	crypto_unregister_alg(&blk_cbc_alg);
 	crypto_unregister_alg(&blk_ecb_alg);
+	crypto_unregister_alg(&__aesni_alg);
 	crypto_unregister_alg(&aesni_alg);
 }
 
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
new file mode 100644
index 000000000000..5f9781a3815f
--- /dev/null
+++ b/arch/x86/crypto/fpu.c
@@ -0,0 +1,166 @@
+/*
+ * FPU: Wrapper for blkcipher touching fpu
+ *
+ * Copyright (c) Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/algapi.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/i387.h>
+
+struct crypto_fpu_ctx {
+	struct crypto_blkcipher *child;
+};
+
+static int crypto_fpu_setkey(struct crypto_tfm *parent, const u8 *key,
+			     unsigned int keylen)
+{
+	struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(parent);
+	struct crypto_blkcipher *child = ctx->child;
+	int err;
+
+	crypto_blkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	crypto_blkcipher_set_flags(child, crypto_tfm_get_flags(parent) &
+				   CRYPTO_TFM_REQ_MASK);
+	err = crypto_blkcipher_setkey(child, key, keylen);
+	crypto_tfm_set_flags(parent, crypto_blkcipher_get_flags(child) &
+			     CRYPTO_TFM_RES_MASK);
+	return err;
+}
+
+static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in,
+			      struct scatterlist *dst, struct scatterlist *src,
+			      unsigned int nbytes)
+{
+	int err;
+	struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
+	struct crypto_blkcipher *child = ctx->child;
+	struct blkcipher_desc desc = {
+		.tfm = child,
+		.info = desc_in->info,
+		.flags = desc_in->flags,
+	};
+
+	kernel_fpu_begin();
+	err = crypto_blkcipher_crt(desc.tfm)->encrypt(&desc, dst, src, nbytes);
+	kernel_fpu_end();
+	return err;
+}
+
+static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in,
+			      struct scatterlist *dst, struct scatterlist *src,
+			      unsigned int nbytes)
+{
+	int err;
+	struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
+	struct crypto_blkcipher *child = ctx->child;
+	struct blkcipher_desc desc = {
+		.tfm = child,
+		.info = desc_in->info,
+		.flags = desc_in->flags,
+	};
+
+	kernel_fpu_begin();
+	err = crypto_blkcipher_crt(desc.tfm)->decrypt(&desc, dst, src, nbytes);
+	kernel_fpu_end();
+	return err;
+}
+
+static int crypto_fpu_init_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_instance *inst = crypto_tfm_alg_instance(tfm);
+	struct crypto_spawn *spawn = crypto_instance_ctx(inst);
+	struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct crypto_blkcipher *cipher;
+
+	cipher = crypto_spawn_blkcipher(spawn);
+	if (IS_ERR(cipher))
+		return PTR_ERR(cipher);
+
+	ctx->child = cipher;
+	return 0;
+}
+
+static void crypto_fpu_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
+	crypto_free_blkcipher(ctx->child);
+}
+
+static struct crypto_instance *crypto_fpu_alloc(struct rtattr **tb)
+{
+	struct crypto_instance *inst;
+	struct crypto_alg *alg;
+	int err;
+
+	err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER);
+	if (err)
+		return ERR_PTR(err);
+
+	alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_BLKCIPHER,
+				  CRYPTO_ALG_TYPE_MASK);
+	if (IS_ERR(alg))
+		return ERR_CAST(alg);
+
+	inst = crypto_alloc_instance("fpu", alg);
+	if (IS_ERR(inst))
+		goto out_put_alg;
+
+	inst->alg.cra_flags = alg->cra_flags;
+	inst->alg.cra_priority = alg->cra_priority;
+	inst->alg.cra_blocksize = alg->cra_blocksize;
+	inst->alg.cra_alignmask = alg->cra_alignmask;
+	inst->alg.cra_type = alg->cra_type;
+	inst->alg.cra_blkcipher.ivsize = alg->cra_blkcipher.ivsize;
+	inst->alg.cra_blkcipher.min_keysize = alg->cra_blkcipher.min_keysize;
+	inst->alg.cra_blkcipher.max_keysize = alg->cra_blkcipher.max_keysize;
+	inst->alg.cra_ctxsize = sizeof(struct crypto_fpu_ctx);
+	inst->alg.cra_init = crypto_fpu_init_tfm;
+	inst->alg.cra_exit = crypto_fpu_exit_tfm;
+	inst->alg.cra_blkcipher.setkey = crypto_fpu_setkey;
+	inst->alg.cra_blkcipher.encrypt = crypto_fpu_encrypt;
+	inst->alg.cra_blkcipher.decrypt = crypto_fpu_decrypt;
+
+out_put_alg:
+	crypto_mod_put(alg);
+	return inst;
+}
+
+static void crypto_fpu_free(struct crypto_instance *inst)
+{
+	crypto_drop_spawn(crypto_instance_ctx(inst));
+	kfree(inst);
+}
+
+static struct crypto_template crypto_fpu_tmpl = {
+	.name = "fpu",
+	.alloc = crypto_fpu_alloc,
+	.free = crypto_fpu_free,
+	.module = THIS_MODULE,
+};
+
+static int __init crypto_fpu_module_init(void)
+{
+	return crypto_register_template(&crypto_fpu_tmpl);
+}
+
+static void __exit crypto_fpu_module_exit(void)
+{
+	crypto_unregister_template(&crypto_fpu_tmpl);
+}
+
+module_init(crypto_fpu_module_init);
+module_exit(crypto_fpu_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("FPU block cipher wrapper");
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index aff9f1fcdcd7..8cb9c814e120 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -483,5 +483,5 @@ atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
 	return old_val < 0;
 }
 
-#include <asm-generic/atomic.h>
+#include <asm-generic/atomic-long.h>
 #endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
index 8c21731984da..0d6360220007 100644
--- a/arch/x86/include/asm/atomic_64.h
+++ b/arch/x86/include/asm/atomic_64.h
@@ -455,5 +455,5 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
-#include <asm-generic/atomic.h>
+#include <asm-generic/atomic-long.h>
 #endif /* _ASM_X86_ATOMIC_64_H */
diff --git a/arch/x86/include/asm/bitsperlong.h b/arch/x86/include/asm/bitsperlong.h
new file mode 100644
index 000000000000..b0ae1c4dc791
--- /dev/null
+++ b/arch/x86/include/asm/bitsperlong.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_X86_BITSPERLONG_H
+#define __ASM_X86_BITSPERLONG_H
+
+#ifdef __x86_64__
+# define __BITS_PER_LONG 64
+#else
+# define __BITS_PER_LONG 32
+#endif
+
+#include <asm-generic/bitsperlong.h>
+
+#endif /* __ASM_X86_BITSPERLONG_H */
+
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 19af42138f78..4a28d22d4793 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -116,6 +116,8 @@
 #define X86_FEATURE_XMM4_1	(4*32+19) /* "sse4_1" SSE-4.1 */
 #define X86_FEATURE_XMM4_2	(4*32+20) /* "sse4_2" SSE-4.2 */
 #define X86_FEATURE_X2APIC	(4*32+21) /* x2APIC */
+#define X86_FEATURE_MOVBE	(4*32+22) /* MOVBE instruction */
+#define X86_FEATURE_POPCNT	(4*32+23) /* POPCNT instruction */
 #define X86_FEATURE_AES		(4*32+25) /* AES instructions */
 #define X86_FEATURE_XSAVE	(4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
 #define X86_FEATURE_OSXSAVE	(4*32+27) /* "" XSAVE enabled in the OS */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index f82fdc412c64..b93405b228b4 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -6,6 +6,7 @@
  * Documentation/DMA-API.txt for documentation.
  */
 
+#include <linux/kmemcheck.h>
 #include <linux/scatterlist.h>
 #include <linux/dma-debug.h>
 #include <linux/dma-attrs.h>
@@ -60,6 +61,7 @@ dma_map_single(struct device *hwdev, void *ptr, size_t size,
 	dma_addr_t addr;
 
 	BUG_ON(!valid_dma_direction(dir));
+	kmemcheck_mark_initialized(ptr, size);
 	addr = ops->map_page(hwdev, virt_to_page(ptr),
 			     (unsigned long)ptr & ~PAGE_MASK, size,
 			     dir, NULL);
@@ -87,8 +89,12 @@ dma_map_sg(struct device *hwdev, struct scatterlist *sg,
 {
 	struct dma_map_ops *ops = get_dma_ops(hwdev);
 	int ents;
+	struct scatterlist *s;
+	int i;
 
 	BUG_ON(!valid_dma_direction(dir));
+	for_each_sg(sg, s, nents, i)
+		kmemcheck_mark_initialized(sg_virt(s), s->length);
 	ents = ops->map_sg(hwdev, sg, nents, dir, NULL);
 	debug_dma_map_sg(hwdev, sg, nents, ents, dir);
 
@@ -200,6 +206,7 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
 	dma_addr_t addr;
 
 	BUG_ON(!valid_dma_direction(dir));
+	kmemcheck_mark_initialized(page_address(page) + offset, size);
 	addr = ops->map_page(dev, page, offset, size, dir, NULL);
 	debug_dma_map_page(dev, page, offset, size, dir, addr, false);
 
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index d750a10ccad6..ff8cbfa07851 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -14,6 +14,7 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
 BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
 BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
 BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
+BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
 
 BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0,
 		 smp_invalidate_interrupt)
@@ -52,8 +53,16 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
 #endif
 
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#ifdef CONFIG_X86_THERMAL_VECTOR
 BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
 #endif
 
+#ifdef CONFIG_X86_MCE_THRESHOLD
+BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
+#endif
+
+#ifdef CONFIG_X86_NEW_MCE
+BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
+#endif
+
 #endif
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 9ebc5c255032..82e3e8f01043 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,7 +22,7 @@ typedef struct {
 #endif
 #ifdef CONFIG_X86_MCE
 	unsigned int irq_thermal_count;
-# ifdef CONFIG_X86_64
+# ifdef CONFIG_X86_MCE_THRESHOLD
 	unsigned int irq_threshold_count;
 # endif
 #endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 6df45f639666..ba180d93b08c 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -34,6 +34,7 @@ extern void perf_pending_interrupt(void);
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
 extern void reschedule_interrupt(void);
+extern void mce_self_interrupt(void);
 
 extern void invalidate_interrupt(void);
 extern void invalidate_interrupt0(void);
@@ -46,6 +47,7 @@ extern void invalidate_interrupt6(void);
 extern void invalidate_interrupt7(void);
 
 extern void irq_move_cleanup_interrupt(void);
+extern void reboot_interrupt(void);
 extern void threshold_interrupt(void);
 
 extern void call_function_interrupt(void);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index e997be98c9b9..5b21f0ec3df2 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -25,6 +25,7 @@
  */
 
 #define NMI_VECTOR			0x02
+#define MCE_VECTOR			0x12
 
 /*
  * IDT vectors usable for external interrupt sources start
@@ -87,13 +88,8 @@
 #define CALL_FUNCTION_VECTOR		0xfc
 #define CALL_FUNCTION_SINGLE_VECTOR	0xfb
 #define THERMAL_APIC_VECTOR		0xfa
-
-#ifdef CONFIG_X86_32
-/* 0xf8 - 0xf9 : free */
-#else
-# define THRESHOLD_APIC_VECTOR		0xf9
-# define UV_BAU_MESSAGE			0xf8
-#endif
+#define THRESHOLD_APIC_VECTOR		0xf9
+#define REBOOT_VECTOR			0xf8
 
 /* f0-f7 used for spreading out TLB flushes: */
 #define INVALIDATE_TLB_VECTOR_END	0xf7
@@ -117,6 +113,13 @@
  */
 #define LOCAL_PENDING_VECTOR		0xec
 
+#define UV_BAU_MESSAGE			0xec
+
+/*
+ * Self IPI vector for machine checks
+ */
+#define MCE_SELF_VECTOR			0xeb
+
 /*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h
index 5759c165a5cf..9e00a731a7fb 100644
--- a/arch/x86/include/asm/kmap_types.h
+++ b/arch/x86/include/asm/kmap_types.h
@@ -2,28 +2,11 @@
 #define _ASM_X86_KMAP_TYPES_H
 
 #if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM)
-# define D(n) __KM_FENCE_##n ,
-#else
-# define D(n)
+#define __WITH_KM_FENCE
 #endif
 
-enum km_type {
-D(0)	KM_BOUNCE_READ,
-D(1)	KM_SKB_SUNRPC_DATA,
-D(2)	KM_SKB_DATA_SOFTIRQ,
-D(3)	KM_USER0,
-D(4)	KM_USER1,
-D(5)	KM_BIO_SRC_IRQ,
-D(6)	KM_BIO_DST_IRQ,
-D(7)	KM_PTE0,
-D(8)	KM_PTE1,
-D(9)	KM_IRQ0,
-D(10)	KM_IRQ1,
-D(11)	KM_SOFTIRQ0,
-D(12)	KM_SOFTIRQ1,
-D(13)	KM_TYPE_NR
-};
+#include <asm-generic/kmap_types.h>
 
-#undef D
+#undef __WITH_KM_FENCE
 
 #endif /* _ASM_X86_KMAP_TYPES_H */
diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h
new file mode 100644
index 000000000000..ed01518f297e
--- /dev/null
+++ b/arch/x86/include/asm/kmemcheck.h
@@ -0,0 +1,42 @@
+#ifndef ASM_X86_KMEMCHECK_H
+#define ASM_X86_KMEMCHECK_H
+
+#include <linux/types.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_KMEMCHECK
+bool kmemcheck_active(struct pt_regs *regs);
+
+void kmemcheck_show(struct pt_regs *regs);
+void kmemcheck_hide(struct pt_regs *regs);
+
+bool kmemcheck_fault(struct pt_regs *regs,
+	unsigned long address, unsigned long error_code);
+bool kmemcheck_trap(struct pt_regs *regs);
+#else
+static inline bool kmemcheck_active(struct pt_regs *regs)
+{
+	return false;
+}
+
+static inline void kmemcheck_show(struct pt_regs *regs)
+{
+}
+
+static inline void kmemcheck_hide(struct pt_regs *regs)
+{
+}
+
+static inline bool kmemcheck_fault(struct pt_regs *regs,
+	unsigned long address, unsigned long error_code)
+{
+	return false;
+}
+
+static inline bool kmemcheck_trap(struct pt_regs *regs)
+{
+	return false;
+}
+#endif /* CONFIG_KMEMCHECK */
+
+#endif
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index dc3f6cf11704..125be8b19568 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -16,6 +16,7 @@
 #define __KVM_HAVE_MSI
 #define __KVM_HAVE_USER_NMI
 #define __KVM_HAVE_GUEST_DEBUG
+#define __KVM_HAVE_MSIX
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f0faf58044ff..eabdc1cfab5c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -185,6 +185,7 @@ union kvm_mmu_page_role {
 		unsigned access:3;
 		unsigned invalid:1;
 		unsigned cr4_pge:1;
+		unsigned nxe:1;
 	};
 };
 
@@ -212,7 +213,6 @@ struct kvm_mmu_page {
 	int multimapped;         /* More than one parent_pte? */
 	int root_count;          /* Currently serving as active root */
 	bool unsync;
-	bool global;
 	unsigned int unsync_children;
 	union {
 		u64 *parent_pte;               /* !multimapped */
@@ -261,13 +261,11 @@ struct kvm_mmu {
 	union kvm_mmu_page_role base_role;
 
 	u64 *pae_root;
+	u64 rsvd_bits_mask[2][4];
 };
 
 struct kvm_vcpu_arch {
 	u64 host_tsc;
-	int interrupt_window_open;
-	unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
-	DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
 	/*
 	 * rip and regs accesses must go through
 	 * kvm_{register,rip}_{read,write} functions.
@@ -286,6 +284,7 @@ struct kvm_vcpu_arch {
 	u64 shadow_efer;
 	u64 apic_base;
 	struct kvm_lapic *apic;    /* kernel irqchip context */
+	int32_t apic_arb_prio;
 	int mp_state;
 	int sipi_vector;
 	u64 ia32_misc_enable_msr;
@@ -320,6 +319,8 @@ struct kvm_vcpu_arch {
 	struct kvm_pio_request pio;
 	void *pio_data;
 
+	u8 event_exit_inst_len;
+
 	struct kvm_queued_exception {
 		bool pending;
 		bool has_error_code;
@@ -329,11 +330,12 @@ struct kvm_vcpu_arch {
 
 	struct kvm_queued_interrupt {
 		bool pending;
+		bool soft;
 		u8 nr;
 	} interrupt;
 
 	struct {
-		int active;
+		int vm86_active;
 		u8 save_iopl;
 		struct kvm_save_segment {
 			u16 selector;
@@ -356,9 +358,9 @@ struct kvm_vcpu_arch {
 	unsigned int time_offset;
 	struct page *time_page;
 
+	bool singlestep; /* guest is single stepped by KVM */
 	bool nmi_pending;
 	bool nmi_injected;
-	bool nmi_window_open;
 
 	struct mtrr_state_type mtrr_state;
 	u32 pat;
@@ -392,15 +394,14 @@ struct kvm_arch{
 	 */
 	struct list_head active_mmu_pages;
 	struct list_head assigned_dev_head;
-	struct list_head oos_global_pages;
 	struct iommu_domain *iommu_domain;
+	int iommu_flags;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
 	struct hlist_head irq_ack_notifier_list;
 	int vapics_in_nmi_mode;
 
-	int round_robin_prev_vcpu;
 	unsigned int tss_addr;
 	struct page *apic_access_page;
 
@@ -423,7 +424,6 @@ struct kvm_vm_stat {
 	u32 mmu_recycled;
 	u32 mmu_cache_miss;
 	u32 mmu_unsync;
-	u32 mmu_unsync_global;
 	u32 remote_tlb_flush;
 	u32 lpages;
 };
@@ -443,7 +443,6 @@ struct kvm_vcpu_stat {
 	u32 halt_exits;
 	u32 halt_wakeup;
 	u32 request_irq_exits;
-	u32 request_nmi_exits;
 	u32 irq_exits;
 	u32 host_state_reload;
 	u32 efer_reload;
@@ -511,20 +510,22 @@ struct kvm_x86_ops {
 	void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
 	int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
 	void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+	void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+	u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
 	void (*patch_hypercall)(struct kvm_vcpu *vcpu,
 				unsigned char *hypercall_addr);
-	int (*get_irq)(struct kvm_vcpu *vcpu);
-	void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+	void (*set_irq)(struct kvm_vcpu *vcpu);
+	void (*set_nmi)(struct kvm_vcpu *vcpu);
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code);
-	bool (*exception_injected)(struct kvm_vcpu *vcpu);
-	void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
-	void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
-				       struct kvm_run *run);
-
+	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
+	int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
+	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
+	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
-	int (*get_mt_mask_shift)(void);
+	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 };
 
 extern struct kvm_x86_ops *kvm_x86_ops;
@@ -538,7 +539,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 void kvm_mmu_set_base_ptes(u64 base_pte);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask);
+		u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -552,6 +553,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 			  const void *val, int bytes);
 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 		  gpa_t addr, unsigned long *ret);
+u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 extern bool tdp_enabled;
 
@@ -563,6 +565,7 @@ enum emulation_result {
 
 #define EMULTYPE_NO_DECODE	    (1 << 0)
 #define EMULTYPE_TRAP_UD	    (1 << 1)
+#define EMULTYPE_SKIP		    (1 << 2)
 int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
 			unsigned long cr2, u16 error_code, int emulation_type);
 void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
@@ -638,7 +641,6 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
@@ -769,6 +771,8 @@ enum {
 #define HF_GIF_MASK		(1 << 0)
 #define HF_HIF_MASK		(1 << 1)
 #define HF_VINTR_MASK		(1 << 2)
+#define HF_NMI_MASK		(1 << 3)
+#define HF_IRET_MASK		(1 << 4)
 
 /*
  * Hardware virtualization extension instructions may fault if a
@@ -791,5 +795,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 6a159732881a..b7ed2c423116 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -143,6 +143,9 @@ struct decode_cache {
 	struct fetch_cache fetch;
 };
 
+#define X86_SHADOW_INT_MOV_SS  1
+#define X86_SHADOW_INT_STI     2
+
 struct x86_emulate_ctxt {
 	/* Register state before/after emulation. */
 	struct kvm_vcpu *vcpu;
@@ -152,6 +155,9 @@ struct x86_emulate_ctxt {
 	int mode;
 	u32 cs_base;
 
+	/* interruptibility state, as a result of execution of STI or MOV SS */
+	int interruptibility;
+
 	/* decode cache */
 	struct decode_cache decode;
 };
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 1caf57628b9c..313389cd50d2 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,13 @@
 /* Pages for switcher itself, then two pages per cpu */
 #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
 
-/* We map at -4M for ease of mapping into the guest (one PTE page). */
+/* We map at -4M (-2M when PAE is activated) for ease of mapping
+ * into the guest (one PTE page). */
+#ifdef CONFIG_X86_PAE
+#define SWITCHER_ADDR 0xFFE00000
+#else
 #define SWITCHER_ADDR 0xFFC00000
+#endif
 
 /* Found in switcher.S */
 extern unsigned long default_idt_entries[];
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index faae1996487b..d31c4a684078 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -12,11 +12,13 @@
 #define LHCALL_TS		8
 #define LHCALL_SET_CLOCKEVENT	9
 #define LHCALL_HALT		10
+#define LHCALL_SET_PMD		13
 #define LHCALL_SET_PTE		14
-#define LHCALL_SET_PMD		15
+#define LHCALL_SET_PGD		15
 #define LHCALL_LOAD_TLS		16
 #define LHCALL_NOTIFY		17
 #define LHCALL_LOAD_GDT_ENTRY	18
+#define LHCALL_SEND_INTERRUPTS	19
 
 #define LGUEST_TRAP_ENTRY	0x1F
 
@@ -32,10 +34,10 @@
 * operations? There are two ways: the direct way is to make a "hypercall",
 * to make requests of the Host Itself.
 *
- * We use the KVM hypercall mechanism. Eighteen hypercalls are
+ * We use the KVM hypercall mechanism. Seventeen hypercalls are
 * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %ebx, %ecx and %edx. If a return
- * value makes sense, it's returned in %eax.
+ * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
+ * If a return value makes sense, it's returned in %eax.
 *
 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
 * Host, rather than returning failure. This reflects Winston Churchill's
@@ -47,8 +49,9 @@
 
 #define LHCALL_RING_SIZE 64
 struct hcall_args {
-	/* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
-	unsigned long arg0, arg1, arg2, arg3;
+	/* These map directly onto eax, ebx, ecx, edx and esi
+	 * in struct lguest_regs */
+	unsigned long arg0, arg1, arg2, arg3, arg4;
 };
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 4f8c199584e7..540a466e50f5 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -1,8 +1,6 @@
 #ifndef _ASM_X86_MCE_H
 #define _ASM_X86_MCE_H
 
-#ifdef __x86_64__
-
 #include <linux/types.h>
 #include <asm/ioctls.h>
 
@@ -10,21 +8,35 @@
  * Machine Check support for x86
  */
 
-#define MCG_CTL_P	 (1UL<<8)   /* MCG_CAP register available */
-#define MCG_EXT_P	 (1ULL<<9)  /* Extended registers available */
-#define MCG_CMCI_P	 (1ULL<<10) /* CMCI supported */
-
-#define MCG_STATUS_RIPV  (1UL<<0)   /* restart ip valid */
-#define MCG_STATUS_EIPV  (1UL<<1)   /* ip points to correct instruction */
-#define MCG_STATUS_MCIP  (1UL<<2)   /* machine check in progress */
-
-#define MCI_STATUS_VAL   (1UL<<63)  /* valid error */
-#define MCI_STATUS_OVER  (1UL<<62)  /* previous errors lost */
-#define MCI_STATUS_UC    (1UL<<61)  /* uncorrected error */
-#define MCI_STATUS_EN    (1UL<<60)  /* error enabled */
-#define MCI_STATUS_MISCV (1UL<<59)  /* misc error reg. valid */
-#define MCI_STATUS_ADDRV (1UL<<58)  /* addr reg. valid */
-#define MCI_STATUS_PCC   (1UL<<57)  /* processor context corrupt */
+#define MCG_BANKCNT_MASK 0xff        /* Number of Banks */
+#define MCG_CTL_P	 (1ULL<<8)   /* MCG_CAP register available */
+#define MCG_EXT_P	 (1ULL<<9)   /* Extended registers available */
+#define MCG_CMCI_P	 (1ULL<<10)  /* CMCI supported */
+#define MCG_EXT_CNT_MASK 0xff0000    /* Number of Extended registers */
+#define MCG_EXT_CNT_SHIFT 16
+#define MCG_EXT_CNT(c)	 (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
+#define MCG_SER_P	 (1ULL<<24)  /* MCA recovery/new status bits */
+
+#define MCG_STATUS_RIPV  (1ULL<<0)   /* restart ip valid */
+#define MCG_STATUS_EIPV  (1ULL<<1)   /* ip points to correct instruction */
+#define MCG_STATUS_MCIP  (1ULL<<2)   /* machine check in progress */
+
+#define MCI_STATUS_VAL   (1ULL<<63)  /* valid error */
+#define MCI_STATUS_OVER  (1ULL<<62)  /* previous errors lost */
+#define MCI_STATUS_UC    (1ULL<<61)  /* uncorrected error */
+#define MCI_STATUS_EN    (1ULL<<60)  /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59)  /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58)  /* addr reg. valid */
+#define MCI_STATUS_PCC   (1ULL<<57)  /* processor context corrupt */
+#define MCI_STATUS_S	 (1ULL<<56)  /* Signaled machine check */
+#define MCI_STATUS_AR	 (1ULL<<55)  /* Action required */
+
+/* MISC register defines */
+#define MCM_ADDR_SEGOFF  0	/* segment offset */
+#define MCM_ADDR_LINEAR  1	/* linear address */
+#define MCM_ADDR_PHYS	 2	/* physical address */
+#define MCM_ADDR_MEM	 3	/* memory address */
+#define MCM_ADDR_GENERIC 7	/* generic */
 
 /* Fields are zero when not available */
 struct mce {
@@ -34,13 +46,19 @@ struct mce {
 	__u64 mcgstatus;
 	__u64 ip;
 	__u64 tsc;	/* cpu time stamp counter */
-	__u64 res1;	/* for future extension */
-	__u64 res2;	/* dito. */
+	__u64 time;	/* wall time_t when error was detected */
+	__u8  cpuvendor;	/* cpu vendor as encoded in system.h */
51 | __u8 pad1; | ||
52 | __u16 pad2; | ||
53 | __u32 cpuid; /* CPUID 1 EAX */ | ||
39 | __u8 cs; /* code segment */ | 54 | __u8 cs; /* code segment */ |
40 | __u8 bank; /* machine check bank */ | 55 | __u8 bank; /* machine check bank */ |
41 | __u8 cpu; /* cpu that raised the error */ | 56 | __u8 cpu; /* cpu number; obsolete; use extcpu now */ |
42 | __u8 finished; /* entry is valid */ | 57 | __u8 finished; /* entry is valid */ |
43 | __u32 pad; | 58 | __u32 extcpu; /* linux cpu number that detected the error */ |
59 | __u32 socketid; /* CPU socket ID */ | ||
60 | __u32 apicid; /* CPU initial apic ID */ | ||
61 | __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ | ||
44 | }; | 62 | }; |
45 | 63 | ||
46 | /* | 64 | /* |
@@ -57,7 +75,7 @@ struct mce_log { | |||
57 | unsigned len; /* = MCE_LOG_LEN */ | 75 | unsigned len; /* = MCE_LOG_LEN */ |
58 | unsigned next; | 76 | unsigned next; |
59 | unsigned flags; | 77 | unsigned flags; |
60 | unsigned pad0; | 78 | unsigned recordlen; /* length of struct mce */ |
61 | struct mce entry[MCE_LOG_LEN]; | 79 | struct mce entry[MCE_LOG_LEN]; |
62 | }; | 80 | }; |
63 | 81 | ||
@@ -82,19 +100,16 @@ struct mce_log { | |||
82 | #define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) | 100 | #define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) |
83 | #define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) | 101 | #define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) |
84 | 102 | ||
85 | #endif /* __x86_64__ */ | ||
86 | |||
87 | #ifdef __KERNEL__ | 103 | #ifdef __KERNEL__ |
88 | 104 | ||
89 | #ifdef CONFIG_X86_32 | ||
90 | extern int mce_disabled; | 105 | extern int mce_disabled; |
91 | #else /* CONFIG_X86_32 */ | ||
92 | 106 | ||
93 | #include <asm/atomic.h> | 107 | #include <asm/atomic.h> |
108 | #include <linux/percpu.h> | ||
94 | 109 | ||
95 | void mce_setup(struct mce *m); | 110 | void mce_setup(struct mce *m); |
96 | void mce_log(struct mce *m); | 111 | void mce_log(struct mce *m); |
97 | DECLARE_PER_CPU(struct sys_device, device_mce); | 112 | DECLARE_PER_CPU(struct sys_device, mce_dev); |
98 | extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); | 113 | extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); |
99 | 114 | ||
100 | /* | 115 | /* |
@@ -104,6 +119,8 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); | |||
104 | #define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) | 119 | #define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) |
105 | 120 | ||
106 | #ifdef CONFIG_X86_MCE_INTEL | 121 | #ifdef CONFIG_X86_MCE_INTEL |
122 | extern int mce_cmci_disabled; | ||
123 | extern int mce_ignore_ce; | ||
107 | void mce_intel_feature_init(struct cpuinfo_x86 *c); | 124 | void mce_intel_feature_init(struct cpuinfo_x86 *c); |
108 | void cmci_clear(void); | 125 | void cmci_clear(void); |
109 | void cmci_reenable(void); | 126 | void cmci_reenable(void); |
@@ -123,13 +140,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c); | |||
123 | static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } | 140 | static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } |
124 | #endif | 141 | #endif |
125 | 142 | ||
126 | extern int mce_available(struct cpuinfo_x86 *c); | 143 | int mce_available(struct cpuinfo_x86 *c); |
144 | |||
145 | DECLARE_PER_CPU(unsigned, mce_exception_count); | ||
146 | DECLARE_PER_CPU(unsigned, mce_poll_count); | ||
127 | 147 | ||
128 | void mce_log_therm_throt_event(__u64 status); | 148 | void mce_log_therm_throt_event(__u64 status); |
129 | 149 | ||
130 | extern atomic_t mce_entry; | 150 | extern atomic_t mce_entry; |
131 | 151 | ||
132 | extern void do_machine_check(struct pt_regs *, long); | 152 | void do_machine_check(struct pt_regs *, long); |
133 | 153 | ||
134 | typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); | 154 | typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); |
135 | DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); | 155 | DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); |
@@ -139,14 +159,16 @@ enum mcp_flags { | |||
139 | MCP_UC = (1 << 1), /* log uncorrected errors */ | 159 | MCP_UC = (1 << 1), /* log uncorrected errors */ |
140 | MCP_DONTLOG = (1 << 2), /* only clear, don't log */ | 160 | MCP_DONTLOG = (1 << 2), /* only clear, don't log */ |
141 | }; | 161 | }; |
142 | extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); | 162 | void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); |
143 | 163 | ||
144 | extern int mce_notify_user(void); | 164 | int mce_notify_irq(void); |
165 | void mce_notify_process(void); | ||
145 | 166 | ||
146 | #endif /* !CONFIG_X86_32 */ | 167 | DECLARE_PER_CPU(struct mce, injectm); |
168 | extern struct file_operations mce_chrdev_ops; | ||
147 | 169 | ||
148 | #ifdef CONFIG_X86_MCE | 170 | #ifdef CONFIG_X86_MCE |
149 | extern void mcheck_init(struct cpuinfo_x86 *c); | 171 | void mcheck_init(struct cpuinfo_x86 *c); |
150 | #else | 172 | #else |
151 | #define mcheck_init(c) do { } while (0) | 173 | #define mcheck_init(c) do { } while (0) |
152 | #endif | 174 | #endif |
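The new MCG_CAP masks above make decoding the capability MSR mechanical. A small sketch (the helper name is illustrative; the caller is assumed to have filled cap via rdmsrl(MSR_IA32_MCG_CAP, cap)):

    static void decode_mcg_cap(u64 cap)
    {
            unsigned int banks = cap & MCG_BANKCNT_MASK;
            unsigned int ext   = MCG_EXT_CNT(cap);

            pr_info("MCE: %u banks, %u extended registers, CMCI %s, SER %s\n",
                    banks, ext,
                    (cap & MCG_CMCI_P) ? "yes" : "no",
                    (cap & MCG_SER_P)  ? "yes" : "no");
    }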
diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h index 90bc4108a4fd..751af2550ed9 100644 --- a/arch/x86/include/asm/mman.h +++ b/arch/x86/include/asm/mman.h | |||
@@ -1,7 +1,7 @@ | |||
1 | #ifndef _ASM_X86_MMAN_H | 1 | #ifndef _ASM_X86_MMAN_H |
2 | #define _ASM_X86_MMAN_H | 2 | #define _ASM_X86_MMAN_H |
3 | 3 | ||
4 | #include <asm-generic/mman.h> | 4 | #include <asm-generic/mman-common.h> |
5 | 5 | ||
6 | #define MAP_32BIT 0x40 /* only give out 32bit addresses */ | 6 | #define MAP_32BIT 0x40 /* only give out 32bit addresses */ |
7 | 7 | ||
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 4d58d04fca83..1692fb5050e3 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h | |||
@@ -207,7 +207,14 @@ | |||
207 | 207 | ||
208 | #define MSR_IA32_THERM_CONTROL 0x0000019a | 208 | #define MSR_IA32_THERM_CONTROL 0x0000019a |
209 | #define MSR_IA32_THERM_INTERRUPT 0x0000019b | 209 | #define MSR_IA32_THERM_INTERRUPT 0x0000019b |
210 | |||
211 | #define THERM_INT_LOW_ENABLE (1 << 0) | ||
212 | #define THERM_INT_HIGH_ENABLE (1 << 1) | ||
213 | |||
210 | #define MSR_IA32_THERM_STATUS 0x0000019c | 214 | #define MSR_IA32_THERM_STATUS 0x0000019c |
215 | |||
216 | #define THERM_STATUS_PROCHOT (1 << 0) | ||
217 | |||
211 | #define MSR_IA32_MISC_ENABLE 0x000001a0 | 218 | #define MSR_IA32_MISC_ENABLE 0x000001a0 |
212 | 219 | ||
213 | /* MISC_ENABLE bits: architectural */ | 220 | /* MISC_ENABLE bits: architectural */ |
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 638bf6241807..22603764e7db 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h | |||
@@ -12,6 +12,17 @@ | |||
12 | 12 | ||
13 | #include <asm/asm.h> | 13 | #include <asm/asm.h> |
14 | #include <asm/errno.h> | 14 | #include <asm/errno.h> |
15 | #include <asm/cpumask.h> | ||
16 | |||
17 | struct msr { | ||
18 | union { | ||
19 | struct { | ||
20 | u32 l; | ||
21 | u32 h; | ||
22 | }; | ||
23 | u64 q; | ||
24 | }; | ||
25 | }; | ||
15 | 26 | ||
16 | static inline unsigned long long native_read_tscp(unsigned int *aux) | 27 | static inline unsigned long long native_read_tscp(unsigned int *aux) |
17 | { | 28 | { |
@@ -216,6 +227,8 @@ do { \ | |||
216 | #ifdef CONFIG_SMP | 227 | #ifdef CONFIG_SMP |
217 | int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); | 228 | int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); |
218 | int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); | 229 | int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); |
230 | void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); | ||
231 | void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); | ||
219 | int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); | 232 | int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); |
220 | int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); | 233 | int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); |
221 | #else /* CONFIG_SMP */ | 234 | #else /* CONFIG_SMP */ |
@@ -229,6 +242,16 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) | |||
229 | wrmsr(msr_no, l, h); | 242 | wrmsr(msr_no, l, h); |
230 | return 0; | 243 | return 0; |
231 | } | 244 | } |
245 | static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no, | ||
246 | struct msr *msrs) | ||
247 | { | ||
248 | rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); | ||
249 | } | ||
250 | static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no, | ||
251 | struct msr *msrs) | ||
252 | { | ||
253 | wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); | ||
254 | } | ||
232 | static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, | 255 | static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, |
233 | u32 *l, u32 *h) | 256 | u32 *l, u32 *h) |
234 | { | 257 | { |
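The struct msr union lets callers treat an MSR value either as the traditional low/high pair (.l/.h) or as one 64-bit quantity (.q), and the new rdmsr_on_cpus()/wrmsr_on_cpus() helpers batch that over a cpumask, one struct msr per CPU. A hedged sketch using only the single-CPU helper whose signature appears in this hunk (the helper name is illustrative):

    static void show_prochot_cpu0(void)
    {
            struct msr m;

            if (rdmsr_on_cpu(0, MSR_IA32_THERM_STATUS, &m.l, &m.h))
                    return;         /* cross-CPU read failed */
            pr_info("THERM_STATUS = %#llx, PROCHOT = %u\n",
                    (unsigned long long)m.q,
                    (unsigned int)(m.l & THERM_STATUS_PROCHOT));
    }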
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 89ed9d70b0aa..625c3f0e741a 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h | |||
@@ -56,7 +56,7 @@ extern bool __virt_addr_valid(unsigned long kaddr); | |||
56 | #endif /* __ASSEMBLY__ */ | 56 | #endif /* __ASSEMBLY__ */ |
57 | 57 | ||
58 | #include <asm-generic/memory_model.h> | 58 | #include <asm-generic/memory_model.h> |
59 | #include <asm-generic/page.h> | 59 | #include <asm-generic/getorder.h> |
60 | 60 | ||
61 | #define __HAVE_ARCH_GATE_AREA 1 | 61 | #define __HAVE_ARCH_GATE_AREA 1 |
62 | 62 | ||
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 18ef7ebf2631..3cc06e3fceb8 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h | |||
@@ -317,6 +317,11 @@ static inline int pte_present(pte_t a) | |||
317 | return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); | 317 | return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); |
318 | } | 318 | } |
319 | 319 | ||
320 | static inline int pte_hidden(pte_t pte) | ||
321 | { | ||
322 | return pte_flags(pte) & _PAGE_HIDDEN; | ||
323 | } | ||
324 | |||
320 | static inline int pmd_present(pmd_t pmd) | 325 | static inline int pmd_present(pmd_t pmd) |
321 | { | 326 | { |
322 | return pmd_flags(pmd) & _PAGE_PRESENT; | 327 | return pmd_flags(pmd) & _PAGE_PRESENT; |
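pte_hidden() pairs with the _PAGE_HIDDEN bit added to pgtable_types.h below: kmemcheck hides a tracked page by clearing _PAGE_PRESENT and marking it with _PAGE_HIDDEN, so the resulting faults can be told apart from ordinary not-present pages. A hedged sketch, not kmemcheck's actual fault path:

    static inline bool pte_is_kmemcheck_hidden(pte_t pte)
    {
            return !pte_present(pte) && pte_hidden(pte);
    }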
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index 2733fad45f98..5e67c1532314 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h | |||
@@ -46,6 +46,10 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ | |||
46 | # define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) | 46 | # define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) |
47 | #endif | 47 | #endif |
48 | 48 | ||
49 | #define MODULES_VADDR VMALLOC_START | ||
50 | #define MODULES_END VMALLOC_END | ||
51 | #define MODULES_LEN (MODULES_VADDR - MODULES_END) | ||
52 | |||
49 | #define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE) | 53 | #define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE) |
50 | 54 | ||
51 | #endif /* _ASM_X86_PGTABLE_32_DEFS_H */ | 55 | #endif /* _ASM_X86_PGTABLE_32_DEFS_H */ |
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 4d258ad76a0f..54cb697f4900 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h | |||
@@ -18,7 +18,7 @@ | |||
18 | #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ | 18 | #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ |
19 | #define _PAGE_BIT_UNUSED1 9 /* available for programmer */ | 19 | #define _PAGE_BIT_UNUSED1 9 /* available for programmer */ |
20 | #define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ | 20 | #define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ |
21 | #define _PAGE_BIT_UNUSED3 11 | 21 | #define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ |
22 | #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ | 22 | #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ |
23 | #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 | 23 | #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 |
24 | #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 | 24 | #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 |
@@ -41,13 +41,18 @@ | |||
41 | #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) | 41 | #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) |
42 | #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) | 42 | #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) |
43 | #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) | 43 | #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) |
44 | #define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3) | ||
45 | #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) | 44 | #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) |
46 | #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) | 45 | #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) |
47 | #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) | 46 | #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) |
48 | #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) | 47 | #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) |
49 | #define __HAVE_ARCH_PTE_SPECIAL | 48 | #define __HAVE_ARCH_PTE_SPECIAL |
50 | 49 | ||
50 | #ifdef CONFIG_KMEMCHECK | ||
51 | #define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) | ||
52 | #else | ||
53 | #define _PAGE_HIDDEN (_AT(pteval_t, 0)) | ||
54 | #endif | ||
55 | |||
51 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) | 56 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) |
52 | #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) | 57 | #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) |
53 | #else | 58 | #else |
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 7761a5d554bb..598457cbd0f8 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h | |||
@@ -117,7 +117,7 @@ typedef unsigned long sigset_t; | |||
117 | #define MINSIGSTKSZ 2048 | 117 | #define MINSIGSTKSZ 2048 |
118 | #define SIGSTKSZ 8192 | 118 | #define SIGSTKSZ 8192 |
119 | 119 | ||
120 | #include <asm-generic/signal.h> | 120 | #include <asm-generic/signal-defs.h> |
121 | 121 | ||
122 | #ifndef __ASSEMBLY__ | 122 | #ifndef __ASSEMBLY__ |
123 | 123 | ||
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 0e0e3ba827f7..c86f452256de 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h | |||
@@ -177,10 +177,18 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) | |||
177 | * No 3D Now! | 177 | * No 3D Now! |
178 | */ | 178 | */ |
179 | 179 | ||
180 | #ifndef CONFIG_KMEMCHECK | ||
180 | #define memcpy(t, f, n) \ | 181 | #define memcpy(t, f, n) \ |
181 | (__builtin_constant_p((n)) \ | 182 | (__builtin_constant_p((n)) \ |
182 | ? __constant_memcpy((t), (f), (n)) \ | 183 | ? __constant_memcpy((t), (f), (n)) \ |
183 | : __memcpy((t), (f), (n))) | 184 | : __memcpy((t), (f), (n))) |
185 | #else | ||
186 | /* | ||
187 | * kmemcheck becomes very happy if we use the REP instructions unconditionally, | ||
188 | * because it means that we know both memory operands in advance. | ||
189 | */ | ||
190 | #define memcpy(t, f, n) __memcpy((t), (f), (n)) | ||
191 | #endif | ||
184 | 192 | ||
185 | #endif | 193 | #endif |
186 | 194 | ||
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 2afe164bf1e6..19e2c468fc2c 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h | |||
@@ -27,6 +27,7 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t | |||
27 | function. */ | 27 | function. */ |
28 | 28 | ||
29 | #define __HAVE_ARCH_MEMCPY 1 | 29 | #define __HAVE_ARCH_MEMCPY 1 |
30 | #ifndef CONFIG_KMEMCHECK | ||
30 | #if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 | 31 | #if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 |
31 | extern void *memcpy(void *to, const void *from, size_t len); | 32 | extern void *memcpy(void *to, const void *from, size_t len); |
32 | #else | 33 | #else |
@@ -42,6 +43,13 @@ extern void *__memcpy(void *to, const void *from, size_t len); | |||
42 | __ret; \ | 43 | __ret; \ |
43 | }) | 44 | }) |
44 | #endif | 45 | #endif |
46 | #else | ||
47 | /* | ||
48 | * kmemcheck becomes very happy if we use the REP instructions unconditionally, | ||
49 | * because it means that we know both memory operands in advance. | ||
50 | */ | ||
51 | #define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len)) | ||
52 | #endif | ||
45 | 53 | ||
46 | #define __HAVE_ARCH_MEMSET | 54 | #define __HAVE_ARCH_MEMSET |
47 | void *memset(void *s, int c, size_t n); | 55 | void *memset(void *s, int c, size_t n); |
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 82ada75f3ebf..85574b7c1bc1 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h | |||
@@ -225,6 +225,7 @@ struct __attribute__ ((__packed__)) vmcb { | |||
225 | #define SVM_EVTINJ_VALID_ERR (1 << 11) | 225 | #define SVM_EVTINJ_VALID_ERR (1 << 11) |
226 | 226 | ||
227 | #define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK | 227 | #define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK |
228 | #define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK | ||
228 | 229 | ||
229 | #define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR | 230 | #define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR |
230 | #define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI | 231 | #define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI |
diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h index f72956331c49..c4ee8056baca 100644 --- a/arch/x86/include/asm/termios.h +++ b/arch/x86/include/asm/termios.h | |||
@@ -67,6 +67,7 @@ static inline int user_termio_to_kernel_termios(struct ktermios *termios, | |||
67 | SET_LOW_TERMIOS_BITS(termios, termio, c_oflag); | 67 | SET_LOW_TERMIOS_BITS(termios, termio, c_oflag); |
68 | SET_LOW_TERMIOS_BITS(termios, termio, c_cflag); | 68 | SET_LOW_TERMIOS_BITS(termios, termio, c_cflag); |
69 | SET_LOW_TERMIOS_BITS(termios, termio, c_lflag); | 69 | SET_LOW_TERMIOS_BITS(termios, termio, c_lflag); |
70 | get_user(termios->c_line, &termio->c_line); | ||
70 | return copy_from_user(termios->c_cc, termio->c_cc, NCC); | 71 | return copy_from_user(termios->c_cc, termio->c_cc, NCC); |
71 | } | 72 | } |
72 | 73 | ||
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 602c769fc98c..b0783520988b 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h | |||
@@ -154,9 +154,9 @@ struct thread_info { | |||
154 | 154 | ||
155 | /* thread information allocation */ | 155 | /* thread information allocation */ |
156 | #ifdef CONFIG_DEBUG_STACK_USAGE | 156 | #ifdef CONFIG_DEBUG_STACK_USAGE |
157 | #define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) | 157 | #define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) |
158 | #else | 158 | #else |
159 | #define THREAD_FLAGS GFP_KERNEL | 159 | #define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK) |
160 | #endif | 160 | #endif |
161 | 161 | ||
162 | #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR | 162 | #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR |
diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h index b5c9d45c981f..1375cfc93960 100644 --- a/arch/x86/include/asm/timex.h +++ b/arch/x86/include/asm/timex.h | |||
@@ -4,9 +4,7 @@ | |||
4 | #include <asm/processor.h> | 4 | #include <asm/processor.h> |
5 | #include <asm/tsc.h> | 5 | #include <asm/tsc.h> |
6 | 6 | ||
7 | /* The PIT ticks at this frequency (in HZ): */ | 7 | /* Assume we use the PIT time source for the clock tick */ |
8 | #define PIT_TICK_RATE 1193182 | ||
9 | |||
10 | #define CLOCK_TICK_RATE PIT_TICK_RATE | 8 | #define CLOCK_TICK_RATE PIT_TICK_RATE |
11 | 9 | ||
12 | #define ARCH_HAS_READ_CURRENT_TIMER | 10 | #define ARCH_HAS_READ_CURRENT_TIMER |
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index a5ecc9c33e92..7f3eba08e7de 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h | |||
@@ -172,6 +172,6 @@ static inline void flush_tlb_kernel_range(unsigned long start, | |||
172 | flush_tlb_all(); | 172 | flush_tlb_all(); |
173 | } | 173 | } |
174 | 174 | ||
175 | extern void zap_low_mappings(void); | 175 | extern void zap_low_mappings(bool early); |
176 | 176 | ||
177 | #endif /* _ASM_X86_TLBFLUSH_H */ | 177 | #endif /* _ASM_X86_TLBFLUSH_H */ |
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h index e6f736320077..09b97745772f 100644 --- a/arch/x86/include/asm/types.h +++ b/arch/x86/include/asm/types.h | |||
@@ -14,12 +14,6 @@ typedef unsigned short umode_t; | |||
14 | */ | 14 | */ |
15 | #ifdef __KERNEL__ | 15 | #ifdef __KERNEL__ |
16 | 16 | ||
17 | #ifdef CONFIG_X86_32 | ||
18 | # define BITS_PER_LONG 32 | ||
19 | #else | ||
20 | # define BITS_PER_LONG 64 | ||
21 | #endif | ||
22 | |||
23 | #ifndef __ASSEMBLY__ | 17 | #ifndef __ASSEMBLY__ |
24 | 18 | ||
25 | typedef u64 dma64_addr_t; | 19 | typedef u64 dma64_addr_t; |
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 498f944010b9..11be5ad2e0e9 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h | |||
@@ -247,6 +247,7 @@ enum vmcs_field { | |||
247 | #define EXIT_REASON_MSR_READ 31 | 247 | #define EXIT_REASON_MSR_READ 31 |
248 | #define EXIT_REASON_MSR_WRITE 32 | 248 | #define EXIT_REASON_MSR_WRITE 32 |
249 | #define EXIT_REASON_MWAIT_INSTRUCTION 36 | 249 | #define EXIT_REASON_MWAIT_INSTRUCTION 36 |
250 | #define EXIT_REASON_MCE_DURING_VMENTRY 41 | ||
250 | #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 | 251 | #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 |
251 | #define EXIT_REASON_APIC_ACCESS 44 | 252 | #define EXIT_REASON_APIC_ACCESS 44 |
252 | #define EXIT_REASON_EPT_VIOLATION 48 | 253 | #define EXIT_REASON_EPT_VIOLATION 48 |
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h index 11b3bb86e17b..7fcf6f3dbcc3 100644 --- a/arch/x86/include/asm/xor.h +++ b/arch/x86/include/asm/xor.h | |||
@@ -1,5 +1,10 @@ | |||
1 | #ifdef CONFIG_KMEMCHECK | ||
2 | /* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ | ||
3 | # include <asm-generic/xor.h> | ||
4 | #else | ||
1 | #ifdef CONFIG_X86_32 | 5 | #ifdef CONFIG_X86_32 |
2 | # include "xor_32.h" | 6 | # include "xor_32.h" |
3 | #else | 7 | #else |
4 | # include "xor_64.h" | 8 | # include "xor_64.h" |
5 | #endif | 9 | #endif |
10 | #endif | ||
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4f78bd682125..f3477bb84566 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -73,7 +73,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o | |||
73 | obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o | 73 | obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o |
74 | obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o | 74 | obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o |
75 | obj-$(CONFIG_KPROBES) += kprobes.o | 75 | obj-$(CONFIG_KPROBES) += kprobes.o |
76 | obj-$(CONFIG_MODULES) += module_$(BITS).o | 76 | obj-$(CONFIG_MODULES) += module.o |
77 | obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o | 77 | obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o |
78 | obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o | 78 | obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o |
79 | obj-$(CONFIG_KGDB) += kgdb.o | 79 | obj-$(CONFIG_KGDB) += kgdb.o |
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 7c243a2c5115..ca93638ba430 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c | |||
@@ -104,7 +104,7 @@ int acpi_save_state_mem(void) | |||
104 | initial_gs = per_cpu_offset(smp_processor_id()); | 104 | initial_gs = per_cpu_offset(smp_processor_id()); |
105 | #endif | 105 | #endif |
106 | initial_code = (unsigned long)wakeup_long64; | 106 | initial_code = (unsigned long)wakeup_long64; |
107 | saved_magic = 0x123456789abcdef0; | 107 | saved_magic = 0x123456789abcdef0L; |
108 | #endif /* CONFIG_64BIT */ | 108 | #endif /* CONFIG_64BIT */ |
109 | 109 | ||
110 | return 0; | 110 | return 0; |
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 076d3881f3da..8c7c042ecad1 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c | |||
@@ -899,7 +899,7 @@ void clear_local_APIC(void) | |||
899 | } | 899 | } |
900 | 900 | ||
901 | /* lets not touch this if we didn't frob it */ | 901 | /* lets not touch this if we didn't frob it */ |
902 | #if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) | 902 | #ifdef CONFIG_X86_THERMAL_VECTOR |
903 | if (maxlvt >= 5) { | 903 | if (maxlvt >= 5) { |
904 | v = apic_read(APIC_LVTTHMR); | 904 | v = apic_read(APIC_LVTTHMR); |
905 | apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); | 905 | apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); |
@@ -2017,7 +2017,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) | |||
2017 | apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); | 2017 | apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); |
2018 | apic_pm_state.apic_tmict = apic_read(APIC_TMICT); | 2018 | apic_pm_state.apic_tmict = apic_read(APIC_TMICT); |
2019 | apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); | 2019 | apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); |
2020 | #if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) | 2020 | #ifdef CONFIG_X86_THERMAL_VECTOR |
2021 | if (maxlvt >= 5) | 2021 | if (maxlvt >= 5) |
2022 | apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); | 2022 | apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); |
2023 | #endif | 2023 | #endif |
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1946fac42ab3..ef8d9290c7ea 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c | |||
@@ -177,16 +177,18 @@ int __init arch_early_irq_init(void) | |||
177 | struct irq_cfg *cfg; | 177 | struct irq_cfg *cfg; |
178 | struct irq_desc *desc; | 178 | struct irq_desc *desc; |
179 | int count; | 179 | int count; |
180 | int node; | ||
180 | int i; | 181 | int i; |
181 | 182 | ||
182 | cfg = irq_cfgx; | 183 | cfg = irq_cfgx; |
183 | count = ARRAY_SIZE(irq_cfgx); | 184 | count = ARRAY_SIZE(irq_cfgx); |
185 | node = cpu_to_node(boot_cpu_id); | ||
184 | 186 | ||
185 | for (i = 0; i < count; i++) { | 187 | for (i = 0; i < count; i++) { |
186 | desc = irq_to_desc(i); | 188 | desc = irq_to_desc(i); |
187 | desc->chip_data = &cfg[i]; | 189 | desc->chip_data = &cfg[i]; |
188 | alloc_bootmem_cpumask_var(&cfg[i].domain); | 190 | zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); |
189 | alloc_bootmem_cpumask_var(&cfg[i].old_domain); | 191 | zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); |
190 | if (i < NR_IRQS_LEGACY) | 192 | if (i < NR_IRQS_LEGACY) |
191 | cpumask_setall(cfg[i].domain); | 193 | cpumask_setall(cfg[i].domain); |
192 | } | 194 | } |
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index a691302dc3ff..b3025b43b63a 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c | |||
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu) | |||
66 | 66 | ||
67 | static inline int mce_in_progress(void) | 67 | static inline int mce_in_progress(void) |
68 | { | 68 | { |
69 | #if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) | 69 | #if defined(CONFIG_X86_NEW_MCE) |
70 | return atomic_read(&mce_entry) > 0; | 70 | return atomic_read(&mce_entry) > 0; |
71 | #endif | 71 | #endif |
72 | return 0; | 72 | return 0; |
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index ef0ae207a7c8..096d19aea2f7 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c | |||
@@ -463,7 +463,7 @@ static void uv_heartbeat(unsigned long ignored) | |||
463 | uv_set_scir_bits(bits); | 463 | uv_set_scir_bits(bits); |
464 | 464 | ||
465 | /* enable next timer period */ | 465 | /* enable next timer period */ |
466 | mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); | 466 | mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL); |
467 | } | 467 | } |
468 | 468 | ||
469 | static void __cpuinit uv_heartbeat_enable(int cpu) | 469 | static void __cpuinit uv_heartbeat_enable(int cpu) |
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 49e0939bac42..79302e9a33a4 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c | |||
@@ -1233,9 +1233,9 @@ static int suspend(int vetoable) | |||
1233 | int err; | 1233 | int err; |
1234 | struct apm_user *as; | 1234 | struct apm_user *as; |
1235 | 1235 | ||
1236 | device_suspend(PMSG_SUSPEND); | 1236 | dpm_suspend_start(PMSG_SUSPEND); |
1237 | 1237 | ||
1238 | device_power_down(PMSG_SUSPEND); | 1238 | dpm_suspend_noirq(PMSG_SUSPEND); |
1239 | 1239 | ||
1240 | local_irq_disable(); | 1240 | local_irq_disable(); |
1241 | sysdev_suspend(PMSG_SUSPEND); | 1241 | sysdev_suspend(PMSG_SUSPEND); |
@@ -1259,9 +1259,9 @@ static int suspend(int vetoable) | |||
1259 | sysdev_resume(); | 1259 | sysdev_resume(); |
1260 | local_irq_enable(); | 1260 | local_irq_enable(); |
1261 | 1261 | ||
1262 | device_power_up(PMSG_RESUME); | 1262 | dpm_resume_noirq(PMSG_RESUME); |
1263 | 1263 | ||
1264 | device_resume(PMSG_RESUME); | 1264 | dpm_resume_end(PMSG_RESUME); |
1265 | queue_event(APM_NORMAL_RESUME, NULL); | 1265 | queue_event(APM_NORMAL_RESUME, NULL); |
1266 | spin_lock(&user_list_lock); | 1266 | spin_lock(&user_list_lock); |
1267 | for (as = user_list; as != NULL; as = as->next) { | 1267 | for (as = user_list; as != NULL; as = as->next) { |
@@ -1277,7 +1277,7 @@ static void standby(void) | |||
1277 | { | 1277 | { |
1278 | int err; | 1278 | int err; |
1279 | 1279 | ||
1280 | device_power_down(PMSG_SUSPEND); | 1280 | dpm_suspend_noirq(PMSG_SUSPEND); |
1281 | 1281 | ||
1282 | local_irq_disable(); | 1282 | local_irq_disable(); |
1283 | sysdev_suspend(PMSG_SUSPEND); | 1283 | sysdev_suspend(PMSG_SUSPEND); |
@@ -1291,7 +1291,7 @@ static void standby(void) | |||
1291 | sysdev_resume(); | 1291 | sysdev_resume(); |
1292 | local_irq_enable(); | 1292 | local_irq_enable(); |
1293 | 1293 | ||
1294 | device_power_up(PMSG_RESUME); | 1294 | dpm_resume_noirq(PMSG_RESUME); |
1295 | } | 1295 | } |
1296 | 1296 | ||
1297 | static apm_event_t get_event(void) | 1297 | static apm_event_t get_event(void) |
@@ -1376,7 +1376,7 @@ static void check_events(void) | |||
1376 | ignore_bounce = 1; | 1376 | ignore_bounce = 1; |
1377 | if ((event != APM_NORMAL_RESUME) | 1377 | if ((event != APM_NORMAL_RESUME) |
1378 | || (ignore_normal_resume == 0)) { | 1378 | || (ignore_normal_resume == 0)) { |
1379 | device_resume(PMSG_RESUME); | 1379 | dpm_resume_end(PMSG_RESUME); |
1380 | queue_event(event, NULL); | 1380 | queue_event(event, NULL); |
1381 | } | 1381 | } |
1382 | ignore_normal_resume = 0; | 1382 | ignore_normal_resume = 0; |
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 1a830cbd7015..dfdbf6403895 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c | |||
@@ -126,6 +126,7 @@ void foo(void) | |||
126 | #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) | 126 | #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) |
127 | BLANK(); | 127 | BLANK(); |
128 | OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); | 128 | OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); |
129 | OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); | ||
129 | OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); | 130 | OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); |
130 | 131 | ||
131 | BLANK(); | 132 | BLANK(); |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3ffdcfa9abdf..9fa33886c0d7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -487,7 +487,6 @@ out: | |||
487 | static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) | 487 | static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) |
488 | { | 488 | { |
489 | char *v = c->x86_vendor_id; | 489 | char *v = c->x86_vendor_id; |
490 | static int printed; | ||
491 | int i; | 490 | int i; |
492 | 491 | ||
493 | for (i = 0; i < X86_VENDOR_NUM; i++) { | 492 | for (i = 0; i < X86_VENDOR_NUM; i++) { |
@@ -504,13 +503,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) | |||
504 | } | 503 | } |
505 | } | 504 | } |
506 | 505 | ||
507 | if (!printed) { | 506 | printk_once(KERN_ERR |
508 | printed++; | 507 | "CPU: vendor_id '%s' unknown, using generic init.\n" \ |
509 | printk(KERN_ERR | 508 | "CPU: Your system may be unstable.\n", v); |
510 | "CPU: vendor_id '%s' unknown, using generic init.\n", v); | ||
511 | |||
512 | printk(KERN_ERR "CPU: Your system may be unstable.\n"); | ||
513 | } | ||
514 | 509 | ||
515 | c->x86_vendor = X86_VENDOR_UNKNOWN; | 510 | c->x86_vendor = X86_VENDOR_UNKNOWN; |
516 | this_cpu = &default_cpu; | 511 | this_cpu = &default_cpu; |
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index daed39ba2614..3260ab044996 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c | |||
@@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) | |||
86 | */ | 86 | */ |
87 | if (c->x86 == 6 && c->x86_model < 15) | 87 | if (c->x86 == 6 && c->x86_model < 15) |
88 | clear_cpu_cap(c, X86_FEATURE_PAT); | 88 | clear_cpu_cap(c, X86_FEATURE_PAT); |
89 | |||
90 | #ifdef CONFIG_KMEMCHECK | ||
91 | /* | ||
92 | * P4s have a "fast strings" feature which causes single- | ||
93 | * stepping REP instructions to only generate a #DB on | ||
94 | * cache-line boundaries. | ||
95 | * | ||
96 | * Ingo Molnar reported a Pentium D (model 6) and a Xeon | ||
97 | * (model 2) with the same problem. | ||
98 | */ | ||
99 | if (c->x86 == 15) { | ||
100 | u64 misc_enable; | ||
101 | |||
102 | rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); | ||
103 | |||
104 | if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { | ||
105 | printk(KERN_INFO "kmemcheck: Disabling fast string operations\n"); | ||
106 | |||
107 | misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; | ||
108 | wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); | ||
109 | } | ||
110 | } | ||
111 | #endif | ||
89 | } | 112 | } |
90 | 113 | ||
91 | #ifdef CONFIG_X86_32 | 114 | #ifdef CONFIG_X86_32 |
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index b2f89829bbe8..45004faf67ea 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile | |||
@@ -1,7 +1,11 @@ | |||
1 | obj-y = mce_$(BITS).o therm_throt.o | 1 | obj-y = mce.o therm_throt.o |
2 | 2 | ||
3 | obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o | 3 | obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o |
4 | obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o | 4 | obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o |
5 | obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o | ||
6 | obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o | ||
7 | obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o | ||
5 | obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o | 8 | obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o |
6 | obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o | 9 | obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o |
7 | obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o | 10 | obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o |
11 | obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o | ||
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index dd3af6e7b39a..89e510424152 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c | |||
@@ -2,11 +2,10 @@ | |||
2 | * Athlon specific Machine Check Exception Reporting | 2 | * Athlon specific Machine Check Exception Reporting |
3 | * (C) Copyright 2002 Dave Jones <davej@redhat.com> | 3 | * (C) Copyright 2002 Dave Jones <davej@redhat.com> |
4 | */ | 4 | */ |
5 | |||
6 | #include <linux/init.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
6 | #include <linux/kernel.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/init.h> | ||
10 | #include <linux/smp.h> | 9 | #include <linux/smp.h> |
11 | 10 | ||
12 | #include <asm/processor.h> | 11 | #include <asm/processor.h> |
@@ -15,12 +14,12 @@ | |||
15 | 14 | ||
16 | #include "mce.h" | 15 | #include "mce.h" |
17 | 16 | ||
18 | /* Machine Check Handler For AMD Athlon/Duron */ | 17 | /* Machine Check Handler For AMD Athlon/Duron: */ |
19 | static void k7_machine_check(struct pt_regs *regs, long error_code) | 18 | static void k7_machine_check(struct pt_regs *regs, long error_code) |
20 | { | 19 | { |
21 | int recover = 1; | ||
22 | u32 alow, ahigh, high, low; | 20 | u32 alow, ahigh, high, low; |
23 | u32 mcgstl, mcgsth; | 21 | u32 mcgstl, mcgsth; |
22 | int recover = 1; | ||
24 | int i; | 23 | int i; |
25 | 24 | ||
26 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | 25 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); |
@@ -32,15 +31,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code) | |||
32 | 31 | ||
33 | for (i = 1; i < nr_mce_banks; i++) { | 32 | for (i = 1; i < nr_mce_banks; i++) { |
34 | rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); | 33 | rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); |
35 | if (high&(1<<31)) { | 34 | if (high & (1<<31)) { |
36 | char misc[20]; | 35 | char misc[20]; |
37 | char addr[24]; | 36 | char addr[24]; |
38 | misc[0] = addr[0] = '\0'; | 37 | |
38 | misc[0] = '\0'; | ||
39 | addr[0] = '\0'; | ||
40 | |||
39 | if (high & (1<<29)) | 41 | if (high & (1<<29)) |
40 | recover |= 1; | 42 | recover |= 1; |
41 | if (high & (1<<25)) | 43 | if (high & (1<<25)) |
42 | recover |= 2; | 44 | recover |= 2; |
43 | high &= ~(1<<31); | 45 | high &= ~(1<<31); |
46 | |||
44 | if (high & (1<<27)) { | 47 | if (high & (1<<27)) { |
45 | rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); | 48 | rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); |
46 | snprintf(misc, 20, "[%08x%08x]", ahigh, alow); | 49 | snprintf(misc, 20, "[%08x%08x]", ahigh, alow); |
@@ -49,27 +52,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code) | |||
49 | rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); | 52 | rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); |
50 | snprintf(addr, 24, " at %08x%08x", ahigh, alow); | 53 | snprintf(addr, 24, " at %08x%08x", ahigh, alow); |
51 | } | 54 | } |
55 | |||
52 | printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", | 56 | printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", |
53 | smp_processor_id(), i, high, low, misc, addr); | 57 | smp_processor_id(), i, high, low, misc, addr); |
54 | /* Clear it */ | 58 | |
59 | /* Clear it: */ | ||
55 | wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); | 60 | wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); |
56 | /* Serialize */ | 61 | /* Serialize: */ |
57 | wmb(); | 62 | wmb(); |
58 | add_taint(TAINT_MACHINE_CHECK); | 63 | add_taint(TAINT_MACHINE_CHECK); |
59 | } | 64 | } |
60 | } | 65 | } |
61 | 66 | ||
62 | if (recover&2) | 67 | if (recover & 2) |
63 | panic("CPU context corrupt"); | 68 | panic("CPU context corrupt"); |
64 | if (recover&1) | 69 | if (recover & 1) |
65 | panic("Unable to continue"); | 70 | panic("Unable to continue"); |
71 | |||
66 | printk(KERN_EMERG "Attempting to continue.\n"); | 72 | printk(KERN_EMERG "Attempting to continue.\n"); |
73 | |||
67 | mcgstl &= ~(1<<2); | 74 | mcgstl &= ~(1<<2); |
68 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | 75 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); |
69 | } | 76 | } |
70 | 77 | ||
71 | 78 | ||
72 | /* AMD K7 machine check is Intel like */ | 79 | /* AMD K7 machine check is Intel like: */ |
73 | void amd_mcheck_init(struct cpuinfo_x86 *c) | 80 | void amd_mcheck_init(struct cpuinfo_x86 *c) |
74 | { | 81 | { |
75 | u32 l, h; | 82 | u32 l, h; |
@@ -79,21 +86,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c) | |||
79 | return; | 86 | return; |
80 | 87 | ||
81 | machine_check_vector = k7_machine_check; | 88 | machine_check_vector = k7_machine_check; |
89 | /* Make sure the vector pointer is visible before we enable MCEs: */ | ||
82 | wmb(); | 90 | wmb(); |
83 | 91 | ||
84 | printk(KERN_INFO "Intel machine check architecture supported.\n"); | 92 | printk(KERN_INFO "Intel machine check architecture supported.\n"); |
93 | |||
85 | rdmsr(MSR_IA32_MCG_CAP, l, h); | 94 | rdmsr(MSR_IA32_MCG_CAP, l, h); |
86 | if (l & (1<<8)) /* Control register present ? */ | 95 | if (l & (1<<8)) /* Control register present ? */ |
87 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | 96 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); |
88 | nr_mce_banks = l & 0xff; | 97 | nr_mce_banks = l & 0xff; |
89 | 98 | ||
90 | /* Clear status for MC index 0 separately, we don't touch CTL, | 99 | /* |
91 | * as some K7 Athlons cause spurious MCEs when its enabled. */ | 100 | * Clear status for MC index 0 separately, we don't touch CTL, |
101 | * as some K7 Athlons cause spurious MCEs when it's enabled: | ||
102 | */ | ||
92 | if (boot_cpu_data.x86 == 6) { | 103 | if (boot_cpu_data.x86 == 6) { |
93 | wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); | 104 | wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); |
94 | i = 1; | 105 | i = 1; |
95 | } else | 106 | } else |
96 | i = 0; | 107 | i = 0; |
108 | |||
97 | for (; i < nr_mce_banks; i++) { | 109 | for (; i < nr_mce_banks; i++) { |
98 | wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); | 110 | wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); |
99 | wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); | 111 | wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); |
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c new file mode 100644 index 000000000000..a3a235a53f09 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c | |||
@@ -0,0 +1,127 @@ | |||
1 | /* | ||
2 | * Machine check injection support. | ||
3 | * Copyright 2008 Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License | ||
7 | * as published by the Free Software Foundation; version 2 | ||
8 | * of the License. | ||
9 | * | ||
10 | * Authors: | ||
11 | * Andi Kleen | ||
12 | * Ying Huang | ||
13 | */ | ||
14 | #include <linux/uaccess.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/timer.h> | ||
17 | #include <linux/kernel.h> | ||
18 | #include <linux/string.h> | ||
19 | #include <linux/fs.h> | ||
20 | #include <linux/smp.h> | ||
21 | #include <asm/mce.h> | ||
22 | |||
23 | /* Update fake mce registers on current CPU. */ | ||
24 | static void inject_mce(struct mce *m) | ||
25 | { | ||
26 | struct mce *i = &per_cpu(injectm, m->extcpu); | ||
27 | |||
28 | /* Make sure no one reads partially written injectm */ | ||
29 | i->finished = 0; | ||
30 | mb(); | ||
31 | m->finished = 0; | ||
32 | /* First set the fields after finished */ | ||
33 | i->extcpu = m->extcpu; | ||
34 | mb(); | ||
35 | /* Now write record in order, finished last (except above) */ | ||
36 | memcpy(i, m, sizeof(struct mce)); | ||
37 | /* Finally activate it */ | ||
38 | mb(); | ||
39 | i->finished = 1; | ||
40 | } | ||
41 | |||
42 | struct delayed_mce { | ||
43 | struct timer_list timer; | ||
44 | struct mce m; | ||
45 | }; | ||
46 | |||
47 | /* Inject mce on current CPU */ | ||
48 | static void raise_mce(unsigned long data) | ||
49 | { | ||
50 | struct delayed_mce *dm = (struct delayed_mce *)data; | ||
51 | struct mce *m = &dm->m; | ||
52 | int cpu = m->extcpu; | ||
53 | |||
54 | inject_mce(m); | ||
55 | if (m->status & MCI_STATUS_UC) { | ||
56 | struct pt_regs regs; | ||
57 | memset(®s, 0, sizeof(struct pt_regs)); | ||
58 | regs.ip = m->ip; | ||
59 | regs.cs = m->cs; | ||
60 | printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); | ||
61 | do_machine_check(®s, 0); | ||
62 | printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); | ||
63 | } else { | ||
64 | mce_banks_t b; | ||
65 | memset(&b, 0xff, sizeof(mce_banks_t)); | ||
66 | printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); | ||
67 | machine_check_poll(0, &b); | ||
68 | mce_notify_irq(); | ||
69 | printk(KERN_INFO "Finished machine check poll on CPU %d\n", | ||
70 | cpu); | ||
71 | } | ||
72 | kfree(dm); | ||
73 | } | ||
74 | |||
75 | /* Error injection interface */ | ||
76 | static ssize_t mce_write(struct file *filp, const char __user *ubuf, | ||
77 | size_t usize, loff_t *off) | ||
78 | { | ||
79 | struct delayed_mce *dm; | ||
80 | struct mce m; | ||
81 | |||
82 | if (!capable(CAP_SYS_ADMIN)) | ||
83 | return -EPERM; | ||
84 | /* | ||
85 | * There are some cases where real MSR reads could slip | ||
86 | * through. | ||
87 | */ | ||
88 | if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) | ||
89 | return -EIO; | ||
90 | |||
91 | if ((unsigned long)usize > sizeof(struct mce)) | ||
92 | usize = sizeof(struct mce); | ||
93 | if (copy_from_user(&m, ubuf, usize)) | ||
94 | return -EFAULT; | ||
95 | |||
96 | if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) | ||
97 | return -EINVAL; | ||
98 | |||
99 | dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); | ||
100 | if (!dm) | ||
101 | return -ENOMEM; | ||
102 | |||
103 | /* | ||
104 | * Need to give user space some time to set everything up, | ||
105 | * so do it a jiffy or two later everywhere. | ||
106 | * Should we use a hrtimer here for better synchronization? | ||
107 | */ | ||
108 | memcpy(&dm->m, &m, sizeof(struct mce)); | ||
109 | setup_timer(&dm->timer, raise_mce, (unsigned long)dm); | ||
110 | dm->timer.expires = jiffies + 2; | ||
111 | add_timer_on(&dm->timer, m.extcpu); | ||
112 | return usize; | ||
113 | } | ||
114 | |||
115 | static int inject_init(void) | ||
116 | { | ||
117 | printk(KERN_INFO "Machine check injector initialized\n"); | ||
118 | mce_chrdev_ops.write = mce_write; | ||
119 | return 0; | ||
120 | } | ||
121 | |||
122 | module_init(inject_init); | ||
123 | /* | ||
124 | * Cannot tolerate unloading currently because we cannot | ||
125 | * guarantee all openers of mce_chrdev will get a reference to us. | ||
126 | */ | ||
127 | MODULE_LICENSE("GPL"); | ||
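With the module loaded, injection is driven from user space by writing a struct mce record to the MCE character device whose write op mce_write() hooks above; CAP_SYS_ADMIN is required. A hedged sketch -- the /dev/mcelog path and user-space visibility of <asm/mce.h> are assumptions here:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <asm/mce.h>            /* struct mce, MCI_STATUS_* (assumed exported) */

    int main(void)
    {
            struct mce m;
            int fd = open("/dev/mcelog", O_RDWR);   /* device path assumed */

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            memset(&m, 0, sizeof(m));
            m.extcpu = 0;                               /* CPU to inject on */
            m.bank   = 1;
            m.status = MCI_STATUS_VAL | MCI_STATUS_EN;  /* corrected error -> poll path */

            if (write(fd, &m, sizeof(m)) != (ssize_t)sizeof(m))
                    perror("write");
            close(fd);
            return 0;
    }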
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h new file mode 100644 index 000000000000..54dcb8ff12e5 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h | |||
@@ -0,0 +1,15 @@ | |||
1 | #include <asm/mce.h> | ||
2 | |||
3 | enum severity_level { | ||
4 | MCE_NO_SEVERITY, | ||
5 | MCE_KEEP_SEVERITY, | ||
6 | MCE_SOME_SEVERITY, | ||
7 | MCE_AO_SEVERITY, | ||
8 | MCE_UC_SEVERITY, | ||
9 | MCE_AR_SEVERITY, | ||
10 | MCE_PANIC_SEVERITY, | ||
11 | }; | ||
12 | |||
13 | int mce_severity(struct mce *a, int tolerant, char **msg); | ||
14 | |||
15 | extern int mce_ser; | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c new file mode 100644 index 000000000000..ff0807f97056 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c | |||
@@ -0,0 +1,218 @@ | |||
1 | /* | ||
2 | * MCE grading rules. | ||
3 | * Copyright 2008, 2009 Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License | ||
7 | * as published by the Free Software Foundation; version 2 | ||
8 | * of the License. | ||
9 | * | ||
10 | * Author: Andi Kleen | ||
11 | */ | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/seq_file.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/debugfs.h> | ||
16 | #include <asm/mce.h> | ||
17 | |||
18 | #include "mce-internal.h" | ||
19 | |||
20 | /* | ||
21 | * Grade an mce by severity. In general the most severe ones are processed | ||
22 | * first. Since there are quite a lot of combinations test the bits in a | ||
23 | * table-driven way. The rules are simply processed in order, first | ||
24 | * match wins. | ||
25 | * | ||
26 | * Note this is only used for machine check exceptions, the corrected | ||
27 | * errors use much simpler rules. The exceptions still check for the corrected | ||
28 | * errors, but only to leave them alone for the CMCI handler (except for | ||
29 | * panic situations) | ||
30 | */ | ||
31 | |||
32 | enum context { IN_KERNEL = 1, IN_USER = 2 }; | ||
33 | enum ser { SER_REQUIRED = 1, NO_SER = 2 }; | ||
34 | |||
35 | static struct severity { | ||
36 | u64 mask; | ||
37 | u64 result; | ||
38 | unsigned char sev; | ||
39 | unsigned char mcgmask; | ||
40 | unsigned char mcgres; | ||
41 | unsigned char ser; | ||
42 | unsigned char context; | ||
43 | unsigned char covered; | ||
44 | char *msg; | ||
45 | } severities[] = { | ||
46 | #define KERNEL .context = IN_KERNEL | ||
47 | #define USER .context = IN_USER | ||
48 | #define SER .ser = SER_REQUIRED | ||
49 | #define NOSER .ser = NO_SER | ||
50 | #define SEV(s) .sev = MCE_ ## s ## _SEVERITY | ||
51 | #define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } | ||
52 | #define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } | ||
53 | #define MCGMASK(x, res, s, m, r...) \ | ||
54 | { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } | ||
55 | #define MASK(x, y, s, m, r...) \ | ||
56 | { .mask = x, .result = y, SEV(s), .msg = m, ## r } | ||
57 | #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) | ||
58 | #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) | ||
59 | #define MCACOD 0xffff | ||
60 | |||
61 | BITCLR(MCI_STATUS_VAL, NO, "Invalid"), | ||
62 | BITCLR(MCI_STATUS_EN, NO, "Not enabled"), | ||
63 | BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), | ||
64 | /* When MCIP is not set something is very confused */ | ||
65 | MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), | ||
66 | /* Neither return nor error IP -- no chance to recover -> PANIC */ | ||
67 | MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, | ||
68 | "Neither restart nor error IP"), | ||
69 | MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", | ||
70 | KERNEL), | ||
71 | BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), | ||
72 | MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, | ||
73 | "Spurious not enabled", SER), | ||
74 | |||
75 | /* ignore OVER for UCNA */ | ||
76 | MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, | ||
77 | "Uncorrected no action required", SER), | ||
78 | MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, | ||
79 | "Illegal combination (UCNA with AR=1)", SER), | ||
80 | MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), | ||
81 | |||
82 | /* AR add known MCACODs here */ | ||
83 | MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, | ||
84 | "Action required with lost events", SER), | ||
85 | MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, | ||
86 | "Action required; unknown MCACOD", SER), | ||
87 | |||
88 | /* known AO MCACODs: */ | ||
89 | MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, | ||
90 | "Action optional: memory scrubbing error", SER), | ||
91 | MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, | ||
92 | "Action optional: last level cache writeback error", SER), | ||
93 | |||
94 | MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, | ||
95 | "Action optional unknown MCACOD", SER), | ||
96 | MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, | ||
97 | "Action optional with lost events", SER), | ||
98 | BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), | ||
99 | BITSET(MCI_STATUS_UC, UC, "Uncorrected"), | ||
100 | BITSET(0, SOME, "No match") /* always matches. keep at end */ | ||
101 | }; | ||
102 | |||
103 | /* | ||
104 | * If the EIPV bit is set, it means the saved IP is the | ||
105 | * instruction which caused the MCE. | ||
106 | */ | ||
107 | static int error_context(struct mce *m) | ||
108 | { | ||
109 | if (m->mcgstatus & MCG_STATUS_EIPV) | ||
110 | return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL; | ||
111 | /* Unknown, assume kernel */ | ||
112 | return IN_KERNEL; | ||
113 | } | ||
114 | |||
115 | int mce_severity(struct mce *a, int tolerant, char **msg) | ||
116 | { | ||
117 | enum context ctx = error_context(a); | ||
118 | struct severity *s; | ||
119 | |||
120 | for (s = severities;; s++) { | ||
121 | if ((a->status & s->mask) != s->result) | ||
122 | continue; | ||
123 | if ((a->mcgstatus & s->mcgmask) != s->mcgres) | ||
124 | continue; | ||
125 | if (s->ser == SER_REQUIRED && !mce_ser) | ||
126 | continue; | ||
127 | if (s->ser == NO_SER && mce_ser) | ||
128 | continue; | ||
129 | if (s->context && ctx != s->context) | ||
130 | continue; | ||
131 | if (msg) | ||
132 | *msg = s->msg; | ||
133 | s->covered = 1; | ||
134 | if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { | ||
135 | if (panic_on_oops || tolerant < 1) | ||
136 | return MCE_PANIC_SEVERITY; | ||
137 | } | ||
138 | return s->sev; | ||
139 | } | ||
140 | } | ||
141 | |||
142 | static void *s_start(struct seq_file *f, loff_t *pos) | ||
143 | { | ||
144 | if (*pos >= ARRAY_SIZE(severities)) | ||
145 | return NULL; | ||
146 | return &severities[*pos]; | ||
147 | } | ||
148 | |||
149 | static void *s_next(struct seq_file *f, void *data, loff_t *pos) | ||
150 | { | ||
151 | if (++(*pos) >= ARRAY_SIZE(severities)) | ||
152 | return NULL; | ||
153 | return &severities[*pos]; | ||
154 | } | ||
155 | |||
156 | static void s_stop(struct seq_file *f, void *data) | ||
157 | { | ||
158 | } | ||
159 | |||
160 | static int s_show(struct seq_file *f, void *data) | ||
161 | { | ||
162 | struct severity *ser = data; | ||
163 | seq_printf(f, "%d\t%s\n", ser->covered, ser->msg); | ||
164 | return 0; | ||
165 | } | ||
166 | |||
167 | static const struct seq_operations severities_seq_ops = { | ||
168 | .start = s_start, | ||
169 | .next = s_next, | ||
170 | .stop = s_stop, | ||
171 | .show = s_show, | ||
172 | }; | ||
173 | |||
174 | static int severities_coverage_open(struct inode *inode, struct file *file) | ||
175 | { | ||
176 | return seq_open(file, &severities_seq_ops); | ||
177 | } | ||
178 | |||
179 | static ssize_t severities_coverage_write(struct file *file, | ||
180 | const char __user *ubuf, | ||
181 | size_t count, loff_t *ppos) | ||
182 | { | ||
183 | int i; | ||
184 | for (i = 0; i < ARRAY_SIZE(severities); i++) | ||
185 | severities[i].covered = 0; | ||
186 | return count; | ||
187 | } | ||
188 | |||
189 | static const struct file_operations severities_coverage_fops = { | ||
190 | .open = severities_coverage_open, | ||
191 | .release = seq_release, | ||
192 | .read = seq_read, | ||
193 | .write = severities_coverage_write, | ||
194 | }; | ||
195 | |||
196 | static int __init severities_debugfs_init(void) | ||
197 | { | ||
198 | struct dentry *dmce = NULL, *fseverities_coverage = NULL; | ||
199 | |||
200 | dmce = debugfs_create_dir("mce", NULL); | ||
201 | if (dmce == NULL) | ||
202 | goto err_out; | ||
203 | fseverities_coverage = debugfs_create_file("severities-coverage", | ||
204 | 0444, dmce, NULL, | ||
205 | &severities_coverage_fops); | ||
206 | if (fseverities_coverage == NULL) | ||
207 | goto err_out; | ||
208 | |||
209 | return 0; | ||
210 | |||
211 | err_out: | ||
212 | if (fseverities_coverage) | ||
213 | debugfs_remove(fseverities_coverage); | ||
214 | if (dmce) | ||
215 | debugfs_remove(dmce); | ||
216 | return -ENOMEM; | ||
217 | } | ||
218 | late_initcall(severities_debugfs_init); | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c new file mode 100644 index 000000000000..fabba15e4558 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -0,0 +1,1964 @@ | |||
1 | /* | ||
2 | * Machine check handler. | ||
3 | * | ||
4 | * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
5 | * Rest from unknown author(s). | ||
6 | * 2004 Andi Kleen. Rewrote most of it. | ||
7 | * Copyright 2008 Intel Corporation | ||
8 | * Author: Andi Kleen | ||
9 | */ | ||
10 | #include <linux/thread_info.h> | ||
11 | #include <linux/capability.h> | ||
12 | #include <linux/miscdevice.h> | ||
13 | #include <linux/interrupt.h> | ||
14 | #include <linux/ratelimit.h> | ||
15 | #include <linux/kallsyms.h> | ||
16 | #include <linux/rcupdate.h> | ||
17 | #include <linux/kobject.h> | ||
18 | #include <linux/uaccess.h> | ||
19 | #include <linux/kdebug.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/percpu.h> | ||
22 | #include <linux/string.h> | ||
23 | #include <linux/sysdev.h> | ||
24 | #include <linux/delay.h> | ||
25 | #include <linux/ctype.h> | ||
26 | #include <linux/sched.h> | ||
27 | #include <linux/sysfs.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/init.h> | ||
30 | #include <linux/kmod.h> | ||
31 | #include <linux/poll.h> | ||
32 | #include <linux/nmi.h> | ||
33 | #include <linux/cpu.h> | ||
34 | #include <linux/smp.h> | ||
35 | #include <linux/fs.h> | ||
36 | #include <linux/mm.h> | ||
37 | |||
38 | #include <asm/processor.h> | ||
39 | #include <asm/hw_irq.h> | ||
40 | #include <asm/apic.h> | ||
41 | #include <asm/idle.h> | ||
42 | #include <asm/ipi.h> | ||
43 | #include <asm/mce.h> | ||
44 | #include <asm/msr.h> | ||
45 | |||
46 | #include "mce-internal.h" | ||
47 | #include "mce.h" | ||
48 | |||
49 | /* Handle unconfigured int18 (should never happen) */ | ||
50 | static void unexpected_machine_check(struct pt_regs *regs, long error_code) | ||
51 | { | ||
52 | printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", | ||
53 | smp_processor_id()); | ||
54 | } | ||
55 | |||
56 | /* Call the installed machine check handler for this CPU setup. */ | ||
57 | void (*machine_check_vector)(struct pt_regs *, long error_code) = | ||
58 | unexpected_machine_check; | ||
59 | |||
60 | int mce_disabled; | ||
61 | |||
62 | #ifdef CONFIG_X86_NEW_MCE | ||
63 | |||
64 | #define MISC_MCELOG_MINOR 227 | ||
65 | |||
66 | #define SPINUNIT 100 /* 100ns */ | ||
67 | |||
68 | atomic_t mce_entry; | ||
69 | |||
70 | DEFINE_PER_CPU(unsigned, mce_exception_count); | ||
71 | |||
72 | /* | ||
73 | * Tolerant levels: | ||
74 | * 0: always panic on uncorrected errors, log corrected errors | ||
75 | * 1: panic or SIGBUS on uncorrected errors, log corrected errors | ||
76 | * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors | ||
77 | * 3: never panic or SIGBUS, log all errors (for testing only) | ||
78 | */ | ||
79 | static int tolerant = 1; | ||
80 | static int banks; | ||
81 | static u64 *bank; | ||
82 | static unsigned long notify_user; | ||
83 | static int rip_msr; | ||
84 | static int mce_bootlog = -1; | ||
85 | static int monarch_timeout = -1; | ||
86 | static int mce_panic_timeout; | ||
87 | static int mce_dont_log_ce; | ||
88 | int mce_cmci_disabled; | ||
89 | int mce_ignore_ce; | ||
90 | int mce_ser; | ||
91 | |||
92 | static char trigger[128]; | ||
93 | static char *trigger_argv[2] = { trigger, NULL }; | ||
94 | |||
95 | static unsigned long dont_init_banks; | ||
96 | |||
97 | static DECLARE_WAIT_QUEUE_HEAD(mce_wait); | ||
98 | static DEFINE_PER_CPU(struct mce, mces_seen); | ||
99 | static int cpu_missing; | ||
100 | |||
101 | |||
102 | /* MCA banks polled by the period polling timer for corrected events */ | ||
103 | DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { | ||
104 | [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL | ||
105 | }; | ||
106 | |||
107 | static inline int skip_bank_init(int i) | ||
108 | { | ||
109 | return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); | ||
110 | } | ||
111 | |||
112 | static DEFINE_PER_CPU(struct work_struct, mce_work); | ||
113 | |||
114 | /* Do initial initialization of a struct mce */ | ||
115 | void mce_setup(struct mce *m) | ||
116 | { | ||
117 | memset(m, 0, sizeof(struct mce)); | ||
118 | m->cpu = m->extcpu = smp_processor_id(); | ||
119 | rdtscll(m->tsc); | ||
120 | /* We hope get_seconds stays lockless */ | ||
121 | m->time = get_seconds(); | ||
122 | m->cpuvendor = boot_cpu_data.x86_vendor; | ||
123 | m->cpuid = cpuid_eax(1); | ||
124 | #ifdef CONFIG_SMP | ||
125 | m->socketid = cpu_data(m->extcpu).phys_proc_id; | ||
126 | #endif | ||
127 | m->apicid = cpu_data(m->extcpu).initial_apicid; | ||
128 | rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); | ||
129 | } | ||
130 | |||
131 | DEFINE_PER_CPU(struct mce, injectm); | ||
132 | EXPORT_PER_CPU_SYMBOL_GPL(injectm); | ||
133 | |||
134 | /* | ||
135 | * Lockless MCE logging infrastructure. | ||
136 | * This avoids deadlocks on printk locks without having to break locks. Also | ||
137 | * separate MCEs from kernel messages to avoid bogus bug reports. | ||
138 | */ | ||
139 | |||
140 | static struct mce_log mcelog = { | ||
141 | .signature = MCE_LOG_SIGNATURE, | ||
142 | .len = MCE_LOG_LEN, | ||
143 | .recordlen = sizeof(struct mce), | ||
144 | }; | ||
145 | |||
146 | void mce_log(struct mce *mce) | ||
147 | { | ||
148 | unsigned next, entry; | ||
149 | |||
150 | mce->finished = 0; | ||
151 | wmb(); | ||
152 | for (;;) { | ||
153 | entry = rcu_dereference(mcelog.next); | ||
154 | for (;;) { | ||
155 | /* | ||
156 | * When the buffer fills up discard new entries. | ||
157 | * Assume that the earlier errors are the more | ||
158 | * interesting ones: | ||
159 | */ | ||
160 | if (entry >= MCE_LOG_LEN) { | ||
161 | set_bit(MCE_OVERFLOW, | ||
162 | (unsigned long *)&mcelog.flags); | ||
163 | return; | ||
164 | } | ||
165 | /* Old left over entry. Skip: */ | ||
166 | if (mcelog.entry[entry].finished) { | ||
167 | entry++; | ||
168 | continue; | ||
169 | } | ||
170 | break; | ||
171 | } | ||
172 | smp_rmb(); | ||
173 | next = entry + 1; | ||
174 | if (cmpxchg(&mcelog.next, entry, next) == entry) | ||
175 | break; | ||
176 | } | ||
177 | memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); | ||
178 | wmb(); | ||
179 | mcelog.entry[entry].finished = 1; | ||
180 | wmb(); | ||
181 | |||
182 | mce->finished = 1; | ||
183 | set_bit(0, ¬ify_user); | ||
184 | } | ||
185 | |||
186 | static void print_mce(struct mce *m) | ||
187 | { | ||
188 | printk(KERN_EMERG | ||
189 | "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", | ||
190 | m->extcpu, m->mcgstatus, m->bank, m->status); | ||
191 | if (m->ip) { | ||
192 | printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", | ||
193 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", | ||
194 | m->cs, m->ip); | ||
195 | if (m->cs == __KERNEL_CS) | ||
196 | print_symbol("{%s}", m->ip); | ||
197 | printk("\n"); | ||
198 | } | ||
199 | printk(KERN_EMERG "TSC %llx ", m->tsc); | ||
200 | if (m->addr) | ||
201 | printk("ADDR %llx ", m->addr); | ||
202 | if (m->misc) | ||
203 | printk("MISC %llx ", m->misc); | ||
204 | printk("\n"); | ||
205 | printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", | ||
206 | m->cpuvendor, m->cpuid, m->time, m->socketid, | ||
207 | m->apicid); | ||
208 | } | ||
209 | |||
210 | static void print_mce_head(void) | ||
211 | { | ||
212 | printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); | ||
213 | } | ||
214 | |||
215 | static void print_mce_tail(void) | ||
216 | { | ||
217 | printk(KERN_EMERG "This is not a software problem!\n" | ||
218 | KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); | ||
219 | } | ||
220 | |||
221 | #define PANIC_TIMEOUT 5 /* 5 seconds */ | ||
222 | |||
223 | static atomic_t mce_paniced; | ||
224 | |||
225 | /* Panic in progress. Enable interrupts and wait for final IPI */ | ||
226 | static void wait_for_panic(void) | ||
227 | { | ||
228 | long timeout = PANIC_TIMEOUT*USEC_PER_SEC; | ||
229 | preempt_disable(); | ||
230 | local_irq_enable(); | ||
231 | while (timeout-- > 0) | ||
232 | udelay(1); | ||
233 | if (panic_timeout == 0) | ||
234 | panic_timeout = mce_panic_timeout; | ||
235 | panic("Panicking machine check CPU died"); | ||
236 | } | ||
237 | |||
238 | static void mce_panic(char *msg, struct mce *final, char *exp) | ||
239 | { | ||
240 | int i; | ||
241 | |||
242 | /* | ||
243 | * Make sure only one CPU runs in machine check panic | ||
244 | */ | ||
245 | if (atomic_add_return(1, &mce_paniced) > 1) | ||
246 | wait_for_panic(); | ||
247 | barrier(); | ||
248 | |||
249 | bust_spinlocks(1); | ||
250 | console_verbose(); | ||
251 | print_mce_head(); | ||
252 | /* First print corrected ones that are still unlogged */ | ||
253 | for (i = 0; i < MCE_LOG_LEN; i++) { | ||
254 | struct mce *m = &mcelog.entry[i]; | ||
255 | if (!(m->status & MCI_STATUS_VAL)) | ||
256 | continue; | ||
257 | if (!(m->status & MCI_STATUS_UC)) | ||
258 | print_mce(m); | ||
259 | } | ||
260 | /* Now print uncorrected but with the final one last */ | ||
261 | for (i = 0; i < MCE_LOG_LEN; i++) { | ||
262 | struct mce *m = &mcelog.entry[i]; | ||
263 | if (!(m->status & MCI_STATUS_VAL)) | ||
264 | continue; | ||
265 | if (!(m->status & MCI_STATUS_UC)) | ||
266 | continue; | ||
267 | if (!final || memcmp(m, final, sizeof(struct mce))) | ||
268 | print_mce(m); | ||
269 | } | ||
270 | if (final) | ||
271 | print_mce(final); | ||
272 | if (cpu_missing) | ||
273 | printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); | ||
274 | print_mce_tail(); | ||
275 | if (exp) | ||
276 | printk(KERN_EMERG "Machine check: %s\n", exp); | ||
277 | if (panic_timeout == 0) | ||
278 | panic_timeout = mce_panic_timeout; | ||
279 | panic(msg); | ||
280 | } | ||
281 | |||
282 | /* Support code for software error injection */ | ||
283 | |||
284 | static int msr_to_offset(u32 msr) | ||
285 | { | ||
286 | unsigned bank = __get_cpu_var(injectm.bank); | ||
287 | if (msr == rip_msr) | ||
288 | return offsetof(struct mce, ip); | ||
289 | if (msr == MSR_IA32_MC0_STATUS + bank*4) | ||
290 | return offsetof(struct mce, status); | ||
291 | if (msr == MSR_IA32_MC0_ADDR + bank*4) | ||
292 | return offsetof(struct mce, addr); | ||
293 | if (msr == MSR_IA32_MC0_MISC + bank*4) | ||
294 | return offsetof(struct mce, misc); | ||
295 | if (msr == MSR_IA32_MCG_STATUS) | ||
296 | return offsetof(struct mce, mcgstatus); | ||
297 | return -1; | ||
298 | } | ||
299 | |||
300 | /* MSR access wrappers used for error injection */ | ||
301 | static u64 mce_rdmsrl(u32 msr) | ||
302 | { | ||
303 | u64 v; | ||
304 | if (__get_cpu_var(injectm).finished) { | ||
305 | int offset = msr_to_offset(msr); | ||
306 | if (offset < 0) | ||
307 | return 0; | ||
308 | return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); | ||
309 | } | ||
310 | rdmsrl(msr, v); | ||
311 | return v; | ||
312 | } | ||
313 | |||
314 | static void mce_wrmsrl(u32 msr, u64 v) | ||
315 | { | ||
316 | if (__get_cpu_var(injectm).finished) { | ||
317 | int offset = msr_to_offset(msr); | ||
318 | if (offset >= 0) | ||
319 | *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; | ||
320 | return; | ||
321 | } | ||
322 | wrmsrl(msr, v); | ||
323 | } | ||
324 | |||
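These wrappers mean that, once injectm.finished is set on a CPU, every MSR access in this file is transparently redirected into the per-CPU struct mce instead of the hardware. A hypothetical, heavily simplified sketch of how an injector could use that follows; the in-tree user is mce-inject.c, and the function below with its exact steps is illustration only, not its actual code.

	static void inject_fake_mce(struct mce *m)
	{
		struct pt_regs regs;
		unsigned long flags;

		/* Fabricate register state for the handler's RIP reporting */
		memset(&regs, 0, sizeof(regs));
		regs.ip = m->ip;
		regs.cs = m->cs;

		__get_cpu_var(injectm) = *m;
		/* From here on mce_rdmsrl()/mce_wrmsrl() hit injectm, not the MSRs */
		__get_cpu_var(injectm).finished = 1;

		local_irq_save(flags);
		do_machine_check(&regs, 0);
		local_irq_restore(flags);

		__get_cpu_var(injectm).finished = 0;
	}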
325 | /* | ||
326 | * Simple lockless ring to communicate PFNs from the exception handler to the | ||
327 | * process context work function. This is vastly simplified because there's | ||
328 | * only a single reader and a single writer. | ||
329 | */ | ||
330 | #define MCE_RING_SIZE 16 /* we use one entry less */ | ||
331 | |||
332 | struct mce_ring { | ||
333 | unsigned short start; | ||
334 | unsigned short end; | ||
335 | unsigned long ring[MCE_RING_SIZE]; | ||
336 | }; | ||
337 | static DEFINE_PER_CPU(struct mce_ring, mce_ring); | ||
338 | |||
339 | /* Runs with CPU affinity in workqueue */ | ||
340 | static int mce_ring_empty(void) | ||
341 | { | ||
342 | struct mce_ring *r = &__get_cpu_var(mce_ring); | ||
343 | |||
344 | return r->start == r->end; | ||
345 | } | ||
346 | |||
347 | static int mce_ring_get(unsigned long *pfn) | ||
348 | { | ||
349 | struct mce_ring *r; | ||
350 | int ret = 0; | ||
351 | |||
352 | *pfn = 0; | ||
353 | get_cpu(); | ||
354 | r = &__get_cpu_var(mce_ring); | ||
355 | if (r->start == r->end) | ||
356 | goto out; | ||
357 | *pfn = r->ring[r->start]; | ||
358 | r->start = (r->start + 1) % MCE_RING_SIZE; | ||
359 | ret = 1; | ||
360 | out: | ||
361 | put_cpu(); | ||
362 | return ret; | ||
363 | } | ||
364 | |||
365 | /* Always runs in MCE context with preempt off */ | ||
366 | static int mce_ring_add(unsigned long pfn) | ||
367 | { | ||
368 | struct mce_ring *r = &__get_cpu_var(mce_ring); | ||
369 | unsigned next; | ||
370 | |||
371 | next = (r->end + 1) % MCE_RING_SIZE; | ||
372 | if (next == r->start) | ||
373 | return -1; | ||
374 | r->ring[r->end] = pfn; | ||
375 | wmb(); | ||
376 | r->end = next; | ||
377 | return 0; | ||
378 | } | ||
379 | |||
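The ring above is the classic single-producer/single-consumer scheme: start == end means empty and one slot is always left unused, so a ring of MCE_RING_SIZE entries holds at most MCE_RING_SIZE - 1 PFNs (hence the "we use one entry less" note). A small user-space sketch of the same index arithmetic, for illustration only:

	#include <stdio.h>

	#define MCE_RING_SIZE 16	/* we use one entry less */

	struct ring {
		unsigned short start, end;
		unsigned long entries[MCE_RING_SIZE];
	};

	static int ring_add(struct ring *r, unsigned long pfn)
	{
		unsigned next = (r->end + 1) % MCE_RING_SIZE;

		if (next == r->start)		/* full */
			return -1;
		r->entries[r->end] = pfn;
		r->end = next;
		return 0;
	}

	static int ring_get(struct ring *r, unsigned long *pfn)
	{
		if (r->start == r->end)		/* empty */
			return 0;
		*pfn = r->entries[r->start];
		r->start = (r->start + 1) % MCE_RING_SIZE;
		return 1;
	}

	int main(void)
	{
		struct ring r = { 0, 0, { 0 } };
		unsigned long pfn;
		int i, added = 0;

		for (i = 0; i < 32; i++)	/* only the first 15 adds succeed */
			if (ring_add(&r, 0x1000 + i) == 0)
				added++;
		printf("added %d entries\n", added);
		while (ring_get(&r, &pfn))
			printf("pfn %#lx\n", pfn);
		return 0;
	}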
380 | int mce_available(struct cpuinfo_x86 *c) | ||
381 | { | ||
382 | if (mce_disabled) | ||
383 | return 0; | ||
384 | return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); | ||
385 | } | ||
386 | |||
387 | static void mce_schedule_work(void) | ||
388 | { | ||
389 | if (!mce_ring_empty()) { | ||
390 | struct work_struct *work = &__get_cpu_var(mce_work); | ||
391 | if (!work_pending(work)) | ||
392 | schedule_work(work); | ||
393 | } | ||
394 | } | ||
395 | |||
396 | /* | ||
397 | * Get the address of the instruction at the time of the machine check | ||
398 | * error. | ||
399 | */ | ||
400 | static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) | ||
401 | { | ||
402 | |||
403 | if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { | ||
404 | m->ip = regs->ip; | ||
405 | m->cs = regs->cs; | ||
406 | } else { | ||
407 | m->ip = 0; | ||
408 | m->cs = 0; | ||
409 | } | ||
410 | if (rip_msr) | ||
411 | m->ip = mce_rdmsrl(rip_msr); | ||
412 | } | ||
413 | |||
414 | #ifdef CONFIG_X86_LOCAL_APIC | ||
415 | /* | ||
416 | * Called after interrupts have been re-enabled, when an MCE | ||
417 | * happened during an interrupts-off region | ||
418 | * in the kernel. | ||
419 | */ | ||
420 | asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) | ||
421 | { | ||
422 | ack_APIC_irq(); | ||
423 | exit_idle(); | ||
424 | irq_enter(); | ||
425 | mce_notify_irq(); | ||
426 | mce_schedule_work(); | ||
427 | irq_exit(); | ||
428 | } | ||
429 | #endif | ||
430 | |||
431 | static void mce_report_event(struct pt_regs *regs) | ||
432 | { | ||
433 | if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { | ||
434 | mce_notify_irq(); | ||
435 | /* | ||
436 | * Triggering the work queue here is just an insurance | ||
437 | * policy in case the syscall exit notify handler | ||
438 | * doesn't run soon enough or ends up running on the | ||
439 | * wrong CPU (can happen when audit sleeps) | ||
440 | */ | ||
441 | mce_schedule_work(); | ||
442 | return; | ||
443 | } | ||
444 | |||
445 | #ifdef CONFIG_X86_LOCAL_APIC | ||
446 | /* | ||
447 | * Without APIC do not notify. The event will be picked | ||
448 | * up eventually. | ||
449 | */ | ||
450 | if (!cpu_has_apic) | ||
451 | return; | ||
452 | |||
453 | /* | ||
454 | * When interrupts are disabled we cannot use | ||
455 | * kernel services safely. Trigger a self interrupt | ||
456 | * through the APIC to instead do the notification | ||
457 | * after interrupts are reenabled again. | ||
458 | */ | ||
459 | apic->send_IPI_self(MCE_SELF_VECTOR); | ||
460 | |||
461 | /* | ||
462 | * Wait for idle afterwards so that we don't leave the | ||
463 | * APIC in a non-idle state because the normal APIC writes | ||
464 | * cannot exclude us. | ||
465 | */ | ||
466 | apic_wait_icr_idle(); | ||
467 | #endif | ||
468 | } | ||
469 | |||
470 | DEFINE_PER_CPU(unsigned, mce_poll_count); | ||
471 | |||
472 | /* | ||
473 | * Poll for corrected events or events that happened before reset. | ||
474 | * Those are just logged through /dev/mcelog. | ||
475 | * | ||
476 | * This is executed in standard interrupt context. | ||
477 | * | ||
478 | * Note: the spec recommends panicking for fatal unsignalled | ||
479 | * errors here. However this would be quite problematic -- | ||
480 | * we would need to reimplement the Monarch handling and | ||
481 | * it would mess up the exclusion between the exception handler | ||
482 | * and the poll handler -- so we skip this for now. | ||
483 | * These cases should not happen anyway, or only when the CPU | ||
484 | * is already totally confused. In this case it's likely it will | ||
485 | * not fully execute the machine check handler either. | ||
486 | */ | ||
487 | void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | ||
488 | { | ||
489 | struct mce m; | ||
490 | int i; | ||
491 | |||
492 | __get_cpu_var(mce_poll_count)++; | ||
493 | |||
494 | mce_setup(&m); | ||
495 | |||
496 | m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); | ||
497 | for (i = 0; i < banks; i++) { | ||
498 | if (!bank[i] || !test_bit(i, *b)) | ||
499 | continue; | ||
500 | |||
501 | m.misc = 0; | ||
502 | m.addr = 0; | ||
503 | m.bank = i; | ||
504 | m.tsc = 0; | ||
505 | |||
506 | barrier(); | ||
507 | m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); | ||
508 | if (!(m.status & MCI_STATUS_VAL)) | ||
509 | continue; | ||
510 | |||
511 | /* | ||
512 | * Uncorrected or signalled events are handled by the exception | ||
513 | * handler when it is enabled, so don't process those here. | ||
514 | * | ||
515 | * TBD do the same check for MCI_STATUS_EN here? | ||
516 | */ | ||
517 | if (!(flags & MCP_UC) && | ||
518 | (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) | ||
519 | continue; | ||
520 | |||
521 | if (m.status & MCI_STATUS_MISCV) | ||
522 | m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); | ||
523 | if (m.status & MCI_STATUS_ADDRV) | ||
524 | m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); | ||
525 | |||
526 | if (!(flags & MCP_TIMESTAMP)) | ||
527 | m.tsc = 0; | ||
528 | /* | ||
529 | * Don't get the IP here because it's unlikely to | ||
530 | * have anything to do with the actual error location. | ||
531 | */ | ||
532 | if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { | ||
533 | mce_log(&m); | ||
534 | add_taint(TAINT_MACHINE_CHECK); | ||
535 | } | ||
536 | |||
537 | /* | ||
538 | * Clear state for this bank. | ||
539 | */ | ||
540 | mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
541 | } | ||
542 | |||
543 | /* | ||
544 | * Don't clear MCG_STATUS here because it's only defined for | ||
545 | * exceptions. | ||
546 | */ | ||
547 | |||
548 | sync_core(); | ||
549 | } | ||
550 | EXPORT_SYMBOL_GPL(machine_check_poll); | ||
551 | |||
552 | /* | ||
553 | * Do a quick check if any of the events requires a panic. | ||
554 | * This decides if we keep the events around or clear them. | ||
555 | */ | ||
556 | static int mce_no_way_out(struct mce *m, char **msg) | ||
557 | { | ||
558 | int i; | ||
559 | |||
560 | for (i = 0; i < banks; i++) { | ||
561 | m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); | ||
562 | if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) | ||
563 | return 1; | ||
564 | } | ||
565 | return 0; | ||
566 | } | ||
567 | |||
568 | /* | ||
569 | * Variable to establish order between CPUs while scanning. | ||
571 | * Each CPU spins initially until executing equals its number. | ||
571 | */ | ||
572 | static atomic_t mce_executing; | ||
573 | |||
574 | /* | ||
575 | * Defines order of CPUs on entry. First CPU becomes Monarch. | ||
576 | */ | ||
577 | static atomic_t mce_callin; | ||
578 | |||
579 | /* | ||
580 | * Check if a timeout waiting for other CPUs happened. | ||
581 | */ | ||
582 | static int mce_timed_out(u64 *t) | ||
583 | { | ||
584 | /* | ||
585 | * The others already did panic for some reason. | ||
586 | * Bail out like in a timeout. | ||
587 | * rmb() to tell the compiler that system_state | ||
588 | * might have been modified by someone else. | ||
589 | */ | ||
590 | rmb(); | ||
591 | if (atomic_read(&mce_paniced)) | ||
592 | wait_for_panic(); | ||
593 | if (!monarch_timeout) | ||
594 | goto out; | ||
595 | if ((s64)*t < SPINUNIT) { | ||
596 | /* CHECKME: Make panic default for 1 too? */ | ||
597 | if (tolerant < 1) | ||
598 | mce_panic("Timeout synchronizing machine check over CPUs", | ||
599 | NULL, NULL); | ||
600 | cpu_missing = 1; | ||
601 | return 1; | ||
602 | } | ||
603 | *t -= SPINUNIT; | ||
604 | out: | ||
605 | touch_nmi_watchdog(); | ||
606 | return 0; | ||
607 | } | ||
608 | |||
609 | /* | ||
610 | * The Monarch's reign. The Monarch is the CPU who entered | ||
611 | * the machine check handler first. It waits for the others to | ||
612 | * raise the exception too and then grades them. When any | ||
613 | * error is fatal it panics. Only then does it let the others continue. | ||
614 | * | ||
615 | * The other CPUs entering the MCE handler will be controlled by the | ||
616 | * Monarch. They are called Subjects. | ||
617 | * | ||
618 | * This way we prevent any potential data corruption in an unrecoverable case | ||
619 | * and also make sure that all CPUs' errors are always examined. | ||
620 | * | ||
621 | * Also this detects the case of a machine check event coming from outer | ||
622 | * space (not detected by any CPU). In this case some external agent wants | ||
623 | * us to shut down, so panic too. | ||
624 | * | ||
625 | * The other CPUs might still decide to panic if the handler happens | ||
626 | * in an unrecoverable place, but in this case the system is in a semi-stable | ||
627 | * state and won't corrupt anything by itself. It's ok to let the others | ||
628 | * continue for a bit first. | ||
629 | * | ||
630 | * All the spin loops have timeouts; when a timeout happens a CPU | ||
631 | * typically elects itself to be Monarch. | ||
632 | */ | ||
633 | static void mce_reign(void) | ||
634 | { | ||
635 | int cpu; | ||
636 | struct mce *m = NULL; | ||
637 | int global_worst = 0; | ||
638 | char *msg = NULL; | ||
639 | char *nmsg = NULL; | ||
640 | |||
641 | /* | ||
642 | * This CPU is the Monarch and the other CPUs have run | ||
643 | * through their handlers. | ||
644 | * Grade the severity of the errors of all the CPUs. | ||
645 | */ | ||
646 | for_each_possible_cpu(cpu) { | ||
647 | int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, | ||
648 | &nmsg); | ||
649 | if (severity > global_worst) { | ||
650 | msg = nmsg; | ||
651 | global_worst = severity; | ||
652 | m = &per_cpu(mces_seen, cpu); | ||
653 | } | ||
654 | } | ||
655 | |||
656 | /* | ||
657 | * Cannot recover? Panic here then. | ||
658 | * This dumps all the mces in the log buffer and stops the | ||
659 | * other CPUs. | ||
660 | */ | ||
661 | if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) | ||
662 | mce_panic("Fatal Machine check", m, msg); | ||
663 | |||
664 | /* | ||
665 | * For UC somewhere we let the CPU that detects it handle it. | ||
666 | * We must also let the others continue, otherwise the handling | ||
667 | * CPU could deadlock on a lock. | ||
668 | */ | ||
669 | |||
670 | /* | ||
671 | * No machine check event found. Must be some external | ||
672 | * source or one CPU is hung. Panic. | ||
673 | */ | ||
674 | if (!m && tolerant < 3) | ||
675 | mce_panic("Machine check from unknown source", NULL, NULL); | ||
676 | |||
677 | /* | ||
678 | * Now clear all the mces_seen so that they don't reappear on | ||
679 | * the next mce. | ||
680 | */ | ||
681 | for_each_possible_cpu(cpu) | ||
682 | memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); | ||
683 | } | ||
684 | |||
685 | static atomic_t global_nwo; | ||
686 | |||
687 | /* | ||
688 | * Start of Monarch synchronization. This waits until all CPUs have | ||
689 | * entered the exception handler and then determines if any of them | ||
690 | * saw a fatal event that requires a panic. Then it lets the CPUs | ||
691 | * run their scanning loops in the entry order. | ||
692 | * TBD double check parallel CPU hotunplug | ||
693 | */ | ||
694 | static int mce_start(int no_way_out, int *order) | ||
695 | { | ||
696 | int nwo; | ||
697 | int cpus = num_online_cpus(); | ||
698 | u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; | ||
699 | |||
700 | if (!timeout) { | ||
701 | *order = -1; | ||
702 | return no_way_out; | ||
703 | } | ||
704 | |||
705 | atomic_add(no_way_out, &global_nwo); | ||
706 | |||
707 | /* | ||
708 | * Wait for everyone. | ||
709 | */ | ||
710 | while (atomic_read(&mce_callin) != cpus) { | ||
711 | if (mce_timed_out(&timeout)) { | ||
712 | atomic_set(&global_nwo, 0); | ||
713 | *order = -1; | ||
714 | return no_way_out; | ||
715 | } | ||
716 | ndelay(SPINUNIT); | ||
717 | } | ||
718 | |||
719 | /* | ||
720 | * Cache the global no_way_out state. | ||
721 | */ | ||
722 | nwo = atomic_read(&global_nwo); | ||
723 | |||
724 | /* | ||
725 | * Monarch starts executing now, the others wait. | ||
726 | */ | ||
727 | if (*order == 1) { | ||
728 | atomic_set(&mce_executing, 1); | ||
729 | return nwo; | ||
730 | } | ||
731 | |||
732 | /* | ||
733 | * Now start the scanning loop one by one | ||
734 | * in the original callin order. | ||
735 | * This way, when there are any shared banks, each one will | ||
736 | * only be seen by one CPU before it is cleared, avoiding duplicates. | ||
737 | */ | ||
738 | while (atomic_read(&mce_executing) < *order) { | ||
739 | if (mce_timed_out(&timeout)) { | ||
740 | atomic_set(&global_nwo, 0); | ||
741 | *order = -1; | ||
742 | return no_way_out; | ||
743 | } | ||
744 | ndelay(SPINUNIT); | ||
745 | } | ||
746 | return nwo; | ||
747 | } | ||
748 | |||
749 | /* | ||
750 | * Synchronize between CPUs after main scanning loop. | ||
751 | * This invokes the bulk of the Monarch processing. | ||
752 | */ | ||
753 | static int mce_end(int order) | ||
754 | { | ||
755 | int ret = -1; | ||
756 | u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; | ||
757 | |||
758 | if (!timeout) | ||
759 | goto reset; | ||
760 | if (order < 0) | ||
761 | goto reset; | ||
762 | |||
763 | /* | ||
764 | * Allow others to run. | ||
765 | */ | ||
766 | atomic_inc(&mce_executing); | ||
767 | |||
768 | if (order == 1) { | ||
769 | /* CHECKME: Can this race with a parallel hotplug? */ | ||
770 | int cpus = num_online_cpus(); | ||
771 | |||
772 | /* | ||
773 | * Monarch: Wait for everyone to go through their scanning | ||
774 | * loops. | ||
775 | */ | ||
776 | while (atomic_read(&mce_executing) <= cpus) { | ||
777 | if (mce_timed_out(&timeout)) | ||
778 | goto reset; | ||
779 | ndelay(SPINUNIT); | ||
780 | } | ||
781 | |||
782 | mce_reign(); | ||
783 | barrier(); | ||
784 | ret = 0; | ||
785 | } else { | ||
786 | /* | ||
787 | * Subject: Wait for Monarch to finish. | ||
788 | */ | ||
789 | while (atomic_read(&mce_executing) != 0) { | ||
790 | if (mce_timed_out(&timeout)) | ||
791 | goto reset; | ||
792 | ndelay(SPINUNIT); | ||
793 | } | ||
794 | |||
795 | /* | ||
796 | * Don't reset anything. That's done by the Monarch. | ||
797 | */ | ||
798 | return 0; | ||
799 | } | ||
800 | |||
801 | /* | ||
802 | * Reset all global state. | ||
803 | */ | ||
804 | reset: | ||
805 | atomic_set(&global_nwo, 0); | ||
806 | atomic_set(&mce_callin, 0); | ||
807 | barrier(); | ||
808 | |||
809 | /* | ||
810 | * Let others run again. | ||
811 | */ | ||
812 | atomic_set(&mce_executing, 0); | ||
813 | return ret; | ||
814 | } | ||
815 | |||
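Taken together, mce_start() and mce_end() implement a rendezvous: everyone waits until all CPUs have called in, the scanning loops then run one CPU at a time in callin order, and finally the Monarch grades the results and resets the shared counters while the Subjects wait. A stripped-down user-space sketch of that ordering (no timeouts, no error grading; pthreads stand in for CPUs, purely for illustration):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	#define NCPUS 4

	static atomic_int callin;	/* entry order; the first thread is the Monarch */
	static atomic_int executing;	/* serializes the per-CPU "scanning loops" */

	static void *fake_mce_handler(void *arg)
	{
		int order = atomic_fetch_add(&callin, 1) + 1;

		(void)arg;

		/* mce_start(): wait until every CPU has entered the handler */
		while (atomic_load(&callin) != NCPUS)
			;

		/* Run the scanning loop one CPU at a time, in callin order */
		while (atomic_load(&executing) < order - 1)
			;
		printf("CPU with order %d scans its banks\n", order);
		atomic_fetch_add(&executing, 1);

		/* mce_end(): Monarch grades and resets, Subjects wait for it */
		if (order == 1) {
			while (atomic_load(&executing) < NCPUS)
				;
			printf("Monarch grades the results and resets the state\n");
			atomic_store(&callin, 0);
			atomic_store(&executing, 0);
		} else {
			while (atomic_load(&executing) != 0)
				;
		}
		return NULL;
	}

	int main(void)
	{
		pthread_t t[NCPUS];
		int i;

		for (i = 0; i < NCPUS; i++)
			pthread_create(&t[i], NULL, fake_mce_handler, NULL);
		for (i = 0; i < NCPUS; i++)
			pthread_join(t[i], NULL);
		return 0;
	}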
816 | /* | ||
817 | * Check if the address reported by the CPU is in a format we can parse. | ||
818 | * It would be possible to add code for most other cases, but all would | ||
819 | * be somewhat complicated (e.g. segment offset would require an instruction | ||
820 | * parser). So only support physical addresses up to page granularity for now. | ||
821 | */ | ||
822 | static int mce_usable_address(struct mce *m) | ||
823 | { | ||
824 | if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) | ||
825 | return 0; | ||
826 | if ((m->misc & 0x3f) > PAGE_SHIFT) | ||
827 | return 0; | ||
828 | if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) | ||
829 | return 0; | ||
830 | return 1; | ||
831 | } | ||
832 | |||
833 | static void mce_clear_state(unsigned long *toclear) | ||
834 | { | ||
835 | int i; | ||
836 | |||
837 | for (i = 0; i < banks; i++) { | ||
838 | if (test_bit(i, toclear)) | ||
839 | mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
840 | } | ||
841 | } | ||
842 | |||
843 | /* | ||
844 | * The actual machine check handler. This only handles real | ||
845 | * exceptions when something got corrupted coming in through int 18. | ||
846 | * | ||
847 | * This is executed in NMI context not subject to normal locking rules. This | ||
848 | * implies that most kernel services cannot be safely used. Don't even | ||
849 | * think about putting a printk in there! | ||
850 | * | ||
851 | * On Intel systems this is entered on all CPUs in parallel through | ||
852 | * MCE broadcast. However some CPUs might be broken beyond repair, | ||
853 | * so always be careful when synchronizing with others. | ||
854 | */ | ||
855 | void do_machine_check(struct pt_regs *regs, long error_code) | ||
856 | { | ||
857 | struct mce m, *final; | ||
858 | int i; | ||
859 | int worst = 0; | ||
860 | int severity; | ||
861 | /* | ||
862 | * Establish sequential order between the CPUs entering the machine | ||
863 | * check handler. | ||
864 | */ | ||
865 | int order; | ||
866 | |||
867 | /* | ||
868 | * If no_way_out gets set, there is no safe way to recover from this | ||
869 | * MCE. If tolerant is cranked up, we'll try anyway. | ||
870 | */ | ||
871 | int no_way_out = 0; | ||
872 | /* | ||
873 | * If kill_it gets set, there might be a way to recover from this | ||
874 | * error. | ||
875 | */ | ||
876 | int kill_it = 0; | ||
877 | DECLARE_BITMAP(toclear, MAX_NR_BANKS); | ||
878 | char *msg = "Unknown"; | ||
879 | |||
880 | atomic_inc(&mce_entry); | ||
881 | |||
882 | __get_cpu_var(mce_exception_count)++; | ||
883 | |||
884 | if (notify_die(DIE_NMI, "machine check", regs, error_code, | ||
885 | 18, SIGKILL) == NOTIFY_STOP) | ||
886 | goto out; | ||
887 | if (!banks) | ||
888 | goto out; | ||
889 | |||
890 | order = atomic_add_return(1, &mce_callin); | ||
891 | mce_setup(&m); | ||
892 | |||
893 | m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); | ||
894 | no_way_out = mce_no_way_out(&m, &msg); | ||
895 | |||
896 | final = &__get_cpu_var(mces_seen); | ||
897 | *final = m; | ||
898 | |||
899 | barrier(); | ||
900 | |||
901 | /* | ||
902 | * When no restart IP must always kill or panic. | ||
903 | */ | ||
904 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) | ||
905 | kill_it = 1; | ||
906 | |||
907 | /* | ||
908 | * Go through all the banks in exclusion of the other CPUs. | ||
909 | * This way we don't report duplicated events on shared banks | ||
910 | * because the first one to see it will clear it. | ||
911 | */ | ||
912 | no_way_out = mce_start(no_way_out, &order); | ||
913 | for (i = 0; i < banks; i++) { | ||
914 | __clear_bit(i, toclear); | ||
915 | if (!bank[i]) | ||
916 | continue; | ||
917 | |||
918 | m.misc = 0; | ||
919 | m.addr = 0; | ||
920 | m.bank = i; | ||
921 | |||
922 | m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); | ||
923 | if ((m.status & MCI_STATUS_VAL) == 0) | ||
924 | continue; | ||
925 | |||
926 | /* | ||
927 | * Errors that are not uncorrected (or, with SER, not signalled) are | ||
928 | * handled by machine_check_poll. Leave them alone, unless this panics. | ||
929 | */ | ||
930 | if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && | ||
931 | !no_way_out) | ||
932 | continue; | ||
933 | |||
934 | /* | ||
935 | * Set taint even when machine check was not enabled. | ||
936 | */ | ||
937 | add_taint(TAINT_MACHINE_CHECK); | ||
938 | |||
939 | severity = mce_severity(&m, tolerant, NULL); | ||
940 | |||
941 | /* | ||
942 | * When the machine check was for the corrected handler, don't touch it, | ||
943 | * unless we're panicking. | ||
944 | */ | ||
945 | if (severity == MCE_KEEP_SEVERITY && !no_way_out) | ||
946 | continue; | ||
947 | __set_bit(i, toclear); | ||
948 | if (severity == MCE_NO_SEVERITY) { | ||
949 | /* | ||
950 | * Machine check event was not enabled. Clear, but | ||
951 | * ignore. | ||
952 | */ | ||
953 | continue; | ||
954 | } | ||
955 | |||
956 | /* | ||
957 | * Kill on action required. | ||
958 | */ | ||
959 | if (severity == MCE_AR_SEVERITY) | ||
960 | kill_it = 1; | ||
961 | |||
962 | if (m.status & MCI_STATUS_MISCV) | ||
963 | m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); | ||
964 | if (m.status & MCI_STATUS_ADDRV) | ||
965 | m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); | ||
966 | |||
967 | /* | ||
968 | * Action optional error. Queue address for later processing. | ||
969 | * When the ring overflows we just ignore the AO error. | ||
970 | * RED-PEN add some logging mechanism when | ||
971 | * mce_usable_address or mce_ring_add fails. | ||
972 | * RED-PEN don't ignore overflow for tolerant == 0 | ||
973 | */ | ||
974 | if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) | ||
975 | mce_ring_add(m.addr >> PAGE_SHIFT); | ||
976 | |||
977 | mce_get_rip(&m, regs); | ||
978 | mce_log(&m); | ||
979 | |||
980 | if (severity > worst) { | ||
981 | *final = m; | ||
982 | worst = severity; | ||
983 | } | ||
984 | } | ||
985 | |||
986 | if (!no_way_out) | ||
987 | mce_clear_state(toclear); | ||
988 | |||
989 | /* | ||
990 | * Do most of the synchronization with other CPUs. | ||
991 | * When there's any problem use only local no_way_out state. | ||
992 | */ | ||
993 | if (mce_end(order) < 0) | ||
994 | no_way_out = worst >= MCE_PANIC_SEVERITY; | ||
995 | |||
996 | /* | ||
997 | * If we have decided that we just CAN'T continue, and the user | ||
998 | * has not set tolerant to an insane level, give up and die. | ||
999 | * | ||
1000 | * This is mainly used in the case when the system doesn't | ||
1001 | * support MCE broadcasting or it has been disabled. | ||
1002 | */ | ||
1003 | if (no_way_out && tolerant < 3) | ||
1004 | mce_panic("Fatal machine check on current CPU", final, msg); | ||
1005 | |||
1006 | /* | ||
1007 | * If the error seems to be unrecoverable, something should be | ||
1008 | * done. Try to kill as little as possible. If we can kill just | ||
1009 | * one task, do that. If the user has set the tolerance very | ||
1010 | * high, don't try to do anything at all. | ||
1011 | */ | ||
1012 | |||
1013 | if (kill_it && tolerant < 3) | ||
1014 | force_sig(SIGBUS, current); | ||
1015 | |||
1016 | /* notify userspace ASAP */ | ||
1017 | set_thread_flag(TIF_MCE_NOTIFY); | ||
1018 | |||
1019 | if (worst > 0) | ||
1020 | mce_report_event(regs); | ||
1021 | mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); | ||
1022 | out: | ||
1023 | atomic_dec(&mce_entry); | ||
1024 | sync_core(); | ||
1025 | } | ||
1026 | EXPORT_SYMBOL_GPL(do_machine_check); | ||
1027 | |||
1028 | /* dummy to break dependency. actual code is in mm/memory-failure.c */ | ||
1029 | void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) | ||
1030 | { | ||
1031 | printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); | ||
1032 | } | ||
1033 | |||
1034 | /* | ||
1035 | * Called after mce notification in process context. This code | ||
1036 | * is allowed to sleep. Call the high level VM handler to process | ||
1037 | * any corrupted pages. | ||
1038 | * Assume that the work queue code only calls this one at a time | ||
1039 | * per CPU. | ||
1040 | * Note we don't disable preemption, so this code might run on the wrong | ||
1041 | * CPU. In this case the event is picked up by the scheduled work queue. | ||
1042 | * This is merely a fast path to expedite processing in some common | ||
1043 | * cases. | ||
1044 | */ | ||
1045 | void mce_notify_process(void) | ||
1046 | { | ||
1047 | unsigned long pfn; | ||
1048 | mce_notify_irq(); | ||
1049 | while (mce_ring_get(&pfn)) | ||
1050 | memory_failure(pfn, MCE_VECTOR); | ||
1051 | } | ||
1052 | |||
1053 | static void mce_process_work(struct work_struct *dummy) | ||
1054 | { | ||
1055 | mce_notify_process(); | ||
1056 | } | ||
1057 | |||
1058 | #ifdef CONFIG_X86_MCE_INTEL | ||
1059 | /*** | ||
1060 | * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog | ||
1061 | * @cpu: The CPU on which the event occurred. | ||
1062 | * @status: Event status information | ||
1063 | * | ||
1064 | * This function should be called by the thermal interrupt after the | ||
1065 | * event has been processed and the decision was made to log the event | ||
1066 | * further. | ||
1067 | * | ||
1068 | * The status parameter will be saved to the 'status' field of 'struct mce' | ||
1069 | * and historically has been the register value of the | ||
1070 | * MSR_IA32_THERMAL_STATUS (Intel) msr. | ||
1071 | */ | ||
1072 | void mce_log_therm_throt_event(__u64 status) | ||
1073 | { | ||
1074 | struct mce m; | ||
1075 | |||
1076 | mce_setup(&m); | ||
1077 | m.bank = MCE_THERMAL_BANK; | ||
1078 | m.status = status; | ||
1079 | mce_log(&m); | ||
1080 | } | ||
1081 | #endif /* CONFIG_X86_MCE_INTEL */ | ||
1082 | |||
1083 | /* | ||
1084 | * Periodic polling timer for "silent" machine check errors. If the | ||
1085 | * poller finds an MCE, poll 2x faster. When the poller finds no more | ||
1086 | * errors, poll 2x slower (up to check_interval seconds). | ||
1087 | */ | ||
1088 | static int check_interval = 5 * 60; /* 5 minutes */ | ||
1089 | |||
1090 | static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ | ||
1091 | static DEFINE_PER_CPU(struct timer_list, mce_timer); | ||
1092 | |||
1093 | static void mcheck_timer(unsigned long data) | ||
1094 | { | ||
1095 | struct timer_list *t = &per_cpu(mce_timer, data); | ||
1096 | int *n; | ||
1097 | |||
1098 | WARN_ON(smp_processor_id() != data); | ||
1099 | |||
1100 | if (mce_available(¤t_cpu_data)) { | ||
1101 | machine_check_poll(MCP_TIMESTAMP, | ||
1102 | &__get_cpu_var(mce_poll_banks)); | ||
1103 | } | ||
1104 | |||
1105 | /* | ||
1106 | * Alert userspace if needed. If we logged an MCE, reduce the | ||
1107 | * polling interval, otherwise increase the polling interval. | ||
1108 | */ | ||
1109 | n = &__get_cpu_var(next_interval); | ||
1110 | if (mce_notify_irq()) | ||
1111 | *n = max(*n/2, HZ/100); | ||
1112 | else | ||
1113 | *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); | ||
1114 | |||
1115 | t->expires = jiffies + *n; | ||
1116 | add_timer(t); | ||
1117 | } | ||
1118 | |||
1119 | static void mce_do_trigger(struct work_struct *work) | ||
1120 | { | ||
1121 | call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); | ||
1122 | } | ||
1123 | |||
1124 | static DECLARE_WORK(mce_trigger_work, mce_do_trigger); | ||
1125 | |||
1126 | /* | ||
1127 | * Notify the user(s) about new machine check events. | ||
1128 | * Can be called from interrupt context, but not from machine check/NMI | ||
1129 | * context. | ||
1130 | */ | ||
1131 | int mce_notify_irq(void) | ||
1132 | { | ||
1133 | /* Not more than two messages every minute */ | ||
1134 | static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); | ||
1135 | |||
1136 | clear_thread_flag(TIF_MCE_NOTIFY); | ||
1137 | |||
1138 | if (test_and_clear_bit(0, ¬ify_user)) { | ||
1139 | wake_up_interruptible(&mce_wait); | ||
1140 | |||
1141 | /* | ||
1142 | * There is no risk of missing notifications because | ||
1143 | * work_pending is always cleared before the function is | ||
1144 | * executed. | ||
1145 | */ | ||
1146 | if (trigger[0] && !work_pending(&mce_trigger_work)) | ||
1147 | schedule_work(&mce_trigger_work); | ||
1148 | |||
1149 | if (__ratelimit(&ratelimit)) | ||
1150 | printk(KERN_INFO "Machine check events logged\n"); | ||
1151 | |||
1152 | return 1; | ||
1153 | } | ||
1154 | return 0; | ||
1155 | } | ||
1156 | EXPORT_SYMBOL_GPL(mce_notify_irq); | ||
1157 | |||
1158 | /* | ||
1159 | * Initialize Machine Checks for a CPU. | ||
1160 | */ | ||
1161 | static int mce_cap_init(void) | ||
1162 | { | ||
1163 | unsigned b; | ||
1164 | u64 cap; | ||
1165 | |||
1166 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
1167 | |||
1168 | b = cap & MCG_BANKCNT_MASK; | ||
1169 | printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); | ||
1170 | |||
1171 | if (b > MAX_NR_BANKS) { | ||
1172 | printk(KERN_WARNING | ||
1173 | "MCE: Using only %u machine check banks out of %u\n", | ||
1174 | MAX_NR_BANKS, b); | ||
1175 | b = MAX_NR_BANKS; | ||
1176 | } | ||
1177 | |||
1178 | /* Don't support asymmetric configurations today */ | ||
1179 | WARN_ON(banks != 0 && b != banks); | ||
1180 | banks = b; | ||
1181 | if (!bank) { | ||
1182 | bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); | ||
1183 | if (!bank) | ||
1184 | return -ENOMEM; | ||
1185 | memset(bank, 0xff, banks * sizeof(u64)); | ||
1186 | } | ||
1187 | |||
1188 | /* Use accurate RIP reporting if available. */ | ||
1189 | if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) | ||
1190 | rip_msr = MSR_IA32_MCG_EIP; | ||
1191 | |||
1192 | if (cap & MCG_SER_P) | ||
1193 | mce_ser = 1; | ||
1194 | |||
1195 | return 0; | ||
1196 | } | ||
1197 | |||
1198 | static void mce_init(void) | ||
1199 | { | ||
1200 | mce_banks_t all_banks; | ||
1201 | u64 cap; | ||
1202 | int i; | ||
1203 | |||
1204 | /* | ||
1205 | * Log the machine checks left over from the previous reset. | ||
1206 | */ | ||
1207 | bitmap_fill(all_banks, MAX_NR_BANKS); | ||
1208 | machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); | ||
1209 | |||
1210 | set_in_cr4(X86_CR4_MCE); | ||
1211 | |||
1212 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
1213 | if (cap & MCG_CTL_P) | ||
1214 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | ||
1215 | |||
1216 | for (i = 0; i < banks; i++) { | ||
1217 | if (skip_bank_init(i)) | ||
1218 | continue; | ||
1219 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); | ||
1220 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
1221 | } | ||
1222 | } | ||
1223 | |||
1224 | /* Add per CPU specific workarounds here */ | ||
1225 | static void mce_cpu_quirks(struct cpuinfo_x86 *c) | ||
1226 | { | ||
1227 | /* This should be disabled by the BIOS, but isn't always */ | ||
1228 | if (c->x86_vendor == X86_VENDOR_AMD) { | ||
1229 | if (c->x86 == 15 && banks > 4) { | ||
1230 | /* | ||
1231 | * disable GART TBL walk error reporting, which | ||
1232 | * trips off incorrectly with the IOMMU & 3ware | ||
1233 | * & Cerberus: | ||
1234 | */ | ||
1235 | clear_bit(10, (unsigned long *)&bank[4]); | ||
1236 | } | ||
1237 | if (c->x86 <= 17 && mce_bootlog < 0) { | ||
1238 | /* | ||
1239 | * Lots of broken BIOSes around that don't clear them | ||
1240 | * by default and leave crap in there. Don't log: | ||
1241 | */ | ||
1242 | mce_bootlog = 0; | ||
1243 | } | ||
1244 | /* | ||
1245 | * Various K7s with broken bank 0 around. Always disable | ||
1246 | * by default. | ||
1247 | */ | ||
1248 | if (c->x86 == 6) | ||
1249 | bank[0] = 0; | ||
1250 | } | ||
1251 | |||
1252 | if (c->x86_vendor == X86_VENDOR_INTEL) { | ||
1253 | /* | ||
1254 | * The SDM documents that on family 6 bank 0 should not be written | ||
1255 | * because it aliases to another special BIOS controlled | ||
1256 | * register. | ||
1257 | * But it's not aliased anymore on model 0x1a+. | ||
1258 | * Don't ignore bank 0 completely because there could be a | ||
1259 | * valid event later, merely don't write CTL0. | ||
1260 | */ | ||
1261 | |||
1262 | if (c->x86 == 6 && c->x86_model < 0x1A) | ||
1263 | __set_bit(0, &dont_init_banks); | ||
1264 | |||
1265 | /* | ||
1266 | * All newer Intel systems support MCE broadcasting. Enable | ||
1267 | * synchronization with a one second timeout. | ||
1268 | */ | ||
1269 | if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && | ||
1270 | monarch_timeout < 0) | ||
1271 | monarch_timeout = USEC_PER_SEC; | ||
1272 | } | ||
1273 | if (monarch_timeout < 0) | ||
1274 | monarch_timeout = 0; | ||
1275 | if (mce_bootlog != 0) | ||
1276 | mce_panic_timeout = 30; | ||
1277 | } | ||
1278 | |||
1279 | static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) | ||
1280 | { | ||
1281 | if (c->x86 != 5) | ||
1282 | return; | ||
1283 | switch (c->x86_vendor) { | ||
1284 | case X86_VENDOR_INTEL: | ||
1285 | if (mce_p5_enabled()) | ||
1286 | intel_p5_mcheck_init(c); | ||
1287 | break; | ||
1288 | case X86_VENDOR_CENTAUR: | ||
1289 | winchip_mcheck_init(c); | ||
1290 | break; | ||
1291 | } | ||
1292 | } | ||
1293 | |||
1294 | static void mce_cpu_features(struct cpuinfo_x86 *c) | ||
1295 | { | ||
1296 | switch (c->x86_vendor) { | ||
1297 | case X86_VENDOR_INTEL: | ||
1298 | mce_intel_feature_init(c); | ||
1299 | break; | ||
1300 | case X86_VENDOR_AMD: | ||
1301 | mce_amd_feature_init(c); | ||
1302 | break; | ||
1303 | default: | ||
1304 | break; | ||
1305 | } | ||
1306 | } | ||
1307 | |||
1308 | static void mce_init_timer(void) | ||
1309 | { | ||
1310 | struct timer_list *t = &__get_cpu_var(mce_timer); | ||
1311 | int *n = &__get_cpu_var(next_interval); | ||
1312 | |||
1313 | if (mce_ignore_ce) | ||
1314 | return; | ||
1315 | |||
1316 | *n = check_interval * HZ; | ||
1317 | if (!*n) | ||
1318 | return; | ||
1319 | setup_timer(t, mcheck_timer, smp_processor_id()); | ||
1320 | t->expires = round_jiffies(jiffies + *n); | ||
1321 | add_timer(t); | ||
1322 | } | ||
1323 | |||
1324 | /* | ||
1325 | * Called for each booted CPU to set up machine checks. | ||
1326 | * Must be called with preempt off: | ||
1327 | */ | ||
1328 | void __cpuinit mcheck_init(struct cpuinfo_x86 *c) | ||
1329 | { | ||
1330 | if (mce_disabled) | ||
1331 | return; | ||
1332 | |||
1333 | mce_ancient_init(c); | ||
1334 | |||
1335 | if (!mce_available(c)) | ||
1336 | return; | ||
1337 | |||
1338 | if (mce_cap_init() < 0) { | ||
1339 | mce_disabled = 1; | ||
1340 | return; | ||
1341 | } | ||
1342 | mce_cpu_quirks(c); | ||
1343 | |||
1344 | machine_check_vector = do_machine_check; | ||
1345 | |||
1346 | mce_init(); | ||
1347 | mce_cpu_features(c); | ||
1348 | mce_init_timer(); | ||
1349 | INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); | ||
1350 | } | ||
1351 | |||
1352 | /* | ||
1353 | * Character device to read and clear the MCE log. | ||
1354 | */ | ||
1355 | |||
1356 | static DEFINE_SPINLOCK(mce_state_lock); | ||
1357 | static int open_count; /* #times opened */ | ||
1358 | static int open_exclu; /* already open exclusive? */ | ||
1359 | |||
1360 | static int mce_open(struct inode *inode, struct file *file) | ||
1361 | { | ||
1362 | spin_lock(&mce_state_lock); | ||
1363 | |||
1364 | if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { | ||
1365 | spin_unlock(&mce_state_lock); | ||
1366 | |||
1367 | return -EBUSY; | ||
1368 | } | ||
1369 | |||
1370 | if (file->f_flags & O_EXCL) | ||
1371 | open_exclu = 1; | ||
1372 | open_count++; | ||
1373 | |||
1374 | spin_unlock(&mce_state_lock); | ||
1375 | |||
1376 | return nonseekable_open(inode, file); | ||
1377 | } | ||
1378 | |||
1379 | static int mce_release(struct inode *inode, struct file *file) | ||
1380 | { | ||
1381 | spin_lock(&mce_state_lock); | ||
1382 | |||
1383 | open_count--; | ||
1384 | open_exclu = 0; | ||
1385 | |||
1386 | spin_unlock(&mce_state_lock); | ||
1387 | |||
1388 | return 0; | ||
1389 | } | ||
1390 | |||
1391 | static void collect_tscs(void *data) | ||
1392 | { | ||
1393 | unsigned long *cpu_tsc = (unsigned long *)data; | ||
1394 | |||
1395 | rdtscll(cpu_tsc[smp_processor_id()]); | ||
1396 | } | ||
1397 | |||
1398 | static DEFINE_MUTEX(mce_read_mutex); | ||
1399 | |||
1400 | static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, | ||
1401 | loff_t *off) | ||
1402 | { | ||
1403 | char __user *buf = ubuf; | ||
1404 | unsigned long *cpu_tsc; | ||
1405 | unsigned prev, next; | ||
1406 | int i, err; | ||
1407 | |||
1408 | cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); | ||
1409 | if (!cpu_tsc) | ||
1410 | return -ENOMEM; | ||
1411 | |||
1412 | mutex_lock(&mce_read_mutex); | ||
1413 | next = rcu_dereference(mcelog.next); | ||
1414 | |||
1415 | /* Only supports full reads right now */ | ||
1416 | if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { | ||
1417 | mutex_unlock(&mce_read_mutex); | ||
1418 | kfree(cpu_tsc); | ||
1419 | |||
1420 | return -EINVAL; | ||
1421 | } | ||
1422 | |||
1423 | err = 0; | ||
1424 | prev = 0; | ||
1425 | do { | ||
1426 | for (i = prev; i < next; i++) { | ||
1427 | unsigned long start = jiffies; | ||
1428 | |||
1429 | while (!mcelog.entry[i].finished) { | ||
1430 | if (time_after_eq(jiffies, start + 2)) { | ||
1431 | memset(mcelog.entry + i, 0, | ||
1432 | sizeof(struct mce)); | ||
1433 | goto timeout; | ||
1434 | } | ||
1435 | cpu_relax(); | ||
1436 | } | ||
1437 | smp_rmb(); | ||
1438 | err |= copy_to_user(buf, mcelog.entry + i, | ||
1439 | sizeof(struct mce)); | ||
1440 | buf += sizeof(struct mce); | ||
1441 | timeout: | ||
1442 | ; | ||
1443 | } | ||
1444 | |||
1445 | memset(mcelog.entry + prev, 0, | ||
1446 | (next - prev) * sizeof(struct mce)); | ||
1447 | prev = next; | ||
1448 | next = cmpxchg(&mcelog.next, prev, 0); | ||
1449 | } while (next != prev); | ||
1450 | |||
1451 | synchronize_sched(); | ||
1452 | |||
1453 | /* | ||
1454 | * Collect entries that were still getting written before the | ||
1455 | * synchronize. | ||
1456 | */ | ||
1457 | on_each_cpu(collect_tscs, cpu_tsc, 1); | ||
1458 | |||
1459 | for (i = next; i < MCE_LOG_LEN; i++) { | ||
1460 | if (mcelog.entry[i].finished && | ||
1461 | mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { | ||
1462 | err |= copy_to_user(buf, mcelog.entry+i, | ||
1463 | sizeof(struct mce)); | ||
1464 | smp_rmb(); | ||
1465 | buf += sizeof(struct mce); | ||
1466 | memset(&mcelog.entry[i], 0, sizeof(struct mce)); | ||
1467 | } | ||
1468 | } | ||
1469 | mutex_unlock(&mce_read_mutex); | ||
1470 | kfree(cpu_tsc); | ||
1471 | |||
1472 | return err ? -EFAULT : buf - ubuf; | ||
1473 | } | ||
1474 | |||
1475 | static unsigned int mce_poll(struct file *file, poll_table *wait) | ||
1476 | { | ||
1477 | poll_wait(file, &mce_wait, wait); | ||
1478 | if (rcu_dereference(mcelog.next)) | ||
1479 | return POLLIN | POLLRDNORM; | ||
1480 | return 0; | ||
1481 | } | ||
1482 | |||
1483 | static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) | ||
1484 | { | ||
1485 | int __user *p = (int __user *)arg; | ||
1486 | |||
1487 | if (!capable(CAP_SYS_ADMIN)) | ||
1488 | return -EPERM; | ||
1489 | |||
1490 | switch (cmd) { | ||
1491 | case MCE_GET_RECORD_LEN: | ||
1492 | return put_user(sizeof(struct mce), p); | ||
1493 | case MCE_GET_LOG_LEN: | ||
1494 | return put_user(MCE_LOG_LEN, p); | ||
1495 | case MCE_GETCLEAR_FLAGS: { | ||
1496 | unsigned flags; | ||
1497 | |||
1498 | do { | ||
1499 | flags = mcelog.flags; | ||
1500 | } while (cmpxchg(&mcelog.flags, flags, 0) != flags); | ||
1501 | |||
1502 | return put_user(flags, p); | ||
1503 | } | ||
1504 | default: | ||
1505 | return -ENOTTY; | ||
1506 | } | ||
1507 | } | ||
1508 | |||
1509 | /* Modified in mce-inject.c, so not static or const */ | ||
1510 | struct file_operations mce_chrdev_ops = { | ||
1511 | .open = mce_open, | ||
1512 | .release = mce_release, | ||
1513 | .read = mce_read, | ||
1514 | .poll = mce_poll, | ||
1515 | .unlocked_ioctl = mce_ioctl, | ||
1516 | }; | ||
1517 | EXPORT_SYMBOL_GPL(mce_chrdev_ops); | ||
1518 | |||
1519 | static struct miscdevice mce_log_device = { | ||
1520 | MISC_MCELOG_MINOR, | ||
1521 | "mcelog", | ||
1522 | &mce_chrdev_ops, | ||
1523 | }; | ||
1524 | |||
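From userspace the device is consumed by reading /dev/mcelog; mce_read() above only accepts reads large enough for the whole buffer, so a reader first asks for the record and log lengths via the ioctls and then issues one full-sized read. A minimal sketch of such a consumer follows (the real one is the mcelog(8) utility; this assumes the MCE_GET_* ioctl definitions from <asm/mce.h> are visible to userspace):

	#include <sys/ioctl.h>
	#include <asm/mce.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/dev/mcelog", O_RDONLY);
		int recordlen, loglen;
		char *buf;
		ssize_t len;

		if (fd < 0)
			return 1;
		if (ioctl(fd, MCE_GET_RECORD_LEN, &recordlen) < 0 ||
		    ioctl(fd, MCE_GET_LOG_LEN, &loglen) < 0)
			return 1;

		/* mce_read() rejects anything smaller than the whole log */
		buf = malloc((size_t)recordlen * loglen);
		if (!buf)
			return 1;
		len = read(fd, buf, (size_t)recordlen * loglen);
		if (len > 0)
			printf("read %zd bytes (%zd records)\n", len, len / recordlen);
		free(buf);
		close(fd);
		return 0;
	}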
1525 | /* | ||
1526 | * mce=off Disables machine check | ||
1527 | * mce=no_cmci Disables CMCI | ||
1528 | * mce=dont_log_ce Clears corrected events silently, no log created for CEs. | ||
1529 | * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. | ||
1530 | * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) | ||
1531 | * monarchtimeout is how long to wait for other CPUs on machine | ||
1532 | * check, or 0 to not wait | ||
1533 | * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. | ||
1534 | * mce=nobootlog Don't log MCEs from before booting. | ||
1535 | */ | ||
1536 | static int __init mcheck_enable(char *str) | ||
1537 | { | ||
1538 | if (*str == 0) | ||
1539 | enable_p5_mce(); | ||
1540 | if (*str == '=') | ||
1541 | str++; | ||
1542 | if (!strcmp(str, "off")) | ||
1543 | mce_disabled = 1; | ||
1544 | else if (!strcmp(str, "no_cmci")) | ||
1545 | mce_cmci_disabled = 1; | ||
1546 | else if (!strcmp(str, "dont_log_ce")) | ||
1547 | mce_dont_log_ce = 1; | ||
1548 | else if (!strcmp(str, "ignore_ce")) | ||
1549 | mce_ignore_ce = 1; | ||
1550 | else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) | ||
1551 | mce_bootlog = (str[0] == 'b'); | ||
1552 | else if (isdigit(str[0])) { | ||
1553 | get_option(&str, &tolerant); | ||
1554 | if (*str == ',') { | ||
1555 | ++str; | ||
1556 | get_option(&str, &monarch_timeout); | ||
1557 | } | ||
1558 | } else { | ||
1559 | printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", | ||
1560 | str); | ||
1561 | return 0; | ||
1562 | } | ||
1563 | return 1; | ||
1564 | } | ||
1565 | __setup("mce", mcheck_enable); | ||
1566 | |||
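For example, booting with "mce=2,500" sets tolerant to 2 and makes each CPU wait up to 500 microseconds for the others during a machine check (monarch_timeout is given in microseconds, since mce_start() scales it by NSEC_PER_USEC), while "mce=dont_log_ce" clears corrected events without creating log entries for them.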
1567 | /* | ||
1568 | * Sysfs support | ||
1569 | */ | ||
1570 | |||
1571 | /* | ||
1572 | * Disable machine checks on suspend and shutdown. We can't really handle | ||
1573 | * them later. | ||
1574 | */ | ||
1575 | static int mce_disable(void) | ||
1576 | { | ||
1577 | int i; | ||
1578 | |||
1579 | for (i = 0; i < banks; i++) { | ||
1580 | if (!skip_bank_init(i)) | ||
1581 | wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); | ||
1582 | } | ||
1583 | return 0; | ||
1584 | } | ||
1585 | |||
1586 | static int mce_suspend(struct sys_device *dev, pm_message_t state) | ||
1587 | { | ||
1588 | return mce_disable(); | ||
1589 | } | ||
1590 | |||
1591 | static int mce_shutdown(struct sys_device *dev) | ||
1592 | { | ||
1593 | return mce_disable(); | ||
1594 | } | ||
1595 | |||
1596 | /* | ||
1597 | * On resume clear all MCE state. Don't want to see leftovers from the BIOS. | ||
1598 | * Only one CPU is active at this time, the others get re-added later using | ||
1599 | * CPU hotplug: | ||
1600 | */ | ||
1601 | static int mce_resume(struct sys_device *dev) | ||
1602 | { | ||
1603 | mce_init(); | ||
1604 | mce_cpu_features(¤t_cpu_data); | ||
1605 | |||
1606 | return 0; | ||
1607 | } | ||
1608 | |||
1609 | static void mce_cpu_restart(void *data) | ||
1610 | { | ||
1611 | del_timer_sync(&__get_cpu_var(mce_timer)); | ||
1612 | if (mce_available(¤t_cpu_data)) | ||
1613 | mce_init(); | ||
1614 | mce_init_timer(); | ||
1615 | } | ||
1616 | |||
1617 | /* Reinit MCEs after user configuration changes */ | ||
1618 | static void mce_restart(void) | ||
1619 | { | ||
1620 | on_each_cpu(mce_cpu_restart, NULL, 1); | ||
1621 | } | ||
1622 | |||
1623 | static struct sysdev_class mce_sysclass = { | ||
1624 | .suspend = mce_suspend, | ||
1625 | .shutdown = mce_shutdown, | ||
1626 | .resume = mce_resume, | ||
1627 | .name = "machinecheck", | ||
1628 | }; | ||
1629 | |||
1630 | DEFINE_PER_CPU(struct sys_device, mce_dev); | ||
1631 | |||
1632 | __cpuinitdata | ||
1633 | void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); | ||
1634 | |||
1635 | static struct sysdev_attribute *bank_attrs; | ||
1636 | |||
1637 | static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, | ||
1638 | char *buf) | ||
1639 | { | ||
1640 | u64 b = bank[attr - bank_attrs]; | ||
1641 | |||
1642 | return sprintf(buf, "%llx\n", b); | ||
1643 | } | ||
1644 | |||
1645 | static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, | ||
1646 | const char *buf, size_t size) | ||
1647 | { | ||
1648 | u64 new; | ||
1649 | |||
1650 | if (strict_strtoull(buf, 0, &new) < 0) | ||
1651 | return -EINVAL; | ||
1652 | |||
1653 | bank[attr - bank_attrs] = new; | ||
1654 | mce_restart(); | ||
1655 | |||
1656 | return size; | ||
1657 | } | ||
1658 | |||
1659 | static ssize_t | ||
1660 | show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) | ||
1661 | { | ||
1662 | strcpy(buf, trigger); | ||
1663 | strcat(buf, "\n"); | ||
1664 | return strlen(trigger) + 1; | ||
1665 | } | ||
1666 | |||
1667 | static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, | ||
1668 | const char *buf, size_t siz) | ||
1669 | { | ||
1670 | char *p; | ||
1671 | int len; | ||
1672 | |||
1673 | strncpy(trigger, buf, sizeof(trigger)); | ||
1674 | trigger[sizeof(trigger)-1] = 0; | ||
1675 | len = strlen(trigger); | ||
1676 | p = strchr(trigger, '\n'); | ||
1677 | |||
1678 | if (p) | ||
1679 | *p = 0; | ||
1680 | |||
1681 | return len; | ||
1682 | } | ||
1683 | |||
1684 | static ssize_t store_int_with_restart(struct sys_device *s, | ||
1685 | struct sysdev_attribute *attr, | ||
1686 | const char *buf, size_t size) | ||
1687 | { | ||
1688 | ssize_t ret = sysdev_store_int(s, attr, buf, size); | ||
1689 | mce_restart(); | ||
1690 | return ret; | ||
1691 | } | ||
1692 | |||
1693 | static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); | ||
1694 | static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); | ||
1695 | static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); | ||
1696 | |||
1697 | static struct sysdev_ext_attribute attr_check_interval = { | ||
1698 | _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, | ||
1699 | store_int_with_restart), | ||
1700 | &check_interval | ||
1701 | }; | ||
1702 | |||
1703 | static struct sysdev_attribute *mce_attrs[] = { | ||
1704 | &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, | ||
1705 | &attr_monarch_timeout.attr, | ||
1706 | NULL | ||
1707 | }; | ||
1708 | |||
1709 | static cpumask_var_t mce_dev_initialized; | ||
1710 | |||
1711 | /* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ | ||
1712 | static __cpuinit int mce_create_device(unsigned int cpu) | ||
1713 | { | ||
1714 | int err; | ||
1715 | int i; | ||
1716 | |||
1717 | if (!mce_available(&boot_cpu_data)) | ||
1718 | return -EIO; | ||
1719 | |||
1720 | memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); | ||
1721 | per_cpu(mce_dev, cpu).id = cpu; | ||
1722 | per_cpu(mce_dev, cpu).cls = &mce_sysclass; | ||
1723 | |||
1724 | err = sysdev_register(&per_cpu(mce_dev, cpu)); | ||
1725 | if (err) | ||
1726 | return err; | ||
1727 | |||
1728 | for (i = 0; mce_attrs[i]; i++) { | ||
1729 | err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); | ||
1730 | if (err) | ||
1731 | goto error; | ||
1732 | } | ||
1733 | for (i = 0; i < banks; i++) { | ||
1734 | err = sysdev_create_file(&per_cpu(mce_dev, cpu), | ||
1735 | &bank_attrs[i]); | ||
1736 | if (err) | ||
1737 | goto error2; | ||
1738 | } | ||
1739 | cpumask_set_cpu(cpu, mce_dev_initialized); | ||
1740 | |||
1741 | return 0; | ||
1742 | error2: | ||
1743 | while (--i >= 0) | ||
1744 | sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); | ||
1745 | error: | ||
1746 | while (--i >= 0) | ||
1747 | sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); | ||
1748 | |||
1749 | sysdev_unregister(&per_cpu(mce_dev, cpu)); | ||
1750 | |||
1751 | return err; | ||
1752 | } | ||
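The sysdev class registered as "machinecheck" plus the per-CPU sys_device created above surface the tolerant, check_interval, monarch_timeout, trigger and bankN attributes through sysfs. A small user-space sketch reading one of them; the path assumes the usual sysdev layout /sys/devices/system/<class>/<class>N/ and is not itself part of the patch:

#include <stdio.h>

int main(void)
{
	char buf[64];
	/* machinecheck0 = the device mce_create_device() registers for CPU 0 */
	FILE *f = fopen("/sys/devices/system/machinecheck/machinecheck0/tolerant", "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("cpu0 tolerant: %s", buf);
	fclose(f);
	return 0;
}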
1753 | |||
1754 | static __cpuinit void mce_remove_device(unsigned int cpu) | ||
1755 | { | ||
1756 | int i; | ||
1757 | |||
1758 | if (!cpumask_test_cpu(cpu, mce_dev_initialized)) | ||
1759 | return; | ||
1760 | |||
1761 | for (i = 0; mce_attrs[i]; i++) | ||
1762 | sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); | ||
1763 | |||
1764 | for (i = 0; i < banks; i++) | ||
1765 | sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); | ||
1766 | |||
1767 | sysdev_unregister(&per_cpu(mce_dev, cpu)); | ||
1768 | cpumask_clear_cpu(cpu, mce_dev_initialized); | ||
1769 | } | ||
1770 | |||
1771 | /* Make sure there are no machine checks on offlined CPUs. */ | ||
1772 | static void mce_disable_cpu(void *h) | ||
1773 | { | ||
1774 | unsigned long action = *(unsigned long *)h; | ||
1775 | int i; | ||
1776 | |||
1777 | if (!mce_available(&current_cpu_data)) | ||
1778 | return; | ||
1779 | if (!(action & CPU_TASKS_FROZEN)) | ||
1780 | cmci_clear(); | ||
1781 | for (i = 0; i < banks; i++) { | ||
1782 | if (!skip_bank_init(i)) | ||
1783 | wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); | ||
1784 | } | ||
1785 | } | ||
1786 | |||
1787 | static void mce_reenable_cpu(void *h) | ||
1788 | { | ||
1789 | unsigned long action = *(unsigned long *)h; | ||
1790 | int i; | ||
1791 | |||
1792 | if (!mce_available(&current_cpu_data)) | ||
1793 | return; | ||
1794 | |||
1795 | if (!(action & CPU_TASKS_FROZEN)) | ||
1796 | cmci_reenable(); | ||
1797 | for (i = 0; i < banks; i++) { | ||
1798 | if (!skip_bank_init(i)) | ||
1799 | wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); | ||
1800 | } | ||
1801 | } | ||
1802 | |||
1803 | /* Get notified when a cpu comes on/off. Be hotplug friendly. */ | ||
1804 | static int __cpuinit | ||
1805 | mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
1806 | { | ||
1807 | unsigned int cpu = (unsigned long)hcpu; | ||
1808 | struct timer_list *t = &per_cpu(mce_timer, cpu); | ||
1809 | |||
1810 | switch (action) { | ||
1811 | case CPU_ONLINE: | ||
1812 | case CPU_ONLINE_FROZEN: | ||
1813 | mce_create_device(cpu); | ||
1814 | if (threshold_cpu_callback) | ||
1815 | threshold_cpu_callback(action, cpu); | ||
1816 | break; | ||
1817 | case CPU_DEAD: | ||
1818 | case CPU_DEAD_FROZEN: | ||
1819 | if (threshold_cpu_callback) | ||
1820 | threshold_cpu_callback(action, cpu); | ||
1821 | mce_remove_device(cpu); | ||
1822 | break; | ||
1823 | case CPU_DOWN_PREPARE: | ||
1824 | case CPU_DOWN_PREPARE_FROZEN: | ||
1825 | del_timer_sync(t); | ||
1826 | smp_call_function_single(cpu, mce_disable_cpu, &action, 1); | ||
1827 | break; | ||
1828 | case CPU_DOWN_FAILED: | ||
1829 | case CPU_DOWN_FAILED_FROZEN: | ||
1830 | t->expires = round_jiffies(jiffies + | ||
1831 | __get_cpu_var(next_interval)); | ||
1832 | add_timer_on(t, cpu); | ||
1833 | smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); | ||
1834 | break; | ||
1835 | case CPU_POST_DEAD: | ||
1836 | /* intentionally ignoring frozen here */ | ||
1837 | cmci_rediscover(cpu); | ||
1838 | break; | ||
1839 | } | ||
1840 | return NOTIFY_OK; | ||
1841 | } | ||
1842 | |||
1843 | static struct notifier_block mce_cpu_notifier __cpuinitdata = { | ||
1844 | .notifier_call = mce_cpu_callback, | ||
1845 | }; | ||
1846 | |||
1847 | static __init int mce_init_banks(void) | ||
1848 | { | ||
1849 | int i; | ||
1850 | |||
1851 | bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, | ||
1852 | GFP_KERNEL); | ||
1853 | if (!bank_attrs) | ||
1854 | return -ENOMEM; | ||
1855 | |||
1856 | for (i = 0; i < banks; i++) { | ||
1857 | struct sysdev_attribute *a = &bank_attrs[i]; | ||
1858 | |||
1859 | a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); | ||
1860 | if (!a->attr.name) | ||
1861 | goto nomem; | ||
1862 | |||
1863 | a->attr.mode = 0644; | ||
1864 | a->show = show_bank; | ||
1865 | a->store = set_bank; | ||
1866 | } | ||
1867 | return 0; | ||
1868 | |||
1869 | nomem: | ||
1870 | while (--i >= 0) | ||
1871 | kfree(bank_attrs[i].attr.name); | ||
1872 | kfree(bank_attrs); | ||
1873 | bank_attrs = NULL; | ||
1874 | |||
1875 | return -ENOMEM; | ||
1876 | } | ||
1877 | |||
1878 | static __init int mce_init_device(void) | ||
1879 | { | ||
1880 | int err; | ||
1881 | int i = 0; | ||
1882 | |||
1883 | if (!mce_available(&boot_cpu_data)) | ||
1884 | return -EIO; | ||
1885 | |||
1886 | zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); | ||
1887 | |||
1888 | err = mce_init_banks(); | ||
1889 | if (err) | ||
1890 | return err; | ||
1891 | |||
1892 | err = sysdev_class_register(&mce_sysclass); | ||
1893 | if (err) | ||
1894 | return err; | ||
1895 | |||
1896 | for_each_online_cpu(i) { | ||
1897 | err = mce_create_device(i); | ||
1898 | if (err) | ||
1899 | return err; | ||
1900 | } | ||
1901 | |||
1902 | register_hotcpu_notifier(&mce_cpu_notifier); | ||
1903 | misc_register(&mce_log_device); | ||
1904 | |||
1905 | return err; | ||
1906 | } | ||
1907 | |||
1908 | device_initcall(mce_init_device); | ||
1909 | |||
1910 | #else /* CONFIG_X86_OLD_MCE: */ | ||
1911 | |||
1912 | int nr_mce_banks; | ||
1913 | EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ | ||
1914 | |||
1915 | /* This has to be run for each processor */ | ||
1916 | void mcheck_init(struct cpuinfo_x86 *c) | ||
1917 | { | ||
1918 | if (mce_disabled == 1) | ||
1919 | return; | ||
1920 | |||
1921 | switch (c->x86_vendor) { | ||
1922 | case X86_VENDOR_AMD: | ||
1923 | amd_mcheck_init(c); | ||
1924 | break; | ||
1925 | |||
1926 | case X86_VENDOR_INTEL: | ||
1927 | if (c->x86 == 5) | ||
1928 | intel_p5_mcheck_init(c); | ||
1929 | if (c->x86 == 6) | ||
1930 | intel_p6_mcheck_init(c); | ||
1931 | if (c->x86 == 15) | ||
1932 | intel_p4_mcheck_init(c); | ||
1933 | break; | ||
1934 | |||
1935 | case X86_VENDOR_CENTAUR: | ||
1936 | if (c->x86 == 5) | ||
1937 | winchip_mcheck_init(c); | ||
1938 | break; | ||
1939 | |||
1940 | default: | ||
1941 | break; | ||
1942 | } | ||
1943 | printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); | ||
1944 | } | ||
1945 | |||
1946 | static int __init mcheck_enable(char *str) | ||
1947 | { | ||
1948 | mce_disabled = -1; | ||
1949 | return 1; | ||
1950 | } | ||
1951 | |||
1952 | __setup("mce", mcheck_enable); | ||
1953 | |||
1954 | #endif /* CONFIG_X86_OLD_MCE */ | ||
1955 | |||
1956 | /* | ||
1957 | * Old style boot options parsing. Only for compatibility. | ||
1958 | */ | ||
1959 | static int __init mcheck_disable(char *str) | ||
1960 | { | ||
1961 | mce_disabled = 1; | ||
1962 | return 1; | ||
1963 | } | ||
1964 | __setup("nomce", mcheck_disable); | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index ae9f628838f1..84a552b458c8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h | |||
@@ -1,14 +1,38 @@ | |||
1 | #include <linux/init.h> | 1 | #include <linux/init.h> |
2 | #include <asm/mce.h> | 2 | #include <asm/mce.h> |
3 | 3 | ||
4 | #ifdef CONFIG_X86_OLD_MCE | ||
4 | void amd_mcheck_init(struct cpuinfo_x86 *c); | 5 | void amd_mcheck_init(struct cpuinfo_x86 *c); |
5 | void intel_p4_mcheck_init(struct cpuinfo_x86 *c); | 6 | void intel_p4_mcheck_init(struct cpuinfo_x86 *c); |
6 | void intel_p5_mcheck_init(struct cpuinfo_x86 *c); | ||
7 | void intel_p6_mcheck_init(struct cpuinfo_x86 *c); | 7 | void intel_p6_mcheck_init(struct cpuinfo_x86 *c); |
8 | #endif | ||
9 | |||
10 | #ifdef CONFIG_X86_ANCIENT_MCE | ||
11 | void intel_p5_mcheck_init(struct cpuinfo_x86 *c); | ||
8 | void winchip_mcheck_init(struct cpuinfo_x86 *c); | 12 | void winchip_mcheck_init(struct cpuinfo_x86 *c); |
13 | extern int mce_p5_enable; | ||
14 | static inline int mce_p5_enabled(void) { return mce_p5_enable; } | ||
15 | static inline void enable_p5_mce(void) { mce_p5_enable = 1; } | ||
16 | #else | ||
17 | static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} | ||
18 | static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} | ||
19 | static inline int mce_p5_enabled(void) { return 0; } | ||
20 | static inline void enable_p5_mce(void) { } | ||
21 | #endif | ||
9 | 22 | ||
10 | /* Call the installed machine check handler for this CPU setup. */ | 23 | /* Call the installed machine check handler for this CPU setup. */ |
11 | extern void (*machine_check_vector)(struct pt_regs *, long error_code); | 24 | extern void (*machine_check_vector)(struct pt_regs *, long error_code); |
12 | 25 | ||
26 | #ifdef CONFIG_X86_OLD_MCE | ||
27 | |||
13 | extern int nr_mce_banks; | 28 | extern int nr_mce_banks; |
14 | 29 | ||
30 | void intel_set_thermal_handler(void); | ||
31 | |||
32 | #else | ||
33 | |||
34 | static inline void intel_set_thermal_handler(void) { } | ||
35 | |||
36 | #endif | ||
37 | |||
38 | void intel_init_thermal(struct cpuinfo_x86 *c); | ||
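The header above follows the usual config-gated stub pattern: real prototypes under CONFIG_X86_OLD_MCE / CONFIG_X86_ANCIENT_MCE, empty static inlines otherwise, so callers never need #ifdefs of their own. A generic sketch of the idiom with made-up names (CONFIG_FOO_FEATURE, foo_feature_init):

#ifdef CONFIG_FOO_FEATURE
void foo_feature_init(void);			/* real implementation compiled in */
#else
static inline void foo_feature_init(void) { }	/* stub: optimized away when the option is off */
#endif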
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c deleted file mode 100644 index 3552119b091d..000000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ /dev/null | |||
@@ -1,76 +0,0 @@ | |||
1 | /* | ||
2 | * mce.c - x86 Machine Check Exception Reporting | ||
3 | * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com> | ||
4 | */ | ||
5 | |||
6 | #include <linux/init.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/smp.h> | ||
11 | #include <linux/thread_info.h> | ||
12 | |||
13 | #include <asm/processor.h> | ||
14 | #include <asm/system.h> | ||
15 | #include <asm/mce.h> | ||
16 | |||
17 | #include "mce.h" | ||
18 | |||
19 | int mce_disabled; | ||
20 | int nr_mce_banks; | ||
21 | |||
22 | EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ | ||
23 | |||
24 | /* Handle unconfigured int18 (should never happen) */ | ||
25 | static void unexpected_machine_check(struct pt_regs *regs, long error_code) | ||
26 | { | ||
27 | printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); | ||
28 | } | ||
29 | |||
30 | /* Call the installed machine check handler for this CPU setup. */ | ||
31 | void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; | ||
32 | |||
33 | /* This has to be run for each processor */ | ||
34 | void mcheck_init(struct cpuinfo_x86 *c) | ||
35 | { | ||
36 | if (mce_disabled == 1) | ||
37 | return; | ||
38 | |||
39 | switch (c->x86_vendor) { | ||
40 | case X86_VENDOR_AMD: | ||
41 | amd_mcheck_init(c); | ||
42 | break; | ||
43 | |||
44 | case X86_VENDOR_INTEL: | ||
45 | if (c->x86 == 5) | ||
46 | intel_p5_mcheck_init(c); | ||
47 | if (c->x86 == 6) | ||
48 | intel_p6_mcheck_init(c); | ||
49 | if (c->x86 == 15) | ||
50 | intel_p4_mcheck_init(c); | ||
51 | break; | ||
52 | |||
53 | case X86_VENDOR_CENTAUR: | ||
54 | if (c->x86 == 5) | ||
55 | winchip_mcheck_init(c); | ||
56 | break; | ||
57 | |||
58 | default: | ||
59 | break; | ||
60 | } | ||
61 | } | ||
62 | |||
63 | static int __init mcheck_disable(char *str) | ||
64 | { | ||
65 | mce_disabled = 1; | ||
66 | return 1; | ||
67 | } | ||
68 | |||
69 | static int __init mcheck_enable(char *str) | ||
70 | { | ||
71 | mce_disabled = -1; | ||
72 | return 1; | ||
73 | } | ||
74 | |||
75 | __setup("nomce", mcheck_disable); | ||
76 | __setup("mce", mcheck_enable); | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c deleted file mode 100644 index 09dd1d414fc3..000000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ /dev/null | |||
@@ -1,1187 +0,0 @@ | |||
1 | /* | ||
2 | * Machine check handler. | ||
3 | * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | * Rest from unknown author(s). | ||
5 | * 2004 Andi Kleen. Rewrote most of it. | ||
6 | * Copyright 2008 Intel Corporation | ||
7 | * Author: Andi Kleen | ||
8 | */ | ||
9 | |||
10 | #include <linux/init.h> | ||
11 | #include <linux/types.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/sched.h> | ||
14 | #include <linux/smp_lock.h> | ||
15 | #include <linux/string.h> | ||
16 | #include <linux/rcupdate.h> | ||
17 | #include <linux/kallsyms.h> | ||
18 | #include <linux/sysdev.h> | ||
19 | #include <linux/miscdevice.h> | ||
20 | #include <linux/fs.h> | ||
21 | #include <linux/capability.h> | ||
22 | #include <linux/cpu.h> | ||
23 | #include <linux/percpu.h> | ||
24 | #include <linux/poll.h> | ||
25 | #include <linux/thread_info.h> | ||
26 | #include <linux/ctype.h> | ||
27 | #include <linux/kmod.h> | ||
28 | #include <linux/kdebug.h> | ||
29 | #include <linux/kobject.h> | ||
30 | #include <linux/sysfs.h> | ||
31 | #include <linux/ratelimit.h> | ||
32 | #include <asm/processor.h> | ||
33 | #include <asm/msr.h> | ||
34 | #include <asm/mce.h> | ||
35 | #include <asm/uaccess.h> | ||
36 | #include <asm/smp.h> | ||
37 | #include <asm/idle.h> | ||
38 | |||
39 | #define MISC_MCELOG_MINOR 227 | ||
40 | |||
41 | atomic_t mce_entry; | ||
42 | |||
43 | static int mce_dont_init; | ||
44 | |||
45 | /* | ||
46 | * Tolerant levels: | ||
47 | * 0: always panic on uncorrected errors, log corrected errors | ||
48 | * 1: panic or SIGBUS on uncorrected errors, log corrected errors | ||
49 | * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors | ||
50 | * 3: never panic or SIGBUS, log all errors (for testing only) | ||
51 | */ | ||
52 | static int tolerant = 1; | ||
53 | static int banks; | ||
54 | static u64 *bank; | ||
55 | static unsigned long notify_user; | ||
56 | static int rip_msr; | ||
57 | static int mce_bootlog = -1; | ||
58 | static atomic_t mce_events; | ||
59 | |||
60 | static char trigger[128]; | ||
61 | static char *trigger_argv[2] = { trigger, NULL }; | ||
62 | |||
63 | static DECLARE_WAIT_QUEUE_HEAD(mce_wait); | ||
64 | |||
65 | /* MCA banks polled by the period polling timer for corrected events */ | ||
66 | DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { | ||
67 | [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL | ||
68 | }; | ||
69 | |||
70 | /* Do initial initialization of a struct mce */ | ||
71 | void mce_setup(struct mce *m) | ||
72 | { | ||
73 | memset(m, 0, sizeof(struct mce)); | ||
74 | m->cpu = smp_processor_id(); | ||
75 | rdtscll(m->tsc); | ||
76 | } | ||
77 | |||
78 | /* | ||
79 | * Lockless MCE logging infrastructure. | ||
80 | * This avoids deadlocks on printk locks without having to break locks. Also | ||
81 | * separate MCEs from kernel messages to avoid bogus bug reports. | ||
82 | */ | ||
83 | |||
84 | static struct mce_log mcelog = { | ||
85 | MCE_LOG_SIGNATURE, | ||
86 | MCE_LOG_LEN, | ||
87 | }; | ||
88 | |||
89 | void mce_log(struct mce *mce) | ||
90 | { | ||
91 | unsigned next, entry; | ||
92 | atomic_inc(&mce_events); | ||
93 | mce->finished = 0; | ||
94 | wmb(); | ||
95 | for (;;) { | ||
96 | entry = rcu_dereference(mcelog.next); | ||
97 | for (;;) { | ||
98 | /* When the buffer fills up discard new entries. Assume | ||
99 | that the earlier errors are the more interesting. */ | ||
100 | if (entry >= MCE_LOG_LEN) { | ||
101 | set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); | ||
102 | return; | ||
103 | } | ||
104 | /* Old left over entry. Skip. */ | ||
105 | if (mcelog.entry[entry].finished) { | ||
106 | entry++; | ||
107 | continue; | ||
108 | } | ||
109 | break; | ||
110 | } | ||
111 | smp_rmb(); | ||
112 | next = entry + 1; | ||
113 | if (cmpxchg(&mcelog.next, entry, next) == entry) | ||
114 | break; | ||
115 | } | ||
116 | memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); | ||
117 | wmb(); | ||
118 | mcelog.entry[entry].finished = 1; | ||
119 | wmb(); | ||
120 | |||
121 | set_bit(0, ¬ify_user); | ||
122 | } | ||
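mce_log() above reserves a slot by advancing mcelog.next with cmpxchg and only afterwards sets the entry's finished flag, so readers never observe a half-written record. A condensed user-space sketch of that reserve-then-publish idea in C11 atomics; the names (struct rec, ring, ring_next, RING_LEN) are illustrative, and unlike the kernel code it simply drops records on overflow instead of skipping stale slots:

#include <stdatomic.h>

#define RING_LEN 32

struct rec {
	atomic_int finished;	/* 0 = slot not yet valid, 1 = safe to read */
	int payload;
};

static struct rec ring[RING_LEN];
static atomic_uint ring_next;

/* Writer: claim a slot, fill it, then publish it. */
static int ring_log(int payload)
{
	unsigned int entry = atomic_fetch_add(&ring_next, 1);

	if (entry >= RING_LEN)
		return -1;	/* full: keep the earlier, more interesting records */

	ring[entry].payload = payload;
	atomic_store_explicit(&ring[entry].finished, 1, memory_order_release);
	return 0;
}

/* Reader: only consume entries whose finished flag has been published. */
static int ring_read(unsigned int entry, int *payload)
{
	if (entry >= RING_LEN ||
	    !atomic_load_explicit(&ring[entry].finished, memory_order_acquire))
		return -1;
	*payload = ring[entry].payload;
	return 0;
}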
123 | |||
124 | static void print_mce(struct mce *m) | ||
125 | { | ||
126 | printk(KERN_EMERG "\n" | ||
127 | KERN_EMERG "HARDWARE ERROR\n" | ||
128 | KERN_EMERG | ||
129 | "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", | ||
130 | m->cpu, m->mcgstatus, m->bank, m->status); | ||
131 | if (m->ip) { | ||
132 | printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", | ||
133 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", | ||
134 | m->cs, m->ip); | ||
135 | if (m->cs == __KERNEL_CS) | ||
136 | print_symbol("{%s}", m->ip); | ||
137 | printk("\n"); | ||
138 | } | ||
139 | printk(KERN_EMERG "TSC %llx ", m->tsc); | ||
140 | if (m->addr) | ||
141 | printk("ADDR %llx ", m->addr); | ||
142 | if (m->misc) | ||
143 | printk("MISC %llx ", m->misc); | ||
144 | printk("\n"); | ||
145 | printk(KERN_EMERG "This is not a software problem!\n"); | ||
146 | printk(KERN_EMERG "Run through mcelog --ascii to decode " | ||
147 | "and contact your hardware vendor\n"); | ||
148 | } | ||
149 | |||
150 | static void mce_panic(char *msg, struct mce *backup, unsigned long start) | ||
151 | { | ||
152 | int i; | ||
153 | |||
154 | oops_begin(); | ||
155 | for (i = 0; i < MCE_LOG_LEN; i++) { | ||
156 | unsigned long tsc = mcelog.entry[i].tsc; | ||
157 | |||
158 | if (time_before(tsc, start)) | ||
159 | continue; | ||
160 | print_mce(&mcelog.entry[i]); | ||
161 | if (backup && mcelog.entry[i].tsc == backup->tsc) | ||
162 | backup = NULL; | ||
163 | } | ||
164 | if (backup) | ||
165 | print_mce(backup); | ||
166 | panic(msg); | ||
167 | } | ||
168 | |||
169 | int mce_available(struct cpuinfo_x86 *c) | ||
170 | { | ||
171 | if (mce_dont_init) | ||
172 | return 0; | ||
173 | return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); | ||
174 | } | ||
175 | |||
176 | static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) | ||
177 | { | ||
178 | if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { | ||
179 | m->ip = regs->ip; | ||
180 | m->cs = regs->cs; | ||
181 | } else { | ||
182 | m->ip = 0; | ||
183 | m->cs = 0; | ||
184 | } | ||
185 | if (rip_msr) { | ||
186 | /* Assume the RIP in the MSR is exact. Is this true? */ | ||
187 | m->mcgstatus |= MCG_STATUS_EIPV; | ||
188 | rdmsrl(rip_msr, m->ip); | ||
189 | m->cs = 0; | ||
190 | } | ||
191 | } | ||
192 | |||
193 | /* | ||
194 | * Poll for corrected events or events that happened before reset. | ||
195 | * Those are just logged through /dev/mcelog. | ||
196 | * | ||
197 | * This is executed in standard interrupt context. | ||
198 | */ | ||
199 | void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | ||
200 | { | ||
201 | struct mce m; | ||
202 | int i; | ||
203 | |||
204 | mce_setup(&m); | ||
205 | |||
206 | rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); | ||
207 | for (i = 0; i < banks; i++) { | ||
208 | if (!bank[i] || !test_bit(i, *b)) | ||
209 | continue; | ||
210 | |||
211 | m.misc = 0; | ||
212 | m.addr = 0; | ||
213 | m.bank = i; | ||
214 | m.tsc = 0; | ||
215 | |||
216 | barrier(); | ||
217 | rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); | ||
218 | if (!(m.status & MCI_STATUS_VAL)) | ||
219 | continue; | ||
220 | |||
221 | /* | ||
222 | * Uncorrected events are handled by the exception handler | ||
223 | * when it is enabled. But when the exception is disabled log | ||
224 | * everything. | ||
225 | * | ||
226 | * TBD do the same check for MCI_STATUS_EN here? | ||
227 | */ | ||
228 | if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) | ||
229 | continue; | ||
230 | |||
231 | if (m.status & MCI_STATUS_MISCV) | ||
232 | rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); | ||
233 | if (m.status & MCI_STATUS_ADDRV) | ||
234 | rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); | ||
235 | |||
236 | if (!(flags & MCP_TIMESTAMP)) | ||
237 | m.tsc = 0; | ||
238 | /* | ||
239 | * Don't get the IP here because it's unlikely to | ||
240 | * have anything to do with the actual error location. | ||
241 | */ | ||
242 | if (!(flags & MCP_DONTLOG)) { | ||
243 | mce_log(&m); | ||
244 | add_taint(TAINT_MACHINE_CHECK); | ||
245 | } | ||
246 | |||
247 | /* | ||
248 | * Clear state for this bank. | ||
249 | */ | ||
250 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
251 | } | ||
252 | |||
253 | /* | ||
254 | * Don't clear MCG_STATUS here because it's only defined for | ||
255 | * exceptions. | ||
256 | */ | ||
257 | } | ||
258 | |||
259 | /* | ||
260 | * The actual machine check handler. This only handles real | ||
261 | * exceptions when something got corrupted coming in through int 18. | ||
262 | * | ||
263 | * This is executed in NMI context not subject to normal locking rules. This | ||
264 | * implies that most kernel services cannot be safely used. Don't even | ||
265 | * think about putting a printk in there! | ||
266 | */ | ||
267 | void do_machine_check(struct pt_regs * regs, long error_code) | ||
268 | { | ||
269 | struct mce m, panicm; | ||
270 | u64 mcestart = 0; | ||
271 | int i; | ||
272 | int panicm_found = 0; | ||
273 | /* | ||
274 | * If no_way_out gets set, there is no safe way to recover from this | ||
275 | * MCE. If tolerant is cranked up, we'll try anyway. | ||
276 | */ | ||
277 | int no_way_out = 0; | ||
278 | /* | ||
279 | * If kill_it gets set, there might be a way to recover from this | ||
280 | * error. | ||
281 | */ | ||
282 | int kill_it = 0; | ||
283 | DECLARE_BITMAP(toclear, MAX_NR_BANKS); | ||
284 | |||
285 | atomic_inc(&mce_entry); | ||
286 | |||
287 | if (notify_die(DIE_NMI, "machine check", regs, error_code, | ||
288 | 18, SIGKILL) == NOTIFY_STOP) | ||
289 | goto out2; | ||
290 | if (!banks) | ||
291 | goto out2; | ||
292 | |||
293 | mce_setup(&m); | ||
294 | |||
295 | rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); | ||
296 | /* if the restart IP is not valid, we're done for */ | ||
297 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) | ||
298 | no_way_out = 1; | ||
299 | |||
300 | rdtscll(mcestart); | ||
301 | barrier(); | ||
302 | |||
303 | for (i = 0; i < banks; i++) { | ||
304 | __clear_bit(i, toclear); | ||
305 | if (!bank[i]) | ||
306 | continue; | ||
307 | |||
308 | m.misc = 0; | ||
309 | m.addr = 0; | ||
310 | m.bank = i; | ||
311 | |||
312 | rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); | ||
313 | if ((m.status & MCI_STATUS_VAL) == 0) | ||
314 | continue; | ||
315 | |||
316 | /* | ||
317 | * Non uncorrected errors are handled by machine_check_poll | ||
318 | * Leave them alone. | ||
319 | */ | ||
320 | if ((m.status & MCI_STATUS_UC) == 0) | ||
321 | continue; | ||
322 | |||
323 | /* | ||
324 | * Set taint even when machine check was not enabled. | ||
325 | */ | ||
326 | add_taint(TAINT_MACHINE_CHECK); | ||
327 | |||
328 | __set_bit(i, toclear); | ||
329 | |||
330 | if (m.status & MCI_STATUS_EN) { | ||
331 | /* if PCC was set, there's no way out */ | ||
332 | no_way_out |= !!(m.status & MCI_STATUS_PCC); | ||
333 | /* | ||
334 | * If this error was uncorrectable and there was | ||
335 | * an overflow, we're in trouble. If no overflow, | ||
336 | * we might get away with just killing a task. | ||
337 | */ | ||
338 | if (m.status & MCI_STATUS_UC) { | ||
339 | if (tolerant < 1 || m.status & MCI_STATUS_OVER) | ||
340 | no_way_out = 1; | ||
341 | kill_it = 1; | ||
342 | } | ||
343 | } else { | ||
344 | /* | ||
345 | * Machine check event was not enabled. Clear, but | ||
346 | * ignore. | ||
347 | */ | ||
348 | continue; | ||
349 | } | ||
350 | |||
351 | if (m.status & MCI_STATUS_MISCV) | ||
352 | rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); | ||
353 | if (m.status & MCI_STATUS_ADDRV) | ||
354 | rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); | ||
355 | |||
356 | mce_get_rip(&m, regs); | ||
357 | mce_log(&m); | ||
358 | |||
359 | /* Did this bank cause the exception? */ | ||
360 | /* Assume that the bank with uncorrectable errors did it, | ||
361 | and that there is only a single one. */ | ||
362 | if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { | ||
363 | panicm = m; | ||
364 | panicm_found = 1; | ||
365 | } | ||
366 | } | ||
367 | |||
368 | /* If we didn't find an uncorrectable error, pick | ||
369 | the last one (shouldn't happen, just being safe). */ | ||
370 | if (!panicm_found) | ||
371 | panicm = m; | ||
372 | |||
373 | /* | ||
374 | * If we have decided that we just CAN'T continue, and the user | ||
375 | * has not set tolerant to an insane level, give up and die. | ||
376 | */ | ||
377 | if (no_way_out && tolerant < 3) | ||
378 | mce_panic("Machine check", &panicm, mcestart); | ||
379 | |||
380 | /* | ||
381 | * If the error seems to be unrecoverable, something should be | ||
382 | * done. Try to kill as little as possible. If we can kill just | ||
383 | * one task, do that. If the user has set the tolerance very | ||
384 | * high, don't try to do anything at all. | ||
385 | */ | ||
386 | if (kill_it && tolerant < 3) { | ||
387 | int user_space = 0; | ||
388 | |||
389 | /* | ||
390 | * If the EIPV bit is set, it means the saved IP is the | ||
391 | * instruction which caused the MCE. | ||
392 | */ | ||
393 | if (m.mcgstatus & MCG_STATUS_EIPV) | ||
394 | user_space = panicm.ip && (panicm.cs & 3); | ||
395 | |||
396 | /* | ||
397 | * If we know that the error was in user space, send a | ||
398 | * SIGBUS. Otherwise, panic if tolerance is low. | ||
399 | * | ||
400 | * force_sig() takes an awful lot of locks and has a slight | ||
401 | * risk of deadlocking. | ||
402 | */ | ||
403 | if (user_space) { | ||
404 | force_sig(SIGBUS, current); | ||
405 | } else if (panic_on_oops || tolerant < 2) { | ||
406 | mce_panic("Uncorrected machine check", | ||
407 | &panicm, mcestart); | ||
408 | } | ||
409 | } | ||
410 | |||
411 | /* notify userspace ASAP */ | ||
412 | set_thread_flag(TIF_MCE_NOTIFY); | ||
413 | |||
414 | /* the last thing we do is clear state */ | ||
415 | for (i = 0; i < banks; i++) { | ||
416 | if (test_bit(i, toclear)) | ||
417 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
418 | } | ||
419 | wrmsrl(MSR_IA32_MCG_STATUS, 0); | ||
420 | out2: | ||
421 | atomic_dec(&mce_entry); | ||
422 | } | ||
423 | |||
424 | #ifdef CONFIG_X86_MCE_INTEL | ||
425 | /*** | ||
426 | * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog | ||
427 | * @cpu: The CPU on which the event occurred. | ||
428 | * @status: Event status information | ||
429 | * | ||
430 | * This function should be called by the thermal interrupt after the | ||
431 | * event has been processed and the decision was made to log the event | ||
432 | * further. | ||
433 | * | ||
434 | * The status parameter will be saved to the 'status' field of 'struct mce' | ||
435 | * and historically has been the register value of the | ||
436 | * MSR_IA32_THERMAL_STATUS (Intel) msr. | ||
437 | */ | ||
438 | void mce_log_therm_throt_event(__u64 status) | ||
439 | { | ||
440 | struct mce m; | ||
441 | |||
442 | mce_setup(&m); | ||
443 | m.bank = MCE_THERMAL_BANK; | ||
444 | m.status = status; | ||
445 | mce_log(&m); | ||
446 | } | ||
447 | #endif /* CONFIG_X86_MCE_INTEL */ | ||
448 | |||
449 | /* | ||
450 | * Periodic polling timer for "silent" machine check errors. If the | ||
451 | * poller finds an MCE, poll 2x faster. When the poller finds no more | ||
452 | * errors, poll 2x slower (up to check_interval seconds). | ||
453 | */ | ||
454 | |||
455 | static int check_interval = 5 * 60; /* 5 minutes */ | ||
456 | static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ | ||
457 | static void mcheck_timer(unsigned long); | ||
458 | static DEFINE_PER_CPU(struct timer_list, mce_timer); | ||
459 | |||
460 | static void mcheck_timer(unsigned long data) | ||
461 | { | ||
462 | struct timer_list *t = &per_cpu(mce_timer, data); | ||
463 | int *n; | ||
464 | |||
465 | WARN_ON(smp_processor_id() != data); | ||
466 | |||
467 | if (mce_available(&current_cpu_data)) | ||
468 | machine_check_poll(MCP_TIMESTAMP, | ||
469 | &__get_cpu_var(mce_poll_banks)); | ||
470 | |||
471 | /* | ||
472 | * Alert userspace if needed. If we logged an MCE, reduce the | ||
473 | * polling interval, otherwise increase the polling interval. | ||
474 | */ | ||
475 | n = &__get_cpu_var(next_interval); | ||
476 | if (mce_notify_user()) { | ||
477 | *n = max(*n/2, HZ/100); | ||
478 | } else { | ||
479 | *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); | ||
480 | } | ||
481 | |||
482 | t->expires = jiffies + *n; | ||
483 | add_timer(t); | ||
484 | } | ||
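mcheck_timer() above adapts its period: halve the interval after a pass that logged an event, double it after a quiet pass, clamped between HZ/100 and check_interval*HZ. A stand-alone sketch of just that rule (ignoring the jiffies rounding; the HZ and CHECK_INTERVAL constants are illustrative):

/* Adaptive polling interval, as in mcheck_timer(): fast after events,
 * slow when quiet, clamped to [HZ/100, CHECK_INTERVAL * HZ].
 */
#define HZ		1000
#define CHECK_INTERVAL	(5 * 60)	/* seconds */

static int next_poll_interval(int cur, int saw_event)
{
	int min_interval = HZ / 100;
	int max_interval = CHECK_INTERVAL * HZ;

	if (saw_event)
		return cur / 2 > min_interval ? cur / 2 : min_interval;
	return cur * 2 < max_interval ? cur * 2 : max_interval;
}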
485 | |||
486 | static void mce_do_trigger(struct work_struct *work) | ||
487 | { | ||
488 | call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); | ||
489 | } | ||
490 | |||
491 | static DECLARE_WORK(mce_trigger_work, mce_do_trigger); | ||
492 | |||
493 | /* | ||
494 | * Notify the user(s) about new machine check events. | ||
495 | * Can be called from interrupt context, but not from machine check/NMI | ||
496 | * context. | ||
497 | */ | ||
498 | int mce_notify_user(void) | ||
499 | { | ||
500 | /* Not more than two messages every minute */ | ||
501 | static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); | ||
502 | |||
503 | clear_thread_flag(TIF_MCE_NOTIFY); | ||
504 | if (test_and_clear_bit(0, ¬ify_user)) { | ||
505 | wake_up_interruptible(&mce_wait); | ||
506 | |||
507 | /* | ||
508 | * There is no risk of missing notifications because | ||
509 | * work_pending is always cleared before the function is | ||
510 | * executed. | ||
511 | */ | ||
512 | if (trigger[0] && !work_pending(&mce_trigger_work)) | ||
513 | schedule_work(&mce_trigger_work); | ||
514 | |||
515 | if (__ratelimit(&ratelimit)) | ||
516 | printk(KERN_INFO "Machine check events logged\n"); | ||
517 | |||
518 | return 1; | ||
519 | } | ||
520 | return 0; | ||
521 | } | ||
522 | |||
523 | /* see if the idle task needs to notify userspace */ | ||
524 | static int | ||
525 | mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) | ||
526 | { | ||
527 | /* IDLE_END should be safe - interrupts are back on */ | ||
528 | if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) | ||
529 | mce_notify_user(); | ||
530 | |||
531 | return NOTIFY_OK; | ||
532 | } | ||
533 | |||
534 | static struct notifier_block mce_idle_notifier = { | ||
535 | .notifier_call = mce_idle_callback, | ||
536 | }; | ||
537 | |||
538 | static __init int periodic_mcheck_init(void) | ||
539 | { | ||
540 | idle_notifier_register(&mce_idle_notifier); | ||
541 | return 0; | ||
542 | } | ||
543 | __initcall(periodic_mcheck_init); | ||
544 | |||
545 | /* | ||
546 | * Initialize Machine Checks for a CPU. | ||
547 | */ | ||
548 | static int mce_cap_init(void) | ||
549 | { | ||
550 | u64 cap; | ||
551 | unsigned b; | ||
552 | |||
553 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
554 | b = cap & 0xff; | ||
555 | if (b > MAX_NR_BANKS) { | ||
556 | printk(KERN_WARNING | ||
557 | "MCE: Using only %u machine check banks out of %u\n", | ||
558 | MAX_NR_BANKS, b); | ||
559 | b = MAX_NR_BANKS; | ||
560 | } | ||
561 | |||
562 | /* Don't support asymmetric configurations today */ | ||
563 | WARN_ON(banks != 0 && b != banks); | ||
564 | banks = b; | ||
565 | if (!bank) { | ||
566 | bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); | ||
567 | if (!bank) | ||
568 | return -ENOMEM; | ||
569 | memset(bank, 0xff, banks * sizeof(u64)); | ||
570 | } | ||
571 | |||
572 | /* Use accurate RIP reporting if available. */ | ||
573 | if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) | ||
574 | rip_msr = MSR_IA32_MCG_EIP; | ||
575 | |||
576 | return 0; | ||
577 | } | ||
578 | |||
579 | static void mce_init(void *dummy) | ||
580 | { | ||
581 | u64 cap; | ||
582 | int i; | ||
583 | mce_banks_t all_banks; | ||
584 | |||
585 | /* | ||
586 | * Log the machine checks left over from the previous reset. | ||
587 | */ | ||
588 | bitmap_fill(all_banks, MAX_NR_BANKS); | ||
589 | machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); | ||
590 | |||
591 | set_in_cr4(X86_CR4_MCE); | ||
592 | |||
593 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
594 | if (cap & MCG_CTL_P) | ||
595 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | ||
596 | |||
597 | for (i = 0; i < banks; i++) { | ||
598 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); | ||
599 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
600 | } | ||
601 | } | ||
602 | |||
603 | /* Add per CPU specific workarounds here */ | ||
604 | static void mce_cpu_quirks(struct cpuinfo_x86 *c) | ||
605 | { | ||
606 | /* This should be disabled by the BIOS, but isn't always */ | ||
607 | if (c->x86_vendor == X86_VENDOR_AMD) { | ||
608 | if (c->x86 == 15 && banks > 4) | ||
609 | /* disable GART TBL walk error reporting, which trips off | ||
610 | incorrectly with the IOMMU & 3ware & Cerberus. */ | ||
611 | clear_bit(10, (unsigned long *)&bank[4]); | ||
612 | if(c->x86 <= 17 && mce_bootlog < 0) | ||
613 | /* Lots of broken BIOS around that don't clear them | ||
614 | by default and leave crap in there. Don't log. */ | ||
615 | mce_bootlog = 0; | ||
616 | } | ||
617 | |||
618 | } | ||
619 | |||
620 | static void mce_cpu_features(struct cpuinfo_x86 *c) | ||
621 | { | ||
622 | switch (c->x86_vendor) { | ||
623 | case X86_VENDOR_INTEL: | ||
624 | mce_intel_feature_init(c); | ||
625 | break; | ||
626 | case X86_VENDOR_AMD: | ||
627 | mce_amd_feature_init(c); | ||
628 | break; | ||
629 | default: | ||
630 | break; | ||
631 | } | ||
632 | } | ||
633 | |||
634 | static void mce_init_timer(void) | ||
635 | { | ||
636 | struct timer_list *t = &__get_cpu_var(mce_timer); | ||
637 | int *n = &__get_cpu_var(next_interval); | ||
638 | |||
639 | *n = check_interval * HZ; | ||
640 | if (!*n) | ||
641 | return; | ||
642 | setup_timer(t, mcheck_timer, smp_processor_id()); | ||
643 | t->expires = round_jiffies(jiffies + *n); | ||
644 | add_timer(t); | ||
645 | } | ||
646 | |||
647 | /* | ||
648 | * Called for each booted CPU to set up machine checks. | ||
649 | * Must be called with preempt off. | ||
650 | */ | ||
651 | void __cpuinit mcheck_init(struct cpuinfo_x86 *c) | ||
652 | { | ||
653 | if (!mce_available(c)) | ||
654 | return; | ||
655 | |||
656 | if (mce_cap_init() < 0) { | ||
657 | mce_dont_init = 1; | ||
658 | return; | ||
659 | } | ||
660 | mce_cpu_quirks(c); | ||
661 | |||
662 | mce_init(NULL); | ||
663 | mce_cpu_features(c); | ||
664 | mce_init_timer(); | ||
665 | } | ||
666 | |||
667 | /* | ||
668 | * Character device to read and clear the MCE log. | ||
669 | */ | ||
670 | |||
671 | static DEFINE_SPINLOCK(mce_state_lock); | ||
672 | static int open_count; /* #times opened */ | ||
673 | static int open_exclu; /* already open exclusive? */ | ||
674 | |||
675 | static int mce_open(struct inode *inode, struct file *file) | ||
676 | { | ||
677 | lock_kernel(); | ||
678 | spin_lock(&mce_state_lock); | ||
679 | |||
680 | if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { | ||
681 | spin_unlock(&mce_state_lock); | ||
682 | unlock_kernel(); | ||
683 | return -EBUSY; | ||
684 | } | ||
685 | |||
686 | if (file->f_flags & O_EXCL) | ||
687 | open_exclu = 1; | ||
688 | open_count++; | ||
689 | |||
690 | spin_unlock(&mce_state_lock); | ||
691 | unlock_kernel(); | ||
692 | |||
693 | return nonseekable_open(inode, file); | ||
694 | } | ||
695 | |||
696 | static int mce_release(struct inode *inode, struct file *file) | ||
697 | { | ||
698 | spin_lock(&mce_state_lock); | ||
699 | |||
700 | open_count--; | ||
701 | open_exclu = 0; | ||
702 | |||
703 | spin_unlock(&mce_state_lock); | ||
704 | |||
705 | return 0; | ||
706 | } | ||
707 | |||
708 | static void collect_tscs(void *data) | ||
709 | { | ||
710 | unsigned long *cpu_tsc = (unsigned long *)data; | ||
711 | |||
712 | rdtscll(cpu_tsc[smp_processor_id()]); | ||
713 | } | ||
714 | |||
715 | static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, | ||
716 | loff_t *off) | ||
717 | { | ||
718 | unsigned long *cpu_tsc; | ||
719 | static DEFINE_MUTEX(mce_read_mutex); | ||
720 | unsigned prev, next; | ||
721 | char __user *buf = ubuf; | ||
722 | int i, err; | ||
723 | |||
724 | cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); | ||
725 | if (!cpu_tsc) | ||
726 | return -ENOMEM; | ||
727 | |||
728 | mutex_lock(&mce_read_mutex); | ||
729 | next = rcu_dereference(mcelog.next); | ||
730 | |||
731 | /* Only supports full reads right now */ | ||
732 | if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { | ||
733 | mutex_unlock(&mce_read_mutex); | ||
734 | kfree(cpu_tsc); | ||
735 | return -EINVAL; | ||
736 | } | ||
737 | |||
738 | err = 0; | ||
739 | prev = 0; | ||
740 | do { | ||
741 | for (i = prev; i < next; i++) { | ||
742 | unsigned long start = jiffies; | ||
743 | |||
744 | while (!mcelog.entry[i].finished) { | ||
745 | if (time_after_eq(jiffies, start + 2)) { | ||
746 | memset(mcelog.entry + i, 0, | ||
747 | sizeof(struct mce)); | ||
748 | goto timeout; | ||
749 | } | ||
750 | cpu_relax(); | ||
751 | } | ||
752 | smp_rmb(); | ||
753 | err |= copy_to_user(buf, mcelog.entry + i, | ||
754 | sizeof(struct mce)); | ||
755 | buf += sizeof(struct mce); | ||
756 | timeout: | ||
757 | ; | ||
758 | } | ||
759 | |||
760 | memset(mcelog.entry + prev, 0, | ||
761 | (next - prev) * sizeof(struct mce)); | ||
762 | prev = next; | ||
763 | next = cmpxchg(&mcelog.next, prev, 0); | ||
764 | } while (next != prev); | ||
765 | |||
766 | synchronize_sched(); | ||
767 | |||
768 | /* | ||
769 | * Collect entries that were still getting written before the | ||
770 | * synchronize. | ||
771 | */ | ||
772 | on_each_cpu(collect_tscs, cpu_tsc, 1); | ||
773 | for (i = next; i < MCE_LOG_LEN; i++) { | ||
774 | if (mcelog.entry[i].finished && | ||
775 | mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { | ||
776 | err |= copy_to_user(buf, mcelog.entry+i, | ||
777 | sizeof(struct mce)); | ||
778 | smp_rmb(); | ||
779 | buf += sizeof(struct mce); | ||
780 | memset(&mcelog.entry[i], 0, sizeof(struct mce)); | ||
781 | } | ||
782 | } | ||
783 | mutex_unlock(&mce_read_mutex); | ||
784 | kfree(cpu_tsc); | ||
785 | return err ? -EFAULT : buf - ubuf; | ||
786 | } | ||
787 | |||
788 | static unsigned int mce_poll(struct file *file, poll_table *wait) | ||
789 | { | ||
790 | poll_wait(file, &mce_wait, wait); | ||
791 | if (rcu_dereference(mcelog.next)) | ||
792 | return POLLIN | POLLRDNORM; | ||
793 | return 0; | ||
794 | } | ||
795 | |||
796 | static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) | ||
797 | { | ||
798 | int __user *p = (int __user *)arg; | ||
799 | |||
800 | if (!capable(CAP_SYS_ADMIN)) | ||
801 | return -EPERM; | ||
802 | switch (cmd) { | ||
803 | case MCE_GET_RECORD_LEN: | ||
804 | return put_user(sizeof(struct mce), p); | ||
805 | case MCE_GET_LOG_LEN: | ||
806 | return put_user(MCE_LOG_LEN, p); | ||
807 | case MCE_GETCLEAR_FLAGS: { | ||
808 | unsigned flags; | ||
809 | |||
810 | do { | ||
811 | flags = mcelog.flags; | ||
812 | } while (cmpxchg(&mcelog.flags, flags, 0) != flags); | ||
813 | return put_user(flags, p); | ||
814 | } | ||
815 | default: | ||
816 | return -ENOTTY; | ||
817 | } | ||
818 | } | ||
819 | |||
820 | static const struct file_operations mce_chrdev_ops = { | ||
821 | .open = mce_open, | ||
822 | .release = mce_release, | ||
823 | .read = mce_read, | ||
824 | .poll = mce_poll, | ||
825 | .unlocked_ioctl = mce_ioctl, | ||
826 | }; | ||
827 | |||
828 | static struct miscdevice mce_log_device = { | ||
829 | MISC_MCELOG_MINOR, | ||
830 | "mcelog", | ||
831 | &mce_chrdev_ops, | ||
832 | }; | ||
833 | |||
834 | /* | ||
835 | * Old style boot options parsing. Only for compatibility. | ||
836 | */ | ||
837 | static int __init mcheck_disable(char *str) | ||
838 | { | ||
839 | mce_dont_init = 1; | ||
840 | return 1; | ||
841 | } | ||
842 | |||
843 | /* mce=off disables machine check. | ||
844 | mce=TOLERANCELEVEL (number, see above) | ||
845 | mce=bootlog Log MCEs from before booting. Disabled by default on AMD. | ||
846 | mce=nobootlog Don't log MCEs from before booting. */ | ||
847 | static int __init mcheck_enable(char *str) | ||
848 | { | ||
849 | if (!strcmp(str, "off")) | ||
850 | mce_dont_init = 1; | ||
851 | else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) | ||
852 | mce_bootlog = str[0] == 'b'; | ||
853 | else if (isdigit(str[0])) | ||
854 | get_option(&str, &tolerant); | ||
855 | else | ||
856 | printk("mce= argument %s ignored. Please use /sys", str); | ||
857 | return 1; | ||
858 | } | ||
859 | |||
860 | __setup("nomce", mcheck_disable); | ||
861 | __setup("mce=", mcheck_enable); | ||
862 | |||
863 | /* | ||
864 | * Sysfs support | ||
865 | */ | ||
866 | |||
867 | /* | ||
868 | * Disable machine checks on suspend and shutdown. We can't really handle | ||
869 | * them later. | ||
870 | */ | ||
871 | static int mce_disable(void) | ||
872 | { | ||
873 | int i; | ||
874 | |||
875 | for (i = 0; i < banks; i++) | ||
876 | wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); | ||
877 | return 0; | ||
878 | } | ||
879 | |||
880 | static int mce_suspend(struct sys_device *dev, pm_message_t state) | ||
881 | { | ||
882 | return mce_disable(); | ||
883 | } | ||
884 | |||
885 | static int mce_shutdown(struct sys_device *dev) | ||
886 | { | ||
887 | return mce_disable(); | ||
888 | } | ||
889 | |||
890 | /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. | ||
891 | Only one CPU is active at this time, the others get readded later using | ||
892 | CPU hotplug. */ | ||
893 | static int mce_resume(struct sys_device *dev) | ||
894 | { | ||
895 | mce_init(NULL); | ||
896 | mce_cpu_features(&current_cpu_data); | ||
897 | return 0; | ||
898 | } | ||
899 | |||
900 | static void mce_cpu_restart(void *data) | ||
901 | { | ||
902 | del_timer_sync(&__get_cpu_var(mce_timer)); | ||
903 | if (mce_available(&current_cpu_data)) | ||
904 | mce_init(NULL); | ||
905 | mce_init_timer(); | ||
906 | } | ||
907 | |||
908 | /* Reinit MCEs after user configuration changes */ | ||
909 | static void mce_restart(void) | ||
910 | { | ||
911 | on_each_cpu(mce_cpu_restart, NULL, 1); | ||
912 | } | ||
913 | |||
914 | static struct sysdev_class mce_sysclass = { | ||
915 | .suspend = mce_suspend, | ||
916 | .shutdown = mce_shutdown, | ||
917 | .resume = mce_resume, | ||
918 | .name = "machinecheck", | ||
919 | }; | ||
920 | |||
921 | DEFINE_PER_CPU(struct sys_device, device_mce); | ||
922 | void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata; | ||
923 | |||
924 | /* Why are there no generic functions for this? */ | ||
925 | #define ACCESSOR(name, var, start) \ | ||
926 | static ssize_t show_ ## name(struct sys_device *s, \ | ||
927 | struct sysdev_attribute *attr, \ | ||
928 | char *buf) { \ | ||
929 | return sprintf(buf, "%lx\n", (unsigned long)var); \ | ||
930 | } \ | ||
931 | static ssize_t set_ ## name(struct sys_device *s, \ | ||
932 | struct sysdev_attribute *attr, \ | ||
933 | const char *buf, size_t siz) { \ | ||
934 | char *end; \ | ||
935 | unsigned long new = simple_strtoul(buf, &end, 0); \ | ||
936 | if (end == buf) return -EINVAL; \ | ||
937 | var = new; \ | ||
938 | start; \ | ||
939 | return end-buf; \ | ||
940 | } \ | ||
941 | static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); | ||
942 | |||
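The ACCESSOR() macro above stamps out a show/set pair plus the SYSDEV_ATTR for a given variable, with an optional statement run after each store. Roughly what the ACCESSOR(check_interval,check_interval,mce_restart()) use further down expands to (whitespace added; a sketch of the expansion, not the preprocessor's literal output):

static ssize_t show_check_interval(struct sys_device *s,
				   struct sysdev_attribute *attr, char *buf)
{
	return sprintf(buf, "%lx\n", (unsigned long)check_interval);
}

static ssize_t set_check_interval(struct sys_device *s,
				  struct sysdev_attribute *attr,
				  const char *buf, size_t siz)
{
	char *end;
	unsigned long new = simple_strtoul(buf, &end, 0);

	if (end == buf)
		return -EINVAL;
	check_interval = new;
	mce_restart();
	return end - buf;
}

static SYSDEV_ATTR(check_interval, 0644, show_check_interval, set_check_interval);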
943 | static struct sysdev_attribute *bank_attrs; | ||
944 | |||
945 | static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, | ||
946 | char *buf) | ||
947 | { | ||
948 | u64 b = bank[attr - bank_attrs]; | ||
949 | return sprintf(buf, "%llx\n", b); | ||
950 | } | ||
951 | |||
952 | static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, | ||
953 | const char *buf, size_t siz) | ||
954 | { | ||
955 | char *end; | ||
956 | u64 new = simple_strtoull(buf, &end, 0); | ||
957 | if (end == buf) | ||
958 | return -EINVAL; | ||
959 | bank[attr - bank_attrs] = new; | ||
960 | mce_restart(); | ||
961 | return end-buf; | ||
962 | } | ||
963 | |||
964 | static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, | ||
965 | char *buf) | ||
966 | { | ||
967 | strcpy(buf, trigger); | ||
968 | strcat(buf, "\n"); | ||
969 | return strlen(trigger) + 1; | ||
970 | } | ||
971 | |||
972 | static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, | ||
973 | const char *buf,size_t siz) | ||
974 | { | ||
975 | char *p; | ||
976 | int len; | ||
977 | strncpy(trigger, buf, sizeof(trigger)); | ||
978 | trigger[sizeof(trigger)-1] = 0; | ||
979 | len = strlen(trigger); | ||
980 | p = strchr(trigger, '\n'); | ||
981 | if (*p) *p = 0; | ||
982 | return len; | ||
983 | } | ||
984 | |||
985 | static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); | ||
986 | static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); | ||
987 | ACCESSOR(check_interval,check_interval,mce_restart()) | ||
988 | static struct sysdev_attribute *mce_attributes[] = { | ||
989 | &attr_tolerant.attr, &attr_check_interval, &attr_trigger, | ||
990 | NULL | ||
991 | }; | ||
992 | |||
993 | static cpumask_var_t mce_device_initialized; | ||
994 | |||
995 | /* Per cpu sysdev init. All of the cpus still share the same ctl bank */ | ||
996 | static __cpuinit int mce_create_device(unsigned int cpu) | ||
997 | { | ||
998 | int err; | ||
999 | int i; | ||
1000 | |||
1001 | if (!mce_available(&boot_cpu_data)) | ||
1002 | return -EIO; | ||
1003 | |||
1004 | memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); | ||
1005 | per_cpu(device_mce,cpu).id = cpu; | ||
1006 | per_cpu(device_mce,cpu).cls = &mce_sysclass; | ||
1007 | |||
1008 | err = sysdev_register(&per_cpu(device_mce,cpu)); | ||
1009 | if (err) | ||
1010 | return err; | ||
1011 | |||
1012 | for (i = 0; mce_attributes[i]; i++) { | ||
1013 | err = sysdev_create_file(&per_cpu(device_mce,cpu), | ||
1014 | mce_attributes[i]); | ||
1015 | if (err) | ||
1016 | goto error; | ||
1017 | } | ||
1018 | for (i = 0; i < banks; i++) { | ||
1019 | err = sysdev_create_file(&per_cpu(device_mce, cpu), | ||
1020 | &bank_attrs[i]); | ||
1021 | if (err) | ||
1022 | goto error2; | ||
1023 | } | ||
1024 | cpumask_set_cpu(cpu, mce_device_initialized); | ||
1025 | |||
1026 | return 0; | ||
1027 | error2: | ||
1028 | while (--i >= 0) { | ||
1029 | sysdev_remove_file(&per_cpu(device_mce, cpu), | ||
1030 | &bank_attrs[i]); | ||
1031 | } | ||
1032 | error: | ||
1033 | while (--i >= 0) { | ||
1034 | sysdev_remove_file(&per_cpu(device_mce,cpu), | ||
1035 | mce_attributes[i]); | ||
1036 | } | ||
1037 | sysdev_unregister(&per_cpu(device_mce,cpu)); | ||
1038 | |||
1039 | return err; | ||
1040 | } | ||
1041 | |||
1042 | static __cpuinit void mce_remove_device(unsigned int cpu) | ||
1043 | { | ||
1044 | int i; | ||
1045 | |||
1046 | if (!cpumask_test_cpu(cpu, mce_device_initialized)) | ||
1047 | return; | ||
1048 | |||
1049 | for (i = 0; mce_attributes[i]; i++) | ||
1050 | sysdev_remove_file(&per_cpu(device_mce,cpu), | ||
1051 | mce_attributes[i]); | ||
1052 | for (i = 0; i < banks; i++) | ||
1053 | sysdev_remove_file(&per_cpu(device_mce, cpu), | ||
1054 | &bank_attrs[i]); | ||
1055 | sysdev_unregister(&per_cpu(device_mce,cpu)); | ||
1056 | cpumask_clear_cpu(cpu, mce_device_initialized); | ||
1057 | } | ||
1058 | |||
1059 | /* Make sure there are no machine checks on offlined CPUs. */ | ||
1060 | static void mce_disable_cpu(void *h) | ||
1061 | { | ||
1062 | int i; | ||
1063 | unsigned long action = *(unsigned long *)h; | ||
1064 | |||
1065 | if (!mce_available(&current_cpu_data)) | ||
1066 | return; | ||
1067 | if (!(action & CPU_TASKS_FROZEN)) | ||
1068 | cmci_clear(); | ||
1069 | for (i = 0; i < banks; i++) | ||
1070 | wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); | ||
1071 | } | ||
1072 | |||
1073 | static void mce_reenable_cpu(void *h) | ||
1074 | { | ||
1075 | int i; | ||
1076 | unsigned long action = *(unsigned long *)h; | ||
1077 | |||
1078 | if (!mce_available(&current_cpu_data)) | ||
1079 | return; | ||
1080 | if (!(action & CPU_TASKS_FROZEN)) | ||
1081 | cmci_reenable(); | ||
1082 | for (i = 0; i < banks; i++) | ||
1083 | wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); | ||
1084 | } | ||
1085 | |||
1086 | /* Get notified when a cpu comes on/off. Be hotplug friendly. */ | ||
1087 | static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, | ||
1088 | unsigned long action, void *hcpu) | ||
1089 | { | ||
1090 | unsigned int cpu = (unsigned long)hcpu; | ||
1091 | struct timer_list *t = &per_cpu(mce_timer, cpu); | ||
1092 | |||
1093 | switch (action) { | ||
1094 | case CPU_ONLINE: | ||
1095 | case CPU_ONLINE_FROZEN: | ||
1096 | mce_create_device(cpu); | ||
1097 | if (threshold_cpu_callback) | ||
1098 | threshold_cpu_callback(action, cpu); | ||
1099 | break; | ||
1100 | case CPU_DEAD: | ||
1101 | case CPU_DEAD_FROZEN: | ||
1102 | if (threshold_cpu_callback) | ||
1103 | threshold_cpu_callback(action, cpu); | ||
1104 | mce_remove_device(cpu); | ||
1105 | break; | ||
1106 | case CPU_DOWN_PREPARE: | ||
1107 | case CPU_DOWN_PREPARE_FROZEN: | ||
1108 | del_timer_sync(t); | ||
1109 | smp_call_function_single(cpu, mce_disable_cpu, &action, 1); | ||
1110 | break; | ||
1111 | case CPU_DOWN_FAILED: | ||
1112 | case CPU_DOWN_FAILED_FROZEN: | ||
1113 | t->expires = round_jiffies(jiffies + | ||
1114 | __get_cpu_var(next_interval)); | ||
1115 | add_timer_on(t, cpu); | ||
1116 | smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); | ||
1117 | break; | ||
1118 | case CPU_POST_DEAD: | ||
1119 | /* intentionally ignoring frozen here */ | ||
1120 | cmci_rediscover(cpu); | ||
1121 | break; | ||
1122 | } | ||
1123 | return NOTIFY_OK; | ||
1124 | } | ||
1125 | |||
1126 | static struct notifier_block mce_cpu_notifier __cpuinitdata = { | ||
1127 | .notifier_call = mce_cpu_callback, | ||
1128 | }; | ||
1129 | |||
1130 | static __init int mce_init_banks(void) | ||
1131 | { | ||
1132 | int i; | ||
1133 | |||
1134 | bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, | ||
1135 | GFP_KERNEL); | ||
1136 | if (!bank_attrs) | ||
1137 | return -ENOMEM; | ||
1138 | |||
1139 | for (i = 0; i < banks; i++) { | ||
1140 | struct sysdev_attribute *a = &bank_attrs[i]; | ||
1141 | a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); | ||
1142 | if (!a->attr.name) | ||
1143 | goto nomem; | ||
1144 | a->attr.mode = 0644; | ||
1145 | a->show = show_bank; | ||
1146 | a->store = set_bank; | ||
1147 | } | ||
1148 | return 0; | ||
1149 | |||
1150 | nomem: | ||
1151 | while (--i >= 0) | ||
1152 | kfree(bank_attrs[i].attr.name); | ||
1153 | kfree(bank_attrs); | ||
1154 | bank_attrs = NULL; | ||
1155 | return -ENOMEM; | ||
1156 | } | ||
1157 | |||
1158 | static __init int mce_init_device(void) | ||
1159 | { | ||
1160 | int err; | ||
1161 | int i = 0; | ||
1162 | |||
1163 | if (!mce_available(&boot_cpu_data)) | ||
1164 | return -EIO; | ||
1165 | |||
1166 | zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); | ||
1167 | |||
1168 | err = mce_init_banks(); | ||
1169 | if (err) | ||
1170 | return err; | ||
1171 | |||
1172 | err = sysdev_class_register(&mce_sysclass); | ||
1173 | if (err) | ||
1174 | return err; | ||
1175 | |||
1176 | for_each_online_cpu(i) { | ||
1177 | err = mce_create_device(i); | ||
1178 | if (err) | ||
1179 | return err; | ||
1180 | } | ||
1181 | |||
1182 | register_hotcpu_notifier(&mce_cpu_notifier); | ||
1183 | misc_register(&mce_log_device); | ||
1184 | return err; | ||
1185 | } | ||
1186 | |||
1187 | device_initcall(mce_init_device); | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 56dde9c4bc96..ddae21620bda 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c | |||
@@ -13,22 +13,22 @@ | |||
13 | * | 13 | * |
14 | * All MC4_MISCi registers are shared between multi-cores | 14 | * All MC4_MISCi registers are shared between multi-cores |
15 | */ | 15 | */ |
16 | |||
17 | #include <linux/cpu.h> | ||
18 | #include <linux/errno.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
21 | #include <linux/kobject.h> | ||
22 | #include <linux/notifier.h> | 17 | #include <linux/notifier.h> |
23 | #include <linux/sched.h> | 18 | #include <linux/kobject.h> |
24 | #include <linux/smp.h> | 19 | #include <linux/percpu.h> |
25 | #include <linux/sysdev.h> | 20 | #include <linux/sysdev.h> |
21 | #include <linux/errno.h> | ||
22 | #include <linux/sched.h> | ||
26 | #include <linux/sysfs.h> | 23 | #include <linux/sysfs.h> |
24 | #include <linux/init.h> | ||
25 | #include <linux/cpu.h> | ||
26 | #include <linux/smp.h> | ||
27 | |||
27 | #include <asm/apic.h> | 28 | #include <asm/apic.h> |
29 | #include <asm/idle.h> | ||
28 | #include <asm/mce.h> | 30 | #include <asm/mce.h> |
29 | #include <asm/msr.h> | 31 | #include <asm/msr.h> |
30 | #include <asm/percpu.h> | ||
31 | #include <asm/idle.h> | ||
32 | 32 | ||
33 | #define PFX "mce_threshold: " | 33 | #define PFX "mce_threshold: " |
34 | #define VERSION "version 1.1.1" | 34 | #define VERSION "version 1.1.1" |
@@ -48,26 +48,26 @@ | |||
48 | #define MCG_XBLK_ADDR 0xC0000400 | 48 | #define MCG_XBLK_ADDR 0xC0000400 |
49 | 49 | ||
50 | struct threshold_block { | 50 | struct threshold_block { |
51 | unsigned int block; | 51 | unsigned int block; |
52 | unsigned int bank; | 52 | unsigned int bank; |
53 | unsigned int cpu; | 53 | unsigned int cpu; |
54 | u32 address; | 54 | u32 address; |
55 | u16 interrupt_enable; | 55 | u16 interrupt_enable; |
56 | u16 threshold_limit; | 56 | u16 threshold_limit; |
57 | struct kobject kobj; | 57 | struct kobject kobj; |
58 | struct list_head miscj; | 58 | struct list_head miscj; |
59 | }; | 59 | }; |
60 | 60 | ||
61 | /* defaults used early on boot */ | 61 | /* defaults used early on boot */ |
62 | static struct threshold_block threshold_defaults = { | 62 | static struct threshold_block threshold_defaults = { |
63 | .interrupt_enable = 0, | 63 | .interrupt_enable = 0, |
64 | .threshold_limit = THRESHOLD_MAX, | 64 | .threshold_limit = THRESHOLD_MAX, |
65 | }; | 65 | }; |
66 | 66 | ||
67 | struct threshold_bank { | 67 | struct threshold_bank { |
68 | struct kobject *kobj; | 68 | struct kobject *kobj; |
69 | struct threshold_block *blocks; | 69 | struct threshold_block *blocks; |
70 | cpumask_var_t cpus; | 70 | cpumask_var_t cpus; |
71 | }; | 71 | }; |
72 | static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); | 72 | static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); |
73 | 73 | ||
@@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void); | |||
86 | */ | 86 | */ |
87 | 87 | ||
88 | struct thresh_restart { | 88 | struct thresh_restart { |
89 | struct threshold_block *b; | 89 | struct threshold_block *b; |
90 | int reset; | 90 | int reset; |
91 | u16 old_limit; | 91 | u16 old_limit; |
92 | }; | 92 | }; |
93 | 93 | ||
94 | /* must be called with correct cpu affinity */ | 94 | /* must be called with correct cpu affinity */ |
@@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr) | |||
110 | } else if (tr->old_limit) { /* change limit w/o reset */ | 110 | } else if (tr->old_limit) { /* change limit w/o reset */ |
111 | int new_count = (mci_misc_hi & THRESHOLD_MAX) + | 111 | int new_count = (mci_misc_hi & THRESHOLD_MAX) + |
112 | (tr->old_limit - tr->b->threshold_limit); | 112 | (tr->old_limit - tr->b->threshold_limit); |
113 | |||
113 | mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | | 114 | mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | |
114 | (new_count & THRESHOLD_MAX); | 115 | (new_count & THRESHOLD_MAX); |
115 | } | 116 | } |
@@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr) | |||
125 | /* cpu init entry point, called from mce.c with preempt off */ | 126 | /* cpu init entry point, called from mce.c with preempt off */ |
126 | void mce_amd_feature_init(struct cpuinfo_x86 *c) | 127 | void mce_amd_feature_init(struct cpuinfo_x86 *c) |
127 | { | 128 | { |
128 | unsigned int bank, block; | ||
129 | unsigned int cpu = smp_processor_id(); | 129 | unsigned int cpu = smp_processor_id(); |
130 | u8 lvt_off; | ||
131 | u32 low = 0, high = 0, address = 0; | 130 | u32 low = 0, high = 0, address = 0; |
131 | unsigned int bank, block; | ||
132 | struct thresh_restart tr; | 132 | struct thresh_restart tr; |
133 | u8 lvt_off; | ||
133 | 134 | ||
134 | for (bank = 0; bank < NR_BANKS; ++bank) { | 135 | for (bank = 0; bank < NR_BANKS; ++bank) { |
135 | for (block = 0; block < NR_BLOCKS; ++block) { | 136 | for (block = 0; block < NR_BLOCKS; ++block) { |
@@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) | |||
140 | if (!address) | 141 | if (!address) |
141 | break; | 142 | break; |
142 | address += MCG_XBLK_ADDR; | 143 | address += MCG_XBLK_ADDR; |
143 | } | 144 | } else |
144 | else | ||
145 | ++address; | 145 | ++address; |
146 | 146 | ||
147 | if (rdmsr_safe(address, &low, &high)) | 147 | if (rdmsr_safe(address, &low, &high)) |
@@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) | |||
193 | */ | 193 | */ |
194 | static void amd_threshold_interrupt(void) | 194 | static void amd_threshold_interrupt(void) |
195 | { | 195 | { |
196 | u32 low = 0, high = 0, address = 0; | ||
196 | unsigned int bank, block; | 197 | unsigned int bank, block; |
197 | struct mce m; | 198 | struct mce m; |
198 | u32 low = 0, high = 0, address = 0; | ||
199 | 199 | ||
200 | mce_setup(&m); | 200 | mce_setup(&m); |
201 | 201 | ||
@@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void) | |||
204 | if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) | 204 | if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) |
205 | continue; | 205 | continue; |
206 | for (block = 0; block < NR_BLOCKS; ++block) { | 206 | for (block = 0; block < NR_BLOCKS; ++block) { |
207 | if (block == 0) | 207 | if (block == 0) { |
208 | address = MSR_IA32_MC0_MISC + bank * 4; | 208 | address = MSR_IA32_MC0_MISC + bank * 4; |
209 | else if (block == 1) { | 209 | } else if (block == 1) { |
210 | address = (low & MASK_BLKPTR_LO) >> 21; | 210 | address = (low & MASK_BLKPTR_LO) >> 21; |
211 | if (!address) | 211 | if (!address) |
212 | break; | 212 | break; |
213 | address += MCG_XBLK_ADDR; | 213 | address += MCG_XBLK_ADDR; |
214 | } | 214 | } else { |
215 | else | ||
216 | ++address; | 215 | ++address; |
216 | } | ||
217 | 217 | ||
218 | if (rdmsr_safe(address, &low, &high)) | 218 | if (rdmsr_safe(address, &low, &high)) |
219 | break; | 219 | break; |
@@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void) | |||
229 | (high & MASK_LOCKED_HI)) | 229 | (high & MASK_LOCKED_HI)) |
230 | continue; | 230 | continue; |
231 | 231 | ||
232 | /* Log the machine check that caused the threshold | 232 | /* |
233 | event. */ | 233 | * Log the machine check that caused the threshold |
234 | * event. | ||
235 | */ | ||
234 | machine_check_poll(MCP_TIMESTAMP, | 236 | machine_check_poll(MCP_TIMESTAMP, |
235 | &__get_cpu_var(mce_poll_banks)); | 237 | &__get_cpu_var(mce_poll_banks)); |
236 | 238 | ||
@@ -254,48 +256,52 @@ static void amd_threshold_interrupt(void) | |||
254 | 256 | ||
255 | struct threshold_attr { | 257 | struct threshold_attr { |
256 | struct attribute attr; | 258 | struct attribute attr; |
257 | ssize_t(*show) (struct threshold_block *, char *); | 259 | ssize_t (*show) (struct threshold_block *, char *); |
258 | ssize_t(*store) (struct threshold_block *, const char *, size_t count); | 260 | ssize_t (*store) (struct threshold_block *, const char *, size_t count); |
259 | }; | 261 | }; |
260 | 262 | ||
261 | #define SHOW_FIELDS(name) \ | 263 | #define SHOW_FIELDS(name) \ |
262 | static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ | 264 | static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ |
263 | { \ | 265 | { \ |
264 | return sprintf(buf, "%lx\n", (unsigned long) b->name); \ | 266 | return sprintf(buf, "%lx\n", (unsigned long) b->name); \ |
265 | } | 267 | } |
266 | SHOW_FIELDS(interrupt_enable) | 268 | SHOW_FIELDS(interrupt_enable) |
267 | SHOW_FIELDS(threshold_limit) | 269 | SHOW_FIELDS(threshold_limit) |
268 | 270 | ||
269 | static ssize_t store_interrupt_enable(struct threshold_block *b, | 271 | static ssize_t |
270 | const char *buf, size_t count) | 272 | store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) |
271 | { | 273 | { |
272 | char *end; | ||
273 | struct thresh_restart tr; | 274 | struct thresh_restart tr; |
274 | unsigned long new = simple_strtoul(buf, &end, 0); | 275 | unsigned long new; |
275 | if (end == buf) | 276 | |
277 | if (strict_strtoul(buf, 0, &new) < 0) | ||
276 | return -EINVAL; | 278 | return -EINVAL; |
279 | |||
277 | b->interrupt_enable = !!new; | 280 | b->interrupt_enable = !!new; |
278 | 281 | ||
279 | tr.b = b; | 282 | tr.b = b; |
280 | tr.reset = 0; | 283 | tr.reset = 0; |
281 | tr.old_limit = 0; | 284 | tr.old_limit = 0; |
285 | |||
282 | smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); | 286 | smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); |
283 | 287 | ||
284 | return end - buf; | 288 | return size; |
285 | } | 289 | } |
286 | 290 | ||
287 | static ssize_t store_threshold_limit(struct threshold_block *b, | 291 | static ssize_t |
288 | const char *buf, size_t count) | 292 | store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) |
289 | { | 293 | { |
290 | char *end; | ||
291 | struct thresh_restart tr; | 294 | struct thresh_restart tr; |
292 | unsigned long new = simple_strtoul(buf, &end, 0); | 295 | unsigned long new; |
293 | if (end == buf) | 296 | |
297 | if (strict_strtoul(buf, 0, &new) < 0) | ||
294 | return -EINVAL; | 298 | return -EINVAL; |
299 | |||
295 | if (new > THRESHOLD_MAX) | 300 | if (new > THRESHOLD_MAX) |
296 | new = THRESHOLD_MAX; | 301 | new = THRESHOLD_MAX; |
297 | if (new < 1) | 302 | if (new < 1) |
298 | new = 1; | 303 | new = 1; |
304 | |||
299 | tr.old_limit = b->threshold_limit; | 305 | tr.old_limit = b->threshold_limit; |
300 | b->threshold_limit = new; | 306 | b->threshold_limit = new; |
301 | tr.b = b; | 307 | tr.b = b; |
@@ -303,12 +309,12 @@ static ssize_t store_threshold_limit(struct threshold_block *b, | |||
303 | 309 | ||
304 | smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); | 310 | smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); |
305 | 311 | ||
306 | return end - buf; | 312 | return size; |
307 | } | 313 | } |
308 | 314 | ||
309 | struct threshold_block_cross_cpu { | 315 | struct threshold_block_cross_cpu { |
310 | struct threshold_block *tb; | 316 | struct threshold_block *tb; |
311 | long retval; | 317 | long retval; |
312 | }; | 318 | }; |
313 | 319 | ||
314 | static void local_error_count_handler(void *_tbcc) | 320 | static void local_error_count_handler(void *_tbcc) |
@@ -338,16 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b, | |||
338 | return 1; | 344 | return 1; |
339 | } | 345 | } |
340 | 346 | ||
341 | #define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ | 347 | #define RW_ATTR(val) \ |
342 | .attr = {.name = __stringify(_name), .mode = _mode }, \ | 348 | static struct threshold_attr val = { \ |
343 | .show = _show, \ | 349 | .attr = {.name = __stringify(val), .mode = 0644 }, \ |
344 | .store = _store, \ | 350 | .show = show_## val, \ |
351 | .store = store_## val, \ | ||
345 | }; | 352 | }; |
346 | 353 | ||
347 | #define RW_ATTR(name) \ | ||
348 | static struct threshold_attr name = \ | ||
349 | THRESHOLD_ATTR(name, 0644, show_## name, store_## name) | ||
350 | |||
351 | RW_ATTR(interrupt_enable); | 354 | RW_ATTR(interrupt_enable); |
352 | RW_ATTR(threshold_limit); | 355 | RW_ATTR(threshold_limit); |
353 | RW_ATTR(error_count); | 356 | RW_ATTR(error_count); |
@@ -359,15 +362,17 @@ static struct attribute *default_attrs[] = { | |||
359 | NULL | 362 | NULL |
360 | }; | 363 | }; |
361 | 364 | ||
362 | #define to_block(k) container_of(k, struct threshold_block, kobj) | 365 | #define to_block(k) container_of(k, struct threshold_block, kobj) |
363 | #define to_attr(a) container_of(a, struct threshold_attr, attr) | 366 | #define to_attr(a) container_of(a, struct threshold_attr, attr) |
364 | 367 | ||
365 | static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) | 368 | static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) |
366 | { | 369 | { |
367 | struct threshold_block *b = to_block(kobj); | 370 | struct threshold_block *b = to_block(kobj); |
368 | struct threshold_attr *a = to_attr(attr); | 371 | struct threshold_attr *a = to_attr(attr); |
369 | ssize_t ret; | 372 | ssize_t ret; |
373 | |||
370 | ret = a->show ? a->show(b, buf) : -EIO; | 374 | ret = a->show ? a->show(b, buf) : -EIO; |
375 | |||
371 | return ret; | 376 | return ret; |
372 | } | 377 | } |
373 | 378 | ||
@@ -377,18 +382,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, | |||
377 | struct threshold_block *b = to_block(kobj); | 382 | struct threshold_block *b = to_block(kobj); |
378 | struct threshold_attr *a = to_attr(attr); | 383 | struct threshold_attr *a = to_attr(attr); |
379 | ssize_t ret; | 384 | ssize_t ret; |
385 | |||
380 | ret = a->store ? a->store(b, buf, count) : -EIO; | 386 | ret = a->store ? a->store(b, buf, count) : -EIO; |
387 | |||
381 | return ret; | 388 | return ret; |
382 | } | 389 | } |
383 | 390 | ||
384 | static struct sysfs_ops threshold_ops = { | 391 | static struct sysfs_ops threshold_ops = { |
385 | .show = show, | 392 | .show = show, |
386 | .store = store, | 393 | .store = store, |
387 | }; | 394 | }; |
388 | 395 | ||
389 | static struct kobj_type threshold_ktype = { | 396 | static struct kobj_type threshold_ktype = { |
390 | .sysfs_ops = &threshold_ops, | 397 | .sysfs_ops = &threshold_ops, |
391 | .default_attrs = default_attrs, | 398 | .default_attrs = default_attrs, |
392 | }; | 399 | }; |
393 | 400 | ||
394 | static __cpuinit int allocate_threshold_blocks(unsigned int cpu, | 401 | static __cpuinit int allocate_threshold_blocks(unsigned int cpu, |
@@ -396,9 +403,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, | |||
396 | unsigned int block, | 403 | unsigned int block, |
397 | u32 address) | 404 | u32 address) |
398 | { | 405 | { |
399 | int err; | ||
400 | u32 low, high; | ||
401 | struct threshold_block *b = NULL; | 406 | struct threshold_block *b = NULL; |
407 | u32 low, high; | ||
408 | int err; | ||
402 | 409 | ||
403 | if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) | 410 | if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) |
404 | return 0; | 411 | return 0; |
@@ -421,20 +428,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, | |||
421 | if (!b) | 428 | if (!b) |
422 | return -ENOMEM; | 429 | return -ENOMEM; |
423 | 430 | ||
424 | b->block = block; | 431 | b->block = block; |
425 | b->bank = bank; | 432 | b->bank = bank; |
426 | b->cpu = cpu; | 433 | b->cpu = cpu; |
427 | b->address = address; | 434 | b->address = address; |
428 | b->interrupt_enable = 0; | 435 | b->interrupt_enable = 0; |
429 | b->threshold_limit = THRESHOLD_MAX; | 436 | b->threshold_limit = THRESHOLD_MAX; |
430 | 437 | ||
431 | INIT_LIST_HEAD(&b->miscj); | 438 | INIT_LIST_HEAD(&b->miscj); |
432 | 439 | ||
433 | if (per_cpu(threshold_banks, cpu)[bank]->blocks) | 440 | if (per_cpu(threshold_banks, cpu)[bank]->blocks) { |
434 | list_add(&b->miscj, | 441 | list_add(&b->miscj, |
435 | &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); | 442 | &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); |
436 | else | 443 | } else { |
437 | per_cpu(threshold_banks, cpu)[bank]->blocks = b; | 444 | per_cpu(threshold_banks, cpu)[bank]->blocks = b; |
445 | } | ||
438 | 446 | ||
439 | err = kobject_init_and_add(&b->kobj, &threshold_ktype, | 447 | err = kobject_init_and_add(&b->kobj, &threshold_ktype, |
440 | per_cpu(threshold_banks, cpu)[bank]->kobj, | 448 | per_cpu(threshold_banks, cpu)[bank]->kobj, |
@@ -447,8 +455,9 @@ recurse: | |||
447 | if (!address) | 455 | if (!address) |
448 | return 0; | 456 | return 0; |
449 | address += MCG_XBLK_ADDR; | 457 | address += MCG_XBLK_ADDR; |
450 | } else | 458 | } else { |
451 | ++address; | 459 | ++address; |
460 | } | ||
452 | 461 | ||
453 | err = allocate_threshold_blocks(cpu, bank, ++block, address); | 462 | err = allocate_threshold_blocks(cpu, bank, ++block, address); |
454 | if (err) | 463 | if (err) |
@@ -500,13 +509,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
500 | if (!b) | 509 | if (!b) |
501 | goto out; | 510 | goto out; |
502 | 511 | ||
503 | err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, | 512 | err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, |
504 | b->kobj, name); | 513 | b->kobj, name); |
505 | if (err) | 514 | if (err) |
506 | goto out; | 515 | goto out; |
507 | 516 | ||
508 | cpumask_copy(b->cpus, cpu_core_mask(cpu)); | 517 | cpumask_copy(b->cpus, cpu_core_mask(cpu)); |
509 | per_cpu(threshold_banks, cpu)[bank] = b; | 518 | per_cpu(threshold_banks, cpu)[bank] = b; |
519 | |||
510 | goto out; | 520 | goto out; |
511 | } | 521 | } |
512 | #endif | 522 | #endif |
@@ -522,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
522 | goto out; | 532 | goto out; |
523 | } | 533 | } |
524 | 534 | ||
525 | b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); | 535 | b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); |
526 | if (!b->kobj) | 536 | if (!b->kobj) |
527 | goto out_free; | 537 | goto out_free; |
528 | 538 | ||
@@ -542,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
542 | if (i == cpu) | 552 | if (i == cpu) |
543 | continue; | 553 | continue; |
544 | 554 | ||
545 | err = sysfs_create_link(&per_cpu(device_mce, i).kobj, | 555 | err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, |
546 | b->kobj, name); | 556 | b->kobj, name); |
547 | if (err) | 557 | if (err) |
548 | goto out; | 558 | goto out; |
@@ -605,15 +615,13 @@ static void deallocate_threshold_block(unsigned int cpu, | |||
605 | 615 | ||
606 | static void threshold_remove_bank(unsigned int cpu, int bank) | 616 | static void threshold_remove_bank(unsigned int cpu, int bank) |
607 | { | 617 | { |
608 | int i = 0; | ||
609 | struct threshold_bank *b; | 618 | struct threshold_bank *b; |
610 | char name[32]; | 619 | char name[32]; |
620 | int i = 0; | ||
611 | 621 | ||
612 | b = per_cpu(threshold_banks, cpu)[bank]; | 622 | b = per_cpu(threshold_banks, cpu)[bank]; |
613 | |||
614 | if (!b) | 623 | if (!b) |
615 | return; | 624 | return; |
616 | |||
617 | if (!b->blocks) | 625 | if (!b->blocks) |
618 | goto free_out; | 626 | goto free_out; |
619 | 627 | ||
@@ -622,8 +630,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank) | |||
622 | #ifdef CONFIG_SMP | 630 | #ifdef CONFIG_SMP |
623 | /* sibling symlink */ | 631 | /* sibling symlink */ |
624 | if (shared_bank[bank] && b->blocks->cpu != cpu) { | 632 | if (shared_bank[bank] && b->blocks->cpu != cpu) { |
625 | sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); | 633 | sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); |
626 | per_cpu(threshold_banks, cpu)[bank] = NULL; | 634 | per_cpu(threshold_banks, cpu)[bank] = NULL; |
635 | |||
627 | return; | 636 | return; |
628 | } | 637 | } |
629 | #endif | 638 | #endif |
@@ -633,7 +642,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) | |||
633 | if (i == cpu) | 642 | if (i == cpu) |
634 | continue; | 643 | continue; |
635 | 644 | ||
636 | sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); | 645 | sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); |
637 | per_cpu(threshold_banks, i)[bank] = NULL; | 646 | per_cpu(threshold_banks, i)[bank] = NULL; |
638 | } | 647 | } |
639 | 648 | ||
@@ -659,12 +668,9 @@ static void threshold_remove_device(unsigned int cpu) | |||
659 | } | 668 | } |
660 | 669 | ||
661 | /* get notified when a cpu comes on/off */ | 670 | /* get notified when a cpu comes on/off */ |
662 | static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, | 671 | static void __cpuinit |
663 | unsigned int cpu) | 672 | amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) |
664 | { | 673 | { |
665 | if (cpu >= NR_CPUS) | ||
666 | return; | ||
667 | |||
668 | switch (action) { | 674 | switch (action) { |
669 | case CPU_ONLINE: | 675 | case CPU_ONLINE: |
670 | case CPU_ONLINE_FROZEN: | 676 | case CPU_ONLINE_FROZEN: |
@@ -686,11 +692,12 @@ static __init int threshold_init_device(void) | |||
686 | /* to hit CPUs online before the notifier is up */ | 692 | /* to hit CPUs online before the notifier is up */ |
687 | for_each_online_cpu(lcpu) { | 693 | for_each_online_cpu(lcpu) { |
688 | int err = threshold_create_device(lcpu); | 694 | int err = threshold_create_device(lcpu); |
695 | |||
689 | if (err) | 696 | if (err) |
690 | return err; | 697 | return err; |
691 | } | 698 | } |
692 | threshold_cpu_callback = amd_64_threshold_cpu_callback; | 699 | threshold_cpu_callback = amd_64_threshold_cpu_callback; |
700 | |||
693 | return 0; | 701 | return 0; |
694 | } | 702 | } |
695 | |||
696 | device_initcall(threshold_init_device); | 703 | device_initcall(threshold_init_device); |
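The store_interrupt_enable()/store_threshold_limit() conversion above swaps open-coded simple_strtoul() parsing for strict_strtoul(), which fails cleanly on non-numeric input, and returns the full buffer size so sysfs treats the write as fully consumed. A sketch of the resulting handler shape; store_example() is a made-up name, and on later kernels kstrtoul() is the equivalent parser:

static ssize_t
store_example(struct threshold_block *b, const char *buf, size_t size)
{
	unsigned long new;

	if (strict_strtoul(buf, 0, &new) < 0)
		return -EINVAL;

	/* ... clamp and apply 'new' to the bank, as the real handlers do ... */

	return size;			/* whole buffer consumed */
}

Returning size instead of the parsed length keeps sysfs from re-invoking the handler with the leftover bytes, which is why both converted stores now end with "return size;".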
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c new file mode 100644 index 000000000000..2b011d2d8579 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c | |||
@@ -0,0 +1,74 @@ | |||
1 | /* | ||
2 | * Common code for Intel machine checks | ||
3 | */ | ||
4 | #include <linux/interrupt.h> | ||
5 | #include <linux/kernel.h> | ||
6 | #include <linux/types.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/smp.h> | ||
9 | |||
10 | #include <asm/therm_throt.h> | ||
11 | #include <asm/processor.h> | ||
12 | #include <asm/system.h> | ||
13 | #include <asm/apic.h> | ||
14 | #include <asm/msr.h> | ||
15 | |||
16 | #include "mce.h" | ||
17 | |||
18 | void intel_init_thermal(struct cpuinfo_x86 *c) | ||
19 | { | ||
20 | unsigned int cpu = smp_processor_id(); | ||
21 | int tm2 = 0; | ||
22 | u32 l, h; | ||
23 | |||
24 | /* Thermal monitoring depends on ACPI and clock modulation*/ | ||
25 | if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) | ||
26 | return; | ||
27 | |||
28 | /* | ||
29 | * First check if its enabled already, in which case there might | ||
30 | * be some SMM goo which handles it, so we can't even put a handler | ||
31 | * since it might be delivered via SMI already: | ||
32 | */ | ||
33 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
34 | h = apic_read(APIC_LVTTHMR); | ||
35 | if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { | ||
36 | printk(KERN_DEBUG | ||
37 | "CPU%d: Thermal monitoring handled by SMI\n", cpu); | ||
38 | return; | ||
39 | } | ||
40 | |||
41 | if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) | ||
42 | tm2 = 1; | ||
43 | |||
44 | /* Check whether a vector already exists */ | ||
45 | if (h & APIC_VECTOR_MASK) { | ||
46 | printk(KERN_DEBUG | ||
47 | "CPU%d: Thermal LVT vector (%#x) already installed\n", | ||
48 | cpu, (h & APIC_VECTOR_MASK)); | ||
49 | return; | ||
50 | } | ||
51 | |||
52 | /* We'll mask the thermal vector in the lapic till we're ready: */ | ||
53 | h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; | ||
54 | apic_write(APIC_LVTTHMR, h); | ||
55 | |||
56 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); | ||
57 | wrmsr(MSR_IA32_THERM_INTERRUPT, | ||
58 | l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); | ||
59 | |||
60 | intel_set_thermal_handler(); | ||
61 | |||
62 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
63 | wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); | ||
64 | |||
65 | /* Unmask the thermal vector: */ | ||
66 | l = apic_read(APIC_LVTTHMR); | ||
67 | apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); | ||
68 | |||
69 | printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", | ||
70 | cpu, tm2 ? "TM2" : "TM1"); | ||
71 | |||
72 | /* enable thermal throttle processing */ | ||
73 | atomic_set(&therm_throt_en, 1); | ||
74 | } | ||
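The new mce_intel.c consolidates intel_init_thermal() for the 32-bit and 64-bit paths. Its first step is the check for firmware-owned thermal handling: if TM1 is already enabled in MSR_IA32_MISC_ENABLE and the thermal LVT is routed as SMI, the kernel must not install its own handler. That test, pulled out into a helper purely for illustration (thermal_owned_by_smm() is not a symbol from this patch):

static int thermal_owned_by_smm(void)
{
	u32 l, h;

	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
	h = apic_read(APIC_LVTTHMR);

	/* TM1 enabled and the thermal LVT delivers via SMI: firmware owns it */
	return (l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI);
}

Only when this test fails does the function go on to program the LVT (masked first), enable the high/low thermal interrupts, install the vendor handler via intel_set_thermal_handler(), set TM1 and finally unmask the vector.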
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 65a0fceedcd7..f2ef6952c400 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c | |||
@@ -16,6 +16,8 @@ | |||
16 | #include <asm/idle.h> | 16 | #include <asm/idle.h> |
17 | #include <asm/therm_throt.h> | 17 | #include <asm/therm_throt.h> |
18 | 18 | ||
19 | #include "mce.h" | ||
20 | |||
19 | asmlinkage void smp_thermal_interrupt(void) | 21 | asmlinkage void smp_thermal_interrupt(void) |
20 | { | 22 | { |
21 | __u64 msr_val; | 23 | __u64 msr_val; |
@@ -26,67 +28,13 @@ asmlinkage void smp_thermal_interrupt(void) | |||
26 | irq_enter(); | 28 | irq_enter(); |
27 | 29 | ||
28 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); | 30 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); |
29 | if (therm_throt_process(msr_val & 1)) | 31 | if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) |
30 | mce_log_therm_throt_event(msr_val); | 32 | mce_log_therm_throt_event(msr_val); |
31 | 33 | ||
32 | inc_irq_stat(irq_thermal_count); | 34 | inc_irq_stat(irq_thermal_count); |
33 | irq_exit(); | 35 | irq_exit(); |
34 | } | 36 | } |
35 | 37 | ||
36 | static void intel_init_thermal(struct cpuinfo_x86 *c) | ||
37 | { | ||
38 | u32 l, h; | ||
39 | int tm2 = 0; | ||
40 | unsigned int cpu = smp_processor_id(); | ||
41 | |||
42 | if (!cpu_has(c, X86_FEATURE_ACPI)) | ||
43 | return; | ||
44 | |||
45 | if (!cpu_has(c, X86_FEATURE_ACC)) | ||
46 | return; | ||
47 | |||
48 | /* first check if TM1 is already enabled by the BIOS, in which | ||
49 | * case there might be some SMM goo which handles it, so we can't even | ||
50 | * put a handler since it might be delivered via SMI already. | ||
51 | */ | ||
52 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
53 | h = apic_read(APIC_LVTTHMR); | ||
54 | if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { | ||
55 | printk(KERN_DEBUG | ||
56 | "CPU%d: Thermal monitoring handled by SMI\n", cpu); | ||
57 | return; | ||
58 | } | ||
59 | |||
60 | if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) | ||
61 | tm2 = 1; | ||
62 | |||
63 | if (h & APIC_VECTOR_MASK) { | ||
64 | printk(KERN_DEBUG | ||
65 | "CPU%d: Thermal LVT vector (%#x) already " | ||
66 | "installed\n", cpu, (h & APIC_VECTOR_MASK)); | ||
67 | return; | ||
68 | } | ||
69 | |||
70 | h = THERMAL_APIC_VECTOR; | ||
71 | h |= (APIC_DM_FIXED | APIC_LVT_MASKED); | ||
72 | apic_write(APIC_LVTTHMR, h); | ||
73 | |||
74 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); | ||
75 | wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); | ||
76 | |||
77 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
78 | wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); | ||
79 | |||
80 | l = apic_read(APIC_LVTTHMR); | ||
81 | apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); | ||
82 | printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", | ||
83 | cpu, tm2 ? "TM2" : "TM1"); | ||
84 | |||
85 | /* enable thermal throttle processing */ | ||
86 | atomic_set(&therm_throt_en, 1); | ||
87 | return; | ||
88 | } | ||
89 | |||
90 | /* | 38 | /* |
91 | * Support for Intel Correct Machine Check Interrupts. This allows | 39 | * Support for Intel Correct Machine Check Interrupts. This allows |
92 | * the CPU to raise an interrupt when a corrected machine check happened. | 40 | * the CPU to raise an interrupt when a corrected machine check happened. |
@@ -108,6 +56,9 @@ static int cmci_supported(int *banks) | |||
108 | { | 56 | { |
109 | u64 cap; | 57 | u64 cap; |
110 | 58 | ||
59 | if (mce_cmci_disabled || mce_ignore_ce) | ||
60 | return 0; | ||
61 | |||
111 | /* | 62 | /* |
112 | * Vendor check is not strictly needed, but the initial | 63 | * Vendor check is not strictly needed, but the initial |
113 | * initialization is vendor keyed and this | 64 | * initialization is vendor keyed and this |
@@ -131,7 +82,7 @@ static int cmci_supported(int *banks) | |||
131 | static void intel_threshold_interrupt(void) | 82 | static void intel_threshold_interrupt(void) |
132 | { | 83 | { |
133 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); | 84 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); |
134 | mce_notify_user(); | 85 | mce_notify_irq(); |
135 | } | 86 | } |
136 | 87 | ||
137 | static void print_update(char *type, int *hdr, int num) | 88 | static void print_update(char *type, int *hdr, int num) |
@@ -247,7 +198,7 @@ void cmci_rediscover(int dying) | |||
247 | return; | 198 | return; |
248 | cpumask_copy(old, ¤t->cpus_allowed); | 199 | cpumask_copy(old, ¤t->cpus_allowed); |
249 | 200 | ||
250 | for_each_online_cpu (cpu) { | 201 | for_each_online_cpu(cpu) { |
251 | if (cpu == dying) | 202 | if (cpu == dying) |
252 | continue; | 203 | continue; |
253 | if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) | 204 | if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) |
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index a74af128efc9..70b710420f74 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c | |||
@@ -6,15 +6,14 @@ | |||
6 | * This file contains routines to check for non-fatal MCEs every 15s | 6 | * This file contains routines to check for non-fatal MCEs every 15s |
7 | * | 7 | * |
8 | */ | 8 | */ |
9 | |||
10 | #include <linux/init.h> | ||
11 | #include <linux/types.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/jiffies.h> | ||
14 | #include <linux/workqueue.h> | ||
15 | #include <linux/interrupt.h> | 9 | #include <linux/interrupt.h> |
16 | #include <linux/smp.h> | 10 | #include <linux/workqueue.h> |
11 | #include <linux/jiffies.h> | ||
12 | #include <linux/kernel.h> | ||
17 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/types.h> | ||
15 | #include <linux/init.h> | ||
16 | #include <linux/smp.h> | ||
18 | 17 | ||
19 | #include <asm/processor.h> | 18 | #include <asm/processor.h> |
20 | #include <asm/system.h> | 19 | #include <asm/system.h> |
@@ -22,9 +21,9 @@ | |||
22 | 21 | ||
23 | #include "mce.h" | 22 | #include "mce.h" |
24 | 23 | ||
25 | static int firstbank; | 24 | static int firstbank; |
26 | 25 | ||
27 | #define MCE_RATE 15*HZ /* timer rate is 15s */ | 26 | #define MCE_RATE (15*HZ) /* timer rate is 15s */ |
28 | 27 | ||
29 | static void mce_checkregs(void *info) | 28 | static void mce_checkregs(void *info) |
30 | { | 29 | { |
@@ -34,23 +33,24 @@ static void mce_checkregs(void *info) | |||
34 | for (i = firstbank; i < nr_mce_banks; i++) { | 33 | for (i = firstbank; i < nr_mce_banks; i++) { |
35 | rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); | 34 | rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); |
36 | 35 | ||
37 | if (high & (1<<31)) { | 36 | if (!(high & (1<<31))) |
38 | printk(KERN_INFO "MCE: The hardware reports a non " | 37 | continue; |
39 | "fatal, correctable incident occurred on " | 38 | |
40 | "CPU %d.\n", | 39 | printk(KERN_INFO "MCE: The hardware reports a non fatal, " |
40 | "correctable incident occurred on CPU %d.\n", | ||
41 | smp_processor_id()); | 41 | smp_processor_id()); |
42 | printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); | 42 | |
43 | 43 | printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); | |
44 | /* | 44 | |
45 | * Scrub the error so we don't pick it up in MCE_RATE | 45 | /* |
46 | * seconds time. | 46 | * Scrub the error so we don't pick it up in MCE_RATE |
47 | */ | 47 | * seconds time: |
48 | wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); | 48 | */ |
49 | 49 | wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); | |
50 | /* Serialize */ | 50 | |
51 | wmb(); | 51 | /* Serialize: */ |
52 | add_taint(TAINT_MACHINE_CHECK); | 52 | wmb(); |
53 | } | 53 | add_taint(TAINT_MACHINE_CHECK); |
54 | } | 54 | } |
55 | } | 55 | } |
56 | 56 | ||
@@ -77,16 +77,17 @@ static int __init init_nonfatal_mce_checker(void) | |||
77 | 77 | ||
78 | /* Some Athlons misbehave when we frob bank 0 */ | 78 | /* Some Athlons misbehave when we frob bank 0 */ |
79 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && | 79 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && |
80 | boot_cpu_data.x86 == 6) | 80 | boot_cpu_data.x86 == 6) |
81 | firstbank = 1; | 81 | firstbank = 1; |
82 | else | 82 | else |
83 | firstbank = 0; | 83 | firstbank = 0; |
84 | 84 | ||
85 | /* | 85 | /* |
86 | * Check for non-fatal errors every MCE_RATE s | 86 | * Check for non-fatal errors every MCE_RATE s |
87 | */ | 87 | */ |
88 | schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); | 88 | schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); |
89 | printk(KERN_INFO "Machine check exception polling timer started.\n"); | 89 | printk(KERN_INFO "Machine check exception polling timer started.\n"); |
90 | |||
90 | return 0; | 91 | return 0; |
91 | } | 92 | } |
92 | module_init(init_nonfatal_mce_checker); | 93 | module_init(init_nonfatal_mce_checker); |
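non-fatal.c polls the banks every MCE_RATE (now correctly parenthesized as (15*HZ)) from a delayed work item that re-arms itself. A self-rearming sketch of that pattern, assuming the re-scheduling happens at the end of the work function as in the original file; example_work/example_work_fn are illustrative names, not the file's:

#include <linux/workqueue.h>
#include <linux/jiffies.h>

#define EXAMPLE_RATE	(15*HZ)

static void example_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(example_work, example_work_fn);

static void example_work_fn(struct work_struct *work)
{
	/* ... poll the MCE banks here ... */

	/* re-arm for the next interval */
	schedule_delayed_work(&example_work,
			      round_jiffies_relative(EXAMPLE_RATE));
}

round_jiffies_relative() rounds the expiry onto a whole-jiffy-second boundary so periodic timers across CPUs can batch their wakeups, which is why the init path in the hunk above uses it for the first schedule as well.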
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index f53bdcbaf382..82cee108a2d3 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c | |||
@@ -2,18 +2,17 @@ | |||
2 | * P4 specific Machine Check Exception Reporting | 2 | * P4 specific Machine Check Exception Reporting |
3 | */ | 3 | */ |
4 | 4 | ||
5 | #include <linux/init.h> | ||
6 | #include <linux/types.h> | ||
7 | #include <linux/kernel.h> | ||
8 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
6 | #include <linux/kernel.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/smp.h> | 9 | #include <linux/smp.h> |
10 | 10 | ||
11 | #include <asm/therm_throt.h> | ||
11 | #include <asm/processor.h> | 12 | #include <asm/processor.h> |
12 | #include <asm/system.h> | 13 | #include <asm/system.h> |
13 | #include <asm/msr.h> | ||
14 | #include <asm/apic.h> | 14 | #include <asm/apic.h> |
15 | 15 | #include <asm/msr.h> | |
16 | #include <asm/therm_throt.h> | ||
17 | 16 | ||
18 | #include "mce.h" | 17 | #include "mce.h" |
19 | 18 | ||
@@ -36,6 +35,7 @@ static int mce_num_extended_msrs; | |||
36 | 35 | ||
37 | 36 | ||
38 | #ifdef CONFIG_X86_MCE_P4THERMAL | 37 | #ifdef CONFIG_X86_MCE_P4THERMAL |
38 | |||
39 | static void unexpected_thermal_interrupt(struct pt_regs *regs) | 39 | static void unexpected_thermal_interrupt(struct pt_regs *regs) |
40 | { | 40 | { |
41 | printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", | 41 | printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", |
@@ -43,7 +43,7 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs) | |||
43 | add_taint(TAINT_MACHINE_CHECK); | 43 | add_taint(TAINT_MACHINE_CHECK); |
44 | } | 44 | } |
45 | 45 | ||
46 | /* P4/Xeon Thermal transition interrupt handler */ | 46 | /* P4/Xeon Thermal transition interrupt handler: */ |
47 | static void intel_thermal_interrupt(struct pt_regs *regs) | 47 | static void intel_thermal_interrupt(struct pt_regs *regs) |
48 | { | 48 | { |
49 | __u64 msr_val; | 49 | __u64 msr_val; |
@@ -51,11 +51,12 @@ static void intel_thermal_interrupt(struct pt_regs *regs) | |||
51 | ack_APIC_irq(); | 51 | ack_APIC_irq(); |
52 | 52 | ||
53 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); | 53 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); |
54 | therm_throt_process(msr_val & 0x1); | 54 | therm_throt_process(msr_val & THERM_STATUS_PROCHOT); |
55 | } | 55 | } |
56 | 56 | ||
57 | /* Thermal interrupt handler for this CPU setup */ | 57 | /* Thermal interrupt handler for this CPU setup: */ |
58 | static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; | 58 | static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = |
59 | unexpected_thermal_interrupt; | ||
59 | 60 | ||
60 | void smp_thermal_interrupt(struct pt_regs *regs) | 61 | void smp_thermal_interrupt(struct pt_regs *regs) |
61 | { | 62 | { |
@@ -65,67 +66,15 @@ void smp_thermal_interrupt(struct pt_regs *regs) | |||
65 | irq_exit(); | 66 | irq_exit(); |
66 | } | 67 | } |
67 | 68 | ||
68 | /* P4/Xeon Thermal regulation detect and init */ | 69 | void intel_set_thermal_handler(void) |
69 | static void intel_init_thermal(struct cpuinfo_x86 *c) | ||
70 | { | 70 | { |
71 | u32 l, h; | ||
72 | unsigned int cpu = smp_processor_id(); | ||
73 | |||
74 | /* Thermal monitoring */ | ||
75 | if (!cpu_has(c, X86_FEATURE_ACPI)) | ||
76 | return; /* -ENODEV */ | ||
77 | |||
78 | /* Clock modulation */ | ||
79 | if (!cpu_has(c, X86_FEATURE_ACC)) | ||
80 | return; /* -ENODEV */ | ||
81 | |||
82 | /* first check if its enabled already, in which case there might | ||
83 | * be some SMM goo which handles it, so we can't even put a handler | ||
84 | * since it might be delivered via SMI already -zwanem. | ||
85 | */ | ||
86 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
87 | h = apic_read(APIC_LVTTHMR); | ||
88 | if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { | ||
89 | printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", | ||
90 | cpu); | ||
91 | return; /* -EBUSY */ | ||
92 | } | ||
93 | |||
94 | /* check whether a vector already exists, temporarily masked? */ | ||
95 | if (h & APIC_VECTOR_MASK) { | ||
96 | printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " | ||
97 | "installed\n", | ||
98 | cpu, (h & APIC_VECTOR_MASK)); | ||
99 | return; /* -EBUSY */ | ||
100 | } | ||
101 | |||
102 | /* The temperature transition interrupt handler setup */ | ||
103 | h = THERMAL_APIC_VECTOR; /* our delivery vector */ | ||
104 | h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ | ||
105 | apic_write(APIC_LVTTHMR, h); | ||
106 | |||
107 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); | ||
108 | wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); | ||
109 | |||
110 | /* ok we're good to go... */ | ||
111 | vendor_thermal_interrupt = intel_thermal_interrupt; | 71 | vendor_thermal_interrupt = intel_thermal_interrupt; |
112 | |||
113 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
114 | wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); | ||
115 | |||
116 | l = apic_read(APIC_LVTTHMR); | ||
117 | apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); | ||
118 | printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); | ||
119 | |||
120 | /* enable thermal throttle processing */ | ||
121 | atomic_set(&therm_throt_en, 1); | ||
122 | return; | ||
123 | } | 72 | } |
124 | #endif /* CONFIG_X86_MCE_P4THERMAL */ | ||
125 | 73 | ||
74 | #endif /* CONFIG_X86_MCE_P4THERMAL */ | ||
126 | 75 | ||
127 | /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ | 76 | /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ |
128 | static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) | 77 | static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) |
129 | { | 78 | { |
130 | u32 h; | 79 | u32 h; |
131 | 80 | ||
@@ -143,9 +92,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) | |||
143 | 92 | ||
144 | static void intel_machine_check(struct pt_regs *regs, long error_code) | 93 | static void intel_machine_check(struct pt_regs *regs, long error_code) |
145 | { | 94 | { |
146 | int recover = 1; | ||
147 | u32 alow, ahigh, high, low; | 95 | u32 alow, ahigh, high, low; |
148 | u32 mcgstl, mcgsth; | 96 | u32 mcgstl, mcgsth; |
97 | int recover = 1; | ||
149 | int i; | 98 | int i; |
150 | 99 | ||
151 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | 100 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); |
@@ -157,7 +106,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) | |||
157 | 106 | ||
158 | if (mce_num_extended_msrs > 0) { | 107 | if (mce_num_extended_msrs > 0) { |
159 | struct intel_mce_extended_msrs dbg; | 108 | struct intel_mce_extended_msrs dbg; |
109 | |||
160 | intel_get_extended_msrs(&dbg); | 110 | intel_get_extended_msrs(&dbg); |
111 | |||
161 | printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" | 112 | printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" |
162 | "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" | 113 | "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" |
163 | "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", | 114 | "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", |
@@ -171,6 +122,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) | |||
171 | if (high & (1<<31)) { | 122 | if (high & (1<<31)) { |
172 | char misc[20]; | 123 | char misc[20]; |
173 | char addr[24]; | 124 | char addr[24]; |
125 | |||
174 | misc[0] = addr[0] = '\0'; | 126 | misc[0] = addr[0] = '\0'; |
175 | if (high & (1<<29)) | 127 | if (high & (1<<29)) |
176 | recover |= 1; | 128 | recover |= 1; |
@@ -196,6 +148,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) | |||
196 | panic("Unable to continue"); | 148 | panic("Unable to continue"); |
197 | 149 | ||
198 | printk(KERN_EMERG "Attempting to continue.\n"); | 150 | printk(KERN_EMERG "Attempting to continue.\n"); |
151 | |||
199 | /* | 152 | /* |
200 | * Do not clear the MSR_IA32_MCi_STATUS if the error is not | 153 | * Do not clear the MSR_IA32_MCi_STATUS if the error is not |
201 | * recoverable/continuable.This will allow BIOS to look at the MSRs | 154 | * recoverable/continuable.This will allow BIOS to look at the MSRs |
@@ -217,7 +170,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) | |||
217 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | 170 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); |
218 | } | 171 | } |
219 | 172 | ||
220 | |||
221 | void intel_p4_mcheck_init(struct cpuinfo_x86 *c) | 173 | void intel_p4_mcheck_init(struct cpuinfo_x86 *c) |
222 | { | 174 | { |
223 | u32 l, h; | 175 | u32 l, h; |
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index c9f77ea69edc..015f481ab1b0 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c | |||
@@ -2,11 +2,10 @@ | |||
2 | * P5 specific Machine Check Exception Reporting | 2 | * P5 specific Machine Check Exception Reporting |
3 | * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> | 3 | * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> |
4 | */ | 4 | */ |
5 | |||
6 | #include <linux/init.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
6 | #include <linux/kernel.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/init.h> | ||
10 | #include <linux/smp.h> | 9 | #include <linux/smp.h> |
11 | 10 | ||
12 | #include <asm/processor.h> | 11 | #include <asm/processor.h> |
@@ -15,39 +14,58 @@ | |||
15 | 14 | ||
16 | #include "mce.h" | 15 | #include "mce.h" |
17 | 16 | ||
18 | /* Machine check handler for Pentium class Intel */ | 17 | /* By default disabled */ |
18 | int mce_p5_enable; | ||
19 | |||
20 | /* Machine check handler for Pentium class Intel CPUs: */ | ||
19 | static void pentium_machine_check(struct pt_regs *regs, long error_code) | 21 | static void pentium_machine_check(struct pt_regs *regs, long error_code) |
20 | { | 22 | { |
21 | u32 loaddr, hi, lotype; | 23 | u32 loaddr, hi, lotype; |
24 | |||
22 | rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); | 25 | rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); |
23 | rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); | 26 | rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); |
24 | printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype); | 27 | |
25 | if (lotype&(1<<5)) | 28 | printk(KERN_EMERG |
26 | printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id()); | 29 | "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", |
30 | smp_processor_id(), loaddr, lotype); | ||
31 | |||
32 | if (lotype & (1<<5)) { | ||
33 | printk(KERN_EMERG | ||
34 | "CPU#%d: Possible thermal failure (CPU on fire ?).\n", | ||
35 | smp_processor_id()); | ||
36 | } | ||
37 | |||
27 | add_taint(TAINT_MACHINE_CHECK); | 38 | add_taint(TAINT_MACHINE_CHECK); |
28 | } | 39 | } |
29 | 40 | ||
30 | /* Set up machine check reporting for processors with Intel style MCE */ | 41 | /* Set up machine check reporting for processors with Intel style MCE: */ |
31 | void intel_p5_mcheck_init(struct cpuinfo_x86 *c) | 42 | void intel_p5_mcheck_init(struct cpuinfo_x86 *c) |
32 | { | 43 | { |
33 | u32 l, h; | 44 | u32 l, h; |
34 | 45 | ||
35 | /*Check for MCE support */ | 46 | /* Check for MCE support: */ |
36 | if (!cpu_has(c, X86_FEATURE_MCE)) | 47 | if (!cpu_has(c, X86_FEATURE_MCE)) |
37 | return; | 48 | return; |
38 | 49 | ||
39 | /* Default P5 to off as its often misconnected */ | 50 | #ifdef CONFIG_X86_OLD_MCE |
51 | /* Default P5 to off as its often misconnected: */ | ||
40 | if (mce_disabled != -1) | 52 | if (mce_disabled != -1) |
41 | return; | 53 | return; |
54 | #endif | ||
55 | |||
42 | machine_check_vector = pentium_machine_check; | 56 | machine_check_vector = pentium_machine_check; |
57 | /* Make sure the vector pointer is visible before we enable MCEs: */ | ||
43 | wmb(); | 58 | wmb(); |
44 | 59 | ||
45 | /* Read registers before enabling */ | 60 | /* Read registers before enabling: */ |
46 | rdmsr(MSR_IA32_P5_MC_ADDR, l, h); | 61 | rdmsr(MSR_IA32_P5_MC_ADDR, l, h); |
47 | rdmsr(MSR_IA32_P5_MC_TYPE, l, h); | 62 | rdmsr(MSR_IA32_P5_MC_TYPE, l, h); |
48 | printk(KERN_INFO "Intel old style machine check architecture supported.\n"); | 63 | printk(KERN_INFO |
64 | "Intel old style machine check architecture supported.\n"); | ||
49 | 65 | ||
50 | /* Enable MCE */ | 66 | /* Enable MCE: */ |
51 | set_in_cr4(X86_CR4_MCE); | 67 | set_in_cr4(X86_CR4_MCE); |
52 | printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); | 68 | printk(KERN_INFO |
69 | "Intel old style machine check reporting enabled on CPU#%d.\n", | ||
70 | smp_processor_id()); | ||
53 | } | 71 | } |
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c index 2ac52d7b434b..43c24e667457 100644 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ b/arch/x86/kernel/cpu/mcheck/p6.c | |||
@@ -2,11 +2,10 @@ | |||
2 | * P6 specific Machine Check Exception Reporting | 2 | * P6 specific Machine Check Exception Reporting |
3 | * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> | 3 | * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> |
4 | */ | 4 | */ |
5 | |||
6 | #include <linux/init.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
6 | #include <linux/kernel.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/init.h> | ||
10 | #include <linux/smp.h> | 9 | #include <linux/smp.h> |
11 | 10 | ||
12 | #include <asm/processor.h> | 11 | #include <asm/processor.h> |
@@ -18,9 +17,9 @@ | |||
18 | /* Machine Check Handler For PII/PIII */ | 17 | /* Machine Check Handler For PII/PIII */ |
19 | static void intel_machine_check(struct pt_regs *regs, long error_code) | 18 | static void intel_machine_check(struct pt_regs *regs, long error_code) |
20 | { | 19 | { |
21 | int recover = 1; | ||
22 | u32 alow, ahigh, high, low; | 20 | u32 alow, ahigh, high, low; |
23 | u32 mcgstl, mcgsth; | 21 | u32 mcgstl, mcgsth; |
22 | int recover = 1; | ||
24 | int i; | 23 | int i; |
25 | 24 | ||
26 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | 25 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); |
@@ -35,12 +34,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) | |||
35 | if (high & (1<<31)) { | 34 | if (high & (1<<31)) { |
36 | char misc[20]; | 35 | char misc[20]; |
37 | char addr[24]; | 36 | char addr[24]; |
38 | misc[0] = addr[0] = '\0'; | 37 | |
38 | misc[0] = '\0'; | ||
39 | addr[0] = '\0'; | ||
40 | |||
39 | if (high & (1<<29)) | 41 | if (high & (1<<29)) |
40 | recover |= 1; | 42 | recover |= 1; |
41 | if (high & (1<<25)) | 43 | if (high & (1<<25)) |
42 | recover |= 2; | 44 | recover |= 2; |
43 | high &= ~(1<<31); | 45 | high &= ~(1<<31); |
46 | |||
44 | if (high & (1<<27)) { | 47 | if (high & (1<<27)) { |
45 | rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); | 48 | rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); |
46 | snprintf(misc, 20, "[%08x%08x]", ahigh, alow); | 49 | snprintf(misc, 20, "[%08x%08x]", ahigh, alow); |
@@ -49,6 +52,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) | |||
49 | rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); | 52 | rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); |
50 | snprintf(addr, 24, " at %08x%08x", ahigh, alow); | 53 | snprintf(addr, 24, " at %08x%08x", ahigh, alow); |
51 | } | 54 | } |
55 | |||
52 | printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", | 56 | printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", |
53 | smp_processor_id(), i, high, low, misc, addr); | 57 | smp_processor_id(), i, high, low, misc, addr); |
54 | } | 58 | } |
@@ -63,16 +67,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) | |||
63 | /* | 67 | /* |
64 | * Do not clear the MSR_IA32_MCi_STATUS if the error is not | 68 | * Do not clear the MSR_IA32_MCi_STATUS if the error is not |
65 | * recoverable/continuable.This will allow BIOS to look at the MSRs | 69 | * recoverable/continuable.This will allow BIOS to look at the MSRs |
66 | * for errors if the OS could not log the error. | 70 | * for errors if the OS could not log the error: |
67 | */ | 71 | */ |
68 | for (i = 0; i < nr_mce_banks; i++) { | 72 | for (i = 0; i < nr_mce_banks; i++) { |
69 | unsigned int msr; | 73 | unsigned int msr; |
74 | |||
70 | msr = MSR_IA32_MC0_STATUS+i*4; | 75 | msr = MSR_IA32_MC0_STATUS+i*4; |
71 | rdmsr(msr, low, high); | 76 | rdmsr(msr, low, high); |
72 | if (high & (1<<31)) { | 77 | if (high & (1<<31)) { |
73 | /* Clear it */ | 78 | /* Clear it: */ |
74 | wrmsr(msr, 0UL, 0UL); | 79 | wrmsr(msr, 0UL, 0UL); |
75 | /* Serialize */ | 80 | /* Serialize: */ |
76 | wmb(); | 81 | wmb(); |
77 | add_taint(TAINT_MACHINE_CHECK); | 82 | add_taint(TAINT_MACHINE_CHECK); |
78 | } | 83 | } |
@@ -81,7 +86,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) | |||
81 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | 86 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); |
82 | } | 87 | } |
83 | 88 | ||
84 | /* Set up machine check reporting for processors with Intel style MCE */ | 89 | /* Set up machine check reporting for processors with Intel style MCE: */ |
85 | void intel_p6_mcheck_init(struct cpuinfo_x86 *c) | 90 | void intel_p6_mcheck_init(struct cpuinfo_x86 *c) |
86 | { | 91 | { |
87 | u32 l, h; | 92 | u32 l, h; |
@@ -97,6 +102,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c) | |||
97 | 102 | ||
98 | /* Ok machine check is available */ | 103 | /* Ok machine check is available */ |
99 | machine_check_vector = intel_machine_check; | 104 | machine_check_vector = intel_machine_check; |
105 | /* Make sure the vector pointer is visible before we enable MCEs: */ | ||
100 | wmb(); | 106 | wmb(); |
101 | 107 | ||
102 | printk(KERN_INFO "Intel machine check architecture supported.\n"); | 108 | printk(KERN_INFO "Intel machine check architecture supported.\n"); |
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d5ae2243f0b9..7b1ae2e20ba5 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * | ||
3 | * Thermal throttle event support code (such as syslog messaging and rate | 2 | * Thermal throttle event support code (such as syslog messaging and rate |
4 | * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). | 3 | * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). |
4 | * | ||
5 | * This allows consistent reporting of CPU thermal throttle events. | 5 | * This allows consistent reporting of CPU thermal throttle events. |
6 | * | 6 | * |
7 | * Maintains a counter in /sys that keeps track of the number of thermal | 7 | * Maintains a counter in /sys that keeps track of the number of thermal |
@@ -13,43 +13,43 @@ | |||
13 | * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. | 13 | * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. |
14 | * Inspired by Ross Biro's and Al Borchers' counter code. | 14 | * Inspired by Ross Biro's and Al Borchers' counter code. |
15 | */ | 15 | */ |
16 | 16 | #include <linux/notifier.h> | |
17 | #include <linux/jiffies.h> | ||
17 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
18 | #include <linux/sysdev.h> | 19 | #include <linux/sysdev.h> |
19 | #include <linux/cpu.h> | 20 | #include <linux/cpu.h> |
20 | #include <asm/cpu.h> | 21 | |
21 | #include <linux/notifier.h> | ||
22 | #include <linux/jiffies.h> | ||
23 | #include <asm/therm_throt.h> | 22 | #include <asm/therm_throt.h> |
24 | 23 | ||
25 | /* How long to wait between reporting thermal events */ | 24 | /* How long to wait between reporting thermal events */ |
26 | #define CHECK_INTERVAL (300 * HZ) | 25 | #define CHECK_INTERVAL (300 * HZ) |
27 | 26 | ||
28 | static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; | 27 | static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; |
29 | static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); | 28 | static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); |
30 | atomic_t therm_throt_en = ATOMIC_INIT(0); | 29 | |
30 | atomic_t therm_throt_en = ATOMIC_INIT(0); | ||
31 | 31 | ||
32 | #ifdef CONFIG_SYSFS | 32 | #ifdef CONFIG_SYSFS |
33 | #define define_therm_throt_sysdev_one_ro(_name) \ | 33 | #define define_therm_throt_sysdev_one_ro(_name) \ |
34 | static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) | 34 | static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) |
35 | 35 | ||
36 | #define define_therm_throt_sysdev_show_func(name) \ | 36 | #define define_therm_throt_sysdev_show_func(name) \ |
37 | static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ | 37 | static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ |
38 | struct sysdev_attribute *attr, \ | 38 | struct sysdev_attribute *attr, \ |
39 | char *buf) \ | 39 | char *buf) \ |
40 | { \ | 40 | { \ |
41 | unsigned int cpu = dev->id; \ | 41 | unsigned int cpu = dev->id; \ |
42 | ssize_t ret; \ | 42 | ssize_t ret; \ |
43 | \ | 43 | \ |
44 | preempt_disable(); /* CPU hotplug */ \ | 44 | preempt_disable(); /* CPU hotplug */ \ |
45 | if (cpu_online(cpu)) \ | 45 | if (cpu_online(cpu)) \ |
46 | ret = sprintf(buf, "%lu\n", \ | 46 | ret = sprintf(buf, "%lu\n", \ |
47 | per_cpu(thermal_throttle_##name, cpu)); \ | 47 | per_cpu(thermal_throttle_##name, cpu)); \ |
48 | else \ | 48 | else \ |
49 | ret = 0; \ | 49 | ret = 0; \ |
50 | preempt_enable(); \ | 50 | preempt_enable(); \ |
51 | \ | 51 | \ |
52 | return ret; \ | 52 | return ret; \ |
53 | } | 53 | } |
54 | 54 | ||
55 | define_therm_throt_sysdev_show_func(count); | 55 | define_therm_throt_sysdev_show_func(count); |
@@ -61,8 +61,8 @@ static struct attribute *thermal_throttle_attrs[] = { | |||
61 | }; | 61 | }; |
62 | 62 | ||
63 | static struct attribute_group thermal_throttle_attr_group = { | 63 | static struct attribute_group thermal_throttle_attr_group = { |
64 | .attrs = thermal_throttle_attrs, | 64 | .attrs = thermal_throttle_attrs, |
65 | .name = "thermal_throttle" | 65 | .name = "thermal_throttle" |
66 | }; | 66 | }; |
67 | #endif /* CONFIG_SYSFS */ | 67 | #endif /* CONFIG_SYSFS */ |
68 | 68 | ||
@@ -110,10 +110,11 @@ int therm_throt_process(int curr) | |||
110 | } | 110 | } |
111 | 111 | ||
112 | #ifdef CONFIG_SYSFS | 112 | #ifdef CONFIG_SYSFS |
113 | /* Add/Remove thermal_throttle interface for CPU device */ | 113 | /* Add/Remove thermal_throttle interface for CPU device: */ |
114 | static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) | 114 | static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) |
115 | { | 115 | { |
116 | return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); | 116 | return sysfs_create_group(&sys_dev->kobj, |
117 | &thermal_throttle_attr_group); | ||
117 | } | 118 | } |
118 | 119 | ||
119 | static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) | 120 | static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) |
@@ -121,19 +122,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) | |||
121 | sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); | 122 | sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); |
122 | } | 123 | } |
123 | 124 | ||
124 | /* Mutex protecting device creation against CPU hotplug */ | 125 | /* Mutex protecting device creation against CPU hotplug: */ |
125 | static DEFINE_MUTEX(therm_cpu_lock); | 126 | static DEFINE_MUTEX(therm_cpu_lock); |
126 | 127 | ||
127 | /* Get notified when a cpu comes on/off. Be hotplug friendly. */ | 128 | /* Get notified when a cpu comes on/off. Be hotplug friendly. */ |
128 | static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, | 129 | static __cpuinit int |
129 | unsigned long action, | 130 | thermal_throttle_cpu_callback(struct notifier_block *nfb, |
130 | void *hcpu) | 131 | unsigned long action, |
132 | void *hcpu) | ||
131 | { | 133 | { |
132 | unsigned int cpu = (unsigned long)hcpu; | 134 | unsigned int cpu = (unsigned long)hcpu; |
133 | struct sys_device *sys_dev; | 135 | struct sys_device *sys_dev; |
134 | int err = 0; | 136 | int err = 0; |
135 | 137 | ||
136 | sys_dev = get_cpu_sysdev(cpu); | 138 | sys_dev = get_cpu_sysdev(cpu); |
139 | |||
137 | switch (action) { | 140 | switch (action) { |
138 | case CPU_UP_PREPARE: | 141 | case CPU_UP_PREPARE: |
139 | case CPU_UP_PREPARE_FROZEN: | 142 | case CPU_UP_PREPARE_FROZEN: |
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 23ee9e730f78..d746df2909c9 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c | |||
@@ -17,7 +17,7 @@ static void default_threshold_interrupt(void) | |||
17 | 17 | ||
18 | void (*mce_threshold_vector)(void) = default_threshold_interrupt; | 18 | void (*mce_threshold_vector)(void) = default_threshold_interrupt; |
19 | 19 | ||
20 | asmlinkage void mce_threshold_interrupt(void) | 20 | asmlinkage void smp_threshold_interrupt(void) |
21 | { | 21 | { |
22 | exit_idle(); | 22 | exit_idle(); |
23 | irq_enter(); | 23 | irq_enter(); |
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 2a043d89811d..81b02487090b 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c | |||
@@ -2,11 +2,10 @@ | |||
2 | * IDT Winchip specific Machine Check Exception Reporting | 2 | * IDT Winchip specific Machine Check Exception Reporting |
3 | * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> | 3 | * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> |
4 | */ | 4 | */ |
5 | |||
6 | #include <linux/init.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
6 | #include <linux/kernel.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/init.h> | ||
10 | 9 | ||
11 | #include <asm/processor.h> | 10 | #include <asm/processor.h> |
12 | #include <asm/system.h> | 11 | #include <asm/system.h> |
@@ -14,7 +13,7 @@ | |||
14 | 13 | ||
15 | #include "mce.h" | 14 | #include "mce.h" |
16 | 15 | ||
17 | /* Machine check handler for WinChip C6 */ | 16 | /* Machine check handler for WinChip C6: */ |
18 | static void winchip_machine_check(struct pt_regs *regs, long error_code) | 17 | static void winchip_machine_check(struct pt_regs *regs, long error_code) |
19 | { | 18 | { |
20 | printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); | 19 | printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); |
@@ -25,12 +24,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code) | |||
25 | void winchip_mcheck_init(struct cpuinfo_x86 *c) | 24 | void winchip_mcheck_init(struct cpuinfo_x86 *c) |
26 | { | 25 | { |
27 | u32 lo, hi; | 26 | u32 lo, hi; |
27 | |||
28 | machine_check_vector = winchip_machine_check; | 28 | machine_check_vector = winchip_machine_check; |
29 | /* Make sure the vector pointer is visible before we enable MCEs: */ | ||
29 | wmb(); | 30 | wmb(); |
31 | |||
30 | rdmsr(MSR_IDT_FCR1, lo, hi); | 32 | rdmsr(MSR_IDT_FCR1, lo, hi); |
31 | lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ | 33 | lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ |
32 | lo &= ~(1<<4); /* Enable MCE */ | 34 | lo &= ~(1<<4); /* Enable MCE */ |
33 | wrmsr(MSR_IDT_FCR1, lo, hi); | 35 | wrmsr(MSR_IDT_FCR1, lo, hi); |
36 | |||
34 | set_in_cr4(X86_CR4_MCE); | 37 | set_in_cr4(X86_CR4_MCE); |
35 | printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); | 38 | |
39 | printk(KERN_INFO | ||
40 | "Winchip machine check reporting enabled on CPU#0.\n"); | ||
36 | } | 41 | } |
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 895c82e78455..275bc142cd5d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c | |||
@@ -968,6 +968,13 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) | |||
968 | if (!x86_pmu.num_counters_fixed) | 968 | if (!x86_pmu.num_counters_fixed) |
969 | return -1; | 969 | return -1; |
970 | 970 | ||
971 | /* | ||
972 | * Quirk, IA32_FIXED_CTRs do not work on current Atom processors: | ||
973 | */ | ||
974 | if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && | ||
975 | boot_cpu_data.x86_model == 28) | ||
976 | return -1; | ||
977 | |||
971 | event = hwc->config & ARCH_PERFMON_EVENT_MASK; | 978 | event = hwc->config & ARCH_PERFMON_EVENT_MASK; |
972 | 979 | ||
973 | if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) | 980 | if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) |
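Note on the quirk above: boot_cpu_data.x86_model is the "display" model, which for family-6 parts folds CPUID leaf 1's extended-model bits into the base model (0x1C = 28 for these Atoms). A small userspace sketch of that derivation, not part of the patch and purely illustrative:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int family, model;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;

	family = (eax >> 8) & 0xf;
	model  = (eax >> 4) & 0xf;
	/* Family 6 (and 0xf) parts fold in the extended model, bits 19:16. */
	if (family == 6 || family == 0xf)
		model += ((eax >> 16) & 0xf) << 4;

	printf("family %u, model %u%s\n", family, model,
	       model == 28 ? " (Atom - fixed counters quirked off)" : "");
	return 0;
}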
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 2ac1f0c2beb3..b07af8861244 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c | |||
@@ -182,6 +182,11 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier = | |||
182 | .notifier_call = cpuid_class_cpu_callback, | 182 | .notifier_call = cpuid_class_cpu_callback, |
183 | }; | 183 | }; |
184 | 184 | ||
185 | static char *cpuid_nodename(struct device *dev) | ||
186 | { | ||
187 | return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); | ||
188 | } | ||
189 | |||
185 | static int __init cpuid_init(void) | 190 | static int __init cpuid_init(void) |
186 | { | 191 | { |
187 | int i, err = 0; | 192 | int i, err = 0; |
@@ -198,6 +203,7 @@ static int __init cpuid_init(void) | |||
198 | err = PTR_ERR(cpuid_class); | 203 | err = PTR_ERR(cpuid_class); |
199 | goto out_chrdev; | 204 | goto out_chrdev; |
200 | } | 205 | } |
206 | cpuid_class->nodename = cpuid_nodename; | ||
201 | for_each_online_cpu(i) { | 207 | for_each_online_cpu(i) { |
202 | err = cpuid_device_create(i); | 208 | err = cpuid_device_create(i); |
203 | if (err != 0) | 209 | if (err != 0) |
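The nodename callback only tells devtmpfs where to place the node (cpu/<n>/cpuid under /dev); the character device's read interface is unchanged: the file offset selects the CPUID leaf and each read returns the four registers. A rough userspace usage sketch, assuming /dev/cpu/0/cpuid exists and is readable:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	uint32_t regs[4];	/* eax, ebx, ecx, edx of the requested leaf */
	int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/cpu/0/cpuid");
		return 1;
	}
	/* The file offset selects the CPUID leaf; each read returns 16 bytes. */
	if (pread(fd, regs, sizeof(regs), 0) != (ssize_t)sizeof(regs)) {
		perror("pread");
		close(fd);
		return 1;
	}
	printf("leaf 0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
	       regs[0], regs[1], regs[2], regs[3]);
	close(fd);
	return 0;
}

This needs suitable permissions and a kernel with the cpuid driver loaded.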
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a4742a340d8d..de74f0a3e0ed 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -963,6 +963,8 @@ END(\sym) | |||
963 | #ifdef CONFIG_SMP | 963 | #ifdef CONFIG_SMP |
964 | apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ | 964 | apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ |
965 | irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt | 965 | irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt |
966 | apicinterrupt REBOOT_VECTOR \ | ||
967 | reboot_interrupt smp_reboot_interrupt | ||
966 | #endif | 968 | #endif |
967 | 969 | ||
968 | #ifdef CONFIG_X86_UV | 970 | #ifdef CONFIG_X86_UV |
@@ -994,10 +996,15 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ | |||
994 | #endif | 996 | #endif |
995 | 997 | ||
996 | apicinterrupt THRESHOLD_APIC_VECTOR \ | 998 | apicinterrupt THRESHOLD_APIC_VECTOR \ |
997 | threshold_interrupt mce_threshold_interrupt | 999 | threshold_interrupt smp_threshold_interrupt |
998 | apicinterrupt THERMAL_APIC_VECTOR \ | 1000 | apicinterrupt THERMAL_APIC_VECTOR \ |
999 | thermal_interrupt smp_thermal_interrupt | 1001 | thermal_interrupt smp_thermal_interrupt |
1000 | 1002 | ||
1003 | #ifdef CONFIG_X86_MCE | ||
1004 | apicinterrupt MCE_SELF_VECTOR \ | ||
1005 | mce_self_interrupt smp_mce_self_interrupt | ||
1006 | #endif | ||
1007 | |||
1001 | #ifdef CONFIG_SMP | 1008 | #ifdef CONFIG_SMP |
1002 | apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ | 1009 | apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ |
1003 | call_function_single_interrupt smp_call_function_single_interrupt | 1010 | call_function_single_interrupt smp_call_function_single_interrupt |
@@ -1379,7 +1386,7 @@ errorentry xen_stack_segment do_stack_segment | |||
1379 | errorentry general_protection do_general_protection | 1386 | errorentry general_protection do_general_protection |
1380 | errorentry page_fault do_page_fault | 1387 | errorentry page_fault do_page_fault |
1381 | #ifdef CONFIG_X86_MCE | 1388 | #ifdef CONFIG_X86_MCE |
1382 | paranoidzeroentry machine_check do_machine_check | 1389 | paranoidzeroentry machine_check *machine_check_vector(%rip) |
1383 | #endif | 1390 | #endif |
1384 | 1391 | ||
1385 | /* | 1392 | /* |
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index c2e0bb0890d4..5cf36c053ac4 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c | |||
@@ -7,6 +7,7 @@ | |||
7 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
8 | #include <linux/jiffies.h> | 8 | #include <linux/jiffies.h> |
9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
10 | #include <linux/timex.h> | ||
10 | #include <linux/delay.h> | 11 | #include <linux/delay.h> |
11 | #include <linux/init.h> | 12 | #include <linux/init.h> |
12 | #include <linux/io.h> | 13 | #include <linux/io.h> |
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index df3bf269beab..270ff83efc11 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c | |||
@@ -12,7 +12,6 @@ | |||
12 | 12 | ||
13 | static struct signal_struct init_signals = INIT_SIGNALS(init_signals); | 13 | static struct signal_struct init_signals = INIT_SIGNALS(init_signals); |
14 | static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); | 14 | static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); |
15 | struct mm_struct init_mm = INIT_MM(init_mm); | ||
16 | 15 | ||
17 | /* | 16 | /* |
18 | * Initial thread structure. | 17 | * Initial thread structure. |
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 38287b5f116e..b0cdde6932f5 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <asm/io_apic.h> | 12 | #include <asm/io_apic.h> |
13 | #include <asm/irq.h> | 13 | #include <asm/irq.h> |
14 | #include <asm/idle.h> | 14 | #include <asm/idle.h> |
15 | #include <asm/mce.h> | ||
15 | #include <asm/hw_irq.h> | 16 | #include <asm/hw_irq.h> |
16 | 17 | ||
17 | atomic_t irq_err_count; | 18 | atomic_t irq_err_count; |
@@ -96,13 +97,23 @@ static int show_other_interrupts(struct seq_file *p, int prec) | |||
96 | for_each_online_cpu(j) | 97 | for_each_online_cpu(j) |
97 | seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); | 98 | seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); |
98 | seq_printf(p, " Thermal event interrupts\n"); | 99 | seq_printf(p, " Thermal event interrupts\n"); |
99 | # ifdef CONFIG_X86_64 | 100 | # ifdef CONFIG_X86_MCE_THRESHOLD |
100 | seq_printf(p, "%*s: ", prec, "THR"); | 101 | seq_printf(p, "%*s: ", prec, "THR"); |
101 | for_each_online_cpu(j) | 102 | for_each_online_cpu(j) |
102 | seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); | 103 | seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); |
103 | seq_printf(p, " Threshold APIC interrupts\n"); | 104 | seq_printf(p, " Threshold APIC interrupts\n"); |
104 | # endif | 105 | # endif |
105 | #endif | 106 | #endif |
107 | #ifdef CONFIG_X86_NEW_MCE | ||
108 | seq_printf(p, "%*s: ", prec, "MCE"); | ||
109 | for_each_online_cpu(j) | ||
110 | seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); | ||
111 | seq_printf(p, " Machine check exceptions\n"); | ||
112 | seq_printf(p, "%*s: ", prec, "MCP"); | ||
113 | for_each_online_cpu(j) | ||
114 | seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); | ||
115 | seq_printf(p, " Machine check polls\n"); | ||
116 | #endif | ||
106 | seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); | 117 | seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); |
107 | #if defined(CONFIG_X86_IO_APIC) | 118 | #if defined(CONFIG_X86_IO_APIC) |
108 | seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); | 119 | seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); |
@@ -185,10 +196,14 @@ u64 arch_irq_stat_cpu(unsigned int cpu) | |||
185 | #endif | 196 | #endif |
186 | #ifdef CONFIG_X86_MCE | 197 | #ifdef CONFIG_X86_MCE |
187 | sum += irq_stats(cpu)->irq_thermal_count; | 198 | sum += irq_stats(cpu)->irq_thermal_count; |
188 | # ifdef CONFIG_X86_64 | 199 | # ifdef CONFIG_X86_MCE_THRESHOLD |
189 | sum += irq_stats(cpu)->irq_threshold_count; | 200 | sum += irq_stats(cpu)->irq_threshold_count; |
190 | # endif | 201 | # endif |
191 | #endif | 202 | #endif |
203 | #ifdef CONFIG_X86_NEW_MCE | ||
204 | sum += per_cpu(mce_exception_count, cpu); | ||
205 | sum += per_cpu(mce_poll_count, cpu); | ||
206 | #endif | ||
192 | return sum; | 207 | return sum; |
193 | } | 208 | } |
194 | 209 | ||
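With these rows in place, the per-CPU machine-check exception and poll counts become visible in /proc/interrupts under the MCE and MCP labels printed above. A minimal reader for just those rows, purely illustrative:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/interrupts", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* The patch adds per-CPU "MCE:" and "MCP:" rows. */
		if (strstr(line, "MCE:") || strstr(line, "MCP:"))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}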
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 267c6624c77f..696f0e475c2d 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c | |||
@@ -173,6 +173,9 @@ static void __init smp_intr_init(void) | |||
173 | /* Low priority IPI to cleanup after moving an irq */ | 173 | /* Low priority IPI to cleanup after moving an irq */ |
174 | set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); | 174 | set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); |
175 | set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); | 175 | set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); |
176 | |||
177 | /* IPI used for rebooting/stopping */ | ||
178 | alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt); | ||
176 | #endif | 179 | #endif |
177 | #endif /* CONFIG_SMP */ | 180 | #endif /* CONFIG_SMP */ |
178 | } | 181 | } |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 6551dedee20c..a78ecad0c900 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
28 | #include <linux/highmem.h> | 28 | #include <linux/highmem.h> |
29 | #include <linux/hardirq.h> | 29 | #include <linux/hardirq.h> |
30 | #include <asm/timer.h> | ||
30 | 31 | ||
31 | #define MMU_QUEUE_SIZE 1024 | 32 | #define MMU_QUEUE_SIZE 1024 |
32 | 33 | ||
@@ -230,6 +231,9 @@ static void paravirt_ops_setup(void) | |||
230 | pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; | 231 | pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; |
231 | pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; | 232 | pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; |
232 | } | 233 | } |
234 | #ifdef CONFIG_X86_IO_APIC | ||
235 | no_timer_check = 1; | ||
236 | #endif | ||
233 | } | 237 | } |
234 | 238 | ||
235 | void __init kvm_guest_init(void) | 239 | void __init kvm_guest_init(void) |
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9c4461501fcb..9371448290ac 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c | |||
@@ -236,6 +236,7 @@ static const struct file_operations microcode_fops = { | |||
236 | static struct miscdevice microcode_dev = { | 236 | static struct miscdevice microcode_dev = { |
237 | .minor = MICROCODE_MINOR, | 237 | .minor = MICROCODE_MINOR, |
238 | .name = "microcode", | 238 | .name = "microcode", |
239 | .devnode = "cpu/microcode", | ||
239 | .fops = &microcode_fops, | 240 | .fops = &microcode_fops, |
240 | }; | 241 | }; |
241 | 242 | ||
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module.c index c23880b90b5c..89f386f044e4 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module.c | |||
@@ -1,6 +1,5 @@ | |||
1 | /* Kernel module help for x86-64 | 1 | /* Kernel module help for x86. |
2 | Copyright (C) 2001 Rusty Russell. | 2 | Copyright (C) 2001 Rusty Russell. |
3 | Copyright (C) 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | 3 | ||
5 | This program is free software; you can redistribute it and/or modify | 4 | This program is free software; you can redistribute it and/or modify |
6 | it under the terms of the GNU General Public License as published by | 5 | it under the terms of the GNU General Public License as published by |
@@ -22,23 +21,18 @@ | |||
22 | #include <linux/fs.h> | 21 | #include <linux/fs.h> |
23 | #include <linux/string.h> | 22 | #include <linux/string.h> |
24 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
25 | #include <linux/mm.h> | ||
26 | #include <linux/slab.h> | ||
27 | #include <linux/bug.h> | 24 | #include <linux/bug.h> |
25 | #include <linux/mm.h> | ||
28 | 26 | ||
29 | #include <asm/system.h> | 27 | #include <asm/system.h> |
30 | #include <asm/page.h> | 28 | #include <asm/page.h> |
31 | #include <asm/pgtable.h> | 29 | #include <asm/pgtable.h> |
32 | 30 | ||
31 | #if 0 | ||
32 | #define DEBUGP printk | ||
33 | #else | ||
33 | #define DEBUGP(fmt...) | 34 | #define DEBUGP(fmt...) |
34 | 35 | #endif | |
35 | #ifndef CONFIG_UML | ||
36 | void module_free(struct module *mod, void *module_region) | ||
37 | { | ||
38 | vfree(module_region); | ||
39 | /* FIXME: If module_region == mod->init_region, trim exception | ||
40 | table entries. */ | ||
41 | } | ||
42 | 36 | ||
43 | void *module_alloc(unsigned long size) | 37 | void *module_alloc(unsigned long size) |
44 | { | 38 | { |
@@ -54,9 +48,15 @@ void *module_alloc(unsigned long size) | |||
54 | if (!area) | 48 | if (!area) |
55 | return NULL; | 49 | return NULL; |
56 | 50 | ||
57 | return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); | 51 | return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, |
52 | PAGE_KERNEL_EXEC); | ||
53 | } | ||
54 | |||
55 | /* Free memory returned from module_alloc */ | ||
56 | void module_free(struct module *mod, void *module_region) | ||
57 | { | ||
58 | vfree(module_region); | ||
58 | } | 59 | } |
59 | #endif | ||
60 | 60 | ||
61 | /* We don't need anything special. */ | 61 | /* We don't need anything special. */ |
62 | int module_frob_arch_sections(Elf_Ehdr *hdr, | 62 | int module_frob_arch_sections(Elf_Ehdr *hdr, |
@@ -67,6 +67,58 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, | |||
67 | return 0; | 67 | return 0; |
68 | } | 68 | } |
69 | 69 | ||
70 | #ifdef CONFIG_X86_32 | ||
71 | int apply_relocate(Elf32_Shdr *sechdrs, | ||
72 | const char *strtab, | ||
73 | unsigned int symindex, | ||
74 | unsigned int relsec, | ||
75 | struct module *me) | ||
76 | { | ||
77 | unsigned int i; | ||
78 | Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; | ||
79 | Elf32_Sym *sym; | ||
80 | uint32_t *location; | ||
81 | |||
82 | DEBUGP("Applying relocate section %u to %u\n", relsec, | ||
83 | sechdrs[relsec].sh_info); | ||
84 | for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { | ||
85 | /* This is where to make the change */ | ||
86 | location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr | ||
87 | + rel[i].r_offset; | ||
88 | /* This is the symbol it is referring to. Note that all | ||
89 | undefined symbols have been resolved. */ | ||
90 | sym = (Elf32_Sym *)sechdrs[symindex].sh_addr | ||
91 | + ELF32_R_SYM(rel[i].r_info); | ||
92 | |||
93 | switch (ELF32_R_TYPE(rel[i].r_info)) { | ||
94 | case R_386_32: | ||
95 | /* We add the value into the location given */ | ||
96 | *location += sym->st_value; | ||
97 | break; | ||
98 | case R_386_PC32: | ||
99 | /* Add the value, subtract its position */ | ||
100 | *location += sym->st_value - (uint32_t)location; | ||
101 | break; | ||
102 | default: | ||
103 | printk(KERN_ERR "module %s: Unknown relocation: %u\n", | ||
104 | me->name, ELF32_R_TYPE(rel[i].r_info)); | ||
105 | return -ENOEXEC; | ||
106 | } | ||
107 | } | ||
108 | return 0; | ||
109 | } | ||
110 | |||
111 | int apply_relocate_add(Elf32_Shdr *sechdrs, | ||
112 | const char *strtab, | ||
113 | unsigned int symindex, | ||
114 | unsigned int relsec, | ||
115 | struct module *me) | ||
116 | { | ||
117 | printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", | ||
118 | me->name); | ||
119 | return -ENOEXEC; | ||
120 | } | ||
121 | #else /*X86_64*/ | ||
70 | int apply_relocate_add(Elf64_Shdr *sechdrs, | 122 | int apply_relocate_add(Elf64_Shdr *sechdrs, |
71 | const char *strtab, | 123 | const char *strtab, |
72 | unsigned int symindex, | 124 | unsigned int symindex, |
@@ -147,6 +199,8 @@ int apply_relocate(Elf_Shdr *sechdrs, | |||
147 | return -ENOSYS; | 199 | return -ENOSYS; |
148 | } | 200 | } |
149 | 201 | ||
202 | #endif | ||
203 | |||
150 | int module_finalize(const Elf_Ehdr *hdr, | 204 | int module_finalize(const Elf_Ehdr *hdr, |
151 | const Elf_Shdr *sechdrs, | 205 | const Elf_Shdr *sechdrs, |
152 | struct module *me) | 206 | struct module *me) |
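The two 32-bit relocation cases moved into the unified module.c implement the usual ELF REL formulas, with the addend stored in place: S + A for R_386_32 and S + A - P for R_386_PC32. A toy calculation with invented addresses, just to make the arithmetic concrete:

#include <stdio.h>

int main(void)
{
	/* Invented numbers: the symbol resolved to S, the patched word sits
	 * at address P inside the loaded section and already holds the
	 * addend A (ELF REL relocations store the addend in place). */
	unsigned int S = 0x10002000;	/* sym->st_value */
	unsigned int P = 0x10000040;	/* (uint32_t)location */
	unsigned int A = 0x4;		/* *location before patching */

	printf("R_386_32:   *location = %#x\n", A + S);		/* S + A */
	printf("R_386_PC32: *location = %#x\n", A + S - P);	/* S + A - P */
	return 0;
}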
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c deleted file mode 100644 index 0edd819050e7..000000000000 --- a/arch/x86/kernel/module_32.c +++ /dev/null | |||
@@ -1,152 +0,0 @@ | |||
1 | /* Kernel module help for i386. | ||
2 | Copyright (C) 2001 Rusty Russell. | ||
3 | |||
4 | This program is free software; you can redistribute it and/or modify | ||
5 | it under the terms of the GNU General Public License as published by | ||
6 | the Free Software Foundation; either version 2 of the License, or | ||
7 | (at your option) any later version. | ||
8 | |||
9 | This program is distributed in the hope that it will be useful, | ||
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | GNU General Public License for more details. | ||
13 | |||
14 | You should have received a copy of the GNU General Public License | ||
15 | along with this program; if not, write to the Free Software | ||
16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #include <linux/moduleloader.h> | ||
19 | #include <linux/elf.h> | ||
20 | #include <linux/vmalloc.h> | ||
21 | #include <linux/fs.h> | ||
22 | #include <linux/string.h> | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/bug.h> | ||
25 | |||
26 | #if 0 | ||
27 | #define DEBUGP printk | ||
28 | #else | ||
29 | #define DEBUGP(fmt...) | ||
30 | #endif | ||
31 | |||
32 | void *module_alloc(unsigned long size) | ||
33 | { | ||
34 | if (size == 0) | ||
35 | return NULL; | ||
36 | return vmalloc_exec(size); | ||
37 | } | ||
38 | |||
39 | |||
40 | /* Free memory returned from module_alloc */ | ||
41 | void module_free(struct module *mod, void *module_region) | ||
42 | { | ||
43 | vfree(module_region); | ||
44 | /* FIXME: If module_region == mod->init_region, trim exception | ||
45 | table entries. */ | ||
46 | } | ||
47 | |||
48 | /* We don't need anything special. */ | ||
49 | int module_frob_arch_sections(Elf_Ehdr *hdr, | ||
50 | Elf_Shdr *sechdrs, | ||
51 | char *secstrings, | ||
52 | struct module *mod) | ||
53 | { | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | int apply_relocate(Elf32_Shdr *sechdrs, | ||
58 | const char *strtab, | ||
59 | unsigned int symindex, | ||
60 | unsigned int relsec, | ||
61 | struct module *me) | ||
62 | { | ||
63 | unsigned int i; | ||
64 | Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; | ||
65 | Elf32_Sym *sym; | ||
66 | uint32_t *location; | ||
67 | |||
68 | DEBUGP("Applying relocate section %u to %u\n", relsec, | ||
69 | sechdrs[relsec].sh_info); | ||
70 | for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { | ||
71 | /* This is where to make the change */ | ||
72 | location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr | ||
73 | + rel[i].r_offset; | ||
74 | /* This is the symbol it is referring to. Note that all | ||
75 | undefined symbols have been resolved. */ | ||
76 | sym = (Elf32_Sym *)sechdrs[symindex].sh_addr | ||
77 | + ELF32_R_SYM(rel[i].r_info); | ||
78 | |||
79 | switch (ELF32_R_TYPE(rel[i].r_info)) { | ||
80 | case R_386_32: | ||
81 | /* We add the value into the location given */ | ||
82 | *location += sym->st_value; | ||
83 | break; | ||
84 | case R_386_PC32: | ||
85 | /* Add the value, subtract its postition */ | ||
86 | *location += sym->st_value - (uint32_t)location; | ||
87 | break; | ||
88 | default: | ||
89 | printk(KERN_ERR "module %s: Unknown relocation: %u\n", | ||
90 | me->name, ELF32_R_TYPE(rel[i].r_info)); | ||
91 | return -ENOEXEC; | ||
92 | } | ||
93 | } | ||
94 | return 0; | ||
95 | } | ||
96 | |||
97 | int apply_relocate_add(Elf32_Shdr *sechdrs, | ||
98 | const char *strtab, | ||
99 | unsigned int symindex, | ||
100 | unsigned int relsec, | ||
101 | struct module *me) | ||
102 | { | ||
103 | printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", | ||
104 | me->name); | ||
105 | return -ENOEXEC; | ||
106 | } | ||
107 | |||
108 | int module_finalize(const Elf_Ehdr *hdr, | ||
109 | const Elf_Shdr *sechdrs, | ||
110 | struct module *me) | ||
111 | { | ||
112 | const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, | ||
113 | *para = NULL; | ||
114 | char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; | ||
115 | |||
116 | for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { | ||
117 | if (!strcmp(".text", secstrings + s->sh_name)) | ||
118 | text = s; | ||
119 | if (!strcmp(".altinstructions", secstrings + s->sh_name)) | ||
120 | alt = s; | ||
121 | if (!strcmp(".smp_locks", secstrings + s->sh_name)) | ||
122 | locks = s; | ||
123 | if (!strcmp(".parainstructions", secstrings + s->sh_name)) | ||
124 | para = s; | ||
125 | } | ||
126 | |||
127 | if (alt) { | ||
128 | /* patch .altinstructions */ | ||
129 | void *aseg = (void *)alt->sh_addr; | ||
130 | apply_alternatives(aseg, aseg + alt->sh_size); | ||
131 | } | ||
132 | if (locks && text) { | ||
133 | void *lseg = (void *)locks->sh_addr; | ||
134 | void *tseg = (void *)text->sh_addr; | ||
135 | alternatives_smp_module_add(me, me->name, | ||
136 | lseg, lseg + locks->sh_size, | ||
137 | tseg, tseg + text->sh_size); | ||
138 | } | ||
139 | |||
140 | if (para) { | ||
141 | void *pseg = (void *)para->sh_addr; | ||
142 | apply_paravirt(pseg, pseg + para->sh_size); | ||
143 | } | ||
144 | |||
145 | return module_bug_finalize(hdr, sechdrs, me); | ||
146 | } | ||
147 | |||
148 | void module_arch_cleanup(struct module *mod) | ||
149 | { | ||
150 | alternatives_smp_module_del(mod); | ||
151 | module_bug_cleanup(mod); | ||
152 | } | ||
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 3cf3413ec626..98fd6cd4e3a4 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c | |||
@@ -196,6 +196,11 @@ static struct notifier_block __refdata msr_class_cpu_notifier = { | |||
196 | .notifier_call = msr_class_cpu_callback, | 196 | .notifier_call = msr_class_cpu_callback, |
197 | }; | 197 | }; |
198 | 198 | ||
199 | static char *msr_nodename(struct device *dev) | ||
200 | { | ||
201 | return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); | ||
202 | } | ||
203 | |||
199 | static int __init msr_init(void) | 204 | static int __init msr_init(void) |
200 | { | 205 | { |
201 | int i, err = 0; | 206 | int i, err = 0; |
@@ -212,6 +217,7 @@ static int __init msr_init(void) | |||
212 | err = PTR_ERR(msr_class); | 217 | err = PTR_ERR(msr_class); |
213 | goto out_chrdev; | 218 | goto out_chrdev; |
214 | } | 219 | } |
220 | msr_class->nodename = msr_nodename; | ||
215 | for_each_online_cpu(i) { | 221 | for_each_online_cpu(i) { |
216 | err = msr_device_create(i); | 222 | err = msr_device_create(i); |
217 | if (err != 0) | 223 | if (err != 0) |
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3bb2be1649bd..994dd6a4a2a0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -63,7 +63,7 @@ void arch_task_cache_init(void) | |||
63 | task_xstate_cachep = | 63 | task_xstate_cachep = |
64 | kmem_cache_create("task_xstate", xstate_size, | 64 | kmem_cache_create("task_xstate", xstate_size, |
65 | __alignof__(union thread_xstate), | 65 | __alignof__(union thread_xstate), |
66 | SLAB_PANIC, NULL); | 66 | SLAB_PANIC | SLAB_NOTRACK, NULL); |
67 | } | 67 | } |
68 | 68 | ||
69 | /* | 69 | /* |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d1c636bf31a7..be5ae80f897f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -301,15 +301,13 @@ static void __init reserve_brk(void) | |||
301 | 301 | ||
302 | #ifdef CONFIG_BLK_DEV_INITRD | 302 | #ifdef CONFIG_BLK_DEV_INITRD |
303 | 303 | ||
304 | #ifdef CONFIG_X86_32 | ||
305 | |||
306 | #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) | 304 | #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) |
307 | static void __init relocate_initrd(void) | 305 | static void __init relocate_initrd(void) |
308 | { | 306 | { |
309 | 307 | ||
310 | u64 ramdisk_image = boot_params.hdr.ramdisk_image; | 308 | u64 ramdisk_image = boot_params.hdr.ramdisk_image; |
311 | u64 ramdisk_size = boot_params.hdr.ramdisk_size; | 309 | u64 ramdisk_size = boot_params.hdr.ramdisk_size; |
312 | u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; | 310 | u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; |
313 | u64 ramdisk_here; | 311 | u64 ramdisk_here; |
314 | unsigned long slop, clen, mapaddr; | 312 | unsigned long slop, clen, mapaddr; |
315 | char *p, *q; | 313 | char *p, *q; |
@@ -365,14 +363,13 @@ static void __init relocate_initrd(void) | |||
365 | ramdisk_image, ramdisk_image + ramdisk_size - 1, | 363 | ramdisk_image, ramdisk_image + ramdisk_size - 1, |
366 | ramdisk_here, ramdisk_here + ramdisk_size - 1); | 364 | ramdisk_here, ramdisk_here + ramdisk_size - 1); |
367 | } | 365 | } |
368 | #endif | ||
369 | 366 | ||
370 | static void __init reserve_initrd(void) | 367 | static void __init reserve_initrd(void) |
371 | { | 368 | { |
372 | u64 ramdisk_image = boot_params.hdr.ramdisk_image; | 369 | u64 ramdisk_image = boot_params.hdr.ramdisk_image; |
373 | u64 ramdisk_size = boot_params.hdr.ramdisk_size; | 370 | u64 ramdisk_size = boot_params.hdr.ramdisk_size; |
374 | u64 ramdisk_end = ramdisk_image + ramdisk_size; | 371 | u64 ramdisk_end = ramdisk_image + ramdisk_size; |
375 | u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; | 372 | u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; |
376 | 373 | ||
377 | if (!boot_params.hdr.type_of_loader || | 374 | if (!boot_params.hdr.type_of_loader || |
378 | !ramdisk_image || !ramdisk_size) | 375 | !ramdisk_image || !ramdisk_size) |
@@ -402,14 +399,8 @@ static void __init reserve_initrd(void) | |||
402 | return; | 399 | return; |
403 | } | 400 | } |
404 | 401 | ||
405 | #ifdef CONFIG_X86_32 | ||
406 | relocate_initrd(); | 402 | relocate_initrd(); |
407 | #else | 403 | |
408 | printk(KERN_ERR "initrd extends beyond end of memory " | ||
409 | "(0x%08llx > 0x%08llx)\ndisabling initrd\n", | ||
410 | ramdisk_end, end_of_lowmem); | ||
411 | initrd_start = 0; | ||
412 | #endif | ||
413 | free_early(ramdisk_image, ramdisk_end); | 404 | free_early(ramdisk_image, ramdisk_end); |
414 | } | 405 | } |
415 | #else | 406 | #else |
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 0a813b17b172..4c578751e94e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c | |||
@@ -24,11 +24,11 @@ | |||
24 | #include <asm/ucontext.h> | 24 | #include <asm/ucontext.h> |
25 | #include <asm/i387.h> | 25 | #include <asm/i387.h> |
26 | #include <asm/vdso.h> | 26 | #include <asm/vdso.h> |
27 | #include <asm/mce.h> | ||
27 | 28 | ||
28 | #ifdef CONFIG_X86_64 | 29 | #ifdef CONFIG_X86_64 |
29 | #include <asm/proto.h> | 30 | #include <asm/proto.h> |
30 | #include <asm/ia32_unistd.h> | 31 | #include <asm/ia32_unistd.h> |
31 | #include <asm/mce.h> | ||
32 | #endif /* CONFIG_X86_64 */ | 32 | #endif /* CONFIG_X86_64 */ |
33 | 33 | ||
34 | #include <asm/syscall.h> | 34 | #include <asm/syscall.h> |
@@ -856,10 +856,10 @@ static void do_signal(struct pt_regs *regs) | |||
856 | void | 856 | void |
857 | do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) | 857 | do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) |
858 | { | 858 | { |
859 | #if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) | 859 | #ifdef CONFIG_X86_NEW_MCE |
860 | /* notify userspace of pending MCEs */ | 860 | /* notify userspace of pending MCEs */ |
861 | if (thread_info_flags & _TIF_MCE_NOTIFY) | 861 | if (thread_info_flags & _TIF_MCE_NOTIFY) |
862 | mce_notify_user(); | 862 | mce_notify_process(); |
863 | #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ | 863 | #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ |
864 | 864 | ||
865 | /* deal with pending signal delivery */ | 865 | /* deal with pending signal delivery */ |
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index f6db48c405b8..ec1de97600e7 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c | |||
@@ -150,14 +150,40 @@ void native_send_call_func_ipi(const struct cpumask *mask) | |||
150 | * this function calls the 'stop' function on all other CPUs in the system. | 150 | * this function calls the 'stop' function on all other CPUs in the system. |
151 | */ | 151 | */ |
152 | 152 | ||
153 | asmlinkage void smp_reboot_interrupt(void) | ||
154 | { | ||
155 | ack_APIC_irq(); | ||
156 | irq_enter(); | ||
157 | stop_this_cpu(NULL); | ||
158 | irq_exit(); | ||
159 | } | ||
160 | |||
153 | static void native_smp_send_stop(void) | 161 | static void native_smp_send_stop(void) |
154 | { | 162 | { |
155 | unsigned long flags; | 163 | unsigned long flags; |
164 | unsigned long wait; | ||
156 | 165 | ||
157 | if (reboot_force) | 166 | if (reboot_force) |
158 | return; | 167 | return; |
159 | 168 | ||
160 | smp_call_function(stop_this_cpu, NULL, 0); | 169 | /* |
170 | * Use an own vector here because smp_call_function | ||
171 | * does lots of things not suitable in a panic situation. | ||
172 | * On most systems we could also use an NMI here, | ||
173 | * but there are a few systems around where NMI | ||
174 | * is problematic so stay with a non-NMI for now | ||
175 | * (this implies we cannot stop CPUs spinning with irq off | ||
176 | * currently) | ||
177 | */ | ||
178 | if (num_online_cpus() > 1) { | ||
179 | apic->send_IPI_allbutself(REBOOT_VECTOR); | ||
180 | |||
181 | /* Don't wait longer than a second */ | ||
182 | wait = USEC_PER_SEC; | ||
183 | while (num_online_cpus() > 1 && wait--) | ||
184 | udelay(1); | ||
185 | } | ||
186 | |||
161 | local_irq_save(flags); | 187 | local_irq_save(flags); |
162 | disable_local_APIC(); | 188 | disable_local_APIC(); |
163 | local_irq_restore(flags); | 189 | local_irq_restore(flags); |
@@ -172,6 +198,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs) | |||
172 | { | 198 | { |
173 | ack_APIC_irq(); | 199 | ack_APIC_irq(); |
174 | inc_irq_stat(irq_resched_count); | 200 | inc_irq_stat(irq_resched_count); |
201 | /* | ||
202 | * KVM uses this interrupt to force a cpu out of guest mode | ||
203 | */ | ||
175 | } | 204 | } |
176 | 205 | ||
177 | void smp_call_function_interrupt(struct pt_regs *regs) | 206 | void smp_call_function_interrupt(struct pt_regs *regs) |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7c80007ea5f7..2fecda69ee64 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -873,7 +873,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) | |||
873 | 873 | ||
874 | err = do_boot_cpu(apicid, cpu); | 874 | err = do_boot_cpu(apicid, cpu); |
875 | 875 | ||
876 | zap_low_mappings(); | 876 | zap_low_mappings(false); |
877 | low_mappings = 0; | 877 | low_mappings = 0; |
878 | #else | 878 | #else |
879 | err = do_boot_cpu(apicid, cpu); | 879 | err = do_boot_cpu(apicid, cpu); |
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 4aaf7e48394f..c3eb207181fe 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c | |||
@@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace) | |||
77 | } | 77 | } |
78 | EXPORT_SYMBOL_GPL(save_stack_trace); | 78 | EXPORT_SYMBOL_GPL(save_stack_trace); |
79 | 79 | ||
80 | void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) | ||
81 | { | ||
82 | dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); | ||
83 | if (trace->nr_entries < trace->max_entries) | ||
84 | trace->entries[trace->nr_entries++] = ULONG_MAX; | ||
85 | } | ||
86 | |||
80 | void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) | 87 | void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) |
81 | { | 88 | { |
82 | dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); | 89 | dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 07d60c870ce2..5f935f0d5861 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -45,6 +45,7 @@ | |||
45 | #include <linux/edac.h> | 45 | #include <linux/edac.h> |
46 | #endif | 46 | #endif |
47 | 47 | ||
48 | #include <asm/kmemcheck.h> | ||
48 | #include <asm/stacktrace.h> | 49 | #include <asm/stacktrace.h> |
49 | #include <asm/processor.h> | 50 | #include <asm/processor.h> |
50 | #include <asm/debugreg.h> | 51 | #include <asm/debugreg.h> |
@@ -534,6 +535,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
534 | 535 | ||
535 | get_debugreg(condition, 6); | 536 | get_debugreg(condition, 6); |
536 | 537 | ||
538 | /* Catch kmemcheck conditions first of all! */ | ||
539 | if (condition & DR_STEP && kmemcheck_trap(regs)) | ||
540 | return; | ||
541 | |||
537 | /* | 542 | /* |
538 | * The processor cleared BTF, so don't mark that we need it set. | 543 | * The processor cleared BTF, so don't mark that we need it set. |
539 | */ | 544 | */ |
@@ -798,15 +803,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) | |||
798 | 803 | ||
799 | return new_kesp; | 804 | return new_kesp; |
800 | } | 805 | } |
801 | #else | 806 | #endif |
807 | |||
802 | asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) | 808 | asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) |
803 | { | 809 | { |
804 | } | 810 | } |
805 | 811 | ||
806 | asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) | 812 | asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) |
807 | { | 813 | { |
808 | } | 814 | } |
809 | #endif | ||
810 | 815 | ||
811 | /* | 816 | /* |
812 | * 'math_state_restore()' saves the current math information in the | 817 | * 'math_state_restore()' saves the current math information in the |
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3e1c057e98fe..ae3180c506a6 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/delay.h> | 9 | #include <linux/delay.h> |
10 | #include <linux/clocksource.h> | 10 | #include <linux/clocksource.h> |
11 | #include <linux/percpu.h> | 11 | #include <linux/percpu.h> |
12 | #include <linux/timex.h> | ||
12 | 13 | ||
13 | #include <asm/hpet.h> | 14 | #include <asm/hpet.h> |
14 | #include <asm/timer.h> | 15 | #include <asm/timer.h> |
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4c85b2e2bb65..367e87882041 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S | |||
@@ -108,6 +108,8 @@ SECTIONS | |||
108 | /* Data */ | 108 | /* Data */ |
109 | . = ALIGN(PAGE_SIZE); | 109 | . = ALIGN(PAGE_SIZE); |
110 | .data : AT(ADDR(.data) - LOAD_OFFSET) { | 110 | .data : AT(ADDR(.data) - LOAD_OFFSET) { |
111 | /* Start of data section */ | ||
112 | _sdata = .; | ||
111 | DATA_DATA | 113 | DATA_DATA |
112 | CONSTRUCTORS | 114 | CONSTRUCTORS |
113 | 115 | ||
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index a58504ea78cc..8600a09e0c6c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig | |||
@@ -50,6 +50,9 @@ config KVM_INTEL | |||
50 | Provides support for KVM on Intel processors equipped with the VT | 50 | Provides support for KVM on Intel processors equipped with the VT |
51 | extensions. | 51 | extensions. |
52 | 52 | ||
53 | To compile this as a module, choose M here: the module | ||
54 | will be called kvm-intel. | ||
55 | |||
53 | config KVM_AMD | 56 | config KVM_AMD |
54 | tristate "KVM for AMD processors support" | 57 | tristate "KVM for AMD processors support" |
55 | depends on KVM | 58 | depends on KVM |
@@ -57,6 +60,9 @@ config KVM_AMD | |||
57 | Provides support for KVM on AMD processors equipped with the AMD-V | 60 | Provides support for KVM on AMD processors equipped with the AMD-V |
58 | (SVM) extensions. | 61 | (SVM) extensions. |
59 | 62 | ||
63 | To compile this as a module, choose M here: the module | ||
64 | will be called kvm-amd. | ||
65 | |||
60 | config KVM_TRACE | 66 | config KVM_TRACE |
61 | bool "KVM trace support" | 67 | bool "KVM trace support" |
62 | depends on KVM && SYSFS | 68 | depends on KVM && SYSFS |
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index d3ec292f00f2..b43c4efafe80 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile | |||
@@ -14,7 +14,7 @@ endif | |||
14 | EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm | 14 | EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm |
15 | 15 | ||
16 | kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ | 16 | kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ |
17 | i8254.o | 17 | i8254.o timer.o |
18 | obj-$(CONFIG_KVM) += kvm.o | 18 | obj-$(CONFIG_KVM) += kvm.o |
19 | kvm-intel-objs = vmx.o | 19 | kvm-intel-objs = vmx.o |
20 | obj-$(CONFIG_KVM_INTEL) += kvm-intel.o | 20 | obj-$(CONFIG_KVM_INTEL) += kvm-intel.o |
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c13bb92d3157..4d6f0d293ee2 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c | |||
@@ -98,6 +98,37 @@ static int pit_get_gate(struct kvm *kvm, int channel) | |||
98 | return kvm->arch.vpit->pit_state.channels[channel].gate; | 98 | return kvm->arch.vpit->pit_state.channels[channel].gate; |
99 | } | 99 | } |
100 | 100 | ||
101 | static s64 __kpit_elapsed(struct kvm *kvm) | ||
102 | { | ||
103 | s64 elapsed; | ||
104 | ktime_t remaining; | ||
105 | struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; | ||
106 | |||
107 | /* | ||
108 | * The Counter does not stop when it reaches zero. In | ||
109 | * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to | ||
110 | * the highest count, either FFFF hex for binary counting | ||
111 | * or 9999 for BCD counting, and continues counting. | ||
112 | * Modes 2 and 3 are periodic; the Counter reloads | ||
113 | * itself with the initial count and continues counting | ||
114 | * from there. | ||
115 | */ | ||
116 | remaining = hrtimer_expires_remaining(&ps->pit_timer.timer); | ||
117 | elapsed = ps->pit_timer.period - ktime_to_ns(remaining); | ||
118 | elapsed = mod_64(elapsed, ps->pit_timer.period); | ||
119 | |||
120 | return elapsed; | ||
121 | } | ||
122 | |||
123 | static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c, | ||
124 | int channel) | ||
125 | { | ||
126 | if (channel == 0) | ||
127 | return __kpit_elapsed(kvm); | ||
128 | |||
129 | return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); | ||
130 | } | ||
131 | |||
101 | static int pit_get_count(struct kvm *kvm, int channel) | 132 | static int pit_get_count(struct kvm *kvm, int channel) |
102 | { | 133 | { |
103 | struct kvm_kpit_channel_state *c = | 134 | struct kvm_kpit_channel_state *c = |
@@ -107,7 +138,7 @@ static int pit_get_count(struct kvm *kvm, int channel) | |||
107 | 138 | ||
108 | WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); | 139 | WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); |
109 | 140 | ||
110 | t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); | 141 | t = kpit_elapsed(kvm, c, channel); |
111 | d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); | 142 | d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); |
112 | 143 | ||
113 | switch (c->mode) { | 144 | switch (c->mode) { |
@@ -137,7 +168,7 @@ static int pit_get_out(struct kvm *kvm, int channel) | |||
137 | 168 | ||
138 | WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); | 169 | WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); |
139 | 170 | ||
140 | t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); | 171 | t = kpit_elapsed(kvm, c, channel); |
141 | d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); | 172 | d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); |
142 | 173 | ||
143 | switch (c->mode) { | 174 | switch (c->mode) { |
@@ -193,28 +224,6 @@ static void pit_latch_status(struct kvm *kvm, int channel) | |||
193 | } | 224 | } |
194 | } | 225 | } |
195 | 226 | ||
196 | static int __pit_timer_fn(struct kvm_kpit_state *ps) | ||
197 | { | ||
198 | struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0]; | ||
199 | struct kvm_kpit_timer *pt = &ps->pit_timer; | ||
200 | |||
201 | if (!atomic_inc_and_test(&pt->pending)) | ||
202 | set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); | ||
203 | |||
204 | if (!pt->reinject) | ||
205 | atomic_set(&pt->pending, 1); | ||
206 | |||
207 | if (vcpu0 && waitqueue_active(&vcpu0->wq)) | ||
208 | wake_up_interruptible(&vcpu0->wq); | ||
209 | |||
210 | hrtimer_add_expires_ns(&pt->timer, pt->period); | ||
211 | pt->scheduled = hrtimer_get_expires_ns(&pt->timer); | ||
212 | if (pt->period) | ||
213 | ps->channels[0].count_load_time = ktime_get(); | ||
214 | |||
215 | return (pt->period == 0 ? 0 : 1); | ||
216 | } | ||
217 | |||
218 | int pit_has_pending_timer(struct kvm_vcpu *vcpu) | 227 | int pit_has_pending_timer(struct kvm_vcpu *vcpu) |
219 | { | 228 | { |
220 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; | 229 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; |
@@ -235,21 +244,6 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) | |||
235 | spin_unlock(&ps->inject_lock); | 244 | spin_unlock(&ps->inject_lock); |
236 | } | 245 | } |
237 | 246 | ||
238 | static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) | ||
239 | { | ||
240 | struct kvm_kpit_state *ps; | ||
241 | int restart_timer = 0; | ||
242 | |||
243 | ps = container_of(data, struct kvm_kpit_state, pit_timer.timer); | ||
244 | |||
245 | restart_timer = __pit_timer_fn(ps); | ||
246 | |||
247 | if (restart_timer) | ||
248 | return HRTIMER_RESTART; | ||
249 | else | ||
250 | return HRTIMER_NORESTART; | ||
251 | } | ||
252 | |||
253 | void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) | 247 | void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) |
254 | { | 248 | { |
255 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; | 249 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; |
@@ -263,15 +257,26 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) | |||
263 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); | 257 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); |
264 | } | 258 | } |
265 | 259 | ||
266 | static void destroy_pit_timer(struct kvm_kpit_timer *pt) | 260 | static void destroy_pit_timer(struct kvm_timer *pt) |
267 | { | 261 | { |
268 | pr_debug("pit: execute del timer!\n"); | 262 | pr_debug("pit: execute del timer!\n"); |
269 | hrtimer_cancel(&pt->timer); | 263 | hrtimer_cancel(&pt->timer); |
270 | } | 264 | } |
271 | 265 | ||
266 | static bool kpit_is_periodic(struct kvm_timer *ktimer) | ||
267 | { | ||
268 | struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state, | ||
269 | pit_timer); | ||
270 | return ps->is_periodic; | ||
271 | } | ||
272 | |||
273 | static struct kvm_timer_ops kpit_ops = { | ||
274 | .is_periodic = kpit_is_periodic, | ||
275 | }; | ||
276 | |||
272 | static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) | 277 | static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) |
273 | { | 278 | { |
274 | struct kvm_kpit_timer *pt = &ps->pit_timer; | 279 | struct kvm_timer *pt = &ps->pit_timer; |
275 | s64 interval; | 280 | s64 interval; |
276 | 281 | ||
277 | interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); | 282 | interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); |
@@ -280,8 +285,14 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) | |||
280 | 285 | ||
281 | /* TODO The new value only affected after the retriggered */ | 286 | /* TODO The new value only affected after the retriggered */ |
282 | hrtimer_cancel(&pt->timer); | 287 | hrtimer_cancel(&pt->timer); |
283 | pt->period = (is_period == 0) ? 0 : interval; | 288 | pt->period = interval; |
284 | pt->timer.function = pit_timer_fn; | 289 | ps->is_periodic = is_period; |
290 | |||
291 | pt->timer.function = kvm_timer_fn; | ||
292 | pt->t_ops = &kpit_ops; | ||
293 | pt->kvm = ps->pit->kvm; | ||
294 | pt->vcpu_id = 0; | ||
295 | |||
285 | atomic_set(&pt->pending, 0); | 296 | atomic_set(&pt->pending, 0); |
286 | ps->irq_ack = 1; | 297 | ps->irq_ack = 1; |
287 | 298 | ||
@@ -298,23 +309,23 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) | |||
298 | pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); | 309 | pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); |
299 | 310 | ||
300 | /* | 311 | /* |
301 | * Though spec said the state of 8254 is undefined after power-up, | 312 | * The largest possible initial count is 0; this is equivalent |
302 | * seems some tricky OS like Windows XP depends on IRQ0 interrupt | 313 | * to 2^16 for binary counting and 10^4 for BCD counting. |
303 | * when booting up. | ||
304 | * So here setting initialize rate for it, and not a specific number | ||
305 | */ | 314 | */ |
306 | if (val == 0) | 315 | if (val == 0) |
307 | val = 0x10000; | 316 | val = 0x10000; |
308 | 317 | ||
309 | ps->channels[channel].count_load_time = ktime_get(); | ||
310 | ps->channels[channel].count = val; | 318 | ps->channels[channel].count = val; |
311 | 319 | ||
312 | if (channel != 0) | 320 | if (channel != 0) { |
321 | ps->channels[channel].count_load_time = ktime_get(); | ||
313 | return; | 322 | return; |
323 | } | ||
314 | 324 | ||
315 | /* Two types of timer | 325 | /* Two types of timer |
316 | * mode 1 is one shot, mode 2 is period, otherwise del timer */ | 326 | * mode 1 is one shot, mode 2 is period, otherwise del timer */ |
317 | switch (ps->channels[0].mode) { | 327 | switch (ps->channels[0].mode) { |
328 | case 0: | ||
318 | case 1: | 329 | case 1: |
319 | /* FIXME: enhance mode 4 precision */ | 330 | /* FIXME: enhance mode 4 precision */ |
320 | case 4: | 331 | case 4: |
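The new __kpit_elapsed() computes how far the guest PIT is into its current period from the backing hrtimer (the period minus the time remaining, reduced modulo the period) instead of the old per-channel load timestamp. A standalone sketch of that arithmetic with invented numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Invented numbers: a 10ms periodic guest PIT whose hrtimer still has
	 * 3ms to go until the next expiry. */
	int64_t period_ns    = 10 * 1000 * 1000;
	int64_t remaining_ns =  3 * 1000 * 1000;
	int64_t elapsed;

	elapsed = period_ns - remaining_ns;	/* time spent in the current period */
	elapsed %= period_ns;			/* what mod_64() does in the patch */

	printf("elapsed in current period: %lld ns\n", (long long)elapsed);
	return 0;
}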
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 6acbe4b505d5..bbd863ff60b7 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h | |||
@@ -3,15 +3,6 @@ | |||
3 | 3 | ||
4 | #include "iodev.h" | 4 | #include "iodev.h" |
5 | 5 | ||
6 | struct kvm_kpit_timer { | ||
7 | struct hrtimer timer; | ||
8 | int irq; | ||
9 | s64 period; /* unit: ns */ | ||
10 | s64 scheduled; | ||
11 | atomic_t pending; | ||
12 | bool reinject; | ||
13 | }; | ||
14 | |||
15 | struct kvm_kpit_channel_state { | 6 | struct kvm_kpit_channel_state { |
16 | u32 count; /* can be 65536 */ | 7 | u32 count; /* can be 65536 */ |
17 | u16 latched_count; | 8 | u16 latched_count; |
@@ -30,7 +21,8 @@ struct kvm_kpit_channel_state { | |||
30 | 21 | ||
31 | struct kvm_kpit_state { | 22 | struct kvm_kpit_state { |
32 | struct kvm_kpit_channel_state channels[3]; | 23 | struct kvm_kpit_channel_state channels[3]; |
33 | struct kvm_kpit_timer pit_timer; | 24 | struct kvm_timer pit_timer; |
25 | bool is_periodic; | ||
34 | u32 speaker_data_on; | 26 | u32 speaker_data_on; |
35 | struct mutex lock; | 27 | struct mutex lock; |
36 | struct kvm_pit *pit; | 28 | struct kvm_pit *pit; |
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index cf17ed52f6fb..96dfbb6ad2a9 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c | |||
@@ -24,6 +24,7 @@ | |||
24 | 24 | ||
25 | #include "irq.h" | 25 | #include "irq.h" |
26 | #include "i8254.h" | 26 | #include "i8254.h" |
27 | #include "x86.h" | ||
27 | 28 | ||
28 | /* | 29 | /* |
29 | * check if there are pending timer events | 30 | * check if there are pending timer events |
@@ -48,6 +49,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) | |||
48 | { | 49 | { |
49 | struct kvm_pic *s; | 50 | struct kvm_pic *s; |
50 | 51 | ||
52 | if (!irqchip_in_kernel(v->kvm)) | ||
53 | return v->arch.interrupt.pending; | ||
54 | |||
51 | if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ | 55 | if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ |
52 | if (kvm_apic_accept_pic_intr(v)) { | 56 | if (kvm_apic_accept_pic_intr(v)) { |
53 | s = pic_irqchip(v->kvm); /* PIC */ | 57 | s = pic_irqchip(v->kvm); /* PIC */ |
@@ -67,6 +71,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) | |||
67 | struct kvm_pic *s; | 71 | struct kvm_pic *s; |
68 | int vector; | 72 | int vector; |
69 | 73 | ||
74 | if (!irqchip_in_kernel(v->kvm)) | ||
75 | return v->arch.interrupt.nr; | ||
76 | |||
70 | vector = kvm_get_apic_interrupt(v); /* APIC */ | 77 | vector = kvm_get_apic_interrupt(v); /* APIC */ |
71 | if (vector == -1) { | 78 | if (vector == -1) { |
72 | if (kvm_apic_accept_pic_intr(v)) { | 79 | if (kvm_apic_accept_pic_intr(v)) { |
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h new file mode 100644 index 000000000000..26bd6ba74e1c --- /dev/null +++ b/arch/x86/kvm/kvm_timer.h | |||
@@ -0,0 +1,18 @@ | |||
1 | |||
2 | struct kvm_timer { | ||
3 | struct hrtimer timer; | ||
4 | s64 period; /* unit: ns */ | ||
5 | atomic_t pending; /* accumulated triggered timers */ | ||
6 | bool reinject; | ||
7 | struct kvm_timer_ops *t_ops; | ||
8 | struct kvm *kvm; | ||
9 | int vcpu_id; | ||
10 | }; | ||
11 | |||
12 | struct kvm_timer_ops { | ||
13 | bool (*is_periodic)(struct kvm_timer *); | ||
14 | }; | ||
15 | |||
16 | |||
17 | enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); | ||
18 | |||
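The point of this header is to let different guest timers share one hrtimer handler, with only the periodic/one-shot decision deferred to the per-device is_periodic() hook. The shared handler lives in the new timer.c, which this diff does not show; the toy model below (all names invented) only illustrates the ops-dispatch idea, not the real implementation:

#include <stdio.h>
#include <stdbool.h>

/* Userspace stand-ins for the structures declared in kvm_timer.h. */
struct my_timer;

struct my_timer_ops {
	bool (*is_periodic)(struct my_timer *);
};

struct my_timer {
	const struct my_timer_ops *t_ops;
	int pending;
	bool periodic;
};

static bool pit_is_periodic(struct my_timer *t)
{
	return t->periodic;
}

static const struct my_timer_ops pit_ops = { .is_periodic = pit_is_periodic };

/* One shared "expiry handler": the device-specific part is only the callback. */
static const char *timer_fire(struct my_timer *t)
{
	t->pending++;
	return t->t_ops->is_periodic(t) ? "restart" : "no restart";
}

int main(void)
{
	struct my_timer pit = { .t_ops = &pit_ops, .periodic = true };

	printf("%s (pending=%d)\n", timer_fire(&pit), pit.pending);
	return 0;
}

In the real code that restart decision feeds hrtimer's HRTIMER_RESTART/HRTIMER_NORESTART return value, as the removed pit_timer_fn() above did for the periodic case.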
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index f0b67f2cdd69..ae99d83f81a3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -196,20 +196,15 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) | |||
196 | } | 196 | } |
197 | EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); | 197 | EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); |
198 | 198 | ||
199 | int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig) | 199 | static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, |
200 | int vector, int level, int trig_mode); | ||
201 | |||
202 | int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) | ||
200 | { | 203 | { |
201 | struct kvm_lapic *apic = vcpu->arch.apic; | 204 | struct kvm_lapic *apic = vcpu->arch.apic; |
202 | 205 | ||
203 | if (!apic_test_and_set_irr(vec, apic)) { | 206 | return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, |
204 | /* a new pending irq is set in IRR */ | 207 | irq->level, irq->trig_mode); |
205 | if (trig) | ||
206 | apic_set_vector(vec, apic->regs + APIC_TMR); | ||
207 | else | ||
208 | apic_clear_vector(vec, apic->regs + APIC_TMR); | ||
209 | kvm_vcpu_kick(apic->vcpu); | ||
210 | return 1; | ||
211 | } | ||
212 | return 0; | ||
213 | } | 208 | } |
214 | 209 | ||
215 | static inline int apic_find_highest_isr(struct kvm_lapic *apic) | 210 | static inline int apic_find_highest_isr(struct kvm_lapic *apic) |
@@ -250,7 +245,7 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) | |||
250 | 245 | ||
251 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) | 246 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) |
252 | { | 247 | { |
253 | return kvm_apic_id(apic) == dest; | 248 | return dest == 0xff || kvm_apic_id(apic) == dest; |
254 | } | 249 | } |
255 | 250 | ||
256 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) | 251 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) |
@@ -279,37 +274,34 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) | |||
279 | return result; | 274 | return result; |
280 | } | 275 | } |
281 | 276 | ||
282 | static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, | 277 | int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, |
283 | int short_hand, int dest, int dest_mode) | 278 | int short_hand, int dest, int dest_mode) |
284 | { | 279 | { |
285 | int result = 0; | 280 | int result = 0; |
286 | struct kvm_lapic *target = vcpu->arch.apic; | 281 | struct kvm_lapic *target = vcpu->arch.apic; |
287 | 282 | ||
288 | apic_debug("target %p, source %p, dest 0x%x, " | 283 | apic_debug("target %p, source %p, dest 0x%x, " |
289 | "dest_mode 0x%x, short_hand 0x%x", | 284 | "dest_mode 0x%x, short_hand 0x%x\n", |
290 | target, source, dest, dest_mode, short_hand); | 285 | target, source, dest, dest_mode, short_hand); |
291 | 286 | ||
292 | ASSERT(!target); | 287 | ASSERT(!target); |
293 | switch (short_hand) { | 288 | switch (short_hand) { |
294 | case APIC_DEST_NOSHORT: | 289 | case APIC_DEST_NOSHORT: |
295 | if (dest_mode == 0) { | 290 | if (dest_mode == 0) |
296 | /* Physical mode. */ | 291 | /* Physical mode. */ |
297 | if ((dest == 0xFF) || (dest == kvm_apic_id(target))) | 292 | result = kvm_apic_match_physical_addr(target, dest); |
298 | result = 1; | 293 | else |
299 | } else | ||
300 | /* Logical mode. */ | 294 | /* Logical mode. */ |
301 | result = kvm_apic_match_logical_addr(target, dest); | 295 | result = kvm_apic_match_logical_addr(target, dest); |
302 | break; | 296 | break; |
303 | case APIC_DEST_SELF: | 297 | case APIC_DEST_SELF: |
304 | if (target == source) | 298 | result = (target == source); |
305 | result = 1; | ||
306 | break; | 299 | break; |
307 | case APIC_DEST_ALLINC: | 300 | case APIC_DEST_ALLINC: |
308 | result = 1; | 301 | result = 1; |
309 | break; | 302 | break; |
310 | case APIC_DEST_ALLBUT: | 303 | case APIC_DEST_ALLBUT: |
311 | if (target != source) | 304 | result = (target != source); |
312 | result = 1; | ||
313 | break; | 305 | break; |
314 | default: | 306 | default: |
315 | printk(KERN_WARNING "Bad dest shorthand value %x\n", | 307 | printk(KERN_WARNING "Bad dest shorthand value %x\n", |
@@ -327,20 +319,22 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, | |||
327 | static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | 319 | static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, |
328 | int vector, int level, int trig_mode) | 320 | int vector, int level, int trig_mode) |
329 | { | 321 | { |
330 | int orig_irr, result = 0; | 322 | int result = 0; |
331 | struct kvm_vcpu *vcpu = apic->vcpu; | 323 | struct kvm_vcpu *vcpu = apic->vcpu; |
332 | 324 | ||
333 | switch (delivery_mode) { | 325 | switch (delivery_mode) { |
334 | case APIC_DM_FIXED: | ||
335 | case APIC_DM_LOWEST: | 326 | case APIC_DM_LOWEST: |
327 | vcpu->arch.apic_arb_prio++; | ||
328 | case APIC_DM_FIXED: | ||
336 | /* FIXME add logic for vcpu on reset */ | 329 | /* FIXME add logic for vcpu on reset */ |
337 | if (unlikely(!apic_enabled(apic))) | 330 | if (unlikely(!apic_enabled(apic))) |
338 | break; | 331 | break; |
339 | 332 | ||
340 | orig_irr = apic_test_and_set_irr(vector, apic); | 333 | result = !apic_test_and_set_irr(vector, apic); |
341 | if (orig_irr && trig_mode) { | 334 | if (!result) { |
342 | apic_debug("level trig mode repeatedly for vector %d", | 335 | if (trig_mode) |
343 | vector); | 336 | apic_debug("level trig mode repeatedly for " |
337 | "vector %d", vector); | ||
344 | break; | 338 | break; |
345 | } | 339 | } |
346 | 340 | ||
@@ -349,10 +343,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
349 | apic_set_vector(vector, apic->regs + APIC_TMR); | 343 | apic_set_vector(vector, apic->regs + APIC_TMR); |
350 | } else | 344 | } else |
351 | apic_clear_vector(vector, apic->regs + APIC_TMR); | 345 | apic_clear_vector(vector, apic->regs + APIC_TMR); |
352 | |||
353 | kvm_vcpu_kick(vcpu); | 346 | kvm_vcpu_kick(vcpu); |
354 | |||
355 | result = (orig_irr == 0); | ||
356 | break; | 347 | break; |
357 | 348 | ||
358 | case APIC_DM_REMRD: | 349 | case APIC_DM_REMRD: |
@@ -364,12 +355,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
364 | break; | 355 | break; |
365 | 356 | ||
366 | case APIC_DM_NMI: | 357 | case APIC_DM_NMI: |
358 | result = 1; | ||
367 | kvm_inject_nmi(vcpu); | 359 | kvm_inject_nmi(vcpu); |
368 | kvm_vcpu_kick(vcpu); | 360 | kvm_vcpu_kick(vcpu); |
369 | break; | 361 | break; |
370 | 362 | ||
371 | case APIC_DM_INIT: | 363 | case APIC_DM_INIT: |
372 | if (level) { | 364 | if (level) { |
365 | result = 1; | ||
373 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) | 366 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) |
374 | printk(KERN_DEBUG | 367 | printk(KERN_DEBUG |
375 | "INIT on a runnable vcpu %d\n", | 368 | "INIT on a runnable vcpu %d\n", |
@@ -386,6 +379,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
386 | apic_debug("SIPI to vcpu %d vector 0x%02x\n", | 379 | apic_debug("SIPI to vcpu %d vector 0x%02x\n", |
387 | vcpu->vcpu_id, vector); | 380 | vcpu->vcpu_id, vector); |
388 | if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { | 381 | if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { |
382 | result = 1; | ||
389 | vcpu->arch.sipi_vector = vector; | 383 | vcpu->arch.sipi_vector = vector; |
390 | vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; | 384 | vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; |
391 | kvm_vcpu_kick(vcpu); | 385 | kvm_vcpu_kick(vcpu); |
@@ -408,43 +402,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
408 | return result; | 402 | return result; |
409 | } | 403 | } |
410 | 404 | ||
411 | static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, | 405 | int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) |
412 | unsigned long bitmap) | ||
413 | { | ||
414 | int last; | ||
415 | int next; | ||
416 | struct kvm_lapic *apic = NULL; | ||
417 | |||
418 | last = kvm->arch.round_robin_prev_vcpu; | ||
419 | next = last; | ||
420 | |||
421 | do { | ||
422 | if (++next == KVM_MAX_VCPUS) | ||
423 | next = 0; | ||
424 | if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap)) | ||
425 | continue; | ||
426 | apic = kvm->vcpus[next]->arch.apic; | ||
427 | if (apic && apic_enabled(apic)) | ||
428 | break; | ||
429 | apic = NULL; | ||
430 | } while (next != last); | ||
431 | kvm->arch.round_robin_prev_vcpu = next; | ||
432 | |||
433 | if (!apic) | ||
434 | printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n"); | ||
435 | |||
436 | return apic; | ||
437 | } | ||
438 | |||
439 | struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, | ||
440 | unsigned long bitmap) | ||
441 | { | 406 | { |
442 | struct kvm_lapic *apic; | 407 | return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; |
443 | |||
444 | apic = kvm_apic_round_robin(kvm, vector, bitmap); | ||
445 | if (apic) | ||
446 | return apic->vcpu; | ||
447 | return NULL; | ||
448 | } | 408 | } |
449 | 409 | ||
450 | static void apic_set_eoi(struct kvm_lapic *apic) | 410 | static void apic_set_eoi(struct kvm_lapic *apic) |
@@ -472,47 +432,24 @@ static void apic_send_ipi(struct kvm_lapic *apic) | |||
472 | { | 432 | { |
473 | u32 icr_low = apic_get_reg(apic, APIC_ICR); | 433 | u32 icr_low = apic_get_reg(apic, APIC_ICR); |
474 | u32 icr_high = apic_get_reg(apic, APIC_ICR2); | 434 | u32 icr_high = apic_get_reg(apic, APIC_ICR2); |
435 | struct kvm_lapic_irq irq; | ||
475 | 436 | ||
476 | unsigned int dest = GET_APIC_DEST_FIELD(icr_high); | 437 | irq.vector = icr_low & APIC_VECTOR_MASK; |
477 | unsigned int short_hand = icr_low & APIC_SHORT_MASK; | 438 | irq.delivery_mode = icr_low & APIC_MODE_MASK; |
478 | unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; | 439 | irq.dest_mode = icr_low & APIC_DEST_MASK; |
479 | unsigned int level = icr_low & APIC_INT_ASSERT; | 440 | irq.level = icr_low & APIC_INT_ASSERT; |
480 | unsigned int dest_mode = icr_low & APIC_DEST_MASK; | 441 | irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; |
481 | unsigned int delivery_mode = icr_low & APIC_MODE_MASK; | 442 | irq.shorthand = icr_low & APIC_SHORT_MASK; |
482 | unsigned int vector = icr_low & APIC_VECTOR_MASK; | 443 | irq.dest_id = GET_APIC_DEST_FIELD(icr_high); |
483 | |||
484 | struct kvm_vcpu *target; | ||
485 | struct kvm_vcpu *vcpu; | ||
486 | unsigned long lpr_map = 0; | ||
487 | int i; | ||
488 | 444 | ||
489 | apic_debug("icr_high 0x%x, icr_low 0x%x, " | 445 | apic_debug("icr_high 0x%x, icr_low 0x%x, " |
490 | "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " | 446 | "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " |
491 | "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", | 447 | "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", |
492 | icr_high, icr_low, short_hand, dest, | 448 | icr_high, icr_low, irq.shorthand, irq.dest_id, |
493 | trig_mode, level, dest_mode, delivery_mode, vector); | 449 | irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, |
494 | 450 | irq.vector); | |
495 | for (i = 0; i < KVM_MAX_VCPUS; i++) { | ||
496 | vcpu = apic->vcpu->kvm->vcpus[i]; | ||
497 | if (!vcpu) | ||
498 | continue; | ||
499 | |||
500 | if (vcpu->arch.apic && | ||
501 | apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { | ||
502 | if (delivery_mode == APIC_DM_LOWEST) | ||
503 | set_bit(vcpu->vcpu_id, &lpr_map); | ||
504 | else | ||
505 | __apic_accept_irq(vcpu->arch.apic, delivery_mode, | ||
506 | vector, level, trig_mode); | ||
507 | } | ||
508 | } | ||
509 | 451 | ||
510 | if (delivery_mode == APIC_DM_LOWEST) { | 452 | kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); |
511 | target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map); | ||
512 | if (target != NULL) | ||
513 | __apic_accept_irq(target->arch.apic, delivery_mode, | ||
514 | vector, level, trig_mode); | ||
515 | } | ||
516 | } | 453 | } |
517 | 454 | ||
518 | static u32 apic_get_tmcct(struct kvm_lapic *apic) | 455 | static u32 apic_get_tmcct(struct kvm_lapic *apic) |
@@ -527,12 +464,13 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) | |||
527 | if (apic_get_reg(apic, APIC_TMICT) == 0) | 464 | if (apic_get_reg(apic, APIC_TMICT) == 0) |
528 | return 0; | 465 | return 0; |
529 | 466 | ||
530 | remaining = hrtimer_expires_remaining(&apic->timer.dev); | 467 | remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer); |
531 | if (ktime_to_ns(remaining) < 0) | 468 | if (ktime_to_ns(remaining) < 0) |
532 | remaining = ktime_set(0, 0); | 469 | remaining = ktime_set(0, 0); |
533 | 470 | ||
534 | ns = mod_64(ktime_to_ns(remaining), apic->timer.period); | 471 | ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period); |
535 | tmcct = div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); | 472 | tmcct = div64_u64(ns, |
473 | (APIC_BUS_CYCLE_NS * apic->divide_count)); | ||
536 | 474 | ||
537 | return tmcct; | 475 | return tmcct; |
538 | } | 476 | } |
@@ -619,25 +557,25 @@ static void update_divide_count(struct kvm_lapic *apic) | |||
619 | tdcr = apic_get_reg(apic, APIC_TDCR); | 557 | tdcr = apic_get_reg(apic, APIC_TDCR); |
620 | tmp1 = tdcr & 0xf; | 558 | tmp1 = tdcr & 0xf; |
621 | tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; | 559 | tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; |
622 | apic->timer.divide_count = 0x1 << (tmp2 & 0x7); | 560 | apic->divide_count = 0x1 << (tmp2 & 0x7); |
623 | 561 | ||
624 | apic_debug("timer divide count is 0x%x\n", | 562 | apic_debug("timer divide count is 0x%x\n", |
625 | apic->timer.divide_count); | 563 | apic->divide_count); |
626 | } | 564 | } |
627 | 565 | ||
628 | static void start_apic_timer(struct kvm_lapic *apic) | 566 | static void start_apic_timer(struct kvm_lapic *apic) |
629 | { | 567 | { |
630 | ktime_t now = apic->timer.dev.base->get_time(); | 568 | ktime_t now = apic->lapic_timer.timer.base->get_time(); |
631 | 569 | ||
632 | apic->timer.period = apic_get_reg(apic, APIC_TMICT) * | 570 | apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) * |
633 | APIC_BUS_CYCLE_NS * apic->timer.divide_count; | 571 | APIC_BUS_CYCLE_NS * apic->divide_count; |
634 | atomic_set(&apic->timer.pending, 0); | 572 | atomic_set(&apic->lapic_timer.pending, 0); |
635 | 573 | ||
636 | if (!apic->timer.period) | 574 | if (!apic->lapic_timer.period) |
637 | return; | 575 | return; |
638 | 576 | ||
639 | hrtimer_start(&apic->timer.dev, | 577 | hrtimer_start(&apic->lapic_timer.timer, |
640 | ktime_add_ns(now, apic->timer.period), | 578 | ktime_add_ns(now, apic->lapic_timer.period), |
641 | HRTIMER_MODE_ABS); | 579 | HRTIMER_MODE_ABS); |
642 | 580 | ||
643 | apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" | 581 | apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" |
@@ -646,9 +584,9 @@ static void start_apic_timer(struct kvm_lapic *apic) | |||
646 | "expire @ 0x%016" PRIx64 ".\n", __func__, | 584 | "expire @ 0x%016" PRIx64 ".\n", __func__, |
647 | APIC_BUS_CYCLE_NS, ktime_to_ns(now), | 585 | APIC_BUS_CYCLE_NS, ktime_to_ns(now), |
648 | apic_get_reg(apic, APIC_TMICT), | 586 | apic_get_reg(apic, APIC_TMICT), |
649 | apic->timer.period, | 587 | apic->lapic_timer.period, |
650 | ktime_to_ns(ktime_add_ns(now, | 588 | ktime_to_ns(ktime_add_ns(now, |
651 | apic->timer.period))); | 589 | apic->lapic_timer.period))); |
652 | } | 590 | } |
653 | 591 | ||
654 | static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) | 592 | static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) |
@@ -730,7 +668,7 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
730 | apic_set_reg(apic, APIC_LVTT + 0x10 * i, | 668 | apic_set_reg(apic, APIC_LVTT + 0x10 * i, |
731 | lvt_val | APIC_LVT_MASKED); | 669 | lvt_val | APIC_LVT_MASKED); |
732 | } | 670 | } |
733 | atomic_set(&apic->timer.pending, 0); | 671 | atomic_set(&apic->lapic_timer.pending, 0); |
734 | 672 | ||
735 | } | 673 | } |
736 | break; | 674 | break; |
@@ -762,7 +700,7 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
762 | break; | 700 | break; |
763 | 701 | ||
764 | case APIC_TMICT: | 702 | case APIC_TMICT: |
765 | hrtimer_cancel(&apic->timer.dev); | 703 | hrtimer_cancel(&apic->lapic_timer.timer); |
766 | apic_set_reg(apic, APIC_TMICT, val); | 704 | apic_set_reg(apic, APIC_TMICT, val); |
767 | start_apic_timer(apic); | 705 | start_apic_timer(apic); |
768 | return; | 706 | return; |
@@ -802,7 +740,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) | |||
802 | if (!vcpu->arch.apic) | 740 | if (!vcpu->arch.apic) |
803 | return; | 741 | return; |
804 | 742 | ||
805 | hrtimer_cancel(&vcpu->arch.apic->timer.dev); | 743 | hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); |
806 | 744 | ||
807 | if (vcpu->arch.apic->regs_page) | 745 | if (vcpu->arch.apic->regs_page) |
808 | __free_page(vcpu->arch.apic->regs_page); | 746 | __free_page(vcpu->arch.apic->regs_page); |
@@ -880,7 +818,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
880 | ASSERT(apic != NULL); | 818 | ASSERT(apic != NULL); |
881 | 819 | ||
882 | /* Stop the timer in case it's a reset to an active apic */ | 820 | /* Stop the timer in case it's a reset to an active apic */ |
883 | hrtimer_cancel(&apic->timer.dev); | 821 | hrtimer_cancel(&apic->lapic_timer.timer); |
884 | 822 | ||
885 | apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); | 823 | apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); |
886 | apic_set_reg(apic, APIC_LVR, APIC_VERSION); | 824 | apic_set_reg(apic, APIC_LVR, APIC_VERSION); |
@@ -905,11 +843,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
905 | apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); | 843 | apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); |
906 | } | 844 | } |
907 | update_divide_count(apic); | 845 | update_divide_count(apic); |
908 | atomic_set(&apic->timer.pending, 0); | 846 | atomic_set(&apic->lapic_timer.pending, 0); |
909 | if (vcpu->vcpu_id == 0) | 847 | if (vcpu->vcpu_id == 0) |
910 | vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; | 848 | vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; |
911 | apic_update_ppr(apic); | 849 | apic_update_ppr(apic); |
912 | 850 | ||
851 | vcpu->arch.apic_arb_prio = 0; | ||
852 | |||
913 | apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" | 853 | apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" |
914 | "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, | 854 | "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, |
915 | vcpu, kvm_apic_id(apic), | 855 | vcpu, kvm_apic_id(apic), |
@@ -917,16 +857,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
917 | } | 857 | } |
918 | EXPORT_SYMBOL_GPL(kvm_lapic_reset); | 858 | EXPORT_SYMBOL_GPL(kvm_lapic_reset); |
919 | 859 | ||
920 | int kvm_lapic_enabled(struct kvm_vcpu *vcpu) | 860 | bool kvm_apic_present(struct kvm_vcpu *vcpu) |
921 | { | 861 | { |
922 | struct kvm_lapic *apic = vcpu->arch.apic; | 862 | return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic); |
923 | int ret = 0; | 863 | } |
924 | |||
925 | if (!apic) | ||
926 | return 0; | ||
927 | ret = apic_enabled(apic); | ||
928 | 864 | ||
929 | return ret; | 865 | int kvm_lapic_enabled(struct kvm_vcpu *vcpu) |
866 | { | ||
867 | return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); | ||
930 | } | 868 | } |
931 | EXPORT_SYMBOL_GPL(kvm_lapic_enabled); | 869 | EXPORT_SYMBOL_GPL(kvm_lapic_enabled); |
932 | 870 | ||
@@ -936,22 +874,11 @@ EXPORT_SYMBOL_GPL(kvm_lapic_enabled); | |||
936 | *---------------------------------------------------------------------- | 874 | *---------------------------------------------------------------------- |
937 | */ | 875 | */ |
938 | 876 | ||
939 | /* TODO: make sure __apic_timer_fn runs in current pCPU */ | 877 | static bool lapic_is_periodic(struct kvm_timer *ktimer) |
940 | static int __apic_timer_fn(struct kvm_lapic *apic) | ||
941 | { | 878 | { |
942 | int result = 0; | 879 | struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, |
943 | wait_queue_head_t *q = &apic->vcpu->wq; | 880 | lapic_timer); |
944 | 881 | return apic_lvtt_period(apic); | |
945 | if(!atomic_inc_and_test(&apic->timer.pending)) | ||
946 | set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); | ||
947 | if (waitqueue_active(q)) | ||
948 | wake_up_interruptible(q); | ||
949 | |||
950 | if (apic_lvtt_period(apic)) { | ||
951 | result = 1; | ||
952 | hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period); | ||
953 | } | ||
954 | return result; | ||
955 | } | 882 | } |
956 | 883 | ||
957 | int apic_has_pending_timer(struct kvm_vcpu *vcpu) | 884 | int apic_has_pending_timer(struct kvm_vcpu *vcpu) |
@@ -959,7 +886,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) | |||
959 | struct kvm_lapic *lapic = vcpu->arch.apic; | 886 | struct kvm_lapic *lapic = vcpu->arch.apic; |
960 | 887 | ||
961 | if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) | 888 | if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) |
962 | return atomic_read(&lapic->timer.pending); | 889 | return atomic_read(&lapic->lapic_timer.pending); |
963 | 890 | ||
964 | return 0; | 891 | return 0; |
965 | } | 892 | } |
@@ -986,20 +913,9 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) | |||
986 | kvm_apic_local_deliver(apic, APIC_LVT0); | 913 | kvm_apic_local_deliver(apic, APIC_LVT0); |
987 | } | 914 | } |
988 | 915 | ||
989 | static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) | 916 | static struct kvm_timer_ops lapic_timer_ops = { |
990 | { | 917 | .is_periodic = lapic_is_periodic, |
991 | struct kvm_lapic *apic; | 918 | }; |
992 | int restart_timer = 0; | ||
993 | |||
994 | apic = container_of(data, struct kvm_lapic, timer.dev); | ||
995 | |||
996 | restart_timer = __apic_timer_fn(apic); | ||
997 | |||
998 | if (restart_timer) | ||
999 | return HRTIMER_RESTART; | ||
1000 | else | ||
1001 | return HRTIMER_NORESTART; | ||
1002 | } | ||
1003 | 919 | ||
1004 | int kvm_create_lapic(struct kvm_vcpu *vcpu) | 920 | int kvm_create_lapic(struct kvm_vcpu *vcpu) |
1005 | { | 921 | { |
@@ -1024,8 +940,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) | |||
1024 | memset(apic->regs, 0, PAGE_SIZE); | 940 | memset(apic->regs, 0, PAGE_SIZE); |
1025 | apic->vcpu = vcpu; | 941 | apic->vcpu = vcpu; |
1026 | 942 | ||
1027 | hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 943 | hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, |
1028 | apic->timer.dev.function = apic_timer_fn; | 944 | HRTIMER_MODE_ABS); |
945 | apic->lapic_timer.timer.function = kvm_timer_fn; | ||
946 | apic->lapic_timer.t_ops = &lapic_timer_ops; | ||
947 | apic->lapic_timer.kvm = vcpu->kvm; | ||
948 | apic->lapic_timer.vcpu_id = vcpu->vcpu_id; | ||
949 | |||
1029 | apic->base_address = APIC_DEFAULT_PHYS_BASE; | 950 | apic->base_address = APIC_DEFAULT_PHYS_BASE; |
1030 | vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; | 951 | vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; |
1031 | 952 | ||
@@ -1078,9 +999,9 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) | |||
1078 | { | 999 | { |
1079 | struct kvm_lapic *apic = vcpu->arch.apic; | 1000 | struct kvm_lapic *apic = vcpu->arch.apic; |
1080 | 1001 | ||
1081 | if (apic && atomic_read(&apic->timer.pending) > 0) { | 1002 | if (apic && atomic_read(&apic->lapic_timer.pending) > 0) { |
1082 | if (kvm_apic_local_deliver(apic, APIC_LVTT)) | 1003 | if (kvm_apic_local_deliver(apic, APIC_LVTT)) |
1083 | atomic_dec(&apic->timer.pending); | 1004 | atomic_dec(&apic->lapic_timer.pending); |
1084 | } | 1005 | } |
1085 | } | 1006 | } |
1086 | 1007 | ||
@@ -1106,7 +1027,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) | |||
1106 | MSR_IA32_APICBASE_BASE; | 1027 | MSR_IA32_APICBASE_BASE; |
1107 | apic_set_reg(apic, APIC_LVR, APIC_VERSION); | 1028 | apic_set_reg(apic, APIC_LVR, APIC_VERSION); |
1108 | apic_update_ppr(apic); | 1029 | apic_update_ppr(apic); |
1109 | hrtimer_cancel(&apic->timer.dev); | 1030 | hrtimer_cancel(&apic->lapic_timer.timer); |
1110 | update_divide_count(apic); | 1031 | update_divide_count(apic); |
1111 | start_apic_timer(apic); | 1032 | start_apic_timer(apic); |
1112 | } | 1033 | } |
@@ -1119,7 +1040,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) | |||
1119 | if (!apic) | 1040 | if (!apic) |
1120 | return; | 1041 | return; |
1121 | 1042 | ||
1122 | timer = &apic->timer.dev; | 1043 | timer = &apic->lapic_timer.timer; |
1123 | if (hrtimer_cancel(timer)) | 1044 | if (hrtimer_cancel(timer)) |
1124 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); | 1045 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); |
1125 | } | 1046 | } |
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 45ab6ee71209..a587f8349c46 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h | |||
@@ -2,18 +2,15 @@ | |||
2 | #define __KVM_X86_LAPIC_H | 2 | #define __KVM_X86_LAPIC_H |
3 | 3 | ||
4 | #include "iodev.h" | 4 | #include "iodev.h" |
5 | #include "kvm_timer.h" | ||
5 | 6 | ||
6 | #include <linux/kvm_host.h> | 7 | #include <linux/kvm_host.h> |
7 | 8 | ||
8 | struct kvm_lapic { | 9 | struct kvm_lapic { |
9 | unsigned long base_address; | 10 | unsigned long base_address; |
10 | struct kvm_io_device dev; | 11 | struct kvm_io_device dev; |
11 | struct { | 12 | struct kvm_timer lapic_timer; |
12 | atomic_t pending; | 13 | u32 divide_count; |
13 | s64 period; /* unit: ns */ | ||
14 | u32 divide_count; | ||
15 | struct hrtimer dev; | ||
16 | } timer; | ||
17 | struct kvm_vcpu *vcpu; | 14 | struct kvm_vcpu *vcpu; |
18 | struct page *regs_page; | 15 | struct page *regs_page; |
19 | void *regs; | 16 | void *regs; |
@@ -34,12 +31,13 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); | |||
34 | 31 | ||
35 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); | 32 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); |
36 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); | 33 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); |
37 | int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig); | 34 | int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); |
38 | 35 | ||
39 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); | 36 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); |
40 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); | 37 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); |
41 | void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); | 38 | void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); |
42 | int kvm_lapic_enabled(struct kvm_vcpu *vcpu); | 39 | int kvm_lapic_enabled(struct kvm_vcpu *vcpu); |
40 | bool kvm_apic_present(struct kvm_vcpu *vcpu); | ||
43 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); | 41 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); |
44 | 42 | ||
45 | void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); | 43 | void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); |
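For context on the lapic.c change above: lowest-priority delivery no longer walks a vcpu bitmap with kvm_apic_round_robin(); each APIC_DM_LOWEST delivery bumps the target's apic_arb_prio counter, and kvm_apic_compare_prio() lets the caller pick the vcpu with the smallest value. A minimal user-space sketch of that arbitration (toy types and selection loop, not the kernel's kvm_irq_delivery_to_apic()) behaves as follows:

	/* Toy model of the new lowest-priority arbitration: the vcpu with the
	 * smallest apic_arb_prio wins and the winner's counter is bumped, so
	 * deliveries spread across vcpus over time. Types are illustrative. */
	#include <stdio.h>

	struct toy_vcpu { int id; int apic_arb_prio; };

	static int compare_prio(struct toy_vcpu *a, struct toy_vcpu *b)
	{
		return a->apic_arb_prio - b->apic_arb_prio; /* mirrors kvm_apic_compare_prio() */
	}

	int main(void)
	{
		struct toy_vcpu vcpus[2] = { { 0, 0 }, { 1, 0 } };
		int i, round;

		for (round = 0; round < 4; round++) {
			struct toy_vcpu *lowest = &vcpus[0];

			for (i = 1; i < 2; i++)
				if (compare_prio(&vcpus[i], lowest) < 0)
					lowest = &vcpus[i];

			lowest->apic_arb_prio++;	/* as APIC_DM_LOWEST does */
			printf("round %d -> vcpu %d\n", round, lowest->id);
		}
		return 0;
	}

Run repeatedly, the winner alternates between the vcpus, which is the spreading effect the counter is meant to provide.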
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 32cf11e5728a..5c3d6e81a7dc 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -126,6 +126,7 @@ module_param(oos_shadow, bool, 0644); | |||
126 | #define PFERR_PRESENT_MASK (1U << 0) | 126 | #define PFERR_PRESENT_MASK (1U << 0) |
127 | #define PFERR_WRITE_MASK (1U << 1) | 127 | #define PFERR_WRITE_MASK (1U << 1) |
128 | #define PFERR_USER_MASK (1U << 2) | 128 | #define PFERR_USER_MASK (1U << 2) |
129 | #define PFERR_RSVD_MASK (1U << 3) | ||
129 | #define PFERR_FETCH_MASK (1U << 4) | 130 | #define PFERR_FETCH_MASK (1U << 4) |
130 | 131 | ||
131 | #define PT_DIRECTORY_LEVEL 2 | 132 | #define PT_DIRECTORY_LEVEL 2 |
@@ -177,7 +178,11 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ | |||
177 | static u64 __read_mostly shadow_user_mask; | 178 | static u64 __read_mostly shadow_user_mask; |
178 | static u64 __read_mostly shadow_accessed_mask; | 179 | static u64 __read_mostly shadow_accessed_mask; |
179 | static u64 __read_mostly shadow_dirty_mask; | 180 | static u64 __read_mostly shadow_dirty_mask; |
180 | static u64 __read_mostly shadow_mt_mask; | 181 | |
182 | static inline u64 rsvd_bits(int s, int e) | ||
183 | { | ||
184 | return ((1ULL << (e - s + 1)) - 1) << s; | ||
185 | } | ||
181 | 186 | ||
182 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) | 187 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) |
183 | { | 188 | { |
@@ -193,14 +198,13 @@ void kvm_mmu_set_base_ptes(u64 base_pte) | |||
193 | EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); | 198 | EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); |
194 | 199 | ||
195 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, | 200 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, |
196 | u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask) | 201 | u64 dirty_mask, u64 nx_mask, u64 x_mask) |
197 | { | 202 | { |
198 | shadow_user_mask = user_mask; | 203 | shadow_user_mask = user_mask; |
199 | shadow_accessed_mask = accessed_mask; | 204 | shadow_accessed_mask = accessed_mask; |
200 | shadow_dirty_mask = dirty_mask; | 205 | shadow_dirty_mask = dirty_mask; |
201 | shadow_nx_mask = nx_mask; | 206 | shadow_nx_mask = nx_mask; |
202 | shadow_x_mask = x_mask; | 207 | shadow_x_mask = x_mask; |
203 | shadow_mt_mask = mt_mask; | ||
204 | } | 208 | } |
205 | EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); | 209 | EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); |
206 | 210 | ||
@@ -219,11 +223,6 @@ static int is_nx(struct kvm_vcpu *vcpu) | |||
219 | return vcpu->arch.shadow_efer & EFER_NX; | 223 | return vcpu->arch.shadow_efer & EFER_NX; |
220 | } | 224 | } |
221 | 225 | ||
222 | static int is_present_pte(unsigned long pte) | ||
223 | { | ||
224 | return pte & PT_PRESENT_MASK; | ||
225 | } | ||
226 | |||
227 | static int is_shadow_present_pte(u64 pte) | 226 | static int is_shadow_present_pte(u64 pte) |
228 | { | 227 | { |
229 | return pte != shadow_trap_nonpresent_pte | 228 | return pte != shadow_trap_nonpresent_pte |
@@ -1074,18 +1073,10 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) | |||
1074 | return NULL; | 1073 | return NULL; |
1075 | } | 1074 | } |
1076 | 1075 | ||
1077 | static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1078 | { | ||
1079 | list_del(&sp->oos_link); | ||
1080 | --kvm->stat.mmu_unsync_global; | ||
1081 | } | ||
1082 | |||
1083 | static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) | 1076 | static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) |
1084 | { | 1077 | { |
1085 | WARN_ON(!sp->unsync); | 1078 | WARN_ON(!sp->unsync); |
1086 | sp->unsync = 0; | 1079 | sp->unsync = 0; |
1087 | if (sp->global) | ||
1088 | kvm_unlink_unsync_global(kvm, sp); | ||
1089 | --kvm->stat.mmu_unsync; | 1080 | --kvm->stat.mmu_unsync; |
1090 | } | 1081 | } |
1091 | 1082 | ||
@@ -1248,7 +1239,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1248 | pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); | 1239 | pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); |
1249 | sp->gfn = gfn; | 1240 | sp->gfn = gfn; |
1250 | sp->role = role; | 1241 | sp->role = role; |
1251 | sp->global = 0; | ||
1252 | hlist_add_head(&sp->hash_link, bucket); | 1242 | hlist_add_head(&sp->hash_link, bucket); |
1253 | if (!direct) { | 1243 | if (!direct) { |
1254 | if (rmap_write_protect(vcpu->kvm, gfn)) | 1244 | if (rmap_write_protect(vcpu->kvm, gfn)) |
@@ -1616,7 +1606,7 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state, | |||
1616 | return mtrr_state->def_type; | 1606 | return mtrr_state->def_type; |
1617 | } | 1607 | } |
1618 | 1608 | ||
1619 | static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) | 1609 | u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) |
1620 | { | 1610 | { |
1621 | u8 mtrr; | 1611 | u8 mtrr; |
1622 | 1612 | ||
@@ -1626,6 +1616,7 @@ static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) | |||
1626 | mtrr = MTRR_TYPE_WRBACK; | 1616 | mtrr = MTRR_TYPE_WRBACK; |
1627 | return mtrr; | 1617 | return mtrr; |
1628 | } | 1618 | } |
1619 | EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); | ||
1629 | 1620 | ||
1630 | static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | 1621 | static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) |
1631 | { | 1622 | { |
@@ -1646,11 +1637,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
1646 | ++vcpu->kvm->stat.mmu_unsync; | 1637 | ++vcpu->kvm->stat.mmu_unsync; |
1647 | sp->unsync = 1; | 1638 | sp->unsync = 1; |
1648 | 1639 | ||
1649 | if (sp->global) { | 1640 | kvm_mmu_mark_parents_unsync(vcpu, sp); |
1650 | list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages); | ||
1651 | ++vcpu->kvm->stat.mmu_unsync_global; | ||
1652 | } else | ||
1653 | kvm_mmu_mark_parents_unsync(vcpu, sp); | ||
1654 | 1641 | ||
1655 | mmu_convert_notrap(sp); | 1642 | mmu_convert_notrap(sp); |
1656 | return 0; | 1643 | return 0; |
@@ -1677,21 +1664,11 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, | |||
1677 | static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | 1664 | static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, |
1678 | unsigned pte_access, int user_fault, | 1665 | unsigned pte_access, int user_fault, |
1679 | int write_fault, int dirty, int largepage, | 1666 | int write_fault, int dirty, int largepage, |
1680 | int global, gfn_t gfn, pfn_t pfn, bool speculative, | 1667 | gfn_t gfn, pfn_t pfn, bool speculative, |
1681 | bool can_unsync) | 1668 | bool can_unsync) |
1682 | { | 1669 | { |
1683 | u64 spte; | 1670 | u64 spte; |
1684 | int ret = 0; | 1671 | int ret = 0; |
1685 | u64 mt_mask = shadow_mt_mask; | ||
1686 | struct kvm_mmu_page *sp = page_header(__pa(shadow_pte)); | ||
1687 | |||
1688 | if (!global && sp->global) { | ||
1689 | sp->global = 0; | ||
1690 | if (sp->unsync) { | ||
1691 | kvm_unlink_unsync_global(vcpu->kvm, sp); | ||
1692 | kvm_mmu_mark_parents_unsync(vcpu, sp); | ||
1693 | } | ||
1694 | } | ||
1695 | 1672 | ||
1696 | /* | 1673 | /* |
1697 | * We don't set the accessed bit, since we sometimes want to see | 1674 | * We don't set the accessed bit, since we sometimes want to see |
@@ -1711,16 +1688,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1711 | spte |= shadow_user_mask; | 1688 | spte |= shadow_user_mask; |
1712 | if (largepage) | 1689 | if (largepage) |
1713 | spte |= PT_PAGE_SIZE_MASK; | 1690 | spte |= PT_PAGE_SIZE_MASK; |
1714 | if (mt_mask) { | 1691 | if (tdp_enabled) |
1715 | if (!kvm_is_mmio_pfn(pfn)) { | 1692 | spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, |
1716 | mt_mask = get_memory_type(vcpu, gfn) << | 1693 | kvm_is_mmio_pfn(pfn)); |
1717 | kvm_x86_ops->get_mt_mask_shift(); | ||
1718 | mt_mask |= VMX_EPT_IGMT_BIT; | ||
1719 | } else | ||
1720 | mt_mask = MTRR_TYPE_UNCACHABLE << | ||
1721 | kvm_x86_ops->get_mt_mask_shift(); | ||
1722 | spte |= mt_mask; | ||
1723 | } | ||
1724 | 1694 | ||
1725 | spte |= (u64)pfn << PAGE_SHIFT; | 1695 | spte |= (u64)pfn << PAGE_SHIFT; |
1726 | 1696 | ||
@@ -1765,8 +1735,8 @@ set_pte: | |||
1765 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | 1735 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, |
1766 | unsigned pt_access, unsigned pte_access, | 1736 | unsigned pt_access, unsigned pte_access, |
1767 | int user_fault, int write_fault, int dirty, | 1737 | int user_fault, int write_fault, int dirty, |
1768 | int *ptwrite, int largepage, int global, | 1738 | int *ptwrite, int largepage, gfn_t gfn, |
1769 | gfn_t gfn, pfn_t pfn, bool speculative) | 1739 | pfn_t pfn, bool speculative) |
1770 | { | 1740 | { |
1771 | int was_rmapped = 0; | 1741 | int was_rmapped = 0; |
1772 | int was_writeble = is_writeble_pte(*shadow_pte); | 1742 | int was_writeble = is_writeble_pte(*shadow_pte); |
@@ -1795,7 +1765,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1795 | was_rmapped = 1; | 1765 | was_rmapped = 1; |
1796 | } | 1766 | } |
1797 | if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, | 1767 | if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, |
1798 | dirty, largepage, global, gfn, pfn, speculative, true)) { | 1768 | dirty, largepage, gfn, pfn, speculative, true)) { |
1799 | if (write_fault) | 1769 | if (write_fault) |
1800 | *ptwrite = 1; | 1770 | *ptwrite = 1; |
1801 | kvm_x86_ops->tlb_flush(vcpu); | 1771 | kvm_x86_ops->tlb_flush(vcpu); |
@@ -1843,7 +1813,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
1843 | || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { | 1813 | || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { |
1844 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, | 1814 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, |
1845 | 0, write, 1, &pt_write, | 1815 | 0, write, 1, &pt_write, |
1846 | largepage, 0, gfn, pfn, false); | 1816 | largepage, gfn, pfn, false); |
1847 | ++vcpu->stat.pf_fixed; | 1817 | ++vcpu->stat.pf_fixed; |
1848 | break; | 1818 | break; |
1849 | } | 1819 | } |
@@ -1942,7 +1912,19 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) | |||
1942 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | 1912 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; |
1943 | } | 1913 | } |
1944 | 1914 | ||
1945 | static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | 1915 | static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) |
1916 | { | ||
1917 | int ret = 0; | ||
1918 | |||
1919 | if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { | ||
1920 | set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); | ||
1921 | ret = 1; | ||
1922 | } | ||
1923 | |||
1924 | return ret; | ||
1925 | } | ||
1926 | |||
1927 | static int mmu_alloc_roots(struct kvm_vcpu *vcpu) | ||
1946 | { | 1928 | { |
1947 | int i; | 1929 | int i; |
1948 | gfn_t root_gfn; | 1930 | gfn_t root_gfn; |
@@ -1957,13 +1939,15 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | |||
1957 | ASSERT(!VALID_PAGE(root)); | 1939 | ASSERT(!VALID_PAGE(root)); |
1958 | if (tdp_enabled) | 1940 | if (tdp_enabled) |
1959 | direct = 1; | 1941 | direct = 1; |
1942 | if (mmu_check_root(vcpu, root_gfn)) | ||
1943 | return 1; | ||
1960 | sp = kvm_mmu_get_page(vcpu, root_gfn, 0, | 1944 | sp = kvm_mmu_get_page(vcpu, root_gfn, 0, |
1961 | PT64_ROOT_LEVEL, direct, | 1945 | PT64_ROOT_LEVEL, direct, |
1962 | ACC_ALL, NULL); | 1946 | ACC_ALL, NULL); |
1963 | root = __pa(sp->spt); | 1947 | root = __pa(sp->spt); |
1964 | ++sp->root_count; | 1948 | ++sp->root_count; |
1965 | vcpu->arch.mmu.root_hpa = root; | 1949 | vcpu->arch.mmu.root_hpa = root; |
1966 | return; | 1950 | return 0; |
1967 | } | 1951 | } |
1968 | direct = !is_paging(vcpu); | 1952 | direct = !is_paging(vcpu); |
1969 | if (tdp_enabled) | 1953 | if (tdp_enabled) |
@@ -1980,6 +1964,8 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | |||
1980 | root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; | 1964 | root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; |
1981 | } else if (vcpu->arch.mmu.root_level == 0) | 1965 | } else if (vcpu->arch.mmu.root_level == 0) |
1982 | root_gfn = 0; | 1966 | root_gfn = 0; |
1967 | if (mmu_check_root(vcpu, root_gfn)) | ||
1968 | return 1; | ||
1983 | sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, | 1969 | sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, |
1984 | PT32_ROOT_LEVEL, direct, | 1970 | PT32_ROOT_LEVEL, direct, |
1985 | ACC_ALL, NULL); | 1971 | ACC_ALL, NULL); |
@@ -1988,6 +1974,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | |||
1988 | vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; | 1974 | vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; |
1989 | } | 1975 | } |
1990 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); | 1976 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); |
1977 | return 0; | ||
1991 | } | 1978 | } |
1992 | 1979 | ||
1993 | static void mmu_sync_roots(struct kvm_vcpu *vcpu) | 1980 | static void mmu_sync_roots(struct kvm_vcpu *vcpu) |
@@ -2006,7 +1993,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
2006 | for (i = 0; i < 4; ++i) { | 1993 | for (i = 0; i < 4; ++i) { |
2007 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | 1994 | hpa_t root = vcpu->arch.mmu.pae_root[i]; |
2008 | 1995 | ||
2009 | if (root) { | 1996 | if (root && VALID_PAGE(root)) { |
2010 | root &= PT64_BASE_ADDR_MASK; | 1997 | root &= PT64_BASE_ADDR_MASK; |
2011 | sp = page_header(root); | 1998 | sp = page_header(root); |
2012 | mmu_sync_children(vcpu, sp); | 1999 | mmu_sync_children(vcpu, sp); |
@@ -2014,15 +2001,6 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
2014 | } | 2001 | } |
2015 | } | 2002 | } |
2016 | 2003 | ||
2017 | static void mmu_sync_global(struct kvm_vcpu *vcpu) | ||
2018 | { | ||
2019 | struct kvm *kvm = vcpu->kvm; | ||
2020 | struct kvm_mmu_page *sp, *n; | ||
2021 | |||
2022 | list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link) | ||
2023 | kvm_sync_page(vcpu, sp); | ||
2024 | } | ||
2025 | |||
2026 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | 2004 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) |
2027 | { | 2005 | { |
2028 | spin_lock(&vcpu->kvm->mmu_lock); | 2006 | spin_lock(&vcpu->kvm->mmu_lock); |
@@ -2030,13 +2008,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
2030 | spin_unlock(&vcpu->kvm->mmu_lock); | 2008 | spin_unlock(&vcpu->kvm->mmu_lock); |
2031 | } | 2009 | } |
2032 | 2010 | ||
2033 | void kvm_mmu_sync_global(struct kvm_vcpu *vcpu) | ||
2034 | { | ||
2035 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2036 | mmu_sync_global(vcpu); | ||
2037 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2038 | } | ||
2039 | |||
2040 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) | 2011 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) |
2041 | { | 2012 | { |
2042 | return vaddr; | 2013 | return vaddr; |
@@ -2151,6 +2122,14 @@ static void paging_free(struct kvm_vcpu *vcpu) | |||
2151 | nonpaging_free(vcpu); | 2122 | nonpaging_free(vcpu); |
2152 | } | 2123 | } |
2153 | 2124 | ||
2125 | static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) | ||
2126 | { | ||
2127 | int bit7; | ||
2128 | |||
2129 | bit7 = (gpte >> 7) & 1; | ||
2130 | return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; | ||
2131 | } | ||
2132 | |||
2154 | #define PTTYPE 64 | 2133 | #define PTTYPE 64 |
2155 | #include "paging_tmpl.h" | 2134 | #include "paging_tmpl.h" |
2156 | #undef PTTYPE | 2135 | #undef PTTYPE |
@@ -2159,6 +2138,59 @@ static void paging_free(struct kvm_vcpu *vcpu) | |||
2159 | #include "paging_tmpl.h" | 2138 | #include "paging_tmpl.h" |
2160 | #undef PTTYPE | 2139 | #undef PTTYPE |
2161 | 2140 | ||
2141 | static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) | ||
2142 | { | ||
2143 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
2144 | int maxphyaddr = cpuid_maxphyaddr(vcpu); | ||
2145 | u64 exb_bit_rsvd = 0; | ||
2146 | |||
2147 | if (!is_nx(vcpu)) | ||
2148 | exb_bit_rsvd = rsvd_bits(63, 63); | ||
2149 | switch (level) { | ||
2150 | case PT32_ROOT_LEVEL: | ||
2151 | /* no rsvd bits for 2 level 4K page table entries */ | ||
2152 | context->rsvd_bits_mask[0][1] = 0; | ||
2153 | context->rsvd_bits_mask[0][0] = 0; | ||
2154 | if (is_cpuid_PSE36()) | ||
2155 | /* 36bits PSE 4MB page */ | ||
2156 | context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); | ||
2157 | else | ||
2158 | /* 32 bits PSE 4MB page */ | ||
2159 | context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); | ||
2160 | context->rsvd_bits_mask[1][0] = ~0ull; | ||
2161 | break; | ||
2162 | case PT32E_ROOT_LEVEL: | ||
2163 | context->rsvd_bits_mask[0][2] = | ||
2164 | rsvd_bits(maxphyaddr, 63) | | ||
2165 | rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ | ||
2166 | context->rsvd_bits_mask[0][1] = exb_bit_rsvd | | ||
2167 | rsvd_bits(maxphyaddr, 62); /* PDE */ | ||
2168 | context->rsvd_bits_mask[0][0] = exb_bit_rsvd | | ||
2169 | rsvd_bits(maxphyaddr, 62); /* PTE */ | ||
2170 | context->rsvd_bits_mask[1][1] = exb_bit_rsvd | | ||
2171 | rsvd_bits(maxphyaddr, 62) | | ||
2172 | rsvd_bits(13, 20); /* large page */ | ||
2173 | context->rsvd_bits_mask[1][0] = ~0ull; | ||
2174 | break; | ||
2175 | case PT64_ROOT_LEVEL: | ||
2176 | context->rsvd_bits_mask[0][3] = exb_bit_rsvd | | ||
2177 | rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); | ||
2178 | context->rsvd_bits_mask[0][2] = exb_bit_rsvd | | ||
2179 | rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); | ||
2180 | context->rsvd_bits_mask[0][1] = exb_bit_rsvd | | ||
2181 | rsvd_bits(maxphyaddr, 51); | ||
2182 | context->rsvd_bits_mask[0][0] = exb_bit_rsvd | | ||
2183 | rsvd_bits(maxphyaddr, 51); | ||
2184 | context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; | ||
2185 | context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2]; | ||
2186 | context->rsvd_bits_mask[1][1] = exb_bit_rsvd | | ||
2187 | rsvd_bits(maxphyaddr, 51) | | ||
2188 | rsvd_bits(13, 20); /* large page */ | ||
2189 | context->rsvd_bits_mask[1][0] = ~0ull; | ||
2190 | break; | ||
2191 | } | ||
2192 | } | ||
2193 | |||
2162 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | 2194 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) |
2163 | { | 2195 | { |
2164 | struct kvm_mmu *context = &vcpu->arch.mmu; | 2196 | struct kvm_mmu *context = &vcpu->arch.mmu; |
@@ -2179,6 +2211,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | |||
2179 | 2211 | ||
2180 | static int paging64_init_context(struct kvm_vcpu *vcpu) | 2212 | static int paging64_init_context(struct kvm_vcpu *vcpu) |
2181 | { | 2213 | { |
2214 | reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); | ||
2182 | return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); | 2215 | return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); |
2183 | } | 2216 | } |
2184 | 2217 | ||
@@ -2186,6 +2219,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) | |||
2186 | { | 2219 | { |
2187 | struct kvm_mmu *context = &vcpu->arch.mmu; | 2220 | struct kvm_mmu *context = &vcpu->arch.mmu; |
2188 | 2221 | ||
2222 | reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); | ||
2189 | context->new_cr3 = paging_new_cr3; | 2223 | context->new_cr3 = paging_new_cr3; |
2190 | context->page_fault = paging32_page_fault; | 2224 | context->page_fault = paging32_page_fault; |
2191 | context->gva_to_gpa = paging32_gva_to_gpa; | 2225 | context->gva_to_gpa = paging32_gva_to_gpa; |
@@ -2201,6 +2235,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) | |||
2201 | 2235 | ||
2202 | static int paging32E_init_context(struct kvm_vcpu *vcpu) | 2236 | static int paging32E_init_context(struct kvm_vcpu *vcpu) |
2203 | { | 2237 | { |
2238 | reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); | ||
2204 | return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); | 2239 | return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); |
2205 | } | 2240 | } |
2206 | 2241 | ||
@@ -2221,12 +2256,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
2221 | context->gva_to_gpa = nonpaging_gva_to_gpa; | 2256 | context->gva_to_gpa = nonpaging_gva_to_gpa; |
2222 | context->root_level = 0; | 2257 | context->root_level = 0; |
2223 | } else if (is_long_mode(vcpu)) { | 2258 | } else if (is_long_mode(vcpu)) { |
2259 | reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); | ||
2224 | context->gva_to_gpa = paging64_gva_to_gpa; | 2260 | context->gva_to_gpa = paging64_gva_to_gpa; |
2225 | context->root_level = PT64_ROOT_LEVEL; | 2261 | context->root_level = PT64_ROOT_LEVEL; |
2226 | } else if (is_pae(vcpu)) { | 2262 | } else if (is_pae(vcpu)) { |
2263 | reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); | ||
2227 | context->gva_to_gpa = paging64_gva_to_gpa; | 2264 | context->gva_to_gpa = paging64_gva_to_gpa; |
2228 | context->root_level = PT32E_ROOT_LEVEL; | 2265 | context->root_level = PT32E_ROOT_LEVEL; |
2229 | } else { | 2266 | } else { |
2267 | reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); | ||
2230 | context->gva_to_gpa = paging32_gva_to_gpa; | 2268 | context->gva_to_gpa = paging32_gva_to_gpa; |
2231 | context->root_level = PT32_ROOT_LEVEL; | 2269 | context->root_level = PT32_ROOT_LEVEL; |
2232 | } | 2270 | } |
@@ -2290,9 +2328,11 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) | |||
2290 | goto out; | 2328 | goto out; |
2291 | spin_lock(&vcpu->kvm->mmu_lock); | 2329 | spin_lock(&vcpu->kvm->mmu_lock); |
2292 | kvm_mmu_free_some_pages(vcpu); | 2330 | kvm_mmu_free_some_pages(vcpu); |
2293 | mmu_alloc_roots(vcpu); | 2331 | r = mmu_alloc_roots(vcpu); |
2294 | mmu_sync_roots(vcpu); | 2332 | mmu_sync_roots(vcpu); |
2295 | spin_unlock(&vcpu->kvm->mmu_lock); | 2333 | spin_unlock(&vcpu->kvm->mmu_lock); |
2334 | if (r) | ||
2335 | goto out; | ||
2296 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); | 2336 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); |
2297 | kvm_mmu_flush_tlb(vcpu); | 2337 | kvm_mmu_flush_tlb(vcpu); |
2298 | out: | 2338 | out: |
@@ -2638,14 +2678,6 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); | |||
2638 | 2678 | ||
2639 | static void free_mmu_pages(struct kvm_vcpu *vcpu) | 2679 | static void free_mmu_pages(struct kvm_vcpu *vcpu) |
2640 | { | 2680 | { |
2641 | struct kvm_mmu_page *sp; | ||
2642 | |||
2643 | while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) { | ||
2644 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, | ||
2645 | struct kvm_mmu_page, link); | ||
2646 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
2647 | cond_resched(); | ||
2648 | } | ||
2649 | free_page((unsigned long)vcpu->arch.mmu.pae_root); | 2681 | free_page((unsigned long)vcpu->arch.mmu.pae_root); |
2650 | } | 2682 | } |
2651 | 2683 | ||
@@ -2710,7 +2742,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
2710 | { | 2742 | { |
2711 | struct kvm_mmu_page *sp; | 2743 | struct kvm_mmu_page *sp; |
2712 | 2744 | ||
2713 | spin_lock(&kvm->mmu_lock); | ||
2714 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { | 2745 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { |
2715 | int i; | 2746 | int i; |
2716 | u64 *pt; | 2747 | u64 *pt; |
@@ -2725,7 +2756,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
2725 | pt[i] &= ~PT_WRITABLE_MASK; | 2756 | pt[i] &= ~PT_WRITABLE_MASK; |
2726 | } | 2757 | } |
2727 | kvm_flush_remote_tlbs(kvm); | 2758 | kvm_flush_remote_tlbs(kvm); |
2728 | spin_unlock(&kvm->mmu_lock); | ||
2729 | } | 2759 | } |
2730 | 2760 | ||
2731 | void kvm_mmu_zap_all(struct kvm *kvm) | 2761 | void kvm_mmu_zap_all(struct kvm *kvm) |
@@ -3007,11 +3037,13 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | |||
3007 | " in nonleaf level: levels %d gva %lx" | 3037 | " in nonleaf level: levels %d gva %lx" |
3008 | " level %d pte %llx\n", audit_msg, | 3038 | " level %d pte %llx\n", audit_msg, |
3009 | vcpu->arch.mmu.root_level, va, level, ent); | 3039 | vcpu->arch.mmu.root_level, va, level, ent); |
3010 | 3040 | else | |
3011 | audit_mappings_page(vcpu, ent, va, level - 1); | 3041 | audit_mappings_page(vcpu, ent, va, level - 1); |
3012 | } else { | 3042 | } else { |
3013 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); | 3043 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); |
3014 | hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT; | 3044 | gfn_t gfn = gpa >> PAGE_SHIFT; |
3045 | pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); | ||
3046 | hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; | ||
3015 | 3047 | ||
3016 | if (is_shadow_present_pte(ent) | 3048 | if (is_shadow_present_pte(ent) |
3017 | && (ent & PT64_BASE_ADDR_MASK) != hpa) | 3049 | && (ent & PT64_BASE_ADDR_MASK) != hpa) |
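The rsvd_bits() helper added to mmu.c above is the building block for the new reserved-bit checks: reset_rsvds_bits_mask() ORs such ranges into per-level masks and is_rsvd_bits_set() tests a guest PTE against them. A standalone sketch of the mask arithmetic, assuming a MAXPHYADDR of 36 purely for illustration:

	/* Standalone illustration of the reserved-bit mask arithmetic used by
	 * reset_rsvds_bits_mask(); maxphyaddr == 36 is an assumed example value. */
	#include <stdio.h>
	#include <stdint.h>

	static inline uint64_t rsvd_bits(int s, int e)
	{
		return ((1ULL << (e - s + 1)) - 1) << s;	/* bits s..e set */
	}

	int main(void)
	{
		int maxphyaddr = 36;				/* illustrative only */
		uint64_t pte_mask = rsvd_bits(maxphyaddr, 51);	/* 64-bit PTE */
		uint64_t pde_2m   = rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20);
		uint64_t bad_pte  = 1ULL << 40;			/* bit above phys width */

		printf("PTE reserved mask:     0x%016llx\n",
		       (unsigned long long)pte_mask);
		printf("2MB PDE reserved mask: 0x%016llx\n",
		       (unsigned long long)pde_2m);
		printf("bad_pte reserved? %d\n", (bad_pte & pte_mask) != 0);
		return 0;
	}

With such masks in place, any guest PTE bit set above the physical-address width (or in a range the paging mode reserves) is flagged before a shadow PTE is built for it.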
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index eaab2145f62b..3494a2fb136e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h | |||
@@ -75,4 +75,9 @@ static inline int is_paging(struct kvm_vcpu *vcpu) | |||
75 | return vcpu->arch.cr0 & X86_CR0_PG; | 75 | return vcpu->arch.cr0 & X86_CR0_PG; |
76 | } | 76 | } |
77 | 77 | ||
78 | static inline int is_present_pte(unsigned long pte) | ||
79 | { | ||
80 | return pte & PT_PRESENT_MASK; | ||
81 | } | ||
82 | |||
78 | #endif | 83 | #endif |
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6bd70206c561..258e4591e1ca 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -123,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, | |||
123 | gfn_t table_gfn; | 123 | gfn_t table_gfn; |
124 | unsigned index, pt_access, pte_access; | 124 | unsigned index, pt_access, pte_access; |
125 | gpa_t pte_gpa; | 125 | gpa_t pte_gpa; |
126 | int rsvd_fault = 0; | ||
126 | 127 | ||
127 | pgprintk("%s: addr %lx\n", __func__, addr); | 128 | pgprintk("%s: addr %lx\n", __func__, addr); |
128 | walk: | 129 | walk: |
@@ -157,6 +158,10 @@ walk: | |||
157 | if (!is_present_pte(pte)) | 158 | if (!is_present_pte(pte)) |
158 | goto not_present; | 159 | goto not_present; |
159 | 160 | ||
161 | rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); | ||
162 | if (rsvd_fault) | ||
163 | goto access_error; | ||
164 | |||
160 | if (write_fault && !is_writeble_pte(pte)) | 165 | if (write_fault && !is_writeble_pte(pte)) |
161 | if (user_fault || is_write_protection(vcpu)) | 166 | if (user_fault || is_write_protection(vcpu)) |
162 | goto access_error; | 167 | goto access_error; |
@@ -209,7 +214,6 @@ walk: | |||
209 | if (ret) | 214 | if (ret) |
210 | goto walk; | 215 | goto walk; |
211 | pte |= PT_DIRTY_MASK; | 216 | pte |= PT_DIRTY_MASK; |
212 | kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0); | ||
213 | walker->ptes[walker->level - 1] = pte; | 217 | walker->ptes[walker->level - 1] = pte; |
214 | } | 218 | } |
215 | 219 | ||
@@ -233,6 +237,8 @@ err: | |||
233 | walker->error_code |= PFERR_USER_MASK; | 237 | walker->error_code |= PFERR_USER_MASK; |
234 | if (fetch_fault) | 238 | if (fetch_fault) |
235 | walker->error_code |= PFERR_FETCH_MASK; | 239 | walker->error_code |= PFERR_FETCH_MASK; |
240 | if (rsvd_fault) | ||
241 | walker->error_code |= PFERR_RSVD_MASK; | ||
236 | return 0; | 242 | return 0; |
237 | } | 243 | } |
238 | 244 | ||
@@ -262,8 +268,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
262 | kvm_get_pfn(pfn); | 268 | kvm_get_pfn(pfn); |
263 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, | 269 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, |
264 | gpte & PT_DIRTY_MASK, NULL, largepage, | 270 | gpte & PT_DIRTY_MASK, NULL, largepage, |
265 | gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte), | 271 | gpte_to_gfn(gpte), pfn, true); |
266 | pfn, true); | ||
267 | } | 272 | } |
268 | 273 | ||
269 | /* | 274 | /* |
@@ -297,7 +302,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
297 | user_fault, write_fault, | 302 | user_fault, write_fault, |
298 | gw->ptes[gw->level-1] & PT_DIRTY_MASK, | 303 | gw->ptes[gw->level-1] & PT_DIRTY_MASK, |
299 | ptwrite, largepage, | 304 | ptwrite, largepage, |
300 | gw->ptes[gw->level-1] & PT_GLOBAL_MASK, | ||
301 | gw->gfn, pfn, false); | 305 | gw->gfn, pfn, false); |
302 | break; | 306 | break; |
303 | } | 307 | } |
@@ -380,7 +384,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
380 | return r; | 384 | return r; |
381 | 385 | ||
382 | /* | 386 | /* |
383 | * Look up the shadow pte for the faulting address. | 387 | * Look up the guest pte for the faulting address. |
384 | */ | 388 | */ |
385 | r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, | 389 | r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, |
386 | fetch_fault); | 390 | fetch_fault); |
@@ -586,7 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
586 | nr_present++; | 590 | nr_present++; |
587 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 591 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); |
588 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, | 592 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, |
589 | is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn, | 593 | is_dirty_pte(gpte), 0, gfn, |
590 | spte_to_pfn(sp->spt[i]), true, false); | 594 | spte_to_pfn(sp->spt[i]), true, false); |
591 | } | 595 | } |
592 | 596 | ||
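With the walker change above, a reserved-bit violation found during the guest page-table walk is reported through the page-fault error code: the err path now ORs in PFERR_RSVD_MASK next to the existing write/user/fetch bits. A small sketch of that composition, using the PFERR_* values from mmu.c and illustrative fault flags:

	/* Sketch of page-fault error-code composition as done at the end of
	 * walk_addr(); bit values match the PFERR_* defines in mmu.c. */
	#include <stdio.h>

	#define PFERR_PRESENT_MASK (1U << 0)
	#define PFERR_WRITE_MASK   (1U << 1)
	#define PFERR_USER_MASK    (1U << 2)
	#define PFERR_RSVD_MASK    (1U << 3)
	#define PFERR_FETCH_MASK   (1U << 4)

	int main(void)
	{
		int write_fault = 1, user_fault = 1, fetch_fault = 0, rsvd_fault = 1;
		unsigned error_code = 0;

		if (write_fault)
			error_code |= PFERR_WRITE_MASK;
		if (user_fault)
			error_code |= PFERR_USER_MASK;
		if (fetch_fault)
			error_code |= PFERR_FETCH_MASK;
		if (rsvd_fault)
			error_code |= PFERR_RSVD_MASK;	/* new in this series */

		printf("error_code = 0x%x\n", error_code);	/* 0xe here */
		return 0;
	}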
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1f8510c51d6e..71510e07e69e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include "irq.h" | 19 | #include "irq.h" |
20 | #include "mmu.h" | 20 | #include "mmu.h" |
21 | #include "kvm_cache_regs.h" | 21 | #include "kvm_cache_regs.h" |
22 | #include "x86.h" | ||
22 | 23 | ||
23 | #include <linux/module.h> | 24 | #include <linux/module.h> |
24 | #include <linux/kernel.h> | 25 | #include <linux/kernel.h> |
@@ -69,7 +70,6 @@ module_param(npt, int, S_IRUGO); | |||
69 | static int nested = 0; | 70 | static int nested = 0; |
70 | module_param(nested, int, S_IRUGO); | 71 | module_param(nested, int, S_IRUGO); |
71 | 72 | ||
72 | static void kvm_reput_irq(struct vcpu_svm *svm); | ||
73 | static void svm_flush_tlb(struct kvm_vcpu *vcpu); | 73 | static void svm_flush_tlb(struct kvm_vcpu *vcpu); |
74 | 74 | ||
75 | static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); | 75 | static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); |
@@ -132,24 +132,6 @@ static inline u32 svm_has(u32 feat) | |||
132 | return svm_features & feat; | 132 | return svm_features & feat; |
133 | } | 133 | } |
134 | 134 | ||
135 | static inline u8 pop_irq(struct kvm_vcpu *vcpu) | ||
136 | { | ||
137 | int word_index = __ffs(vcpu->arch.irq_summary); | ||
138 | int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); | ||
139 | int irq = word_index * BITS_PER_LONG + bit_index; | ||
140 | |||
141 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); | ||
142 | if (!vcpu->arch.irq_pending[word_index]) | ||
143 | clear_bit(word_index, &vcpu->arch.irq_summary); | ||
144 | return irq; | ||
145 | } | ||
146 | |||
147 | static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) | ||
148 | { | ||
149 | set_bit(irq, vcpu->arch.irq_pending); | ||
150 | set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); | ||
151 | } | ||
152 | |||
153 | static inline void clgi(void) | 135 | static inline void clgi(void) |
154 | { | 136 | { |
155 | asm volatile (__ex(SVM_CLGI)); | 137 | asm volatile (__ex(SVM_CLGI)); |
@@ -214,17 +196,31 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
214 | svm->vmcb->control.event_inj_err = error_code; | 196 | svm->vmcb->control.event_inj_err = error_code; |
215 | } | 197 | } |
216 | 198 | ||
217 | static bool svm_exception_injected(struct kvm_vcpu *vcpu) | 199 | static int is_external_interrupt(u32 info) |
200 | { | ||
201 | info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; | ||
202 | return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); | ||
203 | } | ||
204 | |||
205 | static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) | ||
218 | { | 206 | { |
219 | struct vcpu_svm *svm = to_svm(vcpu); | 207 | struct vcpu_svm *svm = to_svm(vcpu); |
208 | u32 ret = 0; | ||
220 | 209 | ||
221 | return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); | 210 | if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) |
211 | ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS; | ||
212 | return ret & mask; | ||
222 | } | 213 | } |
223 | 214 | ||
224 | static int is_external_interrupt(u32 info) | 215 | static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) |
225 | { | 216 | { |
226 | info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; | 217 | struct vcpu_svm *svm = to_svm(vcpu); |
227 | return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); | 218 | |
219 | if (mask == 0) | ||
220 | svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; | ||
221 | else | ||
222 | svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK; | ||
223 | |||
228 | } | 224 | } |
229 | 225 | ||
230 | static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | 226 | static void skip_emulated_instruction(struct kvm_vcpu *vcpu) |
@@ -232,7 +228,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
232 | struct vcpu_svm *svm = to_svm(vcpu); | 228 | struct vcpu_svm *svm = to_svm(vcpu); |
233 | 229 | ||
234 | if (!svm->next_rip) { | 230 | if (!svm->next_rip) { |
235 | printk(KERN_DEBUG "%s: NOP\n", __func__); | 231 | if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) != |
232 | EMULATE_DONE) | ||
233 | printk(KERN_DEBUG "%s: NOP\n", __func__); | ||
236 | return; | 234 | return; |
237 | } | 235 | } |
238 | if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) | 236 | if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) |
@@ -240,9 +238,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
240 | __func__, kvm_rip_read(vcpu), svm->next_rip); | 238 | __func__, kvm_rip_read(vcpu), svm->next_rip); |
241 | 239 | ||
242 | kvm_rip_write(vcpu, svm->next_rip); | 240 | kvm_rip_write(vcpu, svm->next_rip); |
243 | svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; | 241 | svm_set_interrupt_shadow(vcpu, 0); |
244 | |||
245 | vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK); | ||
246 | } | 242 | } |
247 | 243 | ||
248 | static int has_svm(void) | 244 | static int has_svm(void) |
@@ -830,6 +826,15 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, | |||
830 | if (!var->unusable) | 826 | if (!var->unusable) |
831 | var->type |= 0x1; | 827 | var->type |= 0x1; |
832 | break; | 828 | break; |
829 | case VCPU_SREG_SS: | ||
830 | /* On AMD CPUs sometimes the DB bit in the segment | ||
831 | * descriptor is left as 1, although the whole segment has | ||
832 | * been made unusable. Clear it here to pass an Intel VMX | ||
833 | * entry check when cross vendor migrating. | ||
834 | */ | ||
835 | if (var->unusable) | ||
836 | var->db = 0; | ||
837 | break; | ||
833 | } | 838 | } |
834 | } | 839 | } |
835 | 840 | ||
@@ -960,15 +965,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, | |||
960 | 965 | ||
961 | } | 966 | } |
962 | 967 | ||
963 | static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) | 968 | static void update_db_intercept(struct kvm_vcpu *vcpu) |
964 | { | 969 | { |
965 | int old_debug = vcpu->guest_debug; | ||
966 | struct vcpu_svm *svm = to_svm(vcpu); | 970 | struct vcpu_svm *svm = to_svm(vcpu); |
967 | 971 | ||
968 | vcpu->guest_debug = dbg->control; | ||
969 | |||
970 | svm->vmcb->control.intercept_exceptions &= | 972 | svm->vmcb->control.intercept_exceptions &= |
971 | ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); | 973 | ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); |
974 | |||
975 | if (vcpu->arch.singlestep) | ||
976 | svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); | ||
977 | |||
972 | if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { | 978 | if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { |
973 | if (vcpu->guest_debug & | 979 | if (vcpu->guest_debug & |
974 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) | 980 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) |
@@ -979,6 +985,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) | |||
979 | 1 << BP_VECTOR; | 985 | 1 << BP_VECTOR; |
980 | } else | 986 | } else |
981 | vcpu->guest_debug = 0; | 987 | vcpu->guest_debug = 0; |
988 | } | ||
989 | |||
990 | static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) | ||
991 | { | ||
992 | int old_debug = vcpu->guest_debug; | ||
993 | struct vcpu_svm *svm = to_svm(vcpu); | ||
994 | |||
995 | vcpu->guest_debug = dbg->control; | ||
996 | |||
997 | update_db_intercept(vcpu); | ||
982 | 998 | ||
983 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) | 999 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) |
984 | svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; | 1000 | svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; |
@@ -993,16 +1009,6 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) | |||
993 | return 0; | 1009 | return 0; |
994 | } | 1010 | } |
995 | 1011 | ||
996 | static int svm_get_irq(struct kvm_vcpu *vcpu) | ||
997 | { | ||
998 | struct vcpu_svm *svm = to_svm(vcpu); | ||
999 | u32 exit_int_info = svm->vmcb->control.exit_int_info; | ||
1000 | |||
1001 | if (is_external_interrupt(exit_int_info)) | ||
1002 | return exit_int_info & SVM_EVTINJ_VEC_MASK; | ||
1003 | return -1; | ||
1004 | } | ||
1005 | |||
1006 | static void load_host_msrs(struct kvm_vcpu *vcpu) | 1012 | static void load_host_msrs(struct kvm_vcpu *vcpu) |
1007 | { | 1013 | { |
1008 | #ifdef CONFIG_X86_64 | 1014 | #ifdef CONFIG_X86_64 |
@@ -1107,17 +1113,8 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, | |||
1107 | 1113 | ||
1108 | static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1114 | static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1109 | { | 1115 | { |
1110 | u32 exit_int_info = svm->vmcb->control.exit_int_info; | ||
1111 | struct kvm *kvm = svm->vcpu.kvm; | ||
1112 | u64 fault_address; | 1116 | u64 fault_address; |
1113 | u32 error_code; | 1117 | u32 error_code; |
1114 | bool event_injection = false; | ||
1115 | |||
1116 | if (!irqchip_in_kernel(kvm) && | ||
1117 | is_external_interrupt(exit_int_info)) { | ||
1118 | event_injection = true; | ||
1119 | push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); | ||
1120 | } | ||
1121 | 1118 | ||
1122 | fault_address = svm->vmcb->control.exit_info_2; | 1119 | fault_address = svm->vmcb->control.exit_info_2; |
1123 | error_code = svm->vmcb->control.exit_info_1; | 1120 | error_code = svm->vmcb->control.exit_info_1; |
@@ -1137,23 +1134,40 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1137 | */ | 1134 | */ |
1138 | if (npt_enabled) | 1135 | if (npt_enabled) |
1139 | svm_flush_tlb(&svm->vcpu); | 1136 | svm_flush_tlb(&svm->vcpu); |
1140 | 1137 | else { | |
1141 | if (!npt_enabled && event_injection) | 1138 | if (kvm_event_needs_reinjection(&svm->vcpu)) |
1142 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); | 1139 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); |
1140 | } | ||
1143 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); | 1141 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); |
1144 | } | 1142 | } |
1145 | 1143 | ||
1146 | static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1144 | static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1147 | { | 1145 | { |
1148 | if (!(svm->vcpu.guest_debug & | 1146 | if (!(svm->vcpu.guest_debug & |
1149 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { | 1147 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && |
1148 | !svm->vcpu.arch.singlestep) { | ||
1150 | kvm_queue_exception(&svm->vcpu, DB_VECTOR); | 1149 | kvm_queue_exception(&svm->vcpu, DB_VECTOR); |
1151 | return 1; | 1150 | return 1; |
1152 | } | 1151 | } |
1153 | kvm_run->exit_reason = KVM_EXIT_DEBUG; | 1152 | |
1154 | kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; | 1153 | if (svm->vcpu.arch.singlestep) { |
1155 | kvm_run->debug.arch.exception = DB_VECTOR; | 1154 | svm->vcpu.arch.singlestep = false; |
1156 | return 0; | 1155 | if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) |
1156 | svm->vmcb->save.rflags &= | ||
1157 | ~(X86_EFLAGS_TF | X86_EFLAGS_RF); | ||
1158 | update_db_intercept(&svm->vcpu); | ||
1159 | } | ||
1160 | |||
1161 | if (svm->vcpu.guest_debug & | ||
1162 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { | ||
1163 | kvm_run->exit_reason = KVM_EXIT_DEBUG; | ||
1164 | kvm_run->debug.arch.pc = | ||
1165 | svm->vmcb->save.cs.base + svm->vmcb->save.rip; | ||
1166 | kvm_run->debug.arch.exception = DB_VECTOR; | ||
1167 | return 0; | ||
1168 | } | ||
1169 | |||
1170 | return 1; | ||
1157 | } | 1171 | } |
1158 | 1172 | ||
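The reworked #DB handler has three outcomes: reflect the exception into the guest, consume a host-initiated single step (the NMI-window trick added later in this patch), or hand control to user space as KVM_EXIT_DEBUG. A compact sketch of that decision order, with made-up boolean inputs standing in for vcpu->guest_debug and arch.singlestep:

#include <stdio.h>
#include <stdbool.h>

enum db_action { REFLECT_TO_GUEST, EXIT_TO_USERSPACE, RESUME_GUEST };

/*
 * Decision order mirrored from db_interception(); the names and the
 * boolean inputs are illustrative, not kernel definitions.
 */
static enum db_action handle_db(bool guest_debug_owns_db, bool host_singlestep)
{
        if (!guest_debug_owns_db && !host_singlestep)
                return REFLECT_TO_GUEST;        /* queue #DB back into the guest */

        /*
         * A host-initiated single step (opened for the NMI window) is
         * consumed here: the kernel clears arch.singlestep, TF and RF.
         */
        if (guest_debug_owns_db)
                return EXIT_TO_USERSPACE;       /* KVM_EXIT_DEBUG */

        return RESUME_GUEST;
}

int main(void)
{
        printf("%d %d %d\n",
               handle_db(false, false),   /* guest's own #DB */
               handle_db(false, true),    /* NMI-window single step */
               handle_db(true,  true));   /* user space debugging */
        return 0;
}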
1159 | static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1173 | static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
@@ -1842,17 +1856,51 @@ static int task_switch_interception(struct vcpu_svm *svm, | |||
1842 | struct kvm_run *kvm_run) | 1856 | struct kvm_run *kvm_run) |
1843 | { | 1857 | { |
1844 | u16 tss_selector; | 1858 | u16 tss_selector; |
1859 | int reason; | ||
1860 | int int_type = svm->vmcb->control.exit_int_info & | ||
1861 | SVM_EXITINTINFO_TYPE_MASK; | ||
1862 | int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK; | ||
1863 | uint32_t type = | ||
1864 | svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; | ||
1865 | uint32_t idt_v = | ||
1866 | svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; | ||
1845 | 1867 | ||
1846 | tss_selector = (u16)svm->vmcb->control.exit_info_1; | 1868 | tss_selector = (u16)svm->vmcb->control.exit_info_1; |
1869 | |||
1847 | if (svm->vmcb->control.exit_info_2 & | 1870 | if (svm->vmcb->control.exit_info_2 & |
1848 | (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) | 1871 | (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) |
1849 | return kvm_task_switch(&svm->vcpu, tss_selector, | 1872 | reason = TASK_SWITCH_IRET; |
1850 | TASK_SWITCH_IRET); | 1873 | else if (svm->vmcb->control.exit_info_2 & |
1851 | if (svm->vmcb->control.exit_info_2 & | 1874 | (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) |
1852 | (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) | 1875 | reason = TASK_SWITCH_JMP; |
1853 | return kvm_task_switch(&svm->vcpu, tss_selector, | 1876 | else if (idt_v) |
1854 | TASK_SWITCH_JMP); | 1877 | reason = TASK_SWITCH_GATE; |
1855 | return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL); | 1878 | else |
1879 | reason = TASK_SWITCH_CALL; | ||
1880 | |||
1881 | if (reason == TASK_SWITCH_GATE) { | ||
1882 | switch (type) { | ||
1883 | case SVM_EXITINTINFO_TYPE_NMI: | ||
1884 | svm->vcpu.arch.nmi_injected = false; | ||
1885 | break; | ||
1886 | case SVM_EXITINTINFO_TYPE_EXEPT: | ||
1887 | kvm_clear_exception_queue(&svm->vcpu); | ||
1888 | break; | ||
1889 | case SVM_EXITINTINFO_TYPE_INTR: | ||
1890 | kvm_clear_interrupt_queue(&svm->vcpu); | ||
1891 | break; | ||
1892 | default: | ||
1893 | break; | ||
1894 | } | ||
1895 | } | ||
1896 | |||
1897 | if (reason != TASK_SWITCH_GATE || | ||
1898 | int_type == SVM_EXITINTINFO_TYPE_SOFT || | ||
1899 | (int_type == SVM_EXITINTINFO_TYPE_EXEPT && | ||
1900 | (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) | ||
1901 | skip_emulated_instruction(&svm->vcpu); | ||
1902 | |||
1903 | return kvm_task_switch(&svm->vcpu, tss_selector, reason); | ||
1856 | } | 1904 | } |
1857 | 1905 | ||
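task_switch_interception() now derives a single TASK_SWITCH_* reason from EXITINFO2 and the pending-event information instead of calling kvm_task_switch() from three separate branches, and it flushes any event that was being delivered through a task gate. A standalone sketch of the reason decode; the bit positions are written out locally as illustrative values (the authoritative definitions are in asm/svm.h):

#include <stdio.h>
#include <stdint.h>

/* illustrative stand-ins for the asm/svm.h definitions */
#define TS_REASON_IRET_BIT   36
#define TS_REASON_JMP_BIT    38
#define EXITINTINFO_VALID    (1u << 31)

enum ts_reason { TS_CALL, TS_IRET, TS_JMP, TS_GATE };

static enum ts_reason decode_task_switch(uint64_t exit_info_2, uint32_t exit_int_info)
{
        if (exit_info_2 & (1ULL << TS_REASON_IRET_BIT))
                return TS_IRET;
        if (exit_info_2 & (1ULL << TS_REASON_JMP_BIT))
                return TS_JMP;
        if (exit_int_info & EXITINTINFO_VALID)
                return TS_GATE;                 /* switch through a task gate */
        return TS_CALL;
}

int main(void)
{
        printf("iret: %d\n", decode_task_switch(1ULL << TS_REASON_IRET_BIT, 0));
        printf("gate: %d\n", decode_task_switch(0, EXITINTINFO_VALID));
        printf("call: %d\n", decode_task_switch(0, 0));
        return 0;
}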
1858 | static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1906 | static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
@@ -1862,6 +1910,14 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1862 | return 1; | 1910 | return 1; |
1863 | } | 1911 | } |
1864 | 1912 | ||
1913 | static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1914 | { | ||
1915 | ++svm->vcpu.stat.nmi_window_exits; | ||
1916 | svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); | ||
1917 | svm->vcpu.arch.hflags |= HF_IRET_MASK; | ||
1918 | return 1; | ||
1919 | } | ||
1920 | |||
1865 | static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1921 | static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1866 | { | 1922 | { |
1867 | if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) | 1923 | if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) |
@@ -1879,8 +1935,14 @@ static int emulate_on_interception(struct vcpu_svm *svm, | |||
1879 | 1935 | ||
1880 | static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1936 | static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1881 | { | 1937 | { |
1938 | u8 cr8_prev = kvm_get_cr8(&svm->vcpu); | ||
1939 | /* instruction emulation calls kvm_set_cr8() */ | ||
1882 | emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); | 1940 | emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); |
1883 | if (irqchip_in_kernel(svm->vcpu.kvm)) | 1941 | if (irqchip_in_kernel(svm->vcpu.kvm)) { |
1942 | svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; | ||
1943 | return 1; | ||
1944 | } | ||
1945 | if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) | ||
1884 | return 1; | 1946 | return 1; |
1885 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; | 1947 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; |
1886 | return 0; | 1948 | return 0; |
@@ -2090,8 +2152,9 @@ static int interrupt_window_interception(struct vcpu_svm *svm, | |||
2090 | * If the user space waits to inject interrupts, exit as soon as | 2152 | * If the user space waits to inject interrupts, exit as soon as |
2091 | * possible | 2153 | * possible |
2092 | */ | 2154 | */ |
2093 | if (kvm_run->request_interrupt_window && | 2155 | if (!irqchip_in_kernel(svm->vcpu.kvm) && |
2094 | !svm->vcpu.arch.irq_summary) { | 2156 | kvm_run->request_interrupt_window && |
2157 | !kvm_cpu_has_interrupt(&svm->vcpu)) { | ||
2095 | ++svm->vcpu.stat.irq_window_exits; | 2158 | ++svm->vcpu.stat.irq_window_exits; |
2096 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; | 2159 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; |
2097 | return 0; | 2160 | return 0; |
@@ -2134,6 +2197,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, | |||
2134 | [SVM_EXIT_VINTR] = interrupt_window_interception, | 2197 | [SVM_EXIT_VINTR] = interrupt_window_interception, |
2135 | /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ | 2198 | /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ |
2136 | [SVM_EXIT_CPUID] = cpuid_interception, | 2199 | [SVM_EXIT_CPUID] = cpuid_interception, |
2200 | [SVM_EXIT_IRET] = iret_interception, | ||
2137 | [SVM_EXIT_INVD] = emulate_on_interception, | 2201 | [SVM_EXIT_INVD] = emulate_on_interception, |
2138 | [SVM_EXIT_HLT] = halt_interception, | 2202 | [SVM_EXIT_HLT] = halt_interception, |
2139 | [SVM_EXIT_INVLPG] = invlpg_interception, | 2203 | [SVM_EXIT_INVLPG] = invlpg_interception, |
@@ -2194,7 +2258,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
2194 | } | 2258 | } |
2195 | } | 2259 | } |
2196 | 2260 | ||
2197 | kvm_reput_irq(svm); | ||
2198 | 2261 | ||
2199 | if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { | 2262 | if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { |
2200 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; | 2263 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; |
@@ -2205,7 +2268,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
2205 | 2268 | ||
2206 | if (is_external_interrupt(svm->vmcb->control.exit_int_info) && | 2269 | if (is_external_interrupt(svm->vmcb->control.exit_int_info) && |
2207 | exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && | 2270 | exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && |
2208 | exit_code != SVM_EXIT_NPF) | 2271 | exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH) |
2209 | printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " | 2272 | printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " |
2210 | "exit_code 0x%x\n", | 2273 | "exit_code 0x%x\n", |
2211 | __func__, svm->vmcb->control.exit_int_info, | 2274 | __func__, svm->vmcb->control.exit_int_info, |
@@ -2242,6 +2305,15 @@ static void pre_svm_run(struct vcpu_svm *svm) | |||
2242 | new_asid(svm, svm_data); | 2305 | new_asid(svm, svm_data); |
2243 | } | 2306 | } |
2244 | 2307 | ||
2308 | static void svm_inject_nmi(struct kvm_vcpu *vcpu) | ||
2309 | { | ||
2310 | struct vcpu_svm *svm = to_svm(vcpu); | ||
2311 | |||
2312 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; | ||
2313 | vcpu->arch.hflags |= HF_NMI_MASK; | ||
2314 | svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); | ||
2315 | ++vcpu->stat.nmi_injections; | ||
2316 | } | ||
2245 | 2317 | ||
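svm_inject_nmi() pairs the EVENTINJ write with two pieces of bookkeeping: HF_NMI_MASK marks NMIs as blocked, and the IRET intercept is armed so the block can be dropped once the guest leaves its NMI handler (iret_interception() sets HF_IRET_MASK; svm_complete_interrupts() later in this patch clears both). A tiny sketch of that handshake, using locally defined flag bits rather than the kernel's hflags values:

#include <stdio.h>

/* illustrative flag bits, not the kernel's hflags values */
#define HF_NMI_MASK   (1u << 0)
#define HF_IRET_MASK  (1u << 1)

static unsigned int inject_nmi(unsigned int hflags)
{
        /* event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; arm IRET intercept */
        return hflags | HF_NMI_MASK;
}

static unsigned int iret_intercepted(unsigned int hflags)
{
        /* guest executed IRET inside its NMI handler */
        return hflags | HF_IRET_MASK;
}

static unsigned int complete_interrupts(unsigned int hflags)
{
        /* once the IRET has retired, NMIs are unblocked again */
        if (hflags & HF_IRET_MASK)
                hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
        return hflags;
}

int main(void)
{
        unsigned int hflags = 0;

        hflags = inject_nmi(hflags);
        hflags = iret_intercepted(hflags);
        hflags = complete_interrupts(hflags);
        printf("NMI blocked after handler: %s\n",
               (hflags & HF_NMI_MASK) ? "yes" : "no");
        return 0;
}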
2246 | static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) | 2318 | static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) |
2247 | { | 2319 | { |
@@ -2257,134 +2329,71 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) | |||
2257 | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); | 2329 | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); |
2258 | } | 2330 | } |
2259 | 2331 | ||
2260 | static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) | 2332 | static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr) |
2261 | { | 2333 | { |
2262 | struct vcpu_svm *svm = to_svm(vcpu); | 2334 | struct vcpu_svm *svm = to_svm(vcpu); |
2263 | 2335 | ||
2264 | nested_svm_intr(svm); | 2336 | svm->vmcb->control.event_inj = nr | |
2265 | 2337 | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; | |
2266 | svm_inject_irq(svm, irq); | ||
2267 | } | 2338 | } |
2268 | 2339 | ||
2269 | static void update_cr8_intercept(struct kvm_vcpu *vcpu) | 2340 | static void svm_set_irq(struct kvm_vcpu *vcpu) |
2270 | { | 2341 | { |
2271 | struct vcpu_svm *svm = to_svm(vcpu); | 2342 | struct vcpu_svm *svm = to_svm(vcpu); |
2272 | struct vmcb *vmcb = svm->vmcb; | ||
2273 | int max_irr, tpr; | ||
2274 | 2343 | ||
2275 | if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr) | 2344 | nested_svm_intr(svm); |
2276 | return; | ||
2277 | 2345 | ||
2278 | vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; | 2346 | svm_queue_irq(vcpu, vcpu->arch.interrupt.nr); |
2347 | } | ||
2279 | 2348 | ||
2280 | max_irr = kvm_lapic_find_highest_irr(vcpu); | 2349 | static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) |
2281 | if (max_irr == -1) | 2350 | { |
2282 | return; | 2351 | struct vcpu_svm *svm = to_svm(vcpu); |
2283 | 2352 | ||
2284 | tpr = kvm_lapic_get_cr8(vcpu) << 4; | 2353 | if (irr == -1) |
2354 | return; | ||
2285 | 2355 | ||
2286 | if (tpr >= (max_irr & 0xf0)) | 2356 | if (tpr >= irr) |
2287 | vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; | 2357 | svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; |
2288 | } | 2358 | } |
2289 | 2359 | ||
2290 | static void svm_intr_assist(struct kvm_vcpu *vcpu) | 2360 | static int svm_nmi_allowed(struct kvm_vcpu *vcpu) |
2291 | { | 2361 | { |
2292 | struct vcpu_svm *svm = to_svm(vcpu); | 2362 | struct vcpu_svm *svm = to_svm(vcpu); |
2293 | struct vmcb *vmcb = svm->vmcb; | 2363 | struct vmcb *vmcb = svm->vmcb; |
2294 | int intr_vector = -1; | 2364 | return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && |
2295 | 2365 | !(svm->vcpu.arch.hflags & HF_NMI_MASK); | |
2296 | if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && | ||
2297 | ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { | ||
2298 | intr_vector = vmcb->control.exit_int_info & | ||
2299 | SVM_EVTINJ_VEC_MASK; | ||
2300 | vmcb->control.exit_int_info = 0; | ||
2301 | svm_inject_irq(svm, intr_vector); | ||
2302 | goto out; | ||
2303 | } | ||
2304 | |||
2305 | if (vmcb->control.int_ctl & V_IRQ_MASK) | ||
2306 | goto out; | ||
2307 | |||
2308 | if (!kvm_cpu_has_interrupt(vcpu)) | ||
2309 | goto out; | ||
2310 | |||
2311 | if (nested_svm_intr(svm)) | ||
2312 | goto out; | ||
2313 | |||
2314 | if (!(svm->vcpu.arch.hflags & HF_GIF_MASK)) | ||
2315 | goto out; | ||
2316 | |||
2317 | if (!(vmcb->save.rflags & X86_EFLAGS_IF) || | ||
2318 | (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || | ||
2319 | (vmcb->control.event_inj & SVM_EVTINJ_VALID)) { | ||
2320 | /* unable to deliver irq, set pending irq */ | ||
2321 | svm_set_vintr(svm); | ||
2322 | svm_inject_irq(svm, 0x0); | ||
2323 | goto out; | ||
2324 | } | ||
2325 | /* Okay, we can deliver the interrupt: grab it and update PIC state. */ | ||
2326 | intr_vector = kvm_cpu_get_interrupt(vcpu); | ||
2327 | svm_inject_irq(svm, intr_vector); | ||
2328 | out: | ||
2329 | update_cr8_intercept(vcpu); | ||
2330 | } | 2366 | } |
2331 | 2367 | ||
2332 | static void kvm_reput_irq(struct vcpu_svm *svm) | 2368 | static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) |
2333 | { | 2369 | { |
2334 | struct vmcb_control_area *control = &svm->vmcb->control; | 2370 | struct vcpu_svm *svm = to_svm(vcpu); |
2335 | 2371 | struct vmcb *vmcb = svm->vmcb; | |
2336 | if ((control->int_ctl & V_IRQ_MASK) | 2372 | return (vmcb->save.rflags & X86_EFLAGS_IF) && |
2337 | && !irqchip_in_kernel(svm->vcpu.kvm)) { | 2373 | !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && |
2338 | control->int_ctl &= ~V_IRQ_MASK; | 2374 | (svm->vcpu.arch.hflags & HF_GIF_MASK); |
2339 | push_irq(&svm->vcpu, control->int_vector); | ||
2340 | } | ||
2341 | |||
2342 | svm->vcpu.arch.interrupt_window_open = | ||
2343 | !(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && | ||
2344 | (svm->vcpu.arch.hflags & HF_GIF_MASK); | ||
2345 | } | 2375 | } |
2346 | 2376 | ||
2347 | static void svm_do_inject_vector(struct vcpu_svm *svm) | 2377 | static void enable_irq_window(struct kvm_vcpu *vcpu) |
2348 | { | 2378 | { |
2349 | struct kvm_vcpu *vcpu = &svm->vcpu; | 2379 | svm_set_vintr(to_svm(vcpu)); |
2350 | int word_index = __ffs(vcpu->arch.irq_summary); | 2380 | svm_inject_irq(to_svm(vcpu), 0x0); |
2351 | int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); | ||
2352 | int irq = word_index * BITS_PER_LONG + bit_index; | ||
2353 | |||
2354 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); | ||
2355 | if (!vcpu->arch.irq_pending[word_index]) | ||
2356 | clear_bit(word_index, &vcpu->arch.irq_summary); | ||
2357 | svm_inject_irq(svm, irq); | ||
2358 | } | 2381 | } |
2359 | 2382 | ||
2360 | static void do_interrupt_requests(struct kvm_vcpu *vcpu, | 2383 | static void enable_nmi_window(struct kvm_vcpu *vcpu) |
2361 | struct kvm_run *kvm_run) | ||
2362 | { | 2384 | { |
2363 | struct vcpu_svm *svm = to_svm(vcpu); | 2385 | struct vcpu_svm *svm = to_svm(vcpu); |
2364 | struct vmcb_control_area *control = &svm->vmcb->control; | ||
2365 | |||
2366 | if (nested_svm_intr(svm)) | ||
2367 | return; | ||
2368 | 2386 | ||
2369 | svm->vcpu.arch.interrupt_window_open = | 2387 | if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) |
2370 | (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && | 2388 | == HF_NMI_MASK) |
2371 | (svm->vmcb->save.rflags & X86_EFLAGS_IF) && | 2389 | return; /* IRET will cause a vm exit */ |
2372 | (svm->vcpu.arch.hflags & HF_GIF_MASK)); | ||
2373 | 2390 | ||
2374 | if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) | 2391 | /* Something prevents NMI from being injected. Single-step over |
2375 | /* | 2392 | the possible problem (IRET, exception injection or interrupt |
2376 | * If interrupts enabled, and not blocked by sti or mov ss. Good. | 2393 | shadow). */ |
2377 | */ | 2394 | vcpu->arch.singlestep = true; |
2378 | svm_do_inject_vector(svm); | 2395 | svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); |
2379 | 2396 | update_db_intercept(vcpu); | |
2380 | /* | ||
2381 | * Interrupts blocked. Wait for unblock. | ||
2382 | */ | ||
2383 | if (!svm->vcpu.arch.interrupt_window_open && | ||
2384 | (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window)) | ||
2385 | svm_set_vintr(svm); | ||
2386 | else | ||
2387 | svm_clear_vintr(svm); | ||
2388 | } | 2397 | } |
2389 | 2398 | ||
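enable_nmi_window() has no hardware window exit to lean on: if the guest is still inside an NMI handler, the armed IRET intercept will produce the exit, but if injection is blocked by an interrupt shadow or an event already being injected, the code single-steps the guest by setting TF and RF and letting the #DB intercept re-evaluate. The two flag values below are architectural; the round trip itself is trivial:

#include <stdio.h>

#define X86_EFLAGS_TF  0x00000100   /* trap flag: single-step after the next insn */
#define X86_EFLAGS_RF  0x00010000   /* resume flag: do not re-trigger #DB */

int main(void)
{
        unsigned long rflags = 0x2;                  /* reserved bit 1 always set */

        rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;     /* open the NMI window */
        printf("rflags while single-stepping: %#lx\n", rflags);

        rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);  /* db_interception() undoes it */
        printf("rflags restored:              %#lx\n", rflags);
        return 0;
}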
2390 | static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) | 2399 | static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) |
@@ -2407,7 +2416,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) | |||
2407 | 2416 | ||
2408 | if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { | 2417 | if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { |
2409 | int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; | 2418 | int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; |
2410 | kvm_lapic_set_tpr(vcpu, cr8); | 2419 | kvm_set_cr8(vcpu, cr8); |
2411 | } | 2420 | } |
2412 | } | 2421 | } |
2413 | 2422 | ||
@@ -2416,14 +2425,54 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) | |||
2416 | struct vcpu_svm *svm = to_svm(vcpu); | 2425 | struct vcpu_svm *svm = to_svm(vcpu); |
2417 | u64 cr8; | 2426 | u64 cr8; |
2418 | 2427 | ||
2419 | if (!irqchip_in_kernel(vcpu->kvm)) | ||
2420 | return; | ||
2421 | |||
2422 | cr8 = kvm_get_cr8(vcpu); | 2428 | cr8 = kvm_get_cr8(vcpu); |
2423 | svm->vmcb->control.int_ctl &= ~V_TPR_MASK; | 2429 | svm->vmcb->control.int_ctl &= ~V_TPR_MASK; |
2424 | svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; | 2430 | svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; |
2425 | } | 2431 | } |
2426 | 2432 | ||
2433 | static void svm_complete_interrupts(struct vcpu_svm *svm) | ||
2434 | { | ||
2435 | u8 vector; | ||
2436 | int type; | ||
2437 | u32 exitintinfo = svm->vmcb->control.exit_int_info; | ||
2438 | |||
2439 | if (svm->vcpu.arch.hflags & HF_IRET_MASK) | ||
2440 | svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); | ||
2441 | |||
2442 | svm->vcpu.arch.nmi_injected = false; | ||
2443 | kvm_clear_exception_queue(&svm->vcpu); | ||
2444 | kvm_clear_interrupt_queue(&svm->vcpu); | ||
2445 | |||
2446 | if (!(exitintinfo & SVM_EXITINTINFO_VALID)) | ||
2447 | return; | ||
2448 | |||
2449 | vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; | ||
2450 | type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; | ||
2451 | |||
2452 | switch (type) { | ||
2453 | case SVM_EXITINTINFO_TYPE_NMI: | ||
2454 | svm->vcpu.arch.nmi_injected = true; | ||
2455 | break; | ||
2456 | case SVM_EXITINTINFO_TYPE_EXEPT: | ||
2457 | /* In case of a software exception do not reinject the exception | ||
2458 | vector, but re-execute the instruction instead */ | ||
2459 | if (kvm_exception_is_soft(vector)) | ||
2460 | break; | ||
2461 | if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { | ||
2462 | u32 err = svm->vmcb->control.exit_int_info_err; | ||
2463 | kvm_queue_exception_e(&svm->vcpu, vector, err); | ||
2464 | |||
2465 | } else | ||
2466 | kvm_queue_exception(&svm->vcpu, vector); | ||
2467 | break; | ||
2468 | case SVM_EXITINTINFO_TYPE_INTR: | ||
2469 | kvm_queue_interrupt(&svm->vcpu, vector, false); | ||
2470 | break; | ||
2471 | default: | ||
2472 | break; | ||
2473 | } | ||
2474 | } | ||
2475 | |||
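svm_complete_interrupts() runs after every VM exit and converts whatever the CPU left in EXITINTINFO back into KVM's queues so the event is reinjected on the next entry: NMIs via nmi_injected, hardware exceptions via the exception queue (software exceptions are dropped and simply re-executed), external interrupts via the interrupt queue. A standalone decode of the field, with the masks and type codes defined locally as illustrative values mirroring asm/svm.h:

#include <stdio.h>
#include <stdint.h>

/* illustrative stand-ins for the asm/svm.h EXITINTINFO layout */
#define EXITINTINFO_VEC_MASK    0x000000ffu
#define EXITINTINFO_TYPE_MASK   0x00000700u
#define EXITINTINFO_VALID       0x80000000u
#define EXITINTINFO_VALID_ERR   0x00000800u

#define TYPE_INTR   0x000u
#define TYPE_NMI    0x200u
#define TYPE_EXEPT  0x300u

static void complete_interrupts(uint32_t exitintinfo)
{
        unsigned int vector = exitintinfo & EXITINTINFO_VEC_MASK;

        if (!(exitintinfo & EXITINTINFO_VALID)) {
                printf("nothing pending\n");
                return;
        }

        switch (exitintinfo & EXITINTINFO_TYPE_MASK) {
        case TYPE_NMI:
                printf("requeue NMI\n");
                break;
        case TYPE_EXEPT:
                /* the kernel additionally skips soft exceptions here */
                printf("requeue exception %u%s\n", vector,
                       (exitintinfo & EXITINTINFO_VALID_ERR) ?
                       " with error code" : "");
                break;
        case TYPE_INTR:
                printf("requeue external interrupt %u\n", vector);
                break;
        default:
                break;
        }
}

int main(void)
{
        complete_interrupts(0);
        complete_interrupts(EXITINTINFO_VALID | TYPE_INTR | 32);
        complete_interrupts(EXITINTINFO_VALID | TYPE_EXEPT |
                            EXITINTINFO_VALID_ERR | 14);
        return 0;
}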
2427 | #ifdef CONFIG_X86_64 | 2476 | #ifdef CONFIG_X86_64 |
2428 | #define R "r" | 2477 | #define R "r" |
2429 | #else | 2478 | #else |
@@ -2552,6 +2601,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2552 | sync_cr8_to_lapic(vcpu); | 2601 | sync_cr8_to_lapic(vcpu); |
2553 | 2602 | ||
2554 | svm->next_rip = 0; | 2603 | svm->next_rip = 0; |
2604 | |||
2605 | svm_complete_interrupts(svm); | ||
2555 | } | 2606 | } |
2556 | 2607 | ||
2557 | #undef R | 2608 | #undef R |
@@ -2617,7 +2668,7 @@ static int get_npt_level(void) | |||
2617 | #endif | 2668 | #endif |
2618 | } | 2669 | } |
2619 | 2670 | ||
2620 | static int svm_get_mt_mask_shift(void) | 2671 | static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) |
2621 | { | 2672 | { |
2622 | return 0; | 2673 | return 0; |
2623 | } | 2674 | } |
@@ -2667,17 +2718,21 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
2667 | .run = svm_vcpu_run, | 2718 | .run = svm_vcpu_run, |
2668 | .handle_exit = handle_exit, | 2719 | .handle_exit = handle_exit, |
2669 | .skip_emulated_instruction = skip_emulated_instruction, | 2720 | .skip_emulated_instruction = skip_emulated_instruction, |
2721 | .set_interrupt_shadow = svm_set_interrupt_shadow, | ||
2722 | .get_interrupt_shadow = svm_get_interrupt_shadow, | ||
2670 | .patch_hypercall = svm_patch_hypercall, | 2723 | .patch_hypercall = svm_patch_hypercall, |
2671 | .get_irq = svm_get_irq, | ||
2672 | .set_irq = svm_set_irq, | 2724 | .set_irq = svm_set_irq, |
2725 | .set_nmi = svm_inject_nmi, | ||
2673 | .queue_exception = svm_queue_exception, | 2726 | .queue_exception = svm_queue_exception, |
2674 | .exception_injected = svm_exception_injected, | 2727 | .interrupt_allowed = svm_interrupt_allowed, |
2675 | .inject_pending_irq = svm_intr_assist, | 2728 | .nmi_allowed = svm_nmi_allowed, |
2676 | .inject_pending_vectors = do_interrupt_requests, | 2729 | .enable_nmi_window = enable_nmi_window, |
2730 | .enable_irq_window = enable_irq_window, | ||
2731 | .update_cr8_intercept = update_cr8_intercept, | ||
2677 | 2732 | ||
2678 | .set_tss_addr = svm_set_tss_addr, | 2733 | .set_tss_addr = svm_set_tss_addr, |
2679 | .get_tdp_level = get_npt_level, | 2734 | .get_tdp_level = get_npt_level, |
2680 | .get_mt_mask_shift = svm_get_mt_mask_shift, | 2735 | .get_mt_mask = svm_get_mt_mask, |
2681 | }; | 2736 | }; |
2682 | 2737 | ||
2683 | static int __init svm_init(void) | 2738 | static int __init svm_init(void) |
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c new file mode 100644 index 000000000000..86dbac072d0c --- /dev/null +++ b/arch/x86/kvm/timer.c | |||
@@ -0,0 +1,46 @@ | |||
1 | #include <linux/kvm_host.h> | ||
2 | #include <linux/kvm.h> | ||
3 | #include <linux/hrtimer.h> | ||
4 | #include <asm/atomic.h> | ||
5 | #include "kvm_timer.h" | ||
6 | |||
7 | static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) | ||
8 | { | ||
9 | int restart_timer = 0; | ||
10 | wait_queue_head_t *q = &vcpu->wq; | ||
11 | |||
12 | /* FIXME: this code should not know anything about vcpus */ | ||
13 | if (!atomic_inc_and_test(&ktimer->pending)) | ||
14 | set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); | ||
15 | |||
16 | if (!ktimer->reinject) | ||
17 | atomic_set(&ktimer->pending, 1); | ||
18 | |||
19 | if (waitqueue_active(q)) | ||
20 | wake_up_interruptible(q); | ||
21 | |||
22 | if (ktimer->t_ops->is_periodic(ktimer)) { | ||
23 | hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); | ||
24 | restart_timer = 1; | ||
25 | } | ||
26 | |||
27 | return restart_timer; | ||
28 | } | ||
29 | |||
30 | enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) | ||
31 | { | ||
32 | int restart_timer; | ||
33 | struct kvm_vcpu *vcpu; | ||
34 | struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); | ||
35 | |||
36 | vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id]; | ||
37 | if (!vcpu) | ||
38 | return HRTIMER_NORESTART; | ||
39 | |||
40 | restart_timer = __kvm_timer_fn(vcpu, ktimer); | ||
41 | if (restart_timer) | ||
42 | return HRTIMER_RESTART; | ||
43 | else | ||
44 | return HRTIMER_NORESTART; | ||
45 | } | ||
46 | |||
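kvm_timer_fn() only receives a pointer to the hrtimer embedded in struct kvm_timer and uses container_of() to get back to the timer object, and from there to the owning vcpu's wait queue. The idiom is plain pointer arithmetic and can be shown in userspace; the structure below is a made-up stand-in, only the container_of pattern itself matches the code above:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_hrtimer { long expires; };

struct fake_kvm_timer {
        struct fake_hrtimer timer;   /* embedded, as in struct kvm_timer */
        long period;
        int pending;
};

/* the callback only sees a pointer to the embedded member */
static void timer_fn(struct fake_hrtimer *data)
{
        struct fake_kvm_timer *ktimer =
                container_of(data, struct fake_kvm_timer, timer);

        ktimer->pending++;
}

int main(void)
{
        struct fake_kvm_timer kt = { .period = 1000000, .pending = 0 };

        timer_fn(&kt.timer);
        printf("pending ticks: %d\n", kt.pending);
        return 0;
}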
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bb481330716f..e770bf349ec4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -32,26 +32,27 @@ | |||
32 | #include <asm/desc.h> | 32 | #include <asm/desc.h> |
33 | #include <asm/vmx.h> | 33 | #include <asm/vmx.h> |
34 | #include <asm/virtext.h> | 34 | #include <asm/virtext.h> |
35 | #include <asm/mce.h> | ||
35 | 36 | ||
36 | #define __ex(x) __kvm_handle_fault_on_reboot(x) | 37 | #define __ex(x) __kvm_handle_fault_on_reboot(x) |
37 | 38 | ||
38 | MODULE_AUTHOR("Qumranet"); | 39 | MODULE_AUTHOR("Qumranet"); |
39 | MODULE_LICENSE("GPL"); | 40 | MODULE_LICENSE("GPL"); |
40 | 41 | ||
41 | static int bypass_guest_pf = 1; | 42 | static int __read_mostly bypass_guest_pf = 1; |
42 | module_param(bypass_guest_pf, bool, 0); | 43 | module_param(bypass_guest_pf, bool, S_IRUGO); |
43 | 44 | ||
44 | static int enable_vpid = 1; | 45 | static int __read_mostly enable_vpid = 1; |
45 | module_param(enable_vpid, bool, 0); | 46 | module_param_named(vpid, enable_vpid, bool, 0444); |
46 | 47 | ||
47 | static int flexpriority_enabled = 1; | 48 | static int __read_mostly flexpriority_enabled = 1; |
48 | module_param(flexpriority_enabled, bool, 0); | 49 | module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); |
49 | 50 | ||
50 | static int enable_ept = 1; | 51 | static int __read_mostly enable_ept = 1; |
51 | module_param(enable_ept, bool, 0); | 52 | module_param_named(ept, enable_ept, bool, S_IRUGO); |
52 | 53 | ||
53 | static int emulate_invalid_guest_state = 0; | 54 | static int __read_mostly emulate_invalid_guest_state = 0; |
54 | module_param(emulate_invalid_guest_state, bool, 0); | 55 | module_param(emulate_invalid_guest_state, bool, S_IRUGO); |
55 | 56 | ||
56 | struct vmcs { | 57 | struct vmcs { |
57 | u32 revision_id; | 58 | u32 revision_id; |
@@ -97,6 +98,7 @@ struct vcpu_vmx { | |||
97 | int soft_vnmi_blocked; | 98 | int soft_vnmi_blocked; |
98 | ktime_t entry_time; | 99 | ktime_t entry_time; |
99 | s64 vnmi_blocked_time; | 100 | s64 vnmi_blocked_time; |
101 | u32 exit_reason; | ||
100 | }; | 102 | }; |
101 | 103 | ||
102 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | 104 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) |
@@ -111,9 +113,10 @@ static DEFINE_PER_CPU(struct vmcs *, vmxarea); | |||
111 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | 113 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); |
112 | static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); | 114 | static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); |
113 | 115 | ||
114 | static struct page *vmx_io_bitmap_a; | 116 | static unsigned long *vmx_io_bitmap_a; |
115 | static struct page *vmx_io_bitmap_b; | 117 | static unsigned long *vmx_io_bitmap_b; |
116 | static struct page *vmx_msr_bitmap; | 118 | static unsigned long *vmx_msr_bitmap_legacy; |
119 | static unsigned long *vmx_msr_bitmap_longmode; | ||
117 | 120 | ||
118 | static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); | 121 | static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); |
119 | static DEFINE_SPINLOCK(vmx_vpid_lock); | 122 | static DEFINE_SPINLOCK(vmx_vpid_lock); |
@@ -213,70 +216,78 @@ static inline int is_external_interrupt(u32 intr_info) | |||
213 | == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); | 216 | == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); |
214 | } | 217 | } |
215 | 218 | ||
219 | static inline int is_machine_check(u32 intr_info) | ||
220 | { | ||
221 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | ||
222 | INTR_INFO_VALID_MASK)) == | ||
223 | (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); | ||
224 | } | ||
225 | |||
216 | static inline int cpu_has_vmx_msr_bitmap(void) | 226 | static inline int cpu_has_vmx_msr_bitmap(void) |
217 | { | 227 | { |
218 | return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS); | 228 | return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; |
219 | } | 229 | } |
220 | 230 | ||
221 | static inline int cpu_has_vmx_tpr_shadow(void) | 231 | static inline int cpu_has_vmx_tpr_shadow(void) |
222 | { | 232 | { |
223 | return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); | 233 | return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; |
224 | } | 234 | } |
225 | 235 | ||
226 | static inline int vm_need_tpr_shadow(struct kvm *kvm) | 236 | static inline int vm_need_tpr_shadow(struct kvm *kvm) |
227 | { | 237 | { |
228 | return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); | 238 | return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); |
229 | } | 239 | } |
230 | 240 | ||
231 | static inline int cpu_has_secondary_exec_ctrls(void) | 241 | static inline int cpu_has_secondary_exec_ctrls(void) |
232 | { | 242 | { |
233 | return (vmcs_config.cpu_based_exec_ctrl & | 243 | return vmcs_config.cpu_based_exec_ctrl & |
234 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); | 244 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; |
235 | } | 245 | } |
236 | 246 | ||
237 | static inline bool cpu_has_vmx_virtualize_apic_accesses(void) | 247 | static inline bool cpu_has_vmx_virtualize_apic_accesses(void) |
238 | { | 248 | { |
239 | return flexpriority_enabled | 249 | return vmcs_config.cpu_based_2nd_exec_ctrl & |
240 | && (vmcs_config.cpu_based_2nd_exec_ctrl & | 250 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; |
241 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); | 251 | } |
252 | |||
253 | static inline bool cpu_has_vmx_flexpriority(void) | ||
254 | { | ||
255 | return cpu_has_vmx_tpr_shadow() && | ||
256 | cpu_has_vmx_virtualize_apic_accesses(); | ||
242 | } | 257 | } |
243 | 258 | ||
244 | static inline int cpu_has_vmx_invept_individual_addr(void) | 259 | static inline int cpu_has_vmx_invept_individual_addr(void) |
245 | { | 260 | { |
246 | return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT)); | 261 | return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); |
247 | } | 262 | } |
248 | 263 | ||
249 | static inline int cpu_has_vmx_invept_context(void) | 264 | static inline int cpu_has_vmx_invept_context(void) |
250 | { | 265 | { |
251 | return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT)); | 266 | return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT); |
252 | } | 267 | } |
253 | 268 | ||
254 | static inline int cpu_has_vmx_invept_global(void) | 269 | static inline int cpu_has_vmx_invept_global(void) |
255 | { | 270 | { |
256 | return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT)); | 271 | return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT); |
257 | } | 272 | } |
258 | 273 | ||
259 | static inline int cpu_has_vmx_ept(void) | 274 | static inline int cpu_has_vmx_ept(void) |
260 | { | 275 | { |
261 | return (vmcs_config.cpu_based_2nd_exec_ctrl & | 276 | return vmcs_config.cpu_based_2nd_exec_ctrl & |
262 | SECONDARY_EXEC_ENABLE_EPT); | 277 | SECONDARY_EXEC_ENABLE_EPT; |
263 | } | ||
264 | |||
265 | static inline int vm_need_ept(void) | ||
266 | { | ||
267 | return (cpu_has_vmx_ept() && enable_ept); | ||
268 | } | 278 | } |
269 | 279 | ||
270 | static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) | 280 | static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) |
271 | { | 281 | { |
272 | return ((cpu_has_vmx_virtualize_apic_accesses()) && | 282 | return flexpriority_enabled && |
273 | (irqchip_in_kernel(kvm))); | 283 | (cpu_has_vmx_virtualize_apic_accesses()) && |
284 | (irqchip_in_kernel(kvm)); | ||
274 | } | 285 | } |
275 | 286 | ||
276 | static inline int cpu_has_vmx_vpid(void) | 287 | static inline int cpu_has_vmx_vpid(void) |
277 | { | 288 | { |
278 | return (vmcs_config.cpu_based_2nd_exec_ctrl & | 289 | return vmcs_config.cpu_based_2nd_exec_ctrl & |
279 | SECONDARY_EXEC_ENABLE_VPID); | 290 | SECONDARY_EXEC_ENABLE_VPID; |
280 | } | 291 | } |
281 | 292 | ||
282 | static inline int cpu_has_virtual_nmis(void) | 293 | static inline int cpu_has_virtual_nmis(void) |
@@ -284,6 +295,11 @@ static inline int cpu_has_virtual_nmis(void) | |||
284 | return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; | 295 | return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; |
285 | } | 296 | } |
286 | 297 | ||
298 | static inline bool report_flexpriority(void) | ||
299 | { | ||
300 | return flexpriority_enabled; | ||
301 | } | ||
302 | |||
287 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) | 303 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) |
288 | { | 304 | { |
289 | int i; | 305 | int i; |
@@ -381,7 +397,7 @@ static inline void ept_sync_global(void) | |||
381 | 397 | ||
382 | static inline void ept_sync_context(u64 eptp) | 398 | static inline void ept_sync_context(u64 eptp) |
383 | { | 399 | { |
384 | if (vm_need_ept()) { | 400 | if (enable_ept) { |
385 | if (cpu_has_vmx_invept_context()) | 401 | if (cpu_has_vmx_invept_context()) |
386 | __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); | 402 | __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); |
387 | else | 403 | else |
@@ -391,7 +407,7 @@ static inline void ept_sync_context(u64 eptp) | |||
391 | 407 | ||
392 | static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) | 408 | static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) |
393 | { | 409 | { |
394 | if (vm_need_ept()) { | 410 | if (enable_ept) { |
395 | if (cpu_has_vmx_invept_individual_addr()) | 411 | if (cpu_has_vmx_invept_individual_addr()) |
396 | __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, | 412 | __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, |
397 | eptp, gpa); | 413 | eptp, gpa); |
@@ -478,7 +494,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) | |||
478 | { | 494 | { |
479 | u32 eb; | 495 | u32 eb; |
480 | 496 | ||
481 | eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); | 497 | eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); |
482 | if (!vcpu->fpu_active) | 498 | if (!vcpu->fpu_active) |
483 | eb |= 1u << NM_VECTOR; | 499 | eb |= 1u << NM_VECTOR; |
484 | if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { | 500 | if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { |
@@ -488,9 +504,9 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) | |||
488 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) | 504 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) |
489 | eb |= 1u << BP_VECTOR; | 505 | eb |= 1u << BP_VECTOR; |
490 | } | 506 | } |
491 | if (vcpu->arch.rmode.active) | 507 | if (vcpu->arch.rmode.vm86_active) |
492 | eb = ~0; | 508 | eb = ~0; |
493 | if (vm_need_ept()) | 509 | if (enable_ept) |
494 | eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ | 510 | eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ |
495 | vmcs_write32(EXCEPTION_BITMAP, eb); | 511 | vmcs_write32(EXCEPTION_BITMAP, eb); |
496 | } | 512 | } |
@@ -724,29 +740,50 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) | |||
724 | 740 | ||
725 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | 741 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) |
726 | { | 742 | { |
727 | if (vcpu->arch.rmode.active) | 743 | if (vcpu->arch.rmode.vm86_active) |
728 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | 744 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; |
729 | vmcs_writel(GUEST_RFLAGS, rflags); | 745 | vmcs_writel(GUEST_RFLAGS, rflags); |
730 | } | 746 | } |
731 | 747 | ||
748 | static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) | ||
749 | { | ||
750 | u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
751 | int ret = 0; | ||
752 | |||
753 | if (interruptibility & GUEST_INTR_STATE_STI) | ||
754 | ret |= X86_SHADOW_INT_STI; | ||
755 | if (interruptibility & GUEST_INTR_STATE_MOV_SS) | ||
756 | ret |= X86_SHADOW_INT_MOV_SS; | ||
757 | |||
758 | return ret & mask; | ||
759 | } | ||
760 | |||
761 | static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) | ||
762 | { | ||
763 | u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
764 | u32 interruptibility = interruptibility_old; | ||
765 | |||
766 | interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); | ||
767 | |||
768 | if (mask & X86_SHADOW_INT_MOV_SS) | ||
769 | interruptibility |= GUEST_INTR_STATE_MOV_SS; | ||
770 | if (mask & X86_SHADOW_INT_STI) | ||
771 | interruptibility |= GUEST_INTR_STATE_STI; | ||
772 | |||
773 | if ((interruptibility != interruptibility_old)) | ||
774 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); | ||
775 | } | ||
776 | |||
732 | static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | 777 | static void skip_emulated_instruction(struct kvm_vcpu *vcpu) |
733 | { | 778 | { |
734 | unsigned long rip; | 779 | unsigned long rip; |
735 | u32 interruptibility; | ||
736 | 780 | ||
737 | rip = kvm_rip_read(vcpu); | 781 | rip = kvm_rip_read(vcpu); |
738 | rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | 782 | rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); |
739 | kvm_rip_write(vcpu, rip); | 783 | kvm_rip_write(vcpu, rip); |
740 | 784 | ||
741 | /* | 785 | /* skipping an emulated instruction also counts */ |
742 | * We emulated an instruction, so temporary interrupt blocking | 786 | vmx_set_interrupt_shadow(vcpu, 0); |
743 | * should be removed, if set. | ||
744 | */ | ||
745 | interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
746 | if (interruptibility & 3) | ||
747 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | ||
748 | interruptibility & ~3); | ||
749 | vcpu->arch.interrupt_window_open = 1; | ||
750 | } | 787 | } |
751 | 788 | ||
752 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | 789 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, |
@@ -760,7 +797,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
760 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; | 797 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; |
761 | } | 798 | } |
762 | 799 | ||
763 | if (vcpu->arch.rmode.active) { | 800 | if (vcpu->arch.rmode.vm86_active) { |
764 | vmx->rmode.irq.pending = true; | 801 | vmx->rmode.irq.pending = true; |
765 | vmx->rmode.irq.vector = nr; | 802 | vmx->rmode.irq.vector = nr; |
766 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | 803 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); |
@@ -773,8 +810,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
773 | return; | 810 | return; |
774 | } | 811 | } |
775 | 812 | ||
776 | if (nr == BP_VECTOR || nr == OF_VECTOR) { | 813 | if (kvm_exception_is_soft(nr)) { |
777 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | 814 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, |
815 | vmx->vcpu.arch.event_exit_inst_len); | ||
778 | intr_info |= INTR_TYPE_SOFT_EXCEPTION; | 816 | intr_info |= INTR_TYPE_SOFT_EXCEPTION; |
779 | } else | 817 | } else |
780 | intr_info |= INTR_TYPE_HARD_EXCEPTION; | 818 | intr_info |= INTR_TYPE_HARD_EXCEPTION; |
@@ -782,11 +820,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
782 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); | 820 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); |
783 | } | 821 | } |
784 | 822 | ||
785 | static bool vmx_exception_injected(struct kvm_vcpu *vcpu) | ||
786 | { | ||
787 | return false; | ||
788 | } | ||
789 | |||
790 | /* | 823 | /* |
791 | * Swap MSR entry in host/guest MSR entry array. | 824 | * Swap MSR entry in host/guest MSR entry array. |
792 | */ | 825 | */ |
@@ -812,6 +845,7 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) | |||
812 | static void setup_msrs(struct vcpu_vmx *vmx) | 845 | static void setup_msrs(struct vcpu_vmx *vmx) |
813 | { | 846 | { |
814 | int save_nmsrs; | 847 | int save_nmsrs; |
848 | unsigned long *msr_bitmap; | ||
815 | 849 | ||
816 | vmx_load_host_state(vmx); | 850 | vmx_load_host_state(vmx); |
817 | save_nmsrs = 0; | 851 | save_nmsrs = 0; |
@@ -847,6 +881,15 @@ static void setup_msrs(struct vcpu_vmx *vmx) | |||
847 | __find_msr_index(vmx, MSR_KERNEL_GS_BASE); | 881 | __find_msr_index(vmx, MSR_KERNEL_GS_BASE); |
848 | #endif | 882 | #endif |
849 | vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); | 883 | vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); |
884 | |||
885 | if (cpu_has_vmx_msr_bitmap()) { | ||
886 | if (is_long_mode(&vmx->vcpu)) | ||
887 | msr_bitmap = vmx_msr_bitmap_longmode; | ||
888 | else | ||
889 | msr_bitmap = vmx_msr_bitmap_legacy; | ||
890 | |||
891 | vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); | ||
892 | } | ||
850 | } | 893 | } |
851 | 894 | ||
852 | /* | 895 | /* |
@@ -1034,13 +1077,6 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) | |||
1034 | return 0; | 1077 | return 0; |
1035 | } | 1078 | } |
1036 | 1079 | ||
1037 | static int vmx_get_irq(struct kvm_vcpu *vcpu) | ||
1038 | { | ||
1039 | if (!vcpu->arch.interrupt.pending) | ||
1040 | return -1; | ||
1041 | return vcpu->arch.interrupt.nr; | ||
1042 | } | ||
1043 | |||
1044 | static __init int cpu_has_kvm_support(void) | 1080 | static __init int cpu_has_kvm_support(void) |
1045 | { | 1081 | { |
1046 | return cpu_has_vmx(); | 1082 | return cpu_has_vmx(); |
@@ -1241,7 +1277,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) | |||
1241 | struct page *pages; | 1277 | struct page *pages; |
1242 | struct vmcs *vmcs; | 1278 | struct vmcs *vmcs; |
1243 | 1279 | ||
1244 | pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); | 1280 | pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); |
1245 | if (!pages) | 1281 | if (!pages) |
1246 | return NULL; | 1282 | return NULL; |
1247 | vmcs = page_address(pages); | 1283 | vmcs = page_address(pages); |
@@ -1294,6 +1330,18 @@ static __init int hardware_setup(void) | |||
1294 | if (boot_cpu_has(X86_FEATURE_NX)) | 1330 | if (boot_cpu_has(X86_FEATURE_NX)) |
1295 | kvm_enable_efer_bits(EFER_NX); | 1331 | kvm_enable_efer_bits(EFER_NX); |
1296 | 1332 | ||
1333 | if (!cpu_has_vmx_vpid()) | ||
1334 | enable_vpid = 0; | ||
1335 | |||
1336 | if (!cpu_has_vmx_ept()) | ||
1337 | enable_ept = 0; | ||
1338 | |||
1339 | if (!cpu_has_vmx_flexpriority()) | ||
1340 | flexpriority_enabled = 0; | ||
1341 | |||
1342 | if (!cpu_has_vmx_tpr_shadow()) | ||
1343 | kvm_x86_ops->update_cr8_intercept = NULL; | ||
1344 | |||
1297 | return alloc_kvm_area(); | 1345 | return alloc_kvm_area(); |
1298 | } | 1346 | } |
1299 | 1347 | ||
@@ -1324,7 +1372,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
1324 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1372 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1325 | 1373 | ||
1326 | vmx->emulation_required = 1; | 1374 | vmx->emulation_required = 1; |
1327 | vcpu->arch.rmode.active = 0; | 1375 | vcpu->arch.rmode.vm86_active = 0; |
1328 | 1376 | ||
1329 | vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); | 1377 | vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); |
1330 | vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); | 1378 | vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); |
@@ -1386,7 +1434,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
1386 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1434 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1387 | 1435 | ||
1388 | vmx->emulation_required = 1; | 1436 | vmx->emulation_required = 1; |
1389 | vcpu->arch.rmode.active = 1; | 1437 | vcpu->arch.rmode.vm86_active = 1; |
1390 | 1438 | ||
1391 | vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); | 1439 | vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); |
1392 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); | 1440 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); |
@@ -1485,7 +1533,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu) | |||
1485 | static void vmx_flush_tlb(struct kvm_vcpu *vcpu) | 1533 | static void vmx_flush_tlb(struct kvm_vcpu *vcpu) |
1486 | { | 1534 | { |
1487 | vpid_sync_vcpu_all(to_vmx(vcpu)); | 1535 | vpid_sync_vcpu_all(to_vmx(vcpu)); |
1488 | if (vm_need_ept()) | 1536 | if (enable_ept) |
1489 | ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); | 1537 | ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); |
1490 | } | 1538 | } |
1491 | 1539 | ||
@@ -1555,10 +1603,10 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
1555 | 1603 | ||
1556 | vmx_fpu_deactivate(vcpu); | 1604 | vmx_fpu_deactivate(vcpu); |
1557 | 1605 | ||
1558 | if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) | 1606 | if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE)) |
1559 | enter_pmode(vcpu); | 1607 | enter_pmode(vcpu); |
1560 | 1608 | ||
1561 | if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE)) | 1609 | if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE)) |
1562 | enter_rmode(vcpu); | 1610 | enter_rmode(vcpu); |
1563 | 1611 | ||
1564 | #ifdef CONFIG_X86_64 | 1612 | #ifdef CONFIG_X86_64 |
@@ -1570,7 +1618,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
1570 | } | 1618 | } |
1571 | #endif | 1619 | #endif |
1572 | 1620 | ||
1573 | if (vm_need_ept()) | 1621 | if (enable_ept) |
1574 | ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); | 1622 | ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); |
1575 | 1623 | ||
1576 | vmcs_writel(CR0_READ_SHADOW, cr0); | 1624 | vmcs_writel(CR0_READ_SHADOW, cr0); |
@@ -1599,7 +1647,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
1599 | u64 eptp; | 1647 | u64 eptp; |
1600 | 1648 | ||
1601 | guest_cr3 = cr3; | 1649 | guest_cr3 = cr3; |
1602 | if (vm_need_ept()) { | 1650 | if (enable_ept) { |
1603 | eptp = construct_eptp(cr3); | 1651 | eptp = construct_eptp(cr3); |
1604 | vmcs_write64(EPT_POINTER, eptp); | 1652 | vmcs_write64(EPT_POINTER, eptp); |
1605 | ept_sync_context(eptp); | 1653 | ept_sync_context(eptp); |
@@ -1616,11 +1664,11 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
1616 | 1664 | ||
1617 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 1665 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
1618 | { | 1666 | { |
1619 | unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ? | 1667 | unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ? |
1620 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); | 1668 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); |
1621 | 1669 | ||
1622 | vcpu->arch.cr4 = cr4; | 1670 | vcpu->arch.cr4 = cr4; |
1623 | if (vm_need_ept()) | 1671 | if (enable_ept) |
1624 | ept_update_paging_mode_cr4(&hw_cr4, vcpu); | 1672 | ept_update_paging_mode_cr4(&hw_cr4, vcpu); |
1625 | 1673 | ||
1626 | vmcs_writel(CR4_READ_SHADOW, cr4); | 1674 | vmcs_writel(CR4_READ_SHADOW, cr4); |
@@ -1699,7 +1747,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
1699 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 1747 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; |
1700 | u32 ar; | 1748 | u32 ar; |
1701 | 1749 | ||
1702 | if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) { | 1750 | if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) { |
1703 | vcpu->arch.rmode.tr.selector = var->selector; | 1751 | vcpu->arch.rmode.tr.selector = var->selector; |
1704 | vcpu->arch.rmode.tr.base = var->base; | 1752 | vcpu->arch.rmode.tr.base = var->base; |
1705 | vcpu->arch.rmode.tr.limit = var->limit; | 1753 | vcpu->arch.rmode.tr.limit = var->limit; |
@@ -1709,7 +1757,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
1709 | vmcs_writel(sf->base, var->base); | 1757 | vmcs_writel(sf->base, var->base); |
1710 | vmcs_write32(sf->limit, var->limit); | 1758 | vmcs_write32(sf->limit, var->limit); |
1711 | vmcs_write16(sf->selector, var->selector); | 1759 | vmcs_write16(sf->selector, var->selector); |
1712 | if (vcpu->arch.rmode.active && var->s) { | 1760 | if (vcpu->arch.rmode.vm86_active && var->s) { |
1713 | /* | 1761 | /* |
1714 | * Hack real-mode segments into vm86 compatibility. | 1762 | * Hack real-mode segments into vm86 compatibility. |
1715 | */ | 1763 | */ |
@@ -1982,7 +2030,7 @@ static int init_rmode_identity_map(struct kvm *kvm) | |||
1982 | pfn_t identity_map_pfn; | 2030 | pfn_t identity_map_pfn; |
1983 | u32 tmp; | 2031 | u32 tmp; |
1984 | 2032 | ||
1985 | if (!vm_need_ept()) | 2033 | if (!enable_ept) |
1986 | return 1; | 2034 | return 1; |
1987 | if (unlikely(!kvm->arch.ept_identity_pagetable)) { | 2035 | if (unlikely(!kvm->arch.ept_identity_pagetable)) { |
1988 | printk(KERN_ERR "EPT: identity-mapping pagetable " | 2036 | printk(KERN_ERR "EPT: identity-mapping pagetable " |
@@ -2071,7 +2119,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx) | |||
2071 | int vpid; | 2119 | int vpid; |
2072 | 2120 | ||
2073 | vmx->vpid = 0; | 2121 | vmx->vpid = 0; |
2074 | if (!enable_vpid || !cpu_has_vmx_vpid()) | 2122 | if (!enable_vpid) |
2075 | return; | 2123 | return; |
2076 | spin_lock(&vmx_vpid_lock); | 2124 | spin_lock(&vmx_vpid_lock); |
2077 | vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); | 2125 | vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); |
@@ -2082,9 +2130,9 @@ static void allocate_vpid(struct vcpu_vmx *vmx) | |||
2082 | spin_unlock(&vmx_vpid_lock); | 2130 | spin_unlock(&vmx_vpid_lock); |
2083 | } | 2131 | } |
2084 | 2132 | ||
2085 | static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) | 2133 | static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) |
2086 | { | 2134 | { |
2087 | void *va; | 2135 | int f = sizeof(unsigned long); |
2088 | 2136 | ||
2089 | if (!cpu_has_vmx_msr_bitmap()) | 2137 | if (!cpu_has_vmx_msr_bitmap()) |
2090 | return; | 2138 | return; |
@@ -2094,16 +2142,21 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) | |||
2094 | * have the write-low and read-high bitmap offsets the wrong way round. | 2142 | * have the write-low and read-high bitmap offsets the wrong way round. |
2095 | * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. | 2143 | * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. |
2096 | */ | 2144 | */ |
2097 | va = kmap(msr_bitmap); | ||
2098 | if (msr <= 0x1fff) { | 2145 | if (msr <= 0x1fff) { |
2099 | __clear_bit(msr, va + 0x000); /* read-low */ | 2146 | __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ |
2100 | __clear_bit(msr, va + 0x800); /* write-low */ | 2147 | __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ |
2101 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | 2148 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { |
2102 | msr &= 0x1fff; | 2149 | msr &= 0x1fff; |
2103 | __clear_bit(msr, va + 0x400); /* read-high */ | 2150 | __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ |
2104 | __clear_bit(msr, va + 0xc00); /* write-high */ | 2151 | __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ |
2105 | } | 2152 | } |
2106 | kunmap(msr_bitmap); | 2153 | } |
2154 | |||
2155 | static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) | ||
2156 | { | ||
2157 | if (!longmode_only) | ||
2158 | __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr); | ||
2159 | __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); | ||
2107 | } | 2160 | } |
2108 | 2161 | ||
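__vmx_disable_intercept_for_msr() now takes a plain unsigned long * instead of kmap()ing a struct page, but the bitmap layout is unchanged: read-low at byte offset 0x000, read-high at 0x400, write-low at 0x800, write-high at 0xc00, with the high MSR range 0xc0000000-0xc0001fff indexed by its low 13 bits. A small userspace sketch of where a given MSR lands (the helper name is made up; the offsets are the ones used above):

#include <stdio.h>
#include <stdint.h>

/* byte offsets inside the 4 KiB VMX MSR bitmap, as used above */
#define READ_LOW    0x000
#define READ_HIGH   0x400
#define WRITE_LOW   0x800
#define WRITE_HIGH  0xc00

static void locate_msr(uint32_t msr)
{
        unsigned int rd, wr, bit;

        if (msr <= 0x1fff) {
                rd = READ_LOW;  wr = WRITE_LOW;  bit = msr;
        } else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
                rd = READ_HIGH; wr = WRITE_HIGH; bit = msr & 0x1fff;
        } else {
                printf("MSR %#x cannot be covered by the bitmap\n", msr);
                return;
        }
        printf("MSR %#010x: read bit %u at +%#x, write bit %u at +%#x\n",
               msr, bit, rd, bit, wr);
}

int main(void)
{
        locate_msr(0x00000174);         /* IA32_SYSENTER_CS */
        locate_msr(0xc0000080);         /* EFER */
        locate_msr(0x40000000);         /* outside both ranges */
        return 0;
}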
2109 | /* | 2162 | /* |
@@ -2121,11 +2174,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2121 | u32 exec_control; | 2174 | u32 exec_control; |
2122 | 2175 | ||
2123 | /* I/O */ | 2176 | /* I/O */ |
2124 | vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); | 2177 | vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); |
2125 | vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); | 2178 | vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); |
2126 | 2179 | ||
2127 | if (cpu_has_vmx_msr_bitmap()) | 2180 | if (cpu_has_vmx_msr_bitmap()) |
2128 | vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap)); | 2181 | vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); |
2129 | 2182 | ||
2130 | vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ | 2183 | vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ |
2131 | 2184 | ||
@@ -2141,7 +2194,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2141 | CPU_BASED_CR8_LOAD_EXITING; | 2194 | CPU_BASED_CR8_LOAD_EXITING; |
2142 | #endif | 2195 | #endif |
2143 | } | 2196 | } |
2144 | if (!vm_need_ept()) | 2197 | if (!enable_ept) |
2145 | exec_control |= CPU_BASED_CR3_STORE_EXITING | | 2198 | exec_control |= CPU_BASED_CR3_STORE_EXITING | |
2146 | CPU_BASED_CR3_LOAD_EXITING | | 2199 | CPU_BASED_CR3_LOAD_EXITING | |
2147 | CPU_BASED_INVLPG_EXITING; | 2200 | CPU_BASED_INVLPG_EXITING; |
@@ -2154,7 +2207,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2154 | ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | 2207 | ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; |
2155 | if (vmx->vpid == 0) | 2208 | if (vmx->vpid == 0) |
2156 | exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; | 2209 | exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; |
2157 | if (!vm_need_ept()) | 2210 | if (!enable_ept) |
2158 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; | 2211 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; |
2159 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | 2212 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); |
2160 | } | 2213 | } |
@@ -2273,7 +2326,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2273 | goto out; | 2326 | goto out; |
2274 | } | 2327 | } |
2275 | 2328 | ||
2276 | vmx->vcpu.arch.rmode.active = 0; | 2329 | vmx->vcpu.arch.rmode.vm86_active = 0; |
2277 | 2330 | ||
2278 | vmx->soft_vnmi_blocked = 0; | 2331 | vmx->soft_vnmi_blocked = 0; |
2279 | 2332 | ||
@@ -2402,14 +2455,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) | |||
2402 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | 2455 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); |
2403 | } | 2456 | } |
2404 | 2457 | ||
2405 | static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) | 2458 | static void vmx_inject_irq(struct kvm_vcpu *vcpu) |
2406 | { | 2459 | { |
2407 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2460 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2461 | uint32_t intr; | ||
2462 | int irq = vcpu->arch.interrupt.nr; | ||
2408 | 2463 | ||
2409 | KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); | 2464 | KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); |
2410 | 2465 | ||
2411 | ++vcpu->stat.irq_injections; | 2466 | ++vcpu->stat.irq_injections; |
2412 | if (vcpu->arch.rmode.active) { | 2467 | if (vcpu->arch.rmode.vm86_active) { |
2413 | vmx->rmode.irq.pending = true; | 2468 | vmx->rmode.irq.pending = true; |
2414 | vmx->rmode.irq.vector = irq; | 2469 | vmx->rmode.irq.vector = irq; |
2415 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | 2470 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); |
@@ -2419,8 +2474,14 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) | |||
2419 | kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); | 2474 | kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); |
2420 | return; | 2475 | return; |
2421 | } | 2476 | } |
2422 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 2477 | intr = irq | INTR_INFO_VALID_MASK; |
2423 | irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); | 2478 | if (vcpu->arch.interrupt.soft) { |
2479 | intr |= INTR_TYPE_SOFT_INTR; | ||
2480 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | ||
2481 | vmx->vcpu.arch.event_exit_inst_len); | ||
2482 | } else | ||
2483 | intr |= INTR_TYPE_EXT_INTR; | ||
2484 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); | ||
2424 | } | 2485 | } |
2425 | 2486 | ||
2426 | static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | 2487 | static void vmx_inject_nmi(struct kvm_vcpu *vcpu) |
@@ -2441,7 +2502,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | |||
2441 | } | 2502 | } |
2442 | 2503 | ||
2443 | ++vcpu->stat.nmi_injections; | 2504 | ++vcpu->stat.nmi_injections; |
2444 | if (vcpu->arch.rmode.active) { | 2505 | if (vcpu->arch.rmode.vm86_active) { |
2445 | vmx->rmode.irq.pending = true; | 2506 | vmx->rmode.irq.pending = true; |
2446 | vmx->rmode.irq.vector = NMI_VECTOR; | 2507 | vmx->rmode.irq.vector = NMI_VECTOR; |
2447 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | 2508 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); |
@@ -2456,76 +2517,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | |||
2456 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); | 2517 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); |
2457 | } | 2518 | } |
2458 | 2519 | ||
2459 | static void vmx_update_window_states(struct kvm_vcpu *vcpu) | 2520 | static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) |
2460 | { | 2521 | { |
2461 | u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
2462 | |||
2463 | vcpu->arch.nmi_window_open = | ||
2464 | !(guest_intr & (GUEST_INTR_STATE_STI | | ||
2465 | GUEST_INTR_STATE_MOV_SS | | ||
2466 | GUEST_INTR_STATE_NMI)); | ||
2467 | if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) | 2522 | if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) |
2468 | vcpu->arch.nmi_window_open = 0; | 2523 | return 0; |
2469 | |||
2470 | vcpu->arch.interrupt_window_open = | ||
2471 | ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && | ||
2472 | !(guest_intr & (GUEST_INTR_STATE_STI | | ||
2473 | GUEST_INTR_STATE_MOV_SS))); | ||
2474 | } | ||
2475 | |||
2476 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) | ||
2477 | { | ||
2478 | int word_index = __ffs(vcpu->arch.irq_summary); | ||
2479 | int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); | ||
2480 | int irq = word_index * BITS_PER_LONG + bit_index; | ||
2481 | 2524 | ||
2482 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); | 2525 | return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & |
2483 | if (!vcpu->arch.irq_pending[word_index]) | 2526 | (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS | |
2484 | clear_bit(word_index, &vcpu->arch.irq_summary); | 2527 | GUEST_INTR_STATE_NMI)); |
2485 | kvm_queue_interrupt(vcpu, irq); | ||
2486 | } | 2528 | } |
2487 | 2529 | ||
2488 | static void do_interrupt_requests(struct kvm_vcpu *vcpu, | 2530 | static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) |
2489 | struct kvm_run *kvm_run) | ||
2490 | { | 2531 | { |
2491 | vmx_update_window_states(vcpu); | 2532 | return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && |
2492 | 2533 | !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & | |
2493 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) | 2534 | (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); |
2494 | vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, | ||
2495 | GUEST_INTR_STATE_STI | | ||
2496 | GUEST_INTR_STATE_MOV_SS); | ||
2497 | |||
2498 | if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { | ||
2499 | if (vcpu->arch.interrupt.pending) { | ||
2500 | enable_nmi_window(vcpu); | ||
2501 | } else if (vcpu->arch.nmi_window_open) { | ||
2502 | vcpu->arch.nmi_pending = false; | ||
2503 | vcpu->arch.nmi_injected = true; | ||
2504 | } else { | ||
2505 | enable_nmi_window(vcpu); | ||
2506 | return; | ||
2507 | } | ||
2508 | } | ||
2509 | if (vcpu->arch.nmi_injected) { | ||
2510 | vmx_inject_nmi(vcpu); | ||
2511 | if (vcpu->arch.nmi_pending) | ||
2512 | enable_nmi_window(vcpu); | ||
2513 | else if (vcpu->arch.irq_summary | ||
2514 | || kvm_run->request_interrupt_window) | ||
2515 | enable_irq_window(vcpu); | ||
2516 | return; | ||
2517 | } | ||
2518 | |||
2519 | if (vcpu->arch.interrupt_window_open) { | ||
2520 | if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) | ||
2521 | kvm_do_inject_irq(vcpu); | ||
2522 | |||
2523 | if (vcpu->arch.interrupt.pending) | ||
2524 | vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); | ||
2525 | } | ||
2526 | if (!vcpu->arch.interrupt_window_open && | ||
2527 | (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) | ||
2528 | enable_irq_window(vcpu); | ||
2529 | } | 2535 | } |
2530 | 2536 | ||
2531 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) | 2537 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) |
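The cached window state (interrupt_window_open / nmi_window_open) disappears in favour of stateless predicates over the VMCS. A standalone sketch of the two checks, assuming the usual interruptibility-state encoding and RFLAGS.IF; the constants are illustrative stand-ins for the GUEST_INTR_STATE_* masks:

	#include <stdbool.h>
	#include <stdint.h>

	#define INTR_STATE_STI		(1u << 0)	/* blocking by STI */
	#define INTR_STATE_MOV_SS	(1u << 1)	/* blocking by MOV SS */
	#define INTR_STATE_NMI		(1u << 3)	/* blocking by NMI */
	#define RFLAGS_IF		(1u << 9)

	static bool nmi_allowed(uint32_t interruptibility)
	{
		return !(interruptibility &
			 (INTR_STATE_STI | INTR_STATE_MOV_SS | INTR_STATE_NMI));
	}

	static bool interrupt_allowed(uint32_t interruptibility, uint64_t rflags)
	{
		return (rflags & RFLAGS_IF) &&
		       !(interruptibility & (INTR_STATE_STI | INTR_STATE_MOV_SS));
	}

The injection ordering that do_interrupt_requests() used to open-code moves to the generic x86 code, which now just asks these predicates before queuing an event.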
@@ -2585,6 +2591,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, | |||
2585 | return 0; | 2591 | return 0; |
2586 | } | 2592 | } |
2587 | 2593 | ||
2594 | /* | ||
2595 | * Trigger machine check on the host. We assume all the MSRs are already set up | ||
2596 | * by the CPU and that we still run on the same CPU as the MCE occurred on. | ||
2597 | * We pass a fake environment to the machine check handler because we want | ||
2598 | * the guest to be always treated like user space, no matter what context | ||
2599 | * it used internally. | ||
2600 | */ | ||
2601 | static void kvm_machine_check(void) | ||
2602 | { | ||
2603 | #if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) | ||
2604 | struct pt_regs regs = { | ||
2605 | .cs = 3, /* Fake ring 3 no matter what the guest ran on */ | ||
2606 | .flags = X86_EFLAGS_IF, | ||
2607 | }; | ||
2608 | |||
2609 | do_machine_check(®s, 0); | ||
2610 | #endif | ||
2611 | } | ||
2612 | |||
2613 | static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2614 | { | ||
2615 | /* already handled by vcpu_run */ | ||
2616 | return 1; | ||
2617 | } | ||
2618 | |||
2588 | static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2619 | static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2589 | { | 2620 | { |
2590 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2621 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
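handle_exception() now short-circuits on machine-check exceptions before any other processing. The is_machine_check() predicate itself is not shown in this hunk; a sketch of how such a classification typically looks, assuming the VM-exit interruption-information layout (vector in bits 7:0, type in bits 10:8, valid in bit 31) and #MC = vector 18:

	#include <stdbool.h>
	#include <stdint.h>

	#define VEC_MASK		0xffu
	#define TYPE_MASK		(7u << 8)
	#define TYPE_HARD_EXCEPTION	(3u << 8)
	#define INFO_VALID		(1u << 31)
	#define MC_VECTOR		18

	static bool looks_like_machine_check(uint32_t intr_info)
	{
		/* valid hardware exception with the machine-check vector */
		return (intr_info & (VEC_MASK | TYPE_MASK | INFO_VALID)) ==
		       (MC_VECTOR | TYPE_HARD_EXCEPTION | INFO_VALID);
	}

The actual #MC is forwarded to the host handler earlier, in vmx_complete_interrupts(), while interrupts are still disabled; by the time handle_exception() runs there is nothing left to do, hence the trivial handle_machine_check().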
@@ -2596,17 +2627,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2596 | vect_info = vmx->idt_vectoring_info; | 2627 | vect_info = vmx->idt_vectoring_info; |
2597 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | 2628 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); |
2598 | 2629 | ||
2630 | if (is_machine_check(intr_info)) | ||
2631 | return handle_machine_check(vcpu, kvm_run); | ||
2632 | |||
2599 | if ((vect_info & VECTORING_INFO_VALID_MASK) && | 2633 | if ((vect_info & VECTORING_INFO_VALID_MASK) && |
2600 | !is_page_fault(intr_info)) | 2634 | !is_page_fault(intr_info)) |
2601 | printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " | 2635 | printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " |
2602 | "intr info 0x%x\n", __func__, vect_info, intr_info); | 2636 | "intr info 0x%x\n", __func__, vect_info, intr_info); |
2603 | 2637 | ||
2604 | if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { | ||
2605 | int irq = vect_info & VECTORING_INFO_VECTOR_MASK; | ||
2606 | set_bit(irq, vcpu->arch.irq_pending); | ||
2607 | set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); | ||
2608 | } | ||
2609 | |||
2610 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) | 2638 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) |
2611 | return 1; /* already handled by vmx_vcpu_run() */ | 2639 | return 1; /* already handled by vmx_vcpu_run() */ |
2612 | 2640 | ||
@@ -2628,17 +2656,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2628 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | 2656 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); |
2629 | if (is_page_fault(intr_info)) { | 2657 | if (is_page_fault(intr_info)) { |
2630 | /* EPT won't cause page fault directly */ | 2658 | /* EPT won't cause page fault directly */ |
2631 | if (vm_need_ept()) | 2659 | if (enable_ept) |
2632 | BUG(); | 2660 | BUG(); |
2633 | cr2 = vmcs_readl(EXIT_QUALIFICATION); | 2661 | cr2 = vmcs_readl(EXIT_QUALIFICATION); |
2634 | KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, | 2662 | KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, |
2635 | (u32)((u64)cr2 >> 32), handler); | 2663 | (u32)((u64)cr2 >> 32), handler); |
2636 | if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) | 2664 | if (kvm_event_needs_reinjection(vcpu)) |
2637 | kvm_mmu_unprotect_page_virt(vcpu, cr2); | 2665 | kvm_mmu_unprotect_page_virt(vcpu, cr2); |
2638 | return kvm_mmu_page_fault(vcpu, cr2, error_code); | 2666 | return kvm_mmu_page_fault(vcpu, cr2, error_code); |
2639 | } | 2667 | } |
2640 | 2668 | ||
2641 | if (vcpu->arch.rmode.active && | 2669 | if (vcpu->arch.rmode.vm86_active && |
2642 | handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, | 2670 | handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, |
2643 | error_code)) { | 2671 | error_code)) { |
2644 | if (vcpu->arch.halt_request) { | 2672 | if (vcpu->arch.halt_request) { |
@@ -2753,13 +2781,18 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2753 | kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); | 2781 | kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); |
2754 | skip_emulated_instruction(vcpu); | 2782 | skip_emulated_instruction(vcpu); |
2755 | return 1; | 2783 | return 1; |
2756 | case 8: | 2784 | case 8: { |
2757 | kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg)); | 2785 | u8 cr8_prev = kvm_get_cr8(vcpu); |
2758 | skip_emulated_instruction(vcpu); | 2786 | u8 cr8 = kvm_register_read(vcpu, reg); |
2759 | if (irqchip_in_kernel(vcpu->kvm)) | 2787 | kvm_set_cr8(vcpu, cr8); |
2760 | return 1; | 2788 | skip_emulated_instruction(vcpu); |
2761 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; | 2789 | if (irqchip_in_kernel(vcpu->kvm)) |
2762 | return 0; | 2790 | return 1; |
2791 | if (cr8_prev <= cr8) | ||
2792 | return 1; | ||
2793 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; | ||
2794 | return 0; | ||
2795 | } | ||
2763 | }; | 2796 | }; |
2764 | break; | 2797 | break; |
2765 | case 2: /* clts */ | 2798 | case 2: /* clts */ |
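The CR8 branch now compares the old and new TPR before deciding whether userspace needs to see the write: raising (or keeping) the TPR can never unmask a pending interrupt, so only a lowered TPR forces the KVM_EXIT_SET_TPR round trip when the irqchip lives in userspace. A compact restatement of that policy (illustrative helper, not kernel API):

	#include <stdbool.h>
	#include <stdint.h>

	/* true: bounce to userspace with KVM_EXIT_SET_TPR; false: keep running */
	static bool cr8_write_exits_to_userspace(bool irqchip_in_kernel,
						 uint8_t cr8_prev, uint8_t cr8_new)
	{
		if (irqchip_in_kernel)
			return false;		/* in-kernel LAPIC re-evaluates on its own */
		return cr8_new < cr8_prev;	/* TPR lowered: userspace may now inject */
	}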
@@ -2957,8 +2990,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, | |||
2957 | * If the user space waits to inject interrupts, exit as soon as | 2990 | * If the user space waits to inject interrupts, exit as soon as |
2958 | * possible | 2991 | * possible |
2959 | */ | 2992 | */ |
2960 | if (kvm_run->request_interrupt_window && | 2993 | if (!irqchip_in_kernel(vcpu->kvm) && |
2961 | !vcpu->arch.irq_summary) { | 2994 | kvm_run->request_interrupt_window && |
2995 | !kvm_cpu_has_interrupt(vcpu)) { | ||
2962 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; | 2996 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; |
2963 | return 0; | 2997 | return 0; |
2964 | } | 2998 | } |
@@ -2980,7 +3014,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2980 | 3014 | ||
2981 | static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 3015 | static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2982 | { | 3016 | { |
2983 | u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); | 3017 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
2984 | 3018 | ||
2985 | kvm_mmu_invlpg(vcpu, exit_qualification); | 3019 | kvm_mmu_invlpg(vcpu, exit_qualification); |
2986 | skip_emulated_instruction(vcpu); | 3020 | skip_emulated_instruction(vcpu); |
@@ -2996,11 +3030,11 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2996 | 3030 | ||
2997 | static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 3031 | static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2998 | { | 3032 | { |
2999 | u64 exit_qualification; | 3033 | unsigned long exit_qualification; |
3000 | enum emulation_result er; | 3034 | enum emulation_result er; |
3001 | unsigned long offset; | 3035 | unsigned long offset; |
3002 | 3036 | ||
3003 | exit_qualification = vmcs_read64(EXIT_QUALIFICATION); | 3037 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
3004 | offset = exit_qualification & 0xffful; | 3038 | offset = exit_qualification & 0xffful; |
3005 | 3039 | ||
3006 | er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); | 3040 | er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); |
@@ -3019,22 +3053,41 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3019 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3053 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3020 | unsigned long exit_qualification; | 3054 | unsigned long exit_qualification; |
3021 | u16 tss_selector; | 3055 | u16 tss_selector; |
3022 | int reason; | 3056 | int reason, type, idt_v; |
3057 | |||
3058 | idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); | ||
3059 | type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); | ||
3023 | 3060 | ||
3024 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | 3061 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
3025 | 3062 | ||
3026 | reason = (u32)exit_qualification >> 30; | 3063 | reason = (u32)exit_qualification >> 30; |
3027 | if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected && | 3064 | if (reason == TASK_SWITCH_GATE && idt_v) { |
3028 | (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && | 3065 | switch (type) { |
3029 | (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK) | 3066 | case INTR_TYPE_NMI_INTR: |
3030 | == INTR_TYPE_NMI_INTR) { | 3067 | vcpu->arch.nmi_injected = false; |
3031 | vcpu->arch.nmi_injected = false; | 3068 | if (cpu_has_virtual_nmis()) |
3032 | if (cpu_has_virtual_nmis()) | 3069 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, |
3033 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, | 3070 | GUEST_INTR_STATE_NMI); |
3034 | GUEST_INTR_STATE_NMI); | 3071 | break; |
3072 | case INTR_TYPE_EXT_INTR: | ||
3073 | case INTR_TYPE_SOFT_INTR: | ||
3074 | kvm_clear_interrupt_queue(vcpu); | ||
3075 | break; | ||
3076 | case INTR_TYPE_HARD_EXCEPTION: | ||
3077 | case INTR_TYPE_SOFT_EXCEPTION: | ||
3078 | kvm_clear_exception_queue(vcpu); | ||
3079 | break; | ||
3080 | default: | ||
3081 | break; | ||
3082 | } | ||
3035 | } | 3083 | } |
3036 | tss_selector = exit_qualification; | 3084 | tss_selector = exit_qualification; |
3037 | 3085 | ||
3086 | if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && | ||
3087 | type != INTR_TYPE_EXT_INTR && | ||
3088 | type != INTR_TYPE_NMI_INTR)) | ||
3089 | skip_emulated_instruction(vcpu); | ||
3090 | |||
3038 | if (!kvm_task_switch(vcpu, tss_selector, reason)) | 3091 | if (!kvm_task_switch(vcpu, tss_selector, reason)) |
3039 | return 0; | 3092 | return 0; |
3040 | 3093 | ||
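Two things change in handle_task_switch(): pending events that were being delivered through a task gate are dropped from the queues by type, and the instruction is only skipped when the switch was initiated by an instruction rather than by hardware event delivery. The skip decision, restated as a standalone predicate (names illustrative):

	#include <stdbool.h>

	enum event_type {
		TYPE_EXT_INTR,
		TYPE_NMI,
		TYPE_HARD_EXCEPTION,
		TYPE_SOFT_INTR,
		TYPE_SOFT_EXCEPTION,
	};

	/* A switch initiated by an instruction (CALL/JMP/IRET/INT n) needs RIP
	 * advanced past that instruction; a switch caused by delivering a
	 * hardware interrupt, NMI or exception must leave RIP alone. */
	static bool should_skip_instruction(bool idt_vectoring_valid,
					    enum event_type type)
	{
		if (!idt_vectoring_valid)
			return true;
		return type != TYPE_HARD_EXCEPTION &&
		       type != TYPE_EXT_INTR &&
		       type != TYPE_NMI;
	}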
@@ -3051,11 +3104,11 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3051 | 3104 | ||
3052 | static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 3105 | static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
3053 | { | 3106 | { |
3054 | u64 exit_qualification; | 3107 | unsigned long exit_qualification; |
3055 | gpa_t gpa; | 3108 | gpa_t gpa; |
3056 | int gla_validity; | 3109 | int gla_validity; |
3057 | 3110 | ||
3058 | exit_qualification = vmcs_read64(EXIT_QUALIFICATION); | 3111 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
3059 | 3112 | ||
3060 | if (exit_qualification & (1 << 6)) { | 3113 | if (exit_qualification & (1 << 6)) { |
3061 | printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); | 3114 | printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); |
@@ -3067,7 +3120,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3067 | printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); | 3120 | printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); |
3068 | printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", | 3121 | printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", |
3069 | (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), | 3122 | (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), |
3070 | (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); | 3123 | vmcs_readl(GUEST_LINEAR_ADDRESS)); |
3071 | printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", | 3124 | printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", |
3072 | (long unsigned int)exit_qualification); | 3125 | (long unsigned int)exit_qualification); |
3073 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | 3126 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; |
@@ -3150,6 +3203,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, | |||
3150 | [EXIT_REASON_WBINVD] = handle_wbinvd, | 3203 | [EXIT_REASON_WBINVD] = handle_wbinvd, |
3151 | [EXIT_REASON_TASK_SWITCH] = handle_task_switch, | 3204 | [EXIT_REASON_TASK_SWITCH] = handle_task_switch, |
3152 | [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, | 3205 | [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, |
3206 | [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, | ||
3153 | }; | 3207 | }; |
3154 | 3208 | ||
3155 | static const int kvm_vmx_max_exit_handlers = | 3209 | static const int kvm_vmx_max_exit_handlers = |
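The new entry hooks EXIT_REASON_MCE_DURING_VMENTRY into the same reason-indexed handler table as every other exit. For reference, the dispatch pattern that table feeds is essentially the following (simplified types; the real code also reports unknown reasons back to userspace):

	#include <stddef.h>
	#include <stdint.h>

	typedef int (*exit_handler_t)(void *vcpu, void *run);

	static int dispatch_exit(const exit_handler_t *handlers, size_t nr_handlers,
				 uint32_t exit_reason, void *vcpu, void *run)
	{
		/* basic exit reason indexes the table; 1 = handled, 0 = to userspace */
		if (exit_reason < nr_handlers && handlers[exit_reason])
			return handlers[exit_reason](vcpu, run);
		return 0;
	}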
@@ -3159,10 +3213,10 @@ static const int kvm_vmx_max_exit_handlers = | |||
3159 | * The guest has exited. See if we can fix it or if we need userspace | 3213 | * The guest has exited. See if we can fix it or if we need userspace |
3160 | * assistance. | 3214 | * assistance. |
3161 | */ | 3215 | */ |
3162 | static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | 3216 | static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) |
3163 | { | 3217 | { |
3164 | u32 exit_reason = vmcs_read32(VM_EXIT_REASON); | ||
3165 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3218 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3219 | u32 exit_reason = vmx->exit_reason; | ||
3166 | u32 vectoring_info = vmx->idt_vectoring_info; | 3220 | u32 vectoring_info = vmx->idt_vectoring_info; |
3167 | 3221 | ||
3168 | KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), | 3222 | KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), |
@@ -3178,7 +3232,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
3178 | 3232 | ||
3179 | /* Accessing CR3 doesn't cause a VM exit in paging mode, so we need | 3233 | /* Accessing CR3 doesn't cause a VM exit in paging mode, so we need |
3180 | * to sync with the guest's real CR3. */ | 3234 | * to sync with the guest's real CR3. */ |
3181 | if (vm_need_ept() && is_paging(vcpu)) { | 3235 | if (enable_ept && is_paging(vcpu)) { |
3182 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); | 3236 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); |
3183 | ept_load_pdptrs(vcpu); | 3237 | ept_load_pdptrs(vcpu); |
3184 | } | 3238 | } |
@@ -3199,9 +3253,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
3199 | __func__, vectoring_info, exit_reason); | 3253 | __func__, vectoring_info, exit_reason); |
3200 | 3254 | ||
3201 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { | 3255 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { |
3202 | if (vcpu->arch.interrupt_window_open) { | 3256 | if (vmx_interrupt_allowed(vcpu)) { |
3203 | vmx->soft_vnmi_blocked = 0; | 3257 | vmx->soft_vnmi_blocked = 0; |
3204 | vcpu->arch.nmi_window_open = 1; | ||
3205 | } else if (vmx->vnmi_blocked_time > 1000000000LL && | 3258 | } else if (vmx->vnmi_blocked_time > 1000000000LL && |
3206 | vcpu->arch.nmi_pending) { | 3259 | vcpu->arch.nmi_pending) { |
3207 | /* | 3260 | /* |
@@ -3214,7 +3267,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
3214 | "state on VCPU %d after 1 s timeout\n", | 3267 | "state on VCPU %d after 1 s timeout\n", |
3215 | __func__, vcpu->vcpu_id); | 3268 | __func__, vcpu->vcpu_id); |
3216 | vmx->soft_vnmi_blocked = 0; | 3269 | vmx->soft_vnmi_blocked = 0; |
3217 | vmx->vcpu.arch.nmi_window_open = 1; | ||
3218 | } | 3270 | } |
3219 | } | 3271 | } |
3220 | 3272 | ||
@@ -3228,122 +3280,107 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
3228 | return 0; | 3280 | return 0; |
3229 | } | 3281 | } |
3230 | 3282 | ||
3231 | static void update_tpr_threshold(struct kvm_vcpu *vcpu) | 3283 | static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) |
3232 | { | 3284 | { |
3233 | int max_irr, tpr; | 3285 | if (irr == -1 || tpr < irr) { |
3234 | |||
3235 | if (!vm_need_tpr_shadow(vcpu->kvm)) | ||
3236 | return; | ||
3237 | |||
3238 | if (!kvm_lapic_enabled(vcpu) || | ||
3239 | ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) { | ||
3240 | vmcs_write32(TPR_THRESHOLD, 0); | 3286 | vmcs_write32(TPR_THRESHOLD, 0); |
3241 | return; | 3287 | return; |
3242 | } | 3288 | } |
3243 | 3289 | ||
3244 | tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4; | 3290 | vmcs_write32(TPR_THRESHOLD, irr); |
3245 | vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); | ||
3246 | } | 3291 | } |
3247 | 3292 | ||
3248 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | 3293 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) |
3249 | { | 3294 | { |
3250 | u32 exit_intr_info; | 3295 | u32 exit_intr_info; |
3251 | u32 idt_vectoring_info; | 3296 | u32 idt_vectoring_info = vmx->idt_vectoring_info; |
3252 | bool unblock_nmi; | 3297 | bool unblock_nmi; |
3253 | u8 vector; | 3298 | u8 vector; |
3254 | int type; | 3299 | int type; |
3255 | bool idtv_info_valid; | 3300 | bool idtv_info_valid; |
3256 | u32 error; | ||
3257 | 3301 | ||
3258 | exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | 3302 | exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); |
3303 | |||
3304 | vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); | ||
3305 | |||
3306 | /* Handle machine checks before interrupts are enabled */ | ||
3307 | if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) | ||
3308 | || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI | ||
3309 | && is_machine_check(exit_intr_info))) | ||
3310 | kvm_machine_check(); | ||
3311 | |||
3312 | /* We need to handle NMIs before interrupts are enabled */ | ||
3313 | if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && | ||
3314 | (exit_intr_info & INTR_INFO_VALID_MASK)) { | ||
3315 | KVMTRACE_0D(NMI, &vmx->vcpu, handler); | ||
3316 | asm("int $2"); | ||
3317 | } | ||
3318 | |||
3319 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; | ||
3320 | |||
3259 | if (cpu_has_virtual_nmis()) { | 3321 | if (cpu_has_virtual_nmis()) { |
3260 | unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; | 3322 | unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; |
3261 | vector = exit_intr_info & INTR_INFO_VECTOR_MASK; | 3323 | vector = exit_intr_info & INTR_INFO_VECTOR_MASK; |
3262 | /* | 3324 | /* |
3263 | * SDM 3: 25.7.1.2 | 3325 | * SDM 3: 27.7.1.2 (September 2008) |
3264 | * Re-set bit "block by NMI" before VM entry if vmexit caused by | 3326 | * Re-set bit "block by NMI" before VM entry if vmexit caused by |
3265 | * a guest IRET fault. | 3327 | * a guest IRET fault. |
3328 | * SDM 3: 23.2.2 (September 2008) | ||
3329 | * Bit 12 is undefined in any of the following cases: | ||
3330 | * If the VM exit sets the valid bit in the IDT-vectoring | ||
3331 | * information field. | ||
3332 | * If the VM exit is due to a double fault. | ||
3266 | */ | 3333 | */ |
3267 | if (unblock_nmi && vector != DF_VECTOR) | 3334 | if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && |
3335 | vector != DF_VECTOR && !idtv_info_valid) | ||
3268 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, | 3336 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, |
3269 | GUEST_INTR_STATE_NMI); | 3337 | GUEST_INTR_STATE_NMI); |
3270 | } else if (unlikely(vmx->soft_vnmi_blocked)) | 3338 | } else if (unlikely(vmx->soft_vnmi_blocked)) |
3271 | vmx->vnmi_blocked_time += | 3339 | vmx->vnmi_blocked_time += |
3272 | ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); | 3340 | ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); |
3273 | 3341 | ||
3274 | idt_vectoring_info = vmx->idt_vectoring_info; | 3342 | vmx->vcpu.arch.nmi_injected = false; |
3275 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; | 3343 | kvm_clear_exception_queue(&vmx->vcpu); |
3344 | kvm_clear_interrupt_queue(&vmx->vcpu); | ||
3345 | |||
3346 | if (!idtv_info_valid) | ||
3347 | return; | ||
3348 | |||
3276 | vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; | 3349 | vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; |
3277 | type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; | 3350 | type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; |
3278 | if (vmx->vcpu.arch.nmi_injected) { | 3351 | |
3352 | switch (type) { | ||
3353 | case INTR_TYPE_NMI_INTR: | ||
3354 | vmx->vcpu.arch.nmi_injected = true; | ||
3279 | /* | 3355 | /* |
3280 | * SDM 3: 25.7.1.2 | 3356 | * SDM 3: 27.7.1.2 (September 2008) |
3281 | * Clear bit "block by NMI" before VM entry if a NMI delivery | 3357 | * Clear bit "block by NMI" before VM entry if a NMI |
3282 | * faulted. | 3358 | * delivery faulted. |
3283 | */ | 3359 | */ |
3284 | if (idtv_info_valid && type == INTR_TYPE_NMI_INTR) | 3360 | vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, |
3285 | vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, | 3361 | GUEST_INTR_STATE_NMI); |
3286 | GUEST_INTR_STATE_NMI); | 3362 | break; |
3287 | else | 3363 | case INTR_TYPE_SOFT_EXCEPTION: |
3288 | vmx->vcpu.arch.nmi_injected = false; | 3364 | vmx->vcpu.arch.event_exit_inst_len = |
3289 | } | 3365 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); |
3290 | kvm_clear_exception_queue(&vmx->vcpu); | 3366 | /* fall through */ |
3291 | if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION || | 3367 | case INTR_TYPE_HARD_EXCEPTION: |
3292 | type == INTR_TYPE_SOFT_EXCEPTION)) { | ||
3293 | if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { | 3368 | if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { |
3294 | error = vmcs_read32(IDT_VECTORING_ERROR_CODE); | 3369 | u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); |
3295 | kvm_queue_exception_e(&vmx->vcpu, vector, error); | 3370 | kvm_queue_exception_e(&vmx->vcpu, vector, err); |
3296 | } else | 3371 | } else |
3297 | kvm_queue_exception(&vmx->vcpu, vector); | 3372 | kvm_queue_exception(&vmx->vcpu, vector); |
3298 | vmx->idt_vectoring_info = 0; | 3373 | break; |
3299 | } | 3374 | case INTR_TYPE_SOFT_INTR: |
3300 | kvm_clear_interrupt_queue(&vmx->vcpu); | 3375 | vmx->vcpu.arch.event_exit_inst_len = |
3301 | if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { | 3376 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); |
3302 | kvm_queue_interrupt(&vmx->vcpu, vector); | 3377 | /* fall through */ |
3303 | vmx->idt_vectoring_info = 0; | 3378 | case INTR_TYPE_EXT_INTR: |
3304 | } | 3379 | kvm_queue_interrupt(&vmx->vcpu, vector, |
3305 | } | 3380 | type == INTR_TYPE_SOFT_INTR); |
3306 | 3381 | break; | |
3307 | static void vmx_intr_assist(struct kvm_vcpu *vcpu) | 3382 | default: |
3308 | { | 3383 | break; |
3309 | update_tpr_threshold(vcpu); | ||
3310 | |||
3311 | vmx_update_window_states(vcpu); | ||
3312 | |||
3313 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) | ||
3314 | vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, | ||
3315 | GUEST_INTR_STATE_STI | | ||
3316 | GUEST_INTR_STATE_MOV_SS); | ||
3317 | |||
3318 | if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { | ||
3319 | if (vcpu->arch.interrupt.pending) { | ||
3320 | enable_nmi_window(vcpu); | ||
3321 | } else if (vcpu->arch.nmi_window_open) { | ||
3322 | vcpu->arch.nmi_pending = false; | ||
3323 | vcpu->arch.nmi_injected = true; | ||
3324 | } else { | ||
3325 | enable_nmi_window(vcpu); | ||
3326 | return; | ||
3327 | } | ||
3328 | } | ||
3329 | if (vcpu->arch.nmi_injected) { | ||
3330 | vmx_inject_nmi(vcpu); | ||
3331 | if (vcpu->arch.nmi_pending) | ||
3332 | enable_nmi_window(vcpu); | ||
3333 | else if (kvm_cpu_has_interrupt(vcpu)) | ||
3334 | enable_irq_window(vcpu); | ||
3335 | return; | ||
3336 | } | ||
3337 | if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { | ||
3338 | if (vcpu->arch.interrupt_window_open) | ||
3339 | kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); | ||
3340 | else | ||
3341 | enable_irq_window(vcpu); | ||
3342 | } | ||
3343 | if (vcpu->arch.interrupt.pending) { | ||
3344 | vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); | ||
3345 | if (kvm_cpu_has_interrupt(vcpu)) | ||
3346 | enable_irq_window(vcpu); | ||
3347 | } | 3384 | } |
3348 | } | 3385 | } |
3349 | 3386 | ||
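vmx_complete_interrupts() now funnels everything through the exception and interrupt queues: whatever the IDT-vectoring information says was in flight at exit time is decoded by type and re-queued, so the generic code can re-deliver it on the next entry. A standalone sketch of that decode, assuming the usual field layout (vector 7:0, type 10:8, deliver-error-code bit 11, valid bit 31):

	#include <stdbool.h>
	#include <stdint.h>

	#define VEC_MASK	0xffu
	#define TYPE_SHIFT	8
	#define TYPE_MASK	(7u << TYPE_SHIFT)
	#define DELIVER_CODE	(1u << 11)
	#define INFO_VALID	(1u << 31)

	enum { T_EXT_INTR = 0, T_NMI = 2, T_HARD_EXC = 3, T_SOFT_INTR = 4, T_SOFT_EXC = 6 };

	struct pending_event {
		bool nmi, exception, interrupt, has_error_code;
		uint8_t vector;
	};

	static void requeue_from_idt_vectoring(uint32_t info, struct pending_event *ev)
	{
		if (!(info & INFO_VALID))
			return;				/* nothing was being delivered */

		ev->vector = info & VEC_MASK;
		switch ((info & TYPE_MASK) >> TYPE_SHIFT) {
		case T_NMI:
			ev->nmi = true;			/* NMI blocking also cleared, as above */
			break;
		case T_SOFT_EXC:			/* instruction length saved first */
		case T_HARD_EXC:
			ev->exception = true;
			ev->has_error_code = info & DELIVER_CODE;
			break;
		case T_SOFT_INTR:			/* instruction length saved first */
		case T_EXT_INTR:
			ev->interrupt = true;
			break;
		}
	}

Moving the NMI forwarding and the machine-check trigger into this function guarantees both run before local interrupts are re-enabled after the VM exit.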
@@ -3381,7 +3418,6 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) | |||
3381 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 3418 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
3382 | { | 3419 | { |
3383 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3420 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3384 | u32 intr_info; | ||
3385 | 3421 | ||
3386 | /* Record the guest's net vcpu time for enforced NMI injections. */ | 3422 | /* Record the guest's net vcpu time for enforced NMI injections. */ |
3387 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) | 3423 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) |
@@ -3505,20 +3541,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3505 | if (vmx->rmode.irq.pending) | 3541 | if (vmx->rmode.irq.pending) |
3506 | fixup_rmode_irq(vmx); | 3542 | fixup_rmode_irq(vmx); |
3507 | 3543 | ||
3508 | vmx_update_window_states(vcpu); | ||
3509 | |||
3510 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); | 3544 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); |
3511 | vmx->launched = 1; | 3545 | vmx->launched = 1; |
3512 | 3546 | ||
3513 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
3514 | |||
3515 | /* We need to handle NMIs before interrupts are enabled */ | ||
3516 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && | ||
3517 | (intr_info & INTR_INFO_VALID_MASK)) { | ||
3518 | KVMTRACE_0D(NMI, vcpu, handler); | ||
3519 | asm("int $2"); | ||
3520 | } | ||
3521 | |||
3522 | vmx_complete_interrupts(vmx); | 3547 | vmx_complete_interrupts(vmx); |
3523 | } | 3548 | } |
3524 | 3549 | ||
@@ -3593,7 +3618,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
3593 | if (alloc_apic_access_page(kvm) != 0) | 3618 | if (alloc_apic_access_page(kvm) != 0) |
3594 | goto free_vmcs; | 3619 | goto free_vmcs; |
3595 | 3620 | ||
3596 | if (vm_need_ept()) | 3621 | if (enable_ept) |
3597 | if (alloc_identity_pagetable(kvm) != 0) | 3622 | if (alloc_identity_pagetable(kvm) != 0) |
3598 | goto free_vmcs; | 3623 | goto free_vmcs; |
3599 | 3624 | ||
@@ -3631,9 +3656,32 @@ static int get_ept_level(void) | |||
3631 | return VMX_EPT_DEFAULT_GAW + 1; | 3656 | return VMX_EPT_DEFAULT_GAW + 1; |
3632 | } | 3657 | } |
3633 | 3658 | ||
3634 | static int vmx_get_mt_mask_shift(void) | 3659 | static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) |
3635 | { | 3660 | { |
3636 | return VMX_EPT_MT_EPTE_SHIFT; | 3661 | u64 ret; |
3662 | |||
3663 | /* For VT-d and EPT combination | ||
3664 | * 1. MMIO: always map as UC | ||
3665 | * 2. EPT with VT-d: | ||
3666 | * a. VT-d without snooping control feature: can't guarantee the | ||
3667 | * result, try to trust guest. | ||
3668 | * b. VT-d with snooping control feature: snooping control feature of | ||
3669 | * VT-d engine can guarantee the cache correctness. Just set it | ||
3670 | * to WB to keep consistent with host. So the same as item 3. | ||
3671 | * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep | ||
3672 | * consistent with host MTRR | ||
3673 | */ | ||
3674 | if (is_mmio) | ||
3675 | ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; | ||
3676 | else if (vcpu->kvm->arch.iommu_domain && | ||
3677 | !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)) | ||
3678 | ret = kvm_get_guest_memory_type(vcpu, gfn) << | ||
3679 | VMX_EPT_MT_EPTE_SHIFT; | ||
3680 | else | ||
3681 | ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | ||
3682 | | VMX_EPT_IGMT_BIT; | ||
3683 | |||
3684 | return ret; | ||
3637 | } | 3685 | } |
3638 | 3686 | ||
3639 | static struct kvm_x86_ops vmx_x86_ops = { | 3687 | static struct kvm_x86_ops vmx_x86_ops = { |
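vmx_get_mt_mask() replaces the old fixed mask-shift hook with a per-gfn decision. Stripped of the EPT bit-shifting, the policy it implements reduces to the sketch below (illustrative helper; MTRR type values UC=0 and WB=6 as in the MTRR definitions):

	#include <stdbool.h>
	#include <stdint.h>

	#define MTRR_UC	0
	#define MTRR_WB	6

	static uint8_t ept_memory_type(bool is_mmio, bool has_iommu_domain,
				       bool iommu_snoops, uint8_t guest_type)
	{
		if (is_mmio)
			return MTRR_UC;			/* MMIO is always uncacheable */
		if (has_iommu_domain && !iommu_snoops)
			return guest_type;		/* can't guarantee coherency, trust guest */
		return MTRR_WB;				/* keep consistent with host */
	}

In the write-back case the real code additionally sets the IGMT bit so the guest cannot override the host-consistent type through its own PAT.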
@@ -3644,7 +3692,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
3644 | .check_processor_compatibility = vmx_check_processor_compat, | 3692 | .check_processor_compatibility = vmx_check_processor_compat, |
3645 | .hardware_enable = hardware_enable, | 3693 | .hardware_enable = hardware_enable, |
3646 | .hardware_disable = hardware_disable, | 3694 | .hardware_disable = hardware_disable, |
3647 | .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses, | 3695 | .cpu_has_accelerated_tpr = report_flexpriority, |
3648 | 3696 | ||
3649 | .vcpu_create = vmx_create_vcpu, | 3697 | .vcpu_create = vmx_create_vcpu, |
3650 | .vcpu_free = vmx_free_vcpu, | 3698 | .vcpu_free = vmx_free_vcpu, |
@@ -3678,78 +3726,82 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
3678 | .tlb_flush = vmx_flush_tlb, | 3726 | .tlb_flush = vmx_flush_tlb, |
3679 | 3727 | ||
3680 | .run = vmx_vcpu_run, | 3728 | .run = vmx_vcpu_run, |
3681 | .handle_exit = kvm_handle_exit, | 3729 | .handle_exit = vmx_handle_exit, |
3682 | .skip_emulated_instruction = skip_emulated_instruction, | 3730 | .skip_emulated_instruction = skip_emulated_instruction, |
3731 | .set_interrupt_shadow = vmx_set_interrupt_shadow, | ||
3732 | .get_interrupt_shadow = vmx_get_interrupt_shadow, | ||
3683 | .patch_hypercall = vmx_patch_hypercall, | 3733 | .patch_hypercall = vmx_patch_hypercall, |
3684 | .get_irq = vmx_get_irq, | ||
3685 | .set_irq = vmx_inject_irq, | 3734 | .set_irq = vmx_inject_irq, |
3735 | .set_nmi = vmx_inject_nmi, | ||
3686 | .queue_exception = vmx_queue_exception, | 3736 | .queue_exception = vmx_queue_exception, |
3687 | .exception_injected = vmx_exception_injected, | 3737 | .interrupt_allowed = vmx_interrupt_allowed, |
3688 | .inject_pending_irq = vmx_intr_assist, | 3738 | .nmi_allowed = vmx_nmi_allowed, |
3689 | .inject_pending_vectors = do_interrupt_requests, | 3739 | .enable_nmi_window = enable_nmi_window, |
3740 | .enable_irq_window = enable_irq_window, | ||
3741 | .update_cr8_intercept = update_cr8_intercept, | ||
3690 | 3742 | ||
3691 | .set_tss_addr = vmx_set_tss_addr, | 3743 | .set_tss_addr = vmx_set_tss_addr, |
3692 | .get_tdp_level = get_ept_level, | 3744 | .get_tdp_level = get_ept_level, |
3693 | .get_mt_mask_shift = vmx_get_mt_mask_shift, | 3745 | .get_mt_mask = vmx_get_mt_mask, |
3694 | }; | 3746 | }; |
3695 | 3747 | ||
3696 | static int __init vmx_init(void) | 3748 | static int __init vmx_init(void) |
3697 | { | 3749 | { |
3698 | void *va; | ||
3699 | int r; | 3750 | int r; |
3700 | 3751 | ||
3701 | vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); | 3752 | vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); |
3702 | if (!vmx_io_bitmap_a) | 3753 | if (!vmx_io_bitmap_a) |
3703 | return -ENOMEM; | 3754 | return -ENOMEM; |
3704 | 3755 | ||
3705 | vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); | 3756 | vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); |
3706 | if (!vmx_io_bitmap_b) { | 3757 | if (!vmx_io_bitmap_b) { |
3707 | r = -ENOMEM; | 3758 | r = -ENOMEM; |
3708 | goto out; | 3759 | goto out; |
3709 | } | 3760 | } |
3710 | 3761 | ||
3711 | vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); | 3762 | vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); |
3712 | if (!vmx_msr_bitmap) { | 3763 | if (!vmx_msr_bitmap_legacy) { |
3713 | r = -ENOMEM; | 3764 | r = -ENOMEM; |
3714 | goto out1; | 3765 | goto out1; |
3715 | } | 3766 | } |
3716 | 3767 | ||
3768 | vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); | ||
3769 | if (!vmx_msr_bitmap_longmode) { | ||
3770 | r = -ENOMEM; | ||
3771 | goto out2; | ||
3772 | } | ||
3773 | |||
3717 | /* | 3774 | /* |
3718 | * Allow direct access to the PC debug port (it is often used for I/O | 3775 | * Allow direct access to the PC debug port (it is often used for I/O |
3719 | * delays, but the vmexits simply slow things down). | 3776 | * delays, but the vmexits simply slow things down). |
3720 | */ | 3777 | */ |
3721 | va = kmap(vmx_io_bitmap_a); | 3778 | memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); |
3722 | memset(va, 0xff, PAGE_SIZE); | 3779 | clear_bit(0x80, vmx_io_bitmap_a); |
3723 | clear_bit(0x80, va); | ||
3724 | kunmap(vmx_io_bitmap_a); | ||
3725 | 3780 | ||
3726 | va = kmap(vmx_io_bitmap_b); | 3781 | memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); |
3727 | memset(va, 0xff, PAGE_SIZE); | ||
3728 | kunmap(vmx_io_bitmap_b); | ||
3729 | 3782 | ||
3730 | va = kmap(vmx_msr_bitmap); | 3783 | memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); |
3731 | memset(va, 0xff, PAGE_SIZE); | 3784 | memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); |
3732 | kunmap(vmx_msr_bitmap); | ||
3733 | 3785 | ||
3734 | set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ | 3786 | set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ |
3735 | 3787 | ||
3736 | r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); | 3788 | r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); |
3737 | if (r) | 3789 | if (r) |
3738 | goto out2; | 3790 | goto out3; |
3739 | 3791 | ||
3740 | vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE); | 3792 | vmx_disable_intercept_for_msr(MSR_FS_BASE, false); |
3741 | vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE); | 3793 | vmx_disable_intercept_for_msr(MSR_GS_BASE, false); |
3742 | vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS); | 3794 | vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); |
3743 | vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); | 3795 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); |
3744 | vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); | 3796 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); |
3797 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); | ||
3745 | 3798 | ||
3746 | if (vm_need_ept()) { | 3799 | if (enable_ept) { |
3747 | bypass_guest_pf = 0; | 3800 | bypass_guest_pf = 0; |
3748 | kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | | 3801 | kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | |
3749 | VMX_EPT_WRITABLE_MASK); | 3802 | VMX_EPT_WRITABLE_MASK); |
3750 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, | 3803 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, |
3751 | VMX_EPT_EXECUTABLE_MASK, | 3804 | VMX_EPT_EXECUTABLE_MASK); |
3752 | VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); | ||
3753 | kvm_enable_tdp(); | 3805 | kvm_enable_tdp(); |
3754 | } else | 3806 | } else |
3755 | kvm_disable_tdp(); | 3807 | kvm_disable_tdp(); |
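With the bitmaps switched from highmem pages (alloc_page + kmap) to directly addressable allocations (__get_free_page), the setup collapses to plain memset/clear_bit on a kernel pointer. A user-space model of the same initialization, with malloc standing in for the page allocator (none of this is kernel API):

	#include <limits.h>
	#include <stdlib.h>
	#include <string.h>

	#define PAGE_SIZE 4096
	#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

	static unsigned long *alloc_io_bitmap(void)
	{
		unsigned long *map = malloc(PAGE_SIZE);

		if (!map)
			return NULL;
		memset(map, 0xff, PAGE_SIZE);			/* trap all ports by default */
		map[0x80 / BITS_PER_LONG] &= ~(1UL << (0x80 % BITS_PER_LONG));
		return map;					/* port 0x80 passed through */
	}

The MSR bitmaps are filled with ones the same way and then selectively cleared through vmx_disable_intercept_for_msr(), as shown earlier in this patch.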
@@ -3761,20 +3813,23 @@ static int __init vmx_init(void) | |||
3761 | 3813 | ||
3762 | return 0; | 3814 | return 0; |
3763 | 3815 | ||
3816 | out3: | ||
3817 | free_page((unsigned long)vmx_msr_bitmap_longmode); | ||
3764 | out2: | 3818 | out2: |
3765 | __free_page(vmx_msr_bitmap); | 3819 | free_page((unsigned long)vmx_msr_bitmap_legacy); |
3766 | out1: | 3820 | out1: |
3767 | __free_page(vmx_io_bitmap_b); | 3821 | free_page((unsigned long)vmx_io_bitmap_b); |
3768 | out: | 3822 | out: |
3769 | __free_page(vmx_io_bitmap_a); | 3823 | free_page((unsigned long)vmx_io_bitmap_a); |
3770 | return r; | 3824 | return r; |
3771 | } | 3825 | } |
3772 | 3826 | ||
3773 | static void __exit vmx_exit(void) | 3827 | static void __exit vmx_exit(void) |
3774 | { | 3828 | { |
3775 | __free_page(vmx_msr_bitmap); | 3829 | free_page((unsigned long)vmx_msr_bitmap_legacy); |
3776 | __free_page(vmx_io_bitmap_b); | 3830 | free_page((unsigned long)vmx_msr_bitmap_longmode); |
3777 | __free_page(vmx_io_bitmap_a); | 3831 | free_page((unsigned long)vmx_io_bitmap_b); |
3832 | free_page((unsigned long)vmx_io_bitmap_a); | ||
3778 | 3833 | ||
3779 | kvm_exit(); | 3834 | kvm_exit(); |
3780 | } | 3835 | } |

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3944e917e794..249540f98513 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -91,7 +91,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
91 | { "halt_wakeup", VCPU_STAT(halt_wakeup) }, | 91 | { "halt_wakeup", VCPU_STAT(halt_wakeup) }, |
92 | { "hypercalls", VCPU_STAT(hypercalls) }, | 92 | { "hypercalls", VCPU_STAT(hypercalls) }, |
93 | { "request_irq", VCPU_STAT(request_irq_exits) }, | 93 | { "request_irq", VCPU_STAT(request_irq_exits) }, |
94 | { "request_nmi", VCPU_STAT(request_nmi_exits) }, | ||
95 | { "irq_exits", VCPU_STAT(irq_exits) }, | 94 | { "irq_exits", VCPU_STAT(irq_exits) }, |
96 | { "host_state_reload", VCPU_STAT(host_state_reload) }, | 95 | { "host_state_reload", VCPU_STAT(host_state_reload) }, |
97 | { "efer_reload", VCPU_STAT(efer_reload) }, | 96 | { "efer_reload", VCPU_STAT(efer_reload) }, |
@@ -108,7 +107,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
108 | { "mmu_recycled", VM_STAT(mmu_recycled) }, | 107 | { "mmu_recycled", VM_STAT(mmu_recycled) }, |
109 | { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, | 108 | { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, |
110 | { "mmu_unsync", VM_STAT(mmu_unsync) }, | 109 | { "mmu_unsync", VM_STAT(mmu_unsync) }, |
111 | { "mmu_unsync_global", VM_STAT(mmu_unsync_global) }, | ||
112 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, | 110 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, |
113 | { "largepages", VM_STAT(lpages) }, | 111 | { "largepages", VM_STAT(lpages) }, |
114 | { NULL } | 112 | { NULL } |
@@ -234,7 +232,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
234 | goto out; | 232 | goto out; |
235 | } | 233 | } |
236 | for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { | 234 | for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { |
237 | if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { | 235 | if (is_present_pte(pdpte[i]) && |
236 | (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { | ||
238 | ret = 0; | 237 | ret = 0; |
239 | goto out; | 238 | goto out; |
240 | } | 239 | } |
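The hard-coded 0xfffffff0000001e6ull mask assumed a 36-bit physical address width; the PDPTE check now uses the reserved-bit mask the MMU precomputes from the guest's reported MAXPHYADDR. A simplified standalone version of that test, building the mask from the same bit positions the old constant encoded (bits 1-2 and 5-8 plus everything at or above MAXPHYADDR, with the NX bit treated as reserved for brevity):

	#include <stdbool.h>
	#include <stdint.h>

	static uint64_t rsvd_bits(int lo, int hi)		/* inclusive bit range */
	{
		return ((1ULL << (hi - lo + 1)) - 1) << lo;
	}

	static bool pdpte_rejected(uint64_t pdpte, int maxphyaddr)
	{
		uint64_t rsvd = rsvd_bits(1, 2) | rsvd_bits(5, 8) |
				rsvd_bits(maxphyaddr, 63);

		return (pdpte & 1) && (pdpte & rsvd);		/* present with reserved bits set */
	}

For maxphyaddr = 36 this reproduces the old constant exactly; for wider CPUs it stops rejecting perfectly valid high physical addresses.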
@@ -321,7 +320,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
321 | kvm_x86_ops->set_cr0(vcpu, cr0); | 320 | kvm_x86_ops->set_cr0(vcpu, cr0); |
322 | vcpu->arch.cr0 = cr0; | 321 | vcpu->arch.cr0 = cr0; |
323 | 322 | ||
324 | kvm_mmu_sync_global(vcpu); | ||
325 | kvm_mmu_reset_context(vcpu); | 323 | kvm_mmu_reset_context(vcpu); |
326 | return; | 324 | return; |
327 | } | 325 | } |
@@ -370,7 +368,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
370 | kvm_x86_ops->set_cr4(vcpu, cr4); | 368 | kvm_x86_ops->set_cr4(vcpu, cr4); |
371 | vcpu->arch.cr4 = cr4; | 369 | vcpu->arch.cr4 = cr4; |
372 | vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; | 370 | vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; |
373 | kvm_mmu_sync_global(vcpu); | ||
374 | kvm_mmu_reset_context(vcpu); | 371 | kvm_mmu_reset_context(vcpu); |
375 | } | 372 | } |
376 | EXPORT_SYMBOL_GPL(kvm_set_cr4); | 373 | EXPORT_SYMBOL_GPL(kvm_set_cr4); |
@@ -523,6 +520,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) | |||
523 | efer |= vcpu->arch.shadow_efer & EFER_LMA; | 520 | efer |= vcpu->arch.shadow_efer & EFER_LMA; |
524 | 521 | ||
525 | vcpu->arch.shadow_efer = efer; | 522 | vcpu->arch.shadow_efer = efer; |
523 | |||
524 | vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; | ||
525 | kvm_mmu_reset_context(vcpu); | ||
526 | } | 526 | } |
527 | 527 | ||
528 | void kvm_enable_efer_bits(u64 mask) | 528 | void kvm_enable_efer_bits(u64 mask) |
@@ -630,14 +630,17 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) | |||
630 | unsigned long flags; | 630 | unsigned long flags; |
631 | struct kvm_vcpu_arch *vcpu = &v->arch; | 631 | struct kvm_vcpu_arch *vcpu = &v->arch; |
632 | void *shared_kaddr; | 632 | void *shared_kaddr; |
633 | unsigned long this_tsc_khz; | ||
633 | 634 | ||
634 | if ((!vcpu->time_page)) | 635 | if ((!vcpu->time_page)) |
635 | return; | 636 | return; |
636 | 637 | ||
637 | if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) { | 638 | this_tsc_khz = get_cpu_var(cpu_tsc_khz); |
638 | kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock); | 639 | if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { |
639 | vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz); | 640 | kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); |
641 | vcpu->hv_clock_tsc_khz = this_tsc_khz; | ||
640 | } | 642 | } |
643 | put_cpu_var(cpu_tsc_khz); | ||
641 | 644 | ||
642 | /* Keep irq disabled to prevent changes to the clock */ | 645 | /* Keep irq disabled to prevent changes to the clock */ |
643 | local_irq_save(flags); | 646 | local_irq_save(flags); |
@@ -893,6 +896,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
893 | case MSR_IA32_LASTINTFROMIP: | 896 | case MSR_IA32_LASTINTFROMIP: |
894 | case MSR_IA32_LASTINTTOIP: | 897 | case MSR_IA32_LASTINTTOIP: |
895 | case MSR_VM_HSAVE_PA: | 898 | case MSR_VM_HSAVE_PA: |
899 | case MSR_P6_EVNTSEL0: | ||
900 | case MSR_P6_EVNTSEL1: | ||
896 | data = 0; | 901 | data = 0; |
897 | break; | 902 | break; |
898 | case MSR_MTRRcap: | 903 | case MSR_MTRRcap: |
@@ -1024,6 +1029,7 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
1024 | case KVM_CAP_SYNC_MMU: | 1029 | case KVM_CAP_SYNC_MMU: |
1025 | case KVM_CAP_REINJECT_CONTROL: | 1030 | case KVM_CAP_REINJECT_CONTROL: |
1026 | case KVM_CAP_IRQ_INJECT_STATUS: | 1031 | case KVM_CAP_IRQ_INJECT_STATUS: |
1032 | case KVM_CAP_ASSIGN_DEV_IRQ: | ||
1027 | r = 1; | 1033 | r = 1; |
1028 | break; | 1034 | break; |
1029 | case KVM_CAP_COALESCED_MMIO: | 1035 | case KVM_CAP_COALESCED_MMIO: |
@@ -1241,41 +1247,53 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
1241 | entry->flags = 0; | 1247 | entry->flags = 0; |
1242 | } | 1248 | } |
1243 | 1249 | ||
1250 | #define F(x) bit(X86_FEATURE_##x) | ||
1251 | |||
1244 | static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | 1252 | static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, |
1245 | u32 index, int *nent, int maxnent) | 1253 | u32 index, int *nent, int maxnent) |
1246 | { | 1254 | { |
1247 | const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | | 1255 | unsigned f_nx = is_efer_nx() ? F(NX) : 0; |
1248 | bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | | ||
1249 | bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | | ||
1250 | bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | | ||
1251 | bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | | ||
1252 | bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | | ||
1253 | bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | | ||
1254 | bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | | ||
1255 | bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | | ||
1256 | bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); | ||
1257 | const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | | ||
1258 | bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | | ||
1259 | bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | | ||
1260 | bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | | ||
1261 | bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | | ||
1262 | bit(X86_FEATURE_PGE) | | ||
1263 | bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | | ||
1264 | bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | | ||
1265 | bit(X86_FEATURE_SYSCALL) | | ||
1266 | (is_efer_nx() ? bit(X86_FEATURE_NX) : 0) | | ||
1267 | #ifdef CONFIG_X86_64 | 1256 | #ifdef CONFIG_X86_64 |
1268 | bit(X86_FEATURE_LM) | | 1257 | unsigned f_lm = F(LM); |
1258 | #else | ||
1259 | unsigned f_lm = 0; | ||
1269 | #endif | 1260 | #endif |
1270 | bit(X86_FEATURE_FXSR_OPT) | | 1261 | |
1271 | bit(X86_FEATURE_MMXEXT) | | 1262 | /* cpuid 1.edx */ |
1272 | bit(X86_FEATURE_3DNOWEXT) | | 1263 | const u32 kvm_supported_word0_x86_features = |
1273 | bit(X86_FEATURE_3DNOW); | 1264 | F(FPU) | F(VME) | F(DE) | F(PSE) | |
1274 | const u32 kvm_supported_word3_x86_features = | 1265 | F(TSC) | F(MSR) | F(PAE) | F(MCE) | |
1275 | bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); | 1266 | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | |
1267 | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | | ||
1268 | F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | | ||
1269 | 0 /* Reserved, DS, ACPI */ | F(MMX) | | ||
1270 | F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | | ||
1271 | 0 /* HTT, TM, Reserved, PBE */; | ||
1272 | /* cpuid 0x80000001.edx */ | ||
1273 | const u32 kvm_supported_word1_x86_features = | ||
1274 | F(FPU) | F(VME) | F(DE) | F(PSE) | | ||
1275 | F(TSC) | F(MSR) | F(PAE) | F(MCE) | | ||
1276 | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | | ||
1277 | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | | ||
1278 | F(PAT) | F(PSE36) | 0 /* Reserved */ | | ||
1279 | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | | ||
1280 | F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ | | ||
1281 | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); | ||
1282 | /* cpuid 1.ecx */ | ||
1283 | const u32 kvm_supported_word4_x86_features = | ||
1284 | F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | | ||
1285 | 0 /* DS-CPL, VMX, SMX, EST */ | | ||
1286 | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | | ||
1287 | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | | ||
1288 | 0 /* Reserved, DCA */ | F(XMM4_1) | | ||
1289 | F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) | | ||
1290 | 0 /* Reserved, XSAVE, OSXSAVE */; | ||
1291 | /* cpuid 0x80000001.ecx */ | ||
1276 | const u32 kvm_supported_word6_x86_features = | 1292 | const u32 kvm_supported_word6_x86_features = |
1277 | bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) | | 1293 | F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | |
1278 | bit(X86_FEATURE_SVM); | 1294 | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | |
1295 | F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | | ||
1296 | 0 /* SKINIT */ | 0 /* WDT */; | ||
1279 | 1297 | ||
1280 | /* all calls to cpuid_count() should be made on the same cpu */ | 1298 | /* all calls to cpuid_count() should be made on the same cpu */ |
1281 | get_cpu(); | 1299 | get_cpu(); |
@@ -1288,7 +1306,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
1288 | break; | 1306 | break; |
1289 | case 1: | 1307 | case 1: |
1290 | entry->edx &= kvm_supported_word0_x86_features; | 1308 | entry->edx &= kvm_supported_word0_x86_features; |
1291 | entry->ecx &= kvm_supported_word3_x86_features; | 1309 | entry->ecx &= kvm_supported_word4_x86_features; |
1292 | break; | 1310 | break; |
1293 | /* function 2 entries are STATEFUL. That is, repeated cpuid commands | 1311 | /* function 2 entries are STATEFUL. That is, repeated cpuid commands |
1294 | * may return different values. This forces us to get_cpu() before | 1312 | * may return different values. This forces us to get_cpu() before |
@@ -1350,6 +1368,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
1350 | put_cpu(); | 1368 | put_cpu(); |
1351 | } | 1369 | } |
1352 | 1370 | ||
1371 | #undef F | ||
1372 | |||
1353 | static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, | 1373 | static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, |
1354 | struct kvm_cpuid_entry2 __user *entries) | 1374 | struct kvm_cpuid_entry2 __user *entries) |
1355 | { | 1375 | { |
@@ -1421,8 +1441,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, | |||
1421 | return -ENXIO; | 1441 | return -ENXIO; |
1422 | vcpu_load(vcpu); | 1442 | vcpu_load(vcpu); |
1423 | 1443 | ||
1424 | set_bit(irq->irq, vcpu->arch.irq_pending); | 1444 | kvm_queue_interrupt(vcpu, irq->irq, false); |
1425 | set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); | ||
1426 | 1445 | ||
1427 | vcpu_put(vcpu); | 1446 | vcpu_put(vcpu); |
1428 | 1447 | ||
@@ -1584,8 +1603,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
1584 | r = -EINVAL; | 1603 | r = -EINVAL; |
1585 | } | 1604 | } |
1586 | out: | 1605 | out: |
1587 | if (lapic) | 1606 | kfree(lapic); |
1588 | kfree(lapic); | ||
1589 | return r; | 1607 | return r; |
1590 | } | 1608 | } |
1591 | 1609 | ||
@@ -1606,10 +1624,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, | |||
1606 | return -EINVAL; | 1624 | return -EINVAL; |
1607 | 1625 | ||
1608 | down_write(&kvm->slots_lock); | 1626 | down_write(&kvm->slots_lock); |
1627 | spin_lock(&kvm->mmu_lock); | ||
1609 | 1628 | ||
1610 | kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); | 1629 | kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); |
1611 | kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; | 1630 | kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; |
1612 | 1631 | ||
1632 | spin_unlock(&kvm->mmu_lock); | ||
1613 | up_write(&kvm->slots_lock); | 1633 | up_write(&kvm->slots_lock); |
1614 | return 0; | 1634 | return 0; |
1615 | } | 1635 | } |
@@ -1785,7 +1805,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | |||
1785 | 1805 | ||
1786 | /* If nothing is dirty, don't bother messing with page tables. */ | 1806 | /* If nothing is dirty, don't bother messing with page tables. */ |
1787 | if (is_dirty) { | 1807 | if (is_dirty) { |
1808 | spin_lock(&kvm->mmu_lock); | ||
1788 | kvm_mmu_slot_remove_write_access(kvm, log->slot); | 1809 | kvm_mmu_slot_remove_write_access(kvm, log->slot); |
1810 | spin_unlock(&kvm->mmu_lock); | ||
1789 | kvm_flush_remote_tlbs(kvm); | 1811 | kvm_flush_remote_tlbs(kvm); |
1790 | memslot = &kvm->memslots[log->slot]; | 1812 | memslot = &kvm->memslots[log->slot]; |
1791 | n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; | 1813 | n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; |
@@ -2360,7 +2382,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
2360 | u16 error_code, | 2382 | u16 error_code, |
2361 | int emulation_type) | 2383 | int emulation_type) |
2362 | { | 2384 | { |
2363 | int r; | 2385 | int r, shadow_mask; |
2364 | struct decode_cache *c; | 2386 | struct decode_cache *c; |
2365 | 2387 | ||
2366 | kvm_clear_exception_queue(vcpu); | 2388 | kvm_clear_exception_queue(vcpu); |
@@ -2408,7 +2430,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
2408 | } | 2430 | } |
2409 | } | 2431 | } |
2410 | 2432 | ||
2433 | if (emulation_type & EMULTYPE_SKIP) { | ||
2434 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); | ||
2435 | return EMULATE_DONE; | ||
2436 | } | ||
2437 | |||
2411 | r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); | 2438 | r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); |
2439 | shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; | ||
2440 | |||
2441 | if (r == 0) | ||
2442 | kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); | ||
2412 | 2443 | ||
2413 | if (vcpu->arch.pio.string) | 2444 | if (vcpu->arch.pio.string) |
2414 | return EMULATE_DO_MMIO; | 2445 | return EMULATE_DO_MMIO; |
@@ -2761,7 +2792,7 @@ int kvm_arch_init(void *opaque) | |||
2761 | kvm_mmu_set_nonpresent_ptes(0ull, 0ull); | 2792 | kvm_mmu_set_nonpresent_ptes(0ull, 0ull); |
2762 | kvm_mmu_set_base_ptes(PT_PRESENT_MASK); | 2793 | kvm_mmu_set_base_ptes(PT_PRESENT_MASK); |
2763 | kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, | 2794 | kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, |
2764 | PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); | 2795 | PT_DIRTY_MASK, PT64_NX_MASK, 0); |
2765 | 2796 | ||
2766 | for_each_possible_cpu(cpu) | 2797 | for_each_possible_cpu(cpu) |
2767 | per_cpu(cpu_tsc_khz, cpu) = tsc_khz; | 2798 | per_cpu(cpu_tsc_khz, cpu) = tsc_khz; |
@@ -3012,6 +3043,16 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, | |||
3012 | return best; | 3043 | return best; |
3013 | } | 3044 | } |
3014 | 3045 | ||
3046 | int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) | ||
3047 | { | ||
3048 | struct kvm_cpuid_entry2 *best; | ||
3049 | |||
3050 | best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); | ||
3051 | if (best) | ||
3052 | return best->eax & 0xff; | ||
3053 | return 36; | ||
3054 | } | ||
3055 | |||
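[Note: cpuid_maxphyaddr() above reports the guest's physical-address width from CPUID leaf 0x80000008 (EAX bits 7:0), falling back to the classic 36 bits when the leaf is absent. A typical consumer, sketched here as an assumption rather than code from this patch, is building the reserved-bit mask for guest page-table entries:

	/* Illustrative only: physical-address bits at or above
	 * MAXPHYADDR (up to bit 51) are reserved in guest PTEs. */
	static u64 guest_rsvd_pa_bits(struct kvm_vcpu *vcpu)
	{
		int width = cpuid_maxphyaddr(vcpu);

		if (width >= 52)
			return 0;
		return ((1ULL << 52) - 1) & ~((1ULL << width) - 1);
	}
]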
3015 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | 3056 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) |
3016 | { | 3057 | { |
3017 | u32 function, index; | 3058 | u32 function, index; |
@@ -3048,10 +3089,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | |||
3048 | static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, | 3089 | static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, |
3049 | struct kvm_run *kvm_run) | 3090 | struct kvm_run *kvm_run) |
3050 | { | 3091 | { |
3051 | return (!vcpu->arch.irq_summary && | 3092 | return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && |
3052 | kvm_run->request_interrupt_window && | 3093 | kvm_run->request_interrupt_window && |
3053 | vcpu->arch.interrupt_window_open && | 3094 | kvm_arch_interrupt_allowed(vcpu)); |
3054 | (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); | ||
3055 | } | 3095 | } |
3056 | 3096 | ||
3057 | static void post_kvm_run_save(struct kvm_vcpu *vcpu, | 3097 | static void post_kvm_run_save(struct kvm_vcpu *vcpu, |
@@ -3064,8 +3104,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu, | |||
3064 | kvm_run->ready_for_interrupt_injection = 1; | 3104 | kvm_run->ready_for_interrupt_injection = 1; |
3065 | else | 3105 | else |
3066 | kvm_run->ready_for_interrupt_injection = | 3106 | kvm_run->ready_for_interrupt_injection = |
3067 | (vcpu->arch.interrupt_window_open && | 3107 | kvm_arch_interrupt_allowed(vcpu) && |
3068 | vcpu->arch.irq_summary == 0); | 3108 | !kvm_cpu_has_interrupt(vcpu) && |
3109 | !kvm_event_needs_reinjection(vcpu); | ||
3069 | } | 3110 | } |
3070 | 3111 | ||
3071 | static void vapic_enter(struct kvm_vcpu *vcpu) | 3112 | static void vapic_enter(struct kvm_vcpu *vcpu) |
@@ -3094,9 +3135,63 @@ static void vapic_exit(struct kvm_vcpu *vcpu) | |||
3094 | up_read(&vcpu->kvm->slots_lock); | 3135 | up_read(&vcpu->kvm->slots_lock); |
3095 | } | 3136 | } |
3096 | 3137 | ||
3138 | static void update_cr8_intercept(struct kvm_vcpu *vcpu) | ||
3139 | { | ||
3140 | int max_irr, tpr; | ||
3141 | |||
3142 | if (!kvm_x86_ops->update_cr8_intercept) | ||
3143 | return; | ||
3144 | |||
3145 | if (!vcpu->arch.apic->vapic_addr) | ||
3146 | max_irr = kvm_lapic_find_highest_irr(vcpu); | ||
3147 | else | ||
3148 | max_irr = -1; | ||
3149 | |||
3150 | if (max_irr != -1) | ||
3151 | max_irr >>= 4; | ||
3152 | |||
3153 | tpr = kvm_lapic_get_cr8(vcpu); | ||
3154 | |||
3155 | kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); | ||
3156 | } | ||
3157 | |||
3158 | static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
3159 | { | ||
3160 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) | ||
3161 | kvm_x86_ops->set_interrupt_shadow(vcpu, 0); | ||
3162 | |||
3163 | /* try to reinject previous events if any */ | ||
3164 | if (vcpu->arch.nmi_injected) { | ||
3165 | kvm_x86_ops->set_nmi(vcpu); | ||
3166 | return; | ||
3167 | } | ||
3168 | |||
3169 | if (vcpu->arch.interrupt.pending) { | ||
3170 | kvm_x86_ops->set_irq(vcpu); | ||
3171 | return; | ||
3172 | } | ||
3173 | |||
3174 | /* try to inject new event if pending */ | ||
3175 | if (vcpu->arch.nmi_pending) { | ||
3176 | if (kvm_x86_ops->nmi_allowed(vcpu)) { | ||
3177 | vcpu->arch.nmi_pending = false; | ||
3178 | vcpu->arch.nmi_injected = true; | ||
3179 | kvm_x86_ops->set_nmi(vcpu); | ||
3180 | } | ||
3181 | } else if (kvm_cpu_has_interrupt(vcpu)) { | ||
3182 | if (kvm_x86_ops->interrupt_allowed(vcpu)) { | ||
3183 | kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), | ||
3184 | false); | ||
3185 | kvm_x86_ops->set_irq(vcpu); | ||
3186 | } | ||
3187 | } | ||
3188 | } | ||
3189 | |||
3097 | static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 3190 | static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
3098 | { | 3191 | { |
3099 | int r; | 3192 | int r; |
3193 | bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && | ||
3194 | kvm_run->request_interrupt_window; | ||
3100 | 3195 | ||
3101 | if (vcpu->requests) | 3196 | if (vcpu->requests) |
3102 | if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) | 3197 | if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) |
@@ -3128,9 +3223,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3128 | } | 3223 | } |
3129 | } | 3224 | } |
3130 | 3225 | ||
3131 | clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); | ||
3132 | kvm_inject_pending_timer_irqs(vcpu); | ||
3133 | |||
3134 | preempt_disable(); | 3226 | preempt_disable(); |
3135 | 3227 | ||
3136 | kvm_x86_ops->prepare_guest_switch(vcpu); | 3228 | kvm_x86_ops->prepare_guest_switch(vcpu); |
@@ -3138,6 +3230,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3138 | 3230 | ||
3139 | local_irq_disable(); | 3231 | local_irq_disable(); |
3140 | 3232 | ||
3233 | clear_bit(KVM_REQ_KICK, &vcpu->requests); | ||
3234 | smp_mb__after_clear_bit(); | ||
3235 | |||
3141 | if (vcpu->requests || need_resched() || signal_pending(current)) { | 3236 | if (vcpu->requests || need_resched() || signal_pending(current)) { |
3142 | local_irq_enable(); | 3237 | local_irq_enable(); |
3143 | preempt_enable(); | 3238 | preempt_enable(); |
@@ -3145,21 +3240,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3145 | goto out; | 3240 | goto out; |
3146 | } | 3241 | } |
3147 | 3242 | ||
3148 | vcpu->guest_mode = 1; | ||
3149 | /* | ||
3150 | * Make sure that guest_mode assignment won't happen after | ||
3151 | * testing the pending IRQ vector bitmap. | ||
3152 | */ | ||
3153 | smp_wmb(); | ||
3154 | |||
3155 | if (vcpu->arch.exception.pending) | 3243 | if (vcpu->arch.exception.pending) |
3156 | __queue_exception(vcpu); | 3244 | __queue_exception(vcpu); |
3157 | else if (irqchip_in_kernel(vcpu->kvm)) | ||
3158 | kvm_x86_ops->inject_pending_irq(vcpu); | ||
3159 | else | 3245 | else |
3160 | kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); | 3246 | inject_pending_irq(vcpu, kvm_run); |
3161 | 3247 | ||
3162 | kvm_lapic_sync_to_vapic(vcpu); | 3248 | /* enable NMI/IRQ window open exits if needed */ |
3249 | if (vcpu->arch.nmi_pending) | ||
3250 | kvm_x86_ops->enable_nmi_window(vcpu); | ||
3251 | else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) | ||
3252 | kvm_x86_ops->enable_irq_window(vcpu); | ||
3253 | |||
3254 | if (kvm_lapic_enabled(vcpu)) { | ||
3255 | update_cr8_intercept(vcpu); | ||
3256 | kvm_lapic_sync_to_vapic(vcpu); | ||
3257 | } | ||
3163 | 3258 | ||
3164 | up_read(&vcpu->kvm->slots_lock); | 3259 | up_read(&vcpu->kvm->slots_lock); |
3165 | 3260 | ||
@@ -3193,7 +3288,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3193 | set_debugreg(vcpu->arch.host_dr6, 6); | 3288 | set_debugreg(vcpu->arch.host_dr6, 6); |
3194 | set_debugreg(vcpu->arch.host_dr7, 7); | 3289 | set_debugreg(vcpu->arch.host_dr7, 7); |
3195 | 3290 | ||
3196 | vcpu->guest_mode = 0; | 3291 | set_bit(KVM_REQ_KICK, &vcpu->requests); |
3197 | local_irq_enable(); | 3292 | local_irq_enable(); |
3198 | 3293 | ||
3199 | ++vcpu->stat.exits; | 3294 | ++vcpu->stat.exits; |
@@ -3220,8 +3315,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3220 | profile_hit(KVM_PROFILING, (void *)rip); | 3315 | profile_hit(KVM_PROFILING, (void *)rip); |
3221 | } | 3316 | } |
3222 | 3317 | ||
3223 | if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) | ||
3224 | vcpu->arch.exception.pending = false; | ||
3225 | 3318 | ||
3226 | kvm_lapic_sync_from_vapic(vcpu); | 3319 | kvm_lapic_sync_from_vapic(vcpu); |
3227 | 3320 | ||
@@ -3230,6 +3323,7 @@ out: | |||
3230 | return r; | 3323 | return r; |
3231 | } | 3324 | } |
3232 | 3325 | ||
3326 | |||
3233 | static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 3327 | static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
3234 | { | 3328 | { |
3235 | int r; | 3329 | int r; |
@@ -3256,29 +3350,42 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3256 | kvm_vcpu_block(vcpu); | 3350 | kvm_vcpu_block(vcpu); |
3257 | down_read(&vcpu->kvm->slots_lock); | 3351 | down_read(&vcpu->kvm->slots_lock); |
3258 | if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) | 3352 | if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) |
3259 | if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) | 3353 | { |
3354 | switch(vcpu->arch.mp_state) { | ||
3355 | case KVM_MP_STATE_HALTED: | ||
3260 | vcpu->arch.mp_state = | 3356 | vcpu->arch.mp_state = |
3261 | KVM_MP_STATE_RUNNABLE; | 3357 | KVM_MP_STATE_RUNNABLE; |
3262 | if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) | 3358 | case KVM_MP_STATE_RUNNABLE: |
3263 | r = -EINTR; | 3359 | break; |
3360 | case KVM_MP_STATE_SIPI_RECEIVED: | ||
3361 | default: | ||
3362 | r = -EINTR; | ||
3363 | break; | ||
3364 | } | ||
3365 | } | ||
3264 | } | 3366 | } |
3265 | 3367 | ||
3266 | if (r > 0) { | 3368 | if (r <= 0) |
3267 | if (dm_request_for_irq_injection(vcpu, kvm_run)) { | 3369 | break; |
3268 | r = -EINTR; | 3370 | |
3269 | kvm_run->exit_reason = KVM_EXIT_INTR; | 3371 | clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); |
3270 | ++vcpu->stat.request_irq_exits; | 3372 | if (kvm_cpu_has_pending_timer(vcpu)) |
3271 | } | 3373 | kvm_inject_pending_timer_irqs(vcpu); |
3272 | if (signal_pending(current)) { | 3374 | |
3273 | r = -EINTR; | 3375 | if (dm_request_for_irq_injection(vcpu, kvm_run)) { |
3274 | kvm_run->exit_reason = KVM_EXIT_INTR; | 3376 | r = -EINTR; |
3275 | ++vcpu->stat.signal_exits; | 3377 | kvm_run->exit_reason = KVM_EXIT_INTR; |
3276 | } | 3378 | ++vcpu->stat.request_irq_exits; |
3277 | if (need_resched()) { | 3379 | } |
3278 | up_read(&vcpu->kvm->slots_lock); | 3380 | if (signal_pending(current)) { |
3279 | kvm_resched(vcpu); | 3381 | r = -EINTR; |
3280 | down_read(&vcpu->kvm->slots_lock); | 3382 | kvm_run->exit_reason = KVM_EXIT_INTR; |
3281 | } | 3383 | ++vcpu->stat.signal_exits; |
3384 | } | ||
3385 | if (need_resched()) { | ||
3386 | up_read(&vcpu->kvm->slots_lock); | ||
3387 | kvm_resched(vcpu); | ||
3388 | down_read(&vcpu->kvm->slots_lock); | ||
3282 | } | 3389 | } |
3283 | } | 3390 | } |
3284 | 3391 | ||
@@ -3442,7 +3549,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | |||
3442 | struct kvm_sregs *sregs) | 3549 | struct kvm_sregs *sregs) |
3443 | { | 3550 | { |
3444 | struct descriptor_table dt; | 3551 | struct descriptor_table dt; |
3445 | int pending_vec; | ||
3446 | 3552 | ||
3447 | vcpu_load(vcpu); | 3553 | vcpu_load(vcpu); |
3448 | 3554 | ||
@@ -3472,16 +3578,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | |||
3472 | sregs->efer = vcpu->arch.shadow_efer; | 3578 | sregs->efer = vcpu->arch.shadow_efer; |
3473 | sregs->apic_base = kvm_get_apic_base(vcpu); | 3579 | sregs->apic_base = kvm_get_apic_base(vcpu); |
3474 | 3580 | ||
3475 | if (irqchip_in_kernel(vcpu->kvm)) { | 3581 | memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); |
3476 | memset(sregs->interrupt_bitmap, 0, | 3582 | |
3477 | sizeof sregs->interrupt_bitmap); | 3583 | if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) |
3478 | pending_vec = kvm_x86_ops->get_irq(vcpu); | 3584 | set_bit(vcpu->arch.interrupt.nr, |
3479 | if (pending_vec >= 0) | 3585 | (unsigned long *)sregs->interrupt_bitmap); |
3480 | set_bit(pending_vec, | ||
3481 | (unsigned long *)sregs->interrupt_bitmap); | ||
3482 | } else | ||
3483 | memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, | ||
3484 | sizeof sregs->interrupt_bitmap); | ||
3485 | 3586 | ||
3486 | vcpu_put(vcpu); | 3587 | vcpu_put(vcpu); |
3487 | 3588 | ||
@@ -3688,7 +3789,6 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, | |||
3688 | tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); | 3789 | tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); |
3689 | tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); | 3790 | tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); |
3690 | tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); | 3791 | tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); |
3691 | tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); | ||
3692 | } | 3792 | } |
3693 | 3793 | ||
3694 | static int load_state_from_tss32(struct kvm_vcpu *vcpu, | 3794 | static int load_state_from_tss32(struct kvm_vcpu *vcpu, |
@@ -3785,8 +3885,8 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu, | |||
3785 | } | 3885 | } |
3786 | 3886 | ||
3787 | static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, | 3887 | static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, |
3788 | u32 old_tss_base, | 3888 | u16 old_tss_sel, u32 old_tss_base, |
3789 | struct desc_struct *nseg_desc) | 3889 | struct desc_struct *nseg_desc) |
3790 | { | 3890 | { |
3791 | struct tss_segment_16 tss_segment_16; | 3891 | struct tss_segment_16 tss_segment_16; |
3792 | int ret = 0; | 3892 | int ret = 0; |
@@ -3805,6 +3905,16 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, | |||
3805 | &tss_segment_16, sizeof tss_segment_16)) | 3905 | &tss_segment_16, sizeof tss_segment_16)) |
3806 | goto out; | 3906 | goto out; |
3807 | 3907 | ||
3908 | if (old_tss_sel != 0xffff) { | ||
3909 | tss_segment_16.prev_task_link = old_tss_sel; | ||
3910 | |||
3911 | if (kvm_write_guest(vcpu->kvm, | ||
3912 | get_tss_base_addr(vcpu, nseg_desc), | ||
3913 | &tss_segment_16.prev_task_link, | ||
3914 | sizeof tss_segment_16.prev_task_link)) | ||
3915 | goto out; | ||
3916 | } | ||
3917 | |||
3808 | if (load_state_from_tss16(vcpu, &tss_segment_16)) | 3918 | if (load_state_from_tss16(vcpu, &tss_segment_16)) |
3809 | goto out; | 3919 | goto out; |
3810 | 3920 | ||
@@ -3814,7 +3924,7 @@ out: | |||
3814 | } | 3924 | } |
3815 | 3925 | ||
3816 | static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, | 3926 | static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, |
3817 | u32 old_tss_base, | 3927 | u16 old_tss_sel, u32 old_tss_base, |
3818 | struct desc_struct *nseg_desc) | 3928 | struct desc_struct *nseg_desc) |
3819 | { | 3929 | { |
3820 | struct tss_segment_32 tss_segment_32; | 3930 | struct tss_segment_32 tss_segment_32; |
@@ -3834,6 +3944,16 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, | |||
3834 | &tss_segment_32, sizeof tss_segment_32)) | 3944 | &tss_segment_32, sizeof tss_segment_32)) |
3835 | goto out; | 3945 | goto out; |
3836 | 3946 | ||
3947 | if (old_tss_sel != 0xffff) { | ||
3948 | tss_segment_32.prev_task_link = old_tss_sel; | ||
3949 | |||
3950 | if (kvm_write_guest(vcpu->kvm, | ||
3951 | get_tss_base_addr(vcpu, nseg_desc), | ||
3952 | &tss_segment_32.prev_task_link, | ||
3953 | sizeof tss_segment_32.prev_task_link)) | ||
3954 | goto out; | ||
3955 | } | ||
3956 | |||
3837 | if (load_state_from_tss32(vcpu, &tss_segment_32)) | 3957 | if (load_state_from_tss32(vcpu, &tss_segment_32)) |
3838 | goto out; | 3958 | goto out; |
3839 | 3959 | ||
@@ -3887,14 +4007,22 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) | |||
3887 | kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); | 4007 | kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); |
3888 | } | 4008 | } |
3889 | 4009 | ||
3890 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 4010 | /* set back link to prev task only if NT bit is set in eflags |
4011 | note that old_tss_sel is not used afetr this point */ | ||
4012 | if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) | ||
4013 | old_tss_sel = 0xffff; | ||
4014 | |||
4015 | /* set back link to prev task only if NT bit is set in eflags | ||
4016 | note that old_tss_sel is not used afetr this point */ | ||
4017 | if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) | ||
4018 | old_tss_sel = 0xffff; | ||
3891 | 4019 | ||
3892 | if (nseg_desc.type & 8) | 4020 | if (nseg_desc.type & 8) |
3893 | ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, | 4021 | ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, |
3894 | &nseg_desc); | 4022 | old_tss_base, &nseg_desc); |
3895 | else | 4023 | else |
3896 | ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base, | 4024 | ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, |
3897 | &nseg_desc); | 4025 | old_tss_base, &nseg_desc); |
3898 | 4026 | ||
3899 | if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { | 4027 | if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { |
3900 | u32 eflags = kvm_x86_ops->get_rflags(vcpu); | 4028 | u32 eflags = kvm_x86_ops->get_rflags(vcpu); |
@@ -3920,7 +4048,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
3920 | struct kvm_sregs *sregs) | 4048 | struct kvm_sregs *sregs) |
3921 | { | 4049 | { |
3922 | int mmu_reset_needed = 0; | 4050 | int mmu_reset_needed = 0; |
3923 | int i, pending_vec, max_bits; | 4051 | int pending_vec, max_bits; |
3924 | struct descriptor_table dt; | 4052 | struct descriptor_table dt; |
3925 | 4053 | ||
3926 | vcpu_load(vcpu); | 4054 | vcpu_load(vcpu); |
@@ -3934,7 +4062,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
3934 | 4062 | ||
3935 | vcpu->arch.cr2 = sregs->cr2; | 4063 | vcpu->arch.cr2 = sregs->cr2; |
3936 | mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; | 4064 | mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; |
3937 | vcpu->arch.cr3 = sregs->cr3; | 4065 | |
4066 | down_read(&vcpu->kvm->slots_lock); | ||
4067 | if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT)) | ||
4068 | vcpu->arch.cr3 = sregs->cr3; | ||
4069 | else | ||
4070 | set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); | ||
4071 | up_read(&vcpu->kvm->slots_lock); | ||
3938 | 4072 | ||
3939 | kvm_set_cr8(vcpu, sregs->cr8); | 4073 | kvm_set_cr8(vcpu, sregs->cr8); |
3940 | 4074 | ||
@@ -3956,25 +4090,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
3956 | if (mmu_reset_needed) | 4090 | if (mmu_reset_needed) |
3957 | kvm_mmu_reset_context(vcpu); | 4091 | kvm_mmu_reset_context(vcpu); |
3958 | 4092 | ||
3959 | if (!irqchip_in_kernel(vcpu->kvm)) { | 4093 | max_bits = (sizeof sregs->interrupt_bitmap) << 3; |
3960 | memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, | 4094 | pending_vec = find_first_bit( |
3961 | sizeof vcpu->arch.irq_pending); | 4095 | (const unsigned long *)sregs->interrupt_bitmap, max_bits); |
3962 | vcpu->arch.irq_summary = 0; | 4096 | if (pending_vec < max_bits) { |
3963 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) | 4097 | kvm_queue_interrupt(vcpu, pending_vec, false); |
3964 | if (vcpu->arch.irq_pending[i]) | 4098 | pr_debug("Set back pending irq %d\n", pending_vec); |
3965 | __set_bit(i, &vcpu->arch.irq_summary); | 4099 | if (irqchip_in_kernel(vcpu->kvm)) |
3966 | } else { | 4100 | kvm_pic_clear_isr_ack(vcpu->kvm); |
3967 | max_bits = (sizeof sregs->interrupt_bitmap) << 3; | ||
3968 | pending_vec = find_first_bit( | ||
3969 | (const unsigned long *)sregs->interrupt_bitmap, | ||
3970 | max_bits); | ||
3971 | /* Only pending external irq is handled here */ | ||
3972 | if (pending_vec < max_bits) { | ||
3973 | kvm_x86_ops->set_irq(vcpu, pending_vec); | ||
3974 | pr_debug("Set back pending irq %d\n", | ||
3975 | pending_vec); | ||
3976 | } | ||
3977 | kvm_pic_clear_isr_ack(vcpu->kvm); | ||
3978 | } | 4101 | } |
3979 | 4102 | ||
3980 | kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); | 4103 | kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); |
@@ -4308,7 +4431,6 @@ struct kvm *kvm_arch_create_vm(void) | |||
4308 | return ERR_PTR(-ENOMEM); | 4431 | return ERR_PTR(-ENOMEM); |
4309 | 4432 | ||
4310 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); | 4433 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); |
4311 | INIT_LIST_HEAD(&kvm->arch.oos_global_pages); | ||
4312 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); | 4434 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); |
4313 | 4435 | ||
4314 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ | 4436 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ |
@@ -4411,12 +4533,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm, | |||
4411 | } | 4533 | } |
4412 | } | 4534 | } |
4413 | 4535 | ||
4536 | spin_lock(&kvm->mmu_lock); | ||
4414 | if (!kvm->arch.n_requested_mmu_pages) { | 4537 | if (!kvm->arch.n_requested_mmu_pages) { |
4415 | unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); | 4538 | unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); |
4416 | kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); | 4539 | kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); |
4417 | } | 4540 | } |
4418 | 4541 | ||
4419 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); | 4542 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); |
4543 | spin_unlock(&kvm->mmu_lock); | ||
4420 | kvm_flush_remote_tlbs(kvm); | 4544 | kvm_flush_remote_tlbs(kvm); |
4421 | 4545 | ||
4422 | return 0; | 4546 | return 0; |
@@ -4425,6 +4549,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, | |||
4425 | void kvm_arch_flush_shadow(struct kvm *kvm) | 4549 | void kvm_arch_flush_shadow(struct kvm *kvm) |
4426 | { | 4550 | { |
4427 | kvm_mmu_zap_all(kvm); | 4551 | kvm_mmu_zap_all(kvm); |
4552 | kvm_reload_remote_mmus(kvm); | ||
4428 | } | 4553 | } |
4429 | 4554 | ||
4430 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) | 4555 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) |
@@ -4434,28 +4559,24 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) | |||
4434 | || vcpu->arch.nmi_pending; | 4559 | || vcpu->arch.nmi_pending; |
4435 | } | 4560 | } |
4436 | 4561 | ||
4437 | static void vcpu_kick_intr(void *info) | ||
4438 | { | ||
4439 | #ifdef DEBUG | ||
4440 | struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; | ||
4441 | printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); | ||
4442 | #endif | ||
4443 | } | ||
4444 | |||
4445 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu) | 4562 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu) |
4446 | { | 4563 | { |
4447 | int ipi_pcpu = vcpu->cpu; | 4564 | int me; |
4448 | int cpu = get_cpu(); | 4565 | int cpu = vcpu->cpu; |
4449 | 4566 | ||
4450 | if (waitqueue_active(&vcpu->wq)) { | 4567 | if (waitqueue_active(&vcpu->wq)) { |
4451 | wake_up_interruptible(&vcpu->wq); | 4568 | wake_up_interruptible(&vcpu->wq); |
4452 | ++vcpu->stat.halt_wakeup; | 4569 | ++vcpu->stat.halt_wakeup; |
4453 | } | 4570 | } |
4454 | /* | 4571 | |
4455 | * We may be called synchronously with irqs disabled in guest mode, | 4572 | me = get_cpu(); |
4456 | * So need not to call smp_call_function_single() in that case. | 4573 | if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) |
4457 | */ | 4574 | if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) |
4458 | if (vcpu->guest_mode && vcpu->cpu != cpu) | 4575 | smp_send_reschedule(cpu); |
4459 | smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0); | ||
4460 | put_cpu(); | 4576 | put_cpu(); |
4461 | } | 4577 | } |
4578 | |||
4579 | int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) | ||
4580 | { | ||
4581 | return kvm_x86_ops->interrupt_allowed(vcpu); | ||
4582 | } | ||
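[Note: the rewritten kvm_vcpu_kick() pairs with the KVM_REQ_KICK handling added to vcpu_enter_guest() earlier in this patch. The entry path clears the bit with interrupts disabled and then re-checks vcpu->requests, so the kicker only sends a reschedule IPI when it is the first to set the bit and the target CPU is online and different from its own. A hedged sketch of the intended caller pattern; the request bit name is hypothetical:

	/* Illustrative only: publish a request, then knock the vCPU out
	 * of guest mode so vcpu_enter_guest() sees it before re-entry. */
	set_bit(KVM_REQ_SOMETHING, &vcpu->requests);	/* hypothetical request bit */
	kvm_vcpu_kick(vcpu);
]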
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 6a4be78a7384..4c8e10af78e8 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h | |||
@@ -8,9 +8,11 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) | |||
8 | vcpu->arch.exception.pending = false; | 8 | vcpu->arch.exception.pending = false; |
9 | } | 9 | } |
10 | 10 | ||
11 | static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector) | 11 | static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector, |
12 | bool soft) | ||
12 | { | 13 | { |
13 | vcpu->arch.interrupt.pending = true; | 14 | vcpu->arch.interrupt.pending = true; |
15 | vcpu->arch.interrupt.soft = soft; | ||
14 | vcpu->arch.interrupt.nr = vector; | 16 | vcpu->arch.interrupt.nr = vector; |
15 | } | 17 | } |
16 | 18 | ||
@@ -19,4 +21,14 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) | |||
19 | vcpu->arch.interrupt.pending = false; | 21 | vcpu->arch.interrupt.pending = false; |
20 | } | 22 | } |
21 | 23 | ||
24 | static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu) | ||
25 | { | ||
26 | return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending || | ||
27 | vcpu->arch.nmi_injected; | ||
28 | } | ||
29 | |||
30 | static inline bool kvm_exception_is_soft(unsigned int nr) | ||
31 | { | ||
32 | return (nr == BP_VECTOR) || (nr == OF_VECTOR); | ||
33 | } | ||
22 | #endif | 34 | #endif |
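[Note: kvm_queue_interrupt() now records whether the vector came from a software interrupt instruction, and kvm_exception_is_soft() classifies #BP/#OF the same way, so backends can re-deliver such events by replaying the instruction rather than asserting a hardware vector. A hedged sketch of the intended call pattern; the surrounding condition and variable names are illustrative assumptions:

	/* Illustrative only: re-queue an event that was cut short during
	 * delivery, preserving whether it originated from INT n. */
	if (vector_info_valid)	/* hypothetical: backend says an event was in flight */
		kvm_queue_interrupt(vcpu, vector, is_soft_intr);
]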
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index ca91749d2083..c1b6c232e02b 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c | |||
@@ -59,13 +59,14 @@ | |||
59 | #define SrcImm (5<<4) /* Immediate operand. */ | 59 | #define SrcImm (5<<4) /* Immediate operand. */ |
60 | #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ | 60 | #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ |
61 | #define SrcOne (7<<4) /* Implied '1' */ | 61 | #define SrcOne (7<<4) /* Implied '1' */ |
62 | #define SrcMask (7<<4) | 62 | #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ |
63 | #define SrcMask (0xf<<4) | ||
63 | /* Generic ModRM decode. */ | 64 | /* Generic ModRM decode. */ |
64 | #define ModRM (1<<7) | 65 | #define ModRM (1<<8) |
65 | /* Destination is only written; never read. */ | 66 | /* Destination is only written; never read. */ |
66 | #define Mov (1<<8) | 67 | #define Mov (1<<9) |
67 | #define BitOp (1<<9) | 68 | #define BitOp (1<<10) |
68 | #define MemAbs (1<<10) /* Memory operand is absolute displacement */ | 69 | #define MemAbs (1<<11) /* Memory operand is absolute displacement */ |
69 | #define String (1<<12) /* String instruction (rep capable) */ | 70 | #define String (1<<12) /* String instruction (rep capable) */ |
70 | #define Stack (1<<13) /* Stack instruction (push/pop) */ | 71 | #define Stack (1<<13) /* Stack instruction (push/pop) */ |
71 | #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ | 72 | #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ |
@@ -76,6 +77,7 @@ | |||
76 | #define Src2CL (1<<29) | 77 | #define Src2CL (1<<29) |
77 | #define Src2ImmByte (2<<29) | 78 | #define Src2ImmByte (2<<29) |
78 | #define Src2One (3<<29) | 79 | #define Src2One (3<<29) |
80 | #define Src2Imm16 (4<<29) | ||
79 | #define Src2Mask (7<<29) | 81 | #define Src2Mask (7<<29) |
80 | 82 | ||
81 | enum { | 83 | enum { |
@@ -135,11 +137,11 @@ static u32 opcode_table[256] = { | |||
135 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ | 137 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ |
136 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ | 138 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ |
137 | /* 0x70 - 0x77 */ | 139 | /* 0x70 - 0x77 */ |
138 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | 140 | SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, |
139 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | 141 | SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, |
140 | /* 0x78 - 0x7F */ | 142 | /* 0x78 - 0x7F */ |
141 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | 143 | SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, |
142 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | 144 | SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, |
143 | /* 0x80 - 0x87 */ | 145 | /* 0x80 - 0x87 */ |
144 | Group | Group1_80, Group | Group1_81, | 146 | Group | Group1_80, Group | Group1_81, |
145 | Group | Group1_82, Group | Group1_83, | 147 | Group | Group1_82, Group | Group1_83, |
@@ -153,7 +155,8 @@ static u32 opcode_table[256] = { | |||
153 | /* 0x90 - 0x97 */ | 155 | /* 0x90 - 0x97 */ |
154 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | 156 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, |
155 | /* 0x98 - 0x9F */ | 157 | /* 0x98 - 0x9F */ |
156 | 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, | 158 | 0, 0, SrcImm | Src2Imm16, 0, |
159 | ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, | ||
157 | /* 0xA0 - 0xA7 */ | 160 | /* 0xA0 - 0xA7 */ |
158 | ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, | 161 | ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, |
159 | ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, | 162 | ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, |
@@ -178,7 +181,8 @@ static u32 opcode_table[256] = { | |||
178 | 0, ImplicitOps | Stack, 0, 0, | 181 | 0, ImplicitOps | Stack, 0, 0, |
179 | ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, | 182 | ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, |
180 | /* 0xC8 - 0xCF */ | 183 | /* 0xC8 - 0xCF */ |
181 | 0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0, | 184 | 0, 0, 0, ImplicitOps | Stack, |
185 | ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, | ||
182 | /* 0xD0 - 0xD7 */ | 186 | /* 0xD0 - 0xD7 */ |
183 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | 187 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, |
184 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | 188 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, |
@@ -187,11 +191,11 @@ static u32 opcode_table[256] = { | |||
187 | 0, 0, 0, 0, 0, 0, 0, 0, | 191 | 0, 0, 0, 0, 0, 0, 0, 0, |
188 | /* 0xE0 - 0xE7 */ | 192 | /* 0xE0 - 0xE7 */ |
189 | 0, 0, 0, 0, | 193 | 0, 0, 0, 0, |
190 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | 194 | ByteOp | SrcImmUByte, SrcImmUByte, |
191 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | 195 | ByteOp | SrcImmUByte, SrcImmUByte, |
192 | /* 0xE8 - 0xEF */ | 196 | /* 0xE8 - 0xEF */ |
193 | ImplicitOps | Stack, SrcImm | ImplicitOps, | 197 | SrcImm | Stack, SrcImm | ImplicitOps, |
194 | ImplicitOps, SrcImmByte | ImplicitOps, | 198 | SrcImm | Src2Imm16, SrcImmByte | ImplicitOps, |
195 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | 199 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, |
196 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | 200 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, |
197 | /* 0xF0 - 0xF7 */ | 201 | /* 0xF0 - 0xF7 */ |
@@ -230,10 +234,8 @@ static u32 twobyte_table[256] = { | |||
230 | /* 0x70 - 0x7F */ | 234 | /* 0x70 - 0x7F */ |
231 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 235 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
232 | /* 0x80 - 0x8F */ | 236 | /* 0x80 - 0x8F */ |
233 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | 237 | SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, |
234 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | 238 | SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, |
235 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
236 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
237 | /* 0x90 - 0x9F */ | 239 | /* 0x90 - 0x9F */ |
238 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 240 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
239 | /* 0xA0 - 0xA7 */ | 241 | /* 0xA0 - 0xA7 */ |
@@ -1044,10 +1046,14 @@ done_prefixes: | |||
1044 | } | 1046 | } |
1045 | break; | 1047 | break; |
1046 | case SrcImmByte: | 1048 | case SrcImmByte: |
1049 | case SrcImmUByte: | ||
1047 | c->src.type = OP_IMM; | 1050 | c->src.type = OP_IMM; |
1048 | c->src.ptr = (unsigned long *)c->eip; | 1051 | c->src.ptr = (unsigned long *)c->eip; |
1049 | c->src.bytes = 1; | 1052 | c->src.bytes = 1; |
1050 | c->src.val = insn_fetch(s8, 1, c->eip); | 1053 | if ((c->d & SrcMask) == SrcImmByte) |
1054 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
1055 | else | ||
1056 | c->src.val = insn_fetch(u8, 1, c->eip); | ||
1051 | break; | 1057 | break; |
1052 | case SrcOne: | 1058 | case SrcOne: |
1053 | c->src.bytes = 1; | 1059 | c->src.bytes = 1; |
@@ -1072,6 +1078,12 @@ done_prefixes: | |||
1072 | c->src2.bytes = 1; | 1078 | c->src2.bytes = 1; |
1073 | c->src2.val = insn_fetch(u8, 1, c->eip); | 1079 | c->src2.val = insn_fetch(u8, 1, c->eip); |
1074 | break; | 1080 | break; |
1081 | case Src2Imm16: | ||
1082 | c->src2.type = OP_IMM; | ||
1083 | c->src2.ptr = (unsigned long *)c->eip; | ||
1084 | c->src2.bytes = 2; | ||
1085 | c->src2.val = insn_fetch(u16, 2, c->eip); | ||
1086 | break; | ||
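[Note: Src2Imm16 exists for the far transfers wired up in the opcode table above (0x9a call far, 0xea jmp far): the first immediate is the offset and the trailing 16 bits are the code-segment selector, consumed later in this patch as c->src.val and c->src2.val. An illustrative encoding, assuming 32-bit operand size:

	/* "jmp far 0x0008:0x12345678" encodes as EA 78 56 34 12 08 00:
	 * a 4-byte offset (SrcImm) followed by a 2-byte selector
	 * (Src2Imm16). */
	unsigned char far_jmp[] = { 0xea, 0x78, 0x56, 0x34, 0x12, 0x08, 0x00 };
]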
1075 | case Src2One: | 1087 | case Src2One: |
1076 | c->src2.bytes = 1; | 1088 | c->src2.bytes = 1; |
1077 | c->src2.val = 1; | 1089 | c->src2.val = 1; |
@@ -1349,6 +1361,20 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, | |||
1349 | return 0; | 1361 | return 0; |
1350 | } | 1362 | } |
1351 | 1363 | ||
1364 | void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) | ||
1365 | { | ||
1366 | u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); | ||
1367 | /* | ||
1368 | * an sti; sti; sequence only disable interrupts for the first | ||
1369 | * instruction. So, if the last instruction, be it emulated or | ||
1370 | * not, left the system with the INT_STI flag enabled, it | ||
1371 | * means that the last instruction is an sti. We should not | ||
1372 | * leave the flag on in this case. The same goes for mov ss | ||
1373 | */ | ||
1374 | if (!(int_shadow & mask)) | ||
1375 | ctxt->interruptibility = mask; | ||
1376 | } | ||
1377 | |||
1352 | int | 1378 | int |
1353 | x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | 1379 | x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) |
1354 | { | 1380 | { |
@@ -1360,6 +1386,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1360 | int io_dir_in; | 1386 | int io_dir_in; |
1361 | int rc = 0; | 1387 | int rc = 0; |
1362 | 1388 | ||
1389 | ctxt->interruptibility = 0; | ||
1390 | |||
1363 | /* Shadow copy of register state. Committed on successful emulation. | 1391 | /* Shadow copy of register state. Committed on successful emulation. |
1364 | * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't | 1392 | * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't |
1365 | * modify them. | 1393 | * modify them. |
@@ -1531,13 +1559,10 @@ special_insn: | |||
1531 | return -1; | 1559 | return -1; |
1532 | } | 1560 | } |
1533 | return 0; | 1561 | return 0; |
1534 | case 0x70 ... 0x7f: /* jcc (short) */ { | 1562 | case 0x70 ... 0x7f: /* jcc (short) */ |
1535 | int rel = insn_fetch(s8, 1, c->eip); | ||
1536 | |||
1537 | if (test_cc(c->b, ctxt->eflags)) | 1563 | if (test_cc(c->b, ctxt->eflags)) |
1538 | jmp_rel(c, rel); | 1564 | jmp_rel(c, c->src.val); |
1539 | break; | 1565 | break; |
1540 | } | ||
1541 | case 0x80 ... 0x83: /* Grp1 */ | 1566 | case 0x80 ... 0x83: /* Grp1 */ |
1542 | switch (c->modrm_reg) { | 1567 | switch (c->modrm_reg) { |
1543 | case 0: | 1568 | case 0: |
@@ -1609,6 +1634,9 @@ special_insn: | |||
1609 | int err; | 1634 | int err; |
1610 | 1635 | ||
1611 | sel = c->src.val; | 1636 | sel = c->src.val; |
1637 | if (c->modrm_reg == VCPU_SREG_SS) | ||
1638 | toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); | ||
1639 | |||
1612 | if (c->modrm_reg <= 5) { | 1640 | if (c->modrm_reg <= 5) { |
1613 | type_bits = (c->modrm_reg == 1) ? 9 : 1; | 1641 | type_bits = (c->modrm_reg == 1) ? 9 : 1; |
1614 | err = kvm_load_segment_descriptor(ctxt->vcpu, sel, | 1642 | err = kvm_load_segment_descriptor(ctxt->vcpu, sel, |
@@ -1769,59 +1797,32 @@ special_insn: | |||
1769 | break; | 1797 | break; |
1770 | case 0xe4: /* inb */ | 1798 | case 0xe4: /* inb */ |
1771 | case 0xe5: /* in */ | 1799 | case 0xe5: /* in */ |
1772 | port = insn_fetch(u8, 1, c->eip); | 1800 | port = c->src.val; |
1773 | io_dir_in = 1; | 1801 | io_dir_in = 1; |
1774 | goto do_io; | 1802 | goto do_io; |
1775 | case 0xe6: /* outb */ | 1803 | case 0xe6: /* outb */ |
1776 | case 0xe7: /* out */ | 1804 | case 0xe7: /* out */ |
1777 | port = insn_fetch(u8, 1, c->eip); | 1805 | port = c->src.val; |
1778 | io_dir_in = 0; | 1806 | io_dir_in = 0; |
1779 | goto do_io; | 1807 | goto do_io; |
1780 | case 0xe8: /* call (near) */ { | 1808 | case 0xe8: /* call (near) */ { |
1781 | long int rel; | 1809 | long int rel = c->src.val; |
1782 | switch (c->op_bytes) { | ||
1783 | case 2: | ||
1784 | rel = insn_fetch(s16, 2, c->eip); | ||
1785 | break; | ||
1786 | case 4: | ||
1787 | rel = insn_fetch(s32, 4, c->eip); | ||
1788 | break; | ||
1789 | default: | ||
1790 | DPRINTF("Call: Invalid op_bytes\n"); | ||
1791 | goto cannot_emulate; | ||
1792 | } | ||
1793 | c->src.val = (unsigned long) c->eip; | 1810 | c->src.val = (unsigned long) c->eip; |
1794 | jmp_rel(c, rel); | 1811 | jmp_rel(c, rel); |
1795 | c->op_bytes = c->ad_bytes; | ||
1796 | emulate_push(ctxt); | 1812 | emulate_push(ctxt); |
1797 | break; | 1813 | break; |
1798 | } | 1814 | } |
1799 | case 0xe9: /* jmp rel */ | 1815 | case 0xe9: /* jmp rel */ |
1800 | goto jmp; | 1816 | goto jmp; |
1801 | case 0xea: /* jmp far */ { | 1817 | case 0xea: /* jmp far */ |
1802 | uint32_t eip; | 1818 | if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, |
1803 | uint16_t sel; | 1819 | VCPU_SREG_CS) < 0) { |
1804 | |||
1805 | switch (c->op_bytes) { | ||
1806 | case 2: | ||
1807 | eip = insn_fetch(u16, 2, c->eip); | ||
1808 | break; | ||
1809 | case 4: | ||
1810 | eip = insn_fetch(u32, 4, c->eip); | ||
1811 | break; | ||
1812 | default: | ||
1813 | DPRINTF("jmp far: Invalid op_bytes\n"); | ||
1814 | goto cannot_emulate; | ||
1815 | } | ||
1816 | sel = insn_fetch(u16, 2, c->eip); | ||
1817 | if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) { | ||
1818 | DPRINTF("jmp far: Failed to load CS descriptor\n"); | 1820 | DPRINTF("jmp far: Failed to load CS descriptor\n"); |
1819 | goto cannot_emulate; | 1821 | goto cannot_emulate; |
1820 | } | 1822 | } |
1821 | 1823 | ||
1822 | c->eip = eip; | 1824 | c->eip = c->src.val; |
1823 | break; | 1825 | break; |
1824 | } | ||
1825 | case 0xeb: | 1826 | case 0xeb: |
1826 | jmp: /* jmp rel short */ | 1827 | jmp: /* jmp rel short */ |
1827 | jmp_rel(c, c->src.val); | 1828 | jmp_rel(c, c->src.val); |
@@ -1865,6 +1866,7 @@ special_insn: | |||
1865 | c->dst.type = OP_NONE; /* Disable writeback. */ | 1866 | c->dst.type = OP_NONE; /* Disable writeback. */ |
1866 | break; | 1867 | break; |
1867 | case 0xfb: /* sti */ | 1868 | case 0xfb: /* sti */ |
1869 | toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); | ||
1868 | ctxt->eflags |= X86_EFLAGS_IF; | 1870 | ctxt->eflags |= X86_EFLAGS_IF; |
1869 | c->dst.type = OP_NONE; /* Disable writeback. */ | 1871 | c->dst.type = OP_NONE; /* Disable writeback. */ |
1870 | break; | 1872 | break; |
@@ -2039,28 +2041,11 @@ twobyte_insn: | |||
2039 | if (!test_cc(c->b, ctxt->eflags)) | 2041 | if (!test_cc(c->b, ctxt->eflags)) |
2040 | c->dst.type = OP_NONE; /* no writeback */ | 2042 | c->dst.type = OP_NONE; /* no writeback */ |
2041 | break; | 2043 | break; |
2042 | case 0x80 ... 0x8f: /* jnz rel, etc*/ { | 2044 | case 0x80 ... 0x8f: /* jnz rel, etc*/ |
2043 | long int rel; | ||
2044 | |||
2045 | switch (c->op_bytes) { | ||
2046 | case 2: | ||
2047 | rel = insn_fetch(s16, 2, c->eip); | ||
2048 | break; | ||
2049 | case 4: | ||
2050 | rel = insn_fetch(s32, 4, c->eip); | ||
2051 | break; | ||
2052 | case 8: | ||
2053 | rel = insn_fetch(s64, 8, c->eip); | ||
2054 | break; | ||
2055 | default: | ||
2056 | DPRINTF("jnz: Invalid op_bytes\n"); | ||
2057 | goto cannot_emulate; | ||
2058 | } | ||
2059 | if (test_cc(c->b, ctxt->eflags)) | 2045 | if (test_cc(c->b, ctxt->eflags)) |
2060 | jmp_rel(c, rel); | 2046 | jmp_rel(c, c->src.val); |
2061 | c->dst.type = OP_NONE; | 2047 | c->dst.type = OP_NONE; |
2062 | break; | 2048 | break; |
2063 | } | ||
2064 | case 0xa3: | 2049 | case 0xa3: |
2065 | bt: /* bt */ | 2050 | bt: /* bt */ |
2066 | c->dst.type = OP_NONE; | 2051 | c->dst.type = OP_NONE; |
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 8dab8f7844d3..38718041efc3 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig | |||
@@ -2,7 +2,6 @@ config LGUEST_GUEST | |||
2 | bool "Lguest guest support" | 2 | bool "Lguest guest support" |
3 | select PARAVIRT | 3 | select PARAVIRT |
4 | depends on X86_32 | 4 | depends on X86_32 |
5 | depends on !X86_PAE | ||
6 | select VIRTIO | 5 | select VIRTIO |
7 | select VIRTIO_RING | 6 | select VIRTIO_RING |
8 | select VIRTIO_CONSOLE | 7 | select VIRTIO_CONSOLE |
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4e0c26559395..7bc65f0f62c4 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
@@ -87,7 +87,7 @@ struct lguest_data lguest_data = { | |||
87 | 87 | ||
88 | /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a | 88 | /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a |
89 | * ring buffer of stored hypercalls which the Host will run though next time we | 89 | * ring buffer of stored hypercalls which the Host will run though next time we |
90 | * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall | 90 | * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall |
91 | * arguments, and a "hcall_status" word which is 0 if the call is ready to go, | 91 | * arguments, and a "hcall_status" word which is 0 if the call is ready to go, |
92 | * and 255 once the Host has finished with it. | 92 | * and 255 once the Host has finished with it. |
93 | * | 93 | * |
@@ -96,7 +96,8 @@ struct lguest_data lguest_data = { | |||
96 | * effect of causing the Host to run all the stored calls in the ring buffer | 96 | * effect of causing the Host to run all the stored calls in the ring buffer |
97 | * which empties it for next time! */ | 97 | * which empties it for next time! */ |
98 | static void async_hcall(unsigned long call, unsigned long arg1, | 98 | static void async_hcall(unsigned long call, unsigned long arg1, |
99 | unsigned long arg2, unsigned long arg3) | 99 | unsigned long arg2, unsigned long arg3, |
100 | unsigned long arg4) | ||
100 | { | 101 | { |
101 | /* Note: This code assumes we're uniprocessor. */ | 102 | /* Note: This code assumes we're uniprocessor. */ |
102 | static unsigned int next_call; | 103 | static unsigned int next_call; |
@@ -108,12 +109,13 @@ static void async_hcall(unsigned long call, unsigned long arg1, | |||
108 | local_irq_save(flags); | 109 | local_irq_save(flags); |
109 | if (lguest_data.hcall_status[next_call] != 0xFF) { | 110 | if (lguest_data.hcall_status[next_call] != 0xFF) { |
110 | /* Table full, so do normal hcall which will flush table. */ | 111 | /* Table full, so do normal hcall which will flush table. */ |
111 | kvm_hypercall3(call, arg1, arg2, arg3); | 112 | kvm_hypercall4(call, arg1, arg2, arg3, arg4); |
112 | } else { | 113 | } else { |
113 | lguest_data.hcalls[next_call].arg0 = call; | 114 | lguest_data.hcalls[next_call].arg0 = call; |
114 | lguest_data.hcalls[next_call].arg1 = arg1; | 115 | lguest_data.hcalls[next_call].arg1 = arg1; |
115 | lguest_data.hcalls[next_call].arg2 = arg2; | 116 | lguest_data.hcalls[next_call].arg2 = arg2; |
116 | lguest_data.hcalls[next_call].arg3 = arg3; | 117 | lguest_data.hcalls[next_call].arg3 = arg3; |
118 | lguest_data.hcalls[next_call].arg4 = arg4; | ||
117 | /* Arguments must all be written before we mark it to go */ | 119 | /* Arguments must all be written before we mark it to go */ |
118 | wmb(); | 120 | wmb(); |
119 | lguest_data.hcall_status[next_call] = 0; | 121 | lguest_data.hcall_status[next_call] = 0; |
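[Note: with the extra argument, each ring entry now carries the five slots that the updated comment above describes, plus a status byte kept in the separate hcall_status[] array (0 = ready for the Host, 0xFF = consumed). A sketch of that shape; the struct name here is illustrative, the real definition lives in the lguest headers:

	/* Illustrative only: one stored-hypercall ring entry. */
	struct ring_hcall {
		unsigned long arg0;	/* the hypercall number */
		unsigned long arg1, arg2, arg3, arg4;
	};
]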
@@ -141,7 +143,7 @@ static void lazy_hcall1(unsigned long call, | |||
141 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) | 143 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) |
142 | kvm_hypercall1(call, arg1); | 144 | kvm_hypercall1(call, arg1); |
143 | else | 145 | else |
144 | async_hcall(call, arg1, 0, 0); | 146 | async_hcall(call, arg1, 0, 0, 0); |
145 | } | 147 | } |
146 | 148 | ||
147 | static void lazy_hcall2(unsigned long call, | 149 | static void lazy_hcall2(unsigned long call, |
@@ -151,7 +153,7 @@ static void lazy_hcall2(unsigned long call, | |||
151 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) | 153 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) |
152 | kvm_hypercall2(call, arg1, arg2); | 154 | kvm_hypercall2(call, arg1, arg2); |
153 | else | 155 | else |
154 | async_hcall(call, arg1, arg2, 0); | 156 | async_hcall(call, arg1, arg2, 0, 0); |
155 | } | 157 | } |
156 | 158 | ||
157 | static void lazy_hcall3(unsigned long call, | 159 | static void lazy_hcall3(unsigned long call, |
@@ -162,9 +164,23 @@ static void lazy_hcall3(unsigned long call, | |||
162 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) | 164 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) |
163 | kvm_hypercall3(call, arg1, arg2, arg3); | 165 | kvm_hypercall3(call, arg1, arg2, arg3); |
164 | else | 166 | else |
165 | async_hcall(call, arg1, arg2, arg3); | 167 | async_hcall(call, arg1, arg2, arg3, 0); |
166 | } | 168 | } |
167 | 169 | ||
170 | #ifdef CONFIG_X86_PAE | ||
171 | static void lazy_hcall4(unsigned long call, | ||
172 | unsigned long arg1, | ||
173 | unsigned long arg2, | ||
174 | unsigned long arg3, | ||
175 | unsigned long arg4) | ||
176 | { | ||
177 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) | ||
178 | kvm_hypercall4(call, arg1, arg2, arg3, arg4); | ||
179 | else | ||
180 | async_hcall(call, arg1, arg2, arg3, arg4); | ||
181 | } | ||
182 | #endif | ||
183 | |||
168 | /* When lazy mode is turned off reset the per-cpu lazy mode variable and then | 184 | /* When lazy mode is turned off reset the per-cpu lazy mode variable and then |
169 | * issue the do-nothing hypercall to flush any stored calls. */ | 185 | * issue the do-nothing hypercall to flush any stored calls. */ |
170 | static void lguest_leave_lazy_mmu_mode(void) | 186 | static void lguest_leave_lazy_mmu_mode(void) |
@@ -179,7 +195,7 @@ static void lguest_end_context_switch(struct task_struct *next) | |||
179 | paravirt_end_context_switch(next); | 195 | paravirt_end_context_switch(next); |
180 | } | 196 | } |
181 | 197 | ||
182 | /*G:033 | 198 | /*G:032 |
183 | * After that diversion we return to our first native-instruction | 199 | * After that diversion we return to our first native-instruction |
184 | * replacements: four functions for interrupt control. | 200 | * replacements: four functions for interrupt control. |
185 | * | 201 | * |
@@ -199,30 +215,28 @@ static unsigned long save_fl(void) | |||
199 | { | 215 | { |
200 | return lguest_data.irq_enabled; | 216 | return lguest_data.irq_enabled; |
201 | } | 217 | } |
202 | PV_CALLEE_SAVE_REGS_THUNK(save_fl); | ||
203 | |||
204 | /* restore_flags() just sets the flags back to the value given. */ | ||
205 | static void restore_fl(unsigned long flags) | ||
206 | { | ||
207 | lguest_data.irq_enabled = flags; | ||
208 | } | ||
209 | PV_CALLEE_SAVE_REGS_THUNK(restore_fl); | ||
210 | 218 | ||
211 | /* Interrupts go off... */ | 219 | /* Interrupts go off... */ |
212 | static void irq_disable(void) | 220 | static void irq_disable(void) |
213 | { | 221 | { |
214 | lguest_data.irq_enabled = 0; | 222 | lguest_data.irq_enabled = 0; |
215 | } | 223 | } |
224 | |||
225 | /* Let's pause a moment. Remember how I said these are called so often? | ||
226 | * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to | ||
227 | * break some rules. In particular, these functions are assumed to save their | ||
228 | * own registers if they need to: normal C functions assume they can trash the | ||
229 | * eax register. To use normal C functions, we use | ||
230 | * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the | ||
231 | * C function, then restores it. */ | ||
232 | PV_CALLEE_SAVE_REGS_THUNK(save_fl); | ||
216 | PV_CALLEE_SAVE_REGS_THUNK(irq_disable); | 233 | PV_CALLEE_SAVE_REGS_THUNK(irq_disable); |
234 | /*:*/ | ||
217 | 235 | ||
218 | /* Interrupts go on... */ | 236 | /* These are in i386_head.S */ |
219 | static void irq_enable(void) | 237 | extern void lg_irq_enable(void); |
220 | { | 238 | extern void lg_restore_fl(unsigned long flags); |
221 | lguest_data.irq_enabled = X86_EFLAGS_IF; | ||
222 | } | ||
223 | PV_CALLEE_SAVE_REGS_THUNK(irq_enable); | ||
224 | 239 | ||
225 | /*:*/ | ||
226 | /*M:003 Note that we don't check for outstanding interrupts when we re-enable | 240 | /*M:003 Note that we don't check for outstanding interrupts when we re-enable |
227 | * them (or when we unmask an interrupt). This seems to work for the moment, | 241 | * them (or when we unmask an interrupt). This seems to work for the moment, |
228 | * since interrupts are rare and we'll just get the interrupt on the next timer | 242 | * since interrupts are rare and we'll just get the interrupt on the next timer |
@@ -368,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | |||
368 | case 1: /* Basic feature request. */ | 382 | case 1: /* Basic feature request. */ |
369 | /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ | 383 | /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ |
370 | *cx &= 0x00002201; | 384 | *cx &= 0x00002201; |
371 | /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */ | 385 | /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ |
372 | *dx &= 0x07808111; | 386 | *dx &= 0x07808151; |
373 | /* The Host can do a nice optimization if it knows that the | 387 | /* The Host can do a nice optimization if it knows that the |
374 | * kernel mappings (addresses above 0xC0000000 or whatever | 388 | * kernel mappings (addresses above 0xC0000000 or whatever |
375 | * PAGE_OFFSET is set to) haven't changed. But Linux calls | 389 | * PAGE_OFFSET is set to) haven't changed. But Linux calls |
@@ -388,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | |||
388 | if (*ax > 0x80000008) | 402 | if (*ax > 0x80000008) |
389 | *ax = 0x80000008; | 403 | *ax = 0x80000008; |
390 | break; | 404 | break; |
405 | case 0x80000001: | ||
406 | /* Here we should fix nx cap depending on host. */ | ||
407 | /* For this version of PAE, we just clear NX bit. */ | ||
408 | *dx &= ~(1 << 20); | ||
409 | break; | ||
391 | } | 410 | } |
392 | } | 411 | } |
393 | 412 | ||
@@ -521,25 +540,52 @@ static void lguest_write_cr4(unsigned long val) | |||
521 | static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, | 540 | static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, |
522 | pte_t *ptep) | 541 | pte_t *ptep) |
523 | { | 542 | { |
543 | #ifdef CONFIG_X86_PAE | ||
544 | lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, | ||
545 | ptep->pte_low, ptep->pte_high); | ||
546 | #else | ||
524 | lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); | 547 | lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); |
548 | #endif | ||
525 | } | 549 | } |
526 | 550 | ||
527 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, | 551 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, |
528 | pte_t *ptep, pte_t pteval) | 552 | pte_t *ptep, pte_t pteval) |
529 | { | 553 | { |
530 | *ptep = pteval; | 554 | native_set_pte(ptep, pteval); |
531 | lguest_pte_update(mm, addr, ptep); | 555 | lguest_pte_update(mm, addr, ptep); |
532 | } | 556 | } |
533 | 557 | ||
534 | /* The Guest calls this to set a top-level entry. Again, we set the entry then | 558 | /* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd |
535 | * tell the Host which top-level page we changed, and the index of the entry we | 559 | * to set a middle-level entry when PAE is activated. |
536 | * changed. */ | 560 | * Again, we set the entry then tell the Host which page we changed, |
561 | * and the index of the entry we changed. */ | ||
562 | #ifdef CONFIG_X86_PAE | ||
563 | static void lguest_set_pud(pud_t *pudp, pud_t pudval) | ||
564 | { | ||
565 | native_set_pud(pudp, pudval); | ||
566 | |||
567 | /* 32 bytes aligned pdpt address and the index. */ | ||
568 | lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, | ||
569 | (__pa(pudp) & 0x1F) / sizeof(pud_t)); | ||
570 | } | ||
571 | |||
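[Note: under PAE the top level is a four-entry page-directory-pointer table that is only 32 bytes long, so instead of a page-masked address the hypercall passes the 32-byte-aligned base plus the entry index recovered from the low address bits. A worked example with an assumed physical address (pud_t is 8 bytes here):

	/* Illustrative only: __pa(pudp) == 0x01234568 gives */
	unsigned long base = 0x01234568 & 0xFFFFFFE0;	/* 0x01234560, the pdpt base */
	unsigned long idx  = (0x01234568 & 0x1F) / 8;	/* entry 1 of the pdpt */
]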
537 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | 572 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) |
538 | { | 573 | { |
539 | *pmdp = pmdval; | 574 | native_set_pmd(pmdp, pmdval); |
540 | lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, | 575 | lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, |
541 | (__pa(pmdp) & (PAGE_SIZE - 1)) / 4); | 576 | (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); |
542 | } | 577 | } |
578 | #else | ||
579 | |||
580 | /* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not | ||
581 | * activated. */ | ||
582 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | ||
583 | { | ||
584 | native_set_pmd(pmdp, pmdval); | ||
585 | lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, | ||
586 | (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); | ||
587 | } | ||
588 | #endif | ||
543 | 589 | ||
544 | /* There are a couple of legacy places where the kernel sets a PTE, but we | 590 | /* There are a couple of legacy places where the kernel sets a PTE, but we |
545 | * don't know the top level any more. This is useless for us, since we don't | 591 | * don't know the top level any more. This is useless for us, since we don't |
@@ -552,11 +598,31 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | |||
552 | * which brings boot back to 0.25 seconds. */ | 598 | * which brings boot back to 0.25 seconds. */ |
553 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) | 599 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) |
554 | { | 600 | { |
555 | *ptep = pteval; | 601 | native_set_pte(ptep, pteval); |
602 | if (cr3_changed) | ||
603 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); | ||
604 | } | ||
605 | |||
606 | #ifdef CONFIG_X86_PAE | ||
607 | static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) | ||
608 | { | ||
609 | native_set_pte_atomic(ptep, pte); | ||
556 | if (cr3_changed) | 610 | if (cr3_changed) |
557 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); | 611 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); |
558 | } | 612 | } |
559 | 613 | ||
614 | void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | ||
615 | { | ||
616 | native_pte_clear(mm, addr, ptep); | ||
617 | lguest_pte_update(mm, addr, ptep); | ||
618 | } | ||
619 | |||
620 | void lguest_pmd_clear(pmd_t *pmdp) | ||
621 | { | ||
622 | lguest_set_pmd(pmdp, __pmd(0)); | ||
623 | } | ||
624 | #endif | ||
625 | |||
560 | /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on | 626 | /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on |
561 | * native page table operations. On native hardware you can set a new page | 627 | * native page table operations. On native hardware you can set a new page |
562 | * table entry whenever you want, but if you want to remove one you have to do | 628 | * table entry whenever you want, but if you want to remove one you have to do |
@@ -628,13 +694,12 @@ static void __init lguest_init_IRQ(void) | |||
628 | { | 694 | { |
629 | unsigned int i; | 695 | unsigned int i; |
630 | 696 | ||
631 | for (i = 0; i < LGUEST_IRQS; i++) { | 697 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { |
632 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
633 | /* Some systems map "vectors" to interrupts weirdly. Lguest has | 698 | /* Some systems map "vectors" to interrupts weirdly. Lguest has |
634 | * a straightforward 1 to 1 mapping, so force that here. */ | 699 | * a straightforward 1 to 1 mapping, so force that here. */ |
635 | __get_cpu_var(vector_irq)[vector] = i; | 700 | __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; |
636 | if (vector != SYSCALL_VECTOR) | 701 | if (i != SYSCALL_VECTOR) |
637 | set_intr_gate(vector, interrupt[i]); | 702 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); |
638 | } | 703 | } |
639 | /* This call is required to set up for 4k stacks, where we have | 704 | /* This call is required to set up for 4k stacks, where we have |
640 | * separate stacks for hard and soft interrupts. */ | 705 | * separate stacks for hard and soft interrupts. */ |
@@ -973,10 +1038,10 @@ static void lguest_restart(char *reason) | |||
973 | * | 1038 | * |
974 | * Our current solution is to allow the paravirt back end to optionally patch | 1039 | * Our current solution is to allow the paravirt back end to optionally patch |
975 | * over the indirect calls to replace them with something more efficient. We | 1040 | * over the indirect calls to replace them with something more efficient. We |
976 | * patch the four most commonly called functions: disable interrupts, enable | 1041 | * patch two of the simplest of the most commonly called functions: disable |
977 | * interrupts, restore interrupts and save interrupts. We usually have 6 or 10 | 1042 | * interrupts and save interrupts. We usually have 6 or 10 bytes to patch |
978 | * bytes to patch into: the Guest versions of these operations are small enough | 1043 | * into: the Guest versions of these operations are small enough that we can |
979 | * that we can fit comfortably. | 1044 | * fit comfortably. |
980 | * | 1045 | * |
981 | * First we need assembly templates of each of the patchable Guest operations, | 1046 | * First we need assembly templates of each of the patchable Guest operations, |
982 | * and these are in i386_head.S. */ | 1047 | * and these are in i386_head.S. */ |
@@ -987,8 +1052,6 @@ static const struct lguest_insns | |||
987 | const char *start, *end; | 1052 | const char *start, *end; |
988 | } lguest_insns[] = { | 1053 | } lguest_insns[] = { |
989 | [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, | 1054 | [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, |
990 | [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti }, | ||
991 | [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf }, | ||
992 | [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, | 1055 | [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, |
993 | }; | 1056 | }; |
994 | 1057 | ||
@@ -1026,6 +1089,7 @@ __init void lguest_init(void) | |||
1026 | pv_info.name = "lguest"; | 1089 | pv_info.name = "lguest"; |
1027 | pv_info.paravirt_enabled = 1; | 1090 | pv_info.paravirt_enabled = 1; |
1028 | pv_info.kernel_rpl = 1; | 1091 | pv_info.kernel_rpl = 1; |
1092 | pv_info.shared_kernel_pmd = 1; | ||
1029 | 1093 | ||
1030 | /* We set up all the lguest overrides for sensitive operations. These | 1094 | /* We set up all the lguest overrides for sensitive operations. These |
1031 | * are detailed with the operations themselves. */ | 1095 | * are detailed with the operations themselves. */ |
@@ -1033,9 +1097,9 @@ __init void lguest_init(void) | |||
1033 | /* interrupt-related operations */ | 1097 | /* interrupt-related operations */ |
1034 | pv_irq_ops.init_IRQ = lguest_init_IRQ; | 1098 | pv_irq_ops.init_IRQ = lguest_init_IRQ; |
1035 | pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); | 1099 | pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); |
1036 | pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl); | 1100 | pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); |
1037 | pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); | 1101 | pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); |
1038 | pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable); | 1102 | pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); |
1039 | pv_irq_ops.safe_halt = lguest_safe_halt; | 1103 | pv_irq_ops.safe_halt = lguest_safe_halt; |
1040 | 1104 | ||
1041 | /* init-time operations */ | 1105 | /* init-time operations */ |
@@ -1071,6 +1135,12 @@ __init void lguest_init(void) | |||
1071 | pv_mmu_ops.set_pte = lguest_set_pte; | 1135 | pv_mmu_ops.set_pte = lguest_set_pte; |
1072 | pv_mmu_ops.set_pte_at = lguest_set_pte_at; | 1136 | pv_mmu_ops.set_pte_at = lguest_set_pte_at; |
1073 | pv_mmu_ops.set_pmd = lguest_set_pmd; | 1137 | pv_mmu_ops.set_pmd = lguest_set_pmd; |
1138 | #ifdef CONFIG_X86_PAE | ||
1139 | pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; | ||
1140 | pv_mmu_ops.pte_clear = lguest_pte_clear; | ||
1141 | pv_mmu_ops.pmd_clear = lguest_pmd_clear; | ||
1142 | pv_mmu_ops.set_pud = lguest_set_pud; | ||
1143 | #endif | ||
1074 | pv_mmu_ops.read_cr2 = lguest_read_cr2; | 1144 | pv_mmu_ops.read_cr2 = lguest_read_cr2; |
1075 | pv_mmu_ops.read_cr3 = lguest_read_cr3; | 1145 | pv_mmu_ops.read_cr3 = lguest_read_cr3; |
1076 | pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; | 1146 | pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; |
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index f79541989471..a9c8cfe61cd4 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S | |||
@@ -46,10 +46,64 @@ ENTRY(lguest_entry) | |||
46 | .globl lgstart_##name; .globl lgend_##name | 46 | .globl lgstart_##name; .globl lgend_##name |
47 | 47 | ||
48 | LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) | 48 | LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) |
49 | LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled) | ||
50 | LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled) | ||
51 | LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) | 49 | LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) |
52 | /*:*/ | 50 | |
51 | /*G:033 But using those wrappers is inefficient (we'll see why that doesn't | ||
52 | * matter for save_fl and irq_disable later). If we write our routines | ||
53 | * carefully in assembler, we can avoid clobbering any registers and avoid | ||
54 | * jumping through the wrapper functions. | ||
55 | * | ||
56 | * I skipped over our first piece of assembler, but this one is worth studying | ||
57 | * in a bit more detail so I'll describe it in easy stages. First, the routine | ||
58 | * to enable interrupts: */ | ||
59 | ENTRY(lg_irq_enable) | ||
60 | /* The reverse of irq_disable, this sets lguest_data.irq_enabled to | ||
61 | * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ | ||
62 | movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled | ||
63 | /* But now we need to check if the Host wants to know: there might have | ||
64 | * been interrupts waiting to be delivered, in which case it will have | ||
65 | * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we | ||
66 | * jump to send_interrupts, otherwise we're done. */ | ||
67 | cmpl $0, lguest_data+LGUEST_DATA_irq_pending | ||
68 | jnz send_interrupts | ||
69 | /* One cool thing about x86 is that you can do many things without using | ||
70 | * a register. In this case, the normal path hasn't needed to save or | ||
71 | * restore any registers at all! */ | ||
72 | ret | ||
73 | send_interrupts: | ||
74 | /* OK, now we need a register: eax is used for the hypercall number, | ||
75 | * which is LHCALL_SEND_INTERRUPTS. | ||
76 | * | ||
77 | * We used not to bother with this pending detection at all, which was | ||
78 | * much simpler. Sooner or later the Host would realize it had to | ||
79 | * send us an interrupt. But that turns out to make performance 7 | ||
80 | * times worse on a simple tcp benchmark. So now we do this the hard | ||
81 | * way. */ | ||
82 | pushl %eax | ||
83 | movl $LHCALL_SEND_INTERRUPTS, %eax | ||
84 | /* This is a vmcall instruction (same thing that KVM uses). Older | ||
85 | * assembler versions might not know the "vmcall" instruction, so we | ||
86 | * create one manually here. */ | ||
87 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ | ||
88 | popl %eax | ||
89 | ret | ||
90 | |||
91 | /* Finally, the "popf" or "restore flags" routine. The %eax register holds the | ||
92 | * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're | ||
93 | * enabling interrupts again, if it's 0 we're leaving them off. */ | ||
94 | ENTRY(lg_restore_fl) | ||
95 | /* This is just "lguest_data.irq_enabled = flags;" */ | ||
96 | movl %eax, lguest_data+LGUEST_DATA_irq_enabled | ||
97 | /* Now, if the %eax value has enabled interrupts and | ||
98 | * lguest_data.irq_pending is set, we want to tell the Host so it can | ||
99 | * deliver any outstanding interrupts. Fortunately, both values will | ||
100 | * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" | ||
101 | * instruction will AND them together for us. If both are set, we | ||
102 | * jump to send_interrupts. */ | ||
103 | testl lguest_data+LGUEST_DATA_irq_pending, %eax | ||
104 | jnz send_interrupts | ||
105 | /* Again, the normal path has used no extra registers. Clever, huh? */ | ||
106 | ret | ||
53 | 107 | ||
54 | /* These demark the EIP range where host should never deliver interrupts. */ | 108 | /* These demark the EIP range where host should never deliver interrupts. */ |
55 | .global lguest_noirq_start | 109 | .global lguest_noirq_start |
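For readers who find the assembler above dense, here is a minimal C rendering (not part of the patch) of what lg_irq_enable and lg_restore_fl do. The helper send_interrupts_hcall() and the _c-suffixed function names are hypothetical stand-ins; the lguest_data field names are assumed to match the LGUEST_DATA_* asm-offsets used above, and the real code must stay in assembler so the fast path clobbers no registers.

    #include <linux/lguest.h>               /* struct lguest_data, lguest_data */
    #include <asm/processor-flags.h>        /* X86_EFLAGS_IF */

    /* Hypothetical stand-in for the vmcall sequence issuing
     * LHCALL_SEND_INTERRUPTS shown above. */
    static void send_interrupts_hcall(void)
    {
            /* Would load LHCALL_SEND_INTERRUPTS into %eax and vmcall. */
    }

    /* C rendering of lg_irq_enable: flip the flag, then kick the Host
     * only if it already has interrupts queued for us. */
    static void lg_irq_enable_c(void)
    {
            lguest_data.irq_enabled = X86_EFLAGS_IF;
            if (lguest_data.irq_pending)
                    send_interrupts_hcall();
    }

    /* C rendering of lg_restore_fl: flags is either X86_EFLAGS_IF or 0,
     * so ANDing it with irq_pending is non-zero only when we both
     * re-enable interrupts and something is pending. */
    static void lg_restore_fl_c(unsigned long flags)
    {
            lguest_data.irq_enabled = flags;
            if (flags & lguest_data.irq_pending)
                    send_interrupts_hcall();
    }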
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 55e11aa6d66c..f9d35632666b 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile | |||
@@ -2,7 +2,7 @@ | |||
2 | # Makefile for x86 specific library files. | 2 | # Makefile for x86 specific library files. |
3 | # | 3 | # |
4 | 4 | ||
5 | obj-$(CONFIG_SMP) := msr-on-cpu.o | 5 | obj-$(CONFIG_SMP) := msr.o |
6 | 6 | ||
7 | lib-y := delay.o | 7 | lib-y := delay.o |
8 | lib-y += thunk_$(BITS).o | 8 | lib-y += thunk_$(BITS).o |
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c deleted file mode 100644 index 321cf720dbb6..000000000000 --- a/arch/x86/lib/msr-on-cpu.c +++ /dev/null | |||
@@ -1,97 +0,0 @@ | |||
1 | #include <linux/module.h> | ||
2 | #include <linux/preempt.h> | ||
3 | #include <linux/smp.h> | ||
4 | #include <asm/msr.h> | ||
5 | |||
6 | struct msr_info { | ||
7 | u32 msr_no; | ||
8 | u32 l, h; | ||
9 | int err; | ||
10 | }; | ||
11 | |||
12 | static void __rdmsr_on_cpu(void *info) | ||
13 | { | ||
14 | struct msr_info *rv = info; | ||
15 | |||
16 | rdmsr(rv->msr_no, rv->l, rv->h); | ||
17 | } | ||
18 | |||
19 | static void __wrmsr_on_cpu(void *info) | ||
20 | { | ||
21 | struct msr_info *rv = info; | ||
22 | |||
23 | wrmsr(rv->msr_no, rv->l, rv->h); | ||
24 | } | ||
25 | |||
26 | int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) | ||
27 | { | ||
28 | int err; | ||
29 | struct msr_info rv; | ||
30 | |||
31 | rv.msr_no = msr_no; | ||
32 | err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); | ||
33 | *l = rv.l; | ||
34 | *h = rv.h; | ||
35 | |||
36 | return err; | ||
37 | } | ||
38 | |||
39 | int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) | ||
40 | { | ||
41 | int err; | ||
42 | struct msr_info rv; | ||
43 | |||
44 | rv.msr_no = msr_no; | ||
45 | rv.l = l; | ||
46 | rv.h = h; | ||
47 | err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); | ||
48 | |||
49 | return err; | ||
50 | } | ||
51 | |||
52 | /* These "safe" variants are slower and should be used when the target MSR | ||
53 | may not actually exist. */ | ||
54 | static void __rdmsr_safe_on_cpu(void *info) | ||
55 | { | ||
56 | struct msr_info *rv = info; | ||
57 | |||
58 | rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h); | ||
59 | } | ||
60 | |||
61 | static void __wrmsr_safe_on_cpu(void *info) | ||
62 | { | ||
63 | struct msr_info *rv = info; | ||
64 | |||
65 | rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h); | ||
66 | } | ||
67 | |||
68 | int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) | ||
69 | { | ||
70 | int err; | ||
71 | struct msr_info rv; | ||
72 | |||
73 | rv.msr_no = msr_no; | ||
74 | err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); | ||
75 | *l = rv.l; | ||
76 | *h = rv.h; | ||
77 | |||
78 | return err ? err : rv.err; | ||
79 | } | ||
80 | |||
81 | int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) | ||
82 | { | ||
83 | int err; | ||
84 | struct msr_info rv; | ||
85 | |||
86 | rv.msr_no = msr_no; | ||
87 | rv.l = l; | ||
88 | rv.h = h; | ||
89 | err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); | ||
90 | |||
91 | return err ? err : rv.err; | ||
92 | } | ||
93 | |||
94 | EXPORT_SYMBOL(rdmsr_on_cpu); | ||
95 | EXPORT_SYMBOL(wrmsr_on_cpu); | ||
96 | EXPORT_SYMBOL(rdmsr_safe_on_cpu); | ||
97 | EXPORT_SYMBOL(wrmsr_safe_on_cpu); | ||
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c new file mode 100644 index 000000000000..1440b9c0547e --- /dev/null +++ b/arch/x86/lib/msr.c | |||
@@ -0,0 +1,183 @@ | |||
1 | #include <linux/module.h> | ||
2 | #include <linux/preempt.h> | ||
3 | #include <linux/smp.h> | ||
4 | #include <asm/msr.h> | ||
5 | |||
6 | struct msr_info { | ||
7 | u32 msr_no; | ||
8 | struct msr reg; | ||
9 | struct msr *msrs; | ||
10 | int off; | ||
11 | int err; | ||
12 | }; | ||
13 | |||
14 | static void __rdmsr_on_cpu(void *info) | ||
15 | { | ||
16 | struct msr_info *rv = info; | ||
17 | struct msr *reg; | ||
18 | int this_cpu = raw_smp_processor_id(); | ||
19 | |||
20 | if (rv->msrs) | ||
21 | reg = &rv->msrs[this_cpu - rv->off]; | ||
22 | else | ||
23 | reg = &rv->reg; | ||
24 | |||
25 | rdmsr(rv->msr_no, reg->l, reg->h); | ||
26 | } | ||
27 | |||
28 | static void __wrmsr_on_cpu(void *info) | ||
29 | { | ||
30 | struct msr_info *rv = info; | ||
31 | struct msr *reg; | ||
32 | int this_cpu = raw_smp_processor_id(); | ||
33 | |||
34 | if (rv->msrs) | ||
35 | reg = &rv->msrs[this_cpu - rv->off]; | ||
36 | else | ||
37 | reg = &rv->reg; | ||
38 | |||
39 | wrmsr(rv->msr_no, reg->l, reg->h); | ||
40 | } | ||
41 | |||
42 | int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) | ||
43 | { | ||
44 | int err; | ||
45 | struct msr_info rv; | ||
46 | |||
47 | memset(&rv, 0, sizeof(rv)); | ||
48 | |||
49 | rv.msr_no = msr_no; | ||
50 | err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); | ||
51 | *l = rv.reg.l; | ||
52 | *h = rv.reg.h; | ||
53 | |||
54 | return err; | ||
55 | } | ||
56 | EXPORT_SYMBOL(rdmsr_on_cpu); | ||
57 | |||
58 | int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) | ||
59 | { | ||
60 | int err; | ||
61 | struct msr_info rv; | ||
62 | |||
63 | memset(&rv, 0, sizeof(rv)); | ||
64 | |||
65 | rv.msr_no = msr_no; | ||
66 | rv.reg.l = l; | ||
67 | rv.reg.h = h; | ||
68 | err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); | ||
69 | |||
70 | return err; | ||
71 | } | ||
72 | EXPORT_SYMBOL(wrmsr_on_cpu); | ||
73 | |||
74 | /* rdmsr on a bunch of CPUs | ||
75 | * | ||
76 | * @mask: which CPUs | ||
77 | * @msr_no: which MSR | ||
78 | * @msrs: array of MSR values | ||
79 | * | ||
80 | */ | ||
81 | void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) | ||
82 | { | ||
83 | struct msr_info rv; | ||
84 | int this_cpu; | ||
85 | |||
86 | memset(&rv, 0, sizeof(rv)); | ||
87 | |||
88 | rv.off = cpumask_first(mask); | ||
89 | rv.msrs = msrs; | ||
90 | rv.msr_no = msr_no; | ||
91 | |||
92 | preempt_disable(); | ||
93 | /* | ||
94 | * FIXME: handle the CPU we're executing on separately for now until | ||
95 | * smp_call_function_many has been fixed to not skip it. | ||
96 | */ | ||
97 | this_cpu = raw_smp_processor_id(); | ||
98 | smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1); | ||
99 | |||
100 | smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1); | ||
101 | preempt_enable(); | ||
102 | } | ||
103 | EXPORT_SYMBOL(rdmsr_on_cpus); | ||
104 | |||
105 | /* | ||
106 | * wrmsr on a bunch of CPUs | ||
107 | * | ||
108 | * @mask: which CPUs | ||
109 | * @msr_no: which MSR | ||
110 | * @msrs: array of MSR values | ||
111 | * | ||
112 | */ | ||
113 | void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) | ||
114 | { | ||
115 | struct msr_info rv; | ||
116 | int this_cpu; | ||
117 | |||
118 | memset(&rv, 0, sizeof(rv)); | ||
119 | |||
120 | rv.off = cpumask_first(mask); | ||
121 | rv.msrs = msrs; | ||
122 | rv.msr_no = msr_no; | ||
123 | |||
124 | preempt_disable(); | ||
125 | /* | ||
126 | * FIXME: handle the CPU we're executing on separately for now until | ||
127 | * smp_call_function_many has been fixed to not skip it. | ||
128 | */ | ||
129 | this_cpu = raw_smp_processor_id(); | ||
130 | smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1); | ||
131 | |||
132 | smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1); | ||
133 | preempt_enable(); | ||
134 | } | ||
135 | EXPORT_SYMBOL(wrmsr_on_cpus); | ||
136 | |||
137 | /* These "safe" variants are slower and should be used when the target MSR | ||
138 | may not actually exist. */ | ||
139 | static void __rdmsr_safe_on_cpu(void *info) | ||
140 | { | ||
141 | struct msr_info *rv = info; | ||
142 | |||
143 | rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h); | ||
144 | } | ||
145 | |||
146 | static void __wrmsr_safe_on_cpu(void *info) | ||
147 | { | ||
148 | struct msr_info *rv = info; | ||
149 | |||
150 | rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h); | ||
151 | } | ||
152 | |||
153 | int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) | ||
154 | { | ||
155 | int err; | ||
156 | struct msr_info rv; | ||
157 | |||
158 | memset(&rv, 0, sizeof(rv)); | ||
159 | |||
160 | rv.msr_no = msr_no; | ||
161 | err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); | ||
162 | *l = rv.reg.l; | ||
163 | *h = rv.reg.h; | ||
164 | |||
165 | return err ? err : rv.err; | ||
166 | } | ||
167 | EXPORT_SYMBOL(rdmsr_safe_on_cpu); | ||
168 | |||
169 | int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) | ||
170 | { | ||
171 | int err; | ||
172 | struct msr_info rv; | ||
173 | |||
174 | memset(&rv, 0, sizeof(rv)); | ||
175 | |||
176 | rv.msr_no = msr_no; | ||
177 | rv.reg.l = l; | ||
178 | rv.reg.h = h; | ||
179 | err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); | ||
180 | |||
181 | return err ? err : rv.err; | ||
182 | } | ||
183 | EXPORT_SYMBOL(wrmsr_safe_on_cpu); | ||
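As a usage illustration (not part of the patch), a caller can read one MSR on a remote CPU or gather it across a CPU mask with the helpers added above; rdmsr_on_cpus() fills a struct msr array indexed from cpumask_first(mask). The function msr_example() is hypothetical, MSR 0x10 (the TSC) is only an example value, and struct msr is assumed to be declared in <asm/msr.h> by this series.

    #include <linux/kernel.h>
    #include <linux/smp.h>
    #include <asm/msr.h>

    /* Sketch: read MSR 0x10 remotely and across all online CPUs. */
    static void msr_example(void)
    {
            u32 lo, hi;
            static struct msr results[NR_CPUS];

            /* One CPU: returns 0 on success, like smp_call_function_single(). */
            if (rdmsr_on_cpu(1, 0x10, &lo, &hi) == 0)
                    printk(KERN_INFO "CPU1 MSR 0x10: %08x%08x\n", hi, lo);

            /* Many CPUs: results[i] holds the value read on CPU
             * (cpumask_first(mask) + i). */
            rdmsr_on_cpus(cpu_online_mask, 0x10, results);
    }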
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index fdd30d08ab52..eefdeee8a871 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
@@ -10,6 +10,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o | |||
10 | 10 | ||
11 | obj-$(CONFIG_HIGHMEM) += highmem_32.o | 11 | obj-$(CONFIG_HIGHMEM) += highmem_32.o |
12 | 12 | ||
13 | obj-$(CONFIG_KMEMCHECK) += kmemcheck/ | ||
14 | |||
13 | obj-$(CONFIG_MMIOTRACE) += mmiotrace.o | 15 | obj-$(CONFIG_MMIOTRACE) += mmiotrace.o |
14 | mmiotrace-y := kmmio.o pf_in.o mmio-mod.o | 16 | mmiotrace-y := kmmio.o pf_in.o mmio-mod.o |
15 | obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o | 17 | obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c6acc6326374..baa0e86adfbc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -14,6 +14,7 @@ | |||
14 | 14 | ||
15 | #include <asm/traps.h> /* dotraplinkage, ... */ | 15 | #include <asm/traps.h> /* dotraplinkage, ... */ |
16 | #include <asm/pgalloc.h> /* pgd_*(), ... */ | 16 | #include <asm/pgalloc.h> /* pgd_*(), ... */ |
17 | #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ | ||
17 | 18 | ||
18 | /* | 19 | /* |
19 | * Page fault error code bits: | 20 | * Page fault error code bits: |
@@ -956,6 +957,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
956 | /* Get the faulting address: */ | 957 | /* Get the faulting address: */ |
957 | address = read_cr2(); | 958 | address = read_cr2(); |
958 | 959 | ||
960 | /* | ||
961 | * Detect and handle instructions that would cause a page fault for | ||
962 | * both a tracked kernel page and a userspace page. | ||
963 | */ | ||
964 | if (kmemcheck_active(regs)) | ||
965 | kmemcheck_hide(regs); | ||
966 | |||
959 | if (unlikely(kmmio_fault(regs, address))) | 967 | if (unlikely(kmmio_fault(regs, address))) |
960 | return; | 968 | return; |
961 | 969 | ||
@@ -973,9 +981,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
973 | * protection error (error_code & 9) == 0. | 981 | * protection error (error_code & 9) == 0. |
974 | */ | 982 | */ |
975 | if (unlikely(fault_in_kernel_space(address))) { | 983 | if (unlikely(fault_in_kernel_space(address))) { |
976 | if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && | 984 | if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { |
977 | vmalloc_fault(address) >= 0) | 985 | if (vmalloc_fault(address) >= 0) |
978 | return; | 986 | return; |
987 | |||
988 | if (kmemcheck_fault(regs, address, error_code)) | ||
989 | return; | ||
990 | } | ||
979 | 991 | ||
980 | /* Can handle a stale RO->RW TLB: */ | 992 | /* Can handle a stale RO->RW TLB: */ |
981 | if (spurious_fault(error_code, address)) | 993 | if (spurious_fault(error_code, address)) |
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 34c1bfb64f1c..f53b57e4086f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c | |||
@@ -213,7 +213,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, | |||
213 | if (!after_bootmem) | 213 | if (!after_bootmem) |
214 | init_gbpages(); | 214 | init_gbpages(); |
215 | 215 | ||
216 | #ifdef CONFIG_DEBUG_PAGEALLOC | 216 | #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) |
217 | /* | 217 | /* |
218 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. | 218 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. |
219 | * This will simplify cpa(), which otherwise needs to support splitting | 219 | * This will simplify cpa(), which otherwise needs to support splitting |
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 949708d7a481..3cd7711bb949 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
@@ -111,7 +111,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) | |||
111 | pte_t *page_table = NULL; | 111 | pte_t *page_table = NULL; |
112 | 112 | ||
113 | if (after_bootmem) { | 113 | if (after_bootmem) { |
114 | #ifdef CONFIG_DEBUG_PAGEALLOC | 114 | #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) |
115 | page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); | 115 | page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); |
116 | #endif | 116 | #endif |
117 | if (!page_table) | 117 | if (!page_table) |
@@ -564,7 +564,7 @@ static inline void save_pg_dir(void) | |||
564 | } | 564 | } |
565 | #endif /* !CONFIG_ACPI_SLEEP */ | 565 | #endif /* !CONFIG_ACPI_SLEEP */ |
566 | 566 | ||
567 | void zap_low_mappings(void) | 567 | void zap_low_mappings(bool early) |
568 | { | 568 | { |
569 | int i; | 569 | int i; |
570 | 570 | ||
@@ -581,7 +581,11 @@ void zap_low_mappings(void) | |||
581 | set_pgd(swapper_pg_dir+i, __pgd(0)); | 581 | set_pgd(swapper_pg_dir+i, __pgd(0)); |
582 | #endif | 582 | #endif |
583 | } | 583 | } |
584 | flush_tlb_all(); | 584 | |
585 | if (early) | ||
586 | __flush_tlb(); | ||
587 | else | ||
588 | flush_tlb_all(); | ||
585 | } | 589 | } |
586 | 590 | ||
587 | pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); | 591 | pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); |
@@ -956,7 +960,7 @@ void __init mem_init(void) | |||
956 | test_wp_bit(); | 960 | test_wp_bit(); |
957 | 961 | ||
958 | save_pg_dir(); | 962 | save_pg_dir(); |
959 | zap_low_mappings(); | 963 | zap_low_mappings(true); |
960 | } | 964 | } |
961 | 965 | ||
962 | #ifdef CONFIG_MEMORY_HOTPLUG | 966 | #ifdef CONFIG_MEMORY_HOTPLUG |
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 52bb9519bb86..9c543290a813 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -104,7 +104,7 @@ static __ref void *spp_getpage(void) | |||
104 | void *ptr; | 104 | void *ptr; |
105 | 105 | ||
106 | if (after_bootmem) | 106 | if (after_bootmem) |
107 | ptr = (void *) get_zeroed_page(GFP_ATOMIC); | 107 | ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); |
108 | else | 108 | else |
109 | ptr = alloc_bootmem_pages(PAGE_SIZE); | 109 | ptr = alloc_bootmem_pages(PAGE_SIZE); |
110 | 110 | ||
@@ -281,7 +281,7 @@ static __ref void *alloc_low_page(unsigned long *phys) | |||
281 | void *adr; | 281 | void *adr; |
282 | 282 | ||
283 | if (after_bootmem) { | 283 | if (after_bootmem) { |
284 | adr = (void *)get_zeroed_page(GFP_ATOMIC); | 284 | adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); |
285 | *phys = __pa(adr); | 285 | *phys = __pa(adr); |
286 | 286 | ||
287 | return adr; | 287 | return adr; |
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile new file mode 100644 index 000000000000..520b3bce4095 --- /dev/null +++ b/arch/x86/mm/kmemcheck/Makefile | |||
@@ -0,0 +1 @@ | |||
1 | obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o | ||
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c new file mode 100644 index 000000000000..4901d0dafda6 --- /dev/null +++ b/arch/x86/mm/kmemcheck/error.c | |||
@@ -0,0 +1,228 @@ | |||
1 | #include <linux/interrupt.h> | ||
2 | #include <linux/kdebug.h> | ||
3 | #include <linux/kmemcheck.h> | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/types.h> | ||
6 | #include <linux/ptrace.h> | ||
7 | #include <linux/stacktrace.h> | ||
8 | #include <linux/string.h> | ||
9 | |||
10 | #include "error.h" | ||
11 | #include "shadow.h" | ||
12 | |||
13 | enum kmemcheck_error_type { | ||
14 | KMEMCHECK_ERROR_INVALID_ACCESS, | ||
15 | KMEMCHECK_ERROR_BUG, | ||
16 | }; | ||
17 | |||
18 | #define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT) | ||
19 | |||
20 | struct kmemcheck_error { | ||
21 | enum kmemcheck_error_type type; | ||
22 | |||
23 | union { | ||
24 | /* KMEMCHECK_ERROR_INVALID_ACCESS */ | ||
25 | struct { | ||
26 | /* Kind of access that caused the error */ | ||
27 | enum kmemcheck_shadow state; | ||
28 | /* Address and size of the erroneous read */ | ||
29 | unsigned long address; | ||
30 | unsigned int size; | ||
31 | }; | ||
32 | }; | ||
33 | |||
34 | struct pt_regs regs; | ||
35 | struct stack_trace trace; | ||
36 | unsigned long trace_entries[32]; | ||
37 | |||
38 | /* We compress it to a char. */ | ||
39 | unsigned char shadow_copy[SHADOW_COPY_SIZE]; | ||
40 | unsigned char memory_copy[SHADOW_COPY_SIZE]; | ||
41 | }; | ||
42 | |||
43 | /* | ||
44 | * Create a ring queue of errors to output. We can't call printk() directly | ||
45 | * from the kmemcheck traps, since this may call the console drivers and | ||
46 | * result in a recursive fault. | ||
47 | */ | ||
48 | static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE]; | ||
49 | static unsigned int error_count; | ||
50 | static unsigned int error_rd; | ||
51 | static unsigned int error_wr; | ||
52 | static unsigned int error_missed_count; | ||
53 | |||
54 | static struct kmemcheck_error *error_next_wr(void) | ||
55 | { | ||
56 | struct kmemcheck_error *e; | ||
57 | |||
58 | if (error_count == ARRAY_SIZE(error_fifo)) { | ||
59 | ++error_missed_count; | ||
60 | return NULL; | ||
61 | } | ||
62 | |||
63 | e = &error_fifo[error_wr]; | ||
64 | if (++error_wr == ARRAY_SIZE(error_fifo)) | ||
65 | error_wr = 0; | ||
66 | ++error_count; | ||
67 | return e; | ||
68 | } | ||
69 | |||
70 | static struct kmemcheck_error *error_next_rd(void) | ||
71 | { | ||
72 | struct kmemcheck_error *e; | ||
73 | |||
74 | if (error_count == 0) | ||
75 | return NULL; | ||
76 | |||
77 | e = &error_fifo[error_rd]; | ||
78 | if (++error_rd == ARRAY_SIZE(error_fifo)) | ||
79 | error_rd = 0; | ||
80 | --error_count; | ||
81 | return e; | ||
82 | } | ||
83 | |||
84 | void kmemcheck_error_recall(void) | ||
85 | { | ||
86 | static const char *desc[] = { | ||
87 | [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated", | ||
88 | [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized", | ||
89 | [KMEMCHECK_SHADOW_INITIALIZED] = "initialized", | ||
90 | [KMEMCHECK_SHADOW_FREED] = "freed", | ||
91 | }; | ||
92 | |||
93 | static const char short_desc[] = { | ||
94 | [KMEMCHECK_SHADOW_UNALLOCATED] = 'a', | ||
95 | [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u', | ||
96 | [KMEMCHECK_SHADOW_INITIALIZED] = 'i', | ||
97 | [KMEMCHECK_SHADOW_FREED] = 'f', | ||
98 | }; | ||
99 | |||
100 | struct kmemcheck_error *e; | ||
101 | unsigned int i; | ||
102 | |||
103 | e = error_next_rd(); | ||
104 | if (!e) | ||
105 | return; | ||
106 | |||
107 | switch (e->type) { | ||
108 | case KMEMCHECK_ERROR_INVALID_ACCESS: | ||
109 | printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read " | ||
110 | "from %s memory (%p)\n", | ||
111 | 8 * e->size, e->state < ARRAY_SIZE(desc) ? | ||
112 | desc[e->state] : "(invalid shadow state)", | ||
113 | (void *) e->address); | ||
114 | |||
115 | printk(KERN_INFO); | ||
116 | for (i = 0; i < SHADOW_COPY_SIZE; ++i) | ||
117 | printk("%02x", e->memory_copy[i]); | ||
118 | printk("\n"); | ||
119 | |||
120 | printk(KERN_INFO); | ||
121 | for (i = 0; i < SHADOW_COPY_SIZE; ++i) { | ||
122 | if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) | ||
123 | printk(" %c", short_desc[e->shadow_copy[i]]); | ||
124 | else | ||
125 | printk(" ?"); | ||
126 | } | ||
127 | printk("\n"); | ||
128 | printk(KERN_INFO "%*c\n", 2 + 2 | ||
129 | * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); | ||
130 | break; | ||
131 | case KMEMCHECK_ERROR_BUG: | ||
132 | printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n"); | ||
133 | break; | ||
134 | } | ||
135 | |||
136 | __show_regs(&e->regs, 1); | ||
137 | print_stack_trace(&e->trace, 0); | ||
138 | } | ||
139 | |||
140 | static void do_wakeup(unsigned long data) | ||
141 | { | ||
142 | while (error_count > 0) | ||
143 | kmemcheck_error_recall(); | ||
144 | |||
145 | if (error_missed_count > 0) { | ||
146 | printk(KERN_WARNING "kmemcheck: Lost %d error reports because " | ||
147 | "the queue was too small\n", error_missed_count); | ||
148 | error_missed_count = 0; | ||
149 | } | ||
150 | } | ||
151 | |||
152 | static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0); | ||
153 | |||
154 | /* | ||
155 | * Save the context of an error report. | ||
156 | */ | ||
157 | void kmemcheck_error_save(enum kmemcheck_shadow state, | ||
158 | unsigned long address, unsigned int size, struct pt_regs *regs) | ||
159 | { | ||
160 | static unsigned long prev_ip; | ||
161 | |||
162 | struct kmemcheck_error *e; | ||
163 | void *shadow_copy; | ||
164 | void *memory_copy; | ||
165 | |||
166 | /* Don't report several adjacent errors from the same EIP. */ | ||
167 | if (regs->ip == prev_ip) | ||
168 | return; | ||
169 | prev_ip = regs->ip; | ||
170 | |||
171 | e = error_next_wr(); | ||
172 | if (!e) | ||
173 | return; | ||
174 | |||
175 | e->type = KMEMCHECK_ERROR_INVALID_ACCESS; | ||
176 | |||
177 | e->state = state; | ||
178 | e->address = address; | ||
179 | e->size = size; | ||
180 | |||
181 | /* Save regs */ | ||
182 | memcpy(&e->regs, regs, sizeof(*regs)); | ||
183 | |||
184 | /* Save stack trace */ | ||
185 | e->trace.nr_entries = 0; | ||
186 | e->trace.entries = e->trace_entries; | ||
187 | e->trace.max_entries = ARRAY_SIZE(e->trace_entries); | ||
188 | e->trace.skip = 0; | ||
189 | save_stack_trace_bp(&e->trace, regs->bp); | ||
190 | |||
191 | /* Round address down to nearest 16 bytes */ | ||
192 | shadow_copy = kmemcheck_shadow_lookup(address | ||
193 | & ~(SHADOW_COPY_SIZE - 1)); | ||
194 | BUG_ON(!shadow_copy); | ||
195 | |||
196 | memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE); | ||
197 | |||
198 | kmemcheck_show_addr(address); | ||
199 | memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1)); | ||
200 | memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE); | ||
201 | kmemcheck_hide_addr(address); | ||
202 | |||
203 | tasklet_hi_schedule_first(&kmemcheck_tasklet); | ||
204 | } | ||
205 | |||
206 | /* | ||
207 | * Save the context of a kmemcheck bug. | ||
208 | */ | ||
209 | void kmemcheck_error_save_bug(struct pt_regs *regs) | ||
210 | { | ||
211 | struct kmemcheck_error *e; | ||
212 | |||
213 | e = error_next_wr(); | ||
214 | if (!e) | ||
215 | return; | ||
216 | |||
217 | e->type = KMEMCHECK_ERROR_BUG; | ||
218 | |||
219 | memcpy(&e->regs, regs, sizeof(*regs)); | ||
220 | |||
221 | e->trace.nr_entries = 0; | ||
222 | e->trace.entries = e->trace_entries; | ||
223 | e->trace.max_entries = ARRAY_SIZE(e->trace_entries); | ||
224 | e->trace.skip = 1; | ||
225 | save_stack_trace(&e->trace); | ||
226 | |||
227 | tasklet_hi_schedule_first(&kmemcheck_tasklet); | ||
228 | } | ||
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h new file mode 100644 index 000000000000..0efc2e8d0a20 --- /dev/null +++ b/arch/x86/mm/kmemcheck/error.h | |||
@@ -0,0 +1,15 @@ | |||
1 | #ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H | ||
2 | #define ARCH__X86__MM__KMEMCHECK__ERROR_H | ||
3 | |||
4 | #include <linux/ptrace.h> | ||
5 | |||
6 | #include "shadow.h" | ||
7 | |||
8 | void kmemcheck_error_save(enum kmemcheck_shadow state, | ||
9 | unsigned long address, unsigned int size, struct pt_regs *regs); | ||
10 | |||
11 | void kmemcheck_error_save_bug(struct pt_regs *regs); | ||
12 | |||
13 | void kmemcheck_error_recall(void); | ||
14 | |||
15 | #endif | ||
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c new file mode 100644 index 000000000000..2c55ed098654 --- /dev/null +++ b/arch/x86/mm/kmemcheck/kmemcheck.c | |||
@@ -0,0 +1,640 @@ | |||
1 | /** | ||
2 | * kmemcheck - a heavyweight memory checker for the linux kernel | ||
3 | * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no> | ||
4 | * (With a lot of help from Ingo Molnar and Pekka Enberg.) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License (version 2) as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/init.h> | ||
12 | #include <linux/interrupt.h> | ||
13 | #include <linux/kallsyms.h> | ||
14 | #include <linux/kernel.h> | ||
15 | #include <linux/kmemcheck.h> | ||
16 | #include <linux/mm.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/page-flags.h> | ||
19 | #include <linux/percpu.h> | ||
20 | #include <linux/ptrace.h> | ||
21 | #include <linux/string.h> | ||
22 | #include <linux/types.h> | ||
23 | |||
24 | #include <asm/cacheflush.h> | ||
25 | #include <asm/kmemcheck.h> | ||
26 | #include <asm/pgtable.h> | ||
27 | #include <asm/tlbflush.h> | ||
28 | |||
29 | #include "error.h" | ||
30 | #include "opcode.h" | ||
31 | #include "pte.h" | ||
32 | #include "selftest.h" | ||
33 | #include "shadow.h" | ||
34 | |||
35 | |||
36 | #ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT | ||
37 | # define KMEMCHECK_ENABLED 0 | ||
38 | #endif | ||
39 | |||
40 | #ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT | ||
41 | # define KMEMCHECK_ENABLED 1 | ||
42 | #endif | ||
43 | |||
44 | #ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT | ||
45 | # define KMEMCHECK_ENABLED 2 | ||
46 | #endif | ||
47 | |||
48 | int kmemcheck_enabled = KMEMCHECK_ENABLED; | ||
49 | |||
50 | int __init kmemcheck_init(void) | ||
51 | { | ||
52 | #ifdef CONFIG_SMP | ||
53 | /* | ||
54 | * Limit SMP to use a single CPU. We rely on the fact that this code | ||
55 | * runs before SMP is set up. | ||
56 | */ | ||
57 | if (setup_max_cpus > 1) { | ||
58 | printk(KERN_INFO | ||
59 | "kmemcheck: Limiting number of CPUs to 1.\n"); | ||
60 | setup_max_cpus = 1; | ||
61 | } | ||
62 | #endif | ||
63 | |||
64 | if (!kmemcheck_selftest()) { | ||
65 | printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n"); | ||
66 | kmemcheck_enabled = 0; | ||
67 | return -EINVAL; | ||
68 | } | ||
69 | |||
70 | printk(KERN_INFO "kmemcheck: Initialized\n"); | ||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | early_initcall(kmemcheck_init); | ||
75 | |||
76 | /* | ||
77 | * We need to parse the kmemcheck= option before any memory is allocated. | ||
78 | */ | ||
79 | static int __init param_kmemcheck(char *str) | ||
80 | { | ||
81 | if (!str) | ||
82 | return -EINVAL; | ||
83 | |||
84 | sscanf(str, "%d", &kmemcheck_enabled); | ||
85 | return 0; | ||
86 | } | ||
87 | |||
88 | early_param("kmemcheck", param_kmemcheck); | ||
89 | |||
90 | int kmemcheck_show_addr(unsigned long address) | ||
91 | { | ||
92 | pte_t *pte; | ||
93 | |||
94 | pte = kmemcheck_pte_lookup(address); | ||
95 | if (!pte) | ||
96 | return 0; | ||
97 | |||
98 | set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); | ||
99 | __flush_tlb_one(address); | ||
100 | return 1; | ||
101 | } | ||
102 | |||
103 | int kmemcheck_hide_addr(unsigned long address) | ||
104 | { | ||
105 | pte_t *pte; | ||
106 | |||
107 | pte = kmemcheck_pte_lookup(address); | ||
108 | if (!pte) | ||
109 | return 0; | ||
110 | |||
111 | set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); | ||
112 | __flush_tlb_one(address); | ||
113 | return 1; | ||
114 | } | ||
115 | |||
116 | struct kmemcheck_context { | ||
117 | bool busy; | ||
118 | int balance; | ||
119 | |||
120 | /* | ||
121 | * There can be at most two memory operands to an instruction, but | ||
122 | * each address can cross a page boundary -- so we may need up to | ||
123 | * four addresses that must be hidden/revealed for each fault. | ||
124 | */ | ||
125 | unsigned long addr[4]; | ||
126 | unsigned long n_addrs; | ||
127 | unsigned long flags; | ||
128 | |||
129 | /* Data size of the instruction that caused a fault. */ | ||
130 | unsigned int size; | ||
131 | }; | ||
132 | |||
133 | static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context); | ||
134 | |||
135 | bool kmemcheck_active(struct pt_regs *regs) | ||
136 | { | ||
137 | struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); | ||
138 | |||
139 | return data->balance > 0; | ||
140 | } | ||
141 | |||
142 | /* Save an address that needs to be shown/hidden */ | ||
143 | static void kmemcheck_save_addr(unsigned long addr) | ||
144 | { | ||
145 | struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); | ||
146 | |||
147 | BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr)); | ||
148 | data->addr[data->n_addrs++] = addr; | ||
149 | } | ||
150 | |||
151 | static unsigned int kmemcheck_show_all(void) | ||
152 | { | ||
153 | struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); | ||
154 | unsigned int i; | ||
155 | unsigned int n; | ||
156 | |||
157 | n = 0; | ||
158 | for (i = 0; i < data->n_addrs; ++i) | ||
159 | n += kmemcheck_show_addr(data->addr[i]); | ||
160 | |||
161 | return n; | ||
162 | } | ||
163 | |||
164 | static unsigned int kmemcheck_hide_all(void) | ||
165 | { | ||
166 | struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); | ||
167 | unsigned int i; | ||
168 | unsigned int n; | ||
169 | |||
170 | n = 0; | ||
171 | for (i = 0; i < data->n_addrs; ++i) | ||
172 | n += kmemcheck_hide_addr(data->addr[i]); | ||
173 | |||
174 | return n; | ||
175 | } | ||
176 | |||
177 | /* | ||
178 | * Called from the #PF handler. | ||
179 | */ | ||
180 | void kmemcheck_show(struct pt_regs *regs) | ||
181 | { | ||
182 | struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); | ||
183 | |||
184 | BUG_ON(!irqs_disabled()); | ||
185 | |||
186 | if (unlikely(data->balance != 0)) { | ||
187 | kmemcheck_show_all(); | ||
188 | kmemcheck_error_save_bug(regs); | ||
189 | data->balance = 0; | ||
190 | return; | ||
191 | } | ||
192 | |||
193 | /* | ||
194 | * None of the addresses actually belonged to kmemcheck. Note that | ||
195 | * this is not an error. | ||
196 | */ | ||
197 | if (kmemcheck_show_all() == 0) | ||
198 | return; | ||
199 | |||
200 | ++data->balance; | ||
201 | |||
202 | /* | ||
203 | * The IF needs to be cleared as well, so that the faulting | ||
204 | * instruction can run "uninterrupted". Otherwise, we might take | ||
205 | * an interrupt and start executing that before we've had a chance | ||
206 | * to hide the page again. | ||
207 | * | ||
208 | * NOTE: In the rare case of multiple faults, we must not override | ||
209 | * the original flags: | ||
210 | */ | ||
211 | if (!(regs->flags & X86_EFLAGS_TF)) | ||
212 | data->flags = regs->flags; | ||
213 | |||
214 | regs->flags |= X86_EFLAGS_TF; | ||
215 | regs->flags &= ~X86_EFLAGS_IF; | ||
216 | } | ||
217 | |||
218 | /* | ||
219 | * Called from the #DB handler. | ||
220 | */ | ||
221 | void kmemcheck_hide(struct pt_regs *regs) | ||
222 | { | ||
223 | struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); | ||
224 | int n; | ||
225 | |||
226 | BUG_ON(!irqs_disabled()); | ||
227 | |||
228 | if (data->balance == 0) | ||
229 | return; | ||
230 | |||
231 | if (unlikely(data->balance != 1)) { | ||
232 | kmemcheck_show_all(); | ||
233 | kmemcheck_error_save_bug(regs); | ||
234 | data->n_addrs = 0; | ||
235 | data->balance = 0; | ||
236 | |||
237 | if (!(data->flags & X86_EFLAGS_TF)) | ||
238 | regs->flags &= ~X86_EFLAGS_TF; | ||
239 | if (data->flags & X86_EFLAGS_IF) | ||
240 | regs->flags |= X86_EFLAGS_IF; | ||
241 | return; | ||
242 | } | ||
243 | |||
244 | if (kmemcheck_enabled) | ||
245 | n = kmemcheck_hide_all(); | ||
246 | else | ||
247 | n = kmemcheck_show_all(); | ||
248 | |||
249 | if (n == 0) | ||
250 | return; | ||
251 | |||
252 | --data->balance; | ||
253 | |||
254 | data->n_addrs = 0; | ||
255 | |||
256 | if (!(data->flags & X86_EFLAGS_TF)) | ||
257 | regs->flags &= ~X86_EFLAGS_TF; | ||
258 | if (data->flags & X86_EFLAGS_IF) | ||
259 | regs->flags |= X86_EFLAGS_IF; | ||
260 | } | ||
261 | |||
262 | void kmemcheck_show_pages(struct page *p, unsigned int n) | ||
263 | { | ||
264 | unsigned int i; | ||
265 | |||
266 | for (i = 0; i < n; ++i) { | ||
267 | unsigned long address; | ||
268 | pte_t *pte; | ||
269 | unsigned int level; | ||
270 | |||
271 | address = (unsigned long) page_address(&p[i]); | ||
272 | pte = lookup_address(address, &level); | ||
273 | BUG_ON(!pte); | ||
274 | BUG_ON(level != PG_LEVEL_4K); | ||
275 | |||
276 | set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); | ||
277 | set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN)); | ||
278 | __flush_tlb_one(address); | ||
279 | } | ||
280 | } | ||
281 | |||
282 | bool kmemcheck_page_is_tracked(struct page *p) | ||
283 | { | ||
284 | /* This will also check the "hidden" flag of the PTE. */ | ||
285 | return kmemcheck_pte_lookup((unsigned long) page_address(p)); | ||
286 | } | ||
287 | |||
288 | void kmemcheck_hide_pages(struct page *p, unsigned int n) | ||
289 | { | ||
290 | unsigned int i; | ||
291 | |||
292 | for (i = 0; i < n; ++i) { | ||
293 | unsigned long address; | ||
294 | pte_t *pte; | ||
295 | unsigned int level; | ||
296 | |||
297 | address = (unsigned long) page_address(&p[i]); | ||
298 | pte = lookup_address(address, &level); | ||
299 | BUG_ON(!pte); | ||
300 | BUG_ON(level != PG_LEVEL_4K); | ||
301 | |||
302 | set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); | ||
303 | set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN)); | ||
304 | __flush_tlb_one(address); | ||
305 | } | ||
306 | } | ||
307 | |||
308 | /* Access may NOT cross page boundary */ | ||
309 | static void kmemcheck_read_strict(struct pt_regs *regs, | ||
310 | unsigned long addr, unsigned int size) | ||
311 | { | ||
312 | void *shadow; | ||
313 | enum kmemcheck_shadow status; | ||
314 | |||
315 | shadow = kmemcheck_shadow_lookup(addr); | ||
316 | if (!shadow) | ||
317 | return; | ||
318 | |||
319 | kmemcheck_save_addr(addr); | ||
320 | status = kmemcheck_shadow_test(shadow, size); | ||
321 | if (status == KMEMCHECK_SHADOW_INITIALIZED) | ||
322 | return; | ||
323 | |||
324 | if (kmemcheck_enabled) | ||
325 | kmemcheck_error_save(status, addr, size, regs); | ||
326 | |||
327 | if (kmemcheck_enabled == 2) | ||
328 | kmemcheck_enabled = 0; | ||
329 | |||
330 | /* Don't warn about it again. */ | ||
331 | kmemcheck_shadow_set(shadow, size); | ||
332 | } | ||
333 | |||
334 | /* Access may cross page boundary */ | ||
335 | static void kmemcheck_read(struct pt_regs *regs, | ||
336 | unsigned long addr, unsigned int size) | ||
337 | { | ||
338 | unsigned long page = addr & PAGE_MASK; | ||
339 | unsigned long next_addr = addr + size - 1; | ||
340 | unsigned long next_page = next_addr & PAGE_MASK; | ||
341 | |||
342 | if (likely(page == next_page)) { | ||
343 | kmemcheck_read_strict(regs, addr, size); | ||
344 | return; | ||
345 | } | ||
346 | |||
347 | /* | ||
348 | * What we do is basically to split the access across the | ||
349 | * two pages and handle each part separately. Yes, this means | ||
350 | * that we may now see reads that are 3 + 5 bytes, for | ||
351 | * example (and if both are uninitialized, there will be two | ||
352 | * reports), but it makes the code a lot simpler. | ||
353 | */ | ||
354 | kmemcheck_read_strict(regs, addr, next_page - addr); | ||
355 | kmemcheck_read_strict(regs, next_page, next_addr - next_page); | ||
356 | } | ||
357 | |||
358 | static void kmemcheck_write_strict(struct pt_regs *regs, | ||
359 | unsigned long addr, unsigned int size) | ||
360 | { | ||
361 | void *shadow; | ||
362 | |||
363 | shadow = kmemcheck_shadow_lookup(addr); | ||
364 | if (!shadow) | ||
365 | return; | ||
366 | |||
367 | kmemcheck_save_addr(addr); | ||
368 | kmemcheck_shadow_set(shadow, size); | ||
369 | } | ||
370 | |||
371 | static void kmemcheck_write(struct pt_regs *regs, | ||
372 | unsigned long addr, unsigned int size) | ||
373 | { | ||
374 | unsigned long page = addr & PAGE_MASK; | ||
375 | unsigned long next_addr = addr + size - 1; | ||
376 | unsigned long next_page = next_addr & PAGE_MASK; | ||
377 | |||
378 | if (likely(page == next_page)) { | ||
379 | kmemcheck_write_strict(regs, addr, size); | ||
380 | return; | ||
381 | } | ||
382 | |||
383 | /* See comment in kmemcheck_read(). */ | ||
384 | kmemcheck_write_strict(regs, addr, next_page - addr); | ||
385 | kmemcheck_write_strict(regs, next_page, next_addr - next_page); | ||
386 | } | ||
387 | |||
388 | /* | ||
389 | * Copying is hard. We have two addresses, each of which may be split across | ||
390 | * a page (and each page will have different shadow addresses). | ||
391 | */ | ||
392 | static void kmemcheck_copy(struct pt_regs *regs, | ||
393 | unsigned long src_addr, unsigned long dst_addr, unsigned int size) | ||
394 | { | ||
395 | uint8_t shadow[8]; | ||
396 | enum kmemcheck_shadow status; | ||
397 | |||
398 | unsigned long page; | ||
399 | unsigned long next_addr; | ||
400 | unsigned long next_page; | ||
401 | |||
402 | uint8_t *x; | ||
403 | unsigned int i; | ||
404 | unsigned int n; | ||
405 | |||
406 | BUG_ON(size > sizeof(shadow)); | ||
407 | |||
408 | page = src_addr & PAGE_MASK; | ||
409 | next_addr = src_addr + size - 1; | ||
410 | next_page = next_addr & PAGE_MASK; | ||
411 | |||
412 | if (likely(page == next_page)) { | ||
413 | /* Same page */ | ||
414 | x = kmemcheck_shadow_lookup(src_addr); | ||
415 | if (x) { | ||
416 | kmemcheck_save_addr(src_addr); | ||
417 | for (i = 0; i < size; ++i) | ||
418 | shadow[i] = x[i]; | ||
419 | } else { | ||
420 | for (i = 0; i < size; ++i) | ||
421 | shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; | ||
422 | } | ||
423 | } else { | ||
424 | n = next_page - src_addr; | ||
425 | BUG_ON(n > sizeof(shadow)); | ||
426 | |||
427 | /* First page */ | ||
428 | x = kmemcheck_shadow_lookup(src_addr); | ||
429 | if (x) { | ||
430 | kmemcheck_save_addr(src_addr); | ||
431 | for (i = 0; i < n; ++i) | ||
432 | shadow[i] = x[i]; | ||
433 | } else { | ||
434 | /* Not tracked */ | ||
435 | for (i = 0; i < n; ++i) | ||
436 | shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; | ||
437 | } | ||
438 | |||
439 | /* Second page */ | ||
440 | x = kmemcheck_shadow_lookup(next_page); | ||
441 | if (x) { | ||
442 | kmemcheck_save_addr(next_page); | ||
443 | for (i = n; i < size; ++i) | ||
444 | shadow[i] = x[i - n]; | ||
445 | } else { | ||
446 | /* Not tracked */ | ||
447 | for (i = n; i < size; ++i) | ||
448 | shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; | ||
449 | } | ||
450 | } | ||
451 | |||
452 | page = dst_addr & PAGE_MASK; | ||
453 | next_addr = dst_addr + size - 1; | ||
454 | next_page = next_addr & PAGE_MASK; | ||
455 | |||
456 | if (likely(page == next_page)) { | ||
457 | /* Same page */ | ||
458 | x = kmemcheck_shadow_lookup(dst_addr); | ||
459 | if (x) { | ||
460 | kmemcheck_save_addr(dst_addr); | ||
461 | for (i = 0; i < size; ++i) { | ||
462 | x[i] = shadow[i]; | ||
463 | shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; | ||
464 | } | ||
465 | } | ||
466 | } else { | ||
467 | n = next_page - dst_addr; | ||
468 | BUG_ON(n > sizeof(shadow)); | ||
469 | |||
470 | /* First page */ | ||
471 | x = kmemcheck_shadow_lookup(dst_addr); | ||
472 | if (x) { | ||
473 | kmemcheck_save_addr(dst_addr); | ||
474 | for (i = 0; i < n; ++i) { | ||
475 | x[i] = shadow[i]; | ||
476 | shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; | ||
477 | } | ||
478 | } | ||
479 | |||
480 | /* Second page */ | ||
481 | x = kmemcheck_shadow_lookup(next_page); | ||
482 | if (x) { | ||
483 | kmemcheck_save_addr(next_page); | ||
484 | for (i = n; i < size; ++i) { | ||
485 | x[i - n] = shadow[i]; | ||
486 | shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; | ||
487 | } | ||
488 | } | ||
489 | } | ||
490 | |||
491 | status = kmemcheck_shadow_test(shadow, size); | ||
492 | if (status == KMEMCHECK_SHADOW_INITIALIZED) | ||
493 | return; | ||
494 | |||
495 | if (kmemcheck_enabled) | ||
496 | kmemcheck_error_save(status, src_addr, size, regs); | ||
497 | |||
498 | if (kmemcheck_enabled == 2) | ||
499 | kmemcheck_enabled = 0; | ||
500 | } | ||
501 | |||
502 | enum kmemcheck_method { | ||
503 | KMEMCHECK_READ, | ||
504 | KMEMCHECK_WRITE, | ||
505 | }; | ||
506 | |||
507 | static void kmemcheck_access(struct pt_regs *regs, | ||
508 | unsigned long fallback_address, enum kmemcheck_method fallback_method) | ||
509 | { | ||
510 | const uint8_t *insn; | ||
511 | const uint8_t *insn_primary; | ||
512 | unsigned int size; | ||
513 | |||
514 | struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); | ||
515 | |||
516 | /* Recursive fault -- ouch. */ | ||
517 | if (data->busy) { | ||
518 | kmemcheck_show_addr(fallback_address); | ||
519 | kmemcheck_error_save_bug(regs); | ||
520 | return; | ||
521 | } | ||
522 | |||
523 | data->busy = true; | ||
524 | |||
525 | insn = (const uint8_t *) regs->ip; | ||
526 | insn_primary = kmemcheck_opcode_get_primary(insn); | ||
527 | |||
528 | kmemcheck_opcode_decode(insn, &size); | ||
529 | |||
530 | switch (insn_primary[0]) { | ||
531 | #ifdef CONFIG_KMEMCHECK_BITOPS_OK | ||
532 | /* AND, OR, XOR */ | ||
533 | /* | ||
534 | * Unfortunately, these instructions have to be excluded from | ||
535 | * our regular checking since they access only some (and not | ||
536 | * all) bits. This clears out "bogus" bitfield-access warnings. | ||
537 | */ | ||
538 | case 0x80: | ||
539 | case 0x81: | ||
540 | case 0x82: | ||
541 | case 0x83: | ||
542 | switch ((insn_primary[1] >> 3) & 7) { | ||
543 | /* OR */ | ||
544 | case 1: | ||
545 | /* AND */ | ||
546 | case 4: | ||
547 | /* XOR */ | ||
548 | case 6: | ||
549 | kmemcheck_write(regs, fallback_address, size); | ||
550 | goto out; | ||
551 | |||
552 | /* ADD */ | ||
553 | case 0: | ||
554 | /* ADC */ | ||
555 | case 2: | ||
556 | /* SBB */ | ||
557 | case 3: | ||
558 | /* SUB */ | ||
559 | case 5: | ||
560 | /* CMP */ | ||
561 | case 7: | ||
562 | break; | ||
563 | } | ||
564 | break; | ||
565 | #endif | ||
566 | |||
567 | /* MOVS, MOVSB, MOVSW, MOVSD */ | ||
568 | case 0xa4: | ||
569 | case 0xa5: | ||
570 | /* | ||
571 | * These instructions are special because they take two | ||
572 | * addresses, but we only get one page fault. | ||
573 | */ | ||
574 | kmemcheck_copy(regs, regs->si, regs->di, size); | ||
575 | goto out; | ||
576 | |||
577 | /* CMPS, CMPSB, CMPSW, CMPSD */ | ||
578 | case 0xa6: | ||
579 | case 0xa7: | ||
580 | kmemcheck_read(regs, regs->si, size); | ||
581 | kmemcheck_read(regs, regs->di, size); | ||
582 | goto out; | ||
583 | } | ||
584 | |||
585 | /* | ||
586 | * If the opcode isn't special in any way, we use the data from the | ||
587 | * page fault handler to determine the address and type of memory | ||
588 | * access. | ||
589 | */ | ||
590 | switch (fallback_method) { | ||
591 | case KMEMCHECK_READ: | ||
592 | kmemcheck_read(regs, fallback_address, size); | ||
593 | goto out; | ||
594 | case KMEMCHECK_WRITE: | ||
595 | kmemcheck_write(regs, fallback_address, size); | ||
596 | goto out; | ||
597 | } | ||
598 | |||
599 | out: | ||
600 | data->busy = false; | ||
601 | } | ||
602 | |||
603 | bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, | ||
604 | unsigned long error_code) | ||
605 | { | ||
606 | pte_t *pte; | ||
607 | |||
608 | /* | ||
609 | * XXX: Is it safe to assume that memory accesses from virtual 86 | ||
610 | * mode or non-kernel code segments will _never_ access kernel | ||
611 | * memory (e.g. tracked pages)? For now, we need this to avoid | ||
612 | * invoking kmemcheck for PnP BIOS calls. | ||
613 | */ | ||
614 | if (regs->flags & X86_VM_MASK) | ||
615 | return false; | ||
616 | if (regs->cs != __KERNEL_CS) | ||
617 | return false; | ||
618 | |||
619 | pte = kmemcheck_pte_lookup(address); | ||
620 | if (!pte) | ||
621 | return false; | ||
622 | |||
623 | if (error_code & 2) | ||
624 | kmemcheck_access(regs, address, KMEMCHECK_WRITE); | ||
625 | else | ||
626 | kmemcheck_access(regs, address, KMEMCHECK_READ); | ||
627 | |||
628 | kmemcheck_show(regs); | ||
629 | return true; | ||
630 | } | ||
631 | |||
632 | bool kmemcheck_trap(struct pt_regs *regs) | ||
633 | { | ||
634 | if (!kmemcheck_active(regs)) | ||
635 | return false; | ||
636 | |||
637 | /* We're done. */ | ||
638 | kmemcheck_hide(regs); | ||
639 | return true; | ||
640 | } | ||
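Stepping back from the individual files, the following hedged sketch (not code from the patch) shows how these hooks are meant to be wired into the trap handlers. The page-fault side mirrors the fault.c hunk earlier in this diff; the debug-exception side is an assumption based on the "Called from the #DB handler" comment above kmemcheck_hide(). The sketch_* function names are hypothetical, and the declarations are assumed to live in the kmemcheck headers added by this series.

    #include <linux/ptrace.h>
    #include <asm/kmemcheck.h>      /* kmemcheck_fault(), kmemcheck_trap() */

    /* Simplified stand-in for the do_page_fault() path. In fault.c this
     * is reached only for kernel-space faults where
     * (error_code & (PF_RSVD | PF_USER | PF_PROT)) == 0. */
    static void sketch_page_fault(struct pt_regs *regs,
                                  unsigned long error_code,
                                  unsigned long address)
    {
            /* kmemcheck_fault() decodes the instruction, checks or updates
             * the shadow bytes, re-maps the hidden page and arms
             * single-stepping by setting X86_EFLAGS_TF. */
            if (kmemcheck_fault(regs, address, error_code))
                    return;

            /* ... normal page-fault handling continues here ... */
    }

    /* Simplified stand-in for the do_debug() path. */
    static void sketch_debug_trap(struct pt_regs *regs)
    {
            /* After the faulting instruction has been single-stepped,
             * kmemcheck_trap() hides the page again and restores the
             * saved TF/IF flags, so the next access traps too. */
            if (kmemcheck_trap(regs))
                    return;

            /* ... normal #DB handling continues here ... */
    }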
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c new file mode 100644 index 000000000000..63c19e27aa6f --- /dev/null +++ b/arch/x86/mm/kmemcheck/opcode.c | |||
@@ -0,0 +1,106 @@ | |||
1 | #include <linux/types.h> | ||
2 | |||
3 | #include "opcode.h" | ||
4 | |||
5 | static bool opcode_is_prefix(uint8_t b) | ||
6 | { | ||
7 | return | ||
8 | /* Group 1 */ | ||
9 | b == 0xf0 || b == 0xf2 || b == 0xf3 | ||
10 | /* Group 2 */ | ||
11 | || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 | ||
12 | || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e | ||
13 | /* Group 3 */ | ||
14 | || b == 0x66 | ||
15 | /* Group 4 */ | ||
16 | || b == 0x67; | ||
17 | } | ||
18 | |||
19 | #ifdef CONFIG_X86_64 | ||
20 | static bool opcode_is_rex_prefix(uint8_t b) | ||
21 | { | ||
22 | return (b & 0xf0) == 0x40; | ||
23 | } | ||
24 | #else | ||
25 | static bool opcode_is_rex_prefix(uint8_t b) | ||
26 | { | ||
27 | return false; | ||
28 | } | ||
29 | #endif | ||
30 | |||
31 | #define REX_W (1 << 3) | ||
32 | |||
33 | /* | ||
34 | * This is a VERY crude opcode decoder. We only need to find the size of the | ||
35 | * load/store that caused our #PF and this should work for all the opcodes | ||
36 | * that we care about. Moreover, the ones who invented this instruction set | ||
37 | * should be shot. | ||
38 | */ | ||
39 | void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size) | ||
40 | { | ||
41 | /* Default operand size */ | ||
42 | int operand_size_override = 4; | ||
43 | |||
44 | /* prefixes */ | ||
45 | for (; opcode_is_prefix(*op); ++op) { | ||
46 | if (*op == 0x66) | ||
47 | operand_size_override = 2; | ||
48 | } | ||
49 | |||
50 | /* REX prefix */ | ||
51 | if (opcode_is_rex_prefix(*op)) { | ||
52 | uint8_t rex = *op; | ||
53 | |||
54 | ++op; | ||
55 | if (rex & REX_W) { | ||
56 | switch (*op) { | ||
57 | case 0x63: | ||
58 | *size = 4; | ||
59 | return; | ||
60 | case 0x0f: | ||
61 | ++op; | ||
62 | |||
63 | switch (*op) { | ||
64 | case 0xb6: | ||
65 | case 0xbe: | ||
66 | *size = 1; | ||
67 | return; | ||
68 | case 0xb7: | ||
69 | case 0xbf: | ||
70 | *size = 2; | ||
71 | return; | ||
72 | } | ||
73 | |||
74 | break; | ||
75 | } | ||
76 | |||
77 | *size = 8; | ||
78 | return; | ||
79 | } | ||
80 | } | ||
81 | |||
82 | /* escape opcode */ | ||
83 | if (*op == 0x0f) { | ||
84 | ++op; | ||
85 | |||
86 | /* | ||
87 | * This is move with zero-extend and sign-extend, respectively; | ||
88 | * we don't have to think about 0xb6/0xbe, because this is | ||
89 | * already handled in the conditional below. | ||
90 | */ | ||
91 | if (*op == 0xb7 || *op == 0xbf) | ||
92 | operand_size_override = 2; | ||
93 | } | ||
94 | |||
95 | *size = (*op & 1) ? operand_size_override : 1; | ||
96 | } | ||
97 | |||
98 | const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op) | ||
99 | { | ||
100 | /* skip prefixes */ | ||
101 | while (opcode_is_prefix(*op)) | ||
102 | ++op; | ||
103 | if (opcode_is_rex_prefix(*op)) | ||
104 | ++op; | ||
105 | return op; | ||
106 | } | ||
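A quick illustration of the decoder, mirrored more systematically by selftest.c below: a plain 32-bit store such as mov %eax,(%ebx) (opcode 0x89) decodes to an operand size of 4, while the byte variant 0x88 decodes to 1. The wrapper function opcode_decode_example() is hypothetical and exists only for this sketch.

    #include <linux/types.h>

    #include "opcode.h"

    /* Sketch: exercise the crude decoder on two hand-assembled stores. */
    static void opcode_decode_example(void)
    {
            const uint8_t mov_long[] = { 0x89, 0x03 };      /* mov %eax,(%ebx) */
            const uint8_t mov_byte[] = { 0x88, 0x03 };      /* mov %al,(%ebx)  */
            unsigned int size;

            kmemcheck_opcode_decode(mov_long, &size);       /* size == 4 */
            kmemcheck_opcode_decode(mov_byte, &size);       /* size == 1 */
    }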
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h new file mode 100644 index 000000000000..6956aad66b5b --- /dev/null +++ b/arch/x86/mm/kmemcheck/opcode.h | |||
@@ -0,0 +1,9 @@ | |||
1 | #ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H | ||
2 | #define ARCH__X86__MM__KMEMCHECK__OPCODE_H | ||
3 | |||
4 | #include <linux/types.h> | ||
5 | |||
6 | void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size); | ||
7 | const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op); | ||
8 | |||
9 | #endif | ||
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c new file mode 100644 index 000000000000..4ead26eeaf96 --- /dev/null +++ b/arch/x86/mm/kmemcheck/pte.c | |||
@@ -0,0 +1,22 @@ | |||
1 | #include <linux/mm.h> | ||
2 | |||
3 | #include <asm/pgtable.h> | ||
4 | |||
5 | #include "pte.h" | ||
6 | |||
7 | pte_t *kmemcheck_pte_lookup(unsigned long address) | ||
8 | { | ||
9 | pte_t *pte; | ||
10 | unsigned int level; | ||
11 | |||
12 | pte = lookup_address(address, &level); | ||
13 | if (!pte) | ||
14 | return NULL; | ||
15 | if (level != PG_LEVEL_4K) | ||
16 | return NULL; | ||
17 | if (!pte_hidden(*pte)) | ||
18 | return NULL; | ||
19 | |||
20 | return pte; | ||
21 | } | ||
22 | |||
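lookup_address() walks the live kernel page tables, so the helper above returns a PTE only for addresses that are mapped as 4k pages and have been hidden (made non-present) by kmemcheck. A hedged sketch of the kind of check a fault handler can build on top of it; the wrapper name is illustrative and not part of this patch:

    /* Sketch: does this faulting address belong to a page we track? */
    static bool example_is_tracked(unsigned long address)
    {
            return kmemcheck_pte_lookup(address) != NULL;
    }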
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h new file mode 100644 index 000000000000..9f5966456492 --- /dev/null +++ b/arch/x86/mm/kmemcheck/pte.h | |||
@@ -0,0 +1,10 @@ | |||
1 | #ifndef ARCH__X86__MM__KMEMCHECK__PTE_H | ||
2 | #define ARCH__X86__MM__KMEMCHECK__PTE_H | ||
3 | |||
4 | #include <linux/mm.h> | ||
5 | |||
6 | #include <asm/pgtable.h> | ||
7 | |||
8 | pte_t *kmemcheck_pte_lookup(unsigned long address); | ||
9 | |||
10 | #endif | ||
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c new file mode 100644 index 000000000000..036efbea8b28 --- /dev/null +++ b/arch/x86/mm/kmemcheck/selftest.c | |||
@@ -0,0 +1,69 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | |||
3 | #include "opcode.h" | ||
4 | #include "selftest.h" | ||
5 | |||
6 | struct selftest_opcode { | ||
7 | unsigned int expected_size; | ||
8 | const uint8_t *insn; | ||
9 | const char *desc; | ||
10 | }; | ||
11 | |||
12 | static const struct selftest_opcode selftest_opcodes[] = { | ||
13 | /* REP MOVS */ | ||
14 | {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"}, | ||
15 | {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"}, | ||
16 | |||
17 | /* MOVZX / MOVZXD */ | ||
18 | {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"}, | ||
19 | {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"}, | ||
20 | |||
21 | /* MOVSX / MOVSXD */ | ||
22 | {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"}, | ||
23 | {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"}, | ||
24 | |||
25 | #ifdef CONFIG_X86_64 | ||
26 | /* MOVZX / MOVZXD */ | ||
27 | {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"}, | ||
28 | {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"}, | ||
29 | |||
30 | /* MOVSX / MOVSXD */ | ||
31 | {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"}, | ||
32 | {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"}, | ||
33 | {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"}, | ||
34 | #endif | ||
35 | }; | ||
36 | |||
37 | static bool selftest_opcode_one(const struct selftest_opcode *op) | ||
38 | { | ||
39 | unsigned size; | ||
40 | |||
41 | kmemcheck_opcode_decode(op->insn, &size); | ||
42 | |||
43 | if (size == op->expected_size) | ||
44 | return true; | ||
45 | |||
46 | printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n", | ||
47 | op->desc, op->expected_size, size); | ||
48 | return false; | ||
49 | } | ||
50 | |||
51 | static bool selftest_opcodes_all(void) | ||
52 | { | ||
53 | bool pass = true; | ||
54 | unsigned int i; | ||
55 | |||
56 | for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i) | ||
57 | pass = pass && selftest_opcode_one(&selftest_opcodes[i]); | ||
58 | |||
59 | return pass; | ||
60 | } | ||
61 | |||
62 | bool kmemcheck_selftest(void) | ||
63 | { | ||
64 | bool pass = true; | ||
65 | |||
66 | pass = pass && selftest_opcodes_all(); | ||
67 | |||
68 | return pass; | ||
69 | } | ||
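The selftest is table-driven, so extending coverage is just a matter of adding another entry with its expected size; kmemcheck_selftest() is presumably run once during initialization so tracking can be disabled if the decoder misbehaves. A hypothetical extra entry, not part of this patch, for a plain 32-bit load:

    /* 8b 51 f8 = mov -0x8(%rcx),%edx: no prefixes, default 4-byte size */
    {4, "\x8b\x51\xf8", "movl <mem32>, <reg32>"},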
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h new file mode 100644 index 000000000000..8fed4fe11f95 --- /dev/null +++ b/arch/x86/mm/kmemcheck/selftest.h | |||
@@ -0,0 +1,6 @@ | |||
1 | #ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H | ||
2 | #define ARCH_X86_MM_KMEMCHECK_SELFTEST_H | ||
3 | |||
4 | bool kmemcheck_selftest(void); | ||
5 | |||
6 | #endif | ||
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c new file mode 100644 index 000000000000..e773b6bd0079 --- /dev/null +++ b/arch/x86/mm/kmemcheck/shadow.c | |||
@@ -0,0 +1,162 @@ | |||
1 | #include <linux/kmemcheck.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/mm.h> | ||
4 | #include <linux/module.h> | ||
5 | |||
6 | #include <asm/page.h> | ||
7 | #include <asm/pgtable.h> | ||
8 | |||
9 | #include "pte.h" | ||
10 | #include "shadow.h" | ||
11 | |||
12 | /* | ||
13 | * Return the shadow address for the given address. Returns NULL if the | ||
14 | * address is not tracked. | ||
15 | * | ||
16 | * We need to be extremely careful not to follow any invalid pointers, | ||
17 | * because this function can be called for *any* possible address. | ||
18 | */ | ||
19 | void *kmemcheck_shadow_lookup(unsigned long address) | ||
20 | { | ||
21 | pte_t *pte; | ||
22 | struct page *page; | ||
23 | |||
24 | if (!virt_addr_valid(address)) | ||
25 | return NULL; | ||
26 | |||
27 | pte = kmemcheck_pte_lookup(address); | ||
28 | if (!pte) | ||
29 | return NULL; | ||
30 | |||
31 | page = virt_to_page(address); | ||
32 | if (!page->shadow) | ||
33 | return NULL; | ||
34 | return page->shadow + (address & (PAGE_SIZE - 1)); | ||
35 | } | ||
36 | |||
37 | static void mark_shadow(void *address, unsigned int n, | ||
38 | enum kmemcheck_shadow status) | ||
39 | { | ||
40 | unsigned long addr = (unsigned long) address; | ||
41 | unsigned long last_addr = addr + n - 1; | ||
42 | unsigned long page = addr & PAGE_MASK; | ||
43 | unsigned long last_page = last_addr & PAGE_MASK; | ||
44 | unsigned int first_n; | ||
45 | void *shadow; | ||
46 | |||
47 | /* If the memory range crosses a page boundary, stop there. */ | ||
48 | if (page == last_page) | ||
49 | first_n = n; | ||
50 | else | ||
51 | first_n = page + PAGE_SIZE - addr; | ||
52 | |||
53 | shadow = kmemcheck_shadow_lookup(addr); | ||
54 | if (shadow) | ||
55 | memset(shadow, status, first_n); | ||
56 | |||
57 | addr += first_n; | ||
58 | n -= first_n; | ||
59 | |||
60 | /* Do full-page memset()s. */ | ||
61 | while (n >= PAGE_SIZE) { | ||
62 | shadow = kmemcheck_shadow_lookup(addr); | ||
63 | if (shadow) | ||
64 | memset(shadow, status, PAGE_SIZE); | ||
65 | |||
66 | addr += PAGE_SIZE; | ||
67 | n -= PAGE_SIZE; | ||
68 | } | ||
69 | |||
70 | /* Do the remaining page, if any. */ | ||
71 | if (n > 0) { | ||
72 | shadow = kmemcheck_shadow_lookup(addr); | ||
73 | if (shadow) | ||
74 | memset(shadow, status, n); | ||
75 | } | ||
76 | } | ||
77 | |||
78 | void kmemcheck_mark_unallocated(void *address, unsigned int n) | ||
79 | { | ||
80 | mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED); | ||
81 | } | ||
82 | |||
83 | void kmemcheck_mark_uninitialized(void *address, unsigned int n) | ||
84 | { | ||
85 | mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED); | ||
86 | } | ||
87 | |||
88 | /* | ||
89 | * Fill the shadow memory of the given address such that the memory at that | ||
90 | * address is marked as being initialized. | ||
91 | */ | ||
92 | void kmemcheck_mark_initialized(void *address, unsigned int n) | ||
93 | { | ||
94 | mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED); | ||
95 | } | ||
96 | EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized); | ||
97 | |||
98 | void kmemcheck_mark_freed(void *address, unsigned int n) | ||
99 | { | ||
100 | mark_shadow(address, n, KMEMCHECK_SHADOW_FREED); | ||
101 | } | ||
102 | |||
103 | void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n) | ||
104 | { | ||
105 | unsigned int i; | ||
106 | |||
107 | for (i = 0; i < n; ++i) | ||
108 | kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE); | ||
109 | } | ||
110 | |||
111 | void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n) | ||
112 | { | ||
113 | unsigned int i; | ||
114 | |||
115 | for (i = 0; i < n; ++i) | ||
116 | kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE); | ||
117 | } | ||
118 | |||
119 | void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n) | ||
120 | { | ||
121 | unsigned int i; | ||
122 | |||
123 | for (i = 0; i < n; ++i) | ||
124 | kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE); | ||
125 | } | ||
126 | |||
127 | enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) | ||
128 | { | ||
129 | uint8_t *x; | ||
130 | unsigned int i; | ||
131 | |||
132 | x = shadow; | ||
133 | |||
134 | #ifdef CONFIG_KMEMCHECK_PARTIAL_OK | ||
135 | /* | ||
136 | * Make sure _some_ bytes are initialized. Gcc frequently generates | ||
137 | * code to access neighboring bytes. | ||
138 | */ | ||
139 | for (i = 0; i < size; ++i) { | ||
140 | if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) | ||
141 | return x[i]; | ||
142 | } | ||
143 | #else | ||
144 | /* All bytes must be initialized. */ | ||
145 | for (i = 0; i < size; ++i) { | ||
146 | if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) | ||
147 | return x[i]; | ||
148 | } | ||
149 | #endif | ||
150 | |||
151 | return x[0]; | ||
152 | } | ||
153 | |||
154 | void kmemcheck_shadow_set(void *shadow, unsigned int size) | ||
155 | { | ||
156 | uint8_t *x; | ||
157 | unsigned int i; | ||
158 | |||
159 | x = shadow; | ||
160 | for (i = 0; i < size; ++i) | ||
161 | x[i] = KMEMCHECK_SHADOW_INITIALIZED; | ||
162 | } | ||
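Every tracked data byte has one shadow byte, and the functions above move it through the states defined in shadow.h: allocation marks it uninitialized, a completed store marks it initialized, freeing marks it freed. A minimal sketch of that life cycle, assuming obj points into a page kmemcheck already tracks (the function is illustrative, not part of the patch):

    /* Sketch: shadow state transitions for an 8-byte tracked object. */
    static void example_shadow_cycle(void *obj)
    {
            void *shadow;

            kmemcheck_mark_uninitialized(obj, 8);   /* fresh allocation */

            shadow = kmemcheck_shadow_lookup((unsigned long) obj);
            if (shadow) {
                    /* a load here would report KMEMCHECK_SHADOW_UNINITIALIZED */
                    kmemcheck_shadow_set(shadow, 8);  /* after a store hits obj */
                    /* kmemcheck_shadow_test(shadow, 8) now returns
                     * KMEMCHECK_SHADOW_INITIALIZED */
            }

            kmemcheck_mark_freed(obj, 8);           /* on free */
    }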
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h new file mode 100644 index 000000000000..af46d9ab9d86 --- /dev/null +++ b/arch/x86/mm/kmemcheck/shadow.h | |||
@@ -0,0 +1,16 @@ | |||
1 | #ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H | ||
2 | #define ARCH__X86__MM__KMEMCHECK__SHADOW_H | ||
3 | |||
4 | enum kmemcheck_shadow { | ||
5 | KMEMCHECK_SHADOW_UNALLOCATED, | ||
6 | KMEMCHECK_SHADOW_UNINITIALIZED, | ||
7 | KMEMCHECK_SHADOW_INITIALIZED, | ||
8 | KMEMCHECK_SHADOW_FREED, | ||
9 | }; | ||
10 | |||
11 | void *kmemcheck_shadow_lookup(unsigned long address); | ||
12 | |||
13 | enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); | ||
14 | void kmemcheck_shadow_set(void *shadow, unsigned int size); | ||
15 | |||
16 | #endif | ||
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index c0bedcd10f97..18d244f70205 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c | |||
@@ -40,21 +40,20 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) | |||
40 | 40 | ||
41 | static void __init memtest(u64 pattern, u64 start_phys, u64 size) | 41 | static void __init memtest(u64 pattern, u64 start_phys, u64 size) |
42 | { | 42 | { |
43 | u64 *p; | 43 | u64 *p, *start, *end; |
44 | void *start, *end; | ||
45 | u64 start_bad, last_bad; | 44 | u64 start_bad, last_bad; |
46 | u64 start_phys_aligned; | 45 | u64 start_phys_aligned; |
47 | size_t incr; | 46 | const size_t incr = sizeof(pattern); |
48 | 47 | ||
49 | incr = sizeof(pattern); | ||
50 | start_phys_aligned = ALIGN(start_phys, incr); | 48 | start_phys_aligned = ALIGN(start_phys, incr); |
51 | start = __va(start_phys_aligned); | 49 | start = __va(start_phys_aligned); |
52 | end = start + size - (start_phys_aligned - start_phys); | 50 | end = start + (size - (start_phys_aligned - start_phys)) / incr; |
53 | start_bad = 0; | 51 | start_bad = 0; |
54 | last_bad = 0; | 52 | last_bad = 0; |
55 | 53 | ||
56 | for (p = start; p < end; p++) | 54 | for (p = start; p < end; p++) |
57 | *p = pattern; | 55 | *p = pattern; |
56 | |||
58 | for (p = start; p < end; p++, start_phys_aligned += incr) { | 57 | for (p = start; p < end; p++, start_phys_aligned += incr) { |
59 | if (*p == pattern) | 58 | if (*p == pattern) |
60 | continue; | 59 | continue; |
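With start and end now declared as u64 * instead of void *, pointer arithmetic is in sizeof(pattern) units rather than bytes, which is why the remaining byte count is divided by incr before being added. A worked example with made-up numbers:

    /* Sketch: start_phys = 0x100003, size = 0x1000, incr = 8.
     * ALIGN(0x100003, 8) == 0x100008, so 5 leading bytes are skipped and
     *
     *     end = start + (size - (start_phys_aligned - start_phys)) / incr;
     *     end - start == (0x1000 - 5) / 8 == 511
     *
     * i.e. 511 full u64 cells are written with the pattern and verified. */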
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 6ce9518fe2ac..3cfe9ced8a4c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
@@ -470,7 +470,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) | |||
470 | 470 | ||
471 | if (!debug_pagealloc) | 471 | if (!debug_pagealloc) |
472 | spin_unlock(&cpa_lock); | 472 | spin_unlock(&cpa_lock); |
473 | base = alloc_pages(GFP_KERNEL, 0); | 473 | base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); |
474 | if (!debug_pagealloc) | 474 | if (!debug_pagealloc) |
475 | spin_lock(&cpa_lock); | 475 | spin_lock(&cpa_lock); |
476 | if (!base) | 476 | if (!base) |
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 7aa03a5389f5..8e43bdd45456 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
@@ -4,9 +4,11 @@ | |||
4 | #include <asm/tlb.h> | 4 | #include <asm/tlb.h> |
5 | #include <asm/fixmap.h> | 5 | #include <asm/fixmap.h> |
6 | 6 | ||
7 | #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO | ||
8 | |||
7 | pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) | 9 | pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) |
8 | { | 10 | { |
9 | return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); | 11 | return (pte_t *)__get_free_page(PGALLOC_GFP); |
10 | } | 12 | } |
11 | 13 | ||
12 | pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) | 14 | pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) |
@@ -14,9 +16,9 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) | |||
14 | struct page *pte; | 16 | struct page *pte; |
15 | 17 | ||
16 | #ifdef CONFIG_HIGHPTE | 18 | #ifdef CONFIG_HIGHPTE |
17 | pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); | 19 | pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0); |
18 | #else | 20 | #else |
19 | pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); | 21 | pte = alloc_pages(PGALLOC_GFP, 0); |
20 | #endif | 22 | #endif |
21 | if (pte) | 23 | if (pte) |
22 | pgtable_page_ctor(pte); | 24 | pgtable_page_ctor(pte); |
@@ -161,7 +163,7 @@ static int preallocate_pmds(pmd_t *pmds[]) | |||
161 | bool failed = false; | 163 | bool failed = false; |
162 | 164 | ||
163 | for(i = 0; i < PREALLOCATED_PMDS; i++) { | 165 | for(i = 0; i < PREALLOCATED_PMDS; i++) { |
164 | pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); | 166 | pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP); |
165 | if (pmd == NULL) | 167 | if (pmd == NULL) |
166 | failed = true; | 168 | failed = true; |
167 | pmds[i] = pmd; | 169 | pmds[i] = pmd; |
@@ -228,7 +230,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) | |||
228 | pmd_t *pmds[PREALLOCATED_PMDS]; | 230 | pmd_t *pmds[PREALLOCATED_PMDS]; |
229 | unsigned long flags; | 231 | unsigned long flags; |
230 | 232 | ||
231 | pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); | 233 | pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); |
232 | 234 | ||
233 | if (pgd == NULL) | 235 | if (pgd == NULL) |
234 | goto out; | 236 | goto out; |
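The new PGALLOC_GFP mask adds __GFP_NOTRACK (as does the alloc_pages() call in pageattr.c above): pages that back page tables must never themselves be shadowed by kmemcheck, otherwise servicing a tracking fault could fault again while walking the very tables being tracked. A hedged sketch of the same pattern for any future page-table-like allocation; the function name is illustrative:

    /* Sketch: allocate a page that backs paging structures, keep it
     * out of kmemcheck tracking, and hand it back zeroed. */
    static unsigned long example_alloc_pt_page(void)
    {
            return __get_free_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
    }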
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index 58b32db33125..de2abbd07544 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile | |||
@@ -3,5 +3,5 @@ | |||
3 | nostackp := $(call cc-option, -fno-stack-protector) | 3 | nostackp := $(call cc-option, -fno-stack-protector) |
4 | CFLAGS_cpu_$(BITS).o := $(nostackp) | 4 | CFLAGS_cpu_$(BITS).o := $(nostackp) |
5 | 5 | ||
6 | obj-$(CONFIG_PM_SLEEP) += cpu_$(BITS).o | 6 | obj-$(CONFIG_PM_SLEEP) += cpu.o |
7 | obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o | 7 | obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o |
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu.c index 5343540f2607..d277ef1eea51 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Suspend and hibernation support for x86-64 | 2 | * Suspend support specific for i386/x86-64. |
3 | * | 3 | * |
4 | * Distribute under GPLv2 | 4 | * Distribute under GPLv2 |
5 | * | 5 | * |
@@ -8,18 +8,28 @@ | |||
8 | * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> | 8 | * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/smp.h> | ||
12 | #include <linux/suspend.h> | 11 | #include <linux/suspend.h> |
13 | #include <asm/proto.h> | 12 | #include <linux/smp.h> |
14 | #include <asm/page.h> | 13 | |
15 | #include <asm/pgtable.h> | 14 | #include <asm/pgtable.h> |
15 | #include <asm/proto.h> | ||
16 | #include <asm/mtrr.h> | 16 | #include <asm/mtrr.h> |
17 | #include <asm/page.h> | ||
18 | #include <asm/mce.h> | ||
17 | #include <asm/xcr.h> | 19 | #include <asm/xcr.h> |
18 | #include <asm/suspend.h> | 20 | #include <asm/suspend.h> |
19 | 21 | ||
20 | static void fix_processor_context(void); | 22 | #ifdef CONFIG_X86_32 |
23 | static struct saved_context saved_context; | ||
21 | 24 | ||
25 | unsigned long saved_context_ebx; | ||
26 | unsigned long saved_context_esp, saved_context_ebp; | ||
27 | unsigned long saved_context_esi, saved_context_edi; | ||
28 | unsigned long saved_context_eflags; | ||
29 | #else | ||
30 | /* CONFIG_X86_64 */ | ||
22 | struct saved_context saved_context; | 31 | struct saved_context saved_context; |
32 | #endif | ||
23 | 33 | ||
24 | /** | 34 | /** |
25 | * __save_processor_state - save CPU registers before creating a | 35 | * __save_processor_state - save CPU registers before creating a |
@@ -38,19 +48,35 @@ struct saved_context saved_context; | |||
38 | */ | 48 | */ |
39 | static void __save_processor_state(struct saved_context *ctxt) | 49 | static void __save_processor_state(struct saved_context *ctxt) |
40 | { | 50 | { |
51 | #ifdef CONFIG_X86_32 | ||
52 | mtrr_save_fixed_ranges(NULL); | ||
53 | #endif | ||
41 | kernel_fpu_begin(); | 54 | kernel_fpu_begin(); |
42 | 55 | ||
43 | /* | 56 | /* |
44 | * descriptor tables | 57 | * descriptor tables |
45 | */ | 58 | */ |
59 | #ifdef CONFIG_X86_32 | ||
60 | store_gdt(&ctxt->gdt); | ||
61 | store_idt(&ctxt->idt); | ||
62 | #else | ||
63 | /* CONFIG_X86_64 */ | ||
46 | store_gdt((struct desc_ptr *)&ctxt->gdt_limit); | 64 | store_gdt((struct desc_ptr *)&ctxt->gdt_limit); |
47 | store_idt((struct desc_ptr *)&ctxt->idt_limit); | 65 | store_idt((struct desc_ptr *)&ctxt->idt_limit); |
66 | #endif | ||
48 | store_tr(ctxt->tr); | 67 | store_tr(ctxt->tr); |
49 | 68 | ||
50 | /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ | 69 | /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ |
51 | /* | 70 | /* |
52 | * segment registers | 71 | * segment registers |
53 | */ | 72 | */ |
73 | #ifdef CONFIG_X86_32 | ||
74 | savesegment(es, ctxt->es); | ||
75 | savesegment(fs, ctxt->fs); | ||
76 | savesegment(gs, ctxt->gs); | ||
77 | savesegment(ss, ctxt->ss); | ||
78 | #else | ||
79 | /* CONFIG_X86_64 */ | ||
54 | asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); | 80 | asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); |
55 | asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); | 81 | asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); |
56 | asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); | 82 | asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); |
@@ -62,30 +88,87 @@ static void __save_processor_state(struct saved_context *ctxt) | |||
62 | rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); | 88 | rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); |
63 | mtrr_save_fixed_ranges(NULL); | 89 | mtrr_save_fixed_ranges(NULL); |
64 | 90 | ||
91 | rdmsrl(MSR_EFER, ctxt->efer); | ||
92 | #endif | ||
93 | |||
65 | /* | 94 | /* |
66 | * control registers | 95 | * control registers |
67 | */ | 96 | */ |
68 | rdmsrl(MSR_EFER, ctxt->efer); | ||
69 | ctxt->cr0 = read_cr0(); | 97 | ctxt->cr0 = read_cr0(); |
70 | ctxt->cr2 = read_cr2(); | 98 | ctxt->cr2 = read_cr2(); |
71 | ctxt->cr3 = read_cr3(); | 99 | ctxt->cr3 = read_cr3(); |
100 | #ifdef CONFIG_X86_32 | ||
101 | ctxt->cr4 = read_cr4_safe(); | ||
102 | #else | ||
103 | /* CONFIG_X86_64 */ | ||
72 | ctxt->cr4 = read_cr4(); | 104 | ctxt->cr4 = read_cr4(); |
73 | ctxt->cr8 = read_cr8(); | 105 | ctxt->cr8 = read_cr8(); |
106 | #endif | ||
74 | } | 107 | } |
75 | 108 | ||
109 | /* Needed by apm.c */ | ||
76 | void save_processor_state(void) | 110 | void save_processor_state(void) |
77 | { | 111 | { |
78 | __save_processor_state(&saved_context); | 112 | __save_processor_state(&saved_context); |
79 | } | 113 | } |
114 | #ifdef CONFIG_X86_32 | ||
115 | EXPORT_SYMBOL(save_processor_state); | ||
116 | #endif | ||
80 | 117 | ||
81 | static void do_fpu_end(void) | 118 | static void do_fpu_end(void) |
82 | { | 119 | { |
83 | /* | 120 | /* |
84 | * Restore FPU regs if necessary | 121 | * Restore FPU regs if necessary. |
85 | */ | 122 | */ |
86 | kernel_fpu_end(); | 123 | kernel_fpu_end(); |
87 | } | 124 | } |
88 | 125 | ||
126 | static void fix_processor_context(void) | ||
127 | { | ||
128 | int cpu = smp_processor_id(); | ||
129 | struct tss_struct *t = &per_cpu(init_tss, cpu); | ||
130 | |||
131 | set_tss_desc(cpu, t); /* | ||
132 | * This just modifies memory; should not be | ||
133 | * necessary. But... This is necessary, because | ||
134 | * 386 hardware has concept of busy TSS or some | ||
135 | * similar stupidity. | ||
136 | */ | ||
137 | |||
138 | #ifdef CONFIG_X86_64 | ||
139 | get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; | ||
140 | |||
141 | syscall_init(); /* This sets MSR_*STAR and related */ | ||
142 | #endif | ||
143 | load_TR_desc(); /* This does ltr */ | ||
144 | load_LDT(¤t->active_mm->context); /* This does lldt */ | ||
145 | |||
146 | /* | ||
147 | * Now maybe reload the debug registers | ||
148 | */ | ||
149 | if (current->thread.debugreg7) { | ||
150 | #ifdef CONFIG_X86_32 | ||
151 | set_debugreg(current->thread.debugreg0, 0); | ||
152 | set_debugreg(current->thread.debugreg1, 1); | ||
153 | set_debugreg(current->thread.debugreg2, 2); | ||
154 | set_debugreg(current->thread.debugreg3, 3); | ||
155 | /* no 4 and 5 */ | ||
156 | set_debugreg(current->thread.debugreg6, 6); | ||
157 | set_debugreg(current->thread.debugreg7, 7); | ||
158 | #else | ||
159 | /* CONFIG_X86_64 */ | ||
160 | loaddebug(¤t->thread, 0); | ||
161 | loaddebug(¤t->thread, 1); | ||
162 | loaddebug(¤t->thread, 2); | ||
163 | loaddebug(¤t->thread, 3); | ||
164 | /* no 4 and 5 */ | ||
165 | loaddebug(¤t->thread, 6); | ||
166 | loaddebug(¤t->thread, 7); | ||
167 | #endif | ||
168 | } | ||
169 | |||
170 | } | ||
171 | |||
89 | /** | 172 | /** |
90 | * __restore_processor_state - restore the contents of CPU registers saved | 173 | * __restore_processor_state - restore the contents of CPU registers saved |
91 | * by __save_processor_state() | 174 | * by __save_processor_state() |
@@ -96,9 +179,16 @@ static void __restore_processor_state(struct saved_context *ctxt) | |||
96 | /* | 179 | /* |
97 | * control registers | 180 | * control registers |
98 | */ | 181 | */ |
182 | /* cr4 was introduced in the Pentium CPU */ | ||
183 | #ifdef CONFIG_X86_32 | ||
184 | if (ctxt->cr4) | ||
185 | write_cr4(ctxt->cr4); | ||
186 | #else | ||
187 | /* CONFIG_X86_64 */ | ||
99 | wrmsrl(MSR_EFER, ctxt->efer); | 188 | wrmsrl(MSR_EFER, ctxt->efer); |
100 | write_cr8(ctxt->cr8); | 189 | write_cr8(ctxt->cr8); |
101 | write_cr4(ctxt->cr4); | 190 | write_cr4(ctxt->cr4); |
191 | #endif | ||
102 | write_cr3(ctxt->cr3); | 192 | write_cr3(ctxt->cr3); |
103 | write_cr2(ctxt->cr2); | 193 | write_cr2(ctxt->cr2); |
104 | write_cr0(ctxt->cr0); | 194 | write_cr0(ctxt->cr0); |
@@ -107,13 +197,31 @@ static void __restore_processor_state(struct saved_context *ctxt) | |||
107 | * now restore the descriptor tables to their proper values | 197 | * now restore the descriptor tables to their proper values |
108 | * ltr is done in fix_processor_context(). | 198 | * ltr is done in fix_processor_context(). |
109 | */ | 199 | */ |
200 | #ifdef CONFIG_X86_32 | ||
201 | load_gdt(&ctxt->gdt); | ||
202 | load_idt(&ctxt->idt); | ||
203 | #else | ||
204 | /* CONFIG_X86_64 */ | ||
110 | load_gdt((const struct desc_ptr *)&ctxt->gdt_limit); | 205 | load_gdt((const struct desc_ptr *)&ctxt->gdt_limit); |
111 | load_idt((const struct desc_ptr *)&ctxt->idt_limit); | 206 | load_idt((const struct desc_ptr *)&ctxt->idt_limit); |
112 | 207 | #endif | |
113 | 208 | ||
114 | /* | 209 | /* |
115 | * segment registers | 210 | * segment registers |
116 | */ | 211 | */ |
212 | #ifdef CONFIG_X86_32 | ||
213 | loadsegment(es, ctxt->es); | ||
214 | loadsegment(fs, ctxt->fs); | ||
215 | loadsegment(gs, ctxt->gs); | ||
216 | loadsegment(ss, ctxt->ss); | ||
217 | |||
218 | /* | ||
219 | * sysenter MSRs | ||
220 | */ | ||
221 | if (boot_cpu_has(X86_FEATURE_SEP)) | ||
222 | enable_sep_cpu(); | ||
223 | #else | ||
224 | /* CONFIG_X86_64 */ | ||
117 | asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); | 225 | asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); |
118 | asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); | 226 | asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); |
119 | asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); | 227 | asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); |
@@ -123,6 +231,7 @@ static void __restore_processor_state(struct saved_context *ctxt) | |||
123 | wrmsrl(MSR_FS_BASE, ctxt->fs_base); | 231 | wrmsrl(MSR_FS_BASE, ctxt->fs_base); |
124 | wrmsrl(MSR_GS_BASE, ctxt->gs_base); | 232 | wrmsrl(MSR_GS_BASE, ctxt->gs_base); |
125 | wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); | 233 | wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); |
234 | #endif | ||
126 | 235 | ||
127 | /* | 236 | /* |
128 | * restore XCR0 for xsave capable cpu's. | 237 | * restore XCR0 for xsave capable cpu's. |
@@ -134,41 +243,17 @@ static void __restore_processor_state(struct saved_context *ctxt) | |||
134 | 243 | ||
135 | do_fpu_end(); | 244 | do_fpu_end(); |
136 | mtrr_ap_init(); | 245 | mtrr_ap_init(); |
246 | |||
247 | #ifdef CONFIG_X86_32 | ||
248 | mcheck_init(&boot_cpu_data); | ||
249 | #endif | ||
137 | } | 250 | } |
138 | 251 | ||
252 | /* Needed by apm.c */ | ||
139 | void restore_processor_state(void) | 253 | void restore_processor_state(void) |
140 | { | 254 | { |
141 | __restore_processor_state(&saved_context); | 255 | __restore_processor_state(&saved_context); |
142 | } | 256 | } |
143 | 257 | #ifdef CONFIG_X86_32 | |
144 | static void fix_processor_context(void) | 258 | EXPORT_SYMBOL(restore_processor_state); |
145 | { | 259 | #endif |
146 | int cpu = smp_processor_id(); | ||
147 | struct tss_struct *t = &per_cpu(init_tss, cpu); | ||
148 | |||
149 | /* | ||
150 | * This just modifies memory; should not be necessary. But... This | ||
151 | * is necessary, because 386 hardware has concept of busy TSS or some | ||
152 | * similar stupidity. | ||
153 | */ | ||
154 | set_tss_desc(cpu, t); | ||
155 | |||
156 | get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; | ||
157 | |||
158 | syscall_init(); /* This sets MSR_*STAR and related */ | ||
159 | load_TR_desc(); /* This does ltr */ | ||
160 | load_LDT(¤t->active_mm->context); /* This does lldt */ | ||
161 | |||
162 | /* | ||
163 | * Now maybe reload the debug registers | ||
164 | */ | ||
165 | if (current->thread.debugreg7){ | ||
166 | loaddebug(¤t->thread, 0); | ||
167 | loaddebug(¤t->thread, 1); | ||
168 | loaddebug(¤t->thread, 2); | ||
169 | loaddebug(¤t->thread, 3); | ||
170 | /* no 4 and 5 */ | ||
171 | loaddebug(¤t->thread, 6); | ||
172 | loaddebug(¤t->thread, 7); | ||
173 | } | ||
174 | } | ||
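With the 32-bit and 64-bit variants folded into one file, both architectures now share a single saved_context and the same two entry points, which suspend and APM paths wrap around the low-level sleep. A minimal sketch of that call sequence; the surrounding locking and interrupt handling are deliberately elided and the function name is illustrative:

    /* Sketch: how the unified helpers are expected to be used. */
    static void example_suspend_cycle(void)
    {
            save_processor_state();      /* descriptor tables, segments,
                                            control registers, MSRs */

            /* ... platform/firmware code sleeps and wakes the CPU ... */

            restore_processor_state();   /* reload state, fix the busy TSS,
                                            reload debug registers */
    }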
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c deleted file mode 100644 index ce702c5b3a2c..000000000000 --- a/arch/x86/power/cpu_32.c +++ /dev/null | |||
@@ -1,148 +0,0 @@ | |||
1 | /* | ||
2 | * Suspend support specific for i386. | ||
3 | * | ||
4 | * Distribute under GPLv2 | ||
5 | * | ||
6 | * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> | ||
7 | * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> | ||
8 | */ | ||
9 | |||
10 | #include <linux/module.h> | ||
11 | #include <linux/suspend.h> | ||
12 | #include <asm/mtrr.h> | ||
13 | #include <asm/mce.h> | ||
14 | #include <asm/xcr.h> | ||
15 | #include <asm/suspend.h> | ||
16 | |||
17 | static struct saved_context saved_context; | ||
18 | |||
19 | unsigned long saved_context_ebx; | ||
20 | unsigned long saved_context_esp, saved_context_ebp; | ||
21 | unsigned long saved_context_esi, saved_context_edi; | ||
22 | unsigned long saved_context_eflags; | ||
23 | |||
24 | static void __save_processor_state(struct saved_context *ctxt) | ||
25 | { | ||
26 | mtrr_save_fixed_ranges(NULL); | ||
27 | kernel_fpu_begin(); | ||
28 | |||
29 | /* | ||
30 | * descriptor tables | ||
31 | */ | ||
32 | store_gdt(&ctxt->gdt); | ||
33 | store_idt(&ctxt->idt); | ||
34 | store_tr(ctxt->tr); | ||
35 | |||
36 | /* | ||
37 | * segment registers | ||
38 | */ | ||
39 | savesegment(es, ctxt->es); | ||
40 | savesegment(fs, ctxt->fs); | ||
41 | savesegment(gs, ctxt->gs); | ||
42 | savesegment(ss, ctxt->ss); | ||
43 | |||
44 | /* | ||
45 | * control registers | ||
46 | */ | ||
47 | ctxt->cr0 = read_cr0(); | ||
48 | ctxt->cr2 = read_cr2(); | ||
49 | ctxt->cr3 = read_cr3(); | ||
50 | ctxt->cr4 = read_cr4_safe(); | ||
51 | } | ||
52 | |||
53 | /* Needed by apm.c */ | ||
54 | void save_processor_state(void) | ||
55 | { | ||
56 | __save_processor_state(&saved_context); | ||
57 | } | ||
58 | EXPORT_SYMBOL(save_processor_state); | ||
59 | |||
60 | static void do_fpu_end(void) | ||
61 | { | ||
62 | /* | ||
63 | * Restore FPU regs if necessary. | ||
64 | */ | ||
65 | kernel_fpu_end(); | ||
66 | } | ||
67 | |||
68 | static void fix_processor_context(void) | ||
69 | { | ||
70 | int cpu = smp_processor_id(); | ||
71 | struct tss_struct *t = &per_cpu(init_tss, cpu); | ||
72 | |||
73 | set_tss_desc(cpu, t); /* | ||
74 | * This just modifies memory; should not be | ||
75 | * necessary. But... This is necessary, because | ||
76 | * 386 hardware has concept of busy TSS or some | ||
77 | * similar stupidity. | ||
78 | */ | ||
79 | |||
80 | load_TR_desc(); /* This does ltr */ | ||
81 | load_LDT(¤t->active_mm->context); /* This does lldt */ | ||
82 | |||
83 | /* | ||
84 | * Now maybe reload the debug registers | ||
85 | */ | ||
86 | if (current->thread.debugreg7) { | ||
87 | set_debugreg(current->thread.debugreg0, 0); | ||
88 | set_debugreg(current->thread.debugreg1, 1); | ||
89 | set_debugreg(current->thread.debugreg2, 2); | ||
90 | set_debugreg(current->thread.debugreg3, 3); | ||
91 | /* no 4 and 5 */ | ||
92 | set_debugreg(current->thread.debugreg6, 6); | ||
93 | set_debugreg(current->thread.debugreg7, 7); | ||
94 | } | ||
95 | |||
96 | } | ||
97 | |||
98 | static void __restore_processor_state(struct saved_context *ctxt) | ||
99 | { | ||
100 | /* | ||
101 | * control registers | ||
102 | */ | ||
103 | /* cr4 was introduced in the Pentium CPU */ | ||
104 | if (ctxt->cr4) | ||
105 | write_cr4(ctxt->cr4); | ||
106 | write_cr3(ctxt->cr3); | ||
107 | write_cr2(ctxt->cr2); | ||
108 | write_cr0(ctxt->cr0); | ||
109 | |||
110 | /* | ||
111 | * now restore the descriptor tables to their proper values | ||
112 | * ltr is done in fix_processor_context(). | ||
113 | */ | ||
114 | load_gdt(&ctxt->gdt); | ||
115 | load_idt(&ctxt->idt); | ||
116 | |||
117 | /* | ||
118 | * segment registers | ||
119 | */ | ||
120 | loadsegment(es, ctxt->es); | ||
121 | loadsegment(fs, ctxt->fs); | ||
122 | loadsegment(gs, ctxt->gs); | ||
123 | loadsegment(ss, ctxt->ss); | ||
124 | |||
125 | /* | ||
126 | * sysenter MSRs | ||
127 | */ | ||
128 | if (boot_cpu_has(X86_FEATURE_SEP)) | ||
129 | enable_sep_cpu(); | ||
130 | |||
131 | /* | ||
132 | * restore XCR0 for xsave capable cpu's. | ||
133 | */ | ||
134 | if (cpu_has_xsave) | ||
135 | xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); | ||
136 | |||
137 | fix_processor_context(); | ||
138 | do_fpu_end(); | ||
139 | mtrr_ap_init(); | ||
140 | mcheck_init(&boot_cpu_data); | ||
141 | } | ||
142 | |||
143 | /* Needed by apm.c */ | ||
144 | void restore_processor_state(void) | ||
145 | { | ||
146 | __restore_processor_state(&saved_context); | ||
147 | } | ||
148 | EXPORT_SYMBOL(restore_processor_state); | ||