author	Waiman Long <Waiman.Long@hpe.com>	2016-09-06 13:22:10 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2016-09-09 09:16:19 -0400
commit	f99fd22e4d4bc84880a8a3117311bbf0e3a6a9dc (patch)
tree	5c7b0a7585c26e144385144d81458699ad373db7 /arch/x86
parent	c6935931c1894ff857616ff8549b61236a19148f (diff)
x86/hpet: Reduce HPET counter read contention
On a large system with many CPUs, using HPET as the clock source can
have a significant impact on the overall system performance because of
the following reasons:
 1) There is a single HPET counter shared by all the CPUs.
 2) HPET counter reading is a very slow operation.

Using HPET as the default clock source may happen when, for example,
the TSC clock calibration exceeds the allowable tolerance. Sometimes the
performance slowdown can be so severe that the system may crash because
of an NMI watchdog soft lockup, for example.

During the TSC clock calibration process, the default clock source will
be set temporarily to HPET. For systems with many CPUs, it is possible
that an NMI watchdog soft lockup may occur occasionally during that
short time period where HPET clocking is active, as shown in the kernel
log below:

[   71.646504] hpet0: 8 comparators, 64-bit 14.318180 MHz counter
[   71.655313] Switching to clocksource hpet
[   95.679135] BUG: soft lockup - CPU#144 stuck for 23s! [swapper/144:0]
[   95.693363] BUG: soft lockup - CPU#145 stuck for 23s! [swapper/145:0]
[   95.695580] BUG: soft lockup - CPU#582 stuck for 23s! [swapper/582:0]
[   95.698128] BUG: soft lockup - CPU#357 stuck for 23s! [swapper/357:0]

This patch addresses the above issues by reducing HPET read contention,
using the fact that if more than one CPU is trying to access HPET at the
same time, it is more efficient for only one CPU in the group to read
the HPET counter and share it with the rest of the group, instead of
each group member reading the HPET counter individually.

This is done by using a combination quadword that contains a 32-bit
stored HPET value and a 32-bit spinlock. The CPU that gets the lock is
responsible for reading the HPET counter and storing it in the quadword.
The others monitor the change in HPET value and lock status and grab the
latest stored HPET value accordingly. This change is only enabled on
64-bit SMP configurations.

On a 4-socket Haswell-EX box with 144 threads (HT on), running the AIM7
compute workload (1500 users) on a 4.8-rc1 kernel (HZ=1000) with and
without the patch has the following performance numbers (with HPET or
TSC as clock source):

  TSC             = 1042431 jobs/min
  HPET w/o patch  =  798068 jobs/min
  HPET with patch = 1029445 jobs/min

The perf profile showed a reduction of the %CPU time consumed by
read_hpet from 11.19% without the patch to 1.24% with it.

[ tglx: It's really sad that we need to have such hacks just to deal
  with the fact that cpu vendors have not managed to fix the TSC
  wreckage within 15+ years ]

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Tested-by: Prarit Bhargava <prarit@redhat.com>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: Randy Wright <rwright@hpe.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Link: http://lkml.kernel.org/r/1473182530-29175-1-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
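[ Editor's note: to make the scheme above concrete, here is a minimal
  user-space sketch of the same idea using C11 atomics and pthreads.
  It is not the kernel code: the 64-bit word packs a one-bit lock and
  the 32-bit cached value instead of an arch_spinlock_t, and
  read_counter_hw() is a hypothetical stand-in for the slow HPET MMIO
  read. ]

/*
 * Sketch of the shared-read pattern: one contending thread wins the
 * lock, performs the slow hardware read and publishes the result;
 * the others spin on the combined 64-bit word and reuse the
 * published value.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define LOCK_BIT 1ULL

/* Low bit: lock (0 = free). High 32 bits: last published value. */
static _Atomic uint64_t cache;

/* Hypothetical stand-in for the expensive hardware counter read. */
static uint32_t read_counter_hw(void)
{
	static _Atomic uint32_t ticks;

	return atomic_fetch_add(&ticks, 1) + 1;
}

static uint32_t read_counter(void)
{
	uint64_t old = atomic_load(&cache);
	uint64_t cur;

	if (!(old & LOCK_BIT)) {
		/* Lock looks free: try to become the one reader. */
		uint64_t locked = old | LOCK_BIT;

		if (atomic_compare_exchange_strong(&cache, &old, locked)) {
			uint32_t v = read_counter_hw();

			/* Publish the value and drop the lock in one store. */
			atomic_store(&cache, (uint64_t)v << 32);
			return v;
		}
	}

	/*
	 * Contended: wait until the published value changes or the lock
	 * is released; either way the value is then at least as fresh as
	 * a hardware read started now.
	 */
	do {
		cur = atomic_load(&cache);
	} while ((uint32_t)(cur >> 32) == (uint32_t)(old >> 32) &&
		 (cur & LOCK_BIT));

	return (uint32_t)(cur >> 32);
}

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < 5; i++)
		printf("read %u\n", read_counter());
	return NULL;
}

int main(void)
{
	pthread_t t[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	return 0;
}

[ The property mirrored from the patch is that lock state and cached
  value are observed in a single atomic 64-bit load, so a waiter can
  return as soon as either the value changes or the lock is freed. ]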
Diffstat (limited to 'arch/x86')
-rw-r--r--	arch/x86/kernel/hpet.c	94
1 file changed, 94 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index c6dfd801df97..274fab99169d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -756,10 +756,104 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd)
 /*
  * Clock source related code
  */
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
+/*
+ * Reading the HPET counter is a very slow operation. If a large number of
+ * CPUs are trying to access the HPET counter simultaneously, it can cause
+ * massive delay and slow down system performance dramatically. This may
+ * happen when HPET is the default clock source instead of TSC. For a
+ * really large system with hundreds of CPUs, the slowdown may be so
+ * severe that it may actually crash the system because of a NMI watchdog
+ * soft lockup, for example.
+ *
+ * If multiple CPUs are trying to access the HPET counter at the same time,
+ * we don't actually need to read the counter multiple times. Instead, the
+ * other CPUs can use the counter value read by the first CPU in the group.
+ *
+ * This special feature is only enabled on x86-64 systems. It is unlikely
+ * that 32-bit x86 systems will have enough CPUs to require this feature
+ * with its associated locking overhead. And we also need 64-bit atomic
+ * read.
+ *
+ * The lock and the hpet value are stored together and can be read in a
+ * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t
+ * is 32 bits in size.
+ */
+union hpet_lock {
+	struct {
+		arch_spinlock_t lock;
+		u32 value;
+	};
+	u64 lockval;
+};
+
+static union hpet_lock hpet __cacheline_aligned = {
+	{ .lock = __ARCH_SPIN_LOCK_UNLOCKED, },
+};
+
+static cycle_t read_hpet(struct clocksource *cs)
+{
+	unsigned long flags;
+	union hpet_lock old, new;
+
+	BUILD_BUG_ON(sizeof(union hpet_lock) != 8);
+
+	/*
+	 * Read HPET directly if in NMI.
+	 */
+	if (in_nmi())
+		return (cycle_t)hpet_readl(HPET_COUNTER);
+
+	/*
+	 * Read the current state of the lock and HPET value atomically.
+	 */
+	old.lockval = READ_ONCE(hpet.lockval);
+
+	if (arch_spin_is_locked(&old.lock))
+		goto contended;
+
+	local_irq_save(flags);
+	if (arch_spin_trylock(&hpet.lock)) {
+		new.value = hpet_readl(HPET_COUNTER);
+		/*
+		 * Use WRITE_ONCE() to prevent store tearing.
+		 */
+		WRITE_ONCE(hpet.value, new.value);
+		arch_spin_unlock(&hpet.lock);
+		local_irq_restore(flags);
+		return (cycle_t)new.value;
+	}
+	local_irq_restore(flags);
+
+contended:
+	/*
+	 * Contended case
+	 * --------------
+	 * Wait until the HPET value change or the lock is free to indicate
+	 * its value is up-to-date.
+	 *
+	 * It is possible that old.value has already contained the latest
+	 * HPET value while the lock holder was in the process of releasing
+	 * the lock. Checking for lock state change will enable us to return
+	 * the value immediately instead of waiting for the next HPET reader
+	 * to come along.
+	 */
+	do {
+		cpu_relax();
+		new.lockval = READ_ONCE(hpet.lockval);
+	} while ((new.value == old.value) && arch_spin_is_locked(&new.lock));
+
+	return (cycle_t)new.value;
+}
+#else
+/*
+ * For UP or 32-bit.
+ */
 static cycle_t read_hpet(struct clocksource *cs)
 {
 	return (cycle_t)hpet_readl(HPET_COUNTER);
 }
+#endif
 
 static struct clocksource clocksource_hpet = {
 	.name		= "hpet",