author	Waiman Long <Waiman.Long@hpe.com>	2016-09-06 13:22:10 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2016-09-09 09:16:19 -0400
commit	f99fd22e4d4bc84880a8a3117311bbf0e3a6a9dc (patch)
tree	5c7b0a7585c26e144385144d81458699ad373db7 /arch/x86
parent	c6935931c1894ff857616ff8549b61236a19148f (diff)
x86/hpet: Reduce HPET counter read contention
On a large system with many CPUs, using HPET as the clock source can
have a significant impact on the overall system performance because of
the following reasons:
 1) There is a single HPET counter shared by all the CPUs.
 2) HPET counter reading is a very slow operation.

Using HPET as the default clock source may happen when, for example,
the TSC clock calibration exceeds the allowable tolerance. Sometimes the
performance slowdown can be so severe that the system may crash because
of an NMI watchdog soft lockup, for example.

During the TSC clock calibration process, the default clock source will
be set temporarily to HPET. For systems with many CPUs, it is possible
that an NMI watchdog soft lockup may occur occasionally during that
short time period where HPET clocking is active, as shown in the kernel
log below:

[   71.646504] hpet0: 8 comparators, 64-bit 14.318180 MHz counter
[   71.655313] Switching to clocksource hpet
[   95.679135] BUG: soft lockup - CPU#144 stuck for 23s! [swapper/144:0]
[   95.693363] BUG: soft lockup - CPU#145 stuck for 23s! [swapper/145:0]
[   95.695580] BUG: soft lockup - CPU#582 stuck for 23s! [swapper/582:0]
[   95.698128] BUG: soft lockup - CPU#357 stuck for 23s! [swapper/357:0]

This patch addresses the above issues by reducing HPET read contention,
using the fact that if more than one CPU is trying to access HPET at the
same time, it is more efficient for only one CPU in the group to read
the HPET counter and share it with the rest of the group, instead of
each group member reading the HPET counter individually.

This is done by using a combination quadword that contains a 32-bit
stored HPET value and a 32-bit spinlock. The CPU that gets the lock is
responsible for reading the HPET counter and storing it in the quadword.
The others monitor the change in HPET value and lock status and grab the
latest stored HPET value accordingly. This change is only enabled on
64-bit SMP configurations.

On a 4-socket Haswell-EX box with 144 threads (HT on), running the AIM7
compute workload (1500 users) on a 4.8-rc1 kernel (HZ=1000) with and
without the patch has the following performance numbers (with HPET or
TSC as clock source):

  TSC             = 1042431 jobs/min
  HPET w/o patch  =  798068 jobs/min
  HPET with patch = 1029445 jobs/min

The perf profile showed a reduction of the %CPU time consumed by
read_hpet from 11.19% without the patch to 1.24% with it.

[ tglx: It's really sad that we need to have such hacks just to deal
  with the fact that cpu vendors have not managed to fix the TSC
  wreckage within 15+ years ]

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Tested-by: Prarit Bhargava <prarit@redhat.com>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: Randy Wright <rwright@hpe.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Link: http://lkml.kernel.org/r/1473182530-29175-1-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
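[ Editor's note: to make the scheme above concrete, here is a minimal
  user-space sketch of the same idea using C11 atomics and pthreads.
  It is not the kernel code: the 64-bit word packs a one-bit lock and
  the 32-bit cached value instead of an arch_spinlock_t, and
  read_counter_hw() is a hypothetical stand-in for the slow HPET MMIO
  read. ]

/*
 * Sketch of the shared-read pattern: one contending thread wins the
 * lock, performs the slow hardware read and publishes the result;
 * the others spin on the combined 64-bit word and reuse the
 * published value.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define LOCK_BIT 1ULL

/* Low bit: lock (0 = free). High 32 bits: last published value. */
static _Atomic uint64_t cache;

/* Hypothetical stand-in for the expensive hardware counter read. */
static uint32_t read_counter_hw(void)
{
	static _Atomic uint32_t ticks;

	return atomic_fetch_add(&ticks, 1) + 1;
}

static uint32_t read_counter(void)
{
	uint64_t old = atomic_load(&cache);
	uint64_t cur;

	if (!(old & LOCK_BIT)) {
		/* Lock looks free: try to become the one reader. */
		uint64_t locked = old | LOCK_BIT;

		if (atomic_compare_exchange_strong(&cache, &old, locked)) {
			uint32_t v = read_counter_hw();

			/* Publish the value and drop the lock in one store. */
			atomic_store(&cache, (uint64_t)v << 32);
			return v;
		}
	}

	/*
	 * Contended: wait until the published value changes or the lock
	 * is released; either way the value is then at least as fresh as
	 * a hardware read started now.
	 */
	do {
		cur = atomic_load(&cache);
	} while ((uint32_t)(cur >> 32) == (uint32_t)(old >> 32) &&
		 (cur & LOCK_BIT));

	return (uint32_t)(cur >> 32);
}

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < 5; i++)
		printf("read %u\n", read_counter());
	return NULL;
}

int main(void)
{
	pthread_t t[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	return 0;
}

[ The property mirrored from the patch is that lock state and cached
  value are observed in a single atomic 64-bit load, so a waiter can
  return as soon as either the value changes or the lock is freed. ]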
Diffstat (limited to 'arch/x86')
-rw-r--r--	arch/x86/kernel/hpet.c	94
1 file changed, 94 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index c6dfd801df97..274fab99169d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -756,10 +756,104 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd)
 /*
  * Clock source related code
  */
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
+/*
+ * Reading the HPET counter is a very slow operation. If a large number of
+ * CPUs are trying to access the HPET counter simultaneously, it can cause
+ * massive delay and slow down system performance dramatically. This may
+ * happen when HPET is the default clock source instead of TSC. For a
+ * really large system with hundreds of CPUs, the slowdown may be so
+ * severe that it may actually crash the system because of a NMI watchdog
+ * soft lockup, for example.
+ *
+ * If multiple CPUs are trying to access the HPET counter at the same time,
+ * we don't actually need to read the counter multiple times. Instead, the
+ * other CPUs can use the counter value read by the first CPU in the group.
+ *
+ * This special feature is only enabled on x86-64 systems. It is unlikely
+ * that 32-bit x86 systems will have enough CPUs to require this feature
+ * with its associated locking overhead. And we also need 64-bit atomic
+ * read.
+ *
+ * The lock and the hpet value are stored together and can be read in a
+ * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t
+ * is 32 bits in size.
+ */
+union hpet_lock {
+	struct {
+		arch_spinlock_t lock;
+		u32 value;
+	};
+	u64 lockval;
+};
+
+static union hpet_lock hpet __cacheline_aligned = {
+	{ .lock = __ARCH_SPIN_LOCK_UNLOCKED, },
+};
+
+static cycle_t read_hpet(struct clocksource *cs)
+{
+	unsigned long flags;
+	union hpet_lock old, new;
+
+	BUILD_BUG_ON(sizeof(union hpet_lock) != 8);
+
+	/*
+	 * Read HPET directly if in NMI.
+	 */
+	if (in_nmi())
+		return (cycle_t)hpet_readl(HPET_COUNTER);
+
+	/*
+	 * Read the current state of the lock and HPET value atomically.
+	 */
+	old.lockval = READ_ONCE(hpet.lockval);
+
+	if (arch_spin_is_locked(&old.lock))
+		goto contended;
+
+	local_irq_save(flags);
+	if (arch_spin_trylock(&hpet.lock)) {
+		new.value = hpet_readl(HPET_COUNTER);
+		/*
+		 * Use WRITE_ONCE() to prevent store tearing.
+		 */
+		WRITE_ONCE(hpet.value, new.value);
+		arch_spin_unlock(&hpet.lock);
+		local_irq_restore(flags);
+		return (cycle_t)new.value;
+	}
+	local_irq_restore(flags);
+
+contended:
+	/*
+	 * Contended case
+	 * --------------
+	 * Wait until the HPET value change or the lock is free to indicate
+	 * its value is up-to-date.
+	 *
+	 * It is possible that old.value has already contained the latest
+	 * HPET value while the lock holder was in the process of releasing
+	 * the lock. Checking for lock state change will enable us to return
+	 * the value immediately instead of waiting for the next HPET reader
+	 * to come along.
+	 */
+	do {
+		cpu_relax();
+		new.lockval = READ_ONCE(hpet.lockval);
+	} while ((new.value == old.value) && arch_spin_is_locked(&new.lock));
+
+	return (cycle_t)new.value;
+}
+#else
+/*
+ * For UP or 32-bit.
+ */
 static cycle_t read_hpet(struct clocksource *cs)
 {
 	return (cycle_t)hpet_readl(HPET_COUNTER);
 }
+#endif
 
 static struct clocksource clocksource_hpet = {
 	.name		= "hpet",