author		Wanpeng Li <wanpeng.li@hotmail.com>	2017-06-29 13:15:11 -0400
committer	Ingo Molnar <mingo@kernel.org>	2017-07-05 03:54:15 -0400
commit		2a42eb9594a1480b4ead9e036e06ee1290e5fa6d (patch)
tree		3be064d79c8bc8beb5a090bbb2ca0785ecea9a88
parent		bac5b6b6b11560f323e71d0ebac4061cfe5f56c0 (diff)
sched/cputime: Accumulate vtime on top of nsec clocksource
Currently the cputime source used by vtime is jiffies. When we cross a
context boundary and jiffies have changed since the last snapshot, the
pending cputime is accounted to the switching-out context.

This scheme works fine if the ticks are not aligned across CPUs. If
they are instead aligned (i.e. all fire at the same time) and the CPUs
run in userspace, the jiffies change is only observed on tick exit and
therefore the user cputime is accounted as system cputime. This is
because the CPU that maintains timekeeping fires its tick at the same
time as the others. It updates jiffies in the middle of the tick and
the other CPUs see that update on IRQ exit:

        CPU 0 (timekeeper)                  CPU 1
       -------------------              -------------
              jiffies = N
    ...                                 run in userspace for a jiffy
    tick entry                          tick entry (sees jiffies = N)
    set jiffies = N + 1
    tick exit                           tick exit (sees jiffies = N + 1)
                                            account 1 jiffy as stime

Fix this by using a nanosecond clock source instead of jiffies. The
cputime is then accumulated and flushed every time the pending delta
reaches a jiffy, in order to mitigate the accounting overhead.

[ fweisbec: changelog, rebase on struct vtime, field renames, add
  delta on cputime readers, keep idle vtime as-is (low overhead
  accounting), harmonize clock sources. ]

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Luiz Capitulino <lcapitulino@redhat.com>
Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1498756511-11714-6-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
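[ Editor's illustration, not part of the patch: a minimal, self-contained
  sketch of the accumulate-and-flush scheme described above. Deltas are
  measured against a nanosecond clock, carried in a per-context
  accumulator, and only flushed to the accounting path once a full
  jiffy's worth has built up. now_ns() and flush_to_accounting() are
  hypothetical stand-ins for sched_clock_cpu() and the account_*_time()
  helpers. ]

/* Illustrative sketch only -- not kernel code. */
#include <stdint.h>

#define TICK_NSEC 1000000ULL		/* one jiffy in ns at HZ=1000 */

struct acc {
	uint64_t starttime;		/* last clock snapshot, in ns */
	uint64_t pending;		/* accumulated, not yet flushed */
};

extern uint64_t now_ns(void);		/* hypothetical ns clocksource */
extern void flush_to_accounting(uint64_t ns);	/* hypothetical sink */

static void account_delta(struct acc *a)
{
	uint64_t clock = now_ns();
	/* Guard against an unstable clock appearing to go backwards. */
	uint64_t delta = clock > a->starttime ? clock - a->starttime : 0;

	a->starttime += delta;		/* consume exactly what we measured */
	a->pending += delta;

	/* Flush only once a jiffy's worth has accumulated, keeping the
	 * flush rate (and thus accounting overhead) near the old tick
	 * rate. */
	if (a->pending >= TICK_NSEC) {
		flush_to_accounting(a->pending);
		a->pending = 0;
	}
}

[ The patch below instantiates this pattern three times, once each for
  utime, stime and gtime. ]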
-rw-r--r--	include/linux/sched.h	3
-rw-r--r--	kernel/sched/cputime.c	64
2 files changed, 45 insertions(+), 22 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index eeff8a024f0c..4818126c5153 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -236,6 +236,9 @@ struct vtime {
 	seqcount_t		seqcount;
 	unsigned long long	starttime;
 	enum vtime_state	state;
+	u64			utime;
+	u64			stime;
+	u64			gtime;
 };
 
 struct sched_info {
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9ee725edcbe0..6e3ea4ac1bda 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -681,18 +681,19 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 static u64 vtime_delta(struct vtime *vtime)
 {
-	unsigned long now = READ_ONCE(jiffies);
+	unsigned long long clock;
 
-	if (time_before(now, (unsigned long)vtime->starttime))
+	clock = sched_clock_cpu(smp_processor_id());
+	if (clock < vtime->starttime)
 		return 0;
 
-	return jiffies_to_nsecs(now - vtime->starttime);
+	return clock - vtime->starttime;
 }
 
 static u64 get_vtime_delta(struct vtime *vtime)
 {
-	unsigned long now = READ_ONCE(jiffies);
-	u64 delta, other;
+	u64 delta = vtime_delta(vtime);
+	u64 other;
 
 	/*
 	 * Unlike tick based timing, vtime based timing never has lost
@@ -701,17 +702,31 @@ static u64 get_vtime_delta(struct vtime *vtime)
 	 * elapsed time. Limit account_other_time to prevent rounding
 	 * errors from causing elapsed vtime to go negative.
 	 */
-	delta = jiffies_to_nsecs(now - vtime->starttime);
 	other = account_other_time(delta);
 	WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
-	vtime->starttime = now;
+	vtime->starttime += delta;
 
 	return delta - other;
 }
 
-static void __vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_system(struct task_struct *tsk,
+				   struct vtime *vtime)
 {
-	account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime));
+	vtime->stime += get_vtime_delta(vtime);
+	if (vtime->stime >= TICK_NSEC) {
+		account_system_time(tsk, irq_count(), vtime->stime);
+		vtime->stime = 0;
+	}
+}
+
+static void vtime_account_guest(struct task_struct *tsk,
+				struct vtime *vtime)
+{
+	vtime->gtime += get_vtime_delta(vtime);
+	if (vtime->gtime >= TICK_NSEC) {
+		account_guest_time(tsk, vtime->gtime);
+		vtime->gtime = 0;
+	}
 }
 
 void vtime_account_system(struct task_struct *tsk)
@@ -722,7 +737,11 @@ void vtime_account_system(struct task_struct *tsk)
 		return;
 
 	write_seqcount_begin(&vtime->seqcount);
-	__vtime_account_system(tsk);
+	/* We might have scheduled out from guest path */
+	if (current->flags & PF_VCPU)
+		vtime_account_guest(tsk, vtime);
+	else
+		__vtime_account_system(tsk, vtime);
 	write_seqcount_end(&vtime->seqcount);
 }
 
@@ -731,8 +750,7 @@ void vtime_user_enter(struct task_struct *tsk)
 	struct vtime *vtime = &tsk->vtime;
 
 	write_seqcount_begin(&vtime->seqcount);
-	if (vtime_delta(vtime))
-		__vtime_account_system(tsk);
+	__vtime_account_system(tsk, vtime);
 	vtime->state = VTIME_USER;
 	write_seqcount_end(&vtime->seqcount);
 }
@@ -742,8 +760,11 @@ void vtime_user_exit(struct task_struct *tsk)
 	struct vtime *vtime = &tsk->vtime;
 
 	write_seqcount_begin(&vtime->seqcount);
-	if (vtime_delta(vtime))
-		account_user_time(tsk, get_vtime_delta(vtime));
+	vtime->utime += get_vtime_delta(vtime);
+	if (vtime->utime >= TICK_NSEC) {
+		account_user_time(tsk, vtime->utime);
+		vtime->utime = 0;
+	}
 	vtime->state = VTIME_SYS;
 	write_seqcount_end(&vtime->seqcount);
 }
@@ -759,8 +780,7 @@ void vtime_guest_enter(struct task_struct *tsk)
 	 * that can thus safely catch up with a tickless delta.
 	 */
 	write_seqcount_begin(&vtime->seqcount);
-	if (vtime_delta(vtime))
-		__vtime_account_system(tsk);
+	__vtime_account_system(tsk, vtime);
 	current->flags |= PF_VCPU;
 	write_seqcount_end(&vtime->seqcount);
 }
@@ -771,7 +791,7 @@ void vtime_guest_exit(struct task_struct *tsk)
 	struct vtime *vtime = &tsk->vtime;
 
 	write_seqcount_begin(&vtime->seqcount);
-	__vtime_account_system(tsk);
+	vtime_account_guest(tsk, vtime);
 	current->flags &= ~PF_VCPU;
 	write_seqcount_end(&vtime->seqcount);
 }
@@ -794,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
 
 	write_seqcount_begin(&vtime->seqcount);
 	vtime->state = VTIME_SYS;
-	vtime->starttime = jiffies;
+	vtime->starttime = sched_clock_cpu(smp_processor_id());
 	write_seqcount_end(&vtime->seqcount);
 }
 
@@ -806,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
 	local_irq_save(flags);
 	write_seqcount_begin(&vtime->seqcount);
 	vtime->state = VTIME_SYS;
-	vtime->starttime = jiffies;
+	vtime->starttime = sched_clock_cpu(cpu);
 	write_seqcount_end(&vtime->seqcount);
 	local_irq_restore(flags);
 }
@@ -825,7 +845,7 @@ u64 task_gtime(struct task_struct *t)
 
 		gtime = t->gtime;
 		if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
-			gtime += vtime_delta(vtime);
+			gtime += vtime->gtime + vtime_delta(vtime);
 
 	} while (read_seqcount_retry(&vtime->seqcount, seq));
 
@@ -866,9 +886,9 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 		 * the right place.
 		 */
 		if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
-			*utime += delta;
+			*utime += vtime->utime + delta;
 		else if (vtime->state == VTIME_SYS)
-			*stime += delta;
+			*stime += vtime->stime + delta;
 	} while (read_seqcount_retry(&vtime->seqcount, seq));
 }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
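[ Editor's note on the synchronization visible above: writers bracket
  every vtime update with write_seqcount_begin()/write_seqcount_end(),
  and readers such as task_gtime() and task_cputime() take lockless
  snapshots that are retried if a writer raced with them. A minimal
  sketch of that seqcount idiom follows; the demo_* names are invented
  for illustration. ]

/* Illustrative sketch only -- not from this patch. */
#include <linux/seqlock.h>
#include <linux/types.h>

static seqcount_t demo_seq = SEQCNT_ZERO(demo_seq);
static u64 demo_utime, demo_stime;

static void demo_update(u64 u, u64 s)
{
	/* Writer: bump the sequence around the update so readers can
	 * detect that they observed a torn snapshot. */
	write_seqcount_begin(&demo_seq);
	demo_utime = u;
	demo_stime = s;
	write_seqcount_end(&demo_seq);
}

static void demo_read(u64 *u, u64 *s)
{
	unsigned int seq;

	/* Reader: lockless snapshot, retried while a writer is active. */
	do {
		seq = read_seqcount_begin(&demo_seq);
		*u = demo_utime;
		*s = demo_stime;
	} while (read_seqcount_retry(&demo_seq, seq));
}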