litmus-rt.git - The LITMUS^RT kernel.

Branch	Commit message	Author	Age
archive/unc-master-3.0	P-FP: fix BUG_ON releated to priority inheritance	Bjoern Brandenburg	13 years
archived-2013.1	uncachedev: mmap memory that is not cached by CPUs	Glenn Elliott	12 years
archived-private-master	Merge branch 'wip-2.6.34' into old-private-master	Andrea Bastoni	15 years
archived-semi-part	Merge branch 'wip-semi-part' of ssh://cvs/cvs/proj/litmus/repo/litmus2010 int...	Andrea Bastoni	15 years
demo	Further refinements	Jonathan Herman	14 years
ecrts-pgm-final	Merge branch 'wip-ecrts14-pgm' of ssh://rtsrv.cs.unc.edu/home/litmus/litmus-r...	Glenn Elliott	12 years
ecrts14-pgm-final	Merge branch 'wip-ecrts14-pgm' of ssh://rtsrv.cs.unc.edu/home/litmus/litmus-r...	Glenn Elliott	12 years
gpusync-rtss12	Final GPUSync implementation.	Glenn Elliott	12 years
gpusync/staging	Rename IKGLP R2DGLP.	Glenn Elliott	12 years
linux-tip	Merge branch 'slab/urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/p...	Linus Torvalds	15 years
litmus2008-patch-series	add i386 feather-trace implementation	Bjoern B. Brandenburg	16 years
master	PSN-EDF: use inferred_sporadic_job_release_at	Bjoern Brandenburg	9 years
pgm	make it compile	Glenn Elliott	12 years
prop/litmus-signals	Infrastructure for Litmus signals.	Glenn Elliott	13 years
prop/robust-tie-break	Fixed bug in edf_higher_prio().	Glenn Elliott	13 years
staging	Fix tracepoint compilation error	Felipe Cerqueira	13 years
test	9/23/2016	Namhoon Kim	9 years
tracing-devel	Test kernel tracing events capabilities	Andrea Bastoni	16 years
v2.6.34-with-arm-patches	smsc911x: Add spinlocks around registers access	Catalin Marinas	15 years
v2015.1	Add ARM syscall def for get_current_budget	Bjoern Brandenburg	10 years
wip-2011.2-bbb	Litmus core: simplify np-section protocol	Bjoern B. Brandenburg	14 years
wip-2011.2-bbb-trace	Refactor sched_trace_log_message() -> debug_trace_log_message()	Andrea Bastoni	14 years
wip-2012.3-gpu	SOBLIV draining support for C-EDF.	Glenn Elliott	12 years
wip-2012.3-gpu-preport	pick up last C-RM file	Glenn Elliott	12 years
wip-2012.3-gpu-rtss13	Fix critical bug in GPU tracker.	Glenn Elliott	12 years
wip-2012.3-gpu-sobliv-budget-w-kshark	Proper sobliv draining and many bug fixes.	Glenn Elliott	12 years
wip-aedzl-final	Make it easier to compile AEDZL interfaces in liblitmus.	Glenn Elliott	15 years
wip-aedzl-revised	Add sched_trace data for Apative EDZL	Glenn Elliott	15 years
wip-arbit-deadline	Fix compilation bug.	Glenn Elliott	13 years
wip-aux-tasks	Description of refined aux task inheritance.	Glenn Elliott	13 years
wip-bbb	GSN-EDF & Core: improve debug TRACE'ing for NP sections	Bjoern B. Brandenburg	14 years
wip-bbb-prio-don	use correct timestamp	Bjoern B. Brandenburg	14 years
wip-better-break	Implement hash-based EDF tie-breaking.	Glenn Elliott	13 years
wip-binary-heap	Make C-EDF work with simplified binheap_delete	Glenn Elliott	13 years
wip-budget	Added support for choices in budget policy enforcement.	Glenn Elliott	15 years
wip-color	Summarize schedulability with final record	Jonathan Herman	13 years
wip-color-jlh	sched_color: Fixed two bugs causing crashing on experiment restart and a rare...	Jonathan Herman	13 years
wip-d10-hz1000	Enable HZ=1000 on District 10	Bjoern B. Brandenburg	15 years
wip-default-clustering	Feature: Make default C-EDF clustering compile-time configurable.	Glenn Elliott	15 years
wip-dissipation-jerickso	Update from 2.6.36 to 2.6.36.4	Jeremy Erickson	11 years
wip-dissipation2-jerickso	Update 2.6.36 to 2.6.36.4	Jeremy Erickson	11 years
wip-ecrts14-pgm	Merge branch 'wip-ecrts14-pgm' of ssh://rtsrv.cs.unc.edu/home/litmus/litmus-r...	Glenn Elliott	12 years
wip-edf-hsb	last tested version	Jonathan Herman	14 years
wip-edf-os	Lookup table EDF-os	Jeremy Erickson	12 years
wip-edf-tie-break	Merge branch 'wip-edf-tie-break' of ssh://rtsrv.cs.unc.edu/home/litmus/litmus...	Glenn Elliott	13 years
wip-edzl-critique	Use hr_timer's active checks instead of having own flag.	Glenn Elliott	15 years
wip-edzl-final	Implementation of the EDZL scheduler.	Glenn Elliott	15 years
wip-edzl-revised	Clean up comments.	Glenn Elliott	15 years
wip-events	Added support for tracing arbitrary actions.	Jonathan Herman	15 years
wip-extra-debug	DBG: add additional tracing	Bjoern B. Brandenburg	15 years
wip-fix-switch-jerickso	Attempt to fix race condition with plugin switching	Jeremy Erickson	15 years
wip-fix3	sched: show length of runqueue clock deactivation in /proc/sched_debug	Bjoern B. Brandenburg	15 years
wip-fmlp-dequeue	Improve FMLP queue management.	Glenn Elliott	14 years
wip-ft-irq-flag	Feather-Trace: keep track of interrupt-related interference.	Bjoern B. Brandenburg	14 years
wip-gpu-cleanup	Enable sched_trace log injection from userspace	Glenn Elliott	13 years
wip-gpu-interrupts	Remove option for threading of all softirqs.	Glenn Elliott	14 years
wip-gpu-rtas12	Generalized GPU cost predictors + EWMA. (untested)	Glenn Elliott	13 years
wip-gpu-rtss12	Final GPUSync implementation.	Glenn Elliott	13 years
wip-gpu-rtss12-srp	experimental changes to support GPUs under SRP	Glenn Elliott	13 years
wip-gpusync-merge	Cleanup priority tracking for budget enforcement.	Glenn Elliott	11 years
wip-ikglp	Move RSM and IKGLP imp. to own .c files	Glenn Elliott	13 years
wip-k-fmlp	Merge branch 'mpi-master' into wip-k-fmlp	Glenn Elliott	14 years
wip-kernel-coloring	Added recolor syscall	Namhoon Kim	7 years
wip-kernthreads	Kludge work-queue processing into klitirqd.	Glenn Elliott	15 years
wip-klmirqd-to-aux	Allow klmirqd threads to be given names.	Glenn Elliott	13 years
wip-kshark	Merge branch 'mpi-staging' into wip-kshark	Jonathan Herman	13 years
wip-litmus-3.2	Merge commit 'v3.2' into litmus-staging	Andrea Bastoni	13 years
wip-litmus2011.2	Cleanup: Coding conformance for affinity stuff.	Glenn Elliott	14 years
wip-litmus3.0-2011.2	Feather-Trace: keep track of interrupt-related interference.	Bjoern B. Brandenburg	14 years
wip-master-2.6.33-rt	Avoid deadlock when switching task policy to BACKGROUND (ugly)	Andrea Bastoni	15 years
wip-mc	Removed ARM-specific hacks which disabled less common mixed-criticality featu...	Jonathan Herman	12 years
wip-mc-bipasa	MC-EDF added	bipasa chattopadhyay	13 years
wip-mc-jerickso	Split C/D queues	Jeremy Erickson	15 years
wip-mc2-cache-slack	Manually patched mc^2 related code	Ming Yang	10 years
wip-mcrit-mac	cosmetic	Mac Mollison	15 years
wip-merge-3.0	Prevent Linux to send IPI and queue tasks on remote CPUs.	Andrea Bastoni	14 years
wip-merge-v3.0	Prevent Linux to send IPI and queue tasks on remote CPUs.	Andrea Bastoni	14 years
wip-migration-affinity	NULL affinity dereference in C-EDF.	Glenn Elliott	14 years
wip-mmap-uncache	share branch with others	Glenn Elliott	13 years
wip-modechange	RTSS 2017 submission	Namhoon Kim	8 years
wip-nested-locking	Appears to be working.	Bryan Ward	12 years
wip-omlp-gedf	First implementation of G-OMLP.	Glenn Elliott	15 years
wip-pai	Some cleanup of PAI	Glenn Elliott	14 years
wip-percore-lib	9/21/2016	Namhoon Kim	9 years
wip-performance	CONFIG_DONT_PREEMPT_ON_TIE: Don't preeempt a scheduled task on priority tie.	Glenn Elliott	14 years
wip-pgm	Add PGM support to C-FL	Glenn Elliott	12 years
wip-pgm-split	First draft of C-FL-split	Namhoon Kim	12 years
wip-pm-ovd	Add preemption-and-migration overhead tracing support	Andrea Bastoni	15 years
wip-prio-inh	P-EDF updated to use the generic pi framework.	Glenn Elliott	15 years
wip-prioq-dgl	BUG FIX: Support DGLs with PRIOQ_MUTEX	Glenn Elliott	13 years
wip-refactored-gedf	Generalizd architecture for GEDF-style scheduelrs to reduce code redundancy.	Glenn Elliott	15 years
wip-release-master-fix	bugfix: release master CPU must signal task was picked	Bjoern B. Brandenburg	14 years
wip-robust-tie-break	EDF priority tie-breaks.	Glenn Elliott	13 years
wip-rt-kshark	Move task time accounting into the complete_job method.	Jonathan Herman	13 years
wip-rtas12-pgm	Scheduling of PGM jobs.	Glenn Elliott	13 years
wip-semi-part	Fix compile error with newer GCC	Jeremy Erickson	12 years
wip-semi-part-edfos-jerickso	Use initial CPU set by client	Jeremy Erickson	12 years
wip-shared-lib	TODO: Fix condition checks in replicate_page_move_mapping()	Namhoon Kim	9 years
wip-shared-lib2	RTAS 2017 Submission ver.	Namhoon Kim	9 years
wip-shared-mem	Initial commit for shared library	Namhoon Kim	9 years
wip-splitting-jerickso	Fix release behavior	Jeremy Erickson	13 years
wip-splitting-omlp-jerickso	Bjoern's Dissertation Code with Priority Donation	Jeremy Erickson	13 years
wip-stage-binheap	An efficient binary heap implementation.	Glenn Elliott	13 years
wip-sun-port	Dynamic memory allocation and clean exit for FeatherTrace	Christopher Kenna	15 years
wip-timer-trace	bugfix: C-EDF, clear scheduled field of the correct CPU upon task_exit	Andrea Bastoni	15 years
wip-tracepoints	Add kernel-style events for sched_trace_XXX() functions	Andrea Bastoni	14 years

Tag	Download	Author	Age
2015.1	commit 8e51b37822...	Bjoern Brandenburg	10 years
2013.1	commit bcaacec1ca...	Glenn Elliott	12 years
2012.3	commit c158b5fbe4...	Jonathan Herman	13 years
2012.2	commit b53c479a0f...	Glenn Elliott	13 years
2012.1	commit 83b11ea1c6...	Bjoern B. Brandenburg	14 years
rtas12-mc-beta-exp	commit 8e236ee20f...	Christopher Kenna	14 years
2011.1	commit d11808b5c6...	Christopher Kenna	15 years
v2.6.37-rc4	commit e8a7e48bb2...	Linus Torvalds	15 years
v2.6.37-rc3	commit 3561d43fd2...	Linus Torvalds	15 years
v2.6.37-rc2	commit e53beacd23...	Linus Torvalds	15 years
v2.6.37-rc1	commit c8ddb2713c...	Linus Torvalds	15 years
v2.6.36	commit f6f94e2ab1...	Linus Torvalds	15 years
2010.2	commit 5c5456402d...	Bjoern B. Brandenburg	15 years
v2.6.36-rc8	commit cd07202cc8...	Linus Torvalds	15 years
v2.6.36-rc7	commit cb655d0f3d...	Linus Torvalds	15 years
v2.6.36-rc6	commit 899611ee7d...	Linus Torvalds	15 years
v2.6.36-rc5	commit b30a3f6257...	Linus Torvalds	15 years
v2.6.36-rc4	commit 49553c2ef8...	Linus Torvalds	15 years
v2.6.36-rc3	commit 2bfc96a127...	Linus Torvalds	15 years
v2.6.36-rc2	commit 76be97c1fc...	Linus Torvalds	15 years
v2.6.36-rc1	commit da5cabf80e...	Linus Torvalds	15 years
v2.6.35	commit 9fe6206f40...	Linus Torvalds	15 years
v2.6.35-rc6	commit b37fa16e78...	Linus Torvalds	15 years
v2.6.35-rc5	commit 1c5474a65b...	Linus Torvalds	15 years
v2.6.35-rc4	commit 815c4163b6...	Linus Torvalds	15 years
v2.6.35-rc3	commit 7e27d6e778...	Linus Torvalds	15 years
v2.6.35-rc2	commit e44a21b726...	Linus Torvalds	15 years
v2.6.35-rc1	commit 67a3e12b05...	Linus Torvalds	15 years
2010.1	commit 7c1ff4c544...	Andrea Bastoni	15 years
v2.6.34	commit e40152ee1e...	Linus Torvalds	15 years
v2.6.33.4	commit 4640b4e7d9...	Greg Kroah-Hartman	15 years
v2.6.34-rc7	commit b57f95a382...	Linus Torvalds	15 years
v2.6.34-rc6	commit 66f41d4c5c...	Linus Torvalds	15 years
v2.6.33.3	commit 3e7ad8ed97...	Greg Kroah-Hartman	15 years
v2.6.34-rc5	commit 01bf0b6457...	Linus Torvalds	15 years
v2.6.34-rc4	commit 0d0fb0f9c5...	Linus Torvalds	15 years
v2.6.33.2	commit 19f00f070c...	Greg Kroah-Hartman	15 years
v2.6.34-rc3	commit 2eaa9cfdf3...	Linus Torvalds	15 years
v2.6.34-rc2	commit 220bf991b0...	Linus Torvalds	15 years
v2.6.33.1	commit dbdafe5ccf...	Greg Kroah-Hartman	15 years
v2.6.34-rc1	commit 57d54889cd...	Linus Torvalds	16 years
v2.6.33	commit 60b341b778...	Linus Torvalds	16 years
v2.6.33-rc8	commit 724e6d3fe8...	Linus Torvalds	16 years
v2.6.33-rc7	commit 29275254ca...	Linus Torvalds	16 years
v2.6.33-rc6	commit abe94c756c...	Linus Torvalds	16 years
v2.6.33-rc5	commit 92dcffb916...	Linus Torvalds	16 years
v2.6.33-rc4	commit 7284ce6c9f...	Linus Torvalds	16 years
v2.6.33-rc3	commit 74d2e4f8d7...	Linus Torvalds	16 years
v2.6.33-rc2	commit 6b7b284958...	Linus Torvalds	16 years
v2.6.33-rc1	commit 55639353a0...	Linus Torvalds	16 years
v2.6.32	commit 22763c5cf3...	Linus Torvalds	16 years
v2.6.32-rc8	commit 648f4e3e50...	Linus Torvalds	16 years
v2.6.32-rc7	commit 156171c71a...	Linus Torvalds	16 years
v2.6.32-rc6	commit b419148e56...	Linus Torvalds	16 years
v2.6.32-rc5	commit 012abeea66...	Linus Torvalds	16 years
v2.6.32-rc4	commit 161291396e...	Linus Torvalds	16 years
v2.6.32-rc3	commit 374576a8b6...	Linus Torvalds	16 years
v2.6.32-rc1	commit 17d857be64...	Linus Torvalds	16 years
v2.6.32-rc2	commit 17d857be64...	Linus Torvalds	16 years
v2.6.31	commit 74fca6a428...	Linus Torvalds	16 years
v2.6.31-rc9	commit e07cccf404...	Linus Torvalds	16 years
v2.6.31-rc8	commit 326ba5010a...	Linus Torvalds	16 years
v2.6.31-rc7	commit 422bef879e...	Linus Torvalds	16 years
v2.6.31-rc6	commit 64f1607ffb...	Linus Torvalds	16 years
v2.6.31-rc5	commit ed680c4ad4...	Linus Torvalds	16 years
v2.6.31-rc4	commit 4be3bd7849...	Linus Torvalds	16 years
v2.6.31-rc3	commit 6847e154e3...	Linus Torvalds	16 years
v2.6.31-rc2	commit 8e4a718ff3...	Linus Torvalds	16 years
v2.6.31-rc1	commit 28d0325ce6...	Linus Torvalds	16 years
v2.6.30	commit 07a2039b8e...	Linus Torvalds	16 years
v2.6.30-rc8	commit 9fa7eb283c...	Linus Torvalds	16 years
v2.6.30-rc7	commit 59a3759d0f...	Linus Torvalds	16 years
v2.6.30-rc6	commit 1406de8e11...	Linus Torvalds	16 years
v2.6.30-rc5	commit 091bf7624d...	Linus Torvalds	16 years
v2.6.30-rc4	commit 091438dd56...	Linus Torvalds	16 years
v2.6.30-rc3	commit 0910697403...	Linus Torvalds	16 years
v2.6.30-rc2	commit 0882e8dd3a...	Linus Torvalds	16 years
v2.6.30-rc1	commit 577c9c456f...	Linus Torvalds	16 years
v2.6.29	commit 8e0ee43bc2...	Linus Torvalds	16 years
v2.6.29-rc8	commit 041b62374c...	Linus Torvalds	16 years
v2.6.29-rc7	commit fec6c6fec3...	Linus Torvalds	17 years
v2.6.29-rc6	commit 20f4d6c3a2...	Linus Torvalds	17 years
v2.6.29-rc5	commit d2f8d7ee1a...	Linus Torvalds	17 years
v2.6.29-rc4	commit 8e4921515c...	Linus Torvalds	17 years
v2.6.29-rc3	commit 18e352e4a7...	Linus Torvalds	17 years
v2.6.29-rc2	commit 1de9e8e70f...	Linus Torvalds	17 years
v2.6.29-rc1	commit c59765042f...	Linus Torvalds	17 years
v2.6.28	commit 4a6908a3a0...	Linus Torvalds	17 years
v2.6.28-rc9	commit 929096fe9f...	Linus Torvalds	17 years
v2.6.28-rc8	commit 8b1fae4e42...	Linus Torvalds	17 years
v2.6.28-rc7	commit 061e41fdb5...	Linus Torvalds	17 years
v2.6.28-rc6	commit 13d428afc0...	Linus Torvalds	17 years
v2.6.28-rc5	commit 9bf1a2445f...	Linus Torvalds	17 years
v2.6.28-rc4	commit f7160c7573...	Linus Torvalds	17 years
v2.6.28-rc3	commit 45beca08dd...	Linus Torvalds	17 years
v2.6.28-rc2	commit 0173a3265b...	Linus Torvalds	17 years
v2.6.28-rc1	commit 57f8f7b60d...	Linus Torvalds	17 years
v2.6.27	commit 3fa8749e58...	Linus Torvalds	17 years
v2.6.27-rc9	commit 4330ed8ed4...	Linus Torvalds	17 years
v2.6.27-rc8	commit 94aca1dac6...	Linus Torvalds	17 years
v2.6.27-rc7	commit 72d31053f6...	Linus Torvalds	17 years
v2.6.27-rc6	commit adee14b2e1...	Linus Torvalds	17 years
v2.6.27-rc5	commit 24342c34a0...	Linus Torvalds	17 years
v2.6.27-rc4	commit 6a55617ed5...	Linus Torvalds	17 years
v2.6.27-rc3	commit 30a2f3c60a...	Linus Torvalds	17 years
v2.6.27-rc2	commit 0967d61ea0...	Linus Torvalds	17 years
v2.6.27-rc1	commit 6e86841d05...	Linus Torvalds	17 years
v2.6.26	commit bce7f793da...	Linus Torvalds	17 years
v2.6.26-rc9	commit b7279469d6...	Linus Torvalds	17 years
v2.6.26-rc8	commit 543cf4cb3f...	Linus Torvalds	17 years
v2.6.26-rc7	commit d70ac829b7...	Linus Torvalds	17 years
v2.6.26-rc6	commit 5dd34572ad...	Linus Torvalds	17 years
v2.6.26-rc5	commit 53c8ba9540...	Linus Torvalds	17 years
v2.6.26-rc4	commit e490517a03...	Linus Torvalds	17 years
v2.6.26-rc3	commit b8291ad07a...	Linus Torvalds	17 years
v2.6.26-rc2	commit 492c2e476e...	Linus Torvalds	17 years
v2.6.26-rc1	commit 2ddcca36c8...	Linus Torvalds	17 years
v2.6.25	commit 4b119e21d0...	Linus Torvalds	17 years
v2.6.25-rc9	commit 120dd64cac...	Linus Torvalds	17 years
v2.6.25-rc8	commit 0e81a8ae37...	Linus Torvalds	17 years
v2.6.25-rc7	commit 05dda977f2...	Linus Torvalds	17 years
v2.6.25-rc6	commit a978b30af3...	Linus Torvalds	17 years
v2.6.25-rc5	commit cdeeeae056...	Linus Torvalds	18 years
v2.6.25-rc4	commit 29e8c3c304...	Linus Torvalds	18 years
v2.6.25-rc3	commit bfa274e243...	Linus Torvalds	18 years
v2.6.25-rc2	commit 101142c37b...	Linus Torvalds	18 years
v2.6.25-rc1	commit 19af35546d...	Linus Torvalds	18 years
v2.6.24	commit 49914084e7...	Linus Torvalds	18 years
v2.6.24-rc8	commit cbd9c88369...	Linus Torvalds	18 years
v2.6.24-rc7	commit 3ce5445046...	Linus Torvalds	18 years
v2.6.24-rc6	commit ea67db4cdb...	Linus Torvalds	18 years
v2.6.24-rc5	commit 82d29bf6dc...	Linus Torvalds	18 years
v2.6.24-rc4	commit 09b56adc98...	Linus Torvalds	18 years
v2.6.24-rc3	commit d9f8bcbf67...	Linus Torvalds	18 years
v2.6.24-rc2	commit dbeeb816e8...	Linus Torvalds	18 years
v2.6.24-rc1	commit c9927c2bf4...	Linus Torvalds	18 years
v2.6.23	commit bbf25010f1...	Linus Torvalds	18 years
v2.6.23-rc9	commit 3146b39c18...	Linus Torvalds	18 years
v2.6.23-rc8	commit 4942de4a0e...	Linus Torvalds	18 years
v2.6.23-rc7	commit 81cfe79b9c...	Linus Torvalds	18 years
v2.6.23-rc6	commit 0d4cbb5e7f...	Linus Torvalds	18 years
v2.6.23-rc5	commit 40ffbfad6b...	Linus Torvalds	18 years
v2.6.23-rc4	commit b07d68b5ca...	Linus Torvalds	18 years
v2.6.23-rc3	commit 39d3520c92...	Linus Torvalds	18 years
v2.6.23-rc2	commit d4ac2477fa...	Linus Torvalds	18 years
v2.6.23-rc1	commit f695baf2df...	Linus Torvalds	18 years
v2.6.22	commit 7dcca30a32...	Linus Torvalds	18 years
v2.6.22-rc7	commit a38d6181ff...	Linus Torvalds	18 years
v2.6.22-rc6	commit 189548642c...	Linus Torvalds	18 years
v2.6.22-rc5	commit 188e1f81ba...	Linus Torvalds	18 years
v2.6.22-rc4	commit 5ecd3100e6...	Linus Torvalds	18 years
v2.6.22-rc3	commit c420bc9f09...	Linus Torvalds	18 years
v2.6.22-rc2	commit 55b637c6a0...	Linus Torvalds	18 years
v2.6.22-rc1	commit 39403865d2...	Linus Torvalds	18 years
v2.6.21	commit de46c33745...	Linus Torvalds	18 years
v2.6.21-rc7	commit 94a05509a9...	Linus Torvalds	18 years
v2.6.21-rc6	commit a21bd69e15...	Linus Torvalds	18 years
v2.6.21-rc5	commit e0f2e3a06b...	Linus Torvalds	18 years
v2.6.21-rc4	commit db98e0b434...	Linus Torvalds	18 years
v2.6.21-rc3	commit 08e15e81a4...	Linus Torvalds	19 years
v2.6.21-rc2	commit 606135a308...	Linus Torvalds	19 years
v2.6.21-rc1	commit c8f71b01a5...	Linus Torvalds	19 years
v2.6.20	commit 62d0cfcb27...	Linus Torvalds	19 years
v2.6.20-rc7	commit f56df2f4db...	Linus Torvalds	19 years
v2.6.20-rc6	commit 99abfeafb5...	Linus Torvalds	19 years
v2.6.20-rc5	commit a8b3485287...	Linus Torvalds	19 years
v2.6.20-rc4	commit bf81b46482...	Linus Torvalds	19 years
v2.6.20-rc3	commit 669df1b478...	Linus Torvalds	19 years
v2.6.20-rc2	commit 3bf8ba38f3...	Linus Torvalds	19 years
v2.6.20-rc1	commit cc016448b0...	Linus Torvalds	19 years
v2.6.19	commit 0215ffb08c...	Linus Torvalds	19 years
v2.6.19-rc6	commit 44597f65f6...	Linus Torvalds	19 years
v2.6.19-rc5	commit 80c2188127...	Linus Torvalds	19 years
v2.6.19-rc4	commit ae99a78af3...	Linus Torvalds	19 years
v2.6.19-rc3	commit 7059abedd2...	Linus Torvalds	19 years
v2.6.19-rc2	commit b4bd8c6643...	Linus Torvalds	19 years
v2.6.19-rc1	commit d223a60106...	Linus Torvalds	19 years
v2.6.18	commit e478bec0ba...	Linus Torvalds	19 years
v2.6.18-rc7	commit 95064a75eb...	Linus Torvalds	19 years
v2.6.18-rc6	commit c336923b66...	Linus Torvalds	19 years
v2.6.18-rc5	commit 60d4684068...	Linus Torvalds	19 years
v2.6.18-rc4	commit 9f737633e6...	Linus Torvalds	19 years
v2.6.18-rc3	commit b6ff50833a...	Linus Torvalds	19 years
v2.6.18-rc2	commit 82d6897fef...	Linus Torvalds	19 years
v2.6.18-rc1	commit 120bda20c6...	Linus Torvalds	19 years
v2.6.17	commit 427abfa28a...	Linus Torvalds	19 years
v2.6.17-rc6	commit 1def630a6a...	Linus Torvalds	19 years
v2.6.17-rc5	commit a8bd60705a...	Linus Torvalds	19 years
v2.6.17-rc4	commit d8c3291c73...	Linus Torvalds	19 years
v2.6.17-rc3	commit 2be4d50295...	Linus Torvalds	19 years
v2.6.17-rc2	commit 8bbde0e6d5...	Linus Torvalds	19 years
v2.6.17-rc1	commit 6246b6128b...	Linus Torvalds	19 years
v2.6.16	commit 7705a8792b...	Linus Torvalds	19 years
v2.6.16-rc6	commit 535744878e...	Linus Torvalds	20 years
v2.6.16-rc5	commit b9a33cebac...	Linus Torvalds	20 years
v2.6.16-rc4	commit bd71c2b174...	Linus Torvalds	20 years
v2.6.16-rc3	commit e9bb4c9929...	Linus Torvalds	20 years
v2.6.16-rc2	commit 826eeb53a6...	Linus Torvalds	20 years
v2.6.16-rc1	commit 2664b25051...	Linus Torvalds	20 years
v2.6.15	commit 88026842b0...	Linus Torvalds	20 years
v2.6.15-rc7	commit f89f5948fc...	Linus Torvalds	20 years
v2.6.15-rc6	commit df7addbb45...	Linus Torvalds	20 years
v2.6.15-rc5	commit 436b0f76f2...	Linus Torvalds	20 years
v2.6.15-rc4	commit 5666c0947e...	Linus Torvalds	20 years
v2.6.15-rc3	commit 624f54be20...	Linus Torvalds	20 years
v2.6.15-rc2	commit 3bedff1d73...	Linus Torvalds	20 years
v2.6.15-rc1	commit cd52d1ee9a...	Linus Torvalds	20 years
v2.6.14	commit 741b2252a5...	Linus Torvalds	20 years
v2.6.14-rc5	commit 93918e9afc...	Linus Torvalds	20 years
v2.6.14-rc4	commit 907a426179...	Linus Torvalds	20 years
v2.6.14-rc3	commit 1c9426e8a5...	Linus Torvalds	20 years
v2.6.14-rc2	commit 676d55ae30...	Linus Torvalds	20 years
v2.6.14-rc1	commit 2f4ba45a75...	Linus Torvalds	20 years
v2.6.13	commit 02b3e4e2d7...	Linus Torvalds	20 years
v2.6.13-rc7	commit 0572e3da3f...	Linus Torvalds	20 years
v2.6.13-rc6	commit 6fc32179de...	Linus Torvalds	20 years
v2.6.13-rc5	commit 9a351e30d7...	Linus Torvalds	20 years
v2.6.13-rc4	commit 6395352334...	Linus Torvalds	20 years
v2.6.11	tree c39ae07f39...
v2.6.11-tree	tree c39ae07f39...
v2.6.12	commit 9ee1c939d1...
v2.6.12-rc2	commit 1da177e4c3...
v2.6.12-rc3	commit a2755a80f4...
v2.6.12-rc4	commit 88d7bd8cb9...
v2.6.12-rc5	commit 2a24ab628a...
v2.6.12-rc6	commit 7cef5677ef...
v2.6.13-rc1	commit 4c91aedb75...
v2.6.13-rc2	commit a18bcb7450...
v2.6.13-rc3	commit c32511e271...

/* * kernel/sched/core.c * * Kernel scheduler and related syscalls * * Copyright (C) 1991-2002 Linus Torvalds * * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and * make semaphores SMP safe * 1998-11-19 Implemented schedule_timeout() and related stuff * by Andrea Arcangeli * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: * hybrid priority-list and round-robin design with * an array-switch method of distributing timeslices * and per-CPU runqueues. Cleanups and useful suggestions * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin * 2007-04-15 Work begun on replacing all interactivity tuning with a * fair scheduling design by Con Kolivas. * 2007-05-05 Load balancing (smp-nice) and other improvements * by Peter Williams * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, * Thomas Gleixner, Mike Kravetz */ #include <linux/mm.h> #include <linux/module.h> #include <linux/nmi.h> #include <linux/init.h> #include <linux/uaccess.h> #include <linux/highmem.h> #include <asm/mmu_context.h> #include <linux/interrupt.h> #include <linux/capability.h> #include <linux/completion.h> #include <linux/kernel_stat.h> #include <linux/debug_locks.h> #include <linux/perf_event.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/profile.h> #include <linux/freezer.h> #include <linux/vmalloc.h> #include <linux/blkdev.h> #include <linux/delay.h> #include <linux/pid_namespace.h> #include <linux/smp.h> #include <linux/threads.h> #include <linux/timer.h> #include <linux/rcupdate.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/percpu.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sysctl.h> #include <linux/syscalls.h> #include <linux/times.h> 
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

/*
 * start_bandwidth_timer - make sure @period_timer is armed.
 *
 * Loops until the timer reports active: each pass forwards the expiry by
 * @period relative to the timer's own clock and restarts it (absolute,
 * pinned) over the resulting soft..hard expiry range.
 */
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
	unsigned long delta;
	ktime_t soft, hard, now;

	for (;;) {
		if (hrtimer_active(period_timer))
			break;

		now = hrtimer_cb_get_time(period_timer);
		hrtimer_forward(period_timer, now, period);

		soft = hrtimer_get_softexpires(period_timer);
		hard = hrtimer_get_expires(period_timer);
		/* slack window handed to the hrtimer core, in ns */
		delta = ktime_to_ns(ktime_sub(hard, soft));
		__hrtimer_start_range_ns(period_timer, soft, delta,
					 HRTIMER_MODE_ABS_PINNED, 0);
	}
}

DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static void update_rq_clock_task(struct rq *rq, s64 delta);

/*
 * update_rq_clock - advance rq->clock to sched_clock_cpu() of the rq's CPU
 * and feed the elapsed delta to update_rq_clock_task().
 *
 * Does nothing while rq->skip_clock_update is positive.
 */
void update_rq_clock(struct rq *rq)
{
	s64 delta;

	if (rq->skip_clock_update > 0)
		return;

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}

/*
 * Debugging: various feature bits
 */

/* Each features.h entry expands to "(bit * enabled) |"; the list ends in 0. */
#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;

#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
/* Second expansion of features.h: stringify every feature name. */
#define SCHED_FEAT(name, enabled)	\
	#name ,

static const char * const sched_feat_names[] = {
#include "features.h"
};

#undef SCHED_FEAT

/*
 * seq_file show callback: print every feature name on one line, prefixed
 * with "NO_" when its bit is clear in sysctl_sched_features.
 */
static int sched_feat_show(struct seq_file *m, void *v)
{
	int i;

	for (i = 0; i < __SCHED_FEAT_NR; i++) {
		if (!(sysctl_sched_features &
(1UL << i))) seq_puts(m, "NO_"); seq_printf(m, "%s ", sched_feat_names[i]); } seq_puts(m, "\n"); return 0; } #ifdef HAVE_JUMP_LABEL #define jump_label_key__true STATIC_KEY_INIT_TRUE #define jump_label_key__false STATIC_KEY_INIT_FALSE #define SCHED_FEAT(name, enabled) \ jump_label_key__##enabled , struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { #include "features.h" }; #undef SCHED_FEAT static void sched_feat_disable(int i) { if (static_key_enabled(&sched_feat_keys[i])) static_key_slow_dec(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { if (!static_key_enabled(&sched_feat_keys[i])) static_key_slow_inc(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; #endif /* HAVE_JUMP_LABEL */ static int sched_feat_set(char *cmp) { int i; int neg = 0; if (strncmp(cmp, "NO_", 3) == 0) { neg = 1; cmp += 3; } for (i = 0; i < __SCHED_FEAT_NR; i++) { if (strcmp(cmp, sched_feat_names[i]) == 0) { if (neg) { sysctl_sched_features &= ~(1UL << i); sched_feat_disable(i); } else { sysctl_sched_features |= (1UL << i); sched_feat_enable(i); } break; } } return i; } static ssize_t sched_feat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; char *cmp; int i; if (cnt > 63) cnt = 63; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; cmp = strstrip(buf); i = sched_feat_set(cmp); if (i == __SCHED_FEAT_NR) return -EINVAL; *ppos += cnt; return cnt; } static int sched_feat_open(struct inode *inode, struct file *filp) { return single_open(filp, sched_feat_show, NULL); } static const struct file_operations sched_feat_fops = { .open = sched_feat_open, .write = sched_feat_write, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static __init int sched_init_debug(void) { debugfs_create_file("sched_features", 0644, NULL, NULL, &sched_feat_fops); return 0; } late_initcall(sched_init_debug); #endif /* CONFIG_SCHED_DEBUG */ /* * Number of tasks to 
iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

/*
 * __task_rq_lock - lock the rq @p resides on.
 *
 * The caller must already hold p->pi_lock (checked through lockdep).  The
 * lookup is retried until task_rq(p) still names the runqueue whose lock
 * was just taken.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
	__acquires(rq->lock)
{
	struct rq *rq;

	lockdep_assert_held(&p->pi_lock);

	for (;;) {
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		/* @p changed runqueue between the lookup and the lock: retry */
		raw_spin_unlock(&rq->lock);
	}
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 *
 * Interrupt state is saved in *@flags.  Both locks are dropped and the
 * sequence is retried when @p changed runqueue before rq->lock was taken.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
	__acquires(p->pi_lock)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, *flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
	}
}

/* Release the rq->lock taken by __task_rq_lock(). */
static void __task_rq_unlock(struct rq *rq)
	__releases(rq->lock)
{
	raw_spin_unlock(&rq->lock);
}

/*
 * Undo task_rq_lock(): drop rq->lock, then p->pi_lock, restoring the
 * interrupt state saved in *@flags.
 */
static inline void
task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
	__releases(rq->lock)
	__releases(p->pi_lock)
{
	raw_spin_unlock(&rq->lock);
	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	raw_spin_lock(&rq->lock);

	return rq;
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
*
 * It's all a bit involved since we cannot program an hrt while holding the
 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
 * reschedule event.
 *
 * When we get rescheduled we reprogram the hrtick_timer outside of the
 * rq->lock.
 */

/* Cancel the runqueue's hrtick timer if it is currently active. */
static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 *
 * Invokes the current task's sched_class->task_tick() under rq->lock and
 * never re-arms itself (HRTIMER_NORESTART).
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	raw_spin_unlock(&rq->lock);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP
/*
 * called from hardirq (IPI) context
 *
 * @arg is the runqueue whose hrtick timer is to be restarted; clears the
 * pending flag set by hrtick_start().
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;

	raw_spin_lock(&rq->lock);
	hrtimer_restart(&rq->hrtick_timer);
	rq->hrtick_csd_pending = 0;
	raw_spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 *
 * The expiry is set to "now + @delay" (ns).  For the local runqueue the
 * timer is restarted directly; otherwise a single-CPU function call is
 * queued to the owning CPU unless one is already pending.
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		hrtimer_restart(timer);
	} else if (!rq->hrtick_csd_pending) {
		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
		rq->hrtick_csd_pending = 1;
	}
}

/*
 * CPU hotplug notifier: cancel the affected CPU's hrtick timer on the
 * UP_CANCELED, DOWN_PREPARE and DEAD events (and their _FROZEN variants).
 */
static int
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		hrtick_clear(cpu_rq(cpu));
		return NOTIFY_OK;
	}

	return NOTIFY_DONE;
}

/* Register hotplug_hrtick() as a CPU hotplug notifier. */
static __init void init_hrtick(void)
{
	hotcpu_notifier(hotplug_hrtick, 0);
}
#else
/*
 * Called to set the hrtick timer state.
*
 * called with rq->lock held and irqs disabled
 *
 * UP variant: start the timer relative to now, @delay ns ahead.
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
			HRTIMER_MODE_REL_PINNED, 0);
}

static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */

/*
 * Initialise the per-runqueue hrtick state: the cross-CPU call descriptor
 * (SMP only) and the hrtimer itself, whose callback is hrtick().
 */
static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rq->hrtick_timer.function = hrtick;
}
#else	/* CONFIG_SCHED_HRTICK */
/* Without CONFIG_SCHED_HRTICK all hrtick hooks are no-ops. */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}

static inline void init_hrtick(void)
{
}
#endif	/* CONFIG_SCHED_HRTICK */

/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
#ifdef CONFIG_SMP

#ifndef tsk_is_polling
#define tsk_is_polling(t) 0
#endif

void resched_task(struct task_struct *p)
{
	int cpu;

	assert_raw_spin_locked(&task_rq(p)->lock);

	/* already marked: nothing more to do */
	if (test_tsk_need_resched(p))
		return;

	set_tsk_need_resched(p);

	/* no IPI needed when @p runs on this CPU */
	cpu = task_cpu(p);
	if (cpu == smp_processor_id())
		return;

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(p))
		smp_send_reschedule(cpu);
}

/*
 * Request a reschedule of whatever task is current on @cpu.  Best effort:
 * silently does nothing when that runqueue's lock cannot be taken
 * immediately.
 */
void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_task(cpu_curr(cpu));
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_NO_HZ
/*
 * In the semi idle case, use the nearest busy cpu for migrating timers
 * from an idle cpu.  This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle cpu will add more delays to the timers than intended
 * (as that cpu's timer base may not be up to date wrt jiffies etc).
*/
int get_nohz_timer_target(void)
{
	int cpu = smp_processor_id();
	int i;
	struct sched_domain *sd;

	rcu_read_lock();
	/* first non-idle CPU found while walking this CPU's domains wins */
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (!idle_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}
unlock:
	rcu_read_unlock();
	/* falls back to the current CPU when every CPU visited was idle */
	return cpu;
}
/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	/*
	 * This is safe, as this function is called with the timer
	 * wheel base lock of (cpu) held. When the CPU is on the way
	 * to idle and has not yet set rq->curr to idle then it will
	 * be serialized on the timer wheel base lock and take the new
	 * timer into account automatically.
	 */
	if (rq->curr != rq->idle)
		return;

	/*
	 * We can set TIF_RESCHED on the idle task of the other CPU
	 * lockless.
The worst case is that the other CPU runs the
	 * idle task through an additional NOOP schedule()
	 */
	set_tsk_need_resched(rq->idle);

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(rq->idle))
		smp_send_reschedule(cpu);
}

/*
 * True when this CPU is idle and its NOHZ_BALANCE_KICK flag is set.
 */
static inline bool got_nohz_idle_kick(void)
{
	int cpu = smp_processor_id();
	return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
}

#else /* CONFIG_NO_HZ */

static inline bool got_nohz_idle_kick(void)
{
	return false;
}

#endif /* CONFIG_NO_HZ */

/*
 * Age rq->rt_avg: for every full sched_avg_period() that has elapsed since
 * rq->age_stamp, advance the stamp by one period and halve rq->rt_avg.
 */
void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	while ((s64)(rq->clock - rq->age_stamp) > period) {
		/*
		 * Inline assembly required to prevent the compiler
		 * optimising this loop into a divmod call.
		 * See __iter_div_u64_rem() for another example of this.
		 */
		asm("" : "+rm" (rq->age_stamp));
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}

#else /* !CONFIG_SMP */
/* UP variant: setting the need_resched flag is all that is required. */
void resched_task(struct task_struct *p)
{
	assert_raw_spin_locked(&task_rq(p)->lock);
	set_tsk_need_resched(p);
}
#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
*
 * The walk stops early and returns the first non-zero value produced by
 * either visitor; otherwise it returns the value of the final @up call.
 */
int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		/* descend into the child first ... */
		parent = child;
		goto down;

		/* ... and resume the sibling iteration here on the way up */
up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}

/* Visitor that does nothing; always returns 0. */
int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

/*
 * Set p->se.load (weight and inverse weight) from the task's static
 * priority, or to the fixed idle-priority values for SCHED_IDLE tasks.
 */
static void set_load_weight(struct task_struct *p)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (p->policy == SCHED_IDLE) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	load->weight = scale_load(prio_to_weight[prio]);
	load->inv_weight = prio_to_wmult[prio];
}

/* Update the rq clock and scheduling info, then enqueue via @p's class. */
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_queued(p);
	p->sched_class->enqueue_task(rq, p, flags);
}

/* Update the rq clock and scheduling info, then dequeue via @p's class. */
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_dequeued(p);
	p->sched_class->dequeue_task(rq, p, flags);
}

/*
 * Put @p on @rq; a task that contributes to load stops being counted in
 * rq->nr_uninterruptible.
 */
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}

/*
 * Take @p off @rq; a task that contributes to load starts being counted in
 * rq->nr_uninterruptible.
 */
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
*/
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight misattribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		u64 st;

		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		/* never account more steal time than actually elapsed */
		if (unlikely(steal > delta))
			steal = delta;

		/* round down to whole ticks before accounting */
		st = steal_ticks(steal);
		steal = st * TICK_NSEC;

		rq->prev_steal_time_rq += steal;

		delta -= steal;
	}
#endif

	/* whatever remains after irq and steal time is task time */
	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

/*
 * Install @stop as the stop task of @cpu (NULL removes it) and hand the
 * previously installed task, if any, back to the RT scheduling class.
 */
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, it's something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
*/ sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); stop->sched_class = &stop_sched_class; } cpu_rq(cpu)->stop = stop; if (old_stop) { /* * Reset it back to a normal scheduling class so that * it can die in pieces. */ old_stop->sched_class = &rt_sched_class; } } /* * __normal_prio - return the priority that is based on the static prio */ static inline int __normal_prio(struct task_struct *p) { return p->static_prio; } /* * Calculate the expected normal priority: i.e. priority * without taking RT-inheritance into account. Might be * boosted by interactivity modifiers. Changes upon fork, * setprio syscalls, and whenever the interactivity * estimator recalculates. */ static inline int normal_prio(struct task_struct *p) { int prio; if (task_has_rt_policy(p)) prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); return prio; } /* * Calculate the current priority, i.e. the priority * taken into account by the scheduler. This value might * be boosted by RT tasks, or might be boosted by * interactivity modifiers. Will be RT if the task got * RT-boosted. If not then it returns p->normal_prio. */ static int effective_prio(struct task_struct *p) { p->normal_prio = normal_prio(p); /* * If we are RT tasks or we were boosted to RT priority, * keep the priority unchanged. Otherwise, update priority * to the normal priority: */ if (!rt_prio(p->prio)) return p->normal_prio; return p->prio; } /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. 
*/
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

/*
 * Notify the scheduling classes after @p's class or priority may have
 * changed: a class change calls the old class's switched_from() (when
 * provided) and the new class's switched_to(); otherwise a priority change
 * calls prio_changed().
 */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio)
{
	if (prev_class != p->sched_class) {
		if (prev_class->switched_from)
			prev_class->switched_from(rq, p);
		p->sched_class->switched_to(rq, p);
	} else if (oldprio != p->prio)
		p->sched_class->prio_changed(rq, p, oldprio);
}

/*
 * Decide whether @p should preempt the task currently running on @rq.
 * Within one class the decision is delegated to that class; across classes
 * the current task is rescheduled when @p's class comes first in
 * for_each_class() order.
 */
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class == rq->curr->sched_class) {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	} else {
		for_each_class(class) {
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_task(rq->curr);
				break;
			}
		}
	}

	/*
	 * A queue event has occurred, and we're going to schedule.  In
	 * this case, we can save a useless back to back clock update.
	 */
	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
		rq->skip_clock_update = 1;
}

static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);

/* Add @n to the chain that set_task_cpu() calls on every migration. */
void register_task_migration_notifier(struct notifier_block *n)
{
	atomic_notifier_chain_register(&task_migration_notifier, n);
}

#ifdef CONFIG_SMP
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	/*
	 * We should never call set_task_cpu() on a blocked task,
	 * ttwu() will sort out the placement.
	 */
	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));

#ifdef CONFIG_LOCKDEP
	/*
	 * The caller should hold either p->pi_lock or rq->lock, when changing
	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
	 *
	 * sched_move_task() holds both and thus holding either pins the cgroup,
	 * see task_group().
	 *
	 * Furthermore, all task_rq users should acquire both locks, see
	 * task_rq_lock().
*/
	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
				      lockdep_is_held(&task_rq(p)->lock)));
#endif
#endif

	trace_sched_migrate_task(p, new_cpu);

	/* only a real CPU change counts as a migration */
	if (task_cpu(p) != new_cpu) {
		struct task_migration_notifier tmn;

		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p, new_cpu);
		p->se.nr_migrations++;
		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);

		tmn.task = p;
		tmn.from_cpu = task_cpu(p);
		tmn.to_cpu = new_cpu;

		atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
	}

	__set_task_cpu(p, new_cpu);
}

struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};

static int migration_cpu_stop(void *data);

/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change.  If it changes, i.e. @p might have woken up,
 * then return zero.  When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count).  If a second call
 * a short while later returns the same number, the caller can be sure that
 * @p has remained unscheduled the whole time.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
	unsigned long flags;
	int running, on_rq;
	unsigned long ncsw;
	struct rq *rq;

	for (;;) {
		/*
		 * We do the initial early heuristics without holding
		 * any task-queue locks at all. We'll only try to get
		 * the runqueue lock when things look like they will
		 * work out!
		 */
		rq = task_rq(p);

		/*
		 * If the task is actively running on another CPU
		 * still, just relax and busy-wait without holding
		 * any locks.
		 *
		 * NOTE!
Since we don't hold any locks, it's not
		 * even sure that "rq" stays as the right runqueue!
		 * But we don't care, since "task_running()" will
		 * return false if the runqueue has changed and p
		 * is actually now running somewhere else!
		 */
		while (task_running(rq, p)) {
			if (match_state && unlikely(p->state != match_state))
				return 0;
			cpu_relax();
		}

		/*
		 * Ok, time to look more closely! We need the rq
		 * lock now, to be *sure*. If we're wrong, we'll
		 * just go back and repeat.
		 */
		rq = task_rq_lock(p, &flags);
		trace_sched_wait_task(p);
		running = task_running(rq, p);
		on_rq = p->on_rq;
		/* ncsw stays 0 only when the state no longer matches */
		ncsw = 0;
		if (!match_state || p->state == match_state)
			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
		task_rq_unlock(rq, p, &flags);

		/*
		 * If it changed from the expected state, bail out now.
		 */
		if (unlikely(!ncsw))
			break;

		/*
		 * Was it really running after all now that we
		 * checked with the proper locks actually held?
		 *
		 * Oops. Go back and try again..
		 */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/*
		 * It's not enough that it's not actively running,
		 * it must be off the runqueue _entirely_, and not
		 * preempted!
		 *
		 * So if it was still runnable (but just not actively
		 * running right now), it's preempted, and we should
		 * yield - it could be a while.
		 */
		if (unlikely(on_rq)) {
			/* sleep for one tick (NSEC_PER_SEC/HZ ns) */
			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);

			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
			continue;
		}

		/*
		 * Ahh, all good. It wasn't running, and it wasn't
		 * runnable, which means that it will never become
		 * running in the future either. We're all done!
		 */
		break;
	}

	return ncsw;
}

/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel.
If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	/* only a task that is current on a different CPU needs the IPI */
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
#endif /* CONFIG_SMP */

#ifdef CONFIG_SMP
/*
 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
 *
 * Pick a CPU for @p when @cpu cannot be used: first an allowed, online and
 * active CPU on @cpu's node, then any allowed, online and active CPU.  If
 * there is none, the allowed mask is widened step by step (cpuset fallback,
 * then cpu_possible_mask) and the search is repeated.
 */
static int select_fallback_rq(int cpu, struct task_struct *p)
{
	int nid = cpu_to_node(cpu);
	const struct cpumask *nodemask = NULL;
	enum { cpuset, possible, fail } state = cpuset;
	int dest_cpu;

	/*
	 * If the node that the cpu is on has been offlined, cpu_to_node()
	 * will return -1. There is no cpu on the node, and we should
	 * select the cpu on the other node.
	 */
	if (nid != -1) {
		nodemask = cpumask_of_node(nid);

		/* Look for allowed, online CPU in same node. */
		for_each_cpu(dest_cpu, nodemask) {
			if (!cpu_online(dest_cpu))
				continue;
			if (!cpu_active(dest_cpu))
				continue;
			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
				return dest_cpu;
		}
	}

	for (;;) {
		/* Any allowed, online CPU? */
		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
			if (!cpu_online(dest_cpu))
				continue;
			if (!cpu_active(dest_cpu))
				continue;
			goto out;
		}

		/* nothing usable: widen the allowed mask and search again */
		switch (state) {
		case cpuset:
			/* No more Mr. Nice Guy. */
			cpuset_cpus_allowed_fallback(p);
			state = possible;
			break;

		case possible:
			do_set_cpus_allowed(p, cpu_possible_mask);
			state = fail;
			break;

		case fail:
			BUG();
			break;
		}
	}

out:
	/* state != cpuset means the task's affinity had to be overridden */
	if (state != cpuset) {
		/*
		 * Don't tell them about moving exiting tasks or
		 * kernel threads (both mm NULL), since they never
		 * leave kernel.
		 */
		if (p->mm && printk_ratelimit()) {
			printk_sched("process %d (%s) no longer affine to cpu%d\n",
					task_pid_nr(p), p->comm, cpu);
		}
	}

	return dest_cpu;
}

/*
 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
{
	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);

	/*
	 * In order not to call set_task_cpu() on a blocking task we need
	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
	 * cpu.
	 *
	 * Since this is common to all placement strategies, this lives here.
	 *
	 * [ this allows ->select_task() to simply return task_cpu(p) and
	 *   not worry about this generic constraint ]
	 */
	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
		     !cpu_online(cpu)))
		cpu = select_fallback_rq(task_cpu(p), p);

	return cpu;
}

/* Move *@avg one eighth of the way towards @sample. */
static void update_avg(u64 *avg, u64 sample)
{
	s64 diff = sample - *avg;
	*avg += diff >> 3;
}
#endif

/*
 * Wakeup statistics; compiles to an empty function without
 * CONFIG_SCHEDSTATS.  @cpu is the CPU @p was woken on.
 */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
#ifdef CONFIG_SCHEDSTATS
	struct rq *rq = this_rq();

#ifdef CONFIG_SMP
	int this_cpu = smp_processor_id();

	if (cpu == this_cpu) {
		schedstat_inc(rq, ttwu_local);
		schedstat_inc(p, se.statistics.nr_wakeups_local);
	} else {
		struct sched_domain *sd;

		schedstat_inc(p, se.statistics.nr_wakeups_remote);
		rcu_read_lock();
		/* charge the first domain of this CPU that spans @cpu */
		for_each_domain(this_cpu, sd) {
			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
				schedstat_inc(sd, ttwu_wake_remote);
				break;
			}
		}
		rcu_read_unlock();
	}

	if (wake_flags & WF_MIGRATED)
		schedstat_inc(p, se.statistics.nr_wakeups_migrate);

#endif /* CONFIG_SMP */

	schedstat_inc(rq, ttwu_count);
	schedstat_inc(p, se.statistics.nr_wakeups);

	if (wake_flags & WF_SYNC)
		schedstat_inc(p, se.statistics.nr_wakeups_sync);

#endif /* CONFIG_SCHEDSTATS */
}

/* Enqueue @p on @rq and mark it as being on a runqueue. */
static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
	activate_task(rq, p, en_flags);
	p->on_rq = 1;

	/* if a worker is waking up, notify workqueue */
	if (p->flags & PF_WQ_WORKER)
		wq_worker_waking_up(p, cpu_of(rq));
}

/*
 * Mark the task runnable and perform wakeup-preemption.
*/
static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
	trace_sched_wakeup(p, true);
	check_preempt_curr(rq, p, wake_flags);

	p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken)
		p->sched_class->task_woken(rq, p);

	/* a pending idle stamp means this wakeup ends an idle period */
	if (rq->idle_stamp) {
		u64 delta = rq->clock - rq->idle_stamp;
		u64 max = 2*sysctl_sched_migration_cost;

		/* clamp rather than average in very long idle periods */
		if (delta > max)
			rq->avg_idle = max;
		else
			update_avg(&rq->avg_idle, delta);
		rq->idle_stamp = 0;
	}
#endif
}

/* Full wakeup of a task that is off the runqueue: enqueue, then wake. */
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
#ifdef CONFIG_SMP
	if (p->sched_contributes_to_load)
		rq->nr_uninterruptible--;
#endif

	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
	ttwu_do_wakeup(rq, p, wake_flags);
}

/*
 * Called in case the task @p isn't fully descheduled from its runqueue,
 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
 * since all we need to do is flip p->state to TASK_RUNNING, since
 * the task is still ->on_rq.
 *
 * Returns 1 when @p was still on its runqueue and has been woken,
 * 0 otherwise.
 */
static int ttwu_remote(struct task_struct *p, int wake_flags)
{
	struct rq *rq;
	int ret = 0;

	rq = __task_rq_lock(p);
	if (p->on_rq) {
		ttwu_do_wakeup(rq, p, wake_flags);
		ret = 1;
	}
	__task_rq_unlock(rq);

	return ret;
}

#ifdef CONFIG_SMP
/* Activate every task queued on this runqueue's wake_list. */
static void sched_ttwu_pending(void)
{
	struct rq *rq = this_rq();
	struct llist_node *llist = llist_del_all(&rq->wake_list);
	struct task_struct *p;

	raw_spin_lock(&rq->lock);

	while (llist) {
		p = llist_entry(llist, struct task_struct, wake_entry);
		/* fetch the successor before @p is activated */
		llist = llist_next(llist);
		ttwu_do_activate(rq, p, 0);
	}

	raw_spin_unlock(&rq->lock);
}

void scheduler_ipi(void)
{
	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
		return;

	/*
	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
	 * traditionally all their work was done from the interrupt return
	 * path. Now that we actually do some work, we need to make sure
	 * we do call them.
	 *
	 * Some archs already do call them, luckily irq_enter/exit nest
	 * properly.
*
	 * Arguably we should visit all archs and update all handlers,
	 * however a fair share of IPIs are still resched only so this would
	 * somewhat pessimize the simple resched case.
	 */
	irq_enter();
	sched_ttwu_pending();

	/*
	 * Check if someone kicked us for doing the nohz idle load balance.
	 */
	if (unlikely(got_nohz_idle_kick() && !need_resched())) {
		this_rq()->idle_balance = 1;
		raise_softirq_irqoff(SCHED_SOFTIRQ);
	}
	irq_exit();
}

/*
 * Queue @p on @cpu's wake_list; the IPI is sent only when llist_add()
 * returns true.
 */
static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
		smp_send_reschedule(cpu);
}

/* True when both CPUs carry the same per-cpu sd_llc_id. */
bool cpus_share_cache(int this_cpu, int that_cpu)
{
	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
#endif /* CONFIG_SMP */

/*
 * Make @p runnable on @cpu: either hand it to the remote CPU through its
 * wake_list (TTWU_QUEUE feature, CPUs not sharing a cache) or activate it
 * directly under that runqueue's lock.
 */
static void ttwu_queue(struct task_struct *p, int cpu)
{
	struct rq *rq = cpu_rq(cpu);

#if defined(CONFIG_SMP)
	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
		sched_clock_cpu(cpu); /* sync clocks x-cpu */
		ttwu_queue_remote(p, cpu);
		return;
	}
#endif

	raw_spin_lock(&rq->lock);
	ttwu_do_activate(rq, p, 0);
	raw_spin_unlock(&rq->lock);
}

/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Returns %true if @p was woken up, %false if it was already running
 * or @state didn't match @p's state.
*/ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { unsigned long flags; int cpu, success = 0; smp_wmb(); raw_spin_lock_irqsave(&p->pi_lock, flags); if (!(p->state & state)) goto out; success = 1; /* we're going to change ->state */ cpu = task_cpu(p); if (p->on_rq && ttwu_remote(p, wake_flags)) goto stat; #ifdef CONFIG_SMP /* * If the owning (remote) cpu is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. */ while (p->on_cpu) cpu_relax(); /* * Pairs with the smp_wmb() in finish_lock_switch(). */ smp_rmb(); p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; if (p->sched_class->task_waking) p->sched_class->task_waking(p); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); } #endif /* CONFIG_SMP */ ttwu_queue(p, cpu); stat: ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); return success; } /** * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened * * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. */ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); BUG_ON(rq != this_rq()); BUG_ON(p == current); lockdep_assert_held(&rq->lock); if (!raw_spin_trylock(&p->pi_lock)) { raw_spin_unlock(&rq->lock); raw_spin_lock(&p->pi_lock); raw_spin_lock(&rq->lock); } if (!(p->state & TASK_NORMAL)) goto out; if (!p->on_rq) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0); ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } /** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * * Attempt to wake up the nominated process and move it to the set of runnable * processes. 
Returns 1 if the process was woken up, 0 if it was already
 * running.
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */
int wake_up_process(struct task_struct *p)
{
	WARN_ON(task_is_stopped_or_traced(p));
	return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);

/*
 * Like wake_up_process(), but the caller supplies the @state mask handed
 * to try_to_wake_up() and no stopped/traced warning is issued.
 */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}

/*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
 *
 * __sched_fork() is basic setup used by init_idle() too:
 */
static void __sched_fork(struct task_struct *p)
{
	p->on_rq			= 0;

	/* Reset the fair-class entity bookkeeping. */
	p->se.on_rq			= 0;
	p->se.exec_start		= 0;
	p->se.sum_exec_runtime		= 0;
	p->se.prev_sum_exec_runtime	= 0;
	p->se.nr_migrations		= 0;
	p->se.vruntime			= 0;
	INIT_LIST_HEAD(&p->se.group_node);

/*
 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
 * removed when useful for applications beyond shares distribution (e.g.
 * load-balance).
 */
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
	p->se.avg.runnable_avg_period = 0;
	p->se.avg.runnable_avg_sum = 0;
#endif
#ifdef CONFIG_SCHEDSTATS
	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif

	INIT_LIST_HEAD(&p->rt.run_list);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif

#ifdef CONFIG_NUMA_BALANCING
	/* Only reset the mm-wide scan state when @p is the mm's sole user. */
	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
		p->mm->numa_next_scan = jiffies;
		p->mm->numa_next_reset = jiffies;
		p->mm->numa_scan_seq = 0;
	}

	p->node_stamp = 0ULL;
	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
	p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
	p->numa_work.next = &p->numa_work;
#endif /* CONFIG_NUMA_BALANCING */
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * set_numabalancing_state - switch NUMA balancing on or off.
 *
 * With CONFIG_SCHED_DEBUG the "NUMA" / "NO_NUMA" scheduler feature is set;
 * otherwise the numabalancing_enabled flag is written directly.
 */
#ifdef CONFIG_SCHED_DEBUG
void set_numabalancing_state(bool enabled)
{
	if (enabled)
		sched_feat_set("NUMA");
	else
		sched_feat_set("NO_NUMA");
}
#else
__read_mostly bool numabalancing_enabled;

void set_numabalancing_state(bool enabled)
{
	numabalancing_enabled = enabled;
}
#endif /* CONFIG_SCHED_DEBUG */
#endif /* CONFIG_NUMA_BALANCING */

/*
 * fork()/clone()-time setup:
 */
void sched_fork(struct task_struct *p)
{
	unsigned long flags;
	/* Paired with the put_cpu() at the end of this function. */
	int cpu = get_cpu();

	__sched_fork(p);
	/*
	 * We mark the process as running here. This guarantees that
	 * nobody will actually run it, and a signal or other external
	 * event cannot wake it up and insert it on the runqueue either.
	 */
	p->state = TASK_RUNNING;

	/*
	 * Make sure we do not leak PI boosting priority to the child.
	 */
	p->prio = current->normal_prio;

	/*
	 * Revert to default priority/policy on fork if requested.
	 */
	if (unlikely(p->sched_reset_on_fork)) {
		if (task_has_rt_policy(p)) {
			p->policy = SCHED_NORMAL;
			p->static_prio = NICE_TO_PRIO(0);
			p->rt_priority = 0;
		} else if (PRIO_TO_NICE(p->static_prio) < 0)
			p->static_prio = NICE_TO_PRIO(0);

		p->prio = p->normal_prio = __normal_prio(p);
		set_load_weight(p);

		/*
		 * We don't need the reset flag anymore after the fork. It has
		 * fulfilled its duty:
		 */
		p->sched_reset_on_fork = 0;
	}

	/* Non-RT priorities are handled by the fair class. */
	if (!rt_prio(p->prio))
		p->sched_class = &fair_sched_class;

	if (p->sched_class->task_fork)
		p->sched_class->task_fork(p);

	/*
	 * The child is not yet in the pid-hash so no cgroup attach races,
	 * and the cgroup is pinned to this child because cgroup_fork()
	 * is run before sched_fork().
	 *
	 * Silence PROVE_RCU.
	 */
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	set_task_cpu(p, cpu);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	if (likely(sched_info_on()))
		memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP)
	p->on_cpu = 0;
#endif
#ifdef CONFIG_PREEMPT_COUNT
	/* Want to start with kernel preemption disabled. */
	task_thread_info(p)->preempt_count = 1;
#endif
#ifdef CONFIG_SMP
	plist_node_init(&p->pushable_tasks, MAX_PRIO);
#endif

	put_cpu();
}

/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void wake_up_new_task(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;

	/* pi_lock (and the saved IRQ flags) are released by task_rq_unlock(). */
	raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
	/*
	 * Fork balancing, do it here and not earlier because:
	 *  - cpus_allowed can change in the fork path
	 *  - any previously selected cpu might disappear through hotplug
	 */
	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
#endif

	rq = __task_rq_lock(p);
	activate_task(rq, p, 0);
	p->on_rq = 1;
	trace_sched_wakeup_new(p, true);
	check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken)
		p->sched_class->task_woken(rq, p);
#endif
	task_rq_unlock(rq, p, &flags);
}

#ifdef CONFIG_PREEMPT_NOTIFIERS

/**
 * preempt_notifier_register - tell me when current is being preempted & rescheduled
 * @notifier: notifier struct to register
 */
void preempt_notifier_register(struct preempt_notifier *notifier)
{
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);

/**
 * preempt_notifier_unregister - no longer interested in preemption notifications
 * @notifier: notifier struct to unregister
 *
 * This is safe to call from within a preemption notifier.
*/
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
	hlist_del(&notifier->link);
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);

/*
 * Call the sched_in callback of every notifier on @curr's
 * preempt_notifiers list, passing the current processor id.
 */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	struct preempt_notifier *notifier;

	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
		notifier->ops->sched_in(notifier, raw_smp_processor_id());
}

/*
 * Call the sched_out callback of every notifier on @curr's
 * preempt_notifiers list, passing the task being switched to (@next).
 */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	struct preempt_notifier *notifier;

	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
		notifier->ops->sched_out(notifier, next);
}

#else /* !CONFIG_PREEMPT_NOTIFIERS */

/* Empty stubs used when preempt notifiers are configured out. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}

static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}

#endif /* CONFIG_PREEMPT_NOTIFIERS */

/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	/* Tracing, accounting and notifier hooks run first ... */
	trace_sched_switch(prev, next);
	sched_info_switch(prev, next);
	perf_event_task_sched_out(prev, next);
	fire_sched_out_preempt_notifiers(prev, next);
	/* ... then the lock and architecture preparation. */
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}

/**
 * finish_task_switch - clean up after a task-switch
 * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
* finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 */
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
	__releases(rq->lock)
{
	/* Take over the mm reference that context_switch() parked in rq->prev_mm. */
	struct mm_struct *mm = rq->prev_mm;
	long prev_state;

	rq->prev_mm = NULL;

	/*
	 * A task struct has one reference for the use as "current".
	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
	 * schedule one last time. The schedule call will never return, and
	 * the scheduled task must drop that reference.
	 * The test for TASK_DEAD must occur while the runqueue locks are
	 * still held, otherwise prev could be scheduled on another cpu, die
	 * there before we look at prev->state, and then the reference would
	 * be dropped twice.
	 *		Manfred Spraul <manfred@colorfullife.com>
	 */
	prev_state = prev->state;
	vtime_task_switch(prev);
	finish_arch_switch(prev);
	perf_event_task_sched_in(prev, current);
	finish_lock_switch(rq, prev);
	finish_arch_post_lock_switch();

	fire_sched_in_preempt_notifiers(current);
	if (mm)
		mmdrop(mm);
	if (unlikely(prev_state == TASK_DEAD)) {
		/*
		 * Remove function-return probe instances associated with this
		 * task and put them back on the free list.
		 */
		kprobe_flush_task(prev);
		put_task_struct(prev);
	}
}

#ifdef CONFIG_SMP

/* assumes rq->lock is held */
static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
{
	if (prev->sched_class->pre_schedule)
		prev->sched_class->pre_schedule(rq, prev);
}

/* rq->lock is NOT held, but preemption is disabled */
static inline void post_schedule(struct rq *rq)
{
	if (rq->post_schedule) {
		unsigned long flags;

		/* The class hook runs under rq->lock; the flag is cleared after unlock. */
		raw_spin_lock_irqsave(&rq->lock, flags);
		if (rq->curr->sched_class->post_schedule)
			rq->curr->sched_class->post_schedule(rq);
		raw_spin_unlock_irqrestore(&rq->lock, flags);

		rq->post_schedule = 0;
	}
}

#else

/* !CONFIG_SMP: both hooks are empty. */
static inline void pre_schedule(struct rq *rq, struct task_struct *p)
{
}

static inline void post_schedule(struct rq *rq)
{
}

#endif

/**
 * schedule_tail - first thing a freshly forked thread must call.
 * @prev: the thread we just switched away from.
 */
asmlinkage void schedule_tail(struct task_struct *prev)
	__releases(rq->lock)
{
	struct rq *rq = this_rq();

	finish_task_switch(rq, prev);

	/*
	 * FIXME: do we need to worry about rq being invalidated by the
	 * task_switch?
	 */
	post_schedule(rq);

#ifdef __ARCH_WANT_UNLOCKED_CTXSW
	/* In this case, finish_task_switch does not reenable preemption */
	preempt_enable();
#endif
	/* Store the task's pid (task_pid_vnr) at set_child_tid when one was set. */
	if (current->set_child_tid)
		put_user(task_pid_vnr(current), current->set_child_tid);
}

/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);

	mm = next->mm;
	oldmm = prev->active_mm;
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_start_context_switch(prev);

	/* @next has no mm of its own: borrow oldmm and take a reference on it. */
	if (!mm) {
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	/* @prev was borrowing oldmm: park it for finish_task_switch() to drop. */
	if (!prev->mm) {
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}
	/*
	 * Since the runqueue lock will be released by the next
	 * task (which is an invalid locking op but in the case
	 * of the scheduler it's an obvious special-case), so we
	 * do an early lockdep release here:
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	context_tracking_task_switch(prev, next);
	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);

	barrier();
	/*
	 * this_rq must be evaluated again because prev may have moved
	 * CPUs since it called schedule(), thus the 'rq' on its stack
	 * frame will be invalid.
	 */
	finish_task_switch(this_rq(), prev);
}

/*
 * nr_running and nr_context_switches:
 *
 * externally visible scheduler statistics: current number of runnable
 * threads, total number of context switches performed since bootup.
 */
/* Sum of rq->nr_running over the online CPUs. */
unsigned long nr_running(void)
{
	unsigned long i, sum = 0;

	for_each_online_cpu(i)
		sum += cpu_rq(i)->nr_running;

	return sum;
}

/* Sum of rq->nr_switches over all possible CPUs. */
unsigned long long nr_context_switches(void)
{
	int i;
	unsigned long long sum = 0;

	for_each_possible_cpu(i)
		sum += cpu_rq(i)->nr_switches;

	return sum;
}

/* Sum of rq->nr_iowait over all possible CPUs. */
unsigned long nr_iowait(void)
{
	unsigned long i, sum = 0;

	for_each_possible_cpu(i)
		sum += atomic_read(&cpu_rq(i)->nr_iowait);

	return sum;
}

/* rq->nr_iowait of the runqueue belonging to @cpu. */
unsigned long nr_iowait_cpu(int cpu)
{
	struct rq *this = cpu_rq(cpu);
	return atomic_read(&this->nr_iowait);
}

/* cpu_load[0] of the calling CPU's runqueue. */
unsigned long this_cpu_load(void)
{
	struct rq *this = this_rq();
	return this->cpu_load[0];
}


/*
 * Global load-average calculations
 *
 * We take a distributed and async approach to calculating the global load-avg
 * in order to minimize overhead.
 *
 * The global load average is an exponentially decaying average of nr_running +
 * nr_uninterruptible.
 *
 * Once every LOAD_FREQ:
 *
 *   nr_active = 0;
 *   for_each_possible_cpu(cpu)
 *	nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
 *
 *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
 *
 * Due to a number of reasons the above turns in the mess below:
 *
 *  - for_each_possible_cpu() is prohibitively expensive on machines with
 *    serious number of cpus, therefore we need to take a distributed approach
 *    to calculating nr_active.
 *
 *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
 *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
 *
 *    So assuming nr_active := 0 when we start out -- true per definition, we
 *    can simply take per-cpu deltas and fold those into a global accumulate
 *    to obtain the same result. See calc_load_fold_active().
 *
 *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
 *    across the machine, we assume 10 ticks is sufficient time for every
 *    cpu to have completed this task.
 *
 *    This places an upper-bound on the IRQ-off latency of the machine. Then
 *    again, being late doesn't lose the delta, just wrecks the sample.
 *
 *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
 *    this would add another cross-cpu cacheline miss and atomic operation
 *    to the wakeup path. Instead we increment on whatever cpu the task ran
 *    when it went into uninterruptible state and decrement on whatever cpu
 *    did the wakeup. This means that only the sum of nr_uninterruptible over
 *    all cpus yields the correct result.
 *
 *  This covers the NO_HZ=n code, for extra head-aches, see the comment below.
 */

/* Variables and functions for calc_load */
static atomic_long_t calc_load_tasks;
static unsigned long calc_load_update;
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun); /* should be removed */

/**
 * get_avenrun - get the load average array
 * @loads:	pointer to dest load array
 * @offset:	offset to add
 * @shift:	shift count to shift the result left
 *
 * These values are estimates at best, so no need for locking.
*/
void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
{
	loads[0] = (avenrun[0] + offset) << shift;
	loads[1] = (avenrun[1] + offset) << shift;
	loads[2] = (avenrun[2] + offset) << shift;
}

/*
 * Returns how much (nr_running + nr_uninterruptible) of @this_rq changed
 * since the value last stored in calc_load_active, and stores the new
 * value there. Returns 0 when nothing changed.
 */
static long calc_load_fold_active(struct rq *this_rq)
{
	long nr_active, delta = 0;

	nr_active = this_rq->nr_running;
	nr_active += (long) this_rq->nr_uninterruptible;

	if (nr_active != this_rq->calc_load_active) {
		delta = nr_active - this_rq->calc_load_active;
		this_rq->calc_load_active = nr_active;
	}

	return delta;
}

/*
 * a1 = a0 * e + a * (1 - e)
 *
 * Fixed-point with FSHIFT fractional bits; half of the last place is added
 * before the final shift.
 */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);
	return load >> FSHIFT;
}

#ifdef CONFIG_NO_HZ
/*
 * Handle NO_HZ for the global load-average.
 *
 * Since the above described distributed algorithm to compute the global
 * load-average relies on per-cpu sampling from the tick, it is affected by
 * NO_HZ.
 *
 * The basic idea is to fold the nr_active delta into a global idle-delta upon
 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
 * when we read the global state.
 *
 * Obviously reality has to ruin such a delightfully simple scheme:
 *
 *  - When we go NO_HZ idle during the window, we can negate our sample
 *    contribution, causing under-accounting.
 *
 *    We avoid this by keeping two idle-delta counters and flipping them
 *    when the window starts, thus separating old and new NO_HZ load.
 *
 *    The only trick is the slight shift in index flip for read vs write.
 *
 *        0s            5s            10s           15s
 *          +10           +10           +10           +10
 *        |-|-----------|-|-----------|-|-----------|-|
 *    r:0 0 1           1 0           0 1           1 0
 *    w:0 1 1           0 0           1 1           0 0
 *
 *    This ensures we'll fold the old idle contribution in this window while
 *    accumulating the new one.
 *
 *  - When we wake up from NO_HZ idle during the window, we push up our
 *    contribution, since we effectively move our sample point to a known
 *    busy state.
 *
 *    This is solved by pushing the window forward, and thus skipping the
 *    sample, for this cpu (effectively using the idle-delta for this cpu which
 *    was in effect at the time the window opened). This also solves the issue
 *    of having to deal with a cpu having been in NOHZ idle for multiple
 *    LOAD_FREQ intervals.
 *
 * When making the ILB scale, we should try to pull this in as well.
 */
static atomic_long_t calc_load_idle[2];
static int calc_load_idx;

/* Index (0 or 1) of the calc_load_idle[] slot that writers should add to. */
static inline int calc_load_write_idx(void)
{
	int idx = calc_load_idx;

	/*
	 * See calc_global_nohz(), if we observe the new index, we also
	 * need to observe the new update time.
	 */
	smp_rmb();

	/*
	 * If the folding window started, make sure we start writing in the
	 * next idle-delta.
	 */
	if (!time_before(jiffies, calc_load_update))
		idx++;

	return idx & 1;
}

/* Index (0 or 1) of the calc_load_idle[] slot that the reader folds. */
static inline int calc_load_read_idx(void)
{
	return calc_load_idx & 1;
}

void calc_load_enter_idle(void)
{
	struct rq *this_rq = this_rq();
	long delta;

	/*
	 * We're going into NOHZ mode, if there's any pending delta, fold it
	 * into the pending idle delta.
	 */
	delta = calc_load_fold_active(this_rq);
	if (delta) {
		int idx = calc_load_write_idx();
		atomic_long_add(delta, &calc_load_idle[idx]);
	}
}

void calc_load_exit_idle(void)
{
	struct rq *this_rq = this_rq();

	/*
	 * If we're still before the sample window, we're done.
	 */
	if (time_before(jiffies, this_rq->calc_load_update))
		return;

	/*
	 * We woke inside or after the sample window, this means we're already
	 * accounted through the nohz accounting, so skip the entire deal and
	 * sync up for the next window.
	 */
	this_rq->calc_load_update = calc_load_update;
	if (time_before(jiffies, this_rq->calc_load_update + 10))
		this_rq->calc_load_update += LOAD_FREQ;
}

/*
 * Take and reset the idle delta of the current read slot.
 * Returns the value taken, or 0 when the slot was already 0.
 */
static long calc_load_fold_idle(void)
{
	int idx = calc_load_read_idx();
	long delta = 0;

	if (atomic_long_read(&calc_load_idle[idx]))
		delta = atomic_long_xchg(&calc_load_idle[idx], 0);

	return delta;
}

/**
 * fixed_power_int - compute: x^n, in O(log n) time
 *
 * @x:         base of the power
 * @frac_bits: fractional bits of @x
 * @n:         power to raise @x to.
 *
 * By exploiting the relation between the definition of the natural power
 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
 * (where: n_i \elem {0, 1}, the binary vector representing n),
 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
 * of course trivially computable in O(log_2 n), the length of our binary
 * vector.
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	/* 1.0 in fixed point; returned unchanged when @n is 0. */
	unsigned long result = 1UL << frac_bits;

	if (n) for (;;) {
		/* Multiply in the current square when this bit of n is set. */
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		/* Square x for the next bit. */
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}

	return result;
}

/*
 * a1 = a0 * e + a * (1 - e)
 *
 * a2 = a1 * e + a * (1 - e)
 *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
 *    = a0 * e^2 + a * (1 - e) * (1 + e)
 *
 * a3 = a2 * e + a * (1 - e)
 *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
 *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
 *
 *  ...
 *
 * an = a0 * e^n + a * (1 - e) * (1 + e + ...
+ e^n-1) [1]
 *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
 *    = a0 * e^n + a * (1 - e^n)
 *
 * [1] application of the geometric series:
 *
 *              n         1 - x^(n+1)
 *     S_n := \Sum x^i = -------------
 *             i=0          1 - x
 */
static unsigned long
calc_load_n(unsigned long load, unsigned long exp,
	    unsigned long active, unsigned int n)
{
	/* One calc_load() step using exp^n as the decay factor. */
	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

/*
 * NO_HZ can leave us missing all per-cpu ticks calling
 * calc_load_account_active(), but since an idle CPU folds its delta into
 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
 * in the pending idle delta if our idle period crossed a load cycle boundary.
 *
 * Once we've updated the global active value, we need to apply the exponential
 * weights adjusted to the number of cycles missed.
 *
 * NOTE(review): calc_load_tasks_idle and calc_load_account_idle() do not
 * appear in the code visible here; these names look stale -- confirm.
 */
static void calc_global_nohz(void)
{
	long delta, active, n;

	if (!time_before(jiffies, calc_load_update + 10)) {
		/*
		 * Catch-up, fold however many we are behind still
		 */
		delta = jiffies - calc_load_update - 10;
		n = 1 + (delta / LOAD_FREQ);

		active = atomic_long_read(&calc_load_tasks);
		active = active > 0 ? active * FIXED_1 : 0;

		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);

		calc_load_update += n * LOAD_FREQ;
	}

	/*
	 * Flip the idle index...
	 *
	 * Make sure we first write the new time then flip the index, so that
	 * calc_load_write_idx() will see the new time when it reads the new
	 * index, this avoids a double flip messing things up.
	 */
	smp_wmb();
	calc_load_idx++;
}
#else /* !CONFIG_NO_HZ */

/* Without NO_HZ there is no idle delta to fold and nothing to catch up on. */
static inline long calc_load_fold_idle(void) { return 0; }
static inline void calc_global_nohz(void) { }

#endif /* CONFIG_NO_HZ */

/*
 * calc_load - update the avenrun load estimates 10 ticks after the
 * CPUs have updated calc_load_tasks.
*/
void calc_global_load(unsigned long ticks)
{
	long active, delta;

	/* Nothing to do until 10 ticks past the update time. */
	if (time_before(jiffies, calc_load_update + 10))
		return;

	/*
	 * Fold the 'old' idle-delta to include all NO_HZ cpus.
	 */
	delta = calc_load_fold_idle();
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);

	/* A non-positive task count is treated as 0; otherwise scale to fixed point. */
	active = atomic_long_read(&calc_load_tasks);
	active = active > 0 ? active * FIXED_1 : 0;

	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
	avenrun[2] = calc_load(avenrun[2], EXP_15, active);

	calc_load_update += LOAD_FREQ;

	/*
	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
	 */
	calc_global_nohz();
}

/*
 * Called from update_cpu_load() to periodically update this CPU's
 * active count.
 */
static void calc_load_account_active(struct rq *this_rq)
{
	long delta;

	/* This runqueue's own update time has not been reached yet. */
	if (time_before(jiffies, this_rq->calc_load_update))
		return;

	delta  = calc_load_fold_active(this_rq);
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);

	this_rq->calc_load_update += LOAD_FREQ;
}

/*
 * End of global load-average stuff
 */

/*
 * The exact cpuload at various idx values, calculated at every tick would be
 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
 *
 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
 * on nth tick when cpu may be busy, then we have:
 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
 *
 * decay_load_missed() below does efficient calculation of
 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
 *
 * The calculation is approximated on a 128 point scale.
 * degrade_zero_ticks is the number of ticks after which load at any
 * particular idx is approximated to be zero.
 * degrade_factor is a precomputed table, a row for each load idx.
 * Each column corresponds to degradation factor for a power of two ticks,
 * based on 128 point scale.
* Example: * row 2, col 3 (=12) says that the degradation at load idx 2 after * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). * * With this power of 2 load factors, we can degrade the load n times * by looking at 1 bits in n and doing as many mult/shift instead of * n mult/shifts needed by the exact degradation. */ #define DEGRADE_SHIFT 7 static const unsigned char degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; static const unsigned char degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { {0, 0, 0, 0, 0, 0, 0, 0}, {64, 32, 8, 0, 0, 0, 0, 0}, {96, 72, 40, 12, 1, 0, 0}, {112, 98, 75, 43, 15, 1, 0}, {120, 112, 98, 76, 45, 16, 2} }; /* * Update cpu_load for any missed ticks, due to tickless idle. The backlog * would be when CPU is idle and so we just decay the old load without * adding any new load. */ static unsigned long decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) { int j = 0; if (!missed_updates) return load; if (missed_updates >= degrade_zero_ticks[idx]) return 0; if (idx == 1) return load >> missed_updates; while (missed_updates) { if (missed_updates % 2) load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; missed_updates >>= 1; j++; } return load; } /* * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). With tickless idle this will not be called * every tick. We fix it up based on jiffies. 
*/
/*
 * @this_rq:		runqueue whose cpu_load[] is updated
 * @this_load:		load value to average in (stored as-is in cpu_load[0])
 * @pending_updates:	ticks covered by this call; the first pending_updates - 1
 *			are applied as pure decay by decay_load_missed()
 */
static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
			      unsigned long pending_updates)
{
	int i, scale;

	this_rq->nr_load_updates++;

	/* Update our load: */
	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load, new_load;

		/* scale is effectively 1 << i now, and >> i divides by scale */

		old_load = this_rq->cpu_load[i];
		old_load = decay_load_missed(old_load, pending_updates - 1, i);
		new_load = this_load;
		/*
		 * Round up the averaging division if load is increasing. This
		 * prevents us from getting stuck on 9 if the load is 10, for
		 * example.
		 */
		if (new_load > old_load)
			new_load += scale - 1;

		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
	}

	sched_avg_update(this_rq);
}

#ifdef CONFIG_NO_HZ
/*
 * There is no sane way to deal with nohz on smp when using jiffies because the
 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
 *
 * Therefore we cannot use the delta approach from the regular tick since that
 * would seriously skew the load calculation. However we'll make do for those
 * updates happening while idle (nohz_idle_balance) or coming out of idle
 * (tick_nohz_idle_exit).
 *
 * This means we might still be one tick off for nohz periods.
 */

/*
 * Called from nohz_idle_balance() to update the load ratings before doing the
 * idle balance.
 */
void update_idle_cpu_load(struct rq *this_rq)
{
	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
	unsigned long load = this_rq->load.weight;
	unsigned long pending_updates;

	/*
	 * bail if there's load or we're actually up-to-date.
	 */
	if (load || curr_jiffies == this_rq->last_load_update_tick)
		return;

	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
	this_rq->last_load_update_tick = curr_jiffies;

	/* load is 0 here, see the bail-out above. */
	__update_cpu_load(this_rq, load, pending_updates);
}

/*
 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
 */
void update_cpu_load_nohz(void)
{
	struct rq *this_rq = this_rq();
	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
	unsigned long pending_updates;

	/* Unlocked early exit; the delta is computed again under rq->lock. */
	if (curr_jiffies == this_rq->last_load_update_tick)
		return;

	raw_spin_lock(&this_rq->lock);
	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
	if (pending_updates) {
		this_rq->last_load_update_tick = curr_jiffies;
		/*
		 * We were idle, this means load 0, the current load might be
		 * !0 due to remote wakeups and the sort.
		 */
		__update_cpu_load(this_rq, 0, pending_updates);
	}
	raw_spin_unlock(&this_rq->lock);
}
#endif /* CONFIG_NO_HZ */

/*
 * Called from scheduler_tick()
 */
static void update_cpu_load_active(struct rq *this_rq)
{
	/*
	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
	 */
	this_rq->last_load_update_tick = jiffies;
	/* A regular tick accounts for exactly one update. */
	__update_cpu_load(this_rq, this_rq->load.weight, 1);

	calc_load_account_active(this_rq);
}

#ifdef CONFIG_SMP

/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
*/
void sched_exec(void)
{
	struct task_struct *p = current;
	unsigned long flags;
	int dest_cpu;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
	/* Ask the task's scheduling class where it should run. */
	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
	if (dest_cpu == smp_processor_id())
		goto unlock;

	if (likely(cpu_active(dest_cpu))) {
		struct migration_arg arg = { p, dest_cpu };

		/* pi_lock is dropped before stop_one_cpu() is called. */
		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
		return;
	}
unlock:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}

#endif

DEFINE_PER_CPU(struct kernel_stat, kstat);
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);

EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);

/*
 * Return any ns on the sched_clock that have not yet been accounted in
 * @p in case that task is currently running.
 *
 * Called with task_rq_lock() held on @rq.
 */
static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
{
	u64 ns = 0;

	if (task_current(rq, p)) {
		update_rq_clock(rq);
		ns = rq->clock_task - p->se.exec_start;
		/* A negative difference is reported as 0. */
		if ((s64)ns < 0)
			ns = 0;
	}

	return ns;
}

/*
 * Locked wrapper around do_task_delta_exec(): takes task_rq_lock() on @p
 * for the duration of the call.
 */
unsigned long long task_delta_exec(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;
	u64 ns = 0;

	rq = task_rq_lock(p, &flags);
	ns = do_task_delta_exec(p, rq);
	task_rq_unlock(rq, p, &flags);

	return ns;
}

/*
 * Return accounted runtime for the task.
 * In case the task is currently running, return the runtime plus current's
 * pending runtime that have not been accounted yet.
 */
unsigned long long task_sched_runtime(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;
	u64 ns = 0;

	rq = task_rq_lock(p, &flags);
	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
	task_rq_unlock(rq, p, &flags);

	return ns;
}

/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
*/
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;

	sched_clock_tick();

	/* Clock, load and the class tick hook all run under rq->lock. */
	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	update_cpu_load_active(rq);
	curr->sched_class->task_tick(rq, curr, 0);
	raw_spin_unlock(&rq->lock);

	perf_event_task_tick();

#ifdef CONFIG_SMP
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq, cpu);
#endif
}

/*
 * If in_lock_functions(@addr) is true, substitute CALLER_ADDR2, and
 * CALLER_ADDR3 if that address is in the lock functions as well.
 */
notrace unsigned long get_parent_ip(unsigned long addr)
{
	if (in_lock_functions(addr)) {
		addr = CALLER_ADDR2;
		if (in_lock_functions(addr))
			addr = CALLER_ADDR3;
	}
	return addr;
}

#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
				defined(CONFIG_PREEMPT_TRACER))

/*
 * Add @val to preempt_count(). trace_preempt_off() fires when the count
 * equals @val after the addition.
 */
void __kprobes add_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?
	 */
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
		return;
#endif
	preempt_count() += val;
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Spinlock count overflowing soon?
	 */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				PREEMPT_MASK - 10);
#endif
	if (preempt_count() == val)
		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
EXPORT_SYMBOL(add_preempt_count);

/*
 * Subtract @val from preempt_count(). trace_preempt_on() fires when the
 * count equals @val before the subtraction.
 */
void __kprobes sub_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?
	 */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
		return;
	/*
	 * Is the spinlock portion underflowing?
	 */
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
			!(preempt_count() & PREEMPT_MASK)))
		return;
#endif

	if (preempt_count() == val)
		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
	preempt_count() -= val;
}
EXPORT_SYMBOL(sub_preempt_count);

#endif

/*
 * Print scheduling while atomic bug:
 */
static noinline void __schedule_bug(struct task_struct *prev)
{
	/* Stay quiet while an oops is already being reported. */
	if (oops_in_progress)
		return;

	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
		prev->comm, prev->pid, preempt_count());

	debug_show_held_locks(prev);
	print_modules();
	if (irqs_disabled())
		print_irqtrace_events(prev);
	dump_stack();
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}

/*
 * Various schedule()-time debugging checks and statistics:
 */
static inline void schedule_debug(struct task_struct *prev)
{
	/*
	 * Test if we are atomic. Since do_exit() needs to call into
	 * schedule() atomically, we ignore that path for now.
	 * Otherwise, whine if we are scheduling when we should not be.
	 */
	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
		__schedule_bug(prev);
	rcu_sleep_check();

	profile_hit(SCHED_PROFILING, __builtin_return_address(0));

	schedstat_inc(this_rq(), sched_count);
}

/*
 * Update the rq clock when @prev is still queued or skip_clock_update is
 * negative, then call the put_prev_task hook of @prev's scheduling class.
 */
static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
	if (prev->on_rq || rq->skip_clock_update < 0)
		update_rq_clock(rq);
	prev->sched_class->put_prev_task(rq, prev);
}

/*
 * Pick up the highest-prio task:
 */
static inline struct task_struct *
pick_next_task(struct rq *rq)
{
	const struct sched_class *class;
	struct task_struct *p;

	/*
	 * Optimization: we know that if all tasks are in
	 * the fair class we can call that function directly:
	 */
	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
		p = fair_sched_class.pick_next_task(rq);
		if (likely(p))
			return p;
	}

	/* Otherwise ask each class in for_each_class() order. */
	for_each_class(class) {
		p = class->pick_next_task(rq);
		if (p)
			return p;
	}

	BUG(); /* the idle class will always have a runnable task */
}

/*
 * __schedule() is the main scheduler function.
*
 * The main means of driving the scheduler and thus entering this function are:
 *
 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 *      paths. For example, see arch/x86/entry_64.S.
 *
 *      To drive preemption between tasks, the scheduler sets the flag in timer
 *      interrupt handler scheduler_tick().
 *
 *   3. Wakeups don't really cause entry into schedule(). They add a
 *      task to the run-queue and that's it.
 *
 *      Now, if the new task added to the run-queue preempts the current
 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
 *      called on the nearest possible occasion:
 *
 *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
 *
 *         - in syscall or exception context, at the next outmost
 *           preempt_enable(). (this might be as soon as the wake_up()'s
 *           spin_unlock()!)
 *
 *         - in IRQ context, return from interrupt-handler to
 *           preemptible context
 *
 *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 *         then at the next:
 *
 *          - cond_resched() call
 *          - explicit schedule() call
 *          - return from syscall or exception to user-space
 *          - return from interrupt-handler to user-space
 */
static void __sched __schedule(void)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq *rq;
	int cpu;

need_resched:
	preempt_disable();
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	rcu_note_context_switch(cpu);
	prev = rq->curr;

	schedule_debug(prev);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	raw_spin_lock_irq(&rq->lock);

	/*
	 * The switch is accounted in prev->nivcsw unless prev has a
	 * non-zero state and PREEMPT_ACTIVE is clear; in that case it is
	 * accounted in prev->nvcsw (see the end of the block below).
	 */
	switch_count = &prev->nivcsw;
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		if (unlikely(signal_pending_state(prev->state, prev))) {
			/* A relevant signal is pending: keep prev runnable. */
			prev->state = TASK_RUNNING;
		} else {
			deactivate_task(rq, prev, DEQUEUE_SLEEP);
			prev->on_rq = 0;

			/*
			 * If a worker went to sleep, notify and ask workqueue
			 * whether it wants to wake up a task to maintain
			 * concurrency.
			 */
			if (prev->flags & PF_WQ_WORKER) {
				struct task_struct *to_wakeup;

				to_wakeup = wq_worker_sleeping(prev, cpu);
				if (to_wakeup)
					try_to_wake_up_local(to_wakeup);
			}
		}
		switch_count = &prev->nvcsw;
	}

	pre_schedule(rq, prev);

	if (unlikely(!rq->nr_running))
		idle_balance(cpu, rq);

	put_prev_task(rq, prev);
	next = pick_next_task(rq);
	clear_tsk_need_resched(prev);
	rq->skip_clock_update = 0;

	if (likely(prev != next)) {
		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		context_switch(rq, prev, next); /* unlocks the rq */
		/*
		 * The context switch has flipped the stack from under us
		 * and restored the local variables which were saved when
		 * this task called schedule() in the past. prev == current
		 * is still correct, but it can be moved to another cpu/rq.
		 */
		cpu = smp_processor_id();
		rq = cpu_rq(cpu);
	} else
		raw_spin_unlock_irq(&rq->lock);

	post_schedule(rq);

	sched_preempt_enable_no_resched();
	/* Loop if a reschedule was requested again in the meantime. */
	if (need_resched())
		goto need_resched;
}

/*
 * Pre-schedule() housekeeping for @tsk. Nothing is done when tsk->state
 * is zero or the task is PI-blocked (tsk_is_pi_blocked()).
 */
static inline void sched_submit_work(struct task_struct *tsk)
{
	if (!tsk->state || tsk_is_pi_blocked(tsk))
		return;
	/*
	 * If we are going to sleep and we have plugged IO queued,
	 * make sure to submit it to avoid deadlocks.
	 */
	if (blk_needs_flush_plug(tsk))
		blk_schedule_flush_plug(tsk);
}

/*
 * schedule - public scheduler entry point
 *
 * Runs sched_submit_work() for the current task and then __schedule().
 */
asmlinkage void __sched schedule(void)
{
	struct task_struct *tsk = current;

	sched_submit_work(tsk);
	__schedule();
}
EXPORT_SYMBOL(schedule);

#ifdef CONFIG_CONTEXT_TRACKING
/* schedule() bracketed by user_exit()/user_enter(). */
asmlinkage void __sched schedule_user(void)
{
	/*
	 * If we come here after a random call to set_need_resched(),
	 * or we have been woken up remotely but the IPI has not yet arrived,
	 * we haven't yet exited the RCU idle mode. Do it here manually until
	 * we find a better solution.
	 */
	user_exit();
	schedule();
	user_enter();
}
#endif

/**
 * schedule_preempt_disabled - called with preemption disabled
 *
 * Returns with preemption disabled.
Note: preempt_count must be 1
 */
void __sched schedule_preempt_disabled(void)
{
	/* Drop the caller's preempt count without rescheduling, then schedule. */
	sched_preempt_enable_no_resched();
	schedule();
	preempt_disable();
}

#ifdef CONFIG_MUTEX_SPIN_ON_OWNER

/*
 * Returns false once lock->owner no longer equals @owner; otherwise
 * returns owner->on_cpu.
 */
static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
{
	if (lock->owner != owner)
		return false;

	/*
	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
	 * lock->owner still matches owner, if that fails, owner might
	 * point to free()d memory, if it still matches, the rcu_read_lock()
	 * ensures the memory stays valid.
	 */
	barrier();

	return owner->on_cpu;
}

/*
 * Look out! "owner" is an entirely speculative pointer
 * access and not reliable.
 *
 * Returns 0 immediately when the OWNER_SPIN scheduler feature is off.
 * Otherwise spins (under rcu_read_lock()) while owner_running() holds and
 * no reschedule is needed, and finally returns whether lock->owner is NULL.
 */
int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
{
	if (!sched_feat(OWNER_SPIN))
		return 0;

	rcu_read_lock();
	while (owner_running(lock, owner)) {
		if (need_resched())
			break;

		arch_mutex_cpu_relax();
	}
	rcu_read_unlock();

	/*
	 * We break out the loop above on need_resched() and when the
	 * owner changed, which is a sign for heavy contention. Return
	 * success only when lock->owner is NULL.
	 */
	return lock->owner == NULL;
}
#endif

#ifdef CONFIG_PREEMPT
/*
 * this is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable. Kernel preemptions off return from interrupt
 * occur there and call schedule directly.
 */
asmlinkage void __sched notrace preempt_schedule(void)
{
	struct thread_info *ti = current_thread_info();

	/*
	 * If there is a non-zero preempt_count or interrupts are disabled,
	 * we do not want to preempt the current task. Just return..
	 */
	if (likely(ti->preempt_count || irqs_disabled()))
		return;

	do {
		/* __schedule() runs with PREEMPT_ACTIVE added to the count. */
		add_preempt_count_notrace(PREEMPT_ACTIVE);
		__schedule();
		sub_preempt_count_notrace(PREEMPT_ACTIVE);

		/*
		 * Check again in case we missed a preemption opportunity
		 * between schedule and now.
		 */
		barrier();
	} while (need_resched());
}
EXPORT_SYMBOL(preempt_schedule);

/*
 * this is the entry point to schedule() from kernel preemption
 * off of irq context.
* Note, that this is called and return with irqs disabled. This will
 * protect us against recursive calling from irq.
 */
asmlinkage void __sched preempt_schedule_irq(void)
{
	struct thread_info *ti = current_thread_info();

	/* Catch callers which need to be fixed */
	BUG_ON(ti->preempt_count || !irqs_disabled());

	user_exit();
	do {
		/* Interrupts are enabled only around the __schedule() call. */
		add_preempt_count(PREEMPT_ACTIVE);
		local_irq_enable();
		__schedule();
		local_irq_disable();
		sub_preempt_count(PREEMPT_ACTIVE);

		/*
		 * Check again in case we missed a preemption opportunity
		 * between schedule and now.
		 */
		barrier();
	} while (need_resched());
}

#endif /* CONFIG_PREEMPT */

/*
 * default_wake_function - wake the task stored in curr->private
 *
 * Forwards @mode and @wake_flags to try_to_wake_up() and returns its
 * result. @key is not used here.
 */
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
			  void *key)
{
	return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);

/*
 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
 * number) then we wake all the non-exclusive tasks and one exclusive task.
 *
 * There are circumstances in which we can try to wake a task which has already
 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
 * zero in this (rare) case, and we handle it by continuing to scan the queue.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
			int nr_exclusive, int wake_flags, void *key)
{
	wait_queue_t *curr, *next;

	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
		/* Sample the flags before ->func() is invoked on the entry. */
		unsigned flags = curr->flags;

		/*
		 * Stop once nr_exclusive successful wakeups of
		 * WQ_FLAG_EXCLUSIVE entries have been counted down to zero.
		 */
		if (curr->func(curr, mode, wake_flags, key) &&
				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;
	}
}

/**
 * __wake_up - wake up threads blocked on a waitqueue.
 * @q: the waitqueue
 * @mode: which threads
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
 * @key: is directly passed to the wakeup function
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
*/
void __wake_up(wait_queue_head_t *q, unsigned int mode,
			int nr_exclusive, void *key)
{
	unsigned long flags;

	/* Take q->lock (irqs saved) around the common wakeup scan. */
	spin_lock_irqsave(&q->lock, flags);
	__wake_up_common(q, mode, nr_exclusive, 0, key);
	spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(__wake_up);

/*
 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
 * No key is passed to the wake functions (NULL).
 */
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
{
	__wake_up_common(q, mode, nr, 0, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);

/*
 * Variant of __wake_up_locked() that passes @key through to the wake
 * functions and uses an exclusive count of 1. It takes no lock itself.
 */
void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
{
	__wake_up_common(q, mode, 1, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);

/**
 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
 * @q: the waitqueue
 * @mode: which threads
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
 * @key: opaque value to be passed to wakeup targets
 *
 * The sync wakeup differs that the waker knows that it will schedule
 * away soon, so while the target thread will be woken up, it will not
 * be migrated to another CPU - ie. the two threads are 'synchronized'
 * with each other. This can prevent needless bouncing between CPUs.
 *
 * On UP it can prevent extra preemption.
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
*/ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; int wake_flags = WF_SYNC; if (unlikely(!q)) return; if (unlikely(!nr_exclusive)) wake_flags = 0; spin_lock_irqsave(&q->lock, flags); __wake_up_common(q, mode, nr_exclusive, wake_flags, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); /* * __wake_up_sync - see __wake_up_sync_key() */ void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { __wake_up_sync_key(q, mode, nr_exclusive, NULL); } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ /** * complete: - signals a single thread waiting on this completion * @x: holds the state of this particular completion * * This will wake up a single thread waiting on this completion. Threads will be * awakened in the same order in which they were queued. * * See also complete_all(), wait_for_completion() and related routines. * * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. */ void complete(struct completion *x) { unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); x->done++; __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); /** * complete_all: - signals all threads waiting on this completion * @x: holds the state of this particular completion * * This will wake up all threads waiting on this particular completion event. * * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. 
*/
void complete_all(struct completion *x)
{
	unsigned long flags;

	spin_lock_irqsave(&x->wait.lock, flags);
	/* A large increment so that many waiters can each consume one count. */
	x->done += UINT_MAX/2;
	__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
	spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete_all);

/*
 * do_wait_for_common - wait loop shared by all wait_for_completion*() variants
 * @x:       completion to wait on
 * @action:  sleeping primitive; receives the remaining timeout and returns
 *           the new remaining timeout
 * @timeout: initial timeout handed to @action
 * @state:   task state to sleep in
 *
 * x->wait.lock is dropped around each @action call and re-taken
 * afterwards; the only caller in this file, __wait_for_common(), takes it
 * before calling and releases it after.
 *
 * Returns -ERESTARTSYS if signal_pending_state() fired before completion,
 * the (zero) timeout if @action ran out of time before completion, and
 * otherwise the remaining timeout, or 1 if that is zero, after consuming
 * one count of x->done.
 */
static inline long __sched
do_wait_for_common(struct completion *x,
		   long (*action)(long), long timeout, int state)
{
	if (!x->done) {
		DECLARE_WAITQUEUE(wait, current);

		__add_wait_queue_tail_exclusive(&x->wait, &wait);
		do {
			if (signal_pending_state(state, current)) {
				timeout = -ERESTARTSYS;
				break;
			}
			__set_current_state(state);
			spin_unlock_irq(&x->wait.lock);
			timeout = action(timeout);
			spin_lock_irq(&x->wait.lock);
		} while (!x->done && timeout);
		__remove_wait_queue(&x->wait, &wait);
		/* Still not done: report the signal or timeout result. */
		if (!x->done)
			return timeout;
	}
	x->done--;
	return timeout ?: 1;
}

/*
 * Wraps do_wait_for_common() with might_sleep() and with
 * spin_lock_irq()/spin_unlock_irq() on x->wait.lock.
 */
static inline long __sched
__wait_for_common(struct completion *x,
		  long (*action)(long), long timeout, int state)
{
	might_sleep();

	spin_lock_irq(&x->wait.lock);
	timeout = do_wait_for_common(x, action, timeout, state);
	spin_unlock_irq(&x->wait.lock);
	return timeout;
}

/* Wait using schedule_timeout() as the sleeping primitive. */
static long __sched
wait_for_common(struct completion *x, long timeout, int state)
{
	return __wait_for_common(x, schedule_timeout, timeout, state);
}

/* Wait using io_schedule_timeout() as the sleeping primitive. */
static long __sched
wait_for_common_io(struct completion *x, long timeout, int state)
{
	return __wait_for_common(x, io_schedule_timeout, timeout, state);
}

/**
 * wait_for_completion: - waits for completion of a task
 * @x:  holds the state of this particular completion
 *
 * This waits to be signaled for completion of a specific task. It is NOT
 * interruptible and there is no timeout.
 *
 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
 * and interrupt capability. Also see complete().
*/
void __sched wait_for_completion(struct completion *x)
{
	/* The return value of wait_for_common() is deliberately not used. */
	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion);

/**
 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
 * @x:  holds the state of this particular completion
 * @timeout:  timeout value in jiffies
 *
 * This waits for either a completion of a specific task to be signaled or for a
 * specified timeout to expire. The timeout is in jiffies. It is not
 * interruptible.
 *
 * The return value is 0 if timed out, and positive (at least 1, or number of
 * jiffies left till timeout) if completed.
 */
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
{
	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_timeout);

/**
 * wait_for_completion_io: - waits for completion of a task
 * @x:  holds the state of this particular completion
 *
 * This waits to be signaled for completion of a specific task. It is NOT
 * interruptible and there is no timeout. The caller is accounted as waiting
 * for IO.
 */
void __sched wait_for_completion_io(struct completion *x)
{
	/* Same as wait_for_completion(), but through wait_for_common_io(). */
	wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_io);

/**
 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
 * @x:  holds the state of this particular completion
 * @timeout:  timeout value in jiffies
 *
 * This waits for either a completion of a specific task to be signaled or for a
 * specified timeout to expire. The timeout is in jiffies. It is not
 * interruptible. The caller is accounted as waiting for IO.
 *
 * The return value is 0 if timed out, and positive (at least 1, or number of
 * jiffies left till timeout) if completed.
*/ unsigned long __sched wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) { return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion_io_timeout); /** * wait_for_completion_interruptible: - waits for completion of a task (w/intr) * @x: holds the state of this particular completion * * This waits for completion of a specific task to be signaled. It is * interruptible. * * The return value is -ERESTARTSYS if interrupted, 0 if completed. */ int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); if (t == -ERESTARTSYS) return t; return 0; } EXPORT_SYMBOL(wait_for_completion_interruptible); /** * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) * @x: holds the state of this particular completion * @timeout: timeout value in jiffies * * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. It is interruptible. The timeout is in jiffies. * * The return value is -ERESTARTSYS if interrupted, 0 if timed out, * positive (at least 1, or number of jiffies left till timeout) if completed. */ long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) { return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); /** * wait_for_completion_killable: - waits for completion of a task (killable) * @x: holds the state of this particular completion * * This waits to be signaled for completion of a specific task. It can be * interrupted by a kill signal. * * The return value is -ERESTARTSYS if interrupted, 0 if completed. 
*/ int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); if (t == -ERESTARTSYS) return t; return 0; } EXPORT_SYMBOL(wait_for_completion_killable); /** * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) * @x: holds the state of this particular completion * @timeout: timeout value in jiffies * * This waits for either a completion of a specific task to be * signaled or for a specified timeout to expire. It can be * interrupted by a kill signal. The timeout is in jiffies. * * The return value is -ERESTARTSYS if interrupted, 0 if timed out, * positive (at least 1, or number of jiffies left till timeout) if completed. */ long __sched wait_for_completion_killable_timeout(struct completion *x, unsigned long timeout) { return wait_for_common(x, timeout, TASK_KILLABLE); } EXPORT_SYMBOL(wait_for_completion_killable_timeout); /** * try_wait_for_completion - try to decrement a completion without blocking * @x: completion structure * * Returns: 0 if a decrement cannot be done without blocking * 1 if a decrement succeeded. * * If a completion is being used as a counting completion, * attempt to decrement the counter without blocking. This * enables us to avoid waiting if the resource the completion * is protecting is not available. */ bool try_wait_for_completion(struct completion *x) { unsigned long flags; int ret = 1; spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; else x->done--; spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(try_wait_for_completion); /** * completion_done - Test to see if a completion has any waiters * @x: completion structure * * Returns: 0 if there are waiters (wait_for_completion() in progress) * 1 if there are no waiters. 
*
 */
bool completion_done(struct completion *x)
{
	unsigned long flags;
	int ret = 1;

	/* Sample x->done under x->wait.lock; a zero count yields 0. */
	spin_lock_irqsave(&x->wait.lock, flags);
	if (!x->done)
		ret = 0;
	spin_unlock_irqrestore(&x->wait.lock, flags);
	return ret;
}
EXPORT_SYMBOL(completion_done);

/*
 * sleep_on_common - helper behind the sleep_on*() family
 * @q:       waitqueue to sleep on
 * @state:   task state set before queueing
 * @timeout: timeout handed to schedule_timeout()
 *
 * Sets the task state, queues the current task on @q, sleeps via
 * schedule_timeout(), removes the entry again and returns what
 * schedule_timeout() returned.
 */
static long __sched
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
{
	unsigned long flags;
	wait_queue_t wait;

	init_waitqueue_entry(&wait, current);

	__set_current_state(state);

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue(q, &wait);
	/* Only the lock is dropped here; the saved irq flags are restored below. */
	spin_unlock(&q->lock);
	timeout = schedule_timeout(timeout);
	spin_lock_irq(&q->lock);
	__remove_wait_queue(q, &wait);
	spin_unlock_irqrestore(&q->lock, flags);

	return timeout;
}

/* Sleep on @q in TASK_INTERRUPTIBLE with MAX_SCHEDULE_TIMEOUT. */
void __sched interruptible_sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(interruptible_sleep_on);

/* Sleep on @q in TASK_INTERRUPTIBLE; returns sleep_on_common()'s result. */
long __sched
interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
}
EXPORT_SYMBOL(interruptible_sleep_on_timeout);

/* Sleep on @q in TASK_UNINTERRUPTIBLE with MAX_SCHEDULE_TIMEOUT. */
void __sched sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(sleep_on);

/* Sleep on @q in TASK_UNINTERRUPTIBLE; returns sleep_on_common()'s result. */
long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
}
EXPORT_SYMBOL(sleep_on_timeout);

#ifdef CONFIG_RT_MUTEXES

/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance logic.
 */
void rt_mutex_setprio(struct task_struct *p, int prio)
{
	int oldprio, on_rq, running;
	struct rq *rq;
	const struct sched_class *prev_class;

	BUG_ON(prio < 0 || prio > MAX_PRIO);

	rq = __task_rq_lock(p);

	/*
	 * Idle task boosting is a no-no in general. There is one
	 * exception, when PREEMPT_RT and NOHZ is active:
	 *
	 * The idle task calls get_next_timer_interrupt() and holds
	 * the timer wheel base->lock on the CPU and another CPU wants
	 * to access the timer (probably to cancel it). We can safely
	 * ignore the boosting request, as the idle CPU runs this code
	 * with interrupts disabled and will complete the lock
	 * protected section without being interrupted. So there is no
	 * real need to boost.
	 */
	if (unlikely(p == rq->idle)) {
		WARN_ON(p != rq->curr);
		WARN_ON(p->pi_blocked_on);
		goto out_unlock;
	}

	trace_sched_pi_setprio(p, prio);
	oldprio = p->prio;
	prev_class = p->sched_class;
	on_rq = p->on_rq;
	running = task_current(rq, p);
	/*
	 * Take the task off the runqueue (and out of "current" state)
	 * while its class and priority are changed, then put it back.
	 */
	if (on_rq)
		dequeue_task(rq, p, 0);
	if (running)
		p->sched_class->put_prev_task(rq, p);

	if (rt_prio(prio))
		p->sched_class = &rt_sched_class;
	else
		p->sched_class = &fair_sched_class;

	p->prio = prio;

	if (running)
		p->sched_class->set_curr_task(rq);
	/* ENQUEUE_HEAD is used when the new prio value is numerically larger. */
	if (on_rq)
		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);

	check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
	__task_rq_unlock(rq);
}
#endif
/*
 * set_user_nice - change the nice value of @p to @nice
 *
 * Returns without doing anything if @nice equals the task's current
 * nice value or lies outside [-20, 19].
 */
void set_user_nice(struct task_struct *p, long nice)
{
	int old_prio, delta, on_rq;
	unsigned long flags;
	struct rq *rq;

	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
		return;
	/*
	 * We have to be careful, if called from sys_setpriority(),
	 * the task might be in the middle of scheduling on another CPU.
	 */
	rq = task_rq_lock(p, &flags);
	/*
	 * The RT priorities are set via sched_setscheduler(), but we still
	 * allow the 'normal' nice value to be set - but as expected
	 * it won't have any effect on scheduling until the task is
	 * SCHED_FIFO/SCHED_RR:
	 */
	if (task_has_rt_policy(p)) {
		p->static_prio = NICE_TO_PRIO(nice);
		goto out_unlock;
	}
	on_rq = p->on_rq;
	if (on_rq)
		dequeue_task(rq, p, 0);

	p->static_prio = NICE_TO_PRIO(nice);
	set_load_weight(p);
	old_prio = p->prio;
	p->prio = effective_prio(p);
	delta = p->prio - old_prio;

	if (on_rq) {
		enqueue_task(rq, p, 0);
		/*
		 * If the task increased its priority or is running and
		 * lowered its priority, then reschedule its CPU:
		 */
		if (delta < 0 || (delta > 0 && task_running(rq, p)))
			resched_task(rq->curr);
	}
out_unlock:
	task_rq_unlock(rq, p, &flags);
}
EXPORT_SYMBOL(set_user_nice);

/*
 * can_nice - check if a task can reduce its nice value
 * @p: task
 * @nice: nice value
 *
 * Returns non-zero if the rlimit-style value of @nice is within the
 * task's RLIMIT_NICE, or if the caller has CAP_SYS_NICE.
 */
int can_nice(const struct task_struct *p, const int nice)
{
	/* convert nice value [19,-20] to rlimit style value [1,40] */
	int nice_rlim = 20 - nice;

	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
		capable(CAP_SYS_NICE));
}

#ifdef __ARCH_WANT_SYS_NICE

/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 *
 * Returns 0 on success, -EPERM if a negative @increment is not permitted
 * by can_nice(), or the error from security_task_setnice().
 */
SYSCALL_DEFINE1(nice, int, increment)
{
	long nice, retval;

	/*
	 * Setpriority might change our priority at the same moment.
	 * We don't have to worry. Conceptually one call occurs first
	 * and we have a single winner.
	 */
	/* Clamp the increment to [-40, 40] before adding it. */
	if (increment < -40)
		increment = -40;
	if (increment > 40)
		increment = 40;

	/* Clamp the resulting nice value to [-20, 19]. */
	nice = TASK_NICE(current) + increment;
	if (nice < -20)
		nice = -20;
	if (nice > 19)
		nice = 19;

	if (increment < 0 && !can_nice(current, nice))
		return -EPERM;

	retval = security_task_setnice(current, nice);
	if (retval)
		return retval;

	set_user_nice(current, nice);
	return 0;
}

#endif

/**
 * task_prio - return the priority value of a given task.
* @p: the task in question. * * This is the priority value as seen by users in /proc. * RT tasks are offset by -200. Normal tasks are centered * around 0, value goes from -16 to +15. */ int task_prio(const struct task_struct *p) { return p->prio - MAX_RT_PRIO; } /** * task_nice - return the nice value of a given task. * @p: the task in question. */ int task_nice(const struct task_struct *p) { return TASK_NICE(p); } EXPORT_SYMBOL(task_nice); /** * idle_cpu - is a given cpu idle currently? * @cpu: the processor in question. */ int idle_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); if (rq->curr != rq->idle) return 0; if (rq->nr_running) return 0; #ifdef CONFIG_SMP if (!llist_empty(&rq->wake_list)) return 0; #endif return 1; } /** * idle_task - return the idle task for a given cpu. * @cpu: the processor in question. */ struct task_struct *idle_task(int cpu) { return cpu_rq(cpu)->idle; } /** * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. */ static struct task_struct *find_process_by_pid(pid_t pid) { return pid ? find_task_by_vpid(pid) : current; } /* Actually do priority change: must hold rq lock. 
*/
static void
__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
{
	p->policy = policy;
	p->rt_priority = prio;
	p->normal_prio = normal_prio(p);
	/* we are holding p->pi_lock already */
	p->prio = rt_mutex_getprio(p);
	/* The scheduling class follows the resulting effective priority. */
	if (rt_prio(p->prio))
		p->sched_class = &rt_sched_class;
	else
		p->sched_class = &fair_sched_class;
	set_load_weight(p);
}

/*
 * check the target process has a UID that matches the current process's
 *
 * Returns true if the caller's euid equals either the euid or the uid of
 * @p. The target's credentials are read under rcu_read_lock().
 */
static bool check_same_owner(struct task_struct *p)
{
	const struct cred *cred = current_cred(), *pcred;
	bool match;

	rcu_read_lock();
	pcred = __task_cred(p);
	match = (uid_eq(cred->euid, pcred->euid) ||
		 uid_eq(cred->euid, pcred->uid));
	rcu_read_unlock();
	return match;
}

/*
 * __sched_setscheduler - common implementation of sched_setscheduler*()
 * @p:      task to change
 * @policy: new policy, optionally OR-ed with SCHED_RESET_ON_FORK; a
 *          negative value means "keep the task's current policy"
 * @param:  carries the requested sched_priority
 * @user:   when true, permission checks (capability/rlimit, ownership,
 *          security hook, RT group runtime) are applied
 *
 * Returns 0 on success (including when nothing needs changing),
 * -EINVAL for an invalid policy/priority combination or when @p is the
 * runqueue's stop task, -EPERM when a permission check fails, or the
 * error returned by security_task_setscheduler().
 */
static int __sched_setscheduler(struct task_struct *p, int policy,
				const struct sched_param *param, bool user)
{
	int retval, oldprio, oldpolicy = -1, on_rq, running;
	unsigned long flags;
	const struct sched_class *prev_class;
	struct rq *rq;
	int reset_on_fork;

	/* may grab non-irq protected spin_locks */
	BUG_ON(in_interrupt());
recheck:
	/* double check policy once rq lock held */
	if (policy < 0) {
		reset_on_fork = p->sched_reset_on_fork;
		policy = oldpolicy = p->policy;
	} else {
		/* Split the reset-on-fork flag from the policy number. */
		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
		policy &= ~SCHED_RESET_ON_FORK;

		if (policy != SCHED_FIFO && policy != SCHED_RR &&
				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
				policy != SCHED_IDLE)
			return -EINVAL;
	}

	/*
	 * Valid priorities for SCHED_FIFO and SCHED_RR are
	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
	 * SCHED_BATCH and SCHED_IDLE is 0.
	 */
	if (param->sched_priority < 0 ||
	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
		return -EINVAL;
	if (rt_policy(policy) != (param->sched_priority != 0))
		return -EINVAL;

	/*
	 * Allow unprivileged RT tasks to decrease priority:
	 */
	if (user && !capable(CAP_SYS_NICE)) {
		if (rt_policy(policy)) {
			unsigned long rlim_rtprio =
					task_rlimit(p, RLIMIT_RTPRIO);

			/* can't set/change the rt policy */
			if (policy != p->policy && !rlim_rtprio)
				return -EPERM;

			/* can't increase priority */
			if (param->sched_priority > p->rt_priority &&
			    param->sched_priority > rlim_rtprio)
				return -EPERM;
		}

		/*
		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
		 */
		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
			if (!can_nice(p, TASK_NICE(p)))
				return -EPERM;
		}

		/* can't change other user's priorities */
		if (!check_same_owner(p))
			return -EPERM;

		/* Normal users shall not reset the sched_reset_on_fork flag */
		if (p->sched_reset_on_fork && !reset_on_fork)
			return -EPERM;
	}

	if (user) {
		retval = security_task_setscheduler(p);
		if (retval)
			return retval;
	}

	/*
	 * make sure no PI-waiters arrive (or leave) while we are
	 * changing the priority of the task:
	 *
	 * To be able to change p->policy safely, the appropriate
	 * runqueue lock must be held.
	 */
	rq = task_rq_lock(p, &flags);

	/*
	 * Changing the policy of the stop threads is a very bad idea
	 */
	if (p == rq->stop) {
		task_rq_unlock(rq, p, &flags);
		return -EINVAL;
	}

	/*
	 * If not changing anything there's no need to proceed further:
	 */
	if (unlikely(policy == p->policy && (!rt_policy(policy) ||
			param->sched_priority == p->rt_priority))) {
		task_rq_unlock(rq, p, &flags);
		return 0;
	}

#ifdef CONFIG_RT_GROUP_SCHED
	if (user) {
		/*
		 * Do not allow realtime tasks into groups that have no runtime
		 * assigned.
		 */
		if (rt_bandwidth_enabled() && rt_policy(policy) &&
				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
				!task_group_is_autogroup(task_group(p))) {
			task_rq_unlock(rq, p, &flags);
			return -EPERM;
		}
	}
#endif

	/* recheck policy now with rq lock held */
	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
		policy = oldpolicy = -1;
		task_rq_unlock(rq, p, &flags);
		goto recheck;
	}
	on_rq = p->on_rq;
	running = task_current(rq, p);
	/*
	 * Take the task off the runqueue (and out of "current" state)
	 * while policy and priority are changed, then put it back.
	 */
	if (on_rq)
		dequeue_task(rq, p, 0);
	if (running)
		p->sched_class->put_prev_task(rq, p);

	p->sched_reset_on_fork = reset_on_fork;

	oldprio = p->prio;
	prev_class = p->sched_class;
	__setscheduler(rq, p, policy, param->sched_priority);

	if (running)
		p->sched_class->set_curr_task(rq);
	if (on_rq)
		enqueue_task(rq, p, 0);

	check_class_changed(rq, p, prev_class, oldprio);
	task_rq_unlock(rq, p, &flags);

	/* Called only after the rq lock has been dropped. */
	rt_mutex_adjust_pi(p);

	return 0;
}

/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * NOTE that the task may be already dead.
 */
int sched_setscheduler(struct task_struct *p, int policy,
		       const struct sched_param *param)
{
	return __sched_setscheduler(p, policy, param, true);
}
EXPORT_SYMBOL_GPL(sched_setscheduler);

/**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Just like sched_setscheduler, only don't bother checking if the
 * current context has permission.  For example, this is needed in
 * stop_machine(): we create temporary high priority worker threads,
 * but our caller might not have that capability.
*/ int sched_setscheduler_nocheck(struct task_struct *p, int policy, const struct sched_param *param) { return __sched_setscheduler(p, policy, param, false); } static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { struct sched_param lparam; struct task_struct *p; int retval; if (!param || pid < 0) return -EINVAL; if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); if (p != NULL) retval = sched_setscheduler(p, policy, &lparam); rcu_read_unlock(); return retval; } /** * sys_sched_setscheduler - set/change the scheduler policy and RT priority * @pid: the pid in question. * @policy: new policy. * @param: structure containing the new RT priority. */ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) { /* negative values for policy are not valid */ if (policy < 0) return -EINVAL; return do_sched_setscheduler(pid, policy, param); } /** * sys_sched_setparam - set/change the RT priority of a thread * @pid: the pid in question. * @param: structure containing the new RT priority. */ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) { return do_sched_setscheduler(pid, -1, param); } /** * sys_sched_getscheduler - get the policy (scheduling class) of a thread * @pid: the pid in question. */ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) { struct task_struct *p; int retval; if (pid < 0) return -EINVAL; retval = -ESRCH; rcu_read_lock(); p = find_process_by_pid(pid); if (p) { retval = security_task_getscheduler(p); if (!retval) retval = p->policy | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); } rcu_read_unlock(); return retval; } /** * sys_sched_getparam - get the RT priority of a thread * @pid: the pid in question. * @param: structure containing the RT priority. 
*/
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
	struct sched_param lp;
	struct task_struct *p;
	int retval;

	if (!param || pid < 0)
		return -EINVAL;

	rcu_read_lock();
	p = find_process_by_pid(pid);
	retval = -ESRCH;
	if (!p)
		goto out_unlock;

	retval = security_task_getscheduler(p);
	if (retval)
		goto out_unlock;

	/* Snapshot the priority before leaving the RCU read-side section. */
	lp.sched_priority = p->rt_priority;
	rcu_read_unlock();

	/*
	 * This one might sleep, we cannot do it with a spinlock held ...
	 */
	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;

	return retval;

out_unlock:
	rcu_read_unlock();
	return retval;
}

/*
 * sched_setaffinity - restrict the task identified by @pid to @in_mask.
 *
 * The requested mask is intersected with the mask reported by
 * cpuset_cpus_allowed() and then applied with set_cpus_allowed_ptr().
 * If, after applying it, the applied mask is no longer a subset of the
 * cpuset mask, the cpuset mask itself is applied instead.
 *
 * Returns 0 on success, -ESRCH if no task matches @pid, -ENOMEM if a
 * temporary cpumask cannot be allocated, -EPERM if the caller neither passes
 * check_same_owner() nor holds CAP_SYS_NICE in the user namespace of the
 * task, or the error from security_task_setscheduler() or
 * set_cpus_allowed_ptr().
 */
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
	cpumask_var_t cpus_allowed, new_mask;
	struct task_struct *p;
	int retval;

	get_online_cpus();
	rcu_read_lock();

	p = find_process_by_pid(pid);
	if (!p) {
		rcu_read_unlock();
		put_online_cpus();
		return -ESRCH;
	}

	/* Prevent p going away */
	get_task_struct(p);
	rcu_read_unlock();

	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
		retval = -ENOMEM;
		goto out_put_task;
	}
	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
		retval = -ENOMEM;
		goto out_free_cpus_allowed;
	}
	retval = -EPERM;
	if (!check_same_owner(p)) {
		/* __task_cred() is dereferenced inside an RCU read-side section. */
		rcu_read_lock();
		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
			rcu_read_unlock();
			goto out_unlock;
		}
		rcu_read_unlock();
	}

	retval = security_task_setscheduler(p);
	if (retval)
		goto out_unlock;

	cpuset_cpus_allowed(p, cpus_allowed);
	cpumask_and(new_mask, in_mask, cpus_allowed);
again:
	retval = set_cpus_allowed_ptr(p, new_mask);

	if (!retval) {
		cpuset_cpus_allowed(p, cpus_allowed);
		if (!cpumask_subset(new_mask, cpus_allowed)) {
			/*
			 * We must have raced with a concurrent cpuset
			 * update. Just reset the cpus_allowed to the
			 * cpuset's cpus_allowed
			 */
			cpumask_copy(new_mask, cpus_allowed);
			goto again;
		}
	}
	/* Cleanup labels undo the acquisitions above in reverse order. */
out_unlock:
	free_cpumask_var(new_mask);
out_free_cpus_allowed:
	free_cpumask_var(cpus_allowed);
out_put_task:
	put_task_struct(p);
	put_online_cpus();
	return retval;
}

/*
 * Copy a user-supplied cpu bitmap of @len bytes into @new_mask.
 *
 * When @len is smaller than cpumask_size() the mask is cleared first, so the
 * bytes not supplied by the user stay zero; when it is larger only
 * cpumask_size() bytes are copied.
 *
 * Returns 0 on success or -EFAULT if the copy from user space fails.
 */
static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
			     struct cpumask *new_mask)
{
	if (len < cpumask_size())
		cpumask_clear(new_mask);
	else if (len > cpumask_size())
		len = cpumask_size();

	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
}

/**
 * sys_sched_setaffinity - set the cpu affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to the new cpu mask
 */
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
		unsigned long __user *, user_mask_ptr)
{
	cpumask_var_t new_mask;
	int retval;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
	if (retval == 0)
		retval = sched_setaffinity(pid, new_mask);
	free_cpumask_var(new_mask);
	return retval;
}

/*
 * sched_getaffinity - store in @mask the cpus that task @pid may run on,
 * restricted to cpu_online_mask.
 *
 * Returns 0 on success, -ESRCH if no task matches @pid, or the error from
 * security_task_getscheduler().
 */
long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
	struct task_struct *p;
	unsigned long flags;
	int retval;

	get_online_cpus();
	rcu_read_lock();

	retval = -ESRCH;
	p = find_process_by_pid(pid);
	if (!p)
		goto out_unlock;

	retval = security_task_getscheduler(p);
	if (retval)
		goto out_unlock;

	/* p->cpus_allowed is read with p->pi_lock held. */
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

out_unlock:
	rcu_read_unlock();
	put_online_cpus();

	return retval;
}

/**
 * sys_sched_getaffinity - get the cpu affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to hold the current cpu mask
 *
 * On success the number of bytes copied to user space is returned.
 */
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
		unsigned long __user *, user_mask_ptr)
{
	int ret;
	cpumask_var_t mask;

	/* The user buffer must hold at least nr_cpu_ids bits ... */
	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
		return -EINVAL;
	/* ... and be a whole number of unsigned longs. */
	if (len & (sizeof(unsigned long)-1))
		return -EINVAL;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	ret = sched_getaffinity(pid, mask);
	if (ret == 0) {
		size_t retlen = min_t(size_t, len, cpumask_size());

		if (copy_to_user(user_mask_ptr, mask, retlen))
			ret = -EFAULT;
		else
			ret = retlen;
	}
	free_cpumask_var(mask);

	return ret;
}

/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 */
SYSCALL_DEFINE0(sched_yield)
{
	struct rq *rq = this_rq_lock();

	schedstat_inc(rq, yld_count);
	current->sched_class->yield_task(rq);

	/*
	 * Since we are going to call schedule() anyway, there's
	 * no need to preempt or enable interrupts:
	 */
	__release(rq->lock);
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
	do_raw_spin_unlock(&rq->lock);
	sched_preempt_enable_no_resched();

	schedule();

	return 0;
}

/*
 * True when a reschedule has been requested and PREEMPT_ACTIVE is not set
 * in the preempt count.
 */
static inline int should_resched(void)
{
	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
}

/* Call __schedule() with PREEMPT_ACTIVE added to the preempt count. */
static void __cond_resched(void)
{
	add_preempt_count(PREEMPT_ACTIVE);
	__schedule();
	sub_preempt_count(PREEMPT_ACTIVE);
}

/*
 * Reschedule if should_resched() asks for it.
 * Returns 1 if __cond_resched() was called, 0 otherwise.
 */
int __sched _cond_resched(void)
{
	if (should_resched()) {
		__cond_resched();
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL(_cond_resched);

/*
 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
 * call schedule, and on return reacquire the lock.
 *
 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
 * operations here to prevent schedule() from being called twice (once via
 * spin_unlock(), once by hand).
*/ int __cond_resched_lock(spinlock_t *lock) { int resched = should_resched(); int ret = 0; lockdep_assert_held(lock); if (spin_needbreak(lock) || resched) { spin_unlock(lock); if (resched) __cond_resched(); else cpu_relax(); ret = 1; spin_lock(lock); } return ret; } EXPORT_SYMBOL(__cond_resched_lock); int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); if (should_resched()) { local_bh_enable(); __cond_resched(); local_bh_disable(); return 1; } return 0; } EXPORT_SYMBOL(__cond_resched_softirq); /** * yield - yield the current processor to other threads. * * Do not ever use this function, there's a 99% chance you're doing it wrong. * * The scheduler is at all times free to pick the calling task as the most * eligible task to run, if removing the yield() call from your code breaks * it, its already broken. * * Typical broken usage is: * * while (!event) * yield(); * * where one assumes that yield() will let 'the other' process run that will * make event true. If the current task is a SCHED_FIFO task that will never * happen. Never use yield() as a progress guarantee!! * * If you want to use yield() to wait for something, use wait_event(). * If you want to use yield() to be 'nice' for others, use cond_resched(). * If you still want to use yield(), do not! */ void __sched yield(void) { set_current_state(TASK_RUNNING); sys_sched_yield(); } EXPORT_SYMBOL(yield); /** * yield_to - yield the current processor to another thread in * your thread group, or accelerate that thread toward the * processor it's on. * @p: target task * @preempt: whether task preemption is allowed or not * * It's the caller's job to ensure that the target task struct * can't go away on us before we can do any checks. * * Returns: * true (>0) if we indeed boosted the target task. * false (0) if we failed to boost the target. * -ESRCH if there's no task to yield to. 
*/
/*
 * NOTE(review): the function is declared bool while the local result can
 * hold -ESRCH; converting a negative int to bool yields true, so callers
 * presumably cannot tell -ESRCH from success.  Fixing this needs the
 * prototype changed as well - confirm against the declaration and callers.
 */
bool __sched yield_to(struct task_struct *p, bool preempt)
{
	struct task_struct *curr = current;
	struct rq *rq, *p_rq;
	unsigned long flags;
	int yielded = 0;

	local_irq_save(flags);
	rq = this_rq();

again:
	p_rq = task_rq(p);
	/*
	 * If we're the only runnable task on the rq and target rq also
	 * has only one task, there's absolutely no point in yielding.
	 */
	if (rq->nr_running == 1 && p_rq->nr_running == 1) {
		yielded = -ESRCH;
		goto out_irq;
	}

	double_rq_lock(rq, p_rq);
	/* p may have changed runqueue before both locks were taken: retry. */
	while (task_rq(p) != p_rq) {
		double_rq_unlock(rq, p_rq);
		goto again;
	}

	/* The current scheduling class must implement yield_to_task ... */
	if (!curr->sched_class->yield_to_task)
		goto out_unlock;

	/* ... and both tasks must belong to the same class. */
	if (curr->sched_class != p->sched_class)
		goto out_unlock;

	/* Give up if p is running on its rq or p->state is non-zero. */
	if (task_running(p_rq, p) || p->state)
		goto out_unlock;

	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
	if (yielded) {
		schedstat_inc(rq, yld_count);
		/*
		 * Make p's CPU reschedule; pick_next_entity takes care of
		 * fairness.
		 */
		if (preempt && rq != p_rq)
			resched_task(p_rq->curr);
	}

out_unlock:
	double_rq_unlock(rq, p_rq);
out_irq:
	local_irq_restore(flags);

	/* Only give up the cpu when the class reported a successful yield. */
	if (yielded > 0)
		schedule();

	return yielded;
}
EXPORT_SYMBOL_GPL(yield_to);

/*
 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
 * that process accounting knows that this is a task in IO wait state.
 */
void __sched io_schedule(void)
{
	struct rq *rq = raw_rq();

	delayacct_blkio_start();
	atomic_inc(&rq->nr_iowait);
	blk_flush_plug(current);
	current->in_iowait = 1;
	schedule();
	current->in_iowait = 0;
	/* The counter of the rq sampled before sleeping is decremented. */
	atomic_dec(&rq->nr_iowait);
	delayacct_blkio_end();
}
EXPORT_SYMBOL(io_schedule);

/*
 * Same accounting as io_schedule(), but sleeps through schedule_timeout()
 * and returns its result.
 */
long __sched io_schedule_timeout(long timeout)
{
	struct rq *rq = raw_rq();
	long ret;

	delayacct_blkio_start();
	atomic_inc(&rq->nr_iowait);
	blk_flush_plug(current);
	current->in_iowait = 1;
	ret = schedule_timeout(timeout);
	current->in_iowait = 0;
	atomic_dec(&rq->nr_iowait);
	delayacct_blkio_end();
	return ret;
}

/**
 * sys_sched_get_priority_max - return maximum RT priority.
 * @policy: scheduling class.
*
 * this syscall returns the maximum rt_priority that can be used
 * by a given scheduling class.
 */
SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
{
	/* Any policy not listed below yields -EINVAL. */
	int ret = -EINVAL;

	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		ret = MAX_USER_RT_PRIO-1;
		break;
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
		ret = 0;
		break;
	}
	return ret;
}

/**
 * sys_sched_get_priority_min - return minimum RT priority.
 * @policy: scheduling class.
 *
 * this syscall returns the minimum rt_priority that can be used
 * by a given scheduling class.
 */
SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
{
	/* Any policy not listed below yields -EINVAL. */
	int ret = -EINVAL;

	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		ret = 1;
		break;
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
		ret = 0;
	}
	return ret;
}

/**
 * sys_sched_rr_get_interval - return the default timeslice of a process.
 * @pid: pid of the process.
 * @interval: userspace pointer to the timeslice value.
 *
 * this syscall writes the default timeslice value of a given process
 * into the user-space timespec buffer. A value of '0' means infinity.
 */
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
		struct timespec __user *, interval)
{
	struct task_struct *p;
	unsigned int time_slice;
	unsigned long flags;
	struct rq *rq;
	int retval;
	struct timespec t;

	if (pid < 0)
		return -EINVAL;

	retval = -ESRCH;
	rcu_read_lock();
	p = find_process_by_pid(pid);
	if (!p)
		goto out_unlock;

	retval = security_task_getscheduler(p);
	if (retval)
		goto out_unlock;

	/* The class hook is called with the rq of the task locked. */
	rq = task_rq_lock(p, &flags);
	time_slice = p->sched_class->get_rr_interval(rq, p);
	task_rq_unlock(rq, p, &flags);

	rcu_read_unlock();
	/* The slice is in jiffies; convert before copying to user space. */
	jiffies_to_timespec(time_slice, &t);
	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
	return retval;

out_unlock:
	rcu_read_unlock();
	return retval;
}

/* One character per task state, indexed by the state value computed below. */
static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;

/*
 * Print a one-line summary of @p (comm, state character, saved PC or a
 * running marker, unused stack, pid, parent pid, thread-info flags) and then
 * its stack trace.
 */
void sched_show_task(struct task_struct *p)
{
	unsigned long free = 0;
	int ppid;
	unsigned state;

	/* 0 for a zero p->state, otherwise __ffs() of the state plus one. */
	state = p->state ? __ffs(p->state) + 1 : 0;
	/* States beyond the table are printed as a question mark. */
	printk(KERN_INFO "%-15.15s %c", p->comm,
		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
	if (state == TASK_RUNNING)
		printk(KERN_CONT " running ");
	else
		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
#else
	if (state == TASK_RUNNING)
		printk(KERN_CONT " running task ");
	else
		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
	/* Without this option free stays 0. */
	free = stack_not_used(p);
#endif
	rcu_read_lock();
	ppid = task_pid_nr(rcu_dereference(p->real_parent));
	rcu_read_unlock();
	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
		task_pid_nr(p), ppid,
		(unsigned long)task_thread_info(p)->flags);

	show_stack(p, NULL);
}

/*
 * Dump every thread whose p->state intersects @state_filter; a filter of 0
 * dumps all threads and additionally shows all held locks.
 */
void show_state_filter(unsigned long state_filter)
{
	struct task_struct *g, *p;

#if BITS_PER_LONG == 32
	printk(KERN_INFO
		" task PC stack pid father\n");
#else
	printk(KERN_INFO
		" task PC stack pid father\n");
#endif
	rcu_read_lock();
	do_each_thread(g, p) {
		/*
		 * reset the NMI-timeout, listing all files on a slow
		 * console might take a lot of time:
		 */
		touch_nmi_watchdog();
		if (!state_filter || (p->state & state_filter))
			sched_show_task(p);
	} while_each_thread(g, p);

	touch_all_softlockup_watchdogs();

#ifdef CONFIG_SCHED_DEBUG
	sysrq_sched_debug_show();
#endif
	rcu_read_unlock();
	/*
	 * Only show locks if all tasks are dumped:
	 */
	if (!state_filter)
		debug_show_all_locks();
}

/* Put @idle into the idle scheduling class. */
void __cpuinit init_idle_bootup_task(struct task_struct *idle)
{
	idle->sched_class = &idle_sched_class;
}

/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
*/
void __cpuinit init_idle(struct task_struct *idle, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	__sched_fork(idle);
	idle->state = TASK_RUNNING;
	idle->se.exec_start = sched_clock();

	/* Pin the idle task to the single cpu it belongs to. */
	do_set_cpus_allowed(idle, cpumask_of(cpu));
	/*
	 * We're having a chicken and egg problem, even though we are
	 * holding rq->lock, the cpu isn't yet set to this cpu so the
	 * lockdep check in task_group() will fail.
	 *
	 * Similar case to sched_fork(). / Alternatively we could
	 * use task_rq_lock() here and obtain the other rq->lock.
	 *
	 * Silence PROVE_RCU
	 */
	rcu_read_lock();
	__set_task_cpu(idle, cpu);
	rcu_read_unlock();

	/* The idle task is both the current and the idle task of this rq. */
	rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP)
	idle->on_cpu = 1;
#endif
	raw_spin_unlock_irqrestore(&rq->lock, flags);

	/* Set the preempt count _outside_ the spinlocks! */
	task_thread_info(idle)->preempt_count = 0;

	/*
	 * The idle tasks have their own, simple scheduling class:
	 */
	idle->sched_class = &idle_sched_class;
	ftrace_graph_init_idle_task(idle, cpu);
	vtime_init_idle(idle);
#if defined(CONFIG_SMP)
	/* Name the task after INIT_TASK_COMM and the cpu number. */
	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
}

#ifdef CONFIG_SMP
/*
 * Install @new_mask as the allowed-cpu mask of @p.
 *
 * The set_cpus_allowed hook of the scheduling class, if there is one, is
 * called before p->cpus_allowed is overwritten; nr_cpus_allowed is then
 * refreshed from the weight of the new mask.  No locking is done here.
 */
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	if (p->sched_class && p->sched_class->set_cpus_allowed)
		p->sched_class->set_cpus_allowed(p, new_mask);

	cpumask_copy(&p->cpus_allowed, new_mask);
	p->nr_cpus_allowed = cpumask_weight(new_mask);
}

/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using
 *    stop_one_cpu().
 * 2) stopper starts to run (implicitly forcing the migrated thread
 *    off the CPU)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 5) stopper completes and stop_one_cpu() returns and the migration
 *    is done.
 */

/*
 * Change a given task's CPU affinity.
Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
	unsigned long flags;
	struct rq *rq;
	unsigned int dest_cpu;
	int ret = 0;

	rq = task_rq_lock(p, &flags);

	/* Nothing to do when the mask does not change. */
	if (cpumask_equal(&p->cpus_allowed, new_mask))
		goto out;

	/* The new mask must contain at least one active cpu. */
	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
		ret = -EINVAL;
		goto out;
	}

	/* A PF_THREAD_BOUND task may only be changed by itself. */
	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
		ret = -EINVAL;
		goto out;
	}

	do_set_cpus_allowed(p, new_mask);

	/* Can the task run on the task's current CPU? If so, we're done */
	if (cpumask_test_cpu(task_cpu(p), new_mask))
		goto out;

	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
	if (p->on_rq) {
		struct migration_arg arg = { p, dest_cpu };
		/* Need help from migration thread: drop lock and wait. */
		task_rq_unlock(rq, p, &flags);
		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
		tlb_migrate_finish(p->mm);
		return 0;
	}
out:
	task_rq_unlock(rq, p, &flags);

	return ret;
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

/*
 * Move (not current) task off this cpu, onto dest cpu. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 *
 * Returns non-zero if task was successfully migrated.
 */
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
	struct rq *rq_dest, *rq_src;
	int ret = 0;

	if (unlikely(!cpu_active(dest_cpu)))
		return ret;

	rq_src = cpu_rq(src_cpu);
	rq_dest = cpu_rq(dest_cpu);

	/* Lock order: p->pi_lock first, then both runqueue locks. */
	raw_spin_lock(&p->pi_lock);
	double_rq_lock(rq_src, rq_dest);
	/* Already moved. */
	if (task_cpu(p) != src_cpu)
		goto done;
	/* Affinity changed (again). */
	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
		goto fail;

	/*
	 * If we're not on a rq, the next wake-up will ensure we're
	 * placed properly.
	 */
	if (p->on_rq) {
		dequeue_task(rq_src, p, 0);
		set_task_cpu(p, dest_cpu);
		enqueue_task(rq_dest, p, 0);
		check_preempt_curr(rq_dest, p, 0);
	}
done:
	ret = 1;
fail:
	double_rq_unlock(rq_src, rq_dest);
	raw_spin_unlock(&p->pi_lock);
	return ret;
}

/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping thread off CPU then
 * 'pushing' onto another runqueue.
 */
static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;

	/*
	 * The original target cpu might have gone down and we might
	 * be on another cpu but it doesn't matter.
	 */
	local_irq_disable();
	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
	local_irq_enable();
	return 0;
}

#ifdef CONFIG_HOTPLUG_CPU

/*
 * Ensures that the idle task is using init_mm right before its cpu goes
 * offline.
 */
void idle_task_exit(void)
{
	struct mm_struct *mm = current->active_mm;

	/* Must not be called while this cpu is still marked online. */
	BUG_ON(cpu_online(smp_processor_id()));

	if (mm != &init_mm)
		switch_mm(mm, &init_mm, current);
	mmdrop(mm);
}

/*
 * Since this CPU is going 'away' for a while, fold any nr_active delta
 * we might have. Assumes we're called after migrate_tasks() so that the
 * nr_active count is stable.
 *
 * Also see the comment "Global load-average calculations".
 */
static void calc_load_migrate(struct rq *rq)
{
	long delta = calc_load_fold_active(rq);
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);
}

/*
 * Migrate all tasks from the rq, sleeping tasks will be migrated by
 * try_to_wake_up()->select_task_rq().
 *
 * Called with rq->lock held even though we'er in stop_machine() and
 * there's no concurrency possible, we hold the required locks anyway
 * because of lock validation efforts.
*/
static void migrate_tasks(unsigned int dead_cpu)
{
	struct rq *rq = cpu_rq(dead_cpu);
	struct task_struct *next, *stop = rq->stop;
	int dest_cpu;

	/*
	 * Fudge the rq selection such that the below task selection loop
	 * doesn't get stuck on the currently eligible stop task.
	 *
	 * We're currently inside stop_machine() and the rq is either stuck
	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
	 * either way we should never end up calling schedule() until we're
	 * done here.
	 */
	rq->stop = NULL;

	for ( ; ; ) {
		/*
		 * There's this thread running, bail when that's the only
		 * remaining thread.
		 */
		if (rq->nr_running == 1)
			break;

		next = pick_next_task(rq);
		BUG_ON(!next);
		next->sched_class->put_prev_task(rq, next);

		/* Find suitable destination for @next, with force if needed. */
		dest_cpu = select_fallback_rq(dead_cpu, next);
		/* rq->lock is dropped around the migration and retaken after. */
		raw_spin_unlock(&rq->lock);

		__migrate_task(next, dead_cpu, dest_cpu);

		raw_spin_lock(&rq->lock);
	}

	/* Restore the stop task saved on entry. */
	rq->stop = stop;
}

#endif /* CONFIG_HOTPLUG_CPU */

#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)

/* The sched_domain directory; its child table is filled in at registration. */
static struct ctl_table sd_ctl_dir[] = {
	{
		.procname	= "sched_domain",
		.mode		= 0555,
	},
	{}
};

/* The kernel directory holding sd_ctl_dir. */
static struct ctl_table sd_ctl_root[] = {
	{
		.procname	= "kernel",
		.mode		= 0555,
		.child		= sd_ctl_dir,
	},
	{}
};

/* Allocate a zeroed array of @n ctl_table entries; NULL on failure. */
static struct ctl_table *sd_alloc_ctl_entry(int n)
{
	struct ctl_table *entry =
		kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);

	return entry;
}

/*
 * Recursively free the table *tablep and everything below it, then set
 * *tablep to NULL.
 */
static void sd_free_ctl_entry(struct ctl_table **tablep)
{
	struct ctl_table *entry;

	/*
	 * In the intermediate directories, both the child directory and
	 * procname are dynamically allocated and could fail but the mode
	 * will always be set. In the lowest directory the names are
	 * static strings and all have proc handlers.
	 */
	for (entry = *tablep; entry->mode; entry++) {
		if (entry->child)
			sd_free_ctl_entry(&entry->child);
		if (entry->proc_handler == NULL)
			kfree(entry->procname);
	}

	kfree(*tablep);
	*tablep = NULL;
}

/* Bounds installed as extra1/extra2 for the *_idx entries. */
static int min_load_idx = 0;
static int max_load_idx = CPU_LOAD_IDX_MAX;

/*
 * Fill in one sysctl entry.  When @load_idx is true the entry additionally
 * gets min_load_idx and max_load_idx as extra1 and extra2.
 */
static void
set_table_entry(struct ctl_table *entry,
		const char *procname, void *data, int maxlen,
		umode_t mode, proc_handler *proc_handler,
		bool load_idx)
{
	entry->procname = procname;
	entry->data = data;
	entry->maxlen = maxlen;
	entry->mode = mode;
	entry->proc_handler = proc_handler;

	if (load_idx) {
		entry->extra1 = &min_load_idx;
		entry->extra2 = &max_load_idx;
	}
}

/*
 * Build the sysctl table that exposes the fields of @sd: twelve entries
 * followed by a zeroed terminator.  Returns NULL if the allocation fails.
 */
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
	struct ctl_table *table = sd_alloc_ctl_entry(13);

	if (table == NULL)
		return NULL;

	set_table_entry(&table[0], "min_interval", &sd->min_interval,
		sizeof(long), 0644, proc_doulongvec_minmax, false);
	set_table_entry(&table[1], "max_interval", &sd->max_interval,
		sizeof(long), 0644, proc_doulongvec_minmax, false);
	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
		sizeof(int), 0644, proc_dointvec_minmax, true);
	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
		sizeof(int), 0644, proc_dointvec_minmax, true);
	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
		sizeof(int), 0644, proc_dointvec_minmax, true);
	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
		sizeof(int), 0644, proc_dointvec_minmax, true);
	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
		sizeof(int), 0644, proc_dointvec_minmax, true);
	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
		sizeof(int), 0644, proc_dointvec_minmax, false);
	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
		sizeof(int), 0644, proc_dointvec_minmax, false);
	set_table_entry(&table[9], "cache_nice_tries",
		&sd->cache_nice_tries,
		sizeof(int), 0644, proc_dointvec_minmax, false);
	set_table_entry(&table[10], "flags", &sd->flags,
		sizeof(int), 0644, proc_dointvec_minmax, false);
	set_table_entry(&table[11], "name", sd->name,
		CORENAME_MAX_SIZE, 0444, proc_dostring, false);
	/* &table[12] is terminator */

	return table;
}

/*
 * Build the directory table for @cpu: one domainN entry per sched domain of
 * that cpu plus a terminator.  Returns NULL if the table itself cannot be
 * allocated.
 * NOTE(review): kstrdup() and the child table allocation are not checked
 * here; sd_free_ctl_entry() is written to cope with that.
 */
static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
{
	struct ctl_table *entry, *table;
	struct sched_domain *sd;
	int domain_num = 0, i;
	char buf[32];

	/* First pass: count the domains to size the table. */
	for_each_domain(cpu, sd)
		domain_num++;
	entry = table = sd_alloc_ctl_entry(domain_num + 1);
	if (table == NULL)
		return NULL;

	i = 0;
	for_each_domain(cpu, sd) {
		snprintf(buf, 32, "domain%d", i);
		entry->procname = kstrdup(buf, GFP_KERNEL);
		entry->mode = 0555;
		entry->child = sd_alloc_ctl_domain_table(sd);
		entry++;
		i++;
	}
	return table;
}

/* Header returned by register_sysctl_table(); NULL while unregistered. */
static struct ctl_table_header *sd_sysctl_header;
/*
 * Create one cpuN directory per possible cpu below sd_ctl_dir and register
 * the whole tree.  Does nothing further if the top-level allocation fails.
 */
static void register_sched_domain_sysctl(void)
{
	int i, cpu_num = num_possible_cpus();
	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
	char buf[32];

	WARN_ON(sd_ctl_dir[0].child);
	sd_ctl_dir[0].child = entry;

	if (entry == NULL)
		return;

	for_each_possible_cpu(i) {
		snprintf(buf, 32, "cpu%d", i);
		entry->procname = kstrdup(buf, GFP_KERNEL);
		entry->mode = 0555;
		entry->child = sd_alloc_ctl_cpu_table(i);
		entry++;
	}

	WARN_ON(sd_sysctl_header);
	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
}

/* may be called multiple times per register */
static void unregister_sched_domain_sysctl(void)
{
	if (sd_sysctl_header)
		unregister_sysctl_table(sd_sysctl_header);
	sd_sysctl_header = NULL;
	if (sd_ctl_dir[0].child)
		sd_free_ctl_entry(&sd_ctl_dir[0].child);
}
#else
/* Empty stubs used when the sysctl interface is not built. */
static void register_sched_domain_sysctl(void)
{
}
static void unregister_sched_domain_sysctl(void)
{
}
#endif

/*
 * Mark @rq online in its root domain and call the rq_online hook of every
 * scheduling class that has one.  Does nothing if @rq is already online.
 */
static void set_rq_online(struct rq *rq)
{
	if (!rq->online) {
		const struct sched_class *class;

		cpumask_set_cpu(rq->cpu, rq->rd->online);
		rq->online = 1;

		for_each_class(class) {
			if (class->rq_online)
				class->rq_online(rq);
		}
	}
}

/*
 * Counterpart of set_rq_online(): the rq_offline hooks run first, then the
 * cpu is cleared from the online mask of the root domain.
 */
static void set_rq_offline(struct rq *rq)
{
	if (rq->online) {
		const struct sched_class *class;

		for_each_class(class) {
			if (class->rq_offline)
				class->rq_offline(rq);
		}

		cpumask_clear_cpu(rq->cpu, rq->rd->online);
		rq->online = 0;
	}
}

/*
 * migration_call - callback that gets triggered when a CPU is added.
 * Here we can start up the necessary migration thread for the new CPU.
 */
static int __cpuinit
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;
	unsigned long flags;
	struct rq *rq = cpu_rq(cpu);

	/* The CPU_TASKS_FROZEN bit is masked off before dispatching. */
	switch (action & ~CPU_TASKS_FROZEN) {

	case CPU_UP_PREPARE:
		rq->calc_load_update = calc_load_update;
		break;

	case CPU_ONLINE:
		/* Update our root-domain */
		raw_spin_lock_irqsave(&rq->lock, flags);
		if (rq->rd) {
			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));

			set_rq_online(rq);
		}
		raw_spin_unlock_irqrestore(&rq->lock, flags);
		break;

#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DYING:
		sched_ttwu_pending();
		/* Update our root-domain */
		raw_spin_lock_irqsave(&rq->lock, flags);
		if (rq->rd) {
			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
			set_rq_offline(rq);
		}
		migrate_tasks(cpu);
		BUG_ON(rq->nr_running != 1); /* the migration thread */
		raw_spin_unlock_irqrestore(&rq->lock, flags);
		break;

	case CPU_DEAD:
		calc_load_migrate(rq);
		break;
#endif
	}

	/* Runs for every notification, whatever the action was. */
	update_max_interval();

	return NOTIFY_OK;
}

/*
 * Register at high priority so that task migration (migrate_all_tasks)
 * happens before everything else. This has to be lower priority than
 * the notifier in the perf_event subsystem, though.
*/
static struct notifier_block __cpuinitdata migration_notifier = {
	.notifier_call = migration_call,
	.priority = CPU_PRI_MIGRATION,
};

/* Hotplug callback: mark the cpu active on CPU_STARTING and CPU_DOWN_FAILED. */
static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_STARTING:
	case CPU_DOWN_FAILED:
		set_cpu_active((long)hcpu, true);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

/* Hotplug callback: mark the cpu inactive on CPU_DOWN_PREPARE. */
static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		set_cpu_active((long)hcpu, false);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

/*
 * Early initcall: run the migration callback by hand for the cpu executing
 * this code, then register the hotplug notifiers.
 */
static int __init migration_init(void)
{
	void *cpu = (void *)(long)smp_processor_id();
	int err;

	/* Initialize migration for the boot CPU */
	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
	BUG_ON(err == NOTIFY_BAD);
	migration_call(&migration_notifier, CPU_ONLINE, cpu);
	register_cpu_notifier(&migration_notifier);

	/* Register cpu active notifiers */
	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);

	return 0;
}
early_initcall(migration_init);
#endif

#ifdef CONFIG_SMP

static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */

#ifdef CONFIG_SCHED_DEBUG

/* Set to 1 by the sched_debug early parameter. */
static __read_mostly int sched_debug_enabled;

static int __init sched_debug_setup(char *str)
{
	sched_debug_enabled = 1;

	return 0;
}
early_param("sched_debug", sched_debug_setup);

static inline bool sched_debug(void)
{
	return sched_debug_enabled;
}

/*
 * Print and sanity-check one level of the domain hierarchy of @cpu.
 * @groupmask is scratch space that accumulates the cpus of all groups seen.
 *
 * Returns -1 for a domain without SD_LOAD_BALANCE (the caller stops walking
 * upwards), 0 otherwise.
 */
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
				  struct cpumask *groupmask)
{
	struct sched_group *group = sd->groups;
	char str[256];

	cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
	cpumask_clear(groupmask);

	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);

	if (!(sd->flags & SD_LOAD_BALANCE)) {
		printk("does not load-balance\n");
		if (sd->parent)
			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
					" has parent");
		return -1;
	}

	printk(KERN_CONT "span %s level %s\n", str, sd->name);

	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
		printk(KERN_ERR "ERROR: domain->span does not contain "
				"CPU%d\n", cpu);
	}
	if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
		printk(KERN_ERR "ERROR: domain->groups does not contain"
				" CPU%d\n", cpu);
	}

	printk(KERN_DEBUG "%*s groups:", level + 1, "");
	/* Walk the circular group list once, stopping at the first error. */
	do {
		if (!group) {
			printk("\n");
			printk(KERN_ERR "ERROR: group is NULL\n");
			break;
		}

		/*
		 * Even though we initialize ->power to something semi-sane,
		 * we leave power_orig unset. This allows us to detect if
		 * domain iteration is still funny without causing /0 traps.
		 */
		if (!group->sgp->power_orig) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: domain->cpu_power not "
					"set\n");
			break;
		}

		if (!cpumask_weight(sched_group_cpus(group))) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: empty group\n");
			break;
		}

		/* Groups may only share cpus in SD_OVERLAP domains. */
		if (!(sd->flags & SD_OVERLAP) &&
		    cpumask_intersects(groupmask, sched_group_cpus(group))) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: repeated CPUs\n");
			break;
		}

		cpumask_or(groupmask, groupmask, sched_group_cpus(group));

		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));

		printk(KERN_CONT " %s", str);
		if (group->sgp->power != SCHED_POWER_SCALE) {
			printk(KERN_CONT " (cpu_power = %d)",
				group->sgp->power);
		}

		group = group->next;
	} while (group != sd->groups);
	printk(KERN_CONT "\n");

	if (!cpumask_equal(sched_domain_span(sd), groupmask))
		printk(KERN_ERR "ERROR: groups don't span domain->span\n");

	if (sd->parent &&
	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
		printk(KERN_ERR "ERROR: parent span is not a superset "
			"of domain->span\n");
	return 0;
}

/*
 * Dump the whole domain hierarchy being attached to @cpu, one level at a
 * time.  Silent unless sched_debug_enabled is set.
 */
static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
	int level = 0;

	if (!sched_debug_enabled)
		return;

	if (!sd) {
		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
		return;
	}

	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);

	for (;;) {
		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
			break;
		level++;
		sd = sd->parent;
		if (!sd)
			break;
	}
}
#else /* !CONFIG_SCHED_DEBUG */
# define sched_domain_debug(sd, cpu) do { } while (0)
static inline bool sched_debug(void)
{
	return false;
}
#endif /* CONFIG_SCHED_DEBUG */

/*
 * Returns 1 if @sd contributes nothing to scheduling: it spans one cpu, or
 * none of its flags is usable with its current group layout.
 */
static int sd_degenerate(struct sched_domain *sd)
{
	if (cpumask_weight(sched_domain_span(sd)) == 1)
		return 1;

	/* Following flags need at least 2 groups */
	if (sd->flags & (SD_LOAD_BALANCE |
			 SD_BALANCE_NEWIDLE |
			 SD_BALANCE_FORK |
			 SD_BALANCE_EXEC |
			 SD_SHARE_CPUPOWER |
			 SD_SHARE_PKG_RESOURCES)) {
		if (sd->groups != sd->groups->next)
			return 0;
	}

	/* Following flags don't use groups */
	if (sd->flags & (SD_WAKE_AFFINE))
		return 0;

	return 1;
}

/*
 * Returns 1 if @parent adds nothing on top of @sd: it is degenerate itself,
 * or it has the same span and no effective flag that @sd lacks.
 */
static int
sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
{
	unsigned long cflags = sd->flags, pflags = parent->flags;

	if (sd_degenerate(parent))
		return 1;

	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
		return 0;

	/* Flags needing groups don't count if only 1 group in parent */
	if (parent->groups == parent->groups->next) {
		pflags &= ~(SD_LOAD_BALANCE |
				SD_BALANCE_NEWIDLE |
				SD_BALANCE_FORK |
				SD_BALANCE_EXEC |
				SD_SHARE_CPUPOWER |
				SD_SHARE_PKG_RESOURCES);
		if (nr_node_ids == 1)
			pflags &= ~SD_SERIALIZE;
	}
	/* Any parent flag missing from the child keeps the parent alive. */
	if (~cflags & pflags)
		return 0;

	return 1;
}

/* RCU callback: release everything init_rootdomain() set up, then the rd. */
static void free_rootdomain(struct rcu_head *rcu)
{
	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);

	cpupri_cleanup(&rd->cpupri);
	free_cpumask_var(rd->rto_mask);
	free_cpumask_var(rd->online);
	free_cpumask_var(rd->span);
	kfree(rd);
}

/*
 * Move @rq from its current root domain (if any) to @rd, taking a reference
 * on @rd.  The old root domain is freed through call_rcu_sched() once its
 * last reference is dropped.
 */
static void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
	struct root_domain *old_rd = NULL;
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	if (rq->rd) {
		old_rd = rq->rd;

		if (cpumask_test_cpu(rq->cpu, old_rd->online))
			set_rq_offline(rq);

		cpumask_clear_cpu(rq->cpu, old_rd->span);

		/*
		 * If we do not want to free the old_rd yet then
		 * set old_rd to NULL to skip the freeing later
		 * in this function:
		 */
		if (!atomic_dec_and_test(&old_rd->refcount))
			old_rd = NULL;
	}

	atomic_inc(&rd->refcount);
	rq->rd = rd;

	cpumask_set_cpu(rq->cpu, rd->span);
	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
		set_rq_online(rq);

	raw_spin_unlock_irqrestore(&rq->lock, flags);

	/* The free is queued only after rq->lock has been released. */
	if (old_rd)
		call_rcu_sched(&old_rd->rcu, free_rootdomain);
}

/*
 * Zero @rd and allocate its cpumasks and cpupri state.
 * Returns 0 on success; on any failure everything allocated so far is
 * released and -ENOMEM is returned.
 */
static int init_rootdomain(struct root_domain *rd)
{
	memset(rd, 0, sizeof(*rd));

	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
		goto out;
	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
		goto free_span;
	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
		goto free_online;

	if (cpupri_init(&rd->cpupri) != 0)
		goto free_rto_mask;
	return 0;

free_rto_mask:
	free_cpumask_var(rd->rto_mask);
free_online:
	free_cpumask_var(rd->online);
free_span:
	free_cpumask_var(rd->span);
out:
	return -ENOMEM;
}

/*
 * By default the system creates a single root-domain with all cpus as
 * members (mimicking the global state we have today).
 */
struct root_domain def_root_domain;

/*
 * Set up def_root_domain with an initial reference count of 1.
 * NOTE(review): the return value of init_rootdomain() is ignored here.
 */
static void init_defrootdomain(void)
{
	init_rootdomain(&def_root_domain);

	atomic_set(&def_root_domain.refcount, 1);
}

/* Allocate and initialise a root domain; NULL if either step fails. */
static struct root_domain *alloc_rootdomain(void)
{
	struct root_domain *rd;

	rd = kmalloc(sizeof(*rd), GFP_KERNEL);
	if (!rd)
		return NULL;

	if (init_rootdomain(rd) != 0) {
		kfree(rd);
		return NULL;
	}

	return rd;
}

/*
 * Free every group on the circular list starting at @sg.  With @free_sgp
 * set, the sgp of a group is freed as well once its reference count drops
 * to zero.
 */
static void free_sched_groups(struct sched_group *sg, int free_sgp)
{
	struct sched_group *tmp, *first;

	if (!sg)
		return;

	first = sg;
	do {
		/* Fetch the successor before the group is freed. */
		tmp = sg->next;

		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
			kfree(sg->sgp);

		kfree(sg);
		sg = tmp;
	} while (sg != first);
}

/* RCU callback that frees a sched domain together with its groups. */
static void free_sched_domain(struct rcu_head *rcu)
{
	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);

	/*
	 * If it is an overlapping domain it has private groups, iterate and
	 * nuke them all.
	 */
	if (sd->flags & SD_OVERLAP) {
		free_sched_groups(sd->groups, 1);
	} else if (atomic_dec_and_test(&sd->groups->ref)) {
		kfree(sd->groups->sgp);
		kfree(sd->groups);
	}
	kfree(sd);
}

/* Queue @sd for freeing through call_rcu(); @cpu is not used. */
static void destroy_sched_domain(struct sched_domain *sd, int cpu)
{
	call_rcu(&sd->rcu, free_sched_domain);
}

/* Queue @sd and all of its ancestors for freeing. */
static void destroy_sched_domains(struct sched_domain *sd, int cpu)
{
	for (; sd; sd = sd->parent)
		destroy_sched_domain(sd, cpu);
}

/*
 * Keep a special pointer to the highest sched_domain that has
 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
 * allows us to avoid some pointer chasing select_idle_sibling().
 *
 * Also keep a unique ID per domain (we use the first cpu number in
 * the cpumask of the domain), this allows us to quickly tell if
 * two cpus are in the same cache domain, see cpus_share_cache().
 */
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_id);

/*
 * Refresh sd_llc and sd_llc_id for @cpu.  Without a domain carrying
 * SD_SHARE_PKG_RESOURCES the id falls back to the cpu number itself.
 */
static void update_top_cache_domain(int cpu)
{
	struct sched_domain *sd;
	int id = cpu;

	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
	if (sd)
		id = cpumask_first(sched_domain_span(sd));

	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
	per_cpu(sd_llc_id, cpu) = id;
}

/*
 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
 * hold the hotplug lock.
 */
static void
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct sched_domain *tmp;

	/* Remove the sched domains which do not contribute to scheduling. */
	for (tmp = sd; tmp; ) {
		struct sched_domain *parent = tmp->parent;
		if (!parent)
			break;

		if (sd_parent_degenerate(tmp, parent)) {
			/* Unlink the parent; tmp stays put and is re-examined. */
			tmp->parent = parent->parent;
			if (parent->parent)
				parent->parent->child = tmp;
			destroy_sched_domain(parent, cpu);
		} else
			tmp = tmp->parent;
	}

	/* The base domain itself may be degenerate; drop it as well. */
	if (sd && sd_degenerate(sd)) {
		tmp = sd;
		sd = sd->parent;
		destroy_sched_domain(tmp, cpu);
		if (sd)
			sd->child = NULL;
	}

	sched_domain_debug(sd, cpu);

	rq_attach_root(rq, rd);
	/* Publish the new hierarchy, then queue the old one for freeing. */
	tmp = rq->sd;
	rcu_assign_pointer(rq->sd, sd);
	destroy_sched_domains(tmp, cpu);

	update_top_cache_domain(cpu);
}

/* cpus with isolated domains */
static cpumask_var_t cpu_isolated_map;

/*
 * Setup the mask of cpus configured for isolated domains
 * NOTE(review): the return value of cpulist_parse() is ignored.
 */
static int __init isolated_cpu_setup(char *str)
{
	alloc_bootmem_cpumask_var(&cpu_isolated_map);
	cpulist_parse(str, cpu_isolated_map);
	return 1;
}

__setup("isolcpus=", isolated_cpu_setup);

/* Mask of all cpus on the node that @cpu belongs to. */
static const struct cpumask *cpu_cpu_mask(int cpu)
{
	return cpumask_of_node(cpu_to_node(cpu));
}

/* Per-cpu pointers to the domain, group and group-power objects of a level. */
struct sd_data {
	struct sched_domain **__percpu sd;
	struct sched_group **__percpu sg;
	struct sched_group_power **__percpu sgp;
};

struct s_data {
	struct sched_domain ** __percpu sd;
	struct root_domain *rd;
};

enum s_alloc {
	sa_rootdomain,
	sa_sd,
	sa_sd_storage,
	sa_none,
};

struct sched_domain_topology_level;

typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);

#define SDTL_OVERLAP	0x01

/* Description of one topology level: init and mask callbacks plus data. */
struct sched_domain_topology_level {
	sched_domain_init_f init;
	sched_domain_mask_f mask;
	int flags;
	int numa_level;
	struct sd_data data;
};

/*
 * Build an iteration mask that can exclude certain CPUs from the upwards
 * domain traversal.
 *
 * Asymmetric node setups can result in situations where the domain tree is of
 * unequal depth, make sure to skip domains that already cover the entire
 * range.
 *
 * In that case build_sched_domains() will have terminated the iteration early
 * and our sibling sd spans will be empty.
Domains should always include the * cpu they're built on, so check that. * */ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) { const struct cpumask *span = sched_domain_span(sd); struct sd_data *sdd = sd->private; struct sched_domain *sibling; int i; for_each_cpu(i, span) { sibling = *per_cpu_ptr(sdd->sd, i); if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; cpumask_set_cpu(i, sched_group_mask(sg)); } } /* * Return the canonical balance cpu for this group, this is the first cpu * of this group that's also in the iteration mask. */ int group_balance_cpu(struct sched_group *sg) { return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); } static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered = sched_domains_tmpmask; struct sd_data *sdd = sd->private; struct sched_domain *child; int i; cpumask_clear(covered); for_each_cpu(i, span) { struct cpumask *sg_span; if (cpumask_test_cpu(i, covered)) continue; child = *per_cpu_ptr(sdd->sd, i); /* See the comment near build_group_mask(). */ if (!cpumask_test_cpu(i, sched_domain_span(child))) continue; sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(cpu)); if (!sg) goto fail; sg_span = sched_group_cpus(sg); if (child->child) { child = child->child; cpumask_copy(sg_span, sched_domain_span(child)); } else cpumask_set_cpu(i, sg_span); cpumask_or(covered, covered, sg_span); sg->sgp = *per_cpu_ptr(sdd->sgp, i); if (atomic_inc_return(&sg->sgp->ref) == 1) build_group_mask(sd, sg); /* * Initialize sgp->power such that even if we mess up the * domains and no possible iteration will get us here, we won't * die on a /0 trap. */ sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); /* * Make sure the first group of this domain contains the * canonical balance cpu. 
Otherwise the sched_domain iteration * breaks. See update_sg_lb_stats(). */ if ((!groups && cpumask_test_cpu(cpu, sg_span)) || group_balance_cpu(sg) == cpu) groups = sg; if (!first) first = sg; if (last) last->next = sg; last = sg; last->next = first; } sd->groups = groups; return 0; fail: free_sched_groups(first, 0); return -ENOMEM; } static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) { struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); struct sched_domain *child = sd->child; if (child) cpu = cpumask_first(sched_domain_span(child)); if (sg) { *sg = *per_cpu_ptr(sdd->sg, cpu); (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ } return cpu; } /* * build_sched_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. * * Assumes the sched_domain tree is fully constructed */ static int build_sched_groups(struct sched_domain *sd, int cpu) { struct sched_group *first = NULL, *last = NULL; struct sd_data *sdd = sd->private; const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered; int i; get_group(cpu, sdd, &sd->groups); atomic_inc(&sd->groups->ref); if (cpu != cpumask_first(sched_domain_span(sd))) return 0; lockdep_assert_held(&sched_domains_mutex); covered = sched_domains_tmpmask; cpumask_clear(covered); for_each_cpu(i, span) { struct sched_group *sg; int group = get_group(i, sdd, &sg); int j; if (cpumask_test_cpu(i, covered)) continue; cpumask_clear(sched_group_cpus(sg)); sg->sgp->power = 0; cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { if (get_group(j, sdd, NULL) != group) continue; cpumask_set_cpu(j, covered); cpumask_set_cpu(j, sched_group_cpus(sg)); } if (!first) first = sg; if (last) last->next = sg; last = sg; } last->next = first; return 0; } /* * Initialize sched groups cpu_power. 
* * cpu_power indicates the capacity of sched group, which is used while * distributing the load between different sched groups in a sched domain. * Typically cpu_power for all the groups in a sched domain will be same unless * there are asymmetries in the topology. If there are asymmetries, group * having more cpu_power will pickup more load compared to the group having * less cpu_power. */ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; WARN_ON(!sd || !sg); do { sg->group_weight = cpumask_weight(sched_group_cpus(sg)); sg = sg->next; } while (sg != sd->groups); if (cpu != group_balance_cpu(sg)) return; update_group_power(sd, cpu); atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); } int __weak arch_sd_sibling_asym_packing(void) {