| author | Linus Torvalds <torvalds@linux-foundation.org> | 2008-01-25 16:42:32 -0500 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-01-25 16:42:32 -0500 |
| commit | 0008bf54408d4c0637c24d34642f1038c299be95 (patch) | |
| tree | 6a14cdacaa3057795381672339737a45e45effc7 | |
| parent | 2d94dfc8c38edf63e91e48fd55c3a8822b6a9ced (diff) | |
| parent | 6d082592b62689fb91578d0338d04a9f50991990 (diff) | |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched
* git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched: (96 commits)
sched: keep total / count stats in addition to the max for
sched, futex: detach sched.h and futex.h
sched: fix: don't take a mutex from interrupt context
sched: print backtrace of running tasks too
printk: use ktime_get()
softlockup: fix signedness
sched: latencytop support
sched: fix goto retry in pick_next_task_rt()
timers: don't #error on higher HZ values
sched: monitor clock underflows in /proc/sched_debug
sched: fix rq->clock warps on frequency changes
sched: fix, always create kernel threads with normal priority
debug: clean up kernel/profile.c
sched: remove the !PREEMPT_BKL code
sched: make PREEMPT_BKL the default
debug: track and print last unloaded module in the oops trace
debug: show being-loaded/being-unloaded indicator for modules
sched: rt-watchdog: fix .rlim_max = RLIM_INFINITY
sched: rt-group: reduce rescheduling
hrtimer: unlock hrtimer_wakeup
...
81 files changed, 6262 insertions, 1776 deletions
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt index 6221464d1a7e..39ad8f56783a 100644 --- a/Documentation/RCU/RTFP.txt +++ b/Documentation/RCU/RTFP.txt | |||
| @@ -9,8 +9,8 @@ The first thing resembling RCU was published in 1980, when Kung and Lehman | |||
| 9 | [Kung80] recommended use of a garbage collector to defer destruction | 9 | [Kung80] recommended use of a garbage collector to defer destruction |
| 10 | of nodes in a parallel binary search tree in order to simplify its | 10 | of nodes in a parallel binary search tree in order to simplify its |
| 11 | implementation. This works well in environments that have garbage | 11 | implementation. This works well in environments that have garbage |
| 12 | collectors, but current production garbage collectors incur significant | 12 | collectors, but most production garbage collectors incur significant |
| 13 | read-side overhead. | 13 | overhead. |
| 14 | 14 | ||
| 15 | In 1982, Manber and Ladner [Manber82,Manber84] recommended deferring | 15 | In 1982, Manber and Ladner [Manber82,Manber84] recommended deferring |
| 16 | destruction until all threads running at that time have terminated, again | 16 | destruction until all threads running at that time have terminated, again |
| @@ -99,16 +99,25 @@ locking, reduces contention, reduces memory latency for readers, and | |||
| 99 | parallelizes pipeline stalls and memory latency for writers. However, | 99 | parallelizes pipeline stalls and memory latency for writers. However, |
| 100 | these techniques still impose significant read-side overhead in the | 100 | these techniques still impose significant read-side overhead in the |
| 101 | form of memory barriers. Researchers at Sun worked along similar lines | 101 | form of memory barriers. Researchers at Sun worked along similar lines |
| 102 | in the same timeframe [HerlihyLM02,HerlihyLMS03]. These techniques | 102 | in the same timeframe [HerlihyLM02]. These techniques can be thought |
| 103 | can be thought of as inside-out reference counts, where the count is | 103 | of as inside-out reference counts, where the count is represented by the |
| 104 | represented by the number of hazard pointers referencing a given data | 104 | number of hazard pointers referencing a given data structure (rather than |
| 105 | structure (rather than the more conventional counter field within the | 105 | the more conventional counter field within the data structure itself). |
| 106 | data structure itself). | 106 | |
| 107 | By the same token, RCU can be thought of as a "bulk reference count", | ||
| 108 | where some form of reference counter covers all references by a given CPU | ||
| 109 | or thread during a set timeframe. This timeframe is related to, but | ||
| 110 | not necessarily exactly the same as, an RCU grace period. In classic | ||
| 111 | RCU, the reference counter is the per-CPU bit in the "bitmask" field, | ||
| 112 | and each such bit covers all references that might have been made by | ||
| 113 | the corresponding CPU during the prior grace period. Of course, RCU | ||
| 114 | can be thought of in other terms as well. | ||
| 107 | 115 | ||
| 108 | In 2003, the K42 group described how RCU could be used to create | 116 | In 2003, the K42 group described how RCU could be used to create |
| 109 | hot-pluggable implementations of operating-system functions. Later that | 117 | hot-pluggable implementations of operating-system functions [Appavoo03a]. |
| 110 | year saw a paper describing an RCU implementation of System V IPC | 118 | Later that year saw a paper describing an RCU implementation of System |
| 111 | [Arcangeli03], and an introduction to RCU in Linux Journal [McKenney03a]. | 119 | V IPC [Arcangeli03], and an introduction to RCU in Linux Journal |
| 120 | [McKenney03a]. | ||
| 112 | 121 | ||
| 113 | 2004 has seen a Linux-Journal article on use of RCU in dcache | 122 | 2004 has seen a Linux-Journal article on use of RCU in dcache |
| 114 | [McKenney04a], a performance comparison of locking to RCU on several | 123 | [McKenney04a], a performance comparison of locking to RCU on several |
| @@ -117,10 +126,19 @@ number of operating-system kernels [PaulEdwardMcKenneyPhD], a paper | |||
| 117 | describing how to make RCU safe for soft-realtime applications [Sarma04c], | 126 | describing how to make RCU safe for soft-realtime applications [Sarma04c], |
| 118 | and a paper describing SELinux performance with RCU [JamesMorris04b]. | 127 | and a paper describing SELinux performance with RCU [JamesMorris04b]. |
| 119 | 128 | ||
| 120 | 2005 has seen further adaptation of RCU to realtime use, permitting | 129 | 2005 brought further adaptation of RCU to realtime use, permitting |
| 121 | preemption of RCU realtime critical sections [PaulMcKenney05a, | 130 | preemption of RCU realtime critical sections [PaulMcKenney05a, |
| 122 | PaulMcKenney05b]. | 131 | PaulMcKenney05b]. |
| 123 | 132 | ||
| 133 | 2006 saw the first best-paper award for an RCU paper [ThomasEHart2006a], | ||
| 134 | as well as further work on efficient implementations of preemptible | ||
| 135 | RCU [PaulEMcKenney2006b], but priority-boosting of RCU read-side critical | ||
| 136 | sections proved elusive. An RCU implementation permitting general | ||
| 137 | blocking in read-side critical sections appeared [PaulEMcKenney2006c], | ||
| 138 | and Robert Olsson described an RCU-protected trie-hash combination | ||
| 139 | [RobertOlsson2006a]. | ||
| 140 | |||
| 141 | |||
| 124 | Bibtex Entries | 142 | Bibtex Entries |
| 125 | 143 | ||
| 126 | @article{Kung80 | 144 | @article{Kung80 |
| @@ -203,6 +221,41 @@ Bibtex Entries | |||
| 203 | ,Address="New Orleans, LA" | 221 | ,Address="New Orleans, LA" |
| 204 | } | 222 | } |
| 205 | 223 | ||
| 224 | @conference{Pu95a, | ||
| 225 | Author = "Calton Pu and Tito Autrey and Andrew Black and Charles Consel and | ||
| 226 | Crispin Cowan and Jon Inouye and Lakshmi Kethana and Jonathan Walpole and | ||
| 227 | Ke Zhang", | ||
| 228 | Title = "Optimistic Incremental Specialization: Streamlining a Commercial | ||
| 229 | Operating System", | ||
| 230 | Booktitle = "15\textsuperscript{th} ACM Symposium on | ||
| 231 | Operating Systems Principles (SOSP'95)", | ||
| 232 | address = "Copper Mountain, CO", | ||
| 233 | month="December", | ||
| 234 | year="1995", | ||
| 235 | pages="314-321", | ||
| 236 | annotation=" | ||
| 237 | Uses a replugger, but with a flag to signal when people are | ||
| 238 | using the resource at hand. Only one reader at a time. | ||
| 239 | " | ||
| 240 | } | ||
| 241 | |||
| 242 | @conference{Cowan96a, | ||
| 243 | Author = "Crispin Cowan and Tito Autrey and Charles Krasic and | ||
| 244 | Calton Pu and Jonathan Walpole", | ||
| 245 | Title = "Fast Concurrent Dynamic Linking for an Adaptive Operating System", | ||
| 246 | Booktitle = "International Conference on Configurable Distributed Systems | ||
| 247 | (ICCDS'96)", | ||
| 248 | address = "Annapolis, MD", | ||
| 249 | month="May", | ||
| 250 | year="1996", | ||
| 251 | pages="108", | ||
| 252 | isbn="0-8186-7395-8", | ||
| 253 | annotation=" | ||
| 254 | Uses a replugger, but with a counter to signal when people are | ||
| 255 | using the resource at hand. Allows multiple readers. | ||
| 256 | " | ||
| 257 | } | ||
| 258 | |||
| 206 | @techreport{Slingwine95 | 259 | @techreport{Slingwine95 |
| 207 | ,author="John D. Slingwine and Paul E. McKenney" | 260 | ,author="John D. Slingwine and Paul E. McKenney" |
| 208 | ,title="Apparatus and Method for Achieving Reduced Overhead Mutual | 261 | ,title="Apparatus and Method for Achieving Reduced Overhead Mutual |
| @@ -312,6 +365,49 @@ Andrea Arcangeli and Andi Kleen and Orran Krieger and Rusty Russell" | |||
| 312 | [Viewed June 23, 2004]" | 365 | [Viewed June 23, 2004]" |
| 313 | } | 366 | } |
| 314 | 367 | ||
| 368 | @conference{Michael02a | ||
| 369 | ,author="Maged M. Michael" | ||
| 370 | ,title="Safe Memory Reclamation for Dynamic Lock-Free Objects Using Atomic | ||
| 371 | Reads and Writes" | ||
| 372 | ,Year="2002" | ||
| 373 | ,Month="August" | ||
| 374 | ,booktitle="{Proceedings of the 21\textsuperscript{st} Annual ACM | ||
| 375 | Symposium on Principles of Distributed Computing}" | ||
| 376 | ,pages="21-30" | ||
| 377 | ,annotation=" | ||
| 378 | Each thread keeps an array of pointers to items that it is | ||
| 379 | currently referencing. Sort of an inside-out garbage collection | ||
| 380 | mechanism, but one that requires the accessing code to explicitly | ||
| 381 | state its needs. Also requires read-side memory barriers on | ||
| 382 | most architectures. | ||
| 383 | " | ||
| 384 | } | ||
| 385 | |||
| 386 | @conference{Michael02b | ||
| 387 | ,author="Maged M. Michael" | ||
| 388 | ,title="High Performance Dynamic Lock-Free Hash Tables and List-Based Sets" | ||
| 389 | ,Year="2002" | ||
| 390 | ,Month="August" | ||
| 391 | ,booktitle="{Proceedings of the 14\textsuperscript{th} Annual ACM | ||
| 392 | Symposium on Parallel | ||
| 393 | Algorithms and Architecture}" | ||
| 394 | ,pages="73-82" | ||
| 395 | ,annotation=" | ||
| 396 | Like the title says... | ||
| 397 | " | ||
| 398 | } | ||
| 399 | |||
| 400 | @InProceedings{HerlihyLM02 | ||
| 401 | ,author={Maurice Herlihy and Victor Luchangco and Mark Moir} | ||
| 402 | ,title="The Repeat Offender Problem: A Mechanism for Supporting Dynamic-Sized, | ||
| 403 | Lock-Free Data Structures" | ||
| 404 | ,booktitle={Proceedings of 16\textsuperscript{th} International | ||
| 405 | Symposium on Distributed Computing} | ||
| 406 | ,year=2002 | ||
| 407 | ,month="October" | ||
| 408 | ,pages="339-353" | ||
| 409 | } | ||
| 410 | |||
| 315 | @article{Appavoo03a | 411 | @article{Appavoo03a |
| 316 | ,author="J. Appavoo and K. Hui and C. A. N. Soules and R. W. Wisniewski and | 412 | ,author="J. Appavoo and K. Hui and C. A. N. Soules and R. W. Wisniewski and |
| 317 | D. M. {Da Silva} and O. Krieger and M. A. Auslander and D. J. Edelsohn and | 413 | D. M. {Da Silva} and O. Krieger and M. A. Auslander and D. J. Edelsohn and |
| @@ -447,3 +543,95 @@ Oregon Health and Sciences University" | |||
| 447 | Realtime turns into making RCU yet more realtime friendly. | 543 | Realtime turns into making RCU yet more realtime friendly. |
| 448 | " | 544 | " |
| 449 | } | 545 | } |
| 546 | |||
| 547 | @conference{ThomasEHart2006a | ||
| 548 | ,Author="Thomas E. Hart and Paul E. McKenney and Angela Demke Brown" | ||
| 549 | ,Title="Making Lockless Synchronization Fast: Performance Implications | ||
| 550 | of Memory Reclamation" | ||
| 551 | ,Booktitle="20\textsuperscript{th} {IEEE} International Parallel and | ||
| 552 | Distributed Processing Symposium" | ||
| 553 | ,month="April" | ||
| 554 | ,year="2006" | ||
| 555 | ,day="25-29" | ||
| 556 | ,address="Rhodes, Greece" | ||
| 557 | ,annotation=" | ||
| 558 | Compares QSBR (AKA "classic RCU"), HPBR, EBR, and lock-free | ||
| 559 | reference counting. | ||
| 560 | " | ||
| 561 | } | ||
| 562 | |||
| 563 | @Conference{PaulEMcKenney2006b | ||
| 564 | ,Author="Paul E. McKenney and Dipankar Sarma and Ingo Molnar and | ||
| 565 | Suparna Bhattacharya" | ||
| 566 | ,Title="Extending RCU for Realtime and Embedded Workloads" | ||
| 567 | ,Booktitle="{Ottawa Linux Symposium}" | ||
| 568 | ,Month="July" | ||
| 569 | ,Year="2006" | ||
| 570 | ,pages="v2 123-138" | ||
| 571 | ,note="Available: | ||
| 572 | \url{http://www.linuxsymposium.org/2006/view_abstract.php?content_key=184} | ||
| 573 | \url{http://www.rdrop.com/users/paulmck/RCU/OLSrtRCU.2006.08.11a.pdf} | ||
| 574 | [Viewed January 1, 2007]" | ||
| 575 | ,annotation=" | ||
| 576 | Described how to improve the -rt implementation of realtime RCU. | ||
| 577 | " | ||
| 578 | } | ||
| 579 | |||
| 580 | @unpublished{PaulEMcKenney2006c | ||
| 581 | ,Author="Paul E. McKenney" | ||
| 582 | ,Title="Sleepable {RCU}" | ||
| 583 | ,month="October" | ||
| 584 | ,day="9" | ||
| 585 | ,year="2006" | ||
| 586 | ,note="Available: | ||
| 587 | \url{http://lwn.net/Articles/202847/} | ||
| 588 | Revised: | ||
| 589 | \url{http://www.rdrop.com/users/paulmck/RCU/srcu.2007.01.14a.pdf} | ||
| 590 | [Viewed August 21, 2006]" | ||
| 591 | ,annotation=" | ||
| 592 | LWN article introducing SRCU. | ||
| 593 | " | ||
| 594 | } | ||
| 595 | |||
| 596 | @unpublished{RobertOlsson2006a | ||
| 597 | ,Author="Robert Olsson and Stefan Nilsson" | ||
| 598 | ,Title="{TRASH}: A dynamic {LC}-trie and hash data structure" | ||
| 599 | ,month="August" | ||
| 600 | ,day="18" | ||
| 601 | ,year="2006" | ||
| 602 | ,note="Available: | ||
| 603 | \url{http://www.nada.kth.se/~snilsson/public/papers/trash/trash.pdf} | ||
| 604 | [Viewed February 24, 2007]" | ||
| 605 | ,annotation=" | ||
| 606 | RCU-protected dynamic trie-hash combination. | ||
| 607 | " | ||
| 608 | } | ||
| 609 | |||
| 610 | @unpublished{ThomasEHart2007a | ||
| 611 | ,Author="Thomas E. Hart and Paul E. McKenney and Angela Demke Brown and Jonathan Walpole" | ||
| 612 | ,Title="Performance of memory reclamation for lockless synchronization" | ||
| 613 | ,journal="J. Parallel Distrib. Comput." | ||
| 614 | ,year="2007" | ||
| 615 | ,note="To appear in J. Parallel Distrib. Comput. | ||
| 616 | \url{doi=10.1016/j.jpdc.2007.04.010}" | ||
| 617 | ,annotation={ | ||
| 618 | Compares QSBR (AKA "classic RCU"), HPBR, EBR, and lock-free | ||
| 619 | reference counting. Journal version of ThomasEHart2006a. | ||
| 620 | } | ||
| 621 | } | ||
| 622 | |||
| 623 | @unpublished{PaulEMcKenney2007QRCUspin | ||
| 624 | ,Author="Paul E. McKenney" | ||
| 625 | ,Title="Using Promela and Spin to verify parallel algorithms" | ||
| 626 | ,month="August" | ||
| 627 | ,day="1" | ||
| 628 | ,year="2007" | ||
| 629 | ,note="Available: | ||
| 630 | \url{http://lwn.net/Articles/243851/} | ||
| 631 | [Viewed September 8, 2007]" | ||
| 632 | ,annotation=" | ||
| 633 | LWN article describing Promela and spin, and also using Oleg | ||
| 634 | Nesterov's QRCU as an example (with Paul McKenney's fastpath). | ||
| 635 | " | ||
| 636 | } | ||
| 637 | |||
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt index f84407cba816..95821a29ae41 100644 --- a/Documentation/RCU/rcu.txt +++ b/Documentation/RCU/rcu.txt | |||
| @@ -36,6 +36,14 @@ o How can the updater tell when a grace period has completed | |||
| 36 | executed in user mode, or executed in the idle loop, we can | 36 | executed in user mode, or executed in the idle loop, we can |
| 37 | safely free up that item. | 37 | safely free up that item. |
| 38 | 38 | ||
| 39 | Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the | ||
| 40 | same effect, but require that the readers manipulate CPU-local | ||
| 41 | counters. These counters allow limited types of blocking | ||
| 42 | within RCU read-side critical sections. SRCU also uses | ||
| 43 | CPU-local counters, and permits general blocking within | ||
| 44 | RCU read-side critical sections. These two variants of | ||
| 45 | RCU detect grace periods by sampling these counters. | ||
| 46 | |||
| 39 | o If I am running on a uniprocessor kernel, which can only do one | 47 | o If I am running on a uniprocessor kernel, which can only do one |
| 40 | thing at a time, why should I wait for a grace period? | 48 | thing at a time, why should I wait for a grace period? |
| 41 | 49 | ||
| @@ -46,7 +54,10 @@ o How can I see where RCU is currently used in the Linux kernel? | |||
| 46 | Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu", | 54 | Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu", |
| 47 | "rcu_read_lock_bh", "rcu_read_unlock_bh", "call_rcu_bh", | 55 | "rcu_read_lock_bh", "rcu_read_unlock_bh", "call_rcu_bh", |
| 48 | "srcu_read_lock", "srcu_read_unlock", "synchronize_rcu", | 56 | "srcu_read_lock", "srcu_read_unlock", "synchronize_rcu", |
| 49 | "synchronize_net", and "synchronize_srcu". | 57 | "synchronize_net", "synchronize_srcu", and the other RCU |
| 58 | primitives. Or grab one of the cscope databases from: | ||
| 59 | |||
| 60 | http://www.rdrop.com/users/paulmck/RCU/linuxusage/rculocktab.html | ||
| 50 | 61 | ||
| 51 | o What guidelines should I follow when writing code that uses RCU? | 62 | o What guidelines should I follow when writing code that uses RCU? |
| 52 | 63 | ||
| @@ -67,7 +78,11 @@ o I hear that RCU is patented? What is with that? | |||
| 67 | 78 | ||
| 68 | o I hear that RCU needs work in order to support realtime kernels? | 79 | o I hear that RCU needs work in order to support realtime kernels? |
| 69 | 80 | ||
| 70 | Yes, work in progress. | 81 | This work is largely completed. Realtime-friendly RCU can be |
| 82 | enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter. | ||
| 83 | However, work is in progress for enabling priority boosting of | ||
| 84 | preempted RCU read-side critical sections. This is needed if you | ||
| 85 | have CPU-bound realtime threads. | ||
| 71 | 86 | ||
| 72 | o Where can I find more information on RCU? | 87 | o Where can I find more information on RCU? |
| 73 | 88 | ||
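
The paragraph added to rcu.txt above notes that SRCU, like preemptible RCU, uses CPU-local counters but additionally permits general blocking inside read-side critical sections. A hedged sketch of that usage follows; my_srcu, the init hook and the msleep() stand-in for "general blocking" are illustrative names, not taken from this patch.

```c
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/srcu.h>

static struct srcu_struct my_srcu;

static int __init my_srcu_setup(void)
{
	return init_srcu_struct(&my_srcu);	/* allocates the per-CPU counters */
}

static void my_srcu_reader(void)
{
	int idx;

	idx = srcu_read_lock(&my_srcu);
	msleep(10);			/* sleeping is legal inside an SRCU reader */
	srcu_read_unlock(&my_srcu, idx);
}

static void my_srcu_free_old(void *old)
{
	synchronize_srcu(&my_srcu);	/* waits only for readers of my_srcu */
	kfree(old);
}
```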
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index 25a3c3f7d378..2967a65269d8 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt | |||
| @@ -46,12 +46,13 @@ stat_interval The number of seconds between output of torture | |||
| 46 | 46 | ||
| 47 | shuffle_interval | 47 | shuffle_interval |
| 48 | The number of seconds to keep the test threads affinitied | 48 | The number of seconds to keep the test threads affinitied |
| 49 | to a particular subset of the CPUs. Used in conjunction | 49 | to a particular subset of the CPUs; defaults to 5 seconds. |
| 50 | with test_no_idle_hz. | 50 | Used in conjunction with test_no_idle_hz. |
| 51 | 51 | ||
| 52 | test_no_idle_hz Whether or not to test the ability of RCU to operate in | 52 | test_no_idle_hz Whether or not to test the ability of RCU to operate in |
| 53 | a kernel that disables the scheduling-clock interrupt to | 53 | a kernel that disables the scheduling-clock interrupt to |
| 54 | idle CPUs. Boolean parameter, "1" to test, "0" otherwise. | 54 | idle CPUs. Boolean parameter, "1" to test, "0" otherwise. |
| 55 | Defaults to omitting this test. | ||
| 55 | 56 | ||
| 56 | torture_type The type of RCU to test: "rcu" for the rcu_read_lock() API, | 57 | torture_type The type of RCU to test: "rcu" for the rcu_read_lock() API, |
| 57 | "rcu_sync" for rcu_read_lock() with synchronous reclamation, | 58 | "rcu_sync" for rcu_read_lock() with synchronous reclamation, |
| @@ -82,8 +83,6 @@ be evident. ;-) | |||
| 82 | 83 | ||
| 83 | The entries are as follows: | 84 | The entries are as follows: |
| 84 | 85 | ||
| 85 | o "ggp": The number of counter flips (or batches) since boot. | ||
| 86 | |||
| 87 | o "rtc": The hexadecimal address of the structure currently visible | 86 | o "rtc": The hexadecimal address of the structure currently visible |
| 88 | to readers. | 87 | to readers. |
| 89 | 88 | ||
| @@ -117,8 +116,8 @@ o "Reader Pipe": Histogram of "ages" of structures seen by readers. | |||
| 117 | o "Reader Batch": Another histogram of "ages" of structures seen | 116 | o "Reader Batch": Another histogram of "ages" of structures seen |
| 118 | by readers, but in terms of counter flips (or batches) rather | 117 | by readers, but in terms of counter flips (or batches) rather |
| 119 | than in terms of grace periods. The legal number of non-zero | 118 | than in terms of grace periods. The legal number of non-zero |
| 120 | entries is again two. The reason for this separate view is | 119 | entries is again two. The reason for this separate view is that |
| 121 | that it is easier to get the third entry to show up in the | 120 | it is sometimes easier to get the third entry to show up in the |
| 122 | "Reader Batch" list than in the "Reader Pipe" list. | 121 | "Reader Batch" list than in the "Reader Pipe" list. |
| 123 | 122 | ||
| 124 | o "Free-Block Circulation": Shows the number of torture structures | 123 | o "Free-Block Circulation": Shows the number of torture structures |
diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt index a741f658a3c9..fb94f5a71b68 100644 --- a/Documentation/cpu-hotplug.txt +++ b/Documentation/cpu-hotplug.txt | |||
| @@ -109,12 +109,13 @@ Never use anything other than cpumask_t to represent bitmap of CPUs. | |||
| 109 | for_each_cpu_mask(x,mask) - Iterate over some random collection of cpu mask. | 109 | for_each_cpu_mask(x,mask) - Iterate over some random collection of cpu mask. |
| 110 | 110 | ||
| 111 | #include <linux/cpu.h> | 111 | #include <linux/cpu.h> |
| 112 | lock_cpu_hotplug() and unlock_cpu_hotplug(): | 112 | get_online_cpus() and put_online_cpus(): |
| 113 | 113 | ||
| 114 | The above calls are used to inhibit cpu hotplug operations. While holding the | 114 | The above calls are used to inhibit cpu hotplug operations. While the |
| 115 | cpucontrol mutex, cpu_online_map will not change. If you merely need to avoid | 115 | cpu_hotplug.refcount is non-zero, the cpu_online_map will not change. |
| 116 | cpus going away, you could also use preempt_disable() and preempt_enable() | 116 | If you merely need to avoid cpus going away, you could also use |
| 117 | for those sections. Just remember the critical section cannot call any | 117 | preempt_disable() and preempt_enable() for those sections. |
| 118 | Just remember the critical section cannot call any | ||
| 118 | function that can sleep or schedule this process away. The preempt_disable() | 119 | function that can sleep or schedule this process away. The preempt_disable() |
| 119 | will work as long as stop_machine_run() is used to take a cpu down. | 120 | will work as long as stop_machine_run() is used to take a cpu down. |
| 120 | 121 | ||
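
This documentation hunk tracks the API change that recurs throughout the rest of the merge: lock_cpu_hotplug()/unlock_cpu_hotplug() are replaced by the refcounted get_online_cpus()/put_online_cpus(). Below is a minimal sketch of the calling pattern, mirroring the converted call sites later in the diff; walk_online_cpus() is an invented name.

```c
#include <linux/cpu.h>
#include <linux/kernel.h>

static void walk_online_cpus(void)
{
	int cpu;

	get_online_cpus();	/* cpu_online_map cannot change until we drop this */
	for_each_online_cpu(cpu)
		printk(KERN_INFO "cpu %d is online\n", cpu);
	put_online_cpus();	/* re-enable CPU hotplug operations */
}
```

As the updated text says, preempt_disable()/preempt_enable() is still enough when all you need is to keep CPUs from going away during a short, non-sleeping section.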
diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index f6f3689a86ee..e59b5b84168d 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c | |||
| @@ -79,17 +79,6 @@ static unsigned long dummy_gettimeoffset(void) | |||
| 79 | } | 79 | } |
| 80 | #endif | 80 | #endif |
| 81 | 81 | ||
| 82 | /* | ||
| 83 | * An implementation of printk_clock() independent from | ||
| 84 | * sched_clock(). This avoids non-bootable kernels when | ||
| 85 | * printk_clock is enabled. | ||
| 86 | */ | ||
| 87 | unsigned long long printk_clock(void) | ||
| 88 | { | ||
| 89 | return (unsigned long long)(jiffies - INITIAL_JIFFIES) * | ||
| 90 | (1000000000 / HZ); | ||
| 91 | } | ||
| 92 | |||
| 93 | static unsigned long next_rtc_update; | 82 | static unsigned long next_rtc_update; |
| 94 | 83 | ||
| 95 | /* | 84 | /* |
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 4ac2b1f1bd3b..86028c69861e 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c | |||
| @@ -71,8 +71,6 @@ unsigned long __per_cpu_offset[NR_CPUS]; | |||
| 71 | EXPORT_SYMBOL(__per_cpu_offset); | 71 | EXPORT_SYMBOL(__per_cpu_offset); |
| 72 | #endif | 72 | #endif |
| 73 | 73 | ||
| 74 | extern void ia64_setup_printk_clock(void); | ||
| 75 | |||
| 76 | DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info); | 74 | DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info); |
| 77 | DEFINE_PER_CPU(unsigned long, local_per_cpu_offset); | 75 | DEFINE_PER_CPU(unsigned long, local_per_cpu_offset); |
| 78 | unsigned long ia64_cycles_per_usec; | 76 | unsigned long ia64_cycles_per_usec; |
| @@ -507,8 +505,6 @@ setup_arch (char **cmdline_p) | |||
| 507 | /* process SAL system table: */ | 505 | /* process SAL system table: */ |
| 508 | ia64_sal_init(__va(efi.sal_systab)); | 506 | ia64_sal_init(__va(efi.sal_systab)); |
| 509 | 507 | ||
| 510 | ia64_setup_printk_clock(); | ||
| 511 | |||
| 512 | #ifdef CONFIG_SMP | 508 | #ifdef CONFIG_SMP |
| 513 | cpu_physical_id(0) = hard_smp_processor_id(); | 509 | cpu_physical_id(0) = hard_smp_processor_id(); |
| 514 | #endif | 510 | #endif |
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 2bb84214e5f1..3ab042720970 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c | |||
| @@ -344,33 +344,6 @@ udelay (unsigned long usecs) | |||
| 344 | } | 344 | } |
| 345 | EXPORT_SYMBOL(udelay); | 345 | EXPORT_SYMBOL(udelay); |
| 346 | 346 | ||
| 347 | static unsigned long long ia64_itc_printk_clock(void) | ||
| 348 | { | ||
| 349 | if (ia64_get_kr(IA64_KR_PER_CPU_DATA)) | ||
| 350 | return sched_clock(); | ||
| 351 | return 0; | ||
| 352 | } | ||
| 353 | |||
| 354 | static unsigned long long ia64_default_printk_clock(void) | ||
| 355 | { | ||
| 356 | return (unsigned long long)(jiffies_64 - INITIAL_JIFFIES) * | ||
| 357 | (1000000000/HZ); | ||
| 358 | } | ||
| 359 | |||
| 360 | unsigned long long (*ia64_printk_clock)(void) = &ia64_default_printk_clock; | ||
| 361 | |||
| 362 | unsigned long long printk_clock(void) | ||
| 363 | { | ||
| 364 | return ia64_printk_clock(); | ||
| 365 | } | ||
| 366 | |||
| 367 | void __init | ||
| 368 | ia64_setup_printk_clock(void) | ||
| 369 | { | ||
| 370 | if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) | ||
| 371 | ia64_printk_clock = ia64_itc_printk_clock; | ||
| 372 | } | ||
| 373 | |||
| 374 | /* IA64 doesn't cache the timezone */ | 347 | /* IA64 doesn't cache the timezone */ |
| 375 | void update_vsyscall_tz(void) | 348 | void update_vsyscall_tz(void) |
| 376 | { | 349 | { |
diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c index 1f38a3a68390..bb1d24929640 100644 --- a/arch/ia64/sn/kernel/setup.c +++ b/arch/ia64/sn/kernel/setup.c | |||
| @@ -64,7 +64,6 @@ extern void sn_timer_init(void); | |||
| 64 | extern unsigned long last_time_offset; | 64 | extern unsigned long last_time_offset; |
| 65 | extern void (*ia64_mark_idle) (int); | 65 | extern void (*ia64_mark_idle) (int); |
| 66 | extern void snidle(int); | 66 | extern void snidle(int); |
| 67 | extern unsigned long long (*ia64_printk_clock)(void); | ||
| 68 | 67 | ||
| 69 | unsigned long sn_rtc_cycles_per_second; | 68 | unsigned long sn_rtc_cycles_per_second; |
| 70 | EXPORT_SYMBOL(sn_rtc_cycles_per_second); | 69 | EXPORT_SYMBOL(sn_rtc_cycles_per_second); |
| @@ -360,14 +359,6 @@ sn_scan_pcdp(void) | |||
| 360 | 359 | ||
| 361 | static unsigned long sn2_rtc_initial; | 360 | static unsigned long sn2_rtc_initial; |
| 362 | 361 | ||
| 363 | static unsigned long long ia64_sn2_printk_clock(void) | ||
| 364 | { | ||
| 365 | unsigned long rtc_now = rtc_time(); | ||
| 366 | |||
| 367 | return (rtc_now - sn2_rtc_initial) * | ||
| 368 | (1000000000 / sn_rtc_cycles_per_second); | ||
| 369 | } | ||
| 370 | |||
| 371 | /** | 362 | /** |
| 372 | * sn_setup - SN platform setup routine | 363 | * sn_setup - SN platform setup routine |
| 373 | * @cmdline_p: kernel command line | 364 | * @cmdline_p: kernel command line |
| @@ -468,8 +459,6 @@ void __init sn_setup(char **cmdline_p) | |||
| 468 | 459 | ||
| 469 | platform_intr_list[ACPI_INTERRUPT_CPEI] = IA64_CPE_VECTOR; | 460 | platform_intr_list[ACPI_INTERRUPT_CPEI] = IA64_CPE_VECTOR; |
| 470 | 461 | ||
| 471 | ia64_printk_clock = ia64_sn2_printk_clock; | ||
| 472 | |||
| 473 | printk("SGI SAL version %x.%02x\n", version >> 8, version & 0x00FF); | 462 | printk("SGI SAL version %x.%02x\n", version >> 8, version & 0x00FF); |
| 474 | 463 | ||
| 475 | /* | 464 | /* |
diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index 892665bb12b1..bb4f00c0cbe9 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c | |||
| @@ -58,13 +58,13 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len, | |||
| 58 | if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) | 58 | if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) |
| 59 | return -EFAULT; | 59 | return -EFAULT; |
| 60 | 60 | ||
| 61 | lock_cpu_hotplug(); | 61 | get_online_cpus(); |
| 62 | read_lock(&tasklist_lock); | 62 | read_lock(&tasklist_lock); |
| 63 | 63 | ||
| 64 | p = find_process_by_pid(pid); | 64 | p = find_process_by_pid(pid); |
| 65 | if (!p) { | 65 | if (!p) { |
| 66 | read_unlock(&tasklist_lock); | 66 | read_unlock(&tasklist_lock); |
| 67 | unlock_cpu_hotplug(); | 67 | put_online_cpus(); |
| 68 | return -ESRCH; | 68 | return -ESRCH; |
| 69 | } | 69 | } |
| 70 | 70 | ||
| @@ -106,7 +106,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len, | |||
| 106 | 106 | ||
| 107 | out_unlock: | 107 | out_unlock: |
| 108 | put_task_struct(p); | 108 | put_task_struct(p); |
| 109 | unlock_cpu_hotplug(); | 109 | put_online_cpus(); |
| 110 | return retval; | 110 | return retval; |
| 111 | } | 111 | } |
| 112 | 112 | ||
| @@ -125,7 +125,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, | |||
| 125 | if (len < real_len) | 125 | if (len < real_len) |
| 126 | return -EINVAL; | 126 | return -EINVAL; |
| 127 | 127 | ||
| 128 | lock_cpu_hotplug(); | 128 | get_online_cpus(); |
| 129 | read_lock(&tasklist_lock); | 129 | read_lock(&tasklist_lock); |
| 130 | 130 | ||
| 131 | retval = -ESRCH; | 131 | retval = -ESRCH; |
| @@ -140,7 +140,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, | |||
| 140 | 140 | ||
| 141 | out_unlock: | 141 | out_unlock: |
| 142 | read_unlock(&tasklist_lock); | 142 | read_unlock(&tasklist_lock); |
| 143 | unlock_cpu_hotplug(); | 143 | put_online_cpus(); |
| 144 | if (retval) | 144 | if (retval) |
| 145 | return retval; | 145 | return retval; |
| 146 | if (copy_to_user(user_mask_ptr, &mask, real_len)) | 146 | if (copy_to_user(user_mask_ptr, &mask, real_len)) |
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 412e6b42986f..c4ad54e0f288 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c | |||
| @@ -153,7 +153,7 @@ static int pseries_add_processor(struct device_node *np) | |||
| 153 | for (i = 0; i < nthreads; i++) | 153 | for (i = 0; i < nthreads; i++) |
| 154 | cpu_set(i, tmp); | 154 | cpu_set(i, tmp); |
| 155 | 155 | ||
| 156 | lock_cpu_hotplug(); | 156 | cpu_maps_update_begin(); |
| 157 | 157 | ||
| 158 | BUG_ON(!cpus_subset(cpu_present_map, cpu_possible_map)); | 158 | BUG_ON(!cpus_subset(cpu_present_map, cpu_possible_map)); |
| 159 | 159 | ||
| @@ -190,7 +190,7 @@ static int pseries_add_processor(struct device_node *np) | |||
| 190 | } | 190 | } |
| 191 | err = 0; | 191 | err = 0; |
| 192 | out_unlock: | 192 | out_unlock: |
| 193 | unlock_cpu_hotplug(); | 193 | cpu_maps_update_done(); |
| 194 | return err; | 194 | return err; |
| 195 | } | 195 | } |
| 196 | 196 | ||
| @@ -211,7 +211,7 @@ static void pseries_remove_processor(struct device_node *np) | |||
| 211 | 211 | ||
| 212 | nthreads = len / sizeof(u32); | 212 | nthreads = len / sizeof(u32); |
| 213 | 213 | ||
| 214 | lock_cpu_hotplug(); | 214 | cpu_maps_update_begin(); |
| 215 | for (i = 0; i < nthreads; i++) { | 215 | for (i = 0; i < nthreads; i++) { |
| 216 | for_each_present_cpu(cpu) { | 216 | for_each_present_cpu(cpu) { |
| 217 | if (get_hard_smp_processor_id(cpu) != intserv[i]) | 217 | if (get_hard_smp_processor_id(cpu) != intserv[i]) |
| @@ -225,7 +225,7 @@ static void pseries_remove_processor(struct device_node *np) | |||
| 225 | printk(KERN_WARNING "Could not find cpu to remove " | 225 | printk(KERN_WARNING "Could not find cpu to remove " |
| 226 | "with physical id 0x%x\n", intserv[i]); | 226 | "with physical id 0x%x\n", intserv[i]); |
| 227 | } | 227 | } |
| 228 | unlock_cpu_hotplug(); | 228 | cpu_maps_update_done(); |
| 229 | } | 229 | } |
| 230 | 230 | ||
| 231 | static int pseries_smp_notifier(struct notifier_block *nb, | 231 | static int pseries_smp_notifier(struct notifier_block *nb, |
diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c index 73401c820110..e3078ce41518 100644 --- a/arch/powerpc/platforms/pseries/rtasd.c +++ b/arch/powerpc/platforms/pseries/rtasd.c | |||
| @@ -382,7 +382,7 @@ static void do_event_scan_all_cpus(long delay) | |||
| 382 | { | 382 | { |
| 383 | int cpu; | 383 | int cpu; |
| 384 | 384 | ||
| 385 | lock_cpu_hotplug(); | 385 | get_online_cpus(); |
| 386 | cpu = first_cpu(cpu_online_map); | 386 | cpu = first_cpu(cpu_online_map); |
| 387 | for (;;) { | 387 | for (;;) { |
| 388 | set_cpus_allowed(current, cpumask_of_cpu(cpu)); | 388 | set_cpus_allowed(current, cpumask_of_cpu(cpu)); |
| @@ -390,15 +390,15 @@ static void do_event_scan_all_cpus(long delay) | |||
| 390 | set_cpus_allowed(current, CPU_MASK_ALL); | 390 | set_cpus_allowed(current, CPU_MASK_ALL); |
| 391 | 391 | ||
| 392 | /* Drop hotplug lock, and sleep for the specified delay */ | 392 | /* Drop hotplug lock, and sleep for the specified delay */ |
| 393 | unlock_cpu_hotplug(); | 393 | put_online_cpus(); |
| 394 | msleep_interruptible(delay); | 394 | msleep_interruptible(delay); |
| 395 | lock_cpu_hotplug(); | 395 | get_online_cpus(); |
| 396 | 396 | ||
| 397 | cpu = next_cpu(cpu, cpu_online_map); | 397 | cpu = next_cpu(cpu, cpu_online_map); |
| 398 | if (cpu == NR_CPUS) | 398 | if (cpu == NR_CPUS) |
| 399 | break; | 399 | break; |
| 400 | } | 400 | } |
| 401 | unlock_cpu_hotplug(); | 401 | put_online_cpus(); |
| 402 | } | 402 | } |
| 403 | 403 | ||
| 404 | static int rtasd(void *unused) | 404 | static int rtasd(void *unused) |
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 3b20613325dc..beb45c9c0835 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c | |||
| @@ -349,7 +349,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, | |||
| 349 | replace = -1; | 349 | replace = -1; |
| 350 | 350 | ||
| 351 | /* No CPU hotplug when we change MTRR entries */ | 351 | /* No CPU hotplug when we change MTRR entries */ |
| 352 | lock_cpu_hotplug(); | 352 | get_online_cpus(); |
| 353 | /* Search for existing MTRR */ | 353 | /* Search for existing MTRR */ |
| 354 | mutex_lock(&mtrr_mutex); | 354 | mutex_lock(&mtrr_mutex); |
| 355 | for (i = 0; i < num_var_ranges; ++i) { | 355 | for (i = 0; i < num_var_ranges; ++i) { |
| @@ -405,7 +405,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, | |||
| 405 | error = i; | 405 | error = i; |
| 406 | out: | 406 | out: |
| 407 | mutex_unlock(&mtrr_mutex); | 407 | mutex_unlock(&mtrr_mutex); |
| 408 | unlock_cpu_hotplug(); | 408 | put_online_cpus(); |
| 409 | return error; | 409 | return error; |
| 410 | } | 410 | } |
| 411 | 411 | ||
| @@ -495,7 +495,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) | |||
| 495 | 495 | ||
| 496 | max = num_var_ranges; | 496 | max = num_var_ranges; |
| 497 | /* No CPU hotplug when we change MTRR entries */ | 497 | /* No CPU hotplug when we change MTRR entries */ |
| 498 | lock_cpu_hotplug(); | 498 | get_online_cpus(); |
| 499 | mutex_lock(&mtrr_mutex); | 499 | mutex_lock(&mtrr_mutex); |
| 500 | if (reg < 0) { | 500 | if (reg < 0) { |
| 501 | /* Search for existing MTRR */ | 501 | /* Search for existing MTRR */ |
| @@ -536,7 +536,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) | |||
| 536 | error = reg; | 536 | error = reg; |
| 537 | out: | 537 | out: |
| 538 | mutex_unlock(&mtrr_mutex); | 538 | mutex_unlock(&mtrr_mutex); |
| 539 | unlock_cpu_hotplug(); | 539 | put_online_cpus(); |
| 540 | return error; | 540 | return error; |
| 541 | } | 541 | } |
| 542 | /** | 542 | /** |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3a058bb16409..e70f3881d7e4 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
| @@ -283,7 +283,7 @@ sysret_careful: | |||
| 283 | sysret_signal: | 283 | sysret_signal: |
| 284 | TRACE_IRQS_ON | 284 | TRACE_IRQS_ON |
| 285 | sti | 285 | sti |
| 286 | testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx | 286 | testl $_TIF_DO_NOTIFY_MASK,%edx |
| 287 | jz 1f | 287 | jz 1f |
| 288 | 288 | ||
| 289 | /* Really a signal */ | 289 | /* Really a signal */ |
| @@ -377,7 +377,7 @@ int_very_careful: | |||
| 377 | jmp int_restore_rest | 377 | jmp int_restore_rest |
| 378 | 378 | ||
| 379 | int_signal: | 379 | int_signal: |
| 380 | testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx | 380 | testl $_TIF_DO_NOTIFY_MASK,%edx |
| 381 | jz 1f | 381 | jz 1f |
| 382 | movq %rsp,%rdi # &ptregs -> arg1 | 382 | movq %rsp,%rdi # &ptregs -> arg1 |
| 383 | xorl %esi,%esi # oldset -> arg2 | 383 | xorl %esi,%esi # oldset -> arg2 |
| @@ -603,7 +603,7 @@ retint_careful: | |||
| 603 | jmp retint_check | 603 | jmp retint_check |
| 604 | 604 | ||
| 605 | retint_signal: | 605 | retint_signal: |
| 606 | testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx | 606 | testl $_TIF_DO_NOTIFY_MASK,%edx |
| 607 | jz retint_swapgs | 607 | jz retint_swapgs |
| 608 | TRACE_IRQS_ON | 608 | TRACE_IRQS_ON |
| 609 | sti | 609 | sti |
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 09c315214a5e..40cfd5488719 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c | |||
| @@ -436,7 +436,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_ | |||
| 436 | return -EINVAL; | 436 | return -EINVAL; |
| 437 | } | 437 | } |
| 438 | 438 | ||
| 439 | lock_cpu_hotplug(); | 439 | get_online_cpus(); |
| 440 | mutex_lock(&microcode_mutex); | 440 | mutex_lock(&microcode_mutex); |
| 441 | 441 | ||
| 442 | user_buffer = (void __user *) buf; | 442 | user_buffer = (void __user *) buf; |
| @@ -447,7 +447,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_ | |||
| 447 | ret = (ssize_t)len; | 447 | ret = (ssize_t)len; |
| 448 | 448 | ||
| 449 | mutex_unlock(&microcode_mutex); | 449 | mutex_unlock(&microcode_mutex); |
| 450 | unlock_cpu_hotplug(); | 450 | put_online_cpus(); |
| 451 | 451 | ||
| 452 | return ret; | 452 | return ret; |
| 453 | } | 453 | } |
| @@ -658,14 +658,14 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) | |||
| 658 | 658 | ||
| 659 | old = current->cpus_allowed; | 659 | old = current->cpus_allowed; |
| 660 | 660 | ||
| 661 | lock_cpu_hotplug(); | 661 | get_online_cpus(); |
| 662 | set_cpus_allowed(current, cpumask_of_cpu(cpu)); | 662 | set_cpus_allowed(current, cpumask_of_cpu(cpu)); |
| 663 | 663 | ||
| 664 | mutex_lock(&microcode_mutex); | 664 | mutex_lock(&microcode_mutex); |
| 665 | if (uci->valid) | 665 | if (uci->valid) |
| 666 | err = cpu_request_microcode(cpu); | 666 | err = cpu_request_microcode(cpu); |
| 667 | mutex_unlock(&microcode_mutex); | 667 | mutex_unlock(&microcode_mutex); |
| 668 | unlock_cpu_hotplug(); | 668 | put_online_cpus(); |
| 669 | set_cpus_allowed(current, old); | 669 | set_cpus_allowed(current, old); |
| 670 | } | 670 | } |
| 671 | if (err) | 671 | if (err) |
| @@ -817,9 +817,9 @@ static int __init microcode_init (void) | |||
| 817 | return PTR_ERR(microcode_pdev); | 817 | return PTR_ERR(microcode_pdev); |
| 818 | } | 818 | } |
| 819 | 819 | ||
| 820 | lock_cpu_hotplug(); | 820 | get_online_cpus(); |
| 821 | error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); | 821 | error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); |
| 822 | unlock_cpu_hotplug(); | 822 | put_online_cpus(); |
| 823 | if (error) { | 823 | if (error) { |
| 824 | microcode_dev_exit(); | 824 | microcode_dev_exit(); |
| 825 | platform_device_unregister(microcode_pdev); | 825 | platform_device_unregister(microcode_pdev); |
| @@ -839,9 +839,9 @@ static void __exit microcode_exit (void) | |||
| 839 | 839 | ||
| 840 | unregister_hotcpu_notifier(&mc_cpu_notifier); | 840 | unregister_hotcpu_notifier(&mc_cpu_notifier); |
| 841 | 841 | ||
| 842 | lock_cpu_hotplug(); | 842 | get_online_cpus(); |
| 843 | sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); | 843 | sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); |
| 844 | unlock_cpu_hotplug(); | 844 | put_online_cpus(); |
| 845 | 845 | ||
| 846 | platform_device_unregister(microcode_pdev); | 846 | platform_device_unregister(microcode_pdev); |
| 847 | } | 847 | } |
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 9bdd83022f5f..20f29e4c1d33 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c | |||
| @@ -658,6 +658,9 @@ void do_notify_resume(struct pt_regs *regs, void *_unused, | |||
| 658 | /* deal with pending signal delivery */ | 658 | /* deal with pending signal delivery */ |
| 659 | if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) | 659 | if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) |
| 660 | do_signal(regs); | 660 | do_signal(regs); |
| 661 | |||
| 662 | if (thread_info_flags & _TIF_HRTICK_RESCHED) | ||
| 663 | hrtick_resched(); | ||
| 661 | 664 | ||
| 662 | clear_thread_flag(TIF_IRET); | 665 | clear_thread_flag(TIF_IRET); |
| 663 | } | 666 | } |
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ab086b0357fc..38d806467c0f 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c | |||
| @@ -480,6 +480,9 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) | |||
| 480 | /* deal with pending signal delivery */ | 480 | /* deal with pending signal delivery */ |
| 481 | if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) | 481 | if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) |
| 482 | do_signal(regs); | 482 | do_signal(regs); |
| 483 | |||
| 484 | if (thread_info_flags & _TIF_HRTICK_RESCHED) | ||
| 485 | hrtick_resched(); | ||
| 483 | } | 486 | } |
| 484 | 487 | ||
| 485 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where) | 488 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where) |
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 6fa6cf036c70..55771fd7e545 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c | |||
| @@ -33,6 +33,19 @@ static void save_stack_address(void *data, unsigned long addr) | |||
| 33 | trace->entries[trace->nr_entries++] = addr; | 33 | trace->entries[trace->nr_entries++] = addr; |
| 34 | } | 34 | } |
| 35 | 35 | ||
| 36 | static void save_stack_address_nosched(void *data, unsigned long addr) | ||
| 37 | { | ||
| 38 | struct stack_trace *trace = (struct stack_trace *)data; | ||
| 39 | if (in_sched_functions(addr)) | ||
| 40 | return; | ||
| 41 | if (trace->skip > 0) { | ||
| 42 | trace->skip--; | ||
| 43 | return; | ||
| 44 | } | ||
| 45 | if (trace->nr_entries < trace->max_entries) | ||
| 46 | trace->entries[trace->nr_entries++] = addr; | ||
| 47 | } | ||
| 48 | |||
| 36 | static const struct stacktrace_ops save_stack_ops = { | 49 | static const struct stacktrace_ops save_stack_ops = { |
| 37 | .warning = save_stack_warning, | 50 | .warning = save_stack_warning, |
| 38 | .warning_symbol = save_stack_warning_symbol, | 51 | .warning_symbol = save_stack_warning_symbol, |
| @@ -40,6 +53,13 @@ static const struct stacktrace_ops save_stack_ops = { | |||
| 40 | .address = save_stack_address, | 53 | .address = save_stack_address, |
| 41 | }; | 54 | }; |
| 42 | 55 | ||
| 56 | static const struct stacktrace_ops save_stack_ops_nosched = { | ||
| 57 | .warning = save_stack_warning, | ||
| 58 | .warning_symbol = save_stack_warning_symbol, | ||
| 59 | .stack = save_stack_stack, | ||
| 60 | .address = save_stack_address_nosched, | ||
| 61 | }; | ||
| 62 | |||
| 43 | /* | 63 | /* |
| 44 | * Save stack-backtrace addresses into a stack_trace buffer. | 64 | * Save stack-backtrace addresses into a stack_trace buffer. |
| 45 | */ | 65 | */ |
| @@ -50,3 +70,10 @@ void save_stack_trace(struct stack_trace *trace) | |||
| 50 | trace->entries[trace->nr_entries++] = ULONG_MAX; | 70 | trace->entries[trace->nr_entries++] = ULONG_MAX; |
| 51 | } | 71 | } |
| 52 | EXPORT_SYMBOL(save_stack_trace); | 72 | EXPORT_SYMBOL(save_stack_trace); |
| 73 | |||
| 74 | void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) | ||
| 75 | { | ||
| 76 | dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); | ||
| 77 | if (trace->nr_entries < trace->max_entries) | ||
| 78 | trace->entries[trace->nr_entries++] = ULONG_MAX; | ||
| 79 | } | ||
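
The hunk above adds save_stack_trace_tsk(), a variant of save_stack_trace() that skips scheduler-internal frames; the latencytop and "print backtrace of running tasks" changes elsewhere in this merge are its users. A sketch of how a caller might drive it (the buffer depth and helper name are arbitrary choices, not from the patch):

```c
#include <linux/sched.h>
#include <linux/stacktrace.h>

#define SNAPSHOT_DEPTH 16

static void snapshot_task_stack(struct task_struct *tsk)
{
	unsigned long entries[SNAPSHOT_DEPTH];
	struct stack_trace trace = {
		.nr_entries	= 0,
		.max_entries	= SNAPSHOT_DEPTH,
		.entries	= entries,
		.skip		= 0,
	};

	save_stack_trace_tsk(tsk, &trace);	/* entries[] ends with ULONG_MAX if there is room */
	print_stack_trace(&trace, 0);		/* dump the captured return addresses */
}
```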
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 482aec2a9631..96d0fd07c57d 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c | |||
| @@ -459,7 +459,7 @@ void __init lguest_arch_host_init(void) | |||
| 459 | 459 | ||
| 460 | /* We don't need the complexity of CPUs coming and going while we're | 460 | /* We don't need the complexity of CPUs coming and going while we're |
| 461 | * doing this. */ | 461 | * doing this. */ |
| 462 | lock_cpu_hotplug(); | 462 | get_online_cpus(); |
| 463 | if (cpu_has_pge) { /* We have a broader idea of "global". */ | 463 | if (cpu_has_pge) { /* We have a broader idea of "global". */ |
| 464 | /* Remember that this was originally set (for cleanup). */ | 464 | /* Remember that this was originally set (for cleanup). */ |
| 465 | cpu_had_pge = 1; | 465 | cpu_had_pge = 1; |
| @@ -469,20 +469,20 @@ void __init lguest_arch_host_init(void) | |||
| 469 | /* Turn off the feature in the global feature set. */ | 469 | /* Turn off the feature in the global feature set. */ |
| 470 | clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); | 470 | clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); |
| 471 | } | 471 | } |
| 472 | unlock_cpu_hotplug(); | 472 | put_online_cpus(); |
| 473 | }; | 473 | }; |
| 474 | /*:*/ | 474 | /*:*/ |
| 475 | 475 | ||
| 476 | void __exit lguest_arch_host_fini(void) | 476 | void __exit lguest_arch_host_fini(void) |
| 477 | { | 477 | { |
| 478 | /* If we had PGE before we started, turn it back on now. */ | 478 | /* If we had PGE before we started, turn it back on now. */ |
| 479 | lock_cpu_hotplug(); | 479 | get_online_cpus(); |
| 480 | if (cpu_had_pge) { | 480 | if (cpu_had_pge) { |
| 481 | set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); | 481 | set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); |
| 482 | /* adjust_pge's argument "1" means set PGE. */ | 482 | /* adjust_pge's argument "1" means set PGE. */ |
| 483 | on_each_cpu(adjust_pge, (void *)1, 0, 1); | 483 | on_each_cpu(adjust_pge, (void *)1, 0, 1); |
| 484 | } | 484 | } |
| 485 | unlock_cpu_hotplug(); | 485 | put_online_cpus(); |
| 486 | } | 486 | } |
| 487 | 487 | ||
| 488 | 488 | ||
diff --git a/drivers/s390/char/sclp_config.c b/drivers/s390/char/sclp_config.c index 5322e5e54a98..9dc77f14fa52 100644 --- a/drivers/s390/char/sclp_config.c +++ b/drivers/s390/char/sclp_config.c | |||
| @@ -29,12 +29,12 @@ static void sclp_cpu_capability_notify(struct work_struct *work) | |||
| 29 | struct sys_device *sysdev; | 29 | struct sys_device *sysdev; |
| 30 | 30 | ||
| 31 | printk(KERN_WARNING TAG "cpu capability changed.\n"); | 31 | printk(KERN_WARNING TAG "cpu capability changed.\n"); |
| 32 | lock_cpu_hotplug(); | 32 | get_online_cpus(); |
| 33 | for_each_online_cpu(cpu) { | 33 | for_each_online_cpu(cpu) { |
| 34 | sysdev = get_cpu_sysdev(cpu); | 34 | sysdev = get_cpu_sysdev(cpu); |
| 35 | kobject_uevent(&sysdev->kobj, KOBJ_CHANGE); | 35 | kobject_uevent(&sysdev->kobj, KOBJ_CHANGE); |
| 36 | } | 36 | } |
| 37 | unlock_cpu_hotplug(); | 37 | put_online_cpus(); |
| 38 | } | 38 | } |
| 39 | 39 | ||
| 40 | static void sclp_conf_receiver_fn(struct evbuf_header *evbuf) | 40 | static void sclp_conf_receiver_fn(struct evbuf_header *evbuf) |
diff --git a/fs/Kconfig b/fs/Kconfig index 781b47d2f9f2..b4799efaf9e8 100644 --- a/fs/Kconfig +++ b/fs/Kconfig | |||
| @@ -2130,4 +2130,3 @@ source "fs/nls/Kconfig" | |||
| 2130 | source "fs/dlm/Kconfig" | 2130 | source "fs/dlm/Kconfig" |
| 2131 | 2131 | ||
| 2132 | endmenu | 2132 | endmenu |
| 2133 | |||
diff --git a/fs/proc/base.c b/fs/proc/base.c index 7411bfb0b7cc..91fa8e6ce8ad 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
| @@ -310,6 +310,77 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer) | |||
| 310 | } | 310 | } |
| 311 | #endif | 311 | #endif |
| 312 | 312 | ||
| 313 | #ifdef CONFIG_LATENCYTOP | ||
| 314 | static int lstats_show_proc(struct seq_file *m, void *v) | ||
| 315 | { | ||
| 316 | int i; | ||
| 317 | struct task_struct *task = m->private; | ||
| 318 | seq_puts(m, "Latency Top version : v0.1\n"); | ||
| 319 | |||
| 320 | for (i = 0; i < 32; i++) { | ||
| 321 | if (task->latency_record[i].backtrace[0]) { | ||
| 322 | int q; | ||
| 323 | seq_printf(m, "%i %li %li ", | ||
| 324 | task->latency_record[i].count, | ||
| 325 | task->latency_record[i].time, | ||
| 326 | task->latency_record[i].max); | ||
| 327 | for (q = 0; q < LT_BACKTRACEDEPTH; q++) { | ||
| 328 | char sym[KSYM_NAME_LEN]; | ||
| 329 | char *c; | ||
| 330 | if (!task->latency_record[i].backtrace[q]) | ||
| 331 | break; | ||
| 332 | if (task->latency_record[i].backtrace[q] == ULONG_MAX) | ||
| 333 | break; | ||
| 334 | sprint_symbol(sym, task->latency_record[i].backtrace[q]); | ||
| 335 | c = strchr(sym, '+'); | ||
| 336 | if (c) | ||
| 337 | *c = 0; | ||
| 338 | seq_printf(m, "%s ", sym); | ||
| 339 | } | ||
| 340 | seq_printf(m, "\n"); | ||
| 341 | } | ||
| 342 | |||
| 343 | } | ||
| 344 | return 0; | ||
| 345 | } | ||
| 346 | |||
| 347 | static int lstats_open(struct inode *inode, struct file *file) | ||
| 348 | { | ||
| 349 | int ret; | ||
| 350 | struct seq_file *m; | ||
| 351 | struct task_struct *task = get_proc_task(inode); | ||
| 352 | |||
| 353 | ret = single_open(file, lstats_show_proc, NULL); | ||
| 354 | if (!ret) { | ||
| 355 | m = file->private_data; | ||
| 356 | m->private = task; | ||
| 357 | } | ||
| 358 | return ret; | ||
| 359 | } | ||
| 360 | |||
| 361 | static ssize_t lstats_write(struct file *file, const char __user *buf, | ||
| 362 | size_t count, loff_t *offs) | ||
| 363 | { | ||
| 364 | struct seq_file *m; | ||
| 365 | struct task_struct *task; | ||
| 366 | |||
| 367 | m = file->private_data; | ||
| 368 | task = m->private; | ||
| 369 | clear_all_latency_tracing(task); | ||
| 370 | |||
| 371 | return count; | ||
| 372 | } | ||
| 373 | |||
| 374 | static const struct file_operations proc_lstats_operations = { | ||
| 375 | .open = lstats_open, | ||
| 376 | .read = seq_read, | ||
| 377 | .write = lstats_write, | ||
| 378 | .llseek = seq_lseek, | ||
| 379 | .release = single_release, | ||
| 380 | }; | ||
| 381 | |||
| 382 | #endif | ||
| 383 | |||
| 313 | /* The badness from the OOM killer */ | 384 | /* The badness from the OOM killer */ |
| 314 | unsigned long badness(struct task_struct *p, unsigned long uptime); | 385 | unsigned long badness(struct task_struct *p, unsigned long uptime); |
| 315 | static int proc_oom_score(struct task_struct *task, char *buffer) | 386 | static int proc_oom_score(struct task_struct *task, char *buffer) |
| @@ -1020,6 +1091,7 @@ static const struct file_operations proc_fault_inject_operations = { | |||
| 1020 | }; | 1091 | }; |
| 1021 | #endif | 1092 | #endif |
| 1022 | 1093 | ||
| 1094 | |||
| 1023 | #ifdef CONFIG_SCHED_DEBUG | 1095 | #ifdef CONFIG_SCHED_DEBUG |
| 1024 | /* | 1096 | /* |
| 1025 | * Print out various scheduling related per-task fields: | 1097 | * Print out various scheduling related per-task fields: |
| @@ -2230,6 +2302,9 @@ static const struct pid_entry tgid_base_stuff[] = { | |||
| 2230 | #ifdef CONFIG_SCHEDSTATS | 2302 | #ifdef CONFIG_SCHEDSTATS |
| 2231 | INF("schedstat", S_IRUGO, pid_schedstat), | 2303 | INF("schedstat", S_IRUGO, pid_schedstat), |
| 2232 | #endif | 2304 | #endif |
| 2305 | #ifdef CONFIG_LATENCYTOP | ||
| 2306 | REG("latency", S_IRUGO, lstats), | ||
| 2307 | #endif | ||
| 2233 | #ifdef CONFIG_PROC_PID_CPUSET | 2308 | #ifdef CONFIG_PROC_PID_CPUSET |
| 2234 | REG("cpuset", S_IRUGO, cpuset), | 2309 | REG("cpuset", S_IRUGO, cpuset), |
| 2235 | #endif | 2310 | #endif |
| @@ -2555,6 +2630,9 @@ static const struct pid_entry tid_base_stuff[] = { | |||
| 2555 | #ifdef CONFIG_SCHEDSTATS | 2630 | #ifdef CONFIG_SCHEDSTATS |
| 2556 | INF("schedstat", S_IRUGO, pid_schedstat), | 2631 | INF("schedstat", S_IRUGO, pid_schedstat), |
| 2557 | #endif | 2632 | #endif |
| 2633 | #ifdef CONFIG_LATENCYTOP | ||
| 2634 | REG("latency", S_IRUGO, lstats), | ||
| 2635 | #endif | ||
| 2558 | #ifdef CONFIG_PROC_PID_CPUSET | 2636 | #ifdef CONFIG_PROC_PID_CPUSET |
| 2559 | REG("cpuset", S_IRUGO, cpuset), | 2637 | REG("cpuset", S_IRUGO, cpuset), |
| 2560 | #endif | 2638 | #endif |
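
When CONFIG_LATENCYTOP is set, the hunks above expose a per-task "latency" file under /proc: reading it prints the records formatted by lstats_show_proc(), and a write resets them through clear_all_latency_tracing(). A small userspace sketch of the read side; the buffer size and error handling are illustrative, and since the file is created S_IRUGO the reset-by-write path is left out here.

```c
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/self/latency", "r");

	if (!f) {
		perror("fopen /proc/self/latency");
		return 1;
	}
	/* First line is "Latency Top version : v0.1", then one record per line:
	 * count, accumulated time, maximum time, followed by the symbolized
	 * backtrace. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
```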
diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h index a4a22cc35898..587566f95f6c 100644 --- a/include/asm-generic/resource.h +++ b/include/asm-generic/resource.h | |||
| @@ -44,8 +44,8 @@ | |||
| 44 | #define RLIMIT_NICE 13 /* max nice prio allowed to raise to | 44 | #define RLIMIT_NICE 13 /* max nice prio allowed to raise to |
| 45 | 0-39 for nice level 19 .. -20 */ | 45 | 0-39 for nice level 19 .. -20 */ |
| 46 | #define RLIMIT_RTPRIO 14 /* maximum realtime priority */ | 46 | #define RLIMIT_RTPRIO 14 /* maximum realtime priority */ |
| 47 | 47 | #define RLIMIT_RTTIME 15 /* timeout for RT tasks in us */ | |
| 48 | #define RLIM_NLIMITS 15 | 48 | #define RLIM_NLIMITS 16 |
| 49 | 49 | ||
| 50 | /* | 50 | /* |
| 51 | * SuS says limits have to be unsigned. | 51 | * SuS says limits have to be unsigned. |
| @@ -86,6 +86,7 @@ | |||
| 86 | [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ | 86 | [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ |
| 87 | [RLIMIT_NICE] = { 0, 0 }, \ | 87 | [RLIMIT_NICE] = { 0, 0 }, \ |
| 88 | [RLIMIT_RTPRIO] = { 0, 0 }, \ | 88 | [RLIMIT_RTPRIO] = { 0, 0 }, \ |
| 89 | [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ | ||
| 89 | } | 90 | } |
| 90 | 91 | ||
| 91 | #endif /* __KERNEL__ */ | 92 | #endif /* __KERNEL__ */ |
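
The hunk above introduces RLIMIT_RTTIME (slot 15), a per-task budget, in microseconds, for how long a realtime task may run without sleeping; the rt-watchdog commits in this merge enforce it by sending SIGXCPU once the soft limit is exceeded and killing the task at the hard limit. A hedged userspace sketch, assuming a libc that already exposes the RLIMIT_RTTIME constant (otherwise the value 15 from the hunk applies); the 5 ms/10 ms budgets are arbitrary.

```c
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl = {
		.rlim_cur = 5000,	/* soft limit: 5000 us of unyielding RT CPU time */
		.rlim_max = 10000,	/* hard limit: 10000 us */
	};

	if (setrlimit(RLIMIT_RTTIME, &rl) != 0) {
		perror("setrlimit(RLIMIT_RTTIME)");
		return 1;
	}
	/* ... switch to SCHED_FIFO/SCHED_RR here and do the realtime work;
	 * running past rlim_cur without blocking raises SIGXCPU, running
	 * past rlim_max gets the task killed. */
	return 0;
}
```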
diff --git a/include/asm-x86/thread_info_32.h b/include/asm-x86/thread_info_32.h index 22a8cbcd35e2..ef58fd2a6eb0 100644 --- a/include/asm-x86/thread_info_32.h +++ b/include/asm-x86/thread_info_32.h | |||
| @@ -132,6 +132,7 @@ static inline struct thread_info *current_thread_info(void) | |||
| 132 | #define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */ | 132 | #define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */ |
| 133 | #define TIF_SECCOMP 7 /* secure computing */ | 133 | #define TIF_SECCOMP 7 /* secure computing */ |
| 134 | #define TIF_RESTORE_SIGMASK 8 /* restore signal mask in do_signal() */ | 134 | #define TIF_RESTORE_SIGMASK 8 /* restore signal mask in do_signal() */ |
| 135 | #define TIF_HRTICK_RESCHED 9 /* reprogram hrtick timer */ | ||
| 135 | #define TIF_MEMDIE 16 | 136 | #define TIF_MEMDIE 16 |
| 136 | #define TIF_DEBUG 17 /* uses debug registers */ | 137 | #define TIF_DEBUG 17 /* uses debug registers */ |
| 137 | #define TIF_IO_BITMAP 18 /* uses I/O bitmap */ | 138 | #define TIF_IO_BITMAP 18 /* uses I/O bitmap */ |
| @@ -147,6 +148,7 @@ static inline struct thread_info *current_thread_info(void) | |||
| 147 | #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) | 148 | #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) |
| 148 | #define _TIF_SECCOMP (1<<TIF_SECCOMP) | 149 | #define _TIF_SECCOMP (1<<TIF_SECCOMP) |
| 149 | #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) | 150 | #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) |
| 151 | #define _TIF_HRTICK_RESCHED (1<<TIF_HRTICK_RESCHED) | ||
| 150 | #define _TIF_DEBUG (1<<TIF_DEBUG) | 152 | #define _TIF_DEBUG (1<<TIF_DEBUG) |
| 151 | #define _TIF_IO_BITMAP (1<<TIF_IO_BITMAP) | 153 | #define _TIF_IO_BITMAP (1<<TIF_IO_BITMAP) |
| 152 | #define _TIF_FREEZE (1<<TIF_FREEZE) | 154 | #define _TIF_FREEZE (1<<TIF_FREEZE) |
diff --git a/include/asm-x86/thread_info_64.h b/include/asm-x86/thread_info_64.h index beae2bfb62ca..7f6ee68f0002 100644 --- a/include/asm-x86/thread_info_64.h +++ b/include/asm-x86/thread_info_64.h | |||
| @@ -115,6 +115,7 @@ static inline struct thread_info *stack_thread_info(void) | |||
| 115 | #define TIF_SECCOMP 8 /* secure computing */ | 115 | #define TIF_SECCOMP 8 /* secure computing */ |
| 116 | #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ | 116 | #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ |
| 117 | #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ | 117 | #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ |
| 118 | #define TIF_HRTICK_RESCHED 11 /* reprogram hrtick timer */ | ||
| 118 | /* 16 free */ | 119 | /* 16 free */ |
| 119 | #define TIF_IA32 17 /* 32bit process */ | 120 | #define TIF_IA32 17 /* 32bit process */ |
| 120 | #define TIF_FORK 18 /* ret_from_fork */ | 121 | #define TIF_FORK 18 /* ret_from_fork */ |
| @@ -133,6 +134,7 @@ static inline struct thread_info *stack_thread_info(void) | |||
| 133 | #define _TIF_SECCOMP (1<<TIF_SECCOMP) | 134 | #define _TIF_SECCOMP (1<<TIF_SECCOMP) |
| 134 | #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) | 135 | #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) |
| 135 | #define _TIF_MCE_NOTIFY (1<<TIF_MCE_NOTIFY) | 136 | #define _TIF_MCE_NOTIFY (1<<TIF_MCE_NOTIFY) |
| 137 | #define _TIF_HRTICK_RESCHED (1<<TIF_HRTICK_RESCHED) | ||
| 136 | #define _TIF_IA32 (1<<TIF_IA32) | 138 | #define _TIF_IA32 (1<<TIF_IA32) |
| 137 | #define _TIF_FORK (1<<TIF_FORK) | 139 | #define _TIF_FORK (1<<TIF_FORK) |
| 138 | #define _TIF_ABI_PENDING (1<<TIF_ABI_PENDING) | 140 | #define _TIF_ABI_PENDING (1<<TIF_ABI_PENDING) |
| @@ -146,6 +148,9 @@ static inline struct thread_info *stack_thread_info(void) | |||
| 146 | /* work to do on any return to user space */ | 148 | /* work to do on any return to user space */ |
| 147 | #define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) | 149 | #define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) |
| 148 | 150 | ||
| 151 | #define _TIF_DO_NOTIFY_MASK \ | ||
| 152 | (_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED) | ||
| 153 | |||
| 149 | /* flags to check in __switch_to() */ | 154 | /* flags to check in __switch_to() */ |
| 150 | #define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP) | 155 | #define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP) |
| 151 | 156 | ||
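TIF_HRTICK_RESCHED and _TIF_DO_NOTIFY_MASK wire the new scheduler hrtick into the return-to-user notification path. A sketch of how such a flag is typically raised and consumed, using the generic thread-flag helpers assumed from <linux/thread_info.h> (they are not part of this hunk):

```c
/* Raise the flag when the hrtick timer needs reprogramming for the
 * current task. */
static inline void hrtick_request_resched(void)
{
        set_thread_flag(TIF_HRTICK_RESCHED);
}

/* Consume it on the return-to-user notification path, where
 * _TIF_DO_NOTIFY_MASK routes execution; hrtick_resched() is the hook
 * declared in the <linux/sched.h> hunk further down. */
static inline void hrtick_notify_resume(void)
{
        if (test_and_clear_thread_flag(TIF_HRTICK_RESCHED))
                hrtick_resched();
}
```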
diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 92f2029a34f3..0be8d65bc3c8 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h | |||
| @@ -71,18 +71,27 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb) | |||
| 71 | 71 | ||
| 72 | int cpu_up(unsigned int cpu); | 72 | int cpu_up(unsigned int cpu); |
| 73 | 73 | ||
| 74 | extern void cpu_hotplug_init(void); | ||
| 75 | |||
| 74 | #else | 76 | #else |
| 75 | 77 | ||
| 76 | static inline int register_cpu_notifier(struct notifier_block *nb) | 78 | static inline int register_cpu_notifier(struct notifier_block *nb) |
| 77 | { | 79 | { |
| 78 | return 0; | 80 | return 0; |
| 79 | } | 81 | } |
| 82 | |||
| 80 | static inline void unregister_cpu_notifier(struct notifier_block *nb) | 83 | static inline void unregister_cpu_notifier(struct notifier_block *nb) |
| 81 | { | 84 | { |
| 82 | } | 85 | } |
| 83 | 86 | ||
| 87 | static inline void cpu_hotplug_init(void) | ||
| 88 | { | ||
| 89 | } | ||
| 90 | |||
| 84 | #endif /* CONFIG_SMP */ | 91 | #endif /* CONFIG_SMP */ |
| 85 | extern struct sysdev_class cpu_sysdev_class; | 92 | extern struct sysdev_class cpu_sysdev_class; |
| 93 | extern void cpu_maps_update_begin(void); | ||
| 94 | extern void cpu_maps_update_done(void); | ||
| 86 | 95 | ||
| 87 | #ifdef CONFIG_HOTPLUG_CPU | 96 | #ifdef CONFIG_HOTPLUG_CPU |
| 88 | /* Stop CPUs going up and down. */ | 97 | /* Stop CPUs going up and down. */ |
| @@ -97,8 +106,8 @@ static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) | |||
| 97 | mutex_unlock(cpu_hp_mutex); | 106 | mutex_unlock(cpu_hp_mutex); |
| 98 | } | 107 | } |
| 99 | 108 | ||
| 100 | extern void lock_cpu_hotplug(void); | 109 | extern void get_online_cpus(void); |
| 101 | extern void unlock_cpu_hotplug(void); | 110 | extern void put_online_cpus(void); |
| 102 | #define hotcpu_notifier(fn, pri) { \ | 111 | #define hotcpu_notifier(fn, pri) { \ |
| 103 | static struct notifier_block fn##_nb = \ | 112 | static struct notifier_block fn##_nb = \ |
| 104 | { .notifier_call = fn, .priority = pri }; \ | 113 | { .notifier_call = fn, .priority = pri }; \ |
| @@ -115,8 +124,8 @@ static inline void cpuhotplug_mutex_lock(struct mutex *cpu_hp_mutex) | |||
| 115 | static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) | 124 | static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) |
| 116 | { } | 125 | { } |
| 117 | 126 | ||
| 118 | #define lock_cpu_hotplug() do { } while (0) | 127 | #define get_online_cpus() do { } while (0) |
| 119 | #define unlock_cpu_hotplug() do { } while (0) | 128 | #define put_online_cpus() do { } while (0) |
| 120 | #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) | 129 | #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) |
| 121 | /* These aren't inline functions due to a GCC bug. */ | 130 | /* These aren't inline functions due to a GCC bug. */ |
| 122 | #define register_hotcpu_notifier(nb) ({ (void)(nb); 0; }) | 131 | #define register_hotcpu_notifier(nb) ({ (void)(nb); 0; }) |
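The lock_cpu_hotplug()/unlock_cpu_hotplug() pair becomes get_online_cpus()/put_online_cpus(), a refcount-style get/put that keeps the set of online CPUs stable for the caller. A short sketch of the intended usage:

```c
static void count_online_cpus_example(void)
{
        int cpu, n = 0;

        get_online_cpus();              /* pin the online map: no CPU may come or go */
        for_each_online_cpu(cpu)
                n++;
        put_online_cpus();

        printk(KERN_INFO "observed %d online CPUs\n", n);
}
```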
diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 1678a5de7013..f4a5871767f5 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h | |||
| @@ -47,6 +47,7 @@ struct task_struct; | |||
| 47 | 47 | ||
| 48 | #ifdef CONFIG_LOCKDEP | 48 | #ifdef CONFIG_LOCKDEP |
| 49 | extern void debug_show_all_locks(void); | 49 | extern void debug_show_all_locks(void); |
| 50 | extern void __debug_show_held_locks(struct task_struct *task); | ||
| 50 | extern void debug_show_held_locks(struct task_struct *task); | 51 | extern void debug_show_held_locks(struct task_struct *task); |
| 51 | extern void debug_check_no_locks_freed(const void *from, unsigned long len); | 52 | extern void debug_check_no_locks_freed(const void *from, unsigned long len); |
| 52 | extern void debug_check_no_locks_held(struct task_struct *task); | 53 | extern void debug_check_no_locks_held(struct task_struct *task); |
| @@ -55,6 +56,10 @@ static inline void debug_show_all_locks(void) | |||
| 55 | { | 56 | { |
| 56 | } | 57 | } |
| 57 | 58 | ||
| 59 | static inline void __debug_show_held_locks(struct task_struct *task) | ||
| 60 | { | ||
| 61 | } | ||
| 62 | |||
| 58 | static inline void debug_show_held_locks(struct task_struct *task) | 63 | static inline void debug_show_held_locks(struct task_struct *task) |
| 59 | { | 64 | { |
| 60 | } | 65 | } |
diff --git a/include/linux/futex.h b/include/linux/futex.h index 92d420fe03f8..1a15f8e237a7 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h | |||
| @@ -1,8 +1,12 @@ | |||
| 1 | #ifndef _LINUX_FUTEX_H | 1 | #ifndef _LINUX_FUTEX_H |
| 2 | #define _LINUX_FUTEX_H | 2 | #define _LINUX_FUTEX_H |
| 3 | 3 | ||
| 4 | #include <linux/sched.h> | 4 | #include <linux/compiler.h> |
| 5 | #include <linux/types.h> | ||
| 5 | 6 | ||
| 7 | struct inode; | ||
| 8 | struct mm_struct; | ||
| 9 | struct task_struct; | ||
| 6 | union ktime; | 10 | union ktime; |
| 7 | 11 | ||
| 8 | /* Second argument to futex syscall */ | 12 | /* Second argument to futex syscall */ |
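This is the "detach sched.h and futex.h" change from the merge log: futex.h stops pulling in all of <linux/sched.h> and instead forward-declares the few structs it only handles by pointer. A tiny illustration of the pattern (the struct name here is hypothetical):

```c
/* No #include needed: the compiler only has to know that the type
 * exists, because we never dereference it or ask for its size. */
struct task_struct;

struct waiter_example {                 /* hypothetical */
        struct task_struct *owner;      /* pointer member: a forward declaration suffices */
};
```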
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 8d302298a161..2961ec788046 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h | |||
| @@ -72,11 +72,7 @@ | |||
| 72 | #define in_softirq() (softirq_count()) | 72 | #define in_softirq() (softirq_count()) |
| 73 | #define in_interrupt() (irq_count()) | 73 | #define in_interrupt() (irq_count()) |
| 74 | 74 | ||
| 75 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) | 75 | #define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) |
| 76 | # define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != kernel_locked()) | ||
| 77 | #else | ||
| 78 | # define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) | ||
| 79 | #endif | ||
| 80 | 76 | ||
| 81 | #ifdef CONFIG_PREEMPT | 77 | #ifdef CONFIG_PREEMPT |
| 82 | # define PREEMPT_CHECK_OFFSET 1 | 78 | # define PREEMPT_CHECK_OFFSET 1 |
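With the !PREEMPT_BKL special case removed, in_atomic() is the same preempt_count() test on every configuration. A sketch of the usual guard it enables; the helper name and error code are illustrative:

```c
static int try_blocking_work_example(void)
{
        if (in_atomic() || irqs_disabled())
                return -EAGAIN;         /* cannot sleep here, ask the caller to retry */

        /* ... work that may sleep (allocation, mutex, I/O) ... */
        return 0;
}
```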
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 7a9398e19704..49067f14fac1 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h | |||
| @@ -115,10 +115,8 @@ struct hrtimer { | |||
| 115 | enum hrtimer_restart (*function)(struct hrtimer *); | 115 | enum hrtimer_restart (*function)(struct hrtimer *); |
| 116 | struct hrtimer_clock_base *base; | 116 | struct hrtimer_clock_base *base; |
| 117 | unsigned long state; | 117 | unsigned long state; |
| 118 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
| 119 | enum hrtimer_cb_mode cb_mode; | 118 | enum hrtimer_cb_mode cb_mode; |
| 120 | struct list_head cb_entry; | 119 | struct list_head cb_entry; |
| 121 | #endif | ||
| 122 | #ifdef CONFIG_TIMER_STATS | 120 | #ifdef CONFIG_TIMER_STATS |
| 123 | void *start_site; | 121 | void *start_site; |
| 124 | char start_comm[16]; | 122 | char start_comm[16]; |
| @@ -194,10 +192,10 @@ struct hrtimer_cpu_base { | |||
| 194 | spinlock_t lock; | 192 | spinlock_t lock; |
| 195 | struct lock_class_key lock_key; | 193 | struct lock_class_key lock_key; |
| 196 | struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; | 194 | struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; |
| 195 | struct list_head cb_pending; | ||
| 197 | #ifdef CONFIG_HIGH_RES_TIMERS | 196 | #ifdef CONFIG_HIGH_RES_TIMERS |
| 198 | ktime_t expires_next; | 197 | ktime_t expires_next; |
| 199 | int hres_active; | 198 | int hres_active; |
| 200 | struct list_head cb_pending; | ||
| 201 | unsigned long nr_events; | 199 | unsigned long nr_events; |
| 202 | #endif | 200 | #endif |
| 203 | }; | 201 | }; |
| @@ -217,6 +215,11 @@ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) | |||
| 217 | return timer->base->get_time(); | 215 | return timer->base->get_time(); |
| 218 | } | 216 | } |
| 219 | 217 | ||
| 218 | static inline int hrtimer_is_hres_active(struct hrtimer *timer) | ||
| 219 | { | ||
| 220 | return timer->base->cpu_base->hres_active; | ||
| 221 | } | ||
| 222 | |||
| 220 | /* | 223 | /* |
| 221 | * The resolution of the clocks. The resolution value is returned in | 224 | * The resolution of the clocks. The resolution value is returned in |
| 222 | * the clock_getres() system call to give application programmers an | 225 | * the clock_getres() system call to give application programmers an |
| @@ -248,6 +251,10 @@ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) | |||
| 248 | return timer->base->softirq_time; | 251 | return timer->base->softirq_time; |
| 249 | } | 252 | } |
| 250 | 253 | ||
| 254 | static inline int hrtimer_is_hres_active(struct hrtimer *timer) | ||
| 255 | { | ||
| 256 | return 0; | ||
| 257 | } | ||
| 251 | #endif | 258 | #endif |
| 252 | 259 | ||
| 253 | extern ktime_t ktime_get(void); | 260 | extern ktime_t ktime_get(void); |
| @@ -310,6 +317,7 @@ extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, | |||
| 310 | 317 | ||
| 311 | /* Soft interrupt function to run the hrtimer queues: */ | 318 | /* Soft interrupt function to run the hrtimer queues: */ |
| 312 | extern void hrtimer_run_queues(void); | 319 | extern void hrtimer_run_queues(void); |
| 320 | extern void hrtimer_run_pending(void); | ||
| 313 | 321 | ||
| 314 | /* Bootup initialization: */ | 322 | /* Bootup initialization: */ |
| 315 | extern void __init hrtimers_init(void); | 323 | extern void __init hrtimers_init(void); |
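cb_pending and the callback-mode fields now exist unconditionally, and hrtimer_is_hres_active() lets common code ask whether high-resolution mode is live without #ifdefs (the non-HIGH_RES stub simply returns 0). A hedged sketch of a caller branching on it; the one-tick padding is purely illustrative:

```c
/* Decide how precise an expiry is worth being: with high-res timers
 * active the hardware can hit it exactly, otherwise anything finer
 * than a tick is wasted effort. */
static ktime_t pick_expiry_example(struct hrtimer *timer, ktime_t want)
{
        if (hrtimer_is_hres_active(timer))
                return want;                            /* keep full precision */

        return ktime_add_ns(want, NSEC_PER_SEC / HZ);   /* pad to tick resolution */
}
```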
diff --git a/include/linux/init_task.h b/include/linux/init_task.h index cae35b6b9aec..796019b22b6f 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h | |||
| @@ -132,9 +132,12 @@ extern struct group_info init_groups; | |||
| 132 | .cpus_allowed = CPU_MASK_ALL, \ | 132 | .cpus_allowed = CPU_MASK_ALL, \ |
| 133 | .mm = NULL, \ | 133 | .mm = NULL, \ |
| 134 | .active_mm = &init_mm, \ | 134 | .active_mm = &init_mm, \ |
| 135 | .run_list = LIST_HEAD_INIT(tsk.run_list), \ | 135 | .rt = { \ |
| 136 | .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ | ||
| 137 | .time_slice = HZ, \ | ||
| 138 | .nr_cpus_allowed = NR_CPUS, \ | ||
| 139 | }, \ | ||
| 136 | .ioprio = 0, \ | 140 | .ioprio = 0, \ |
| 137 | .time_slice = HZ, \ | ||
| 138 | .tasks = LIST_HEAD_INIT(tsk.tasks), \ | 141 | .tasks = LIST_HEAD_INIT(tsk.tasks), \ |
| 139 | .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ | 142 | .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ |
| 140 | .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ | 143 | .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ |
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 2306920fa388..c3db4a00f1fa 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h | |||
| @@ -256,6 +256,7 @@ enum | |||
| 256 | #ifdef CONFIG_HIGH_RES_TIMERS | 256 | #ifdef CONFIG_HIGH_RES_TIMERS |
| 257 | HRTIMER_SOFTIRQ, | 257 | HRTIMER_SOFTIRQ, |
| 258 | #endif | 258 | #endif |
| 259 | RCU_SOFTIRQ, /* Preferably RCU should always be the last softirq */ | ||

| 259 | }; | 260 | }; |
| 260 | 261 | ||
| 261 | /* softirq mask and active fields moved to irq_cpustat_t in | 262 | /* softirq mask and active fields moved to irq_cpustat_t in |
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 8b080024bbc1..7ba9e47bf061 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h | |||
| @@ -29,6 +29,12 @@ | |||
| 29 | # define SHIFT_HZ 9 | 29 | # define SHIFT_HZ 9 |
| 30 | #elif HZ >= 768 && HZ < 1536 | 30 | #elif HZ >= 768 && HZ < 1536 |
| 31 | # define SHIFT_HZ 10 | 31 | # define SHIFT_HZ 10 |
| 32 | #elif HZ >= 1536 && HZ < 3072 | ||
| 33 | # define SHIFT_HZ 11 | ||
| 34 | #elif HZ >= 3072 && HZ < 6144 | ||
| 35 | # define SHIFT_HZ 12 | ||
| 36 | #elif HZ >= 6144 && HZ < 12288 | ||
| 37 | # define SHIFT_HZ 13 | ||
| 32 | #else | 38 | #else |
| 33 | # error You lose. | 39 | # error You lose. |
| 34 | #endif | 40 | #endif |
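The new #elif branches extend SHIFT_HZ, effectively HZ rounded to the nearest power of two, up to HZ < 12288 instead of tripping the #error (the "timers: don't #error on higher HZ values" item in the merge log). A standalone sketch of the rounding rule the ladder encodes:

```c
#include <stdio.h>

/* SHIFT_HZ = s such that 3*2^(s-2) <= HZ < 3*2^(s-1), i.e. 2^s is the
 * power of two nearest to HZ (midpoints round up). */
static int shift_hz(unsigned int hz)
{
        int s = 0;

        while ((2u << s) <= hz)                 /* s = floor(log2(hz)) */
                s++;
        if (s && hz >= (3u << (s - 1)))         /* past the midpoint: round up */
                s++;
        return s;
}

int main(void)
{
        printf("HZ=1000 -> SHIFT_HZ %d\n", shift_hz(1000));     /* 10 */
        printf("HZ=2048 -> SHIFT_HZ %d\n", shift_hz(2048));     /* 11 */
        printf("HZ=6144 -> SHIFT_HZ %d\n", shift_hz(6144));     /* 13 */
        return 0;
}
```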
diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 94bc99656963..a7283c9beadf 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h | |||
| @@ -105,8 +105,8 @@ struct user; | |||
| 105 | * supposed to. | 105 | * supposed to. |
| 106 | */ | 106 | */ |
| 107 | #ifdef CONFIG_PREEMPT_VOLUNTARY | 107 | #ifdef CONFIG_PREEMPT_VOLUNTARY |
| 108 | extern int cond_resched(void); | 108 | extern int _cond_resched(void); |
| 109 | # define might_resched() cond_resched() | 109 | # define might_resched() _cond_resched() |
| 110 | #else | 110 | #else |
| 111 | # define might_resched() do { } while (0) | 111 | # define might_resched() do { } while (0) |
| 112 | #endif | 112 | #endif |
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h new file mode 100644 index 000000000000..901c2d6377a8 --- /dev/null +++ b/include/linux/latencytop.h | |||
| @@ -0,0 +1,44 @@ | |||
| 1 | /* | ||
| 2 | * latencytop.h: Infrastructure for displaying latency | ||
| 3 | * | ||
| 4 | * (C) Copyright 2008 Intel Corporation | ||
| 5 | * Author: Arjan van de Ven <arjan@linux.intel.com> | ||
| 6 | * | ||
| 7 | */ | ||
| 8 | |||
| 9 | #ifndef _INCLUDE_GUARD_LATENCYTOP_H_ | ||
| 10 | #define _INCLUDE_GUARD_LATENCYTOP_H_ | ||
| 11 | |||
| 12 | #ifdef CONFIG_LATENCYTOP | ||
| 13 | |||
| 14 | #define LT_SAVECOUNT 32 | ||
| 15 | #define LT_BACKTRACEDEPTH 12 | ||
| 16 | |||
| 17 | struct latency_record { | ||
| 18 | unsigned long backtrace[LT_BACKTRACEDEPTH]; | ||
| 19 | unsigned int count; | ||
| 20 | unsigned long time; | ||
| 21 | unsigned long max; | ||
| 22 | }; | ||
| 23 | |||
| 24 | |||
| 25 | struct task_struct; | ||
| 26 | |||
| 27 | void account_scheduler_latency(struct task_struct *task, int usecs, int inter); | ||
| 28 | |||
| 29 | void clear_all_latency_tracing(struct task_struct *p); | ||
| 30 | |||
| 31 | #else | ||
| 32 | |||
| 33 | static inline void | ||
| 34 | account_scheduler_latency(struct task_struct *task, int usecs, int inter) | ||
| 35 | { | ||
| 36 | } | ||
| 37 | |||
| 38 | static inline void clear_all_latency_tracing(struct task_struct *p) | ||
| 39 | { | ||
| 40 | } | ||
| 41 | |||
| 42 | #endif | ||
| 43 | |||
| 44 | #endif | ||
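latencytop.h only declares the hooks the scheduler calls; the records it accumulates (count, total time, max, plus a backtrace) are what /proc/&lt;pid&gt;/latency prints. A hedged sketch of a caller; the ns-to-us shift and the meaning of the 'inter' flag are assumptions here, not spelled out in this hunk:

```c
/* Report how long @tsk just spent blocked.  delta_ns comes from the
 * scheduler's own accounting; >> 10 is a cheap ns -> ~us approximation.
 * 'inter' is assumed to mean "interruptible sleep". */
static void account_block_time_example(struct task_struct *tsk, u64 delta_ns,
                                        int interruptible)
{
        account_scheduler_latency(tsk, (int)(delta_ns >> 10), interruptible);
}
```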
diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 0c40cc0b4a36..5dfbc684ce7d 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h | |||
| @@ -207,9 +207,7 @@ static inline int notifier_to_errno(int ret) | |||
| 207 | #define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */ | 207 | #define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */ |
| 208 | #define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */ | 208 | #define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */ |
| 209 | #define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ | 209 | #define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ |
| 210 | #define CPU_LOCK_ACQUIRE 0x0008 /* Acquire all hotcpu locks */ | 210 | #define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task, |
| 211 | #define CPU_LOCK_RELEASE 0x0009 /* Release all hotcpu locks */ | ||
| 212 | #define CPU_DYING 0x000A /* CPU (unsigned)v not running any task, | ||
| 213 | * not handling interrupts, soon dead */ | 211 | * not handling interrupts, soon dead */ |
| 214 | 212 | ||
| 215 | /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend | 213 | /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend |
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h new file mode 100644 index 000000000000..4d6624260b4c --- /dev/null +++ b/include/linux/rcuclassic.h | |||
| @@ -0,0 +1,164 @@ | |||
| 1 | /* | ||
| 2 | * Read-Copy Update mechanism for mutual exclusion (classic version) | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 17 | * | ||
| 18 | * Copyright IBM Corporation, 2001 | ||
| 19 | * | ||
| 20 | * Author: Dipankar Sarma <dipankar@in.ibm.com> | ||
| 21 | * | ||
| 22 | * Based on the original work by Paul McKenney <paulmck@us.ibm.com> | ||
| 23 | * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. | ||
| 24 | * Papers: | ||
| 25 | * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf | ||
| 26 | * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) | ||
| 27 | * | ||
| 28 | * For detailed explanation of Read-Copy Update mechanism see - | ||
| 29 | * Documentation/RCU | ||
| 30 | * | ||
| 31 | */ | ||
| 32 | |||
| 33 | #ifndef __LINUX_RCUCLASSIC_H | ||
| 34 | #define __LINUX_RCUCLASSIC_H | ||
| 35 | |||
| 36 | #ifdef __KERNEL__ | ||
| 37 | |||
| 38 | #include <linux/cache.h> | ||
| 39 | #include <linux/spinlock.h> | ||
| 40 | #include <linux/threads.h> | ||
| 41 | #include <linux/percpu.h> | ||
| 42 | #include <linux/cpumask.h> | ||
| 43 | #include <linux/seqlock.h> | ||
| 44 | |||
| 45 | |||
| 46 | /* Global control variables for rcupdate callback mechanism. */ | ||
| 47 | struct rcu_ctrlblk { | ||
| 48 | long cur; /* Current batch number. */ | ||
| 49 | long completed; /* Number of the last completed batch */ | ||
| 50 | int next_pending; /* Is the next batch already waiting? */ | ||
| 51 | |||
| 52 | int signaled; | ||
| 53 | |||
| 54 | spinlock_t lock ____cacheline_internodealigned_in_smp; | ||
| 55 | cpumask_t cpumask; /* CPUs that need to switch in order */ | ||
| 56 | /* for current batch to proceed. */ | ||
| 57 | } ____cacheline_internodealigned_in_smp; | ||
| 58 | |||
| 59 | /* Is batch a before batch b ? */ | ||
| 60 | static inline int rcu_batch_before(long a, long b) | ||
| 61 | { | ||
| 62 | return (a - b) < 0; | ||
| 63 | } | ||
| 64 | |||
| 65 | /* Is batch a after batch b ? */ | ||
| 66 | static inline int rcu_batch_after(long a, long b) | ||
| 67 | { | ||
| 68 | return (a - b) > 0; | ||
| 69 | } | ||
| 70 | |||
| 71 | /* | ||
| 72 | * Per-CPU data for Read-Copy Update. | ||
| 73 | * nxtlist - new callbacks are added here | ||
| 74 | * curlist - current batch for which quiescent cycle started if any | ||
| 75 | */ | ||
| 76 | struct rcu_data { | ||
| 77 | /* 1) quiescent state handling : */ | ||
| 78 | long quiescbatch; /* Batch # for grace period */ | ||
| 79 | int passed_quiesc; /* User-mode/idle loop etc. */ | ||
| 80 | int qs_pending; /* core waits for quiesc state */ | ||
| 81 | |||
| 82 | /* 2) batch handling */ | ||
| 83 | long batch; /* Batch # for current RCU batch */ | ||
| 84 | struct rcu_head *nxtlist; | ||
| 85 | struct rcu_head **nxttail; | ||
| 86 | long qlen; /* # of queued callbacks */ | ||
| 87 | struct rcu_head *curlist; | ||
| 88 | struct rcu_head **curtail; | ||
| 89 | struct rcu_head *donelist; | ||
| 90 | struct rcu_head **donetail; | ||
| 91 | long blimit; /* Upper limit on a processed batch */ | ||
| 92 | int cpu; | ||
| 93 | struct rcu_head barrier; | ||
| 94 | }; | ||
| 95 | |||
| 96 | DECLARE_PER_CPU(struct rcu_data, rcu_data); | ||
| 97 | DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); | ||
| 98 | |||
| 99 | /* | ||
| 100 | * Increment the quiescent state counter. | ||
| 101 | * The counter is a bit degenerated: We do not need to know | ||
| 102 | * how many quiescent states passed, just if there was at least | ||
| 103 | * one since the start of the grace period. Thus just a flag. | ||
| 104 | */ | ||
| 105 | static inline void rcu_qsctr_inc(int cpu) | ||
| 106 | { | ||
| 107 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
| 108 | rdp->passed_quiesc = 1; | ||
| 109 | } | ||
| 110 | static inline void rcu_bh_qsctr_inc(int cpu) | ||
| 111 | { | ||
| 112 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); | ||
| 113 | rdp->passed_quiesc = 1; | ||
| 114 | } | ||
| 115 | |||
| 116 | extern int rcu_pending(int cpu); | ||
| 117 | extern int rcu_needs_cpu(int cpu); | ||
| 118 | |||
| 119 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 120 | extern struct lockdep_map rcu_lock_map; | ||
| 121 | # define rcu_read_acquire() \ | ||
| 122 | lock_acquire(&rcu_lock_map, 0, 0, 2, 1, _THIS_IP_) | ||
| 123 | # define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_) | ||
| 124 | #else | ||
| 125 | # define rcu_read_acquire() do { } while (0) | ||
| 126 | # define rcu_read_release() do { } while (0) | ||
| 127 | #endif | ||
| 128 | |||
| 129 | #define __rcu_read_lock() \ | ||
| 130 | do { \ | ||
| 131 | preempt_disable(); \ | ||
| 132 | __acquire(RCU); \ | ||
| 133 | rcu_read_acquire(); \ | ||
| 134 | } while (0) | ||
| 135 | #define __rcu_read_unlock() \ | ||
| 136 | do { \ | ||
| 137 | rcu_read_release(); \ | ||
| 138 | __release(RCU); \ | ||
| 139 | preempt_enable(); \ | ||
| 140 | } while (0) | ||
| 141 | #define __rcu_read_lock_bh() \ | ||
| 142 | do { \ | ||
| 143 | local_bh_disable(); \ | ||
| 144 | __acquire(RCU_BH); \ | ||
| 145 | rcu_read_acquire(); \ | ||
| 146 | } while (0) | ||
| 147 | #define __rcu_read_unlock_bh() \ | ||
| 148 | do { \ | ||
| 149 | rcu_read_release(); \ | ||
| 150 | __release(RCU_BH); \ | ||
| 151 | local_bh_enable(); \ | ||
| 152 | } while (0) | ||
| 153 | |||
| 154 | #define __synchronize_sched() synchronize_rcu() | ||
| 155 | |||
| 156 | extern void __rcu_init(void); | ||
| 157 | extern void rcu_check_callbacks(int cpu, int user); | ||
| 158 | extern void rcu_restart_cpu(int cpu); | ||
| 159 | |||
| 160 | extern long rcu_batches_completed(void); | ||
| 161 | extern long rcu_batches_completed_bh(void); | ||
| 162 | |||
| 163 | #endif /* __KERNEL__ */ | ||
| 164 | #endif /* __LINUX_RCUCLASSIC_H */ | ||
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index cc24a01df940..d32c14de270e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h | |||
| @@ -15,7 +15,7 @@ | |||
| 15 | * along with this program; if not, write to the Free Software | 15 | * along with this program; if not, write to the Free Software |
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
| 17 | * | 17 | * |
| 18 | * Copyright (C) IBM Corporation, 2001 | 18 | * Copyright IBM Corporation, 2001 |
| 19 | * | 19 | * |
| 20 | * Author: Dipankar Sarma <dipankar@in.ibm.com> | 20 | * Author: Dipankar Sarma <dipankar@in.ibm.com> |
| 21 | * | 21 | * |
| @@ -53,96 +53,18 @@ struct rcu_head { | |||
| 53 | void (*func)(struct rcu_head *head); | 53 | void (*func)(struct rcu_head *head); |
| 54 | }; | 54 | }; |
| 55 | 55 | ||
| 56 | #ifdef CONFIG_CLASSIC_RCU | ||
| 57 | #include <linux/rcuclassic.h> | ||
| 58 | #else /* #ifdef CONFIG_CLASSIC_RCU */ | ||
| 59 | #include <linux/rcupreempt.h> | ||
| 60 | #endif /* #else #ifdef CONFIG_CLASSIC_RCU */ | ||
| 61 | |||
| 56 | #define RCU_HEAD_INIT { .next = NULL, .func = NULL } | 62 | #define RCU_HEAD_INIT { .next = NULL, .func = NULL } |
| 57 | #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT | 63 | #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT |
| 58 | #define INIT_RCU_HEAD(ptr) do { \ | 64 | #define INIT_RCU_HEAD(ptr) do { \ |
| 59 | (ptr)->next = NULL; (ptr)->func = NULL; \ | 65 | (ptr)->next = NULL; (ptr)->func = NULL; \ |
| 60 | } while (0) | 66 | } while (0) |
| 61 | 67 | ||
| 62 | |||
| 63 | |||
| 64 | /* Global control variables for rcupdate callback mechanism. */ | ||
| 65 | struct rcu_ctrlblk { | ||
| 66 | long cur; /* Current batch number. */ | ||
| 67 | long completed; /* Number of the last completed batch */ | ||
| 68 | int next_pending; /* Is the next batch already waiting? */ | ||
| 69 | |||
| 70 | int signaled; | ||
| 71 | |||
| 72 | spinlock_t lock ____cacheline_internodealigned_in_smp; | ||
| 73 | cpumask_t cpumask; /* CPUs that need to switch in order */ | ||
| 74 | /* for current batch to proceed. */ | ||
| 75 | } ____cacheline_internodealigned_in_smp; | ||
| 76 | |||
| 77 | /* Is batch a before batch b ? */ | ||
| 78 | static inline int rcu_batch_before(long a, long b) | ||
| 79 | { | ||
| 80 | return (a - b) < 0; | ||
| 81 | } | ||
| 82 | |||
| 83 | /* Is batch a after batch b ? */ | ||
| 84 | static inline int rcu_batch_after(long a, long b) | ||
| 85 | { | ||
| 86 | return (a - b) > 0; | ||
| 87 | } | ||
| 88 | |||
| 89 | /* | ||
| 90 | * Per-CPU data for Read-Copy UPdate. | ||
| 91 | * nxtlist - new callbacks are added here | ||
| 92 | * curlist - current batch for which quiescent cycle started if any | ||
| 93 | */ | ||
| 94 | struct rcu_data { | ||
| 95 | /* 1) quiescent state handling : */ | ||
| 96 | long quiescbatch; /* Batch # for grace period */ | ||
| 97 | int passed_quiesc; /* User-mode/idle loop etc. */ | ||
| 98 | int qs_pending; /* core waits for quiesc state */ | ||
| 99 | |||
| 100 | /* 2) batch handling */ | ||
| 101 | long batch; /* Batch # for current RCU batch */ | ||
| 102 | struct rcu_head *nxtlist; | ||
| 103 | struct rcu_head **nxttail; | ||
| 104 | long qlen; /* # of queued callbacks */ | ||
| 105 | struct rcu_head *curlist; | ||
| 106 | struct rcu_head **curtail; | ||
| 107 | struct rcu_head *donelist; | ||
| 108 | struct rcu_head **donetail; | ||
| 109 | long blimit; /* Upper limit on a processed batch */ | ||
| 110 | int cpu; | ||
| 111 | struct rcu_head barrier; | ||
| 112 | }; | ||
| 113 | |||
| 114 | DECLARE_PER_CPU(struct rcu_data, rcu_data); | ||
| 115 | DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); | ||
| 116 | |||
| 117 | /* | ||
| 118 | * Increment the quiescent state counter. | ||
| 119 | * The counter is a bit degenerated: We do not need to know | ||
| 120 | * how many quiescent states passed, just if there was at least | ||
| 121 | * one since the start of the grace period. Thus just a flag. | ||
| 122 | */ | ||
| 123 | static inline void rcu_qsctr_inc(int cpu) | ||
| 124 | { | ||
| 125 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
| 126 | rdp->passed_quiesc = 1; | ||
| 127 | } | ||
| 128 | static inline void rcu_bh_qsctr_inc(int cpu) | ||
| 129 | { | ||
| 130 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); | ||
| 131 | rdp->passed_quiesc = 1; | ||
| 132 | } | ||
| 133 | |||
| 134 | extern int rcu_pending(int cpu); | ||
| 135 | extern int rcu_needs_cpu(int cpu); | ||
| 136 | |||
| 137 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 138 | extern struct lockdep_map rcu_lock_map; | ||
| 139 | # define rcu_read_acquire() lock_acquire(&rcu_lock_map, 0, 0, 2, 1, _THIS_IP_) | ||
| 140 | # define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_) | ||
| 141 | #else | ||
| 142 | # define rcu_read_acquire() do { } while (0) | ||
| 143 | # define rcu_read_release() do { } while (0) | ||
| 144 | #endif | ||
| 145 | |||
| 146 | /** | 68 | /** |
| 147 | * rcu_read_lock - mark the beginning of an RCU read-side critical section. | 69 | * rcu_read_lock - mark the beginning of an RCU read-side critical section. |
| 148 | * | 70 | * |
| @@ -172,24 +94,13 @@ extern struct lockdep_map rcu_lock_map; | |||
| 172 | * | 94 | * |
| 173 | * It is illegal to block while in an RCU read-side critical section. | 95 | * It is illegal to block while in an RCU read-side critical section. |
| 174 | */ | 96 | */ |
| 175 | #define rcu_read_lock() \ | 97 | #define rcu_read_lock() __rcu_read_lock() |
| 176 | do { \ | ||
| 177 | preempt_disable(); \ | ||
| 178 | __acquire(RCU); \ | ||
| 179 | rcu_read_acquire(); \ | ||
| 180 | } while(0) | ||
| 181 | 98 | ||
| 182 | /** | 99 | /** |
| 183 | * rcu_read_unlock - marks the end of an RCU read-side critical section. | 100 | * rcu_read_unlock - marks the end of an RCU read-side critical section. |
| 184 | * | 101 | * |
| 185 | * See rcu_read_lock() for more information. | 102 | * See rcu_read_lock() for more information. |
| 186 | */ | 103 | */ |
| 187 | #define rcu_read_unlock() \ | ||
| 188 | do { \ | ||
| 189 | rcu_read_release(); \ | ||
| 190 | __release(RCU); \ | ||
| 191 | preempt_enable(); \ | ||
| 192 | } while(0) | ||
| 193 | 104 | ||
| 194 | /* | 105 | /* |
| 195 | * So where is rcu_write_lock()? It does not exist, as there is no | 106 | * So where is rcu_write_lock()? It does not exist, as there is no |
| @@ -200,6 +111,7 @@ extern struct lockdep_map rcu_lock_map; | |||
| 200 | * used as well. RCU does not care how the writers keep out of each | 111 | * used as well. RCU does not care how the writers keep out of each |
| 201 | * others' way, as long as they do so. | 112 | * others' way, as long as they do so. |
| 202 | */ | 113 | */ |
| 114 | #define rcu_read_unlock() __rcu_read_unlock() | ||
| 203 | 115 | ||
| 204 | /** | 116 | /** |
| 205 | * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section | 117 | * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section |
| @@ -212,24 +124,14 @@ extern struct lockdep_map rcu_lock_map; | |||
| 212 | * can use just rcu_read_lock(). | 124 | * can use just rcu_read_lock(). |
| 213 | * | 125 | * |
| 214 | */ | 126 | */ |
| 215 | #define rcu_read_lock_bh() \ | 127 | #define rcu_read_lock_bh() __rcu_read_lock_bh() |
| 216 | do { \ | ||
| 217 | local_bh_disable(); \ | ||
| 218 | __acquire(RCU_BH); \ | ||
| 219 | rcu_read_acquire(); \ | ||
| 220 | } while(0) | ||
| 221 | 128 | ||
| 222 | /* | 129 | /* |
| 223 | * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section | 130 | * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section |
| 224 | * | 131 | * |
| 225 | * See rcu_read_lock_bh() for more information. | 132 | * See rcu_read_lock_bh() for more information. |
| 226 | */ | 133 | */ |
| 227 | #define rcu_read_unlock_bh() \ | 134 | #define rcu_read_unlock_bh() __rcu_read_unlock_bh() |
| 228 | do { \ | ||
| 229 | rcu_read_release(); \ | ||
| 230 | __release(RCU_BH); \ | ||
| 231 | local_bh_enable(); \ | ||
| 232 | } while(0) | ||
| 233 | 135 | ||
| 234 | /* | 136 | /* |
| 235 | * Prevent the compiler from merging or refetching accesses. The compiler | 137 | * Prevent the compiler from merging or refetching accesses. The compiler |
| @@ -293,21 +195,52 @@ extern struct lockdep_map rcu_lock_map; | |||
| 293 | * In "classic RCU", these two guarantees happen to be one and | 195 | * In "classic RCU", these two guarantees happen to be one and |
| 294 | * the same, but can differ in realtime RCU implementations. | 196 | * the same, but can differ in realtime RCU implementations. |
| 295 | */ | 197 | */ |
| 296 | #define synchronize_sched() synchronize_rcu() | 198 | #define synchronize_sched() __synchronize_sched() |
| 297 | 199 | ||
| 298 | extern void rcu_init(void); | 200 | /** |
| 299 | extern void rcu_check_callbacks(int cpu, int user); | 201 | * call_rcu - Queue an RCU callback for invocation after a grace period. |
| 300 | extern void rcu_restart_cpu(int cpu); | 202 | * @head: structure to be used for queueing the RCU updates. |
| 301 | extern long rcu_batches_completed(void); | 203 | * @func: actual update function to be invoked after the grace period |
| 302 | extern long rcu_batches_completed_bh(void); | 204 | * |
| 205 | * The update function will be invoked some time after a full grace | ||
| 206 | * period elapses, in other words after all currently executing RCU | ||
| 207 | * read-side critical sections have completed. RCU read-side critical | ||
| 208 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
| 209 | * and may be nested. | ||
| 210 | */ | ||
| 211 | extern void call_rcu(struct rcu_head *head, | ||
| 212 | void (*func)(struct rcu_head *head)); | ||
| 303 | 213 | ||
| 304 | /* Exported interfaces */ | 214 | /** |
| 305 | extern void FASTCALL(call_rcu(struct rcu_head *head, | 215 | * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. |
| 306 | void (*func)(struct rcu_head *head))); | 216 | * @head: structure to be used for queueing the RCU updates. |
| 307 | extern void FASTCALL(call_rcu_bh(struct rcu_head *head, | 217 | * @func: actual update function to be invoked after the grace period |
| 308 | void (*func)(struct rcu_head *head))); | 218 | * |
| 219 | * The update function will be invoked some time after a full grace | ||
| 220 | * period elapses, in other words after all currently executing RCU | ||
| 221 | * read-side critical sections have completed. call_rcu_bh() assumes | ||
| 222 | * that the read-side critical sections end on completion of a softirq | ||
| 223 | * handler. This means that read-side critical sections in process | ||
| 224 | * context must not be interrupted by softirqs. This interface is to be | ||
| 225 | * used when most of the read-side critical sections are in softirq context. | ||
| 226 | * RCU read-side critical sections are delimited by : | ||
| 227 | * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context. | ||
| 228 | * OR | ||
| 229 | * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. | ||
| 230 | * These may be nested. | ||
| 231 | */ | ||
| 232 | extern void call_rcu_bh(struct rcu_head *head, | ||
| 233 | void (*func)(struct rcu_head *head)); | ||
| 234 | |||
| 235 | /* Exported common interfaces */ | ||
| 309 | extern void synchronize_rcu(void); | 236 | extern void synchronize_rcu(void); |
| 310 | extern void rcu_barrier(void); | 237 | extern void rcu_barrier(void); |
| 238 | extern long rcu_batches_completed(void); | ||
| 239 | extern long rcu_batches_completed_bh(void); | ||
| 240 | |||
| 241 | /* Internal to kernel */ | ||
| 242 | extern void rcu_init(void); | ||
| 243 | extern int rcu_needs_cpu(int cpu); | ||
| 311 | 244 | ||
| 312 | #endif /* __KERNEL__ */ | 245 | #endif /* __KERNEL__ */ |
| 313 | #endif /* __LINUX_RCUPDATE_H */ | 246 | #endif /* __LINUX_RCUPDATE_H */ |
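rcupdate.h is now a thin wrapper: the read-side primitives map onto whichever implementation (classic or preemptible) was configured, while call_rcu()/synchronize_rcu() keep the semantics documented above. A sketch of the canonical reader/updater pairing those semantics support; rcu_dereference(), rcu_assign_pointer(), kfree() and container_of() come from the wider kernel API, not from this hunk:

```c
struct cfg {
        int value;
        struct rcu_head rcu;
};

static struct cfg *global_cfg;                  /* RCU-protected shared pointer */

static int reader_example(void)
{
        struct cfg *c;
        int v = -1;

        rcu_read_lock();                        /* expands to __rcu_read_lock() */
        c = rcu_dereference(global_cfg);
        if (c)
                v = c->value;
        rcu_read_unlock();
        return v;
}

static void free_cfg_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct cfg, rcu));
}

static void updater_example(struct cfg *newc)
{
        struct cfg *old = global_cfg;           /* updater-side exclusion assumed */

        rcu_assign_pointer(global_cfg, newc);   /* publish the new version */
        if (old)
                call_rcu(&old->rcu, free_cfg_rcu);  /* reclaim after a grace period */
}
```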
diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h new file mode 100644 index 000000000000..ece8eb3e4151 --- /dev/null +++ b/include/linux/rcupreempt.h | |||
| @@ -0,0 +1,86 @@ | |||
| 1 | /* | ||
| 2 | * Read-Copy Update mechanism for mutual exclusion (RT implementation) | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 17 | * | ||
| 18 | * Copyright (C) IBM Corporation, 2006 | ||
| 19 | * | ||
| 20 | * Author: Paul McKenney <paulmck@us.ibm.com> | ||
| 21 | * | ||
| 22 | * Based on the original work by Paul McKenney <paul.mckenney@us.ibm.com> | ||
| 23 | * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. | ||
| 24 | * Papers: | ||
| 25 | * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf | ||
| 26 | * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) | ||
| 27 | * | ||
| 28 | * For detailed explanation of Read-Copy Update mechanism see - | ||
| 29 | * Documentation/RCU | ||
| 30 | * | ||
| 31 | */ | ||
| 32 | |||
| 33 | #ifndef __LINUX_RCUPREEMPT_H | ||
| 34 | #define __LINUX_RCUPREEMPT_H | ||
| 35 | |||
| 36 | #ifdef __KERNEL__ | ||
| 37 | |||
| 38 | #include <linux/cache.h> | ||
| 39 | #include <linux/spinlock.h> | ||
| 40 | #include <linux/threads.h> | ||
| 41 | #include <linux/percpu.h> | ||
| 42 | #include <linux/cpumask.h> | ||
| 43 | #include <linux/seqlock.h> | ||
| 44 | |||
| 45 | #define rcu_qsctr_inc(cpu) | ||
| 46 | #define rcu_bh_qsctr_inc(cpu) | ||
| 47 | #define call_rcu_bh(head, rcu) call_rcu(head, rcu) | ||
| 48 | |||
| 49 | extern void __rcu_read_lock(void); | ||
| 50 | extern void __rcu_read_unlock(void); | ||
| 51 | extern int rcu_pending(int cpu); | ||
| 52 | extern int rcu_needs_cpu(int cpu); | ||
| 53 | |||
| 54 | #define __rcu_read_lock_bh() { rcu_read_lock(); local_bh_disable(); } | ||
| 55 | #define __rcu_read_unlock_bh() { local_bh_enable(); rcu_read_unlock(); } | ||
| 56 | |||
| 57 | extern void __synchronize_sched(void); | ||
| 58 | |||
| 59 | extern void __rcu_init(void); | ||
| 60 | extern void rcu_check_callbacks(int cpu, int user); | ||
| 61 | extern void rcu_restart_cpu(int cpu); | ||
| 62 | extern long rcu_batches_completed(void); | ||
| 63 | |||
| 64 | /* | ||
| 65 | * Return the number of RCU batches processed thus far. Useful for debug | ||
| 66 | * and statistics. The _bh variant is identical to straight RCU | ||
| 67 | */ | ||
| 68 | static inline long rcu_batches_completed_bh(void) | ||
| 69 | { | ||
| 70 | return rcu_batches_completed(); | ||
| 71 | } | ||
| 72 | |||
| 73 | #ifdef CONFIG_RCU_TRACE | ||
| 74 | struct rcupreempt_trace; | ||
| 75 | extern long *rcupreempt_flipctr(int cpu); | ||
| 76 | extern long rcupreempt_data_completed(void); | ||
| 77 | extern int rcupreempt_flip_flag(int cpu); | ||
| 78 | extern int rcupreempt_mb_flag(int cpu); | ||
| 79 | extern char *rcupreempt_try_flip_state_name(void); | ||
| 80 | extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu); | ||
| 81 | #endif | ||
| 82 | |||
| 83 | struct softirq_action; | ||
| 84 | |||
| 85 | #endif /* __KERNEL__ */ | ||
| 86 | #endif /* __LINUX_RCUPREEMPT_H */ | ||
diff --git a/include/linux/rcupreempt_trace.h b/include/linux/rcupreempt_trace.h new file mode 100644 index 000000000000..21cd6b2a5c42 --- /dev/null +++ b/include/linux/rcupreempt_trace.h | |||
| @@ -0,0 +1,99 @@ | |||
| 1 | /* | ||
| 2 | * Read-Copy Update mechanism for mutual exclusion (RT implementation) | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 17 | * | ||
| 18 | * Copyright (C) IBM Corporation, 2006 | ||
| 19 | * | ||
| 20 | * Author: Paul McKenney <paulmck@us.ibm.com> | ||
| 21 | * | ||
| 22 | * Based on the original work by Paul McKenney <paul.mckenney@us.ibm.com> | ||
| 23 | * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. | ||
| 24 | * Papers: | ||
| 25 | * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf | ||
| 26 | * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) | ||
| 27 | * | ||
| 28 | * For detailed explanation of the Preemptible Read-Copy Update mechanism see - | ||
| 29 | * http://lwn.net/Articles/253651/ | ||
| 30 | */ | ||
| 31 | |||
| 32 | #ifndef __LINUX_RCUPREEMPT_TRACE_H | ||
| 33 | #define __LINUX_RCUPREEMPT_TRACE_H | ||
| 34 | |||
| 35 | #ifdef __KERNEL__ | ||
| 36 | #include <linux/types.h> | ||
| 37 | #include <linux/kernel.h> | ||
| 38 | |||
| 39 | #include <asm/atomic.h> | ||
| 40 | |||
| 41 | /* | ||
| 42 | * PREEMPT_RCU data structures. | ||
| 43 | */ | ||
| 44 | |||
| 45 | struct rcupreempt_trace { | ||
| 46 | long next_length; | ||
| 47 | long next_add; | ||
| 48 | long wait_length; | ||
| 49 | long wait_add; | ||
| 50 | long done_length; | ||
| 51 | long done_add; | ||
| 52 | long done_remove; | ||
| 53 | atomic_t done_invoked; | ||
| 54 | long rcu_check_callbacks; | ||
| 55 | atomic_t rcu_try_flip_1; | ||
| 56 | atomic_t rcu_try_flip_e1; | ||
| 57 | long rcu_try_flip_i1; | ||
| 58 | long rcu_try_flip_ie1; | ||
| 59 | long rcu_try_flip_g1; | ||
| 60 | long rcu_try_flip_a1; | ||
| 61 | long rcu_try_flip_ae1; | ||
| 62 | long rcu_try_flip_a2; | ||
| 63 | long rcu_try_flip_z1; | ||
| 64 | long rcu_try_flip_ze1; | ||
| 65 | long rcu_try_flip_z2; | ||
| 66 | long rcu_try_flip_m1; | ||
| 67 | long rcu_try_flip_me1; | ||
| 68 | long rcu_try_flip_m2; | ||
| 69 | }; | ||
| 70 | |||
| 71 | #ifdef CONFIG_RCU_TRACE | ||
| 72 | #define RCU_TRACE(fn, arg) fn(arg); | ||
| 73 | #else | ||
| 74 | #define RCU_TRACE(fn, arg) | ||
| 75 | #endif | ||
| 76 | |||
| 77 | extern void rcupreempt_trace_move2done(struct rcupreempt_trace *trace); | ||
| 78 | extern void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace); | ||
| 79 | extern void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace); | ||
| 80 | extern void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace); | ||
| 81 | extern void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace); | ||
| 82 | extern void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace); | ||
| 83 | extern void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace); | ||
| 84 | extern void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace); | ||
| 85 | extern void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace); | ||
| 86 | extern void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace); | ||
| 87 | extern void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace); | ||
| 88 | extern void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace); | ||
| 89 | extern void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace); | ||
| 90 | extern void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace); | ||
| 91 | extern void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace); | ||
| 92 | extern void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace); | ||
| 93 | extern void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace); | ||
| 94 | extern void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace); | ||
| 95 | extern void rcupreempt_trace_invoke(struct rcupreempt_trace *trace); | ||
| 96 | extern void rcupreempt_trace_next_add(struct rcupreempt_trace *trace); | ||
| 97 | |||
| 98 | #endif /* __KERNEL__ */ | ||
| 99 | #endif /* __LINUX_RCUPREEMPT_TRACE_H */ | ||
diff --git a/include/linux/sched.h b/include/linux/sched.h index d6eacda765ca..df5b24ee80b3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -78,7 +78,6 @@ struct sched_param { | |||
| 78 | #include <linux/proportions.h> | 78 | #include <linux/proportions.h> |
| 79 | #include <linux/seccomp.h> | 79 | #include <linux/seccomp.h> |
| 80 | #include <linux/rcupdate.h> | 80 | #include <linux/rcupdate.h> |
| 81 | #include <linux/futex.h> | ||
| 82 | #include <linux/rtmutex.h> | 81 | #include <linux/rtmutex.h> |
| 83 | 82 | ||
| 84 | #include <linux/time.h> | 83 | #include <linux/time.h> |
| @@ -88,11 +87,13 @@ struct sched_param { | |||
| 88 | #include <linux/hrtimer.h> | 87 | #include <linux/hrtimer.h> |
| 89 | #include <linux/task_io_accounting.h> | 88 | #include <linux/task_io_accounting.h> |
| 90 | #include <linux/kobject.h> | 89 | #include <linux/kobject.h> |
| 90 | #include <linux/latencytop.h> | ||
| 91 | 91 | ||
| 92 | #include <asm/processor.h> | 92 | #include <asm/processor.h> |
| 93 | 93 | ||
| 94 | struct exec_domain; | 94 | struct exec_domain; |
| 95 | struct futex_pi_state; | 95 | struct futex_pi_state; |
| 96 | struct robust_list_head; | ||
| 96 | struct bio; | 97 | struct bio; |
| 97 | 98 | ||
| 98 | /* | 99 | /* |
| @@ -230,6 +231,8 @@ static inline int select_nohz_load_balancer(int cpu) | |||
| 230 | } | 231 | } |
| 231 | #endif | 232 | #endif |
| 232 | 233 | ||
| 234 | extern unsigned long rt_needs_cpu(int cpu); | ||
| 235 | |||
| 233 | /* | 236 | /* |
| 234 | * Only dump TASK_* tasks. (0 for all tasks) | 237 | * Only dump TASK_* tasks. (0 for all tasks) |
| 235 | */ | 238 | */ |
| @@ -257,13 +260,19 @@ extern void trap_init(void); | |||
| 257 | extern void account_process_tick(struct task_struct *task, int user); | 260 | extern void account_process_tick(struct task_struct *task, int user); |
| 258 | extern void update_process_times(int user); | 261 | extern void update_process_times(int user); |
| 259 | extern void scheduler_tick(void); | 262 | extern void scheduler_tick(void); |
| 263 | extern void hrtick_resched(void); | ||
| 264 | |||
| 265 | extern void sched_show_task(struct task_struct *p); | ||
| 260 | 266 | ||
| 261 | #ifdef CONFIG_DETECT_SOFTLOCKUP | 267 | #ifdef CONFIG_DETECT_SOFTLOCKUP |
| 262 | extern void softlockup_tick(void); | 268 | extern void softlockup_tick(void); |
| 263 | extern void spawn_softlockup_task(void); | 269 | extern void spawn_softlockup_task(void); |
| 264 | extern void touch_softlockup_watchdog(void); | 270 | extern void touch_softlockup_watchdog(void); |
| 265 | extern void touch_all_softlockup_watchdogs(void); | 271 | extern void touch_all_softlockup_watchdogs(void); |
| 266 | extern int softlockup_thresh; | 272 | extern unsigned long softlockup_thresh; |
| 273 | extern unsigned long sysctl_hung_task_check_count; | ||
| 274 | extern unsigned long sysctl_hung_task_timeout_secs; | ||
| 275 | extern unsigned long sysctl_hung_task_warnings; | ||
| 267 | #else | 276 | #else |
| 268 | static inline void softlockup_tick(void) | 277 | static inline void softlockup_tick(void) |
| 269 | { | 278 | { |
| @@ -822,6 +831,7 @@ struct sched_class { | |||
| 822 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); | 831 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); |
| 823 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); | 832 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); |
| 824 | void (*yield_task) (struct rq *rq); | 833 | void (*yield_task) (struct rq *rq); |
| 834 | int (*select_task_rq)(struct task_struct *p, int sync); | ||
| 825 | 835 | ||
| 826 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); | 836 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); |
| 827 | 837 | ||
| @@ -837,11 +847,25 @@ struct sched_class { | |||
| 837 | int (*move_one_task) (struct rq *this_rq, int this_cpu, | 847 | int (*move_one_task) (struct rq *this_rq, int this_cpu, |
| 838 | struct rq *busiest, struct sched_domain *sd, | 848 | struct rq *busiest, struct sched_domain *sd, |
| 839 | enum cpu_idle_type idle); | 849 | enum cpu_idle_type idle); |
| 850 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); | ||
| 851 | void (*post_schedule) (struct rq *this_rq); | ||
| 852 | void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); | ||
| 840 | #endif | 853 | #endif |
| 841 | 854 | ||
| 842 | void (*set_curr_task) (struct rq *rq); | 855 | void (*set_curr_task) (struct rq *rq); |
| 843 | void (*task_tick) (struct rq *rq, struct task_struct *p); | 856 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); |
| 844 | void (*task_new) (struct rq *rq, struct task_struct *p); | 857 | void (*task_new) (struct rq *rq, struct task_struct *p); |
| 858 | void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask); | ||
| 859 | |||
| 860 | void (*join_domain)(struct rq *rq); | ||
| 861 | void (*leave_domain)(struct rq *rq); | ||
| 862 | |||
| 863 | void (*switched_from) (struct rq *this_rq, struct task_struct *task, | ||
| 864 | int running); | ||
| 865 | void (*switched_to) (struct rq *this_rq, struct task_struct *task, | ||
| 866 | int running); | ||
| 867 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, | ||
| 868 | int oldprio, int running); | ||
| 845 | }; | 869 | }; |
| 846 | 870 | ||
| 847 | struct load_weight { | 871 | struct load_weight { |
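struct sched_class grows a set of hooks (select_task_rq, pre_schedule/post_schedule/task_wake_up, set_cpus_allowed, join_domain/leave_domain, switched_from/switched_to/prio_changed) and task_tick() gains a 'queued' argument. A hedged sketch of what a minimal policy class might provide for two of the new entries; the signatures are taken from the struct above, the bodies are invented for illustration:

```c
/* Wakeup-time CPU selection: stay on the task's current CPU. */
static int select_task_rq_dummy(struct task_struct *p, int sync)
{
        return task_cpu(p);
}

/* Periodic tick: 'queued' is the new argument; it is assumed to mark
 * ticks raised by the hrtick path rather than the regular timer tick. */
static void task_tick_dummy(struct rq *rq, struct task_struct *p, int queued)
{
        /* update runtime accounting, decide whether to reschedule ... */
}
```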
| @@ -871,6 +895,8 @@ struct sched_entity { | |||
| 871 | #ifdef CONFIG_SCHEDSTATS | 895 | #ifdef CONFIG_SCHEDSTATS |
| 872 | u64 wait_start; | 896 | u64 wait_start; |
| 873 | u64 wait_max; | 897 | u64 wait_max; |
| 898 | u64 wait_count; | ||
| 899 | u64 wait_sum; | ||
| 874 | 900 | ||
| 875 | u64 sleep_start; | 901 | u64 sleep_start; |
| 876 | u64 sleep_max; | 902 | u64 sleep_max; |
| @@ -909,6 +935,21 @@ struct sched_entity { | |||
| 909 | #endif | 935 | #endif |
| 910 | }; | 936 | }; |
| 911 | 937 | ||
| 938 | struct sched_rt_entity { | ||
| 939 | struct list_head run_list; | ||
| 940 | unsigned int time_slice; | ||
| 941 | unsigned long timeout; | ||
| 942 | int nr_cpus_allowed; | ||
| 943 | |||
| 944 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 945 | struct sched_rt_entity *parent; | ||
| 946 | /* rq on which this entity is (to be) queued: */ | ||
| 947 | struct rt_rq *rt_rq; | ||
| 948 | /* rq "owned" by this entity/group: */ | ||
| 949 | struct rt_rq *my_q; | ||
| 950 | #endif | ||
| 951 | }; | ||
| 952 | |||
| 912 | struct task_struct { | 953 | struct task_struct { |
| 913 | volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ | 954 | volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ |
| 914 | void *stack; | 955 | void *stack; |
| @@ -925,9 +966,9 @@ struct task_struct { | |||
| 925 | #endif | 966 | #endif |
| 926 | 967 | ||
| 927 | int prio, static_prio, normal_prio; | 968 | int prio, static_prio, normal_prio; |
| 928 | struct list_head run_list; | ||
| 929 | const struct sched_class *sched_class; | 969 | const struct sched_class *sched_class; |
| 930 | struct sched_entity se; | 970 | struct sched_entity se; |
| 971 | struct sched_rt_entity rt; | ||
| 931 | 972 | ||
| 932 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 973 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
| 933 | /* list of struct preempt_notifier: */ | 974 | /* list of struct preempt_notifier: */ |
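run_list and time_slice leave task_struct itself and move into the embedded struct sched_rt_entity rt (defined in the hunk just above, and initialised via the .rt = { ... } block in the init_task.h change earlier). Code that touched the old fields now reads, for example:

```c
static inline unsigned int rt_time_slice_example(struct task_struct *p)
{
        return p->rt.time_slice;                /* formerly p->time_slice */
}

static inline int rt_on_runlist_example(struct task_struct *p)
{
        return !list_empty(&p->rt.run_list);    /* formerly p->run_list */
}
```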
| @@ -951,7 +992,11 @@ struct task_struct { | |||
| 951 | 992 | ||
| 952 | unsigned int policy; | 993 | unsigned int policy; |
| 953 | cpumask_t cpus_allowed; | 994 | cpumask_t cpus_allowed; |
| 954 | unsigned int time_slice; | 995 | |
| 996 | #ifdef CONFIG_PREEMPT_RCU | ||
| 997 | int rcu_read_lock_nesting; | ||
| 998 | int rcu_flipctr_idx; | ||
| 999 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | ||
| 955 | 1000 | ||
| 956 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 1001 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
| 957 | struct sched_info sched_info; | 1002 | struct sched_info sched_info; |
| @@ -1041,6 +1086,11 @@ struct task_struct { | |||
| 1041 | /* ipc stuff */ | 1086 | /* ipc stuff */ |
| 1042 | struct sysv_sem sysvsem; | 1087 | struct sysv_sem sysvsem; |
| 1043 | #endif | 1088 | #endif |
| 1089 | #ifdef CONFIG_DETECT_SOFTLOCKUP | ||
| 1090 | /* hung task detection */ | ||
| 1091 | unsigned long last_switch_timestamp; | ||
| 1092 | unsigned long last_switch_count; | ||
| 1093 | #endif | ||
| 1044 | /* CPU-specific state of this task */ | 1094 | /* CPU-specific state of this task */ |
| 1045 | struct thread_struct thread; | 1095 | struct thread_struct thread; |
| 1046 | /* filesystem information */ | 1096 | /* filesystem information */ |
| @@ -1173,6 +1223,10 @@ struct task_struct { | |||
| 1173 | int make_it_fail; | 1223 | int make_it_fail; |
| 1174 | #endif | 1224 | #endif |
| 1175 | struct prop_local_single dirties; | 1225 | struct prop_local_single dirties; |
| 1226 | #ifdef CONFIG_LATENCYTOP | ||
| 1227 | int latency_record_count; | ||
| 1228 | struct latency_record latency_record[LT_SAVECOUNT]; | ||
| 1229 | #endif | ||
| 1176 | }; | 1230 | }; |
| 1177 | 1231 | ||
| 1178 | /* | 1232 | /* |
| @@ -1453,6 +1507,12 @@ extern unsigned int sysctl_sched_child_runs_first; | |||
| 1453 | extern unsigned int sysctl_sched_features; | 1507 | extern unsigned int sysctl_sched_features; |
| 1454 | extern unsigned int sysctl_sched_migration_cost; | 1508 | extern unsigned int sysctl_sched_migration_cost; |
| 1455 | extern unsigned int sysctl_sched_nr_migrate; | 1509 | extern unsigned int sysctl_sched_nr_migrate; |
| 1510 | extern unsigned int sysctl_sched_rt_period; | ||
| 1511 | extern unsigned int sysctl_sched_rt_ratio; | ||
| 1512 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | ||
| 1513 | extern unsigned int sysctl_sched_min_bal_int_shares; | ||
| 1514 | extern unsigned int sysctl_sched_max_bal_int_shares; | ||
| 1515 | #endif | ||
| 1456 | 1516 | ||
| 1457 | int sched_nr_latency_handler(struct ctl_table *table, int write, | 1517 | int sched_nr_latency_handler(struct ctl_table *table, int write, |
| 1458 | struct file *file, void __user *buffer, size_t *length, | 1518 | struct file *file, void __user *buffer, size_t *length, |
| @@ -1845,7 +1905,18 @@ static inline int need_resched(void) | |||
| 1845 | * cond_resched_lock() will drop the spinlock before scheduling, | 1905 | * cond_resched_lock() will drop the spinlock before scheduling, |
| 1846 | * cond_resched_softirq() will enable bhs before scheduling. | 1906 | * cond_resched_softirq() will enable bhs before scheduling. |
| 1847 | */ | 1907 | */ |
| 1848 | extern int cond_resched(void); | 1908 | #ifdef CONFIG_PREEMPT |
| 1909 | static inline int cond_resched(void) | ||
| 1910 | { | ||
| 1911 | return 0; | ||
| 1912 | } | ||
| 1913 | #else | ||
| 1914 | extern int _cond_resched(void); | ||
| 1915 | static inline int cond_resched(void) | ||
| 1916 | { | ||
| 1917 | return _cond_resched(); | ||
| 1918 | } | ||
| 1919 | #endif | ||
| 1849 | extern int cond_resched_lock(spinlock_t * lock); | 1920 | extern int cond_resched_lock(spinlock_t * lock); |
| 1850 | extern int cond_resched_softirq(void); | 1921 | extern int cond_resched_softirq(void); |
| 1851 | 1922 | ||
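cond_resched() becomes an inline that compiles to a no-op (return 0) under CONFIG_PREEMPT, where full preemption already provides the scheduling points, and otherwise calls the out-of-line _cond_resched(). The usual caller is a long kernel loop offering a voluntary preemption point:

```c
static void process_items_example(struct list_head *items)
{
        struct list_head *pos;

        list_for_each(pos, items) {
                /* ... per-item work ... */
                cond_resched();         /* let higher-priority tasks run */
        }
}
```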
diff --git a/include/linux/smp_lock.h b/include/linux/smp_lock.h index 58962c51dee1..aab3a4cff4e1 100644 --- a/include/linux/smp_lock.h +++ b/include/linux/smp_lock.h | |||
| @@ -17,22 +17,10 @@ extern void __lockfunc __release_kernel_lock(void); | |||
| 17 | __release_kernel_lock(); \ | 17 | __release_kernel_lock(); \ |
| 18 | } while (0) | 18 | } while (0) |
| 19 | 19 | ||
| 20 | /* | ||
| 21 | * Non-SMP kernels will never block on the kernel lock, | ||
| 22 | * so we are better off returning a constant zero from | ||
| 23 | * reacquire_kernel_lock() so that the compiler can see | ||
| 24 | * it at compile-time. | ||
| 25 | */ | ||
| 26 | #if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_BKL) | ||
| 27 | # define return_value_on_smp return | ||
| 28 | #else | ||
| 29 | # define return_value_on_smp | ||
| 30 | #endif | ||
| 31 | |||
| 32 | static inline int reacquire_kernel_lock(struct task_struct *task) | 20 | static inline int reacquire_kernel_lock(struct task_struct *task) |
| 33 | { | 21 | { |
| 34 | if (unlikely(task->lock_depth >= 0)) | 22 | if (unlikely(task->lock_depth >= 0)) |
| 35 | return_value_on_smp __reacquire_kernel_lock(); | 23 | return __reacquire_kernel_lock(); |
| 36 | return 0; | 24 | return 0; |
| 37 | } | 25 | } |
| 38 | 26 | ||
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index e7fa657d0c49..5da9794b2d78 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h | |||
| @@ -9,10 +9,13 @@ struct stack_trace { | |||
| 9 | }; | 9 | }; |
| 10 | 10 | ||
| 11 | extern void save_stack_trace(struct stack_trace *trace); | 11 | extern void save_stack_trace(struct stack_trace *trace); |
| 12 | extern void save_stack_trace_tsk(struct task_struct *tsk, | ||
| 13 | struct stack_trace *trace); | ||
| 12 | 14 | ||
| 13 | extern void print_stack_trace(struct stack_trace *trace, int spaces); | 15 | extern void print_stack_trace(struct stack_trace *trace, int spaces); |
| 14 | #else | 16 | #else |
| 15 | # define save_stack_trace(trace) do { } while (0) | 17 | # define save_stack_trace(trace) do { } while (0) |
| 18 | # define save_stack_trace_tsk(tsk, trace) do { } while (0) | ||
| 16 | # define print_stack_trace(trace, spaces) do { } while (0) | 19 | # define print_stack_trace(trace, spaces) do { } while (0) |
| 17 | #endif | 20 | #endif |
| 18 | 21 | ||
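stacktrace.h gains save_stack_trace_tsk(), which the new latencytop code further down uses to snapshot another task's kernel stack. A small sketch of the same pattern; snapshot_task_stack() and EXAMPLE_DEPTH are names invented for illustration, and the caller must supply a buffer of at least EXAMPLE_DEPTH entries:

#include <linux/sched.h>
#include <linux/stacktrace.h>
#include <linux/string.h>

#define EXAMPLE_DEPTH 12        /* arbitrary depth for this sketch */

/* Capture the kernel stack of 'tsk' into a caller-supplied buffer. */
static void snapshot_task_stack(struct task_struct *tsk, unsigned long *entries)
{
        struct stack_trace trace;

        memset(&trace, 0, sizeof(trace));
        trace.max_entries = EXAMPLE_DEPTH;
        trace.entries = entries;
        trace.skip = 0;
        save_stack_trace_tsk(tsk, &trace);
}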
diff --git a/include/linux/topology.h b/include/linux/topology.h index 47729f18bfdf..2352f46160d3 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h | |||
| @@ -5,7 +5,7 @@ | |||
| 5 | * | 5 | * |
| 6 | * Copyright (C) 2002, IBM Corp. | 6 | * Copyright (C) 2002, IBM Corp. |
| 7 | * | 7 | * |
| 8 | * All rights reserved. | 8 | * All rights reserved. |
| 9 | * | 9 | * |
| 10 | * This program is free software; you can redistribute it and/or modify | 10 | * This program is free software; you can redistribute it and/or modify |
| 11 | * it under the terms of the GNU General Public License as published by | 11 | * it under the terms of the GNU General Public License as published by |
| @@ -103,6 +103,7 @@ | |||
| 103 | .forkexec_idx = 0, \ | 103 | .forkexec_idx = 0, \ |
| 104 | .flags = SD_LOAD_BALANCE \ | 104 | .flags = SD_LOAD_BALANCE \ |
| 105 | | SD_BALANCE_NEWIDLE \ | 105 | | SD_BALANCE_NEWIDLE \ |
| 106 | | SD_BALANCE_FORK \ | ||
| 106 | | SD_BALANCE_EXEC \ | 107 | | SD_BALANCE_EXEC \ |
| 107 | | SD_WAKE_AFFINE \ | 108 | | SD_WAKE_AFFINE \ |
| 108 | | SD_WAKE_IDLE \ | 109 | | SD_WAKE_IDLE \ |
| @@ -134,6 +135,7 @@ | |||
| 134 | .forkexec_idx = 1, \ | 135 | .forkexec_idx = 1, \ |
| 135 | .flags = SD_LOAD_BALANCE \ | 136 | .flags = SD_LOAD_BALANCE \ |
| 136 | | SD_BALANCE_NEWIDLE \ | 137 | | SD_BALANCE_NEWIDLE \ |
| 138 | | SD_BALANCE_FORK \ | ||
| 137 | | SD_BALANCE_EXEC \ | 139 | | SD_BALANCE_EXEC \ |
| 138 | | SD_WAKE_AFFINE \ | 140 | | SD_WAKE_AFFINE \ |
| 139 | | SD_WAKE_IDLE \ | 141 | | SD_WAKE_IDLE \ |
| @@ -165,6 +167,7 @@ | |||
| 165 | .forkexec_idx = 1, \ | 167 | .forkexec_idx = 1, \ |
| 166 | .flags = SD_LOAD_BALANCE \ | 168 | .flags = SD_LOAD_BALANCE \ |
| 167 | | SD_BALANCE_NEWIDLE \ | 169 | | SD_BALANCE_NEWIDLE \ |
| 170 | | SD_BALANCE_FORK \ | ||
| 168 | | SD_BALANCE_EXEC \ | 171 | | SD_BALANCE_EXEC \ |
| 169 | | SD_WAKE_AFFINE \ | 172 | | SD_WAKE_AFFINE \ |
| 170 | | BALANCE_FOR_PKG_POWER,\ | 173 | | BALANCE_FOR_PKG_POWER,\ |
diff --git a/init/Kconfig b/init/Kconfig index f5becd2a12f6..0eda68f0ad54 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
| @@ -763,3 +763,31 @@ source "block/Kconfig" | |||
| 763 | 763 | ||
| 764 | config PREEMPT_NOTIFIERS | 764 | config PREEMPT_NOTIFIERS |
| 765 | bool | 765 | bool |
| 766 | |||
| 767 | choice | ||
| 768 | prompt "RCU implementation type:" | ||
| 769 | default CLASSIC_RCU | ||
| 770 | |||
| 771 | config CLASSIC_RCU | ||
| 772 | bool "Classic RCU" | ||
| 773 | help | ||
| 774 | This option selects the classic RCU implementation that is | ||
| 775 | designed for best read-side performance on non-realtime | ||
| 776 | systems. | ||
| 777 | |||
| 778 | Say Y if you are unsure. | ||
| 779 | |||
| 780 | config PREEMPT_RCU | ||
| 781 | bool "Preemptible RCU" | ||
| 782 | depends on PREEMPT | ||
| 783 | help | ||
| 784 | This option reduces the latency of the kernel by making certain | ||
| 785 | RCU sections preemptible. Normally RCU code is non-preemptible; if | ||
| 786 | this option is selected, then read-only RCU sections become | ||
| 787 | preemptible. This helps latency, but may expose bugs due to | ||
| 788 | now-naive assumptions about each RCU read-side critical section | ||
| 789 | remaining on a given CPU through its execution. | ||
| 790 | |||
| 791 | Say N if you are unsure. | ||
| 792 | |||
| 793 | endchoice | ||
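The new Kconfig choice selects the RCU flavour; the PREEMPT_RCU help text warns about read-side code that silently assumes it cannot be preempted or migrated. Code that sticks to the documented read-side API stays correct under either implementation, roughly as in this hypothetical sketch (example_cfg and its users are invented for illustration):

#include <linux/rcupdate.h>

/* Hypothetical RCU-protected object -- illustration only. */
struct example_cfg {
        int threshold;
};

static struct example_cfg *example_cfg_ptr;

static int example_read_threshold(void)
{
        struct example_cfg *cfg;
        int val = 0;

        /* May be preempted with PREEMPT_RCU, never with CLASSIC_RCU. */
        rcu_read_lock();
        cfg = rcu_dereference(example_cfg_ptr);
        if (cfg)
                val = cfg->threshold;
        rcu_read_unlock();

        return val;
}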
diff --git a/init/main.c b/init/main.c index 80b04b6c5157..f287ca5862b9 100644 --- a/init/main.c +++ b/init/main.c | |||
| @@ -607,6 +607,7 @@ asmlinkage void __init start_kernel(void) | |||
| 607 | vfs_caches_init_early(); | 607 | vfs_caches_init_early(); |
| 608 | cpuset_init_early(); | 608 | cpuset_init_early(); |
| 609 | mem_init(); | 609 | mem_init(); |
| 610 | cpu_hotplug_init(); | ||
| 610 | kmem_cache_init(); | 611 | kmem_cache_init(); |
| 611 | setup_per_cpu_pageset(); | 612 | setup_per_cpu_pageset(); |
| 612 | numa_policy_init(); | 613 | numa_policy_init(); |
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 4af15802ccd4..526128a2e622 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz | |||
| @@ -54,3 +54,5 @@ config HZ | |||
| 54 | default 300 if HZ_300 | 54 | default 300 if HZ_300 |
| 55 | default 1000 if HZ_1000 | 55 | default 1000 if HZ_1000 |
| 56 | 56 | ||
| 57 | config SCHED_HRTICK | ||
| 58 | def_bool HIGH_RES_TIMERS && X86 | ||
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index c64ce9c14207..0669b70fa6a3 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt | |||
| @@ -52,14 +52,13 @@ config PREEMPT | |||
| 52 | 52 | ||
| 53 | endchoice | 53 | endchoice |
| 54 | 54 | ||
| 55 | config PREEMPT_BKL | 55 | config RCU_TRACE |
| 56 | bool "Preempt The Big Kernel Lock" | 56 | bool "Enable tracing for RCU - currently stats in debugfs" |
| 57 | depends on SMP || PREEMPT | 57 | select DEBUG_FS |
| 58 | default y | 58 | default y |
| 59 | help | 59 | help |
| 60 | This option reduces the latency of the kernel by making the | 60 | This option provides tracing for RCU, which presents stats |
| 61 | big kernel lock preemptible. | 61 | in debugfs for debugging the RCU implementation. |
| 62 | 62 | ||
| 63 | Say Y here if you are building a kernel for a desktop system. | 63 | Say Y here if you want to enable RCU tracing. |
| 64 | Say N if you are unsure. | 64 | Say N if you are unsure. |
| 65 | |||
diff --git a/kernel/Makefile b/kernel/Makefile index dfa96956dae0..390d42146267 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
| @@ -52,11 +52,17 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | |||
| 52 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 52 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
| 53 | obj-$(CONFIG_SECCOMP) += seccomp.o | 53 | obj-$(CONFIG_SECCOMP) += seccomp.o |
| 54 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 54 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
| 55 | obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o | ||
| 56 | obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o | ||
| 57 | ifeq ($(CONFIG_PREEMPT_RCU),y) | ||
| 58 | obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o | ||
| 59 | endif | ||
| 55 | obj-$(CONFIG_RELAY) += relay.o | 60 | obj-$(CONFIG_RELAY) += relay.o |
| 56 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | 61 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o |
| 57 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | 62 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o |
| 58 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o | 63 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o |
| 59 | obj-$(CONFIG_MARKERS) += marker.o | 64 | obj-$(CONFIG_MARKERS) += marker.o |
| 65 | obj-$(CONFIG_LATENCYTOP) += latencytop.o | ||
| 60 | 66 | ||
| 61 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) | 67 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) |
| 62 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 68 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 6b3a0c15144f..e0d3a4f56ecb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -15,9 +15,8 @@ | |||
| 15 | #include <linux/stop_machine.h> | 15 | #include <linux/stop_machine.h> |
| 16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
| 17 | 17 | ||
| 18 | /* This protects CPUs going up and down... */ | 18 | /* Serializes the updates to cpu_online_map, cpu_present_map */ |
| 19 | static DEFINE_MUTEX(cpu_add_remove_lock); | 19 | static DEFINE_MUTEX(cpu_add_remove_lock); |
| 20 | static DEFINE_MUTEX(cpu_bitmask_lock); | ||
| 21 | 20 | ||
| 22 | static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); | 21 | static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); |
| 23 | 22 | ||
| @@ -26,52 +25,123 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); | |||
| 26 | */ | 25 | */ |
| 27 | static int cpu_hotplug_disabled; | 26 | static int cpu_hotplug_disabled; |
| 28 | 27 | ||
| 29 | #ifdef CONFIG_HOTPLUG_CPU | 28 | static struct { |
| 29 | struct task_struct *active_writer; | ||
| 30 | struct mutex lock; /* Synchronizes accesses to refcount, */ | ||
| 31 | /* | ||
| 32 | * Also blocks the new readers during | ||
| 33 | * an ongoing cpu hotplug operation. | ||
| 34 | */ | ||
| 35 | int refcount; | ||
| 36 | wait_queue_head_t writer_queue; | ||
| 37 | } cpu_hotplug; | ||
| 30 | 38 | ||
| 31 | /* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ | 39 | #define writer_exists() (cpu_hotplug.active_writer != NULL) |
| 32 | static struct task_struct *recursive; | ||
| 33 | static int recursive_depth; | ||
| 34 | 40 | ||
| 35 | void lock_cpu_hotplug(void) | 41 | void __init cpu_hotplug_init(void) |
| 36 | { | 42 | { |
| 37 | struct task_struct *tsk = current; | 43 | cpu_hotplug.active_writer = NULL; |
| 38 | 44 | mutex_init(&cpu_hotplug.lock); | |
| 39 | if (tsk == recursive) { | 45 | cpu_hotplug.refcount = 0; |
| 40 | static int warnings = 10; | 46 | init_waitqueue_head(&cpu_hotplug.writer_queue); |
| 41 | if (warnings) { | 47 | } |
| 42 | printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n"); | 48 | |
| 43 | WARN_ON(1); | 49 | #ifdef CONFIG_HOTPLUG_CPU |
| 44 | warnings--; | 50 | |
| 45 | } | 51 | void get_online_cpus(void) |
| 46 | recursive_depth++; | 52 | { |
| 53 | might_sleep(); | ||
| 54 | if (cpu_hotplug.active_writer == current) | ||
| 47 | return; | 55 | return; |
| 48 | } | 56 | mutex_lock(&cpu_hotplug.lock); |
| 49 | mutex_lock(&cpu_bitmask_lock); | 57 | cpu_hotplug.refcount++; |
| 50 | recursive = tsk; | 58 | mutex_unlock(&cpu_hotplug.lock); |
| 59 | |||
| 51 | } | 60 | } |
| 52 | EXPORT_SYMBOL_GPL(lock_cpu_hotplug); | 61 | EXPORT_SYMBOL_GPL(get_online_cpus); |
| 53 | 62 | ||
| 54 | void unlock_cpu_hotplug(void) | 63 | void put_online_cpus(void) |
| 55 | { | 64 | { |
| 56 | WARN_ON(recursive != current); | 65 | if (cpu_hotplug.active_writer == current) |
| 57 | if (recursive_depth) { | ||
| 58 | recursive_depth--; | ||
| 59 | return; | 66 | return; |
| 60 | } | 67 | mutex_lock(&cpu_hotplug.lock); |
| 61 | recursive = NULL; | 68 | cpu_hotplug.refcount--; |
| 62 | mutex_unlock(&cpu_bitmask_lock); | 69 | |
| 70 | if (unlikely(writer_exists()) && !cpu_hotplug.refcount) | ||
| 71 | wake_up(&cpu_hotplug.writer_queue); | ||
| 72 | |||
| 73 | mutex_unlock(&cpu_hotplug.lock); | ||
| 74 | |||
| 63 | } | 75 | } |
| 64 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); | 76 | EXPORT_SYMBOL_GPL(put_online_cpus); |
| 65 | 77 | ||
| 66 | #endif /* CONFIG_HOTPLUG_CPU */ | 78 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 67 | 79 | ||
| 80 | /* | ||
| 81 | * The following two APIs must be used when attempting | ||
| 82 | * to serialize the updates to cpu_online_map, cpu_present_map. | ||
| 83 | */ | ||
| 84 | void cpu_maps_update_begin(void) | ||
| 85 | { | ||
| 86 | mutex_lock(&cpu_add_remove_lock); | ||
| 87 | } | ||
| 88 | |||
| 89 | void cpu_maps_update_done(void) | ||
| 90 | { | ||
| 91 | mutex_unlock(&cpu_add_remove_lock); | ||
| 92 | } | ||
| 93 | |||
| 94 | /* | ||
| 95 | * This ensures that the hotplug operation can begin only when the | ||
| 96 | * refcount goes to zero. | ||
| 97 | * | ||
| 98 | * Note that during a cpu-hotplug operation, the new readers, if any, | ||
| 99 | * will be blocked by the cpu_hotplug.lock | ||
| 100 | * | ||
| 101 | * Since cpu_hotplug_begin() is always called after invoking | ||
| 102 | * cpu_maps_update_begin(), we can be sure that only one writer is active. | ||
| 103 | * | ||
| 104 | * Note that theoretically, there is a possibility of a livelock: | ||
| 105 | * - Refcount goes to zero, last reader wakes up the sleeping | ||
| 106 | * writer. | ||
| 107 | * - Last reader unlocks the cpu_hotplug.lock. | ||
| 108 | * - A new reader arrives at this moment, bumps up the refcount. | ||
| 109 | * - The writer acquires the cpu_hotplug.lock, finds the refcount | ||
| 110 | * non-zero and goes to sleep again. | ||
| 111 | * | ||
| 112 | * However, this is very difficult to achieve in practice since | ||
| 113 | * get_online_cpus() is not an API that is called all that often. | ||
| 114 | * | ||
| 115 | */ | ||
| 116 | static void cpu_hotplug_begin(void) | ||
| 117 | { | ||
| 118 | DECLARE_WAITQUEUE(wait, current); | ||
| 119 | |||
| 120 | mutex_lock(&cpu_hotplug.lock); | ||
| 121 | |||
| 122 | cpu_hotplug.active_writer = current; | ||
| 123 | add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait); | ||
| 124 | while (cpu_hotplug.refcount) { | ||
| 125 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 126 | mutex_unlock(&cpu_hotplug.lock); | ||
| 127 | schedule(); | ||
| 128 | mutex_lock(&cpu_hotplug.lock); | ||
| 129 | } | ||
| 130 | remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait); | ||
| 131 | } | ||
| 132 | |||
| 133 | static void cpu_hotplug_done(void) | ||
| 134 | { | ||
| 135 | cpu_hotplug.active_writer = NULL; | ||
| 136 | mutex_unlock(&cpu_hotplug.lock); | ||
| 137 | } | ||
| 68 | /* Need to know about CPUs going up/down? */ | 138 | /* Need to know about CPUs going up/down? */ |
| 69 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) | 139 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) |
| 70 | { | 140 | { |
| 71 | int ret; | 141 | int ret; |
| 72 | mutex_lock(&cpu_add_remove_lock); | 142 | cpu_maps_update_begin(); |
| 73 | ret = raw_notifier_chain_register(&cpu_chain, nb); | 143 | ret = raw_notifier_chain_register(&cpu_chain, nb); |
| 74 | mutex_unlock(&cpu_add_remove_lock); | 144 | cpu_maps_update_done(); |
| 75 | return ret; | 145 | return ret; |
| 76 | } | 146 | } |
| 77 | 147 | ||
| @@ -81,9 +151,9 @@ EXPORT_SYMBOL(register_cpu_notifier); | |||
| 81 | 151 | ||
| 82 | void unregister_cpu_notifier(struct notifier_block *nb) | 152 | void unregister_cpu_notifier(struct notifier_block *nb) |
| 83 | { | 153 | { |
| 84 | mutex_lock(&cpu_add_remove_lock); | 154 | cpu_maps_update_begin(); |
| 85 | raw_notifier_chain_unregister(&cpu_chain, nb); | 155 | raw_notifier_chain_unregister(&cpu_chain, nb); |
| 86 | mutex_unlock(&cpu_add_remove_lock); | 156 | cpu_maps_update_done(); |
| 87 | } | 157 | } |
| 88 | EXPORT_SYMBOL(unregister_cpu_notifier); | 158 | EXPORT_SYMBOL(unregister_cpu_notifier); |
| 89 | 159 | ||
| @@ -147,7 +217,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 147 | if (!cpu_online(cpu)) | 217 | if (!cpu_online(cpu)) |
| 148 | return -EINVAL; | 218 | return -EINVAL; |
| 149 | 219 | ||
| 150 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); | 220 | cpu_hotplug_begin(); |
| 151 | err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, | 221 | err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, |
| 152 | hcpu, -1, &nr_calls); | 222 | hcpu, -1, &nr_calls); |
| 153 | if (err == NOTIFY_BAD) { | 223 | if (err == NOTIFY_BAD) { |
| @@ -166,9 +236,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 166 | cpu_clear(cpu, tmp); | 236 | cpu_clear(cpu, tmp); |
| 167 | set_cpus_allowed(current, tmp); | 237 | set_cpus_allowed(current, tmp); |
| 168 | 238 | ||
| 169 | mutex_lock(&cpu_bitmask_lock); | ||
| 170 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); | 239 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); |
| 171 | mutex_unlock(&cpu_bitmask_lock); | ||
| 172 | 240 | ||
| 173 | if (IS_ERR(p) || cpu_online(cpu)) { | 241 | if (IS_ERR(p) || cpu_online(cpu)) { |
| 174 | /* CPU didn't die: tell everyone. Can't complain. */ | 242 | /* CPU didn't die: tell everyone. Can't complain. */ |
| @@ -202,7 +270,7 @@ out_thread: | |||
| 202 | out_allowed: | 270 | out_allowed: |
| 203 | set_cpus_allowed(current, old_allowed); | 271 | set_cpus_allowed(current, old_allowed); |
| 204 | out_release: | 272 | out_release: |
| 205 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); | 273 | cpu_hotplug_done(); |
| 206 | return err; | 274 | return err; |
| 207 | } | 275 | } |
| 208 | 276 | ||
| @@ -210,13 +278,13 @@ int cpu_down(unsigned int cpu) | |||
| 210 | { | 278 | { |
| 211 | int err = 0; | 279 | int err = 0; |
| 212 | 280 | ||
| 213 | mutex_lock(&cpu_add_remove_lock); | 281 | cpu_maps_update_begin(); |
| 214 | if (cpu_hotplug_disabled) | 282 | if (cpu_hotplug_disabled) |
| 215 | err = -EBUSY; | 283 | err = -EBUSY; |
| 216 | else | 284 | else |
| 217 | err = _cpu_down(cpu, 0); | 285 | err = _cpu_down(cpu, 0); |
| 218 | 286 | ||
| 219 | mutex_unlock(&cpu_add_remove_lock); | 287 | cpu_maps_update_done(); |
| 220 | return err; | 288 | return err; |
| 221 | } | 289 | } |
| 222 | #endif /*CONFIG_HOTPLUG_CPU*/ | 290 | #endif /*CONFIG_HOTPLUG_CPU*/ |
| @@ -231,7 +299,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
| 231 | if (cpu_online(cpu) || !cpu_present(cpu)) | 299 | if (cpu_online(cpu) || !cpu_present(cpu)) |
| 232 | return -EINVAL; | 300 | return -EINVAL; |
| 233 | 301 | ||
| 234 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); | 302 | cpu_hotplug_begin(); |
| 235 | ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, | 303 | ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, |
| 236 | -1, &nr_calls); | 304 | -1, &nr_calls); |
| 237 | if (ret == NOTIFY_BAD) { | 305 | if (ret == NOTIFY_BAD) { |
| @@ -243,9 +311,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
| 243 | } | 311 | } |
| 244 | 312 | ||
| 245 | /* Arch-specific enabling code. */ | 313 | /* Arch-specific enabling code. */ |
| 246 | mutex_lock(&cpu_bitmask_lock); | ||
| 247 | ret = __cpu_up(cpu); | 314 | ret = __cpu_up(cpu); |
| 248 | mutex_unlock(&cpu_bitmask_lock); | ||
| 249 | if (ret != 0) | 315 | if (ret != 0) |
| 250 | goto out_notify; | 316 | goto out_notify; |
| 251 | BUG_ON(!cpu_online(cpu)); | 317 | BUG_ON(!cpu_online(cpu)); |
| @@ -257,7 +323,7 @@ out_notify: | |||
| 257 | if (ret != 0) | 323 | if (ret != 0) |
| 258 | __raw_notifier_call_chain(&cpu_chain, | 324 | __raw_notifier_call_chain(&cpu_chain, |
| 259 | CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); | 325 | CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); |
| 260 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); | 326 | cpu_hotplug_done(); |
| 261 | 327 | ||
| 262 | return ret; | 328 | return ret; |
| 263 | } | 329 | } |
| @@ -275,13 +341,13 @@ int __cpuinit cpu_up(unsigned int cpu) | |||
| 275 | return -EINVAL; | 341 | return -EINVAL; |
| 276 | } | 342 | } |
| 277 | 343 | ||
| 278 | mutex_lock(&cpu_add_remove_lock); | 344 | cpu_maps_update_begin(); |
| 279 | if (cpu_hotplug_disabled) | 345 | if (cpu_hotplug_disabled) |
| 280 | err = -EBUSY; | 346 | err = -EBUSY; |
| 281 | else | 347 | else |
| 282 | err = _cpu_up(cpu, 0); | 348 | err = _cpu_up(cpu, 0); |
| 283 | 349 | ||
| 284 | mutex_unlock(&cpu_add_remove_lock); | 350 | cpu_maps_update_done(); |
| 285 | return err; | 351 | return err; |
| 286 | } | 352 | } |
| 287 | 353 | ||
| @@ -292,7 +358,7 @@ int disable_nonboot_cpus(void) | |||
| 292 | { | 358 | { |
| 293 | int cpu, first_cpu, error = 0; | 359 | int cpu, first_cpu, error = 0; |
| 294 | 360 | ||
| 295 | mutex_lock(&cpu_add_remove_lock); | 361 | cpu_maps_update_begin(); |
| 296 | first_cpu = first_cpu(cpu_online_map); | 362 | first_cpu = first_cpu(cpu_online_map); |
| 297 | /* We take down all of the non-boot CPUs in one shot to avoid races | 363 | /* We take down all of the non-boot CPUs in one shot to avoid races |
| 298 | * with the userspace trying to use the CPU hotplug at the same time | 364 | * with the userspace trying to use the CPU hotplug at the same time |
| @@ -319,7 +385,7 @@ int disable_nonboot_cpus(void) | |||
| 319 | } else { | 385 | } else { |
| 320 | printk(KERN_ERR "Non-boot CPUs are not disabled\n"); | 386 | printk(KERN_ERR "Non-boot CPUs are not disabled\n"); |
| 321 | } | 387 | } |
| 322 | mutex_unlock(&cpu_add_remove_lock); | 388 | cpu_maps_update_done(); |
| 323 | return error; | 389 | return error; |
| 324 | } | 390 | } |
| 325 | 391 | ||
| @@ -328,7 +394,7 @@ void enable_nonboot_cpus(void) | |||
| 328 | int cpu, error; | 394 | int cpu, error; |
| 329 | 395 | ||
| 330 | /* Allow everyone to use the CPU hotplug again */ | 396 | /* Allow everyone to use the CPU hotplug again */ |
| 331 | mutex_lock(&cpu_add_remove_lock); | 397 | cpu_maps_update_begin(); |
| 332 | cpu_hotplug_disabled = 0; | 398 | cpu_hotplug_disabled = 0; |
| 333 | if (cpus_empty(frozen_cpus)) | 399 | if (cpus_empty(frozen_cpus)) |
| 334 | goto out; | 400 | goto out; |
| @@ -344,6 +410,6 @@ void enable_nonboot_cpus(void) | |||
| 344 | } | 410 | } |
| 345 | cpus_clear(frozen_cpus); | 411 | cpus_clear(frozen_cpus); |
| 346 | out: | 412 | out: |
| 347 | mutex_unlock(&cpu_add_remove_lock); | 413 | cpu_maps_update_done(); |
| 348 | } | 414 | } |
| 349 | #endif /* CONFIG_PM_SLEEP_SMP */ | 415 | #endif /* CONFIG_PM_SLEEP_SMP */ |
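kernel/cpu.c replaces the old lock_cpu_hotplug()/unlock_cpu_hotplug() pair with refcounted get_online_cpus()/put_online_cpus(). A minimal sketch of the reader side, assuming the matching declarations that the same series adds to <linux/cpu.h>; example_count_online() is an invented caller:

#include <linux/cpu.h>
#include <linux/cpumask.h>

/* Count online CPUs while holding off hotplug writers. */
static int example_count_online(void)
{
        int cpu, n = 0;

        get_online_cpus();      /* bumps cpu_hotplug.refcount */
        for_each_online_cpu(cpu)
                n++;
        put_online_cpus();      /* the last reader wakes a waiting writer */

        return n;
}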
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 50f5dc463688..cfaf6419d817 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | |||
| 537 | * | 537 | * |
| 538 | * Call with cgroup_mutex held. May take callback_mutex during | 538 | * Call with cgroup_mutex held. May take callback_mutex during |
| 539 | * call due to the kfifo_alloc() and kmalloc() calls. May nest | 539 | * call due to the kfifo_alloc() and kmalloc() calls. May nest |
| 540 | * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 540 | * a call to the get_online_cpus()/put_online_cpus() pair. |
| 541 | * Must not be called holding callback_mutex, because we must not | 541 | * Must not be called holding callback_mutex, because we must not |
| 542 | * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere | 542 | * call get_online_cpus() while holding callback_mutex. Elsewhere |
| 543 | * the kernel nests callback_mutex inside lock_cpu_hotplug() calls. | 543 | * the kernel nests callback_mutex inside get_online_cpus() calls. |
| 544 | * So the reverse nesting would risk an ABBA deadlock. | 544 | * So the reverse nesting would risk an ABBA deadlock. |
| 545 | * | 545 | * |
| 546 | * The three key local variables below are: | 546 | * The three key local variables below are: |
| @@ -691,9 +691,9 @@ restart: | |||
| 691 | 691 | ||
| 692 | rebuild: | 692 | rebuild: |
| 693 | /* Have scheduler rebuild sched domains */ | 693 | /* Have scheduler rebuild sched domains */ |
| 694 | lock_cpu_hotplug(); | 694 | get_online_cpus(); |
| 695 | partition_sched_domains(ndoms, doms); | 695 | partition_sched_domains(ndoms, doms); |
| 696 | unlock_cpu_hotplug(); | 696 | put_online_cpus(); |
| 697 | 697 | ||
| 698 | done: | 698 | done: |
| 699 | if (q && !IS_ERR(q)) | 699 | if (q && !IS_ERR(q)) |
| @@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create( | |||
| 1617 | * | 1617 | * |
| 1618 | * If the cpuset being removed has its flag 'sched_load_balance' | 1618 | * If the cpuset being removed has its flag 'sched_load_balance' |
| 1619 | * enabled, then simulate turning sched_load_balance off, which | 1619 | * enabled, then simulate turning sched_load_balance off, which |
| 1620 | * will call rebuild_sched_domains(). The lock_cpu_hotplug() | 1620 | * will call rebuild_sched_domains(). The get_online_cpus() |
| 1621 | * call in rebuild_sched_domains() must not be made while holding | 1621 | * call in rebuild_sched_domains() must not be made while holding |
| 1622 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside | 1622 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside |
| 1623 | * lock_cpu_hotplug() calls. So the reverse nesting would risk an | 1623 | * get_online_cpus() calls. So the reverse nesting would risk an |
| 1624 | * ABBA deadlock. | 1624 | * ABBA deadlock. |
| 1625 | */ | 1625 | */ |
| 1626 | 1626 | ||
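The cpuset.c comment updates restate the lock-ordering rule: elsewhere in the kernel callback_mutex is nested inside get_online_cpus(), so rebuild_sched_domains() must never be reached with callback_mutex held. A hypothetical sketch of the safe nesting; example_mutex merely stands in for cpuset's callback_mutex:

#include <linux/cpu.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(example_mutex);     /* stands in for callback_mutex */

/* Correct order: CPU-hotplug read side outside, the finer-grained mutex inside. */
static void example_correct_nesting(void)
{
        get_online_cpus();
        mutex_lock(&example_mutex);
        /* ... work that needs a stable cpu_online_map ... */
        mutex_unlock(&example_mutex);
        put_online_cpus();
}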
diff --git a/kernel/fork.c b/kernel/fork.c index 8dd8ff281009..39d22b3357de 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -1045,6 +1045,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1045 | copy_flags(clone_flags, p); | 1045 | copy_flags(clone_flags, p); |
| 1046 | INIT_LIST_HEAD(&p->children); | 1046 | INIT_LIST_HEAD(&p->children); |
| 1047 | INIT_LIST_HEAD(&p->sibling); | 1047 | INIT_LIST_HEAD(&p->sibling); |
| 1048 | #ifdef CONFIG_PREEMPT_RCU | ||
| 1049 | p->rcu_read_lock_nesting = 0; | ||
| 1050 | p->rcu_flipctr_idx = 0; | ||
| 1051 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | ||
| 1048 | p->vfork_done = NULL; | 1052 | p->vfork_done = NULL; |
| 1049 | spin_lock_init(&p->alloc_lock); | 1053 | spin_lock_init(&p->alloc_lock); |
| 1050 | 1054 | ||
| @@ -1059,6 +1063,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1059 | p->prev_utime = cputime_zero; | 1063 | p->prev_utime = cputime_zero; |
| 1060 | p->prev_stime = cputime_zero; | 1064 | p->prev_stime = cputime_zero; |
| 1061 | 1065 | ||
| 1066 | #ifdef CONFIG_DETECT_SOFTLOCKUP | ||
| 1067 | p->last_switch_count = 0; | ||
| 1068 | p->last_switch_timestamp = 0; | ||
| 1069 | #endif | ||
| 1070 | |||
| 1062 | #ifdef CONFIG_TASK_XACCT | 1071 | #ifdef CONFIG_TASK_XACCT |
| 1063 | p->rchar = 0; /* I/O counter: bytes read */ | 1072 | p->rchar = 0; /* I/O counter: bytes read */ |
| 1064 | p->wchar = 0; /* I/O counter: bytes written */ | 1073 | p->wchar = 0; /* I/O counter: bytes written */ |
| @@ -1196,6 +1205,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1196 | #ifdef TIF_SYSCALL_EMU | 1205 | #ifdef TIF_SYSCALL_EMU |
| 1197 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); | 1206 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); |
| 1198 | #endif | 1207 | #endif |
| 1208 | clear_all_latency_tracing(p); | ||
| 1199 | 1209 | ||
| 1200 | /* Our parent execution domain becomes current domain | 1210 | /* Our parent execution domain becomes current domain |
| 1201 | These must match for thread signalling to apply */ | 1211 | These must match for thread signalling to apply */ |
| @@ -1237,6 +1247,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1237 | * parent's CPU). This avoids a lot of nasty races. | 1247 | * parent's CPU). This avoids a lot of nasty races. |
| 1238 | */ | 1248 | */ |
| 1239 | p->cpus_allowed = current->cpus_allowed; | 1249 | p->cpus_allowed = current->cpus_allowed; |
| 1250 | p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; | ||
| 1240 | if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || | 1251 | if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || |
| 1241 | !cpu_online(task_cpu(p)))) | 1252 | !cpu_online(task_cpu(p)))) |
| 1242 | set_task_cpu(p, smp_processor_id()); | 1253 | set_task_cpu(p, smp_processor_id()); |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f994bb8065e6..bd5d6b5060bc 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
| @@ -325,6 +325,22 @@ unsigned long ktime_divns(const ktime_t kt, s64 div) | |||
| 325 | } | 325 | } |
| 326 | #endif /* BITS_PER_LONG >= 64 */ | 326 | #endif /* BITS_PER_LONG >= 64 */ |
| 327 | 327 | ||
| 328 | /* | ||
| 329 | * Check, whether the timer is on the callback pending list | ||
| 330 | */ | ||
| 331 | static inline int hrtimer_cb_pending(const struct hrtimer *timer) | ||
| 332 | { | ||
| 333 | return timer->state & HRTIMER_STATE_PENDING; | ||
| 334 | } | ||
| 335 | |||
| 336 | /* | ||
| 337 | * Remove a timer from the callback pending list | ||
| 338 | */ | ||
| 339 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) | ||
| 340 | { | ||
| 341 | list_del_init(&timer->cb_entry); | ||
| 342 | } | ||
| 343 | |||
| 328 | /* High resolution timer related functions */ | 344 | /* High resolution timer related functions */ |
| 329 | #ifdef CONFIG_HIGH_RES_TIMERS | 345 | #ifdef CONFIG_HIGH_RES_TIMERS |
| 330 | 346 | ||
| @@ -494,29 +510,12 @@ void hres_timers_resume(void) | |||
| 494 | } | 510 | } |
| 495 | 511 | ||
| 496 | /* | 512 | /* |
| 497 | * Check, whether the timer is on the callback pending list | ||
| 498 | */ | ||
| 499 | static inline int hrtimer_cb_pending(const struct hrtimer *timer) | ||
| 500 | { | ||
| 501 | return timer->state & HRTIMER_STATE_PENDING; | ||
| 502 | } | ||
| 503 | |||
| 504 | /* | ||
| 505 | * Remove a timer from the callback pending list | ||
| 506 | */ | ||
| 507 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) | ||
| 508 | { | ||
| 509 | list_del_init(&timer->cb_entry); | ||
| 510 | } | ||
| 511 | |||
| 512 | /* | ||
| 513 | * Initialize the high resolution related parts of cpu_base | 513 | * Initialize the high resolution related parts of cpu_base |
| 514 | */ | 514 | */ |
| 515 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | 515 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) |
| 516 | { | 516 | { |
| 517 | base->expires_next.tv64 = KTIME_MAX; | 517 | base->expires_next.tv64 = KTIME_MAX; |
| 518 | base->hres_active = 0; | 518 | base->hres_active = 0; |
| 519 | INIT_LIST_HEAD(&base->cb_pending); | ||
| 520 | } | 519 | } |
| 521 | 520 | ||
| 522 | /* | 521 | /* |
| @@ -524,7 +523,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | |||
| 524 | */ | 523 | */ |
| 525 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) | 524 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) |
| 526 | { | 525 | { |
| 527 | INIT_LIST_HEAD(&timer->cb_entry); | ||
| 528 | } | 526 | } |
| 529 | 527 | ||
| 530 | /* | 528 | /* |
| @@ -618,10 +616,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
| 618 | { | 616 | { |
| 619 | return 0; | 617 | return 0; |
| 620 | } | 618 | } |
| 621 | static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; } | ||
| 622 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { } | ||
| 623 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } | 619 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } |
| 624 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } | 620 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } |
| 621 | static inline int hrtimer_reprogram(struct hrtimer *timer, | ||
| 622 | struct hrtimer_clock_base *base) | ||
| 623 | { | ||
| 624 | return 0; | ||
| 625 | } | ||
| 625 | 626 | ||
| 626 | #endif /* CONFIG_HIGH_RES_TIMERS */ | 627 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
| 627 | 628 | ||
| @@ -1001,6 +1002,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
| 1001 | clock_id = CLOCK_MONOTONIC; | 1002 | clock_id = CLOCK_MONOTONIC; |
| 1002 | 1003 | ||
| 1003 | timer->base = &cpu_base->clock_base[clock_id]; | 1004 | timer->base = &cpu_base->clock_base[clock_id]; |
| 1005 | INIT_LIST_HEAD(&timer->cb_entry); | ||
| 1004 | hrtimer_init_timer_hres(timer); | 1006 | hrtimer_init_timer_hres(timer); |
| 1005 | 1007 | ||
| 1006 | #ifdef CONFIG_TIMER_STATS | 1008 | #ifdef CONFIG_TIMER_STATS |
| @@ -1030,6 +1032,85 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | |||
| 1030 | } | 1032 | } |
| 1031 | EXPORT_SYMBOL_GPL(hrtimer_get_res); | 1033 | EXPORT_SYMBOL_GPL(hrtimer_get_res); |
| 1032 | 1034 | ||
| 1035 | static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) | ||
| 1036 | { | ||
| 1037 | spin_lock_irq(&cpu_base->lock); | ||
| 1038 | |||
| 1039 | while (!list_empty(&cpu_base->cb_pending)) { | ||
| 1040 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
| 1041 | struct hrtimer *timer; | ||
| 1042 | int restart; | ||
| 1043 | |||
| 1044 | timer = list_entry(cpu_base->cb_pending.next, | ||
| 1045 | struct hrtimer, cb_entry); | ||
| 1046 | |||
| 1047 | timer_stats_account_hrtimer(timer); | ||
| 1048 | |||
| 1049 | fn = timer->function; | ||
| 1050 | __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); | ||
| 1051 | spin_unlock_irq(&cpu_base->lock); | ||
| 1052 | |||
| 1053 | restart = fn(timer); | ||
| 1054 | |||
| 1055 | spin_lock_irq(&cpu_base->lock); | ||
| 1056 | |||
| 1057 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
| 1058 | if (restart == HRTIMER_RESTART) { | ||
| 1059 | BUG_ON(hrtimer_active(timer)); | ||
| 1060 | /* | ||
| 1061 | * Enqueue the timer, allow reprogramming of the event | ||
| 1062 | * device | ||
| 1063 | */ | ||
| 1064 | enqueue_hrtimer(timer, timer->base, 1); | ||
| 1065 | } else if (hrtimer_active(timer)) { | ||
| 1066 | /* | ||
| 1067 | * If the timer was rearmed on another CPU, reprogram | ||
| 1068 | * the event device. | ||
| 1069 | */ | ||
| 1070 | if (timer->base->first == &timer->node) | ||
| 1071 | hrtimer_reprogram(timer, timer->base); | ||
| 1072 | } | ||
| 1073 | } | ||
| 1074 | spin_unlock_irq(&cpu_base->lock); | ||
| 1075 | } | ||
| 1076 | |||
| 1077 | static void __run_hrtimer(struct hrtimer *timer) | ||
| 1078 | { | ||
| 1079 | struct hrtimer_clock_base *base = timer->base; | ||
| 1080 | struct hrtimer_cpu_base *cpu_base = base->cpu_base; | ||
| 1081 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
| 1082 | int restart; | ||
| 1083 | |||
| 1084 | __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); | ||
| 1085 | timer_stats_account_hrtimer(timer); | ||
| 1086 | |||
| 1087 | fn = timer->function; | ||
| 1088 | if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { | ||
| 1089 | /* | ||
| 1090 | * Used for scheduler timers, avoid lock inversion with | ||
| 1091 | * rq->lock and tasklist_lock. | ||
| 1092 | * | ||
| 1093 | * These timers are required to deal with enqueue expiry | ||
| 1094 | * themselves and are not allowed to migrate. | ||
| 1095 | */ | ||
| 1096 | spin_unlock(&cpu_base->lock); | ||
| 1097 | restart = fn(timer); | ||
| 1098 | spin_lock(&cpu_base->lock); | ||
| 1099 | } else | ||
| 1100 | restart = fn(timer); | ||
| 1101 | |||
| 1102 | /* | ||
| 1103 | * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid | ||
| 1104 | * reprogramming of the event hardware. This happens at the end of this | ||
| 1105 | * function anyway. | ||
| 1106 | */ | ||
| 1107 | if (restart != HRTIMER_NORESTART) { | ||
| 1108 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); | ||
| 1109 | enqueue_hrtimer(timer, base, 0); | ||
| 1110 | } | ||
| 1111 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
| 1112 | } | ||
| 1113 | |||
| 1033 | #ifdef CONFIG_HIGH_RES_TIMERS | 1114 | #ifdef CONFIG_HIGH_RES_TIMERS |
| 1034 | 1115 | ||
| 1035 | /* | 1116 | /* |
| @@ -1087,21 +1168,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
| 1087 | continue; | 1168 | continue; |
| 1088 | } | 1169 | } |
| 1089 | 1170 | ||
| 1090 | __remove_hrtimer(timer, base, | 1171 | __run_hrtimer(timer); |
| 1091 | HRTIMER_STATE_CALLBACK, 0); | ||
| 1092 | timer_stats_account_hrtimer(timer); | ||
| 1093 | |||
| 1094 | /* | ||
| 1095 | * Note: We clear the CALLBACK bit after | ||
| 1096 | * enqueue_hrtimer to avoid reprogramming of | ||
| 1097 | * the event hardware. This happens at the end | ||
| 1098 | * of this function anyway. | ||
| 1099 | */ | ||
| 1100 | if (timer->function(timer) != HRTIMER_NORESTART) { | ||
| 1101 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); | ||
| 1102 | enqueue_hrtimer(timer, base, 0); | ||
| 1103 | } | ||
| 1104 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
| 1105 | } | 1172 | } |
| 1106 | spin_unlock(&cpu_base->lock); | 1173 | spin_unlock(&cpu_base->lock); |
| 1107 | base++; | 1174 | base++; |
| @@ -1122,52 +1189,41 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
| 1122 | 1189 | ||
| 1123 | static void run_hrtimer_softirq(struct softirq_action *h) | 1190 | static void run_hrtimer_softirq(struct softirq_action *h) |
| 1124 | { | 1191 | { |
| 1125 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1192 | run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); |
| 1126 | 1193 | } | |
| 1127 | spin_lock_irq(&cpu_base->lock); | ||
| 1128 | |||
| 1129 | while (!list_empty(&cpu_base->cb_pending)) { | ||
| 1130 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
| 1131 | struct hrtimer *timer; | ||
| 1132 | int restart; | ||
| 1133 | |||
| 1134 | timer = list_entry(cpu_base->cb_pending.next, | ||
| 1135 | struct hrtimer, cb_entry); | ||
| 1136 | 1194 | ||
| 1137 | timer_stats_account_hrtimer(timer); | 1195 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
| 1138 | 1196 | ||
| 1139 | fn = timer->function; | 1197 | /* |
| 1140 | __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); | 1198 | * Called from timer softirq every jiffy, expire hrtimers: |
| 1141 | spin_unlock_irq(&cpu_base->lock); | 1199 | * |
| 1200 | * For HRT its the fall back code to run the softirq in the timer | ||
| 1201 | * softirq context in case the hrtimer initialization failed or has | ||
| 1202 | * not been done yet. | ||
| 1203 | */ | ||
| 1204 | void hrtimer_run_pending(void) | ||
| 1205 | { | ||
| 1206 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
| 1142 | 1207 | ||
| 1143 | restart = fn(timer); | 1208 | if (hrtimer_hres_active()) |
| 1209 | return; | ||
| 1144 | 1210 | ||
| 1145 | spin_lock_irq(&cpu_base->lock); | 1211 | /* |
| 1212 | * This _is_ ugly: We have to check in the softirq context, | ||
| 1213 | * whether we can switch to highres and / or nohz mode. The | ||
| 1214 | * clocksource switch happens in the timer interrupt with | ||
| 1215 | * xtime_lock held. Notification from there only sets the | ||
| 1216 | * check bit in the tick_oneshot code, otherwise we might | ||
| 1217 | * deadlock vs. xtime_lock. | ||
| 1218 | */ | ||
| 1219 | if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) | ||
| 1220 | hrtimer_switch_to_hres(); | ||
| 1146 | 1221 | ||
| 1147 | timer->state &= ~HRTIMER_STATE_CALLBACK; | 1222 | run_hrtimer_pending(cpu_base); |
| 1148 | if (restart == HRTIMER_RESTART) { | ||
| 1149 | BUG_ON(hrtimer_active(timer)); | ||
| 1150 | /* | ||
| 1151 | * Enqueue the timer, allow reprogramming of the event | ||
| 1152 | * device | ||
| 1153 | */ | ||
| 1154 | enqueue_hrtimer(timer, timer->base, 1); | ||
| 1155 | } else if (hrtimer_active(timer)) { | ||
| 1156 | /* | ||
| 1157 | * If the timer was rearmed on another CPU, reprogram | ||
| 1158 | * the event device. | ||
| 1159 | */ | ||
| 1160 | if (timer->base->first == &timer->node) | ||
| 1161 | hrtimer_reprogram(timer, timer->base); | ||
| 1162 | } | ||
| 1163 | } | ||
| 1164 | spin_unlock_irq(&cpu_base->lock); | ||
| 1165 | } | 1223 | } |
| 1166 | 1224 | ||
| 1167 | #endif /* CONFIG_HIGH_RES_TIMERS */ | ||
| 1168 | |||
| 1169 | /* | 1225 | /* |
| 1170 | * Expire the per base hrtimer-queue: | 1226 | * Called from hardirq context every jiffy |
| 1171 | */ | 1227 | */ |
| 1172 | static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, | 1228 | static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, |
| 1173 | int index) | 1229 | int index) |
| @@ -1181,46 +1237,27 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, | |||
| 1181 | if (base->get_softirq_time) | 1237 | if (base->get_softirq_time) |
| 1182 | base->softirq_time = base->get_softirq_time(); | 1238 | base->softirq_time = base->get_softirq_time(); |
| 1183 | 1239 | ||
| 1184 | spin_lock_irq(&cpu_base->lock); | 1240 | spin_lock(&cpu_base->lock); |
| 1185 | 1241 | ||
| 1186 | while ((node = base->first)) { | 1242 | while ((node = base->first)) { |
| 1187 | struct hrtimer *timer; | 1243 | struct hrtimer *timer; |
| 1188 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
| 1189 | int restart; | ||
| 1190 | 1244 | ||
| 1191 | timer = rb_entry(node, struct hrtimer, node); | 1245 | timer = rb_entry(node, struct hrtimer, node); |
| 1192 | if (base->softirq_time.tv64 <= timer->expires.tv64) | 1246 | if (base->softirq_time.tv64 <= timer->expires.tv64) |
| 1193 | break; | 1247 | break; |
| 1194 | 1248 | ||
| 1195 | #ifdef CONFIG_HIGH_RES_TIMERS | 1249 | if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { |
| 1196 | WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ); | 1250 | __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0); |
| 1197 | #endif | 1251 | list_add_tail(&timer->cb_entry, |
| 1198 | timer_stats_account_hrtimer(timer); | 1252 | &base->cpu_base->cb_pending); |
| 1199 | 1253 | continue; | |
| 1200 | fn = timer->function; | ||
| 1201 | __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); | ||
| 1202 | spin_unlock_irq(&cpu_base->lock); | ||
| 1203 | |||
| 1204 | restart = fn(timer); | ||
| 1205 | |||
| 1206 | spin_lock_irq(&cpu_base->lock); | ||
| 1207 | |||
| 1208 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
| 1209 | if (restart != HRTIMER_NORESTART) { | ||
| 1210 | BUG_ON(hrtimer_active(timer)); | ||
| 1211 | enqueue_hrtimer(timer, base, 0); | ||
| 1212 | } | 1254 | } |
| 1255 | |||
| 1256 | __run_hrtimer(timer); | ||
| 1213 | } | 1257 | } |
| 1214 | spin_unlock_irq(&cpu_base->lock); | 1258 | spin_unlock(&cpu_base->lock); |
| 1215 | } | 1259 | } |
| 1216 | 1260 | ||
| 1217 | /* | ||
| 1218 | * Called from timer softirq every jiffy, expire hrtimers: | ||
| 1219 | * | ||
| 1220 | * For HRT its the fall back code to run the softirq in the timer | ||
| 1221 | * softirq context in case the hrtimer initialization failed or has | ||
| 1222 | * not been done yet. | ||
| 1223 | */ | ||
| 1224 | void hrtimer_run_queues(void) | 1261 | void hrtimer_run_queues(void) |
| 1225 | { | 1262 | { |
| 1226 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1263 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
| @@ -1229,18 +1266,6 @@ void hrtimer_run_queues(void) | |||
| 1229 | if (hrtimer_hres_active()) | 1266 | if (hrtimer_hres_active()) |
| 1230 | return; | 1267 | return; |
| 1231 | 1268 | ||
| 1232 | /* | ||
| 1233 | * This _is_ ugly: We have to check in the softirq context, | ||
| 1234 | * whether we can switch to highres and / or nohz mode. The | ||
| 1235 | * clocksource switch happens in the timer interrupt with | ||
| 1236 | * xtime_lock held. Notification from there only sets the | ||
| 1237 | * check bit in the tick_oneshot code, otherwise we might | ||
| 1238 | * deadlock vs. xtime_lock. | ||
| 1239 | */ | ||
| 1240 | if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) | ||
| 1241 | if (hrtimer_switch_to_hres()) | ||
| 1242 | return; | ||
| 1243 | |||
| 1244 | hrtimer_get_softirq_time(cpu_base); | 1269 | hrtimer_get_softirq_time(cpu_base); |
| 1245 | 1270 | ||
| 1246 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1271 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
| @@ -1268,7 +1293,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) | |||
| 1268 | sl->timer.function = hrtimer_wakeup; | 1293 | sl->timer.function = hrtimer_wakeup; |
| 1269 | sl->task = task; | 1294 | sl->task = task; |
| 1270 | #ifdef CONFIG_HIGH_RES_TIMERS | 1295 | #ifdef CONFIG_HIGH_RES_TIMERS |
| 1271 | sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; | 1296 | sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; |
| 1272 | #endif | 1297 | #endif |
| 1273 | } | 1298 | } |
| 1274 | 1299 | ||
| @@ -1279,6 +1304,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod | |||
| 1279 | do { | 1304 | do { |
| 1280 | set_current_state(TASK_INTERRUPTIBLE); | 1305 | set_current_state(TASK_INTERRUPTIBLE); |
| 1281 | hrtimer_start(&t->timer, t->timer.expires, mode); | 1306 | hrtimer_start(&t->timer, t->timer.expires, mode); |
| 1307 | if (!hrtimer_active(&t->timer)) | ||
| 1308 | t->task = NULL; | ||
| 1282 | 1309 | ||
| 1283 | if (likely(t->task)) | 1310 | if (likely(t->task)) |
| 1284 | schedule(); | 1311 | schedule(); |
| @@ -1389,6 +1416,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu) | |||
| 1389 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1416 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
| 1390 | cpu_base->clock_base[i].cpu_base = cpu_base; | 1417 | cpu_base->clock_base[i].cpu_base = cpu_base; |
| 1391 | 1418 | ||
| 1419 | INIT_LIST_HEAD(&cpu_base->cb_pending); | ||
| 1392 | hrtimer_init_hres(cpu_base); | 1420 | hrtimer_init_hres(cpu_base); |
| 1393 | } | 1421 | } |
| 1394 | 1422 | ||
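The hrtimer rework factors callback execution into __run_hrtimer() and run_hrtimer_pending(); whether a timer fires from the hrtimer interrupt, the softirq pending list, or the per-jiffy fallback depends on its cb_mode and on CONFIG_HIGH_RES_TIMERS. From the caller's side the setup is unchanged, roughly as in this sketch (example_timer and its callback are invented names):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_timer;

/* One-shot callback: do a little work and do not rearm. */
static enum hrtimer_restart example_timer_fn(struct hrtimer *t)
{
        /* ... lightweight work ... */
        return HRTIMER_NORESTART;
}

static void example_arm_timer(void)
{
        hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        example_timer.function = example_timer_fn;
        /* Fire roughly 10 ms from now. */
        hrtimer_start(&example_timer, ktime_set(0, 10 * NSEC_PER_MSEC),
                      HRTIMER_MODE_REL);
}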
diff --git a/kernel/kthread.c b/kernel/kthread.c index dcfe724300eb..0ac887882f90 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -15,6 +15,8 @@ | |||
| 15 | #include <linux/mutex.h> | 15 | #include <linux/mutex.h> |
| 16 | #include <asm/semaphore.h> | 16 | #include <asm/semaphore.h> |
| 17 | 17 | ||
| 18 | #define KTHREAD_NICE_LEVEL (-5) | ||
| 19 | |||
| 18 | static DEFINE_SPINLOCK(kthread_create_lock); | 20 | static DEFINE_SPINLOCK(kthread_create_lock); |
| 19 | static LIST_HEAD(kthread_create_list); | 21 | static LIST_HEAD(kthread_create_list); |
| 20 | struct task_struct *kthreadd_task; | 22 | struct task_struct *kthreadd_task; |
| @@ -94,10 +96,18 @@ static void create_kthread(struct kthread_create_info *create) | |||
| 94 | if (pid < 0) { | 96 | if (pid < 0) { |
| 95 | create->result = ERR_PTR(pid); | 97 | create->result = ERR_PTR(pid); |
| 96 | } else { | 98 | } else { |
| 99 | struct sched_param param = { .sched_priority = 0 }; | ||
| 97 | wait_for_completion(&create->started); | 100 | wait_for_completion(&create->started); |
| 98 | read_lock(&tasklist_lock); | 101 | read_lock(&tasklist_lock); |
| 99 | create->result = find_task_by_pid(pid); | 102 | create->result = find_task_by_pid(pid); |
| 100 | read_unlock(&tasklist_lock); | 103 | read_unlock(&tasklist_lock); |
| 104 | /* | ||
| 105 | * root may have changed our (kthreadd's) priority or CPU mask. | ||
| 106 | * The kernel thread should not inherit these properties. | ||
| 107 | */ | ||
| 108 | sched_setscheduler(create->result, SCHED_NORMAL, ¶m); | ||
| 109 | set_user_nice(create->result, KTHREAD_NICE_LEVEL); | ||
| 110 | set_cpus_allowed(create->result, CPU_MASK_ALL); | ||
| 101 | } | 111 | } |
| 102 | complete(&create->done); | 112 | complete(&create->done); |
| 103 | } | 113 | } |
| @@ -221,7 +231,7 @@ int kthreadd(void *unused) | |||
| 221 | /* Setup a clean context for our children to inherit. */ | 231 | /* Setup a clean context for our children to inherit. */ |
| 222 | set_task_comm(tsk, "kthreadd"); | 232 | set_task_comm(tsk, "kthreadd"); |
| 223 | ignore_signals(tsk); | 233 | ignore_signals(tsk); |
| 224 | set_user_nice(tsk, -5); | 234 | set_user_nice(tsk, KTHREAD_NICE_LEVEL); |
| 225 | set_cpus_allowed(tsk, CPU_MASK_ALL); | 235 | set_cpus_allowed(tsk, CPU_MASK_ALL); |
| 226 | 236 | ||
| 227 | current->flags |= PF_NOFREEZE; | 237 | current->flags |= PF_NOFREEZE; |
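The kthread.c change makes create_kthread() reset every new kernel thread to SCHED_NORMAL at KTHREAD_NICE_LEVEL, so it no longer inherits a priority or CPU mask that root may have applied to kthreadd. A hypothetical caller that wants different scheduling now sets it explicitly after creation, e.g.:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int example_thread_fn(void *data)
{
        while (!kthread_should_stop())
                schedule_timeout_interruptible(HZ);
        return 0;
}

static struct task_struct *example_start_worker(void)
{
        struct task_struct *tsk;

        tsk = kthread_run(example_thread_fn, NULL, "example_worker");
        if (IS_ERR(tsk))
                return tsk;

        /* Threads now start at KTHREAD_NICE_LEVEL; override only if needed. */
        set_user_nice(tsk, 0);
        return tsk;
}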
diff --git a/kernel/latencytop.c b/kernel/latencytop.c new file mode 100644 index 000000000000..b4e3c85abe74 --- /dev/null +++ b/kernel/latencytop.c | |||
| @@ -0,0 +1,239 @@ | |||
| 1 | /* | ||
| 2 | * latencytop.c: Latency display infrastructure | ||
| 3 | * | ||
| 4 | * (C) Copyright 2008 Intel Corporation | ||
| 5 | * Author: Arjan van de Ven <arjan@linux.intel.com> | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or | ||
| 8 | * modify it under the terms of the GNU General Public License | ||
| 9 | * as published by the Free Software Foundation; version 2 | ||
| 10 | * of the License. | ||
| 11 | */ | ||
| 12 | #include <linux/latencytop.h> | ||
| 13 | #include <linux/kallsyms.h> | ||
| 14 | #include <linux/seq_file.h> | ||
| 15 | #include <linux/notifier.h> | ||
| 16 | #include <linux/spinlock.h> | ||
| 17 | #include <linux/proc_fs.h> | ||
| 18 | #include <linux/module.h> | ||
| 19 | #include <linux/sched.h> | ||
| 20 | #include <linux/list.h> | ||
| 21 | #include <linux/slab.h> | ||
| 22 | #include <linux/stacktrace.h> | ||
| 23 | |||
| 24 | static DEFINE_SPINLOCK(latency_lock); | ||
| 25 | |||
| 26 | #define MAXLR 128 | ||
| 27 | static struct latency_record latency_record[MAXLR]; | ||
| 28 | |||
| 29 | int latencytop_enabled; | ||
| 30 | |||
| 31 | void clear_all_latency_tracing(struct task_struct *p) | ||
| 32 | { | ||
| 33 | unsigned long flags; | ||
| 34 | |||
| 35 | if (!latencytop_enabled) | ||
| 36 | return; | ||
| 37 | |||
| 38 | spin_lock_irqsave(&latency_lock, flags); | ||
| 39 | memset(&p->latency_record, 0, sizeof(p->latency_record)); | ||
| 40 | p->latency_record_count = 0; | ||
| 41 | spin_unlock_irqrestore(&latency_lock, flags); | ||
| 42 | } | ||
| 43 | |||
| 44 | static void clear_global_latency_tracing(void) | ||
| 45 | { | ||
| 46 | unsigned long flags; | ||
| 47 | |||
| 48 | spin_lock_irqsave(&latency_lock, flags); | ||
| 49 | memset(&latency_record, 0, sizeof(latency_record)); | ||
| 50 | spin_unlock_irqrestore(&latency_lock, flags); | ||
| 51 | } | ||
| 52 | |||
| 53 | static void __sched | ||
| 54 | account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) | ||
| 55 | { | ||
| 56 | int firstnonnull = MAXLR + 1; | ||
| 57 | int i; | ||
| 58 | |||
| 59 | if (!latencytop_enabled) | ||
| 60 | return; | ||
| 61 | |||
| 62 | /* skip kernel threads for now */ | ||
| 63 | if (!tsk->mm) | ||
| 64 | return; | ||
| 65 | |||
| 66 | for (i = 0; i < MAXLR; i++) { | ||
| 67 | int q; | ||
| 68 | int same = 1; | ||
| 69 | /* Nothing stored: */ | ||
| 70 | if (!latency_record[i].backtrace[0]) { | ||
| 71 | if (firstnonnull > i) | ||
| 72 | firstnonnull = i; | ||
| 73 | continue; | ||
| 74 | } | ||
| 75 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | ||
| 76 | if (latency_record[i].backtrace[q] != | ||
| 77 | lat->backtrace[q]) | ||
| 78 | same = 0; | ||
| 79 | if (same && lat->backtrace[q] == 0) | ||
| 80 | break; | ||
| 81 | if (same && lat->backtrace[q] == ULONG_MAX) | ||
| 82 | break; | ||
| 83 | } | ||
| 84 | if (same) { | ||
| 85 | latency_record[i].count++; | ||
| 86 | latency_record[i].time += lat->time; | ||
| 87 | if (lat->time > latency_record[i].max) | ||
| 88 | latency_record[i].max = lat->time; | ||
| 89 | return; | ||
| 90 | } | ||
| 91 | } | ||
| 92 | |||
| 93 | i = firstnonnull; | ||
| 94 | if (i >= MAXLR - 1) | ||
| 95 | return; | ||
| 96 | |||
| 97 | /* Allocated a new one: */ | ||
| 98 | memcpy(&latency_record[i], lat, sizeof(struct latency_record)); | ||
| 99 | } | ||
| 100 | |||
| 101 | static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat) | ||
| 102 | { | ||
| 103 | struct stack_trace trace; | ||
| 104 | |||
| 105 | memset(&trace, 0, sizeof(trace)); | ||
| 106 | trace.max_entries = LT_BACKTRACEDEPTH; | ||
| 107 | trace.entries = &lat->backtrace[0]; | ||
| 108 | trace.skip = 0; | ||
| 109 | save_stack_trace_tsk(tsk, &trace); | ||
| 110 | } | ||
| 111 | |||
| 112 | void __sched | ||
| 113 | account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | ||
| 114 | { | ||
| 115 | unsigned long flags; | ||
| 116 | int i, q; | ||
| 117 | struct latency_record lat; | ||
| 118 | |||
| 119 | if (!latencytop_enabled) | ||
| 120 | return; | ||
| 121 | |||
| 122 | /* Long interruptible waits are generally user requested... */ | ||
| 123 | if (inter && usecs > 5000) | ||
| 124 | return; | ||
| 125 | |||
| 126 | memset(&lat, 0, sizeof(lat)); | ||
| 127 | lat.count = 1; | ||
| 128 | lat.time = usecs; | ||
| 129 | lat.max = usecs; | ||
| 130 | store_stacktrace(tsk, &lat); | ||
| 131 | |||
| 132 | spin_lock_irqsave(&latency_lock, flags); | ||
| 133 | |||
| 134 | account_global_scheduler_latency(tsk, &lat); | ||
| 135 | |||
| 136 | /* | ||
| 137 | * short term hack; if we're > 32 we stop; in the future we recycle: | ||
| 138 | */ | ||
| 139 | tsk->latency_record_count++; | ||
| 140 | if (tsk->latency_record_count >= LT_SAVECOUNT) | ||
| 141 | goto out_unlock; | ||
| 142 | |||
| 143 | for (i = 0; i < LT_SAVECOUNT ; i++) { | ||
| 144 | struct latency_record *mylat; | ||
| 145 | int same = 1; | ||
| 146 | mylat = &tsk->latency_record[i]; | ||
| 147 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | ||
| 148 | if (mylat->backtrace[q] != | ||
| 149 | lat.backtrace[q]) | ||
| 150 | same = 0; | ||
| 151 | if (same && lat.backtrace[q] == 0) | ||
| 152 | break; | ||
| 153 | if (same && lat.backtrace[q] == ULONG_MAX) | ||
| 154 | break; | ||
| 155 | } | ||
| 156 | if (same) { | ||
| 157 | mylat->count++; | ||
| 158 | mylat->time += lat.time; | ||
| 159 | if (lat.time > mylat->max) | ||
| 160 | mylat->max = lat.time; | ||
| 161 | goto out_unlock; | ||
| 162 | } | ||
| 163 | } | ||
| 164 | |||
| 165 | /* Allocated a new one: */ | ||
| 166 | i = tsk->latency_record_count; | ||
| 167 | memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); | ||
| 168 | |||
| 169 | out_unlock: | ||
| 170 | spin_unlock_irqrestore(&latency_lock, flags); | ||
| 171 | } | ||
| 172 | |||
| 173 | static int lstats_show(struct seq_file *m, void *v) | ||
| 174 | { | ||
| 175 | int i; | ||
| 176 | |||
| 177 | seq_puts(m, "Latency Top version : v0.1\n"); | ||
| 178 | |||
| 179 | for (i = 0; i < MAXLR; i++) { | ||
| 180 | if (latency_record[i].backtrace[0]) { | ||
| 181 | int q; | ||
| 182 | seq_printf(m, "%i %li %li ", | ||
| 183 | latency_record[i].count, | ||
| 184 | latency_record[i].time, | ||
| 185 | latency_record[i].max); | ||
| 186 | for (q = 0; q < LT_BACKTRACEDEPTH; q++) { | ||
| 187 | char sym[KSYM_NAME_LEN]; | ||
| 188 | char *c; | ||
| 189 | if (!latency_record[i].backtrace[q]) | ||
| 190 | break; | ||
| 191 | if (latency_record[i].backtrace[q] == ULONG_MAX) | ||
| 192 | break; | ||
| 193 | sprint_symbol(sym, latency_record[i].backtrace[q]); | ||
| 194 | c = strchr(sym, '+'); | ||
| 195 | if (c) | ||
| 196 | *c = 0; | ||
| 197 | seq_printf(m, "%s ", sym); | ||
| 198 | } | ||
| 199 | seq_printf(m, "\n"); | ||
| 200 | } | ||
| 201 | } | ||
| 202 | return 0; | ||
| 203 | } | ||
| 204 | |||
| 205 | static ssize_t | ||
| 206 | lstats_write(struct file *file, const char __user *buf, size_t count, | ||
| 207 | loff_t *offs) | ||
| 208 | { | ||
| 209 | clear_global_latency_tracing(); | ||
| 210 | |||
| 211 | return count; | ||
| 212 | } | ||
| 213 | |||
| 214 | static int lstats_open(struct inode *inode, struct file *filp) | ||
| 215 | { | ||
| 216 | return single_open(filp, lstats_show, NULL); | ||
| 217 | } | ||
| 218 | |||
| 219 | static struct file_operations lstats_fops = { | ||
| 220 | .open = lstats_open, | ||
| 221 | .read = seq_read, | ||
| 222 | .write = lstats_write, | ||
| 223 | .llseek = seq_lseek, | ||
| 224 | .release = single_release, | ||
| 225 | }; | ||
| 226 | |||
| 227 | static int __init init_lstats_procfs(void) | ||
| 228 | { | ||
| 229 | struct proc_dir_entry *pe; | ||
| 230 | |||
| 231 | pe = create_proc_entry("latency_stats", 0644, NULL); | ||
| 232 | if (!pe) | ||
| 233 | return -ENOMEM; | ||
| 234 | |||
| 235 | pe->proc_fops = &lstats_fops; | ||
| 236 | |||
| 237 | return 0; | ||
| 238 | } | ||
| 239 | __initcall(init_lstats_procfs); | ||
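The new kernel/latencytop.c registers /proc/latency_stats: reading it dumps the global records in the format produced by lstats_show(), and any write clears them via lstats_write(); recording itself is gated by latencytop_enabled, which is switched on outside this hunk. A userspace sketch (plain C, not kernel code) that dumps and then clears the file; error handling is kept minimal:

#include <stdio.h>

int main(void)
{
        char line[4096];
        FILE *f = fopen("/proc/latency_stats", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);

        /* Writing anything clears the global records. */
        f = fopen("/proc/latency_stats", "w");
        if (f) {
                fputs("clear\n", f);
                fclose(f);
        }
        return 0;
}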
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index e2c07ece367d..3574379f4d62 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
| @@ -3206,7 +3206,11 @@ retry: | |||
| 3206 | 3206 | ||
| 3207 | EXPORT_SYMBOL_GPL(debug_show_all_locks); | 3207 | EXPORT_SYMBOL_GPL(debug_show_all_locks); |
| 3208 | 3208 | ||
| 3209 | void debug_show_held_locks(struct task_struct *task) | 3209 | /* |
| 3210 | * Careful: only use this function if you are sure that | ||
| 3211 | * the task cannot run in parallel! | ||
| 3212 | */ | ||
| 3213 | void __debug_show_held_locks(struct task_struct *task) | ||
| 3210 | { | 3214 | { |
| 3211 | if (unlikely(!debug_locks)) { | 3215 | if (unlikely(!debug_locks)) { |
| 3212 | printk("INFO: lockdep is turned off.\n"); | 3216 | printk("INFO: lockdep is turned off.\n"); |
| @@ -3214,6 +3218,12 @@ void debug_show_held_locks(struct task_struct *task) | |||
| 3214 | } | 3218 | } |
| 3215 | lockdep_print_held_locks(task); | 3219 | lockdep_print_held_locks(task); |
| 3216 | } | 3220 | } |
| 3221 | EXPORT_SYMBOL_GPL(__debug_show_held_locks); | ||
| 3222 | |||
| 3223 | void debug_show_held_locks(struct task_struct *task) | ||
| 3224 | { | ||
| 3225 | __debug_show_held_locks(task); | ||
| 3226 | } | ||
| 3217 | 3227 | ||
| 3218 | EXPORT_SYMBOL_GPL(debug_show_held_locks); | 3228 | EXPORT_SYMBOL_GPL(debug_show_held_locks); |
| 3219 | 3229 | ||
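The comment added above is the point of this split: lockdep_print_held_locks() walks the task's held-lock list without any serialization against the task itself, so __debug_show_held_locks() may only be called when the caller can guarantee the task is not running concurrently (for example, when reporting on the current task). debug_show_held_locks() keeps the old name as a thin wrapper. A hedged sketch of the intended call pattern follows; the reporting function and its context are illustrative, not part of this patch.

/* Illustrative caller: report on a task we know cannot run in parallel,
 * e.g. because it is current or has been stopped by the caller. */
static void report_stuck_task(struct task_struct *t)
{
	printk(KERN_ERR "INFO: task %s:%d appears stuck\n", t->comm, t->pid);
	__debug_show_held_locks(t);
}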
diff --git a/kernel/module.c b/kernel/module.c index dcb8a2cbf75e..1bb4c5e0d56e 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -496,6 +496,8 @@ static struct module_attribute modinfo_##field = { \ | |||
| 496 | MODINFO_ATTR(version); | 496 | MODINFO_ATTR(version); |
| 497 | MODINFO_ATTR(srcversion); | 497 | MODINFO_ATTR(srcversion); |
| 498 | 498 | ||
| 499 | static char last_unloaded_module[MODULE_NAME_LEN+1]; | ||
| 500 | |||
| 499 | #ifdef CONFIG_MODULE_UNLOAD | 501 | #ifdef CONFIG_MODULE_UNLOAD |
| 500 | /* Init the unload section of the module. */ | 502 | /* Init the unload section of the module. */ |
| 501 | static void module_unload_init(struct module *mod) | 503 | static void module_unload_init(struct module *mod) |
| @@ -719,6 +721,8 @@ sys_delete_module(const char __user *name_user, unsigned int flags) | |||
| 719 | mod->exit(); | 721 | mod->exit(); |
| 720 | mutex_lock(&module_mutex); | 722 | mutex_lock(&module_mutex); |
| 721 | } | 723 | } |
| 724 | /* Store the name of the last unloaded module for diagnostic purposes */ | ||
| 725 | sprintf(last_unloaded_module, mod->name); | ||
| 722 | free_module(mod); | 726 | free_module(mod); |
| 723 | 727 | ||
| 724 | out: | 728 | out: |
| @@ -2357,21 +2361,30 @@ static void m_stop(struct seq_file *m, void *p) | |||
| 2357 | mutex_unlock(&module_mutex); | 2361 | mutex_unlock(&module_mutex); |
| 2358 | } | 2362 | } |
| 2359 | 2363 | ||
| 2360 | static char *taint_flags(unsigned int taints, char *buf) | 2364 | static char *module_flags(struct module *mod, char *buf) |
| 2361 | { | 2365 | { |
| 2362 | int bx = 0; | 2366 | int bx = 0; |
| 2363 | 2367 | ||
| 2364 | if (taints) { | 2368 | if (mod->taints || |
| 2369 | mod->state == MODULE_STATE_GOING || | ||
| 2370 | mod->state == MODULE_STATE_COMING) { | ||
| 2365 | buf[bx++] = '('; | 2371 | buf[bx++] = '('; |
| 2366 | if (taints & TAINT_PROPRIETARY_MODULE) | 2372 | if (mod->taints & TAINT_PROPRIETARY_MODULE) |
| 2367 | buf[bx++] = 'P'; | 2373 | buf[bx++] = 'P'; |
| 2368 | if (taints & TAINT_FORCED_MODULE) | 2374 | if (mod->taints & TAINT_FORCED_MODULE) |
| 2369 | buf[bx++] = 'F'; | 2375 | buf[bx++] = 'F'; |
| 2370 | /* | 2376 | /* |
| 2371 | * TAINT_FORCED_RMMOD: could be added. | 2377 | * TAINT_FORCED_RMMOD: could be added. |
| 2372 | * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't | 2378 | * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't |
| 2373 | * apply to modules. | 2379 | * apply to modules. |
| 2374 | */ | 2380 | */ |
| 2381 | |||
| 2382 | /* Show a - for module-is-being-unloaded */ | ||
| 2383 | if (mod->state == MODULE_STATE_GOING) | ||
| 2384 | buf[bx++] = '-'; | ||
| 2385 | /* Show a + for module-is-being-loaded */ | ||
| 2386 | if (mod->state == MODULE_STATE_COMING) | ||
| 2387 | buf[bx++] = '+'; | ||
| 2375 | buf[bx++] = ')'; | 2388 | buf[bx++] = ')'; |
| 2376 | } | 2389 | } |
| 2377 | buf[bx] = '\0'; | 2390 | buf[bx] = '\0'; |
| @@ -2398,7 +2411,7 @@ static int m_show(struct seq_file *m, void *p) | |||
| 2398 | 2411 | ||
| 2399 | /* Taints info */ | 2412 | /* Taints info */ |
| 2400 | if (mod->taints) | 2413 | if (mod->taints) |
| 2401 | seq_printf(m, " %s", taint_flags(mod->taints, buf)); | 2414 | seq_printf(m, " %s", module_flags(mod, buf)); |
| 2402 | 2415 | ||
| 2403 | seq_printf(m, "\n"); | 2416 | seq_printf(m, "\n"); |
| 2404 | return 0; | 2417 | return 0; |
| @@ -2493,7 +2506,9 @@ void print_modules(void) | |||
| 2493 | 2506 | ||
| 2494 | printk("Modules linked in:"); | 2507 | printk("Modules linked in:"); |
| 2495 | list_for_each_entry(mod, &modules, list) | 2508 | list_for_each_entry(mod, &modules, list) |
| 2496 | printk(" %s%s", mod->name, taint_flags(mod->taints, buf)); | 2509 | printk(" %s%s", mod->name, module_flags(mod, buf)); |
| 2510 | if (last_unloaded_module[0]) | ||
| 2511 | printk(" [last unloaded: %s]", last_unloaded_module); | ||
| 2497 | printk("\n"); | 2512 | printk("\n"); |
| 2498 | } | 2513 | } |
| 2499 | 2514 | ||
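With module_flags() replacing taint_flags(), the parenthesized marker after a module name in the oops "Modules linked in:" line (and, for tainted modules, in /proc/modules) can now also describe load state: 'P' and 'F' keep their taint meanings, '-' marks a module that is being unloaded and '+' one that is still loading, and print_modules() appends the name of the most recently unloaded module when one has been recorded. As an illustration only (the module names are made up), an oops header could end up looking like:

Modules linked in: foo(P) bar(+) baz [last unloaded: qux]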
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 68c96376e84a..0b7c82ac467e 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
| @@ -967,6 +967,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
| 967 | { | 967 | { |
| 968 | int maxfire; | 968 | int maxfire; |
| 969 | struct list_head *timers = tsk->cpu_timers; | 969 | struct list_head *timers = tsk->cpu_timers; |
| 970 | struct signal_struct *const sig = tsk->signal; | ||
| 970 | 971 | ||
| 971 | maxfire = 20; | 972 | maxfire = 20; |
| 972 | tsk->it_prof_expires = cputime_zero; | 973 | tsk->it_prof_expires = cputime_zero; |
| @@ -1011,6 +1012,35 @@ static void check_thread_timers(struct task_struct *tsk, | |||
| 1011 | t->firing = 1; | 1012 | t->firing = 1; |
| 1012 | list_move_tail(&t->entry, firing); | 1013 | list_move_tail(&t->entry, firing); |
| 1013 | } | 1014 | } |
| 1015 | |||
| 1016 | /* | ||
| 1017 | * Check for the special case thread timers. | ||
| 1018 | */ | ||
| 1019 | if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { | ||
| 1020 | unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; | ||
| 1021 | unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; | ||
| 1022 | |||
| 1023 | if (hard != RLIM_INFINITY && | ||
| 1024 | tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { | ||
| 1025 | /* | ||
| 1026 | * At the hard limit, we just die. | ||
| 1027 | * No need to calculate anything else now. | ||
| 1028 | */ | ||
| 1029 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | ||
| 1030 | return; | ||
| 1031 | } | ||
| 1032 | if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { | ||
| 1033 | /* | ||
| 1034 | * At the soft limit, send a SIGXCPU every second. | ||
| 1035 | */ | ||
| 1036 | if (sig->rlim[RLIMIT_RTTIME].rlim_cur | ||
| 1037 | < sig->rlim[RLIMIT_RTTIME].rlim_max) { | ||
| 1038 | sig->rlim[RLIMIT_RTTIME].rlim_cur += | ||
| 1039 | USEC_PER_SEC; | ||
| 1040 | } | ||
| 1041 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); | ||
| 1042 | } | ||
| 1043 | } | ||
| 1014 | } | 1044 | } |
| 1015 | 1045 | ||
| 1016 | /* | 1046 | /* |
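The block added above implements the kernel side of the new RLIMIT_RTTIME watchdog: tsk->rt.timeout (maintained by the scheduler's RT watchdog elsewhere in this series) counts timer ticks of CPU time consumed by a realtime task, the limits themselves are specified in microseconds, exceeding the soft limit raises SIGXCPU (and the soft limit is then bumped by one second, so the signal repeats roughly once per second), and exceeding the hard limit delivers SIGKILL. A minimal user-space sketch that opts an RT task into the watchdog is shown below; the numbers are illustrative, and on older C libraries the RLIMIT_RTTIME constant may need to be defined by hand.

#include <sys/resource.h>
#include <sched.h>

static int arm_rt_watchdog(void)
{
	struct sched_param sp = { .sched_priority = 10 };
	struct rlimit rl = {
		.rlim_cur = 500000,	/* soft limit: SIGXCPU after 0.5s of RT CPU time */
		.rlim_max = 2000000,	/* hard limit: SIGKILL after 2s */
	};

	if (sched_setscheduler(0, SCHED_FIFO, &sp))
		return -1;
	return setrlimit(RLIMIT_RTTIME, &rl);
}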
diff --git a/kernel/printk.c b/kernel/printk.c index 89011bf8c106..423a8c765a57 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -573,11 +573,6 @@ static int __init printk_time_setup(char *str) | |||
| 573 | 573 | ||
| 574 | __setup("time", printk_time_setup); | 574 | __setup("time", printk_time_setup); |
| 575 | 575 | ||
| 576 | __attribute__((weak)) unsigned long long printk_clock(void) | ||
| 577 | { | ||
| 578 | return sched_clock(); | ||
| 579 | } | ||
| 580 | |||
| 581 | /* Check if we have any console registered that can be called early in boot. */ | 576 | /* Check if we have any console registered that can be called early in boot. */ |
| 582 | static int have_callable_console(void) | 577 | static int have_callable_console(void) |
| 583 | { | 578 | { |
| @@ -628,30 +623,57 @@ asmlinkage int printk(const char *fmt, ...) | |||
| 628 | /* cpu currently holding logbuf_lock */ | 623 | /* cpu currently holding logbuf_lock */ |
| 629 | static volatile unsigned int printk_cpu = UINT_MAX; | 624 | static volatile unsigned int printk_cpu = UINT_MAX; |
| 630 | 625 | ||
| 626 | const char printk_recursion_bug_msg [] = | ||
| 627 | KERN_CRIT "BUG: recent printk recursion!\n"; | ||
| 628 | static int printk_recursion_bug; | ||
| 629 | |||
| 631 | asmlinkage int vprintk(const char *fmt, va_list args) | 630 | asmlinkage int vprintk(const char *fmt, va_list args) |
| 632 | { | 631 | { |
| 632 | static int log_level_unknown = 1; | ||
| 633 | static char printk_buf[1024]; | ||
| 634 | |||
| 633 | unsigned long flags; | 635 | unsigned long flags; |
| 634 | int printed_len; | 636 | int printed_len = 0; |
| 637 | int this_cpu; | ||
| 635 | char *p; | 638 | char *p; |
| 636 | static char printk_buf[1024]; | ||
| 637 | static int log_level_unknown = 1; | ||
| 638 | 639 | ||
| 639 | boot_delay_msec(); | 640 | boot_delay_msec(); |
| 640 | 641 | ||
| 641 | preempt_disable(); | 642 | preempt_disable(); |
| 642 | if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) | ||
| 643 | /* If a crash is occurring during printk() on this CPU, | ||
| 644 | * make sure we can't deadlock */ | ||
| 645 | zap_locks(); | ||
| 646 | |||
| 647 | /* This stops the holder of console_sem just where we want him */ | 643 | /* This stops the holder of console_sem just where we want him */ |
| 648 | raw_local_irq_save(flags); | 644 | raw_local_irq_save(flags); |
| 645 | this_cpu = smp_processor_id(); | ||
| 646 | |||
| 647 | /* | ||
| 648 | * Ouch, printk recursed into itself! | ||
| 649 | */ | ||
| 650 | if (unlikely(printk_cpu == this_cpu)) { | ||
| 651 | /* | ||
| 652 | * If a crash is occurring during printk() on this CPU, | ||
| 653 | * then try to get the crash message out but make sure | ||
| 654 | * we can't deadlock. Otherwise just return to avoid the | ||
| 655 | * recursion - but flag the recursion so that | ||
| 656 | * it can be printed at the next appropriate moment: | ||
| 657 | */ | ||
| 658 | if (!oops_in_progress) { | ||
| 659 | printk_recursion_bug = 1; | ||
| 660 | goto out_restore_irqs; | ||
| 661 | } | ||
| 662 | zap_locks(); | ||
| 663 | } | ||
| 664 | |||
| 649 | lockdep_off(); | 665 | lockdep_off(); |
| 650 | spin_lock(&logbuf_lock); | 666 | spin_lock(&logbuf_lock); |
| 651 | printk_cpu = smp_processor_id(); | 667 | printk_cpu = this_cpu; |
| 652 | 668 | ||
| 669 | if (printk_recursion_bug) { | ||
| 670 | printk_recursion_bug = 0; | ||
| 671 | strcpy(printk_buf, printk_recursion_bug_msg); | ||
| 672 | printed_len = sizeof(printk_recursion_bug_msg); | ||
| 673 | } | ||
| 653 | /* Emit the output into the temporary buffer */ | 674 | /* Emit the output into the temporary buffer */ |
| 654 | printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); | 675 | printed_len += vscnprintf(printk_buf + printed_len, |
| 676 | sizeof(printk_buf), fmt, args); | ||
| 655 | 677 | ||
| 656 | /* | 678 | /* |
| 657 | * Copy the output into log_buf. If the caller didn't provide | 679 | * Copy the output into log_buf. If the caller didn't provide |
| @@ -680,7 +702,9 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 680 | loglev_char = default_message_loglevel | 702 | loglev_char = default_message_loglevel |
| 681 | + '0'; | 703 | + '0'; |
| 682 | } | 704 | } |
| 683 | t = printk_clock(); | 705 | t = 0; |
| 706 | if (system_state != SYSTEM_BOOTING) | ||
| 707 | t = ktime_to_ns(ktime_get()); | ||
| 684 | nanosec_rem = do_div(t, 1000000000); | 708 | nanosec_rem = do_div(t, 1000000000); |
| 685 | tlen = sprintf(tbuf, | 709 | tlen = sprintf(tbuf, |
| 686 | "<%c>[%5lu.%06lu] ", | 710 | "<%c>[%5lu.%06lu] ", |
| @@ -744,6 +768,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 744 | printk_cpu = UINT_MAX; | 768 | printk_cpu = UINT_MAX; |
| 745 | spin_unlock(&logbuf_lock); | 769 | spin_unlock(&logbuf_lock); |
| 746 | lockdep_on(); | 770 | lockdep_on(); |
| 771 | out_restore_irqs: | ||
| 747 | raw_local_irq_restore(flags); | 772 | raw_local_irq_restore(flags); |
| 748 | } | 773 | } |
| 749 | 774 | ||
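Two behavioural changes are folded into the printk() hunk above. Timestamps now come from ktime_get() once the system has finished booting, which lets the weak printk_clock() hook go away. And printk() recursion on the CPU that already holds logbuf_lock is now detected explicitly: if an oops is in progress the locks are still zapped so the crash message gets out, otherwise vprintk() just sets printk_recursion_bug and returns, and the next regular call prepends the "BUG: recent printk recursion!" notice. Stripped of the locking details, the deferral pattern looks roughly like the single-threaded sketch below (the names are illustrative, not kernel APIs):

#include <stdio.h>

static int in_logger;		/* stands in for "this CPU owns logbuf_lock" */
static int recursion_seen;	/* stands in for printk_recursion_bug */

static void log_line(const char *msg)
{
	if (in_logger) {
		recursion_seen = 1;	/* flag it, report on the next outer call */
		return;
	}
	in_logger = 1;
	if (recursion_seen) {
		recursion_seen = 0;
		fputs("BUG: recent logger recursion!\n", stderr);
	}
	fputs(msg, stderr);
	in_logger = 0;
}

int main(void)
{
	log_line("hello\n");
	return 0;
}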
diff --git a/kernel/profile.c b/kernel/profile.c index 5e95330e5120..e64c2da11c0f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
| @@ -52,7 +52,7 @@ static DEFINE_PER_CPU(int, cpu_profile_flip); | |||
| 52 | static DEFINE_MUTEX(profile_flip_mutex); | 52 | static DEFINE_MUTEX(profile_flip_mutex); |
| 53 | #endif /* CONFIG_SMP */ | 53 | #endif /* CONFIG_SMP */ |
| 54 | 54 | ||
| 55 | static int __init profile_setup(char * str) | 55 | static int __init profile_setup(char *str) |
| 56 | { | 56 | { |
| 57 | static char __initdata schedstr[] = "schedule"; | 57 | static char __initdata schedstr[] = "schedule"; |
| 58 | static char __initdata sleepstr[] = "sleep"; | 58 | static char __initdata sleepstr[] = "sleep"; |
| @@ -104,28 +104,28 @@ __setup("profile=", profile_setup); | |||
| 104 | 104 | ||
| 105 | void __init profile_init(void) | 105 | void __init profile_init(void) |
| 106 | { | 106 | { |
| 107 | if (!prof_on) | 107 | if (!prof_on) |
| 108 | return; | 108 | return; |
| 109 | 109 | ||
| 110 | /* only text is profiled */ | 110 | /* only text is profiled */ |
| 111 | prof_len = (_etext - _stext) >> prof_shift; | 111 | prof_len = (_etext - _stext) >> prof_shift; |
| 112 | prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); | 112 | prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); |
| 113 | } | 113 | } |
| 114 | 114 | ||
| 115 | /* Profile event notifications */ | 115 | /* Profile event notifications */ |
| 116 | 116 | ||
| 117 | #ifdef CONFIG_PROFILING | 117 | #ifdef CONFIG_PROFILING |
| 118 | 118 | ||
| 119 | static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); | 119 | static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); |
| 120 | static ATOMIC_NOTIFIER_HEAD(task_free_notifier); | 120 | static ATOMIC_NOTIFIER_HEAD(task_free_notifier); |
| 121 | static BLOCKING_NOTIFIER_HEAD(munmap_notifier); | 121 | static BLOCKING_NOTIFIER_HEAD(munmap_notifier); |
| 122 | 122 | ||
| 123 | void profile_task_exit(struct task_struct * task) | 123 | void profile_task_exit(struct task_struct *task) |
| 124 | { | 124 | { |
| 125 | blocking_notifier_call_chain(&task_exit_notifier, 0, task); | 125 | blocking_notifier_call_chain(&task_exit_notifier, 0, task); |
| 126 | } | 126 | } |
| 127 | 127 | ||
| 128 | int profile_handoff_task(struct task_struct * task) | 128 | int profile_handoff_task(struct task_struct *task) |
| 129 | { | 129 | { |
| 130 | int ret; | 130 | int ret; |
| 131 | ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); | 131 | ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); |
| @@ -137,52 +137,55 @@ void profile_munmap(unsigned long addr) | |||
| 137 | blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); | 137 | blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); |
| 138 | } | 138 | } |
| 139 | 139 | ||
| 140 | int task_handoff_register(struct notifier_block * n) | 140 | int task_handoff_register(struct notifier_block *n) |
| 141 | { | 141 | { |
| 142 | return atomic_notifier_chain_register(&task_free_notifier, n); | 142 | return atomic_notifier_chain_register(&task_free_notifier, n); |
| 143 | } | 143 | } |
| 144 | EXPORT_SYMBOL_GPL(task_handoff_register); | ||
| 144 | 145 | ||
| 145 | int task_handoff_unregister(struct notifier_block * n) | 146 | int task_handoff_unregister(struct notifier_block *n) |
| 146 | { | 147 | { |
| 147 | return atomic_notifier_chain_unregister(&task_free_notifier, n); | 148 | return atomic_notifier_chain_unregister(&task_free_notifier, n); |
| 148 | } | 149 | } |
| 150 | EXPORT_SYMBOL_GPL(task_handoff_unregister); | ||
| 149 | 151 | ||
| 150 | int profile_event_register(enum profile_type type, struct notifier_block * n) | 152 | int profile_event_register(enum profile_type type, struct notifier_block *n) |
| 151 | { | 153 | { |
| 152 | int err = -EINVAL; | 154 | int err = -EINVAL; |
| 153 | 155 | ||
| 154 | switch (type) { | 156 | switch (type) { |
| 155 | case PROFILE_TASK_EXIT: | 157 | case PROFILE_TASK_EXIT: |
| 156 | err = blocking_notifier_chain_register( | 158 | err = blocking_notifier_chain_register( |
| 157 | &task_exit_notifier, n); | 159 | &task_exit_notifier, n); |
| 158 | break; | 160 | break; |
| 159 | case PROFILE_MUNMAP: | 161 | case PROFILE_MUNMAP: |
| 160 | err = blocking_notifier_chain_register( | 162 | err = blocking_notifier_chain_register( |
| 161 | &munmap_notifier, n); | 163 | &munmap_notifier, n); |
| 162 | break; | 164 | break; |
| 163 | } | 165 | } |
| 164 | 166 | ||
| 165 | return err; | 167 | return err; |
| 166 | } | 168 | } |
| 169 | EXPORT_SYMBOL_GPL(profile_event_register); | ||
| 167 | 170 | ||
| 168 | |||
| 171 | int profile_event_unregister(enum profile_type type, struct notifier_block *n) | ||
| 169 | int profile_event_unregister(enum profile_type type, struct notifier_block * n) | ||
| 170 | { | 172 | { |
| 171 | int err = -EINVAL; | 173 | int err = -EINVAL; |
| 172 | 174 | ||
| 173 | switch (type) { | 175 | switch (type) { |
| 174 | case PROFILE_TASK_EXIT: | 176 | case PROFILE_TASK_EXIT: |
| 175 | err = blocking_notifier_chain_unregister( | 177 | err = blocking_notifier_chain_unregister( |
| 176 | &task_exit_notifier, n); | 178 | &task_exit_notifier, n); |
| 177 | break; | 179 | break; |
| 178 | case PROFILE_MUNMAP: | 180 | case PROFILE_MUNMAP: |
| 179 | err = blocking_notifier_chain_unregister( | 181 | err = blocking_notifier_chain_unregister( |
| 180 | &munmap_notifier, n); | 182 | &munmap_notifier, n); |
| 181 | break; | 183 | break; |
| 182 | } | 184 | } |
| 183 | 185 | ||
| 184 | return err; | 186 | return err; |
| 185 | } | 187 | } |
| 188 | EXPORT_SYMBOL_GPL(profile_event_unregister); | ||
| 186 | 189 | ||
| 187 | int register_timer_hook(int (*hook)(struct pt_regs *)) | 190 | int register_timer_hook(int (*hook)(struct pt_regs *)) |
| 188 | { | 191 | { |
| @@ -191,6 +194,7 @@ int register_timer_hook(int (*hook)(struct pt_regs *)) | |||
| 191 | timer_hook = hook; | 194 | timer_hook = hook; |
| 192 | return 0; | 195 | return 0; |
| 193 | } | 196 | } |
| 197 | EXPORT_SYMBOL_GPL(register_timer_hook); | ||
| 194 | 198 | ||
| 195 | void unregister_timer_hook(int (*hook)(struct pt_regs *)) | 199 | void unregister_timer_hook(int (*hook)(struct pt_regs *)) |
| 196 | { | 200 | { |
| @@ -199,13 +203,7 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *)) | |||
| 199 | /* make sure all CPUs see the NULL hook */ | 203 | /* make sure all CPUs see the NULL hook */ |
| 200 | synchronize_sched(); /* Allow ongoing interrupts to complete. */ | 204 | synchronize_sched(); /* Allow ongoing interrupts to complete. */ |
| 201 | } | 205 | } |
| 202 | |||
| 203 | EXPORT_SYMBOL_GPL(register_timer_hook); | ||
| 204 | EXPORT_SYMBOL_GPL(unregister_timer_hook); | 206 | EXPORT_SYMBOL_GPL(unregister_timer_hook); |
| 205 | EXPORT_SYMBOL_GPL(task_handoff_register); | ||
| 206 | EXPORT_SYMBOL_GPL(task_handoff_unregister); | ||
| 207 | EXPORT_SYMBOL_GPL(profile_event_register); | ||
| 208 | EXPORT_SYMBOL_GPL(profile_event_unregister); | ||
| 209 | 207 | ||
| 210 | #endif /* CONFIG_PROFILING */ | 208 | #endif /* CONFIG_PROFILING */ |
| 211 | 209 | ||
| @@ -366,7 +364,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, | |||
| 366 | per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); | 364 | per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); |
| 367 | } | 365 | } |
| 368 | break; | 366 | break; |
| 369 | out_free: | 367 | out_free: |
| 370 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); | 368 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); |
| 371 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; | 369 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; |
| 372 | __free_page(page); | 370 | __free_page(page); |
| @@ -409,7 +407,6 @@ void profile_hits(int type, void *__pc, unsigned int nr_hits) | |||
| 409 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); | 407 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); |
| 410 | } | 408 | } |
| 411 | #endif /* !CONFIG_SMP */ | 409 | #endif /* !CONFIG_SMP */ |
| 412 | |||
| 413 | EXPORT_SYMBOL_GPL(profile_hits); | 410 | EXPORT_SYMBOL_GPL(profile_hits); |
| 414 | 411 | ||
| 415 | void profile_tick(int type) | 412 | void profile_tick(int type) |
| @@ -427,7 +424,7 @@ void profile_tick(int type) | |||
| 427 | #include <asm/uaccess.h> | 424 | #include <asm/uaccess.h> |
| 428 | #include <asm/ptrace.h> | 425 | #include <asm/ptrace.h> |
| 429 | 426 | ||
| 430 | static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, | 427 | static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, |
| 431 | int count, int *eof, void *data) | 428 | int count, int *eof, void *data) |
| 432 | { | 429 | { |
| 433 | int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); | 430 | int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); |
| @@ -437,8 +434,8 @@ static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, | |||
| 437 | return len; | 434 | return len; |
| 438 | } | 435 | } |
| 439 | 436 | ||
| 440 | static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer, | 437 | static int prof_cpu_mask_write_proc(struct file *file, |
| 441 | unsigned long count, void *data) | 438 | const char __user *buffer, unsigned long count, void *data) |
| 442 | { | 439 | { |
| 443 | cpumask_t *mask = (cpumask_t *)data; | 440 | cpumask_t *mask = (cpumask_t *)data; |
| 444 | unsigned long full_count = count, err; | 441 | unsigned long full_count = count, err; |
| @@ -457,7 +454,8 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) | |||
| 457 | struct proc_dir_entry *entry; | 454 | struct proc_dir_entry *entry; |
| 458 | 455 | ||
| 459 | /* create /proc/irq/prof_cpu_mask */ | 456 | /* create /proc/irq/prof_cpu_mask */ |
| 460 | if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) | 457 | entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); |
| 458 | if (!entry) | ||
| 461 | return; | 459 | return; |
| 462 | entry->data = (void *)&prof_cpu_mask; | 460 | entry->data = (void *)&prof_cpu_mask; |
| 463 | entry->read_proc = prof_cpu_mask_read_proc; | 461 | entry->read_proc = prof_cpu_mask_read_proc; |
| @@ -475,7 +473,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
| 475 | { | 473 | { |
| 476 | unsigned long p = *ppos; | 474 | unsigned long p = *ppos; |
| 477 | ssize_t read; | 475 | ssize_t read; |
| 478 | char * pnt; | 476 | char *pnt; |
| 479 | unsigned int sample_step = 1 << prof_shift; | 477 | unsigned int sample_step = 1 << prof_shift; |
| 480 | 478 | ||
| 481 | profile_flip_buffers(); | 479 | profile_flip_buffers(); |
| @@ -486,12 +484,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
| 486 | read = 0; | 484 | read = 0; |
| 487 | 485 | ||
| 488 | while (p < sizeof(unsigned int) && count > 0) { | 486 | while (p < sizeof(unsigned int) && count > 0) { |
| 489 | if (put_user(*((char *)(&sample_step)+p),buf)) | 487 | if (put_user(*((char *)(&sample_step)+p), buf)) |
| 490 | return -EFAULT; | 488 | return -EFAULT; |
| 491 | buf++; p++; count--; read++; | 489 | buf++; p++; count--; read++; |
| 492 | } | 490 | } |
| 493 | pnt = (char *)prof_buffer + p - sizeof(atomic_t); | 491 | pnt = (char *)prof_buffer + p - sizeof(atomic_t); |
| 494 | if (copy_to_user(buf,(void *)pnt,count)) | 492 | if (copy_to_user(buf, (void *)pnt, count)) |
| 495 | return -EFAULT; | 493 | return -EFAULT; |
| 496 | read += count; | 494 | read += count; |
| 497 | *ppos += read; | 495 | *ppos += read; |
| @@ -508,7 +506,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf, | |||
| 508 | size_t count, loff_t *ppos) | 506 | size_t count, loff_t *ppos) |
| 509 | { | 507 | { |
| 510 | #ifdef CONFIG_SMP | 508 | #ifdef CONFIG_SMP |
| 511 | extern int setup_profiling_timer (unsigned int multiplier); | 509 | extern int setup_profiling_timer(unsigned int multiplier); |
| 512 | 510 | ||
| 513 | if (count == sizeof(int)) { | 511 | if (count == sizeof(int)) { |
| 514 | unsigned int multiplier; | 512 | unsigned int multiplier; |
| @@ -591,7 +589,8 @@ static int __init create_proc_profile(void) | |||
| 591 | return 0; | 589 | return 0; |
| 592 | if (create_hash_tables()) | 590 | if (create_hash_tables()) |
| 593 | return -1; | 591 | return -1; |
| 594 | if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL))) | 592 | entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL); |
| 593 | if (!entry) | ||
| 595 | return 0; | 594 | return 0; |
| 596 | entry->proc_fops = &proc_profile_operations; | 595 | entry->proc_fops = &proc_profile_operations; |
| 597 | entry->size = (1+prof_len) * sizeof(atomic_t); | 596 | entry->size = (1+prof_len) * sizeof(atomic_t); |
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c new file mode 100644 index 000000000000..f4ffbd0f306f --- /dev/null +++ b/kernel/rcuclassic.c | |||
| @@ -0,0 +1,575 @@ | |||
| 1 | /* | ||
| 2 | * Read-Copy Update mechanism for mutual exclusion | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 17 | * | ||
| 18 | * Copyright IBM Corporation, 2001 | ||
| 19 | * | ||
| 20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> | ||
| 21 | * Manfred Spraul <manfred@colorfullife.com> | ||
| 22 | * | ||
| 23 | * Based on the original work by Paul McKenney <paulmck@us.ibm.com> | ||
| 24 | * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. | ||
| 25 | * Papers: | ||
| 26 | * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf | ||
| 27 | * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) | ||
| 28 | * | ||
| 29 | * For detailed explanation of Read-Copy Update mechanism see - | ||
| 30 | * Documentation/RCU | ||
| 31 | * | ||
| 32 | */ | ||
| 33 | #include <linux/types.h> | ||
| 34 | #include <linux/kernel.h> | ||
| 35 | #include <linux/init.h> | ||
| 36 | #include <linux/spinlock.h> | ||
| 37 | #include <linux/smp.h> | ||
| 38 | #include <linux/rcupdate.h> | ||
| 39 | #include <linux/interrupt.h> | ||
| 40 | #include <linux/sched.h> | ||
| 41 | #include <asm/atomic.h> | ||
| 42 | #include <linux/bitops.h> | ||
| 43 | #include <linux/module.h> | ||
| 44 | #include <linux/completion.h> | ||
| 45 | #include <linux/moduleparam.h> | ||
| 46 | #include <linux/percpu.h> | ||
| 47 | #include <linux/notifier.h> | ||
| 48 | #include <linux/cpu.h> | ||
| 49 | #include <linux/mutex.h> | ||
| 50 | |||
| 51 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 52 | static struct lock_class_key rcu_lock_key; | ||
| 53 | struct lockdep_map rcu_lock_map = | ||
| 54 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); | ||
| 55 | EXPORT_SYMBOL_GPL(rcu_lock_map); | ||
| 56 | #endif | ||
| 57 | |||
| 58 | |||
| 59 | /* Definition for rcupdate control block. */ | ||
| 60 | static struct rcu_ctrlblk rcu_ctrlblk = { | ||
| 61 | .cur = -300, | ||
| 62 | .completed = -300, | ||
| 63 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), | ||
| 64 | .cpumask = CPU_MASK_NONE, | ||
| 65 | }; | ||
| 66 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
| 67 | .cur = -300, | ||
| 68 | .completed = -300, | ||
| 69 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), | ||
| 70 | .cpumask = CPU_MASK_NONE, | ||
| 71 | }; | ||
| 72 | |||
| 73 | DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; | ||
| 74 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | ||
| 75 | |||
| 76 | static int blimit = 10; | ||
| 77 | static int qhimark = 10000; | ||
| 78 | static int qlowmark = 100; | ||
| 79 | |||
| 80 | #ifdef CONFIG_SMP | ||
| 81 | static void force_quiescent_state(struct rcu_data *rdp, | ||
| 82 | struct rcu_ctrlblk *rcp) | ||
| 83 | { | ||
| 84 | int cpu; | ||
| 85 | cpumask_t cpumask; | ||
| 86 | set_need_resched(); | ||
| 87 | if (unlikely(!rcp->signaled)) { | ||
| 88 | rcp->signaled = 1; | ||
| 89 | /* | ||
| 90 | * Don't send IPI to itself. With irqs disabled, | ||
| 91 | * rdp->cpu is the current cpu. | ||
| 92 | */ | ||
| 93 | cpumask = rcp->cpumask; | ||
| 94 | cpu_clear(rdp->cpu, cpumask); | ||
| 95 | for_each_cpu_mask(cpu, cpumask) | ||
| 96 | smp_send_reschedule(cpu); | ||
| 97 | } | ||
| 98 | } | ||
| 99 | #else | ||
| 100 | static inline void force_quiescent_state(struct rcu_data *rdp, | ||
| 101 | struct rcu_ctrlblk *rcp) | ||
| 102 | { | ||
| 103 | set_need_resched(); | ||
| 104 | } | ||
| 105 | #endif | ||
| 106 | |||
| 107 | /** | ||
| 108 | * call_rcu - Queue an RCU callback for invocation after a grace period. | ||
| 109 | * @head: structure to be used for queueing the RCU updates. | ||
| 110 | * @func: actual update function to be invoked after the grace period | ||
| 111 | * | ||
| 112 | * The update function will be invoked some time after a full grace | ||
| 113 | * period elapses, in other words after all currently executing RCU | ||
| 114 | * read-side critical sections have completed. RCU read-side critical | ||
| 115 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
| 116 | * and may be nested. | ||
| 117 | */ | ||
| 118 | void call_rcu(struct rcu_head *head, | ||
| 119 | void (*func)(struct rcu_head *rcu)) | ||
| 120 | { | ||
| 121 | unsigned long flags; | ||
| 122 | struct rcu_data *rdp; | ||
| 123 | |||
| 124 | head->func = func; | ||
| 125 | head->next = NULL; | ||
| 126 | local_irq_save(flags); | ||
| 127 | rdp = &__get_cpu_var(rcu_data); | ||
| 128 | *rdp->nxttail = head; | ||
| 129 | rdp->nxttail = &head->next; | ||
| 130 | if (unlikely(++rdp->qlen > qhimark)) { | ||
| 131 | rdp->blimit = INT_MAX; | ||
| 132 | force_quiescent_state(rdp, &rcu_ctrlblk); | ||
| 133 | } | ||
| 134 | local_irq_restore(flags); | ||
| 135 | } | ||
| 136 | EXPORT_SYMBOL_GPL(call_rcu); | ||
| 137 | |||
| 138 | /** | ||
| 139 | * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. | ||
| 140 | * @head: structure to be used for queueing the RCU updates. | ||
| 141 | * @func: actual update function to be invoked after the grace period | ||
| 142 | * | ||
| 143 | * The update function will be invoked some time after a full grace | ||
| 144 | * period elapses, in other words after all currently executing RCU | ||
| 145 | * read-side critical sections have completed. call_rcu_bh() assumes | ||
| 146 | * that the read-side critical sections end on completion of a softirq | ||
| 147 | * handler. This means that read-side critical sections in process | ||
| 148 | * context must not be interrupted by softirqs. This interface is to be | ||
| 149 | * used when most of the read-side critical sections are in softirq context. | ||
| 150 | * RCU read-side critical sections are delimited by rcu_read_lock() and | ||
| 151 | * rcu_read_unlock(), if in interrupt context, or rcu_read_lock_bh() | ||
| 152 | * and rcu_read_unlock_bh(), if in process context. These may be nested. | ||
| 153 | */ | ||
| 154 | void call_rcu_bh(struct rcu_head *head, | ||
| 155 | void (*func)(struct rcu_head *rcu)) | ||
| 156 | { | ||
| 157 | unsigned long flags; | ||
| 158 | struct rcu_data *rdp; | ||
| 159 | |||
| 160 | head->func = func; | ||
| 161 | head->next = NULL; | ||
| 162 | local_irq_save(flags); | ||
| 163 | rdp = &__get_cpu_var(rcu_bh_data); | ||
| 164 | *rdp->nxttail = head; | ||
| 165 | rdp->nxttail = &head->next; | ||
| 166 | |||
| 167 | if (unlikely(++rdp->qlen > qhimark)) { | ||
| 168 | rdp->blimit = INT_MAX; | ||
| 169 | force_quiescent_state(rdp, &rcu_bh_ctrlblk); | ||
| 170 | } | ||
| 171 | |||
| 172 | local_irq_restore(flags); | ||
| 173 | } | ||
| 174 | EXPORT_SYMBOL_GPL(call_rcu_bh); | ||
| 175 | |||
| 176 | /* | ||
| 177 | * Return the number of RCU batches processed thus far. Useful | ||
| 178 | * for debug and statistics. | ||
| 179 | */ | ||
| 180 | long rcu_batches_completed(void) | ||
| 181 | { | ||
| 182 | return rcu_ctrlblk.completed; | ||
| 183 | } | ||
| 184 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
| 185 | |||
| 186 | /* | ||
| 187 | * Return the number of RCU batches processed thus far. Useful | ||
| 188 | * for debug and statistics. | ||
| 189 | */ | ||
| 190 | long rcu_batches_completed_bh(void) | ||
| 191 | { | ||
| 192 | return rcu_bh_ctrlblk.completed; | ||
| 193 | } | ||
| 194 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
| 195 | |||
| 196 | /* Raises the softirq for processing rcu_callbacks. */ | ||
| 197 | static inline void raise_rcu_softirq(void) | ||
| 198 | { | ||
| 199 | raise_softirq(RCU_SOFTIRQ); | ||
| 200 | /* | ||
| 201 | * The smp_mb() here is required to ensure that this cpu's | ||
| 202 | * __rcu_process_callbacks() reads the most recently updated | ||
| 203 | * value of rcu->cur. | ||
| 204 | */ | ||
| 205 | smp_mb(); | ||
| 206 | } | ||
| 207 | |||
| 208 | /* | ||
| 209 | * Invoke the completed RCU callbacks. They are expected to be in | ||
| 210 | * a per-cpu list. | ||
| 211 | */ | ||
| 212 | static void rcu_do_batch(struct rcu_data *rdp) | ||
| 213 | { | ||
| 214 | struct rcu_head *next, *list; | ||
| 215 | int count = 0; | ||
| 216 | |||
| 217 | list = rdp->donelist; | ||
| 218 | while (list) { | ||
| 219 | next = list->next; | ||
| 220 | prefetch(next); | ||
| 221 | list->func(list); | ||
| 222 | list = next; | ||
| 223 | if (++count >= rdp->blimit) | ||
| 224 | break; | ||
| 225 | } | ||
| 226 | rdp->donelist = list; | ||
| 227 | |||
| 228 | local_irq_disable(); | ||
| 229 | rdp->qlen -= count; | ||
| 230 | local_irq_enable(); | ||
| 231 | if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) | ||
| 232 | rdp->blimit = blimit; | ||
| 233 | |||
| 234 | if (!rdp->donelist) | ||
| 235 | rdp->donetail = &rdp->donelist; | ||
| 236 | else | ||
| 237 | raise_rcu_softirq(); | ||
| 238 | } | ||
| 239 | |||
| 240 | /* | ||
| 241 | * Grace period handling: | ||
| 242 | * The grace period handling consists of two steps: | ||
| 243 | * - A new grace period is started. | ||
| 244 | * This is done by rcu_start_batch. The start is not broadcasted to | ||
| 245 | * all cpus, they must pick this up by comparing rcp->cur with | ||
| 246 | * rdp->quiescbatch. All cpus are recorded in the | ||
| 247 | * rcu_ctrlblk.cpumask bitmap. | ||
| 248 | * - All cpus must go through a quiescent state. | ||
| 249 | * Since the start of the grace period is not broadcasted, at least two | ||
| 250 | * calls to rcu_check_quiescent_state are required: | ||
| 251 | * The first call just notices that a new grace period is running. The | ||
| 252 | * following calls check if there was a quiescent state since the beginning | ||
| 253 | * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If | ||
| 254 | * the bitmap is empty, then the grace period is completed. | ||
| 255 | * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace | ||
| 256 | * period (if necessary). | ||
| 257 | */ | ||
| 258 | /* | ||
| 259 | * Register a new batch of callbacks, and start it up if there is currently no | ||
| 260 | * active batch and the batch to be registered has not already occurred. | ||
| 261 | * Caller must hold rcu_ctrlblk.lock. | ||
| 262 | */ | ||
| 263 | static void rcu_start_batch(struct rcu_ctrlblk *rcp) | ||
| 264 | { | ||
| 265 | if (rcp->next_pending && | ||
| 266 | rcp->completed == rcp->cur) { | ||
| 267 | rcp->next_pending = 0; | ||
| 268 | /* | ||
| 269 | * next_pending == 0 must be visible in | ||
| 270 | * __rcu_process_callbacks() before it can see new value of cur. | ||
| 271 | */ | ||
| 272 | smp_wmb(); | ||
| 273 | rcp->cur++; | ||
| 274 | |||
| 275 | /* | ||
| 276 | * Accessing nohz_cpu_mask before incrementing rcp->cur needs a | ||
| 277 | * barrier. Otherwise it can cause tickless idle CPUs to be | ||
| 278 | * included in rcp->cpumask, which will extend grace periods | ||
| 279 | * unnecessarily. | ||
| 280 | */ | ||
| 281 | smp_mb(); | ||
| 282 | cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); | ||
| 283 | |||
| 284 | rcp->signaled = 0; | ||
| 285 | } | ||
| 286 | } | ||
| 287 | |||
| 288 | /* | ||
| 289 | * cpu went through a quiescent state since the beginning of the grace period. | ||
| 290 | * Clear it from the cpu mask and complete the grace period if it was the last | ||
| 291 | * cpu. Start another grace period if someone has further entries pending | ||
| 292 | */ | ||
| 293 | static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) | ||
| 294 | { | ||
| 295 | cpu_clear(cpu, rcp->cpumask); | ||
| 296 | if (cpus_empty(rcp->cpumask)) { | ||
| 297 | /* batch completed ! */ | ||
| 298 | rcp->completed = rcp->cur; | ||
| 299 | rcu_start_batch(rcp); | ||
| 300 | } | ||
| 301 | } | ||
| 302 | |||
| 303 | /* | ||
| 304 | * Check if the cpu has gone through a quiescent state (say context | ||
| 305 | * switch). If so, and if it hasn't already done so in this RCU | ||
| 306 | * quiescent cycle, then indicate that it has done so. | ||
| 307 | */ | ||
| 308 | static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, | ||
| 309 | struct rcu_data *rdp) | ||
| 310 | { | ||
| 311 | if (rdp->quiescbatch != rcp->cur) { | ||
| 312 | /* start new grace period: */ | ||
| 313 | rdp->qs_pending = 1; | ||
| 314 | rdp->passed_quiesc = 0; | ||
| 315 | rdp->quiescbatch = rcp->cur; | ||
| 316 | return; | ||
| 317 | } | ||
| 318 | |||
| 319 | /* Grace period already completed for this cpu? | ||
| 320 | * qs_pending is checked instead of the actual bitmap to avoid | ||
| 321 | * cacheline thrashing. | ||
| 322 | */ | ||
| 323 | if (!rdp->qs_pending) | ||
| 324 | return; | ||
| 325 | |||
| 326 | /* | ||
| 327 | * Was there a quiescent state since the beginning of the grace | ||
| 328 | * period? If no, then exit and wait for the next call. | ||
| 329 | */ | ||
| 330 | if (!rdp->passed_quiesc) | ||
| 331 | return; | ||
| 332 | rdp->qs_pending = 0; | ||
| 333 | |||
| 334 | spin_lock(&rcp->lock); | ||
| 335 | /* | ||
| 336 | * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync | ||
| 337 | * during cpu startup. Ignore the quiescent state. | ||
| 338 | */ | ||
| 339 | if (likely(rdp->quiescbatch == rcp->cur)) | ||
| 340 | cpu_quiet(rdp->cpu, rcp); | ||
| 341 | |||
| 342 | spin_unlock(&rcp->lock); | ||
| 343 | } | ||
| 344 | |||
| 345 | |||
| 346 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 347 | |||
| 348 | /* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing | ||
| 349 | * locking requirements, the list it's pulling from has to belong to a cpu | ||
| 350 | * which is dead and hence not processing interrupts. | ||
| 351 | */ | ||
| 352 | static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, | ||
| 353 | struct rcu_head **tail) | ||
| 354 | { | ||
| 355 | local_irq_disable(); | ||
| 356 | *this_rdp->nxttail = list; | ||
| 357 | if (list) | ||
| 358 | this_rdp->nxttail = tail; | ||
| 359 | local_irq_enable(); | ||
| 360 | } | ||
| 361 | |||
| 362 | static void __rcu_offline_cpu(struct rcu_data *this_rdp, | ||
| 363 | struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
| 364 | { | ||
| 365 | /* if the cpu going offline owns the grace period | ||
| 366 | * we can block indefinitely waiting for it, so flush | ||
| 367 | * it here | ||
| 368 | */ | ||
| 369 | spin_lock_bh(&rcp->lock); | ||
| 370 | if (rcp->cur != rcp->completed) | ||
| 371 | cpu_quiet(rdp->cpu, rcp); | ||
| 372 | spin_unlock_bh(&rcp->lock); | ||
| 373 | rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); | ||
| 374 | rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); | ||
| 375 | rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); | ||
| 376 | } | ||
| 377 | |||
| 378 | static void rcu_offline_cpu(int cpu) | ||
| 379 | { | ||
| 380 | struct rcu_data *this_rdp = &get_cpu_var(rcu_data); | ||
| 381 | struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); | ||
| 382 | |||
| 383 | __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, | ||
| 384 | &per_cpu(rcu_data, cpu)); | ||
| 385 | __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, | ||
| 386 | &per_cpu(rcu_bh_data, cpu)); | ||
| 387 | put_cpu_var(rcu_data); | ||
| 388 | put_cpu_var(rcu_bh_data); | ||
| 389 | } | ||
| 390 | |||
| 391 | #else | ||
| 392 | |||
| 393 | static void rcu_offline_cpu(int cpu) | ||
| 394 | { | ||
| 395 | } | ||
| 396 | |||
| 397 | #endif | ||
| 398 | |||
| 399 | /* | ||
| 400 | * This does the RCU processing work from softirq context. | ||
| 401 | */ | ||
| 402 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, | ||
| 403 | struct rcu_data *rdp) | ||
| 404 | { | ||
| 405 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { | ||
| 406 | *rdp->donetail = rdp->curlist; | ||
| 407 | rdp->donetail = rdp->curtail; | ||
| 408 | rdp->curlist = NULL; | ||
| 409 | rdp->curtail = &rdp->curlist; | ||
| 410 | } | ||
| 411 | |||
| 412 | if (rdp->nxtlist && !rdp->curlist) { | ||
| 413 | local_irq_disable(); | ||
| 414 | rdp->curlist = rdp->nxtlist; | ||
| 415 | rdp->curtail = rdp->nxttail; | ||
| 416 | rdp->nxtlist = NULL; | ||
| 417 | rdp->nxttail = &rdp->nxtlist; | ||
| 418 | local_irq_enable(); | ||
| 419 | |||
| 420 | /* | ||
| 421 | * start the next batch of callbacks | ||
| 422 | */ | ||
| 423 | |||
| 424 | /* determine batch number */ | ||
| 425 | rdp->batch = rcp->cur + 1; | ||
| 426 | /* see the comment and corresponding wmb() in | ||
| 427 | * the rcu_start_batch() | ||
| 428 | */ | ||
| 429 | smp_rmb(); | ||
| 430 | |||
| 431 | if (!rcp->next_pending) { | ||
| 432 | /* and start it/schedule start if it's a new batch */ | ||
| 433 | spin_lock(&rcp->lock); | ||
| 434 | rcp->next_pending = 1; | ||
| 435 | rcu_start_batch(rcp); | ||
| 436 | spin_unlock(&rcp->lock); | ||
| 437 | } | ||
| 438 | } | ||
| 439 | |||
| 440 | rcu_check_quiescent_state(rcp, rdp); | ||
| 441 | if (rdp->donelist) | ||
| 442 | rcu_do_batch(rdp); | ||
| 443 | } | ||
| 444 | |||
| 445 | static void rcu_process_callbacks(struct softirq_action *unused) | ||
| 446 | { | ||
| 447 | __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); | ||
| 448 | __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); | ||
| 449 | } | ||
| 450 | |||
| 451 | static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
| 452 | { | ||
| 453 | /* This cpu has pending rcu entries and the grace period | ||
| 454 | * for them has completed. | ||
| 455 | */ | ||
| 456 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) | ||
| 457 | return 1; | ||
| 458 | |||
| 459 | /* This cpu has no pending entries, but there are new entries */ | ||
| 460 | if (!rdp->curlist && rdp->nxtlist) | ||
| 461 | return 1; | ||
| 462 | |||
| 463 | /* This cpu has finished callbacks to invoke */ | ||
| 464 | if (rdp->donelist) | ||
| 465 | return 1; | ||
| 466 | |||
| 467 | /* The rcu core waits for a quiescent state from the cpu */ | ||
| 468 | if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) | ||
| 469 | return 1; | ||
| 470 | |||
| 471 | /* nothing to do */ | ||
| 472 | return 0; | ||
| 473 | } | ||
| 474 | |||
| 475 | /* | ||
| 476 | * Check to see if there is any immediate RCU-related work to be done | ||
| 477 | * by the current CPU, returning 1 if so. This function is part of the | ||
| 478 | * RCU implementation; it is -not- an exported member of the RCU API. | ||
| 479 | */ | ||
| 480 | int rcu_pending(int cpu) | ||
| 481 | { | ||
| 482 | return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || | ||
| 483 | __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); | ||
| 484 | } | ||
| 485 | |||
| 486 | /* | ||
| 487 | * Check to see if any future RCU-related work will need to be done | ||
| 488 | * by the current CPU, even if none need be done immediately, returning | ||
| 489 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
| 490 | * an exported member of the RCU API. | ||
| 491 | */ | ||
| 492 | int rcu_needs_cpu(int cpu) | ||
| 493 | { | ||
| 494 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
| 495 | struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); | ||
| 496 | |||
| 497 | return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); | ||
| 498 | } | ||
| 499 | |||
| 500 | void rcu_check_callbacks(int cpu, int user) | ||
| 501 | { | ||
| 502 | if (user || | ||
| 503 | (idle_cpu(cpu) && !in_softirq() && | ||
| 504 | hardirq_count() <= (1 << HARDIRQ_SHIFT))) { | ||
| 505 | rcu_qsctr_inc(cpu); | ||
| 506 | rcu_bh_qsctr_inc(cpu); | ||
| 507 | } else if (!in_softirq()) | ||
| 508 | rcu_bh_qsctr_inc(cpu); | ||
| 509 | raise_rcu_softirq(); | ||
| 510 | } | ||
| 511 | |||
| 512 | static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, | ||
| 513 | struct rcu_data *rdp) | ||
| 514 | { | ||
| 515 | memset(rdp, 0, sizeof(*rdp)); | ||
| 516 | rdp->curtail = &rdp->curlist; | ||
| 517 | rdp->nxttail = &rdp->nxtlist; | ||
| 518 | rdp->donetail = &rdp->donelist; | ||
| 519 | rdp->quiescbatch = rcp->completed; | ||
| 520 | rdp->qs_pending = 0; | ||
| 521 | rdp->cpu = cpu; | ||
| 522 | rdp->blimit = blimit; | ||
| 523 | } | ||
| 524 | |||
| 525 | static void __cpuinit rcu_online_cpu(int cpu) | ||
| 526 | { | ||
| 527 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
| 528 | struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); | ||
| 529 | |||
| 530 | rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); | ||
| 531 | rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); | ||
| 532 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); | ||
| 533 | } | ||
| 534 | |||
| 535 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
| 536 | unsigned long action, void *hcpu) | ||
| 537 | { | ||
| 538 | long cpu = (long)hcpu; | ||
| 539 | |||
| 540 | switch (action) { | ||
| 541 | case CPU_UP_PREPARE: | ||
| 542 | case CPU_UP_PREPARE_FROZEN: | ||
| 543 | rcu_online_cpu(cpu); | ||
| 544 | break; | ||
| 545 | case CPU_DEAD: | ||
| 546 | case CPU_DEAD_FROZEN: | ||
| 547 | rcu_offline_cpu(cpu); | ||
| 548 | break; | ||
| 549 | default: | ||
| 550 | break; | ||
| 551 | } | ||
| 552 | return NOTIFY_OK; | ||
| 553 | } | ||
| 554 | |||
| 555 | static struct notifier_block __cpuinitdata rcu_nb = { | ||
| 556 | .notifier_call = rcu_cpu_notify, | ||
| 557 | }; | ||
| 558 | |||
| 559 | /* | ||
| 560 | * Initializes rcu mechanism. Assumed to be called early. | ||
| 561 | * That is, before the local timer (SMP) or jiffies timer (uniproc) is set up. | ||
| 562 | * Note that rcu_qsctr and friends are implicitly | ||
| 563 | * initialized due to the choice of ``0'' for RCU_CTR_INVALID. | ||
| 564 | */ | ||
| 565 | void __init __rcu_init(void) | ||
| 566 | { | ||
| 567 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, | ||
| 568 | (void *)(long)smp_processor_id()); | ||
| 569 | /* Register notifier for non-boot CPUs */ | ||
| 570 | register_cpu_notifier(&rcu_nb); | ||
| 571 | } | ||
| 572 | |||
| 573 | module_param(blimit, int, 0); | ||
| 574 | module_param(qhimark, int, 0); | ||
| 575 | module_param(qlowmark, int, 0); | ||
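kernel/rcuclassic.c carries the classic RCU implementation over from rcupdate.c essentially unchanged, apart from driving the callback machinery from the new RCU_SOFTIRQ instead of a per-cpu tasklet. From a user's point of view the API is the familiar one: call_rcu() and call_rcu_bh() queue a callback that runs after a grace period, and synchronize_rcu() (now built on call_rcu() in the rcupdate.c hunk below) blocks until one has elapsed. A minimal hedged sketch of the usual deferred-free pattern, with an illustrative structure and list:

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	struct list_head list;
	int data;
	struct rcu_head rcu;
};

static void foo_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct foo, rcu));
}

/* Caller holds the update-side lock protecting the list. */
static void foo_del(struct foo *f)
{
	list_del_rcu(&f->list);
	call_rcu(&f->rcu, foo_free_rcu);	/* freed once current readers are done */
}

Readers traverse the list under rcu_read_lock()/rcu_read_unlock() using list_for_each_entry_rcu(); where blocking is acceptable, list_del_rcu() followed by synchronize_rcu() and a plain kfree() is equivalent.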
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f2c1a04e9b18..760dfc233a00 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
| @@ -15,7 +15,7 @@ | |||
| 15 | * along with this program; if not, write to the Free Software | 15 | * along with this program; if not, write to the Free Software |
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
| 17 | * | 17 | * |
| 18 | * Copyright (C) IBM Corporation, 2001 | 18 | * Copyright IBM Corporation, 2001 |
| 19 | * | 19 | * |
| 20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> | 20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> |
| 21 | * Manfred Spraul <manfred@colorfullife.com> | 21 | * Manfred Spraul <manfred@colorfullife.com> |
| @@ -35,165 +35,57 @@ | |||
| 35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
| 36 | #include <linux/spinlock.h> | 36 | #include <linux/spinlock.h> |
| 37 | #include <linux/smp.h> | 37 | #include <linux/smp.h> |
| 38 | #include <linux/rcupdate.h> | ||
| 39 | #include <linux/interrupt.h> | 38 | #include <linux/interrupt.h> |
| 40 | #include <linux/sched.h> | 39 | #include <linux/sched.h> |
| 41 | #include <asm/atomic.h> | 40 | #include <asm/atomic.h> |
| 42 | #include <linux/bitops.h> | 41 | #include <linux/bitops.h> |
| 43 | #include <linux/module.h> | ||
| 44 | #include <linux/completion.h> | 42 | #include <linux/completion.h> |
| 45 | #include <linux/moduleparam.h> | ||
| 46 | #include <linux/percpu.h> | 43 | #include <linux/percpu.h> |
| 47 | #include <linux/notifier.h> | 44 | #include <linux/notifier.h> |
| 48 | #include <linux/cpu.h> | 45 | #include <linux/cpu.h> |
| 49 | #include <linux/mutex.h> | 46 | #include <linux/mutex.h> |
| 47 | #include <linux/module.h> | ||
| 50 | 48 | ||
| 51 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 49 | struct rcu_synchronize { |
| 52 | static struct lock_class_key rcu_lock_key; | 50 | struct rcu_head head; |
| 53 | struct lockdep_map rcu_lock_map = | 51 | struct completion completion; |
| 54 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); | ||
| 55 | |||
| 56 | EXPORT_SYMBOL_GPL(rcu_lock_map); | ||
| 57 | #endif | ||
| 58 | |||
| 59 | /* Definition for rcupdate control block. */ | ||
| 60 | static struct rcu_ctrlblk rcu_ctrlblk = { | ||
| 61 | .cur = -300, | ||
| 62 | .completed = -300, | ||
| 63 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), | ||
| 64 | .cpumask = CPU_MASK_NONE, | ||
| 65 | }; | ||
| 66 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
| 67 | .cur = -300, | ||
| 68 | .completed = -300, | ||
| 69 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), | ||
| 70 | .cpumask = CPU_MASK_NONE, | ||
| 71 | }; | 52 | }; |
| 72 | 53 | ||
| 73 | DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; | 54 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; |
| 74 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | ||
| 75 | |||
| 76 | /* Fake initialization required by compiler */ | ||
| 77 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; | ||
| 78 | static int blimit = 10; | ||
| 79 | static int qhimark = 10000; | ||
| 80 | static int qlowmark = 100; | ||
| 81 | |||
| 82 | static atomic_t rcu_barrier_cpu_count; | 55 | static atomic_t rcu_barrier_cpu_count; |
| 83 | static DEFINE_MUTEX(rcu_barrier_mutex); | 56 | static DEFINE_MUTEX(rcu_barrier_mutex); |
| 84 | static struct completion rcu_barrier_completion; | 57 | static struct completion rcu_barrier_completion; |
| 85 | 58 | ||
| 86 | #ifdef CONFIG_SMP | 59 | /* Because of FASTCALL declaration of complete, we use this wrapper */ |
| 87 | static void force_quiescent_state(struct rcu_data *rdp, | 60 | static void wakeme_after_rcu(struct rcu_head *head) |
| 88 | struct rcu_ctrlblk *rcp) | ||
| 89 | { | ||
| 90 | int cpu; | ||
| 91 | cpumask_t cpumask; | ||
| 92 | set_need_resched(); | ||
| 93 | if (unlikely(!rcp->signaled)) { | ||
| 94 | rcp->signaled = 1; | ||
| 95 | /* | ||
| 96 | * Don't send IPI to itself. With irqs disabled, | ||
| 97 | * rdp->cpu is the current cpu. | ||
| 98 | */ | ||
| 99 | cpumask = rcp->cpumask; | ||
| 100 | cpu_clear(rdp->cpu, cpumask); | ||
| 101 | for_each_cpu_mask(cpu, cpumask) | ||
| 102 | smp_send_reschedule(cpu); | ||
| 103 | } | ||
| 104 | } | ||
| 105 | #else | ||
| 106 | static inline void force_quiescent_state(struct rcu_data *rdp, | ||
| 107 | struct rcu_ctrlblk *rcp) | ||
| 108 | { | 61 | { |
| 109 | set_need_resched(); | 62 | struct rcu_synchronize *rcu; |
| 63 | |||
| 64 | rcu = container_of(head, struct rcu_synchronize, head); | ||
| 65 | complete(&rcu->completion); | ||
| 110 | } | 66 | } |
| 111 | #endif | ||
| 112 | 67 | ||
| 113 | /** | 68 | /** |
| 114 | * call_rcu - Queue an RCU callback for invocation after a grace period. | 69 | * synchronize_rcu - wait until a grace period has elapsed. |
| 115 | * @head: structure to be used for queueing the RCU updates. | ||
| 116 | * @func: actual update function to be invoked after the grace period | ||
| 117 | * | 70 | * |
| 118 | * The update function will be invoked some time after a full grace | 71 | * Control will return to the caller some time after a full grace |
| 119 | * period elapses, in other words after all currently executing RCU | 72 | * period has elapsed, in other words after all currently executing RCU |
| 120 | * read-side critical sections have completed. RCU read-side critical | 73 | * read-side critical sections have completed. RCU read-side critical |
| 121 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | 74 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), |
| 122 | * and may be nested. | 75 | * and may be nested. |
| 123 | */ | 76 | */ |
| 124 | void fastcall call_rcu(struct rcu_head *head, | 77 | void synchronize_rcu(void) |
| 125 | void (*func)(struct rcu_head *rcu)) | ||
| 126 | { | ||
| 127 | unsigned long flags; | ||
| 128 | struct rcu_data *rdp; | ||
| 129 | |||
| 130 | head->func = func; | ||
| 131 | head->next = NULL; | ||
| 132 | local_irq_save(flags); | ||
| 133 | rdp = &__get_cpu_var(rcu_data); | ||
| 134 | *rdp->nxttail = head; | ||
| 135 | rdp->nxttail = &head->next; | ||
| 136 | if (unlikely(++rdp->qlen > qhimark)) { | ||
| 137 | rdp->blimit = INT_MAX; | ||
| 138 | force_quiescent_state(rdp, &rcu_ctrlblk); | ||
| 139 | } | ||
| 140 | local_irq_restore(flags); | ||
| 141 | } | ||
| 142 | |||
| 143 | /** | ||
| 144 | * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. | ||
| 145 | * @head: structure to be used for queueing the RCU updates. | ||
| 146 | * @func: actual update function to be invoked after the grace period | ||
| 147 | * | ||
| 148 | * The update function will be invoked some time after a full grace | ||
| 149 | * period elapses, in other words after all currently executing RCU | ||
| 150 | * read-side critical sections have completed. call_rcu_bh() assumes | ||
| 151 | * that the read-side critical sections end on completion of a softirq | ||
| 152 | * handler. This means that read-side critical sections in process | ||
| 153 | * context must not be interrupted by softirqs. This interface is to be | ||
| 154 | * used when most of the read-side critical sections are in softirq context. | ||
| 155 | * RCU read-side critical sections are delimited by rcu_read_lock() and | ||
| 156 | * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() | ||
| 157 | * and rcu_read_unlock_bh(), if in process context. These may be nested. | ||
| 158 | */ | ||
| 159 | void fastcall call_rcu_bh(struct rcu_head *head, | ||
| 160 | void (*func)(struct rcu_head *rcu)) | ||
| 161 | { | 78 | { |
| 162 | unsigned long flags; | 79 | struct rcu_synchronize rcu; |
| 163 | struct rcu_data *rdp; | ||
| 164 | |||
| 165 | head->func = func; | ||
| 166 | head->next = NULL; | ||
| 167 | local_irq_save(flags); | ||
| 168 | rdp = &__get_cpu_var(rcu_bh_data); | ||
| 169 | *rdp->nxttail = head; | ||
| 170 | rdp->nxttail = &head->next; | ||
| 171 | |||
| 172 | if (unlikely(++rdp->qlen > qhimark)) { | ||
| 173 | rdp->blimit = INT_MAX; | ||
| 174 | force_quiescent_state(rdp, &rcu_bh_ctrlblk); | ||
| 175 | } | ||
| 176 | |||
| 177 | local_irq_restore(flags); | ||
| 178 | } | ||
| 179 | 80 | ||
| 180 | /* | 81 | init_completion(&rcu.completion); |
| 181 | * Return the number of RCU batches processed thus far. Useful | 82 | /* Will wake me after RCU finished */ |
| 182 | * for debug and statistics. | 83 | call_rcu(&rcu.head, wakeme_after_rcu); |
| 183 | */ | ||
| 184 | long rcu_batches_completed(void) | ||
| 185 | { | ||
| 186 | return rcu_ctrlblk.completed; | ||
| 187 | } | ||
| 188 | 84 | ||
| 189 | /* | 85 | /* Wait for it */ |
| 190 | * Return the number of RCU batches processed thus far. Useful | 86 | wait_for_completion(&rcu.completion); |
| 191 | * for debug and statistics. | ||
| 192 | */ | ||
| 193 | long rcu_batches_completed_bh(void) | ||
| 194 | { | ||
| 195 | return rcu_bh_ctrlblk.completed; | ||
| 196 | } | 87 | } |
| 88 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
| 197 | 89 | ||
| 198 | static void rcu_barrier_callback(struct rcu_head *notused) | 90 | static void rcu_barrier_callback(struct rcu_head *notused) |
| 199 | { | 91 | { |
| @@ -207,10 +99,8 @@ static void rcu_barrier_callback(struct rcu_head *notused) | |||
| 207 | static void rcu_barrier_func(void *notused) | 99 | static void rcu_barrier_func(void *notused) |
| 208 | { | 100 | { |
| 209 | int cpu = smp_processor_id(); | 101 | int cpu = smp_processor_id(); |
| 210 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | 102 | struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); |
| 211 | struct rcu_head *head; | ||
| 212 | 103 | ||
| 213 | head = &rdp->barrier; | ||
| 214 | atomic_inc(&rcu_barrier_cpu_count); | 104 | atomic_inc(&rcu_barrier_cpu_count); |
| 215 | call_rcu(head, rcu_barrier_callback); | 105 | call_rcu(head, rcu_barrier_callback); |
| 216 | } | 106 | } |
| @@ -225,420 +115,24 @@ void rcu_barrier(void) | |||
| 225 | mutex_lock(&rcu_barrier_mutex); | 115 | mutex_lock(&rcu_barrier_mutex); |
| 226 | init_completion(&rcu_barrier_completion); | 116 | init_completion(&rcu_barrier_completion); |
| 227 | atomic_set(&rcu_barrier_cpu_count, 0); | 117 | atomic_set(&rcu_barrier_cpu_count, 0); |
| 118 | /* | ||
| 119 | * The queueing of callbacks in all CPUs must be atomic with | ||
| 120 | * respect to RCU, otherwise one CPU may queue a callback, | ||
| 121 | * wait for a grace period, decrement barrier count and call | ||
| 122 | * complete(), while other CPUs have not yet queued anything. | ||
| 123 | * So, we need to make sure that grace periods cannot complete | ||
| 124 | * until all the callbacks are queued. | ||
| 125 | */ | ||
| 126 | rcu_read_lock(); | ||
| 228 | on_each_cpu(rcu_barrier_func, NULL, 0, 1); | 127 | on_each_cpu(rcu_barrier_func, NULL, 0, 1); |
| 128 | rcu_read_unlock(); | ||
| 229 | wait_for_completion(&rcu_barrier_completion); | 129 | wait_for_completion(&rcu_barrier_completion); |
| 230 | mutex_unlock(&rcu_barrier_mutex); | 130 | mutex_unlock(&rcu_barrier_mutex); |
| 231 | } | 131 | } |
| 232 | EXPORT_SYMBOL_GPL(rcu_barrier); | 132 | EXPORT_SYMBOL_GPL(rcu_barrier); |
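A common use of rcu_barrier() is on the module-unload path, sketched below with hypothetical helper names: every call_rcu() callback defined in a module must have run before the module text is allowed to go away.

```c
#include <linux/init.h>
#include <linux/module.h>
#include <linux/rcupdate.h>

/* Hypothetical helper that unlinks objects and queues call_rcu()
 * callbacks which will eventually free them. */
extern void my_cache_teardown(void);

static void __exit my_module_exit(void)
{
	my_cache_teardown();
	/*
	 * Wait until every callback queued so far has actually been
	 * invoked; otherwise the module containing the callback
	 * functions could be unloaded while callbacks are still pending.
	 */
	rcu_barrier();
}
module_exit(my_module_exit);
```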
| 233 | 133 | ||
| 234 | /* | ||
| 235 | * Invoke the completed RCU callbacks. They are expected to be in | ||
| 236 | * a per-cpu list. | ||
| 237 | */ | ||
| 238 | static void rcu_do_batch(struct rcu_data *rdp) | ||
| 239 | { | ||
| 240 | struct rcu_head *next, *list; | ||
| 241 | int count = 0; | ||
| 242 | |||
| 243 | list = rdp->donelist; | ||
| 244 | while (list) { | ||
| 245 | next = list->next; | ||
| 246 | prefetch(next); | ||
| 247 | list->func(list); | ||
| 248 | list = next; | ||
| 249 | if (++count >= rdp->blimit) | ||
| 250 | break; | ||
| 251 | } | ||
| 252 | rdp->donelist = list; | ||
| 253 | |||
| 254 | local_irq_disable(); | ||
| 255 | rdp->qlen -= count; | ||
| 256 | local_irq_enable(); | ||
| 257 | if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) | ||
| 258 | rdp->blimit = blimit; | ||
| 259 | |||
| 260 | if (!rdp->donelist) | ||
| 261 | rdp->donetail = &rdp->donelist; | ||
| 262 | else | ||
| 263 | tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu)); | ||
| 264 | } | ||
| 265 | |||
| 266 | /* | ||
| 267 | * Grace period handling: | ||
| 268 | * The grace period handling consists of two steps: | ||
| 269 | * - A new grace period is started. | ||
| 270 | * This is done by rcu_start_batch. The start is not broadcast to | ||
| 271 | * all cpus; they must pick it up by comparing rcp->cur with | ||
| 272 | * rdp->quiescbatch. All cpus are recorded in the | ||
| 273 | * rcu_ctrlblk.cpumask bitmap. | ||
| 274 | * - All cpus must go through a quiescent state. | ||
| 275 | * Since the start of the grace period is not broadcast, at least two | ||
| 276 | * calls to rcu_check_quiescent_state are required: | ||
| 277 | * The first call just notices that a new grace period is running. The | ||
| 278 | * following calls check if there was a quiescent state since the beginning | ||
| 279 | * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If | ||
| 280 | * the bitmap is empty, then the grace period is completed. | ||
| 281 | * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace | ||
| 282 | * period (if necessary). | ||
| 283 | */ | ||
| 284 | /* | ||
| 285 | * Register a new batch of callbacks, and start it up if there is currently no | ||
| 286 | * active batch and the batch to be registered has not already occurred. | ||
| 287 | * Caller must hold rcu_ctrlblk.lock. | ||
| 288 | */ | ||
| 289 | static void rcu_start_batch(struct rcu_ctrlblk *rcp) | ||
| 290 | { | ||
| 291 | if (rcp->next_pending && | ||
| 292 | rcp->completed == rcp->cur) { | ||
| 293 | rcp->next_pending = 0; | ||
| 294 | /* | ||
| 295 | * next_pending == 0 must be visible in | ||
| 296 | * __rcu_process_callbacks() before it can see new value of cur. | ||
| 297 | */ | ||
| 298 | smp_wmb(); | ||
| 299 | rcp->cur++; | ||
| 300 | |||
| 301 | /* | ||
| 302 | * Accessing nohz_cpu_mask before incrementing rcp->cur needs a | ||
| 303 | * barrier. Otherwise it can cause tickless idle CPUs to be | ||
| 304 | * included in rcp->cpumask, which will extend grace periods | ||
| 305 | * unnecessarily. | ||
| 306 | */ | ||
| 307 | smp_mb(); | ||
| 308 | cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); | ||
| 309 | |||
| 310 | rcp->signaled = 0; | ||
| 311 | } | ||
| 312 | } | ||
| 313 | |||
| 314 | /* | ||
| 315 | * A cpu went through a quiescent state since the beginning of the grace period. | ||
| 316 | * Clear it from the cpu mask and complete the grace period if it was the last | ||
| 317 | * cpu. Start another grace period if someone has further entries pending. | ||
| 318 | */ | ||
| 319 | static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) | ||
| 320 | { | ||
| 321 | cpu_clear(cpu, rcp->cpumask); | ||
| 322 | if (cpus_empty(rcp->cpumask)) { | ||
| 323 | /* batch completed ! */ | ||
| 324 | rcp->completed = rcp->cur; | ||
| 325 | rcu_start_batch(rcp); | ||
| 326 | } | ||
| 327 | } | ||
| 328 | |||
| 329 | /* | ||
| 330 | * Check if the cpu has gone through a quiescent state (say context | ||
| 331 | * switch). If so, and if it hasn't already done so in this RCU | ||
| 332 | * quiescent cycle, then indicate that it has done so. | ||
| 333 | */ | ||
| 334 | static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, | ||
| 335 | struct rcu_data *rdp) | ||
| 336 | { | ||
| 337 | if (rdp->quiescbatch != rcp->cur) { | ||
| 338 | /* start new grace period: */ | ||
| 339 | rdp->qs_pending = 1; | ||
| 340 | rdp->passed_quiesc = 0; | ||
| 341 | rdp->quiescbatch = rcp->cur; | ||
| 342 | return; | ||
| 343 | } | ||
| 344 | |||
| 345 | /* Grace period already completed for this cpu? | ||
| 346 | * qs_pending is checked instead of the actual bitmap to avoid | ||
| 347 | * cacheline thrashing. | ||
| 348 | */ | ||
| 349 | if (!rdp->qs_pending) | ||
| 350 | return; | ||
| 351 | |||
| 352 | /* | ||
| 353 | * Was there a quiescent state since the beginning of the grace | ||
| 354 | * period? If no, then exit and wait for the next call. | ||
| 355 | */ | ||
| 356 | if (!rdp->passed_quiesc) | ||
| 357 | return; | ||
| 358 | rdp->qs_pending = 0; | ||
| 359 | |||
| 360 | spin_lock(&rcp->lock); | ||
| 361 | /* | ||
| 362 | * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync | ||
| 363 | * during cpu startup. Ignore the quiescent state. | ||
| 364 | */ | ||
| 365 | if (likely(rdp->quiescbatch == rcp->cur)) | ||
| 366 | cpu_quiet(rdp->cpu, rcp); | ||
| 367 | |||
| 368 | spin_unlock(&rcp->lock); | ||
| 369 | } | ||
| 370 | |||
| 371 | |||
| 372 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 373 | |||
| 374 | /* Warning! Helper for rcu_offline_cpu. Do not use elsewhere without reviewing | ||
| 375 | * the locking requirements: the list it's pulling from has to belong to a cpu | ||
| 376 | * which is dead and hence not processing interrupts. | ||
| 377 | */ | ||
| 378 | static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, | ||
| 379 | struct rcu_head **tail) | ||
| 380 | { | ||
| 381 | local_irq_disable(); | ||
| 382 | *this_rdp->nxttail = list; | ||
| 383 | if (list) | ||
| 384 | this_rdp->nxttail = tail; | ||
| 385 | local_irq_enable(); | ||
| 386 | } | ||
| 387 | |||
| 388 | static void __rcu_offline_cpu(struct rcu_data *this_rdp, | ||
| 389 | struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
| 390 | { | ||
| 391 | /* If the cpu going offline owns the grace period, | ||
| 392 | * we can block indefinitely waiting for it, so flush | ||
| 393 | * it here. | ||
| 394 | */ | ||
| 395 | spin_lock_bh(&rcp->lock); | ||
| 396 | if (rcp->cur != rcp->completed) | ||
| 397 | cpu_quiet(rdp->cpu, rcp); | ||
| 398 | spin_unlock_bh(&rcp->lock); | ||
| 399 | rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); | ||
| 400 | rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); | ||
| 401 | rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); | ||
| 402 | } | ||
| 403 | |||
| 404 | static void rcu_offline_cpu(int cpu) | ||
| 405 | { | ||
| 406 | struct rcu_data *this_rdp = &get_cpu_var(rcu_data); | ||
| 407 | struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); | ||
| 408 | |||
| 409 | __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, | ||
| 410 | &per_cpu(rcu_data, cpu)); | ||
| 411 | __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, | ||
| 412 | &per_cpu(rcu_bh_data, cpu)); | ||
| 413 | put_cpu_var(rcu_data); | ||
| 414 | put_cpu_var(rcu_bh_data); | ||
| 415 | tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu); | ||
| 416 | } | ||
| 417 | |||
| 418 | #else | ||
| 419 | |||
| 420 | static void rcu_offline_cpu(int cpu) | ||
| 421 | { | ||
| 422 | } | ||
| 423 | |||
| 424 | #endif | ||
| 425 | |||
| 426 | /* | ||
| 427 | * This does the RCU processing work from tasklet context. | ||
| 428 | */ | ||
| 429 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, | ||
| 430 | struct rcu_data *rdp) | ||
| 431 | { | ||
| 432 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { | ||
| 433 | *rdp->donetail = rdp->curlist; | ||
| 434 | rdp->donetail = rdp->curtail; | ||
| 435 | rdp->curlist = NULL; | ||
| 436 | rdp->curtail = &rdp->curlist; | ||
| 437 | } | ||
| 438 | |||
| 439 | if (rdp->nxtlist && !rdp->curlist) { | ||
| 440 | local_irq_disable(); | ||
| 441 | rdp->curlist = rdp->nxtlist; | ||
| 442 | rdp->curtail = rdp->nxttail; | ||
| 443 | rdp->nxtlist = NULL; | ||
| 444 | rdp->nxttail = &rdp->nxtlist; | ||
| 445 | local_irq_enable(); | ||
| 446 | |||
| 447 | /* | ||
| 448 | * start the next batch of callbacks | ||
| 449 | */ | ||
| 450 | |||
| 451 | /* determine batch number */ | ||
| 452 | rdp->batch = rcp->cur + 1; | ||
| 453 | /* see the comment and corresponding wmb() in | ||
| 454 | * rcu_start_batch() | ||
| 455 | */ | ||
| 456 | smp_rmb(); | ||
| 457 | |||
| 458 | if (!rcp->next_pending) { | ||
| 459 | /* and start it/schedule start if it's a new batch */ | ||
| 460 | spin_lock(&rcp->lock); | ||
| 461 | rcp->next_pending = 1; | ||
| 462 | rcu_start_batch(rcp); | ||
| 463 | spin_unlock(&rcp->lock); | ||
| 464 | } | ||
| 465 | } | ||
| 466 | |||
| 467 | rcu_check_quiescent_state(rcp, rdp); | ||
| 468 | if (rdp->donelist) | ||
| 469 | rcu_do_batch(rdp); | ||
| 470 | } | ||
| 471 | |||
| 472 | static void rcu_process_callbacks(unsigned long unused) | ||
| 473 | { | ||
| 474 | __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); | ||
| 475 | __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); | ||
| 476 | } | ||
| 477 | |||
| 478 | static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
| 479 | { | ||
| 480 | /* This cpu has pending rcu entries and the grace period | ||
| 481 | * for them has completed. | ||
| 482 | */ | ||
| 483 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) | ||
| 484 | return 1; | ||
| 485 | |||
| 486 | /* This cpu has no pending entries, but there are new entries */ | ||
| 487 | if (!rdp->curlist && rdp->nxtlist) | ||
| 488 | return 1; | ||
| 489 | |||
| 490 | /* This cpu has finished callbacks to invoke */ | ||
| 491 | if (rdp->donelist) | ||
| 492 | return 1; | ||
| 493 | |||
| 494 | /* The rcu core waits for a quiescent state from the cpu */ | ||
| 495 | if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) | ||
| 496 | return 1; | ||
| 497 | |||
| 498 | /* nothing to do */ | ||
| 499 | return 0; | ||
| 500 | } | ||
| 501 | |||
| 502 | /* | ||
| 503 | * Check to see if there is any immediate RCU-related work to be done | ||
| 504 | * by the current CPU, returning 1 if so. This function is part of the | ||
| 505 | * RCU implementation; it is -not- an exported member of the RCU API. | ||
| 506 | */ | ||
| 507 | int rcu_pending(int cpu) | ||
| 508 | { | ||
| 509 | return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || | ||
| 510 | __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); | ||
| 511 | } | ||
| 512 | |||
| 513 | /* | ||
| 514 | * Check to see if any future RCU-related work will need to be done | ||
| 515 | * by the current CPU, even if none need be done immediately, returning | ||
| 516 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
| 517 | * an exported member of the RCU API. | ||
| 518 | */ | ||
| 519 | int rcu_needs_cpu(int cpu) | ||
| 520 | { | ||
| 521 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
| 522 | struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); | ||
| 523 | |||
| 524 | return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); | ||
| 525 | } | ||
| 526 | |||
| 527 | void rcu_check_callbacks(int cpu, int user) | ||
| 528 | { | ||
| 529 | if (user || | ||
| 530 | (idle_cpu(cpu) && !in_softirq() && | ||
| 531 | hardirq_count() <= (1 << HARDIRQ_SHIFT))) { | ||
| 532 | rcu_qsctr_inc(cpu); | ||
| 533 | rcu_bh_qsctr_inc(cpu); | ||
| 534 | } else if (!in_softirq()) | ||
| 535 | rcu_bh_qsctr_inc(cpu); | ||
| 536 | tasklet_schedule(&per_cpu(rcu_tasklet, cpu)); | ||
| 537 | } | ||
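For context, rcu_pending() and rcu_check_callbacks() are driven from the per-CPU scheduler tick. The sketch below is a rough paraphrase of that caller in kernel/timer.c of this era, with unrelated work elided; it is not part of this diff and the elided calls are only indicated by comments.

```c
/* Rough paraphrase of the tick-side driver (kernel/timer.c), heavily
 * elided: each tick polls rcu_pending() and, if there is RCU work,
 * calls rcu_check_callbacks(), which records quiescent states and
 * schedules the per-CPU tasklet running rcu_process_callbacks(). */
void update_process_times(int user_tick)
{
	int cpu = smp_processor_id();

	/* ... timer and accounting work elided ... */
	if (rcu_pending(cpu))
		rcu_check_callbacks(cpu, user_tick);
	/* ... scheduler_tick() and friends elided ... */
}
```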
| 538 | |||
| 539 | static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, | ||
| 540 | struct rcu_data *rdp) | ||
| 541 | { | ||
| 542 | memset(rdp, 0, sizeof(*rdp)); | ||
| 543 | rdp->curtail = &rdp->curlist; | ||
| 544 | rdp->nxttail = &rdp->nxtlist; | ||
| 545 | rdp->donetail = &rdp->donelist; | ||
| 546 | rdp->quiescbatch = rcp->completed; | ||
| 547 | rdp->qs_pending = 0; | ||
| 548 | rdp->cpu = cpu; | ||
| 549 | rdp->blimit = blimit; | ||
| 550 | } | ||
| 551 | |||
| 552 | static void __cpuinit rcu_online_cpu(int cpu) | ||
| 553 | { | ||
| 554 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
| 555 | struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); | ||
| 556 | |||
| 557 | rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); | ||
| 558 | rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); | ||
| 559 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); | ||
| 560 | } | ||
| 561 | |||
| 562 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
| 563 | unsigned long action, void *hcpu) | ||
| 564 | { | ||
| 565 | long cpu = (long)hcpu; | ||
| 566 | switch (action) { | ||
| 567 | case CPU_UP_PREPARE: | ||
| 568 | case CPU_UP_PREPARE_FROZEN: | ||
| 569 | rcu_online_cpu(cpu); | ||
| 570 | break; | ||
| 571 | case CPU_DEAD: | ||
| 572 | case CPU_DEAD_FROZEN: | ||
| 573 | rcu_offline_cpu(cpu); | ||
| 574 | break; | ||
| 575 | default: | ||
| 576 | break; | ||
| 577 | } | ||
| 578 | return NOTIFY_OK; | ||
| 579 | } | ||
| 580 | |||
| 581 | static struct notifier_block __cpuinitdata rcu_nb = { | ||
| 582 | .notifier_call = rcu_cpu_notify, | ||
| 583 | }; | ||
| 584 | |||
| 585 | /* | ||
| 586 | * Initializes the rcu mechanism. Assumed to be called early, | ||
| 587 | * that is, before the local timer (SMP) or jiffies timer (uniprocessor) is set up. | ||
| 588 | * Note that rcu_qsctr and friends are implicitly | ||
| 589 | * initialized due to the choice of ``0'' for RCU_CTR_INVALID. | ||
| 590 | */ | ||
| 591 | void __init rcu_init(void) | 134 | void __init rcu_init(void) |
| 592 | { | 135 | { |
| 593 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, | 136 | __rcu_init(); |
| 594 | (void *)(long)smp_processor_id()); | ||
| 595 | /* Register notifier for non-boot CPUs */ | ||
| 596 | register_cpu_notifier(&rcu_nb); | ||
| 597 | } | ||
| 598 | |||
| 599 | struct rcu_synchronize { | ||
| 600 | struct rcu_head head; | ||
| 601 | struct completion completion; | ||
| 602 | }; | ||
| 603 | |||
| 604 | /* Because of FASTCALL declaration of complete, we use this wrapper */ | ||
| 605 | static void wakeme_after_rcu(struct rcu_head *head) | ||
| 606 | { | ||
| 607 | struct rcu_synchronize *rcu; | ||
| 608 | |||
| 609 | rcu = container_of(head, struct rcu_synchronize, head); | ||
| 610 | complete(&rcu->completion); | ||
| 611 | } | 137 | } |
| 612 | 138 | ||
| 613 | /** | ||
| 614 | * synchronize_rcu - wait until a grace period has elapsed. | ||
| 615 | * | ||
| 616 | * Control will return to the caller some time after a full grace | ||
| 617 | * period has elapsed, in other words after all currently executing RCU | ||
| 618 | * read-side critical sections have completed. RCU read-side critical | ||
| 619 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
| 620 | * and may be nested. | ||
| 621 | * | ||
| 622 | * If your read-side code is not protected by rcu_read_lock(), do -not- | ||
| 623 | * use synchronize_rcu(). | ||
| 624 | */ | ||
| 625 | void synchronize_rcu(void) | ||
| 626 | { | ||
| 627 | struct rcu_synchronize rcu; | ||
| 628 | |||
| 629 | init_completion(&rcu.completion); | ||
| 630 | /* Will wake me after RCU finished */ | ||
| 631 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
| 632 | |||
| 633 | /* Wait for it */ | ||
| 634 | wait_for_completion(&rcu.completion); | ||
| 635 | } | ||
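The classic update-side pattern built on synchronize_rcu() is remove, wait, then reclaim. A minimal hypothetical sketch follows (struct, lock, and function names are invented; updaters are assumed to serialize against each other with their own lock, while readers traverse the list under rcu_read_lock()):

```c
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* Hypothetical list element; readers walk the list with
 * list_for_each_entry_rcu() under rcu_read_lock(). */
struct my_node {
	struct list_head list;
	int key;
};

static DEFINE_SPINLOCK(my_lock);	/* serializes updaters */

static void my_node_del(struct my_node *np)
{
	spin_lock(&my_lock);
	list_del_rcu(&np->list);	/* unlink; readers may still see np */
	spin_unlock(&my_lock);

	synchronize_rcu();		/* wait for pre-existing readers */
	kfree(np);			/* no reader can now reference np */
}
```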
| 636 | |||
| 637 | module_param(blimit, int, 0); | ||
| 638 | module_param(qhimark, int, 0); | ||
| 639 | module_param(qlowmark, int, 0); | ||
| 640 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
| 641 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
| 642 | EXPORT_SYMBOL_GPL(call_rcu); | ||
| 643 | EXPORT_SYMBOL_GPL(call_rcu_bh); | ||
| 644 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c new file mode 100644 index 000000000000..987cfb7ade89 --- /dev/null +++ b/kernel/rcupreempt.c | |||
| @@ -0,0 +1,953 @@ | |||
| 1 | /* | ||
| 2 | * Read-Copy Update mechanism for mutual exclusion, realtime implementation | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 17 | * | ||
| 18 | * Copyright IBM Corporation, 2006 | ||
| 19 | * | ||
| 20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> | ||
| 21 | * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar | ||
| 22 | * for pushing me away from locks and towards counters, and | ||
| 23 | * to Suparna Bhattacharya for pushing me completely away | ||
| 24 | * from atomic instructions on the read side. | ||
| 25 | * | ||
| 26 | * Papers: http://www.rdrop.com/users/paulmck/RCU | ||
| 27 | * | ||
| 28 | * Design Document: http://lwn.net/Articles/253651/ | ||
| 29 | * | ||
| 30 | * For detailed explanation of Read-Copy Update mechanism see - | ||
| 31 | * Documentation/RCU/ *.txt | ||
| 32 | * | ||
| 33 | */ | ||
| 34 | #include <linux/types.h> | ||
| 35 | #include <linux/kernel.h> | ||
| 36 | #include <linux/init.h> | ||
| 37 | #include <linux/spinlock.h> | ||
| 38 | #include <linux/smp.h> | ||
| 39 | #include <linux/rcupdate.h> | ||
| 40 | #include <linux/interrupt.h> | ||
| 41 | #include <linux/sched.h> | ||
| 42 | #include <asm/atomic.h> | ||
| 43 | #include <linux/bitops.h> | ||
| 44 | #include <linux/module.h> | ||
| 45 | #include <linux/completion.h> | ||
| 46 | #include <linux/moduleparam.h> | ||
| 47 | #include <linux/percpu.h> | ||
| 48 | #include <linux/notifier.h> | ||
| 49 | #include <linux/rcupdate.h> | ||
| 50 | #include <linux/cpu.h> | ||
| 51 | #include <linux/random.h> | ||
| 52 | #include <linux/delay.h> | ||
| 53 | #include <linux/byteorder/swabb.h> | ||
| 54 | #include <linux/cpumask.h> | ||
| 55 | #include <linux/rcupreempt_trace.h> | ||
| 56 | |||
| 57 | /* | ||
| 58 | * Macro that prevents the compiler from reordering accesses, but does | ||
| 59 | * absolutely -nothing- to prevent CPUs from reordering. This is used | ||
| 60 | * only to mediate communication between mainline code and hardware | ||
| 61 | * interrupt and NMI handlers. | ||
| 62 | */ | ||
| 63 | #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) | ||
| 64 | |||
| 65 | /* | ||
| 66 | * PREEMPT_RCU data structures. | ||
| 67 | */ | ||
| 68 | |||
| 69 | /* | ||
| 70 | * GP_STAGES specifies the number of times the state machine has | ||
| 71 | * to go through all the rcu_try_flip_states (see below) | ||
| 72 | * in a single Grace Period. | ||
| 73 | * | ||
| 74 | * GP in GP_STAGES stands for Grace Period ;) | ||
| 75 | */ | ||
| 76 | #define GP_STAGES 2 | ||
| 77 | struct rcu_data { | ||
| 78 | spinlock_t lock; /* Protect rcu_data fields. */ | ||
| 79 | long completed; /* Number of last completed batch. */ | ||
| 80 | int waitlistcount; | ||
| 81 | struct tasklet_struct rcu_tasklet; | ||
| 82 | struct rcu_head *nextlist; | ||
| 83 | struct rcu_head **nexttail; | ||
| 84 | struct rcu_head *waitlist[GP_STAGES]; | ||
| 85 | struct rcu_head **waittail[GP_STAGES]; | ||
| 86 | struct rcu_head *donelist; | ||
| 87 | struct rcu_head **donetail; | ||
| 88 | long rcu_flipctr[2]; | ||
| 89 | #ifdef CONFIG_RCU_TRACE | ||
| 90 | struct rcupreempt_trace trace; | ||
| 91 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
| 92 | }; | ||
| 93 | |||
| 94 | /* | ||
| 95 | * States for rcu_try_flip() and friends. | ||
| 96 | */ | ||
| 97 | |||
| 98 | enum rcu_try_flip_states { | ||
| 99 | |||
| 100 | /* | ||
| 101 | * Stay here if nothing is happening. Flip the counter if something | ||
| 102 | * starts happening. Denoted by "I". | ||
| 103 | */ | ||
| 104 | rcu_try_flip_idle_state, | ||
| 105 | |||
| 106 | /* | ||
| 107 | * Wait here for all CPUs to notice that the counter has flipped. This | ||
| 108 | * prevents the old set of counters from ever being incremented once | ||
| 109 | * we leave this state, which in turn is necessary because we cannot | ||
| 110 | * test any individual counter for zero -- we can only check the sum. | ||
| 111 | * Denoted by "A". | ||
| 112 | */ | ||
| 113 | rcu_try_flip_waitack_state, | ||
| 114 | |||
| 115 | /* | ||
| 116 | * Wait here for the sum of the old per-CPU counters to reach zero. | ||
| 117 | * Denoted by "Z". | ||
| 118 | */ | ||
| 119 | rcu_try_flip_waitzero_state, | ||
| 120 | |||
| 121 | /* | ||
| 122 | * Wait here for each of the other CPUs to execute a memory barrier. | ||
| 123 | * This is necessary to ensure that these other CPUs really have | ||
| 124 | * completed executing their RCU read-side critical sections, despite | ||
| 125 | * their CPUs wildly reordering memory. Denoted by "M". | ||
| 126 | */ | ||
| 127 | rcu_try_flip_waitmb_state, | ||
| 128 | }; | ||
| 129 | |||
| 130 | struct rcu_ctrlblk { | ||
| 131 | spinlock_t fliplock; /* Protect state-machine transitions. */ | ||
| 132 | long completed; /* Number of last completed batch. */ | ||
| 133 | enum rcu_try_flip_states rcu_try_flip_state; /* The current state of | ||
| 134 | the rcu state machine */ | ||
| 135 | }; | ||
| 136 | |||
| 137 | static DEFINE_PER_CPU(struct rcu_data, rcu_data); | ||
| 138 | static struct rcu_ctrlblk rcu_ctrlblk = { | ||
| 139 | .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), | ||
| 140 | .completed = 0, | ||
| 141 | .rcu_try_flip_state = rcu_try_flip_idle_state, | ||
| 142 | }; | ||
| 143 | |||
| 144 | |||
| 145 | #ifdef CONFIG_RCU_TRACE | ||
| 146 | static char *rcu_try_flip_state_names[] = | ||
| 147 | { "idle", "waitack", "waitzero", "waitmb" }; | ||
| 148 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
| 149 | |||
| 150 | static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; | ||
| 151 | |||
| 152 | /* | ||
| 153 | * Enum and per-CPU flag to determine when each CPU has seen | ||
| 154 | * the most recent counter flip. | ||
| 155 | */ | ||
| 156 | |||
| 157 | enum rcu_flip_flag_values { | ||
| 158 | rcu_flip_seen, /* Steady/initial state, last flip seen. */ | ||
| 159 | /* Only GP detector can update. */ | ||
| 160 | rcu_flipped /* Flip just completed, need confirmation. */ | ||
| 161 | /* Only corresponding CPU can update. */ | ||
| 162 | }; | ||
| 163 | static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag) | ||
| 164 | = rcu_flip_seen; | ||
| 165 | |||
| 166 | /* | ||
| 167 | * Enum and per-CPU flag to determine when each CPU has executed the | ||
| 168 | * needed memory barrier to fence in memory references from its last RCU | ||
| 169 | * read-side critical section in the just-completed grace period. | ||
| 170 | */ | ||
| 171 | |||
| 172 | enum rcu_mb_flag_values { | ||
| 173 | rcu_mb_done, /* Steady/initial state, no mb()s required. */ | ||
| 174 | /* Only GP detector can update. */ | ||
| 175 | rcu_mb_needed /* Flip just completed, need an mb(). */ | ||
| 176 | /* Only corresponding CPU can update. */ | ||
| 177 | }; | ||
| 178 | static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag) | ||
| 179 | = rcu_mb_done; | ||
| 180 | |||
| 181 | /* | ||
| 182 | * RCU_DATA_ME: find the current CPU's rcu_data structure. | ||
| 183 | * RCU_DATA_CPU: find the specified CPU's rcu_data structure. | ||
| 184 | */ | ||
| 185 | #define RCU_DATA_ME() (&__get_cpu_var(rcu_data)) | ||
| 186 | #define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu)) | ||
| 187 | |||
| 188 | /* | ||
| 189 | * Helper macro for tracing when the appropriate rcu_data is not | ||
| 190 | * cached in a local variable, but where the CPU number is so cached. | ||
| 191 | */ | ||
| 192 | #define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace)); | ||
| 193 | |||
| 194 | /* | ||
| 195 | * Helper macro for tracing when the appropriate rcu_data is not | ||
| 196 | * cached in a local variable. | ||
| 197 | */ | ||
| 198 | #define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace)); | ||
| 199 | |||
| 200 | /* | ||
| 201 | * Helper macro for tracing when the appropriate rcu_data is pointed | ||
| 202 | * to by a local variable. | ||
| 203 | */ | ||
| 204 | #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace)); | ||
| 205 | |||
| 206 | /* | ||
| 207 | * Return the number of RCU batches processed thus far. Useful | ||
| 208 | * for debug and statistics. | ||
| 209 | */ | ||
| 210 | long rcu_batches_completed(void) | ||
| 211 | { | ||
| 212 | return rcu_ctrlblk.completed; | ||
| 213 | } | ||
| 214 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
| 215 | |||
| 216 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
| 217 | |||
| 218 | void __rcu_read_lock(void) | ||
| 219 | { | ||
| 220 | int idx; | ||
| 221 | struct task_struct *t = current; | ||
| 222 | int nesting; | ||
| 223 | |||
| 224 | nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
| 225 | if (nesting != 0) { | ||
| 226 | |||
| 227 | /* An earlier rcu_read_lock() covers us, just count it. */ | ||
| 228 | |||
| 229 | t->rcu_read_lock_nesting = nesting + 1; | ||
| 230 | |||
| 231 | } else { | ||
| 232 | unsigned long flags; | ||
| 233 | |||
| 234 | /* | ||
| 235 | * We disable interrupts for the following reasons: | ||
| 236 | * - If we get scheduling clock interrupt here, and we | ||
| 237 | * end up acking the counter flip, it's like a promise | ||
| 238 | * that we will never increment the old counter again. | ||
| 239 | * Thus we will break that promise if that | ||
| 240 | * scheduling clock interrupt happens between the time | ||
| 241 | * we pick the .completed field and the time that we | ||
| 242 | * increment our counter. | ||
| 243 | * | ||
| 244 | * - We don't want to be preempted out here. | ||
| 245 | * | ||
| 246 | * NMIs can still occur, of course, and might themselves | ||
| 247 | * contain rcu_read_lock(). | ||
| 248 | */ | ||
| 249 | |||
| 250 | local_irq_save(flags); | ||
| 251 | |||
| 252 | /* | ||
| 253 | * Outermost nesting of rcu_read_lock(), so increment | ||
| 254 | * the current counter for the current CPU. Use volatile | ||
| 255 | * casts to prevent the compiler from reordering. | ||
| 256 | */ | ||
| 257 | |||
| 258 | idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1; | ||
| 259 | ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++; | ||
| 260 | |||
| 261 | /* | ||
| 262 | * Now that the per-CPU counter has been incremented, we | ||
| 263 | * are protected from races with rcu_read_lock() invoked | ||
| 264 | * from NMI handlers on this CPU. We can therefore safely | ||
| 265 | * increment the nesting counter, relieving further NMIs | ||
| 266 | * of the need to increment the per-CPU counter. | ||
| 267 | */ | ||
| 268 | |||
| 269 | ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1; | ||
| 270 | |||
| 271 | /* | ||
| 272 | * Now that we have prevented any NMIs from storing | ||
| 273 | * to the ->rcu_flipctr_idx, we can safely use it to | ||
| 274 | * remember which counter to decrement in the matching | ||
| 275 | * rcu_read_unlock(). | ||
| 276 | */ | ||
| 277 | |||
| 278 | ACCESS_ONCE(t->rcu_flipctr_idx) = idx; | ||
| 279 | local_irq_restore(flags); | ||
| 280 | } | ||
| 281 | } | ||
| 282 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | ||
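__rcu_read_lock() is not called directly by RCU users; it is reached through rcu_read_lock() when CONFIG_PREEMPT_RCU is selected. A hypothetical reader of a pointer published elsewhere with rcu_assign_pointer() would look roughly like this (struct and variable names are invented for illustration):

```c
#include <linux/rcupdate.h>

/* Hypothetical configuration blob published via rcu_assign_pointer(). */
struct my_cfg {
	int threshold;
};

static struct my_cfg *my_cfg_ptr;

/* Reader side: with CONFIG_PREEMPT_RCU, rcu_read_lock()/rcu_read_unlock()
 * resolve to __rcu_read_lock()/__rcu_read_unlock(), so this section may
 * be preempted and is still tracked by the flip counters. */
static int my_read_threshold(void)
{
	struct my_cfg *cfg;
	int val = -1;

	rcu_read_lock();
	cfg = rcu_dereference(my_cfg_ptr);
	if (cfg)
		val = cfg->threshold;
	rcu_read_unlock();

	return val;
}
```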
| 283 | |||
| 284 | void __rcu_read_unlock(void) | ||
| 285 | { | ||
| 286 | int idx; | ||
| 287 | struct task_struct *t = current; | ||
| 288 | int nesting; | ||
| 289 | |||
| 290 | nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
| 291 | if (nesting > 1) { | ||
| 292 | |||
| 293 | /* | ||
| 294 | * We are still protected by the enclosing rcu_read_lock(), | ||
| 295 | * so simply decrement the counter. | ||
| 296 | */ | ||
| 297 | |||
| 298 | t->rcu_read_lock_nesting = nesting - 1; | ||
| 299 | |||
| 300 | } else { | ||
| 301 | unsigned long flags; | ||
| 302 | |||
| 303 | /* | ||
| 304 | * Disable local interrupts to prevent the grace-period | ||
| 305 | * detection state machine from seeing us half-done. | ||
| 306 | * NMIs can still occur, of course, and might themselves | ||
| 307 | * contain rcu_read_lock() and rcu_read_unlock(). | ||
| 308 | */ | ||
| 309 | |||
| 310 | local_irq_save(flags); | ||
| 311 | |||
| 312 | /* | ||
| 313 | * Outermost nesting of rcu_read_unlock(), so we must | ||
| 314 | * decrement the current counter for the current CPU. | ||
| 315 | * This must be done carefully, because NMIs can | ||
| 316 | * occur at any point in this code, and any rcu_read_lock() | ||
| 317 | * and rcu_read_unlock() pairs in the NMI handlers | ||
| 318 | * must interact non-destructively with this code. | ||
| 319 | * Lots of volatile casts, and -very- careful ordering. | ||
| 320 | * | ||
| 321 | * Changes to this code, including this one, must be | ||
| 322 | * inspected, validated, and tested extremely carefully!!! | ||
| 323 | */ | ||
| 324 | |||
| 325 | /* | ||
| 326 | * First, pick up the index. | ||
| 327 | */ | ||
| 328 | |||
| 329 | idx = ACCESS_ONCE(t->rcu_flipctr_idx); | ||
| 330 | |||
| 331 | /* | ||
| 332 | * Now that we have fetched the counter index, it is | ||
| 333 | * safe to decrement the per-task RCU nesting counter. | ||
| 334 | * After this, any interrupts or NMIs will increment and | ||
| 335 | * decrement the per-CPU counters. | ||
| 336 | */ | ||
| 337 | ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1; | ||
| 338 | |||
| 339 | /* | ||
| 340 | * It is now safe to decrement this CPU's rcu_flipctr entry. | ||
| 341 | * NMIs that occur after this statement will route their | ||
| 342 | * rcu_read_lock() calls through this "else" clause, and | ||
| 343 | * will thus start incrementing the per-CPU counter on | ||
| 344 | * their own. They will also clobber ->rcu_flipctr_idx, | ||
| 345 | * but that is OK, since we have already fetched it. | ||
| 346 | */ | ||
| 347 | |||
| 348 | ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--; | ||
| 349 | local_irq_restore(flags); | ||
| 350 | } | ||
| 351 | } | ||
| 352 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | ||
| 353 | |||
| 354 | /* | ||
| 355 | * If a global counter flip has occurred since the last time that we | ||
| 356 | * advanced callbacks, advance them. Hardware interrupts must be | ||
| 357 | * disabled when calling this function. | ||
| 358 | */ | ||
| 359 | static void __rcu_advance_callbacks(struct rcu_data *rdp) | ||
| 360 | { | ||
| 361 | int cpu; | ||
| 362 | int i; | ||
| 363 | int wlc = 0; | ||
| 364 | |||
| 365 | if (rdp->completed != rcu_ctrlblk.completed) { | ||
| 366 | if (rdp->waitlist[GP_STAGES - 1] != NULL) { | ||
| 367 | *rdp->donetail = rdp->waitlist[GP_STAGES - 1]; | ||
| 368 | rdp->donetail = rdp->waittail[GP_STAGES - 1]; | ||
| 369 | RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp); | ||
| 370 | } | ||
| 371 | for (i = GP_STAGES - 2; i >= 0; i--) { | ||
| 372 | if (rdp->waitlist[i] != NULL) { | ||
| 373 | rdp->waitlist[i + 1] = rdp->waitlist[i]; | ||
| 374 | rdp->waittail[i + 1] = rdp->waittail[i]; | ||
| 375 | wlc++; | ||
| 376 | } else { | ||
| 377 | rdp->waitlist[i + 1] = NULL; | ||
| 378 | rdp->waittail[i + 1] = | ||
| 379 | &rdp->waitlist[i + 1]; | ||
| 380 | } | ||
| 381 | } | ||
| 382 | if (rdp->nextlist != NULL) { | ||
| 383 | rdp->waitlist[0] = rdp->nextlist; | ||
| 384 | rdp->waittail[0] = rdp->nexttail; | ||
| 385 | wlc++; | ||
| 386 | rdp->nextlist = NULL; | ||
| 387 | rdp->nexttail = &rdp->nextlist; | ||
| 388 | RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp); | ||
| 389 | } else { | ||
| 390 | rdp->waitlist[0] = NULL; | ||
| 391 | rdp->waittail[0] = &rdp->waitlist[0]; | ||
| 392 | } | ||
| 393 | rdp->waitlistcount = wlc; | ||
| 394 | rdp->completed = rcu_ctrlblk.completed; | ||
| 395 | } | ||
| 396 | |||
| 397 | /* | ||
| 398 | * Check to see if this CPU needs to report that it has seen | ||
| 399 | * the most recent counter flip, thereby declaring that all | ||
| 400 | * subsequent rcu_read_lock() invocations will respect this flip. | ||
| 401 | */ | ||
| 402 | |||
| 403 | cpu = raw_smp_processor_id(); | ||
| 404 | if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { | ||
| 405 | smp_mb(); /* Subsequent counter accesses must see new value */ | ||
| 406 | per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; | ||
| 407 | smp_mb(); /* Subsequent RCU read-side critical sections */ | ||
| 408 | /* seen -after- acknowledgement. */ | ||
| 409 | } | ||
| 410 | } | ||
| 411 | |||
| 412 | /* | ||
| 413 | * Get here when RCU is idle. Decide whether we need to | ||
| 414 | * move out of idle state, and return non-zero if so. | ||
| 415 | * "Straightforward" approach for the moment, might later | ||
| 416 | * use callback-list lengths, grace-period duration, or | ||
| 417 | * some such to determine when to exit idle state. | ||
| 418 | * Might also need a pre-idle test that does not acquire | ||
| 419 | * the lock, but let's get the simple case working first... | ||
| 420 | */ | ||
| 421 | |||
| 422 | static int | ||
| 423 | rcu_try_flip_idle(void) | ||
| 424 | { | ||
| 425 | int cpu; | ||
| 426 | |||
| 427 | RCU_TRACE_ME(rcupreempt_trace_try_flip_i1); | ||
| 428 | if (!rcu_pending(smp_processor_id())) { | ||
| 429 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1); | ||
| 430 | return 0; | ||
| 431 | } | ||
| 432 | |||
| 433 | /* | ||
| 434 | * Do the flip. | ||
| 435 | */ | ||
| 436 | |||
| 437 | RCU_TRACE_ME(rcupreempt_trace_try_flip_g1); | ||
| 438 | rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */ | ||
| 439 | |||
| 440 | /* | ||
| 441 | * Need a memory barrier so that other CPUs see the new | ||
| 442 | * counter value before they see the subsequent change of all | ||
| 443 | * the rcu_flip_flag instances to rcu_flipped. | ||
| 444 | */ | ||
| 445 | |||
| 446 | smp_mb(); /* see above block comment. */ | ||
| 447 | |||
| 448 | /* Now ask each CPU for acknowledgement of the flip. */ | ||
| 449 | |||
| 450 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
| 451 | per_cpu(rcu_flip_flag, cpu) = rcu_flipped; | ||
| 452 | |||
| 453 | return 1; | ||
| 454 | } | ||
| 455 | |||
| 456 | /* | ||
| 457 | * Wait for CPUs to acknowledge the flip. | ||
| 458 | */ | ||
| 459 | |||
| 460 | static int | ||
| 461 | rcu_try_flip_waitack(void) | ||
| 462 | { | ||
| 463 | int cpu; | ||
| 464 | |||
| 465 | RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); | ||
| 466 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
| 467 | if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { | ||
| 468 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); | ||
| 469 | return 0; | ||
| 470 | } | ||
| 471 | |||
| 472 | /* | ||
| 473 | * Make sure our checks above don't bleed into subsequent | ||
| 474 | * waiting for the sum of the counters to reach zero. | ||
| 475 | */ | ||
| 476 | |||
| 477 | smp_mb(); /* see above block comment. */ | ||
| 478 | RCU_TRACE_ME(rcupreempt_trace_try_flip_a2); | ||
| 479 | return 1; | ||
| 480 | } | ||
| 481 | |||
| 482 | /* | ||
| 483 | * Wait for collective ``last'' counter to reach zero, | ||
| 484 | * then tell all CPUs to do an end-of-grace-period memory barrier. | ||
| 485 | */ | ||
| 486 | |||
| 487 | static int | ||
| 488 | rcu_try_flip_waitzero(void) | ||
| 489 | { | ||
| 490 | int cpu; | ||
| 491 | int lastidx = !(rcu_ctrlblk.completed & 0x1); | ||
| 492 | int sum = 0; | ||
| 493 | |||
| 494 | /* Check to see if the sum of the "last" counters is zero. */ | ||
| 495 | |||
| 496 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); | ||
| 497 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
| 498 | sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; | ||
| 499 | if (sum != 0) { | ||
| 500 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); | ||
| 501 | return 0; | ||
| 502 | } | ||
| 503 | |||
| 504 | /* | ||
| 505 | * This ensures that the other CPUs see the call for | ||
| 506 | * memory barriers -after- the zero sum has been | ||
| 507 | * detected here. | ||
| 508 | */ | ||
| 509 | smp_mb(); /* ^^^^^^^^^^^^ */ | ||
| 510 | |||
| 511 | /* Call for a memory barrier from each CPU. */ | ||
| 512 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
| 513 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; | ||
| 514 | |||
| 515 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); | ||
| 516 | return 1; | ||
| 517 | } | ||
| 518 | |||
| 519 | /* | ||
| 520 | * Wait for all CPUs to do their end-of-grace-period memory barrier. | ||
| 521 | * Return 1 once all CPUs have done so. | ||
| 522 | */ | ||
| 523 | |||
| 524 | static int | ||
| 525 | rcu_try_flip_waitmb(void) | ||
| 526 | { | ||
| 527 | int cpu; | ||
| 528 | |||
| 529 | RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); | ||
| 530 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
| 531 | if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { | ||
| 532 | RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); | ||
| 533 | return 0; | ||
| 534 | } | ||
| 535 | |||
| 536 | smp_mb(); /* Ensure that the above checks precede any following flip. */ | ||
| 537 | RCU_TRACE_ME(rcupreempt_trace_try_flip_m2); | ||
| 538 | return 1; | ||
| 539 | } | ||
| 540 | |||
| 541 | /* | ||
| 542 | * Attempt a single flip of the counters. Remember, a single flip does | ||
| 543 | * -not- constitute a grace period. Instead, the interval between | ||
| 544 | * at least GP_STAGES consecutive flips is a grace period. | ||
| 545 | * | ||
| 546 | * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation | ||
| 547 | * on a large SMP, they might want to use a hierarchical organization of | ||
| 548 | * the per-CPU-counter pairs. | ||
| 549 | */ | ||
| 550 | static void rcu_try_flip(void) | ||
| 551 | { | ||
| 552 | unsigned long flags; | ||
| 553 | |||
| 554 | RCU_TRACE_ME(rcupreempt_trace_try_flip_1); | ||
| 555 | if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) { | ||
| 556 | RCU_TRACE_ME(rcupreempt_trace_try_flip_e1); | ||
| 557 | return; | ||
| 558 | } | ||
| 559 | |||
| 560 | /* | ||
| 561 | * Take the next transition(s) through the RCU grace-period | ||
| 562 | * flip-counter state machine. | ||
| 563 | */ | ||
| 564 | |||
| 565 | switch (rcu_ctrlblk.rcu_try_flip_state) { | ||
| 566 | case rcu_try_flip_idle_state: | ||
| 567 | if (rcu_try_flip_idle()) | ||
| 568 | rcu_ctrlblk.rcu_try_flip_state = | ||
| 569 | rcu_try_flip_waitack_state; | ||
| 570 | break; | ||
| 571 | case rcu_try_flip_waitack_state: | ||
| 572 | if (rcu_try_flip_waitack()) | ||
| 573 | rcu_ctrlblk.rcu_try_flip_state = | ||
| 574 | rcu_try_flip_waitzero_state; | ||
| 575 | break; | ||
| 576 | case rcu_try_flip_waitzero_state: | ||
| 577 | if (rcu_try_flip_waitzero()) | ||
| 578 | rcu_ctrlblk.rcu_try_flip_state = | ||
| 579 | rcu_try_flip_waitmb_state; | ||
| 580 | break; | ||
| 581 | case rcu_try_flip_waitmb_state: | ||
| 582 | if (rcu_try_flip_waitmb()) | ||
| 583 | rcu_ctrlblk.rcu_try_flip_state = | ||
| 584 | rcu_try_flip_idle_state; | ||
| 585 | } | ||
| 586 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
| 587 | } | ||
| 588 | |||
| 589 | /* | ||
| 590 | * Check to see if this CPU needs to do a memory barrier in order to | ||
| 591 | * ensure that any prior RCU read-side critical sections have committed | ||
| 592 | * their counter manipulations and critical-section memory references | ||
| 593 | * before declaring the grace period to be completed. | ||
| 594 | */ | ||
| 595 | static void rcu_check_mb(int cpu) | ||
| 596 | { | ||
| 597 | if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) { | ||
| 598 | smp_mb(); /* Ensure RCU read-side accesses are visible. */ | ||
| 599 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_done; | ||
| 600 | } | ||
| 601 | } | ||
| 602 | |||
| 603 | void rcu_check_callbacks(int cpu, int user) | ||
| 604 | { | ||
| 605 | unsigned long flags; | ||
| 606 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
| 607 | |||
| 608 | rcu_check_mb(cpu); | ||
| 609 | if (rcu_ctrlblk.completed == rdp->completed) | ||
| 610 | rcu_try_flip(); | ||
| 611 | spin_lock_irqsave(&rdp->lock, flags); | ||
| 612 | RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); | ||
| 613 | __rcu_advance_callbacks(rdp); | ||
| 614 | if (rdp->donelist == NULL) { | ||
| 615 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
| 616 | } else { | ||
| 617 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
| 618 | raise_softirq(RCU_SOFTIRQ); | ||
| 619 | } | ||
| 620 | } | ||
| 621 | |||
| 622 | /* | ||
| 623 | * Needed by dynticks, to make sure all RCU processing has finished | ||
| 624 | * when we go idle: | ||
| 625 | */ | ||
| 626 | void rcu_advance_callbacks(int cpu, int user) | ||
| 627 | { | ||
| 628 | unsigned long flags; | ||
| 629 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
| 630 | |||
| 631 | if (rcu_ctrlblk.completed == rdp->completed) { | ||
| 632 | rcu_try_flip(); | ||
| 633 | if (rcu_ctrlblk.completed == rdp->completed) | ||
| 634 | return; | ||
| 635 | } | ||
| 636 | spin_lock_irqsave(&rdp->lock, flags); | ||
| 637 | RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); | ||
| 638 | __rcu_advance_callbacks(rdp); | ||
| 639 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
| 640 | } | ||
| 641 | |||
| 642 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 643 | #define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \ | ||
| 644 | *dsttail = srclist; \ | ||
| 645 | if (srclist != NULL) { \ | ||
| 646 | dsttail = srctail; \ | ||
| 647 | srclist = NULL; \ | ||
| 648 | srctail = &srclist;\ | ||
| 649 | } \ | ||
| 650 | } while (0) | ||
| 651 | |||
| 652 | void rcu_offline_cpu(int cpu) | ||
| 653 | { | ||
| 654 | int i; | ||
| 655 | struct rcu_head *list = NULL; | ||
| 656 | unsigned long flags; | ||
| 657 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
| 658 | struct rcu_head **tail = &list; | ||
| 659 | |||
| 660 | /* | ||
| 661 | * Remove all callbacks from the newly dead CPU, retaining order. | ||
| 662 | * Otherwise rcu_barrier() will fail. | ||
| 663 | */ | ||
| 664 | |||
| 665 | spin_lock_irqsave(&rdp->lock, flags); | ||
| 666 | rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail); | ||
| 667 | for (i = GP_STAGES - 1; i >= 0; i--) | ||
| 668 | rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], | ||
| 669 | list, tail); | ||
| 670 | rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); | ||
| 671 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
| 672 | rdp->waitlistcount = 0; | ||
| 673 | |||
| 674 | /* Disengage the newly dead CPU from the grace-period computation. */ | ||
| 675 | |||
| 676 | spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); | ||
| 677 | rcu_check_mb(cpu); | ||
| 678 | if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { | ||
| 679 | smp_mb(); /* Subsequent counter accesses must see new value */ | ||
| 680 | per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; | ||
| 681 | smp_mb(); /* Subsequent RCU read-side critical sections */ | ||
| 682 | /* seen -after- acknowledgement. */ | ||
| 683 | } | ||
| 684 | |||
| 685 | RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0]; | ||
| 686 | RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1]; | ||
| 687 | |||
| 688 | RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; | ||
| 689 | RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; | ||
| 690 | |||
| 691 | cpu_clear(cpu, rcu_cpu_online_map); | ||
| 692 | |||
| 693 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
| 694 | |||
| 695 | /* | ||
| 696 | * Place the removed callbacks on the current CPU's queue. | ||
| 697 | * Make them all start a new grace period: simple approach, | ||
| 698 | * in theory could starve a given set of callbacks, but | ||
| 699 | * you would need to be doing some serious CPU hotplugging | ||
| 700 | * to make this happen. If this becomes a problem, adding | ||
| 701 | * a synchronize_rcu() to the hotplug path would be a simple | ||
| 702 | * fix. | ||
| 703 | */ | ||
| 704 | |||
| 705 | rdp = RCU_DATA_ME(); | ||
| 706 | spin_lock_irqsave(&rdp->lock, flags); | ||
| 707 | *rdp->nexttail = list; | ||
| 708 | if (list) | ||
| 709 | rdp->nexttail = tail; | ||
| 710 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
| 711 | } | ||
| 712 | |||
| 713 | void __devinit rcu_online_cpu(int cpu) | ||
| 714 | { | ||
| 715 | unsigned long flags; | ||
| 716 | |||
| 717 | spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); | ||
| 718 | cpu_set(cpu, rcu_cpu_online_map); | ||
| 719 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
| 720 | } | ||
| 721 | |||
| 722 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 723 | |||
| 724 | void rcu_offline_cpu(int cpu) | ||
| 725 | { | ||
| 726 | } | ||
| 727 | |||
| 728 | void __devinit rcu_online_cpu(int cpu) | ||
| 729 | { | ||
| 730 | } | ||
| 731 | |||
| 732 | #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 733 | |||
| 734 | static void rcu_process_callbacks(struct softirq_action *unused) | ||
| 735 | { | ||
| 736 | unsigned long flags; | ||
| 737 | struct rcu_head *next, *list; | ||
| 738 | struct rcu_data *rdp = RCU_DATA_ME(); | ||
| 739 | |||
| 740 | spin_lock_irqsave(&rdp->lock, flags); | ||
| 741 | list = rdp->donelist; | ||
| 742 | if (list == NULL) { | ||
| 743 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
| 744 | return; | ||
| 745 | } | ||
| 746 | rdp->donelist = NULL; | ||
| 747 | rdp->donetail = &rdp->donelist; | ||
| 748 | RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp); | ||
| 749 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
| 750 | while (list) { | ||
| 751 | next = list->next; | ||
| 752 | list->func(list); | ||
| 753 | list = next; | ||
| 754 | RCU_TRACE_ME(rcupreempt_trace_invoke); | ||
| 755 | } | ||
| 756 | } | ||
| 757 | |||
| 758 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
| 759 | { | ||
| 760 | unsigned long flags; | ||
| 761 | struct rcu_data *rdp; | ||
| 762 | |||
| 763 | head->func = func; | ||
| 764 | head->next = NULL; | ||
| 765 | local_irq_save(flags); | ||
| 766 | rdp = RCU_DATA_ME(); | ||
| 767 | spin_lock(&rdp->lock); | ||
| 768 | __rcu_advance_callbacks(rdp); | ||
| 769 | *rdp->nexttail = head; | ||
| 770 | rdp->nexttail = &head->next; | ||
| 771 | RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); | ||
| 772 | spin_unlock(&rdp->lock); | ||
| 773 | local_irq_restore(flags); | ||
| 774 | } | ||
| 775 | EXPORT_SYMBOL_GPL(call_rcu); | ||
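A hypothetical update-side counterpart to the reader sketched earlier: publish the new object with rcu_assign_pointer() and hand the old one to call_rcu() for deferred reclamation. Names are invented, and updaters are assumed to be serialized by an external lock held by the caller.

```c
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Hypothetical config object paired with the reader sketch above. */
struct my_cfg {
	int threshold;
	struct rcu_head rcu;
};

static struct my_cfg *my_cfg_ptr;

static void my_cfg_reclaim(struct rcu_head *head)
{
	kfree(container_of(head, struct my_cfg, rcu));
}

static int my_update_threshold(int val)
{
	struct my_cfg *new_cfg, *old_cfg;

	new_cfg = kmalloc(sizeof(*new_cfg), GFP_KERNEL);
	if (!new_cfg)
		return -ENOMEM;
	new_cfg->threshold = val;

	old_cfg = my_cfg_ptr;			/* updater lock held by caller */
	rcu_assign_pointer(my_cfg_ptr, new_cfg);/* publish to readers */
	if (old_cfg)
		call_rcu(&old_cfg->rcu, my_cfg_reclaim);
	return 0;
}
```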
| 776 | |||
| 777 | /* | ||
| 778 | * Wait until all currently running preempt_disable() code segments | ||
| 779 | * (including hardware-irq-disable segments) complete. Note that | ||
| 780 | * in -rt this does -not- necessarily result in all currently executing | ||
| 781 | * interrupt -handlers- having completed. | ||
| 782 | */ | ||
| 783 | void __synchronize_sched(void) | ||
| 784 | { | ||
| 785 | cpumask_t oldmask; | ||
| 786 | int cpu; | ||
| 787 | |||
| 788 | if (sched_getaffinity(0, &oldmask) < 0) | ||
| 789 | oldmask = cpu_possible_map; | ||
| 790 | for_each_online_cpu(cpu) { | ||
| 791 | sched_setaffinity(0, cpumask_of_cpu(cpu)); | ||
| 792 | schedule(); | ||
| 793 | } | ||
| 794 | sched_setaffinity(0, oldmask); | ||
| 795 | } | ||
| 796 | EXPORT_SYMBOL_GPL(__synchronize_sched); | ||
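A rough illustration of what __synchronize_sched() is for, normally reached through the synchronize_sched() wrapper: retiring an element that readers traverse with only preemption or interrupts disabled. All names below are hypothetical and not part of this patch.

```c
#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Hypothetical element of a table whose readers rely only on
 * preempt_disable()/local_irq_disable() for protection. */
struct my_sched_entry {
	struct my_sched_entry *next;
	int key;
};

static void my_retire_entry(struct my_sched_entry **prevp,
			    struct my_sched_entry *e)
{
	*prevp = e->next;	/* unlink; current readers may still hold e */
	synchronize_sched();	/* all preempt-disabled sections have ended */
	kfree(e);		/* nothing can reference e any more */
}
```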
| 797 | |||
| 798 | /* | ||
| 799 | * Check to see if any future RCU-related work will need to be done | ||
| 800 | * by the current CPU, even if none need be done immediately, returning | ||
| 801 | * 1 if so. Assumes that notifiers would take care of handling any | ||
| 802 | * outstanding requests from the RCU core. | ||
| 803 | * | ||
| 804 | * This function is part of the RCU implementation; it is -not- | ||
| 805 | * an exported member of the RCU API. | ||
| 806 | */ | ||
| 807 | int rcu_needs_cpu(int cpu) | ||
| 808 | { | ||
| 809 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
| 810 | |||
| 811 | return (rdp->donelist != NULL || | ||
| 812 | !!rdp->waitlistcount || | ||
| 813 | rdp->nextlist != NULL); | ||
| 814 | } | ||
| 815 | |||
| 816 | int rcu_pending(int cpu) | ||
| 817 | { | ||
| 818 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
| 819 | |||
| 820 | /* The CPU has at least one callback queued somewhere. */ | ||
| 821 | |||
| 822 | if (rdp->donelist != NULL || | ||
| 823 | !!rdp->waitlistcount || | ||
| 824 | rdp->nextlist != NULL) | ||
| 825 | return 1; | ||
| 826 | |||
| 827 | /* The RCU core needs an acknowledgement from this CPU. */ | ||
| 828 | |||
| 829 | if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) || | ||
| 830 | (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed)) | ||
| 831 | return 1; | ||
| 832 | |||
| 833 | /* This CPU has fallen behind the global grace-period number. */ | ||
| 834 | |||
| 835 | if (rdp->completed != rcu_ctrlblk.completed) | ||
| 836 | return 1; | ||
| 837 | |||
| 838 | /* Nothing needed from this CPU. */ | ||
| 839 | |||
| 840 | return 0; | ||
| 841 | } | ||
| 842 | |||
| 843 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
| 844 | unsigned long action, void *hcpu) | ||
| 845 | { | ||
| 846 | long cpu = (long)hcpu; | ||
| 847 | |||
| 848 | switch (action) { | ||
| 849 | case CPU_UP_PREPARE: | ||
| 850 | case CPU_UP_PREPARE_FROZEN: | ||
| 851 | rcu_online_cpu(cpu); | ||
| 852 | break; | ||
| 853 | case CPU_UP_CANCELED: | ||
| 854 | case CPU_UP_CANCELED_FROZEN: | ||
| 855 | case CPU_DEAD: | ||
| 856 | case CPU_DEAD_FROZEN: | ||
| 857 | rcu_offline_cpu(cpu); | ||
| 858 | break; | ||
| 859 | default: | ||
| 860 | break; | ||
| 861 | } | ||
| 862 | return NOTIFY_OK; | ||
| 863 | } | ||
| 864 | |||
| 865 | static struct notifier_block __cpuinitdata rcu_nb = { | ||
| 866 | .notifier_call = rcu_cpu_notify, | ||
| 867 | }; | ||
| 868 | |||
| 869 | void __init __rcu_init(void) | ||
| 870 | { | ||
| 871 | int cpu; | ||
| 872 | int i; | ||
| 873 | struct rcu_data *rdp; | ||
| 874 | |||
| 875 | printk(KERN_NOTICE "Preemptible RCU implementation.\n"); | ||
| 876 | for_each_possible_cpu(cpu) { | ||
| 877 | rdp = RCU_DATA_CPU(cpu); | ||
| 878 | spin_lock_init(&rdp->lock); | ||
| 879 | rdp->completed = 0; | ||
| 880 | rdp->waitlistcount = 0; | ||
| 881 | rdp->nextlist = NULL; | ||
| 882 | rdp->nexttail = &rdp->nextlist; | ||
| 883 | for (i = 0; i < GP_STAGES; i++) { | ||
| 884 | rdp->waitlist[i] = NULL; | ||
| 885 | rdp->waittail[i] = &rdp->waitlist[i]; | ||
| 886 | } | ||
| 887 | rdp->donelist = NULL; | ||
| 888 | rdp->donetail = &rdp->donelist; | ||
| 889 | rdp->rcu_flipctr[0] = 0; | ||
| 890 | rdp->rcu_flipctr[1] = 0; | ||
| 891 | } | ||
| 892 | register_cpu_notifier(&rcu_nb); | ||
| 893 | |||
| 894 | /* | ||
| 895 | * We don't need protection against CPU-Hotplug here | ||
| 896 | * since | ||
| 897 | * a) If a CPU comes online while we are iterating over the | ||
| 898 | * cpu_online_map below, we would only end up making a | ||
| 899 | * duplicate call to rcu_online_cpu() which sets the corresponding | ||
| 900 | * CPU's mask in the rcu_cpu_online_map. | ||
| 901 | * | ||
| 902 | * b) A CPU cannot go offline at this point in time since the user | ||
| 903 | * does not have access to the sysfs interface, nor do we | ||
| 904 | * suspend the system. | ||
| 905 | */ | ||
| 906 | for_each_online_cpu(cpu) | ||
| 907 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu); | ||
| 908 | |||
| 909 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); | ||
| 910 | } | ||
| 911 | |||
| 912 | /* | ||
| 913 | * Deprecated, use synchronize_rcu() or synchronize_sched() instead. | ||
| 914 | */ | ||
| 915 | void synchronize_kernel(void) | ||
| 916 | { | ||
| 917 | synchronize_rcu(); | ||
| 918 | } | ||
| 919 | |||
| 920 | #ifdef CONFIG_RCU_TRACE | ||
| 921 | long *rcupreempt_flipctr(int cpu) | ||
| 922 | { | ||
| 923 | return &RCU_DATA_CPU(cpu)->rcu_flipctr[0]; | ||
| 924 | } | ||
| 925 | EXPORT_SYMBOL_GPL(rcupreempt_flipctr); | ||
| 926 | |||
| 927 | int rcupreempt_flip_flag(int cpu) | ||
| 928 | { | ||
| 929 | return per_cpu(rcu_flip_flag, cpu); | ||
| 930 | } | ||
| 931 | EXPORT_SYMBOL_GPL(rcupreempt_flip_flag); | ||
| 932 | |||
| 933 | int rcupreempt_mb_flag(int cpu) | ||
| 934 | { | ||
| 935 | return per_cpu(rcu_mb_flag, cpu); | ||
| 936 | } | ||
| 937 | EXPORT_SYMBOL_GPL(rcupreempt_mb_flag); | ||
| 938 | |||
| 939 | char *rcupreempt_try_flip_state_name(void) | ||
| 940 | { | ||
| 941 | return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state]; | ||
| 942 | } | ||
| 943 | EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name); | ||
| 944 | |||
| 945 | struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu) | ||
| 946 | { | ||
| 947 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
| 948 | |||
| 949 | return &rdp->trace; | ||
| 950 | } | ||
| 951 | EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu); | ||
| 952 | |||
| 953 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c new file mode 100644 index 000000000000..49ac4947af24 --- /dev/null +++ b/kernel/rcupreempt_trace.c | |||
| @@ -0,0 +1,330 @@ | |||
| 1 | /* | ||
| 2 | * Read-Copy Update tracing for realtime implementation | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 17 | * | ||
| 18 | * Copyright IBM Corporation, 2006 | ||
| 19 | * | ||
| 20 | * Papers: http://www.rdrop.com/users/paulmck/RCU | ||
| 21 | * | ||
| 22 | * For detailed explanation of Read-Copy Update mechanism see - | ||
| 23 | * Documentation/RCU/ *.txt | ||
| 24 | * | ||
| 25 | */ | ||
| 26 | #include <linux/types.h> | ||
| 27 | #include <linux/kernel.h> | ||
| 28 | #include <linux/init.h> | ||
| 29 | #include <linux/spinlock.h> | ||
| 30 | #include <linux/smp.h> | ||
| 31 | #include <linux/rcupdate.h> | ||
| 32 | #include <linux/interrupt.h> | ||
| 33 | #include <linux/sched.h> | ||
| 34 | #include <asm/atomic.h> | ||
| 35 | #include <linux/bitops.h> | ||
| 36 | #include <linux/module.h> | ||
| 37 | #include <linux/completion.h> | ||
| 38 | #include <linux/moduleparam.h> | ||
| 39 | #include <linux/percpu.h> | ||
| 40 | #include <linux/notifier.h> | ||
| 41 | #include <linux/rcupdate.h> | ||
| 42 | #include <linux/cpu.h> | ||
| 43 | #include <linux/mutex.h> | ||
| 44 | #include <linux/rcupreempt_trace.h> | ||
| 45 | #include <linux/debugfs.h> | ||
| 46 | |||
| 47 | static struct mutex rcupreempt_trace_mutex; | ||
| 48 | static char *rcupreempt_trace_buf; | ||
| 49 | #define RCUPREEMPT_TRACE_BUF_SIZE 4096 | ||
| 50 | |||
| 51 | void rcupreempt_trace_move2done(struct rcupreempt_trace *trace) | ||
| 52 | { | ||
| 53 | trace->done_length += trace->wait_length; | ||
| 54 | trace->done_add += trace->wait_length; | ||
| 55 | trace->wait_length = 0; | ||
| 56 | } | ||
| 57 | void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace) | ||
| 58 | { | ||
| 59 | trace->wait_length += trace->next_length; | ||
| 60 | trace->wait_add += trace->next_length; | ||
| 61 | trace->next_length = 0; | ||
| 62 | } | ||
| 63 | void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace) | ||
| 64 | { | ||
| 65 | atomic_inc(&trace->rcu_try_flip_1); | ||
| 66 | } | ||
| 67 | void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace) | ||
| 68 | { | ||
| 69 | atomic_inc(&trace->rcu_try_flip_e1); | ||
| 70 | } | ||
| 71 | void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace) | ||
| 72 | { | ||
| 73 | trace->rcu_try_flip_i1++; | ||
| 74 | } | ||
| 75 | void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace) | ||
| 76 | { | ||
| 77 | trace->rcu_try_flip_ie1++; | ||
| 78 | } | ||
| 79 | void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace) | ||
| 80 | { | ||
| 81 | trace->rcu_try_flip_g1++; | ||
| 82 | } | ||
| 83 | void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace) | ||
| 84 | { | ||
| 85 | trace->rcu_try_flip_a1++; | ||
| 86 | } | ||
| 87 | void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace) | ||
| 88 | { | ||
| 89 | trace->rcu_try_flip_ae1++; | ||
| 90 | } | ||
| 91 | void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace) | ||
| 92 | { | ||
| 93 | trace->rcu_try_flip_a2++; | ||
| 94 | } | ||
| 95 | void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace) | ||
| 96 | { | ||
| 97 | trace->rcu_try_flip_z1++; | ||
| 98 | } | ||
| 99 | void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace) | ||
| 100 | { | ||
| 101 | trace->rcu_try_flip_ze1++; | ||
| 102 | } | ||
| 103 | void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace) | ||
| 104 | { | ||
| 105 | trace->rcu_try_flip_z2++; | ||
| 106 | } | ||
| 107 | void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace) | ||
| 108 | { | ||
| 109 | trace->rcu_try_flip_m1++; | ||
| 110 | } | ||
| 111 | void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace) | ||
| 112 | { | ||
| 113 | trace->rcu_try_flip_me1++; | ||
| 114 | } | ||
| 115 | void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace) | ||
| 116 | { | ||
| 117 | trace->rcu_try_flip_m2++; | ||
| 118 | } | ||
| 119 | void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace) | ||
| 120 | { | ||
| 121 | trace->rcu_check_callbacks++; | ||
| 122 | } | ||
| 123 | void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace) | ||
| 124 | { | ||
| 125 | trace->done_remove += trace->done_length; | ||
| 126 | trace->done_length = 0; | ||
| 127 | } | ||
| 128 | void rcupreempt_trace_invoke(struct rcupreempt_trace *trace) | ||
| 129 | { | ||
| 130 | atomic_inc(&trace->done_invoked); | ||
| 131 | } | ||
| 132 | void rcupreempt_trace_next_add(struct rcupreempt_trace *trace) | ||
| 133 | { | ||
| 134 | trace->next_add++; | ||
| 135 | trace->next_length++; | ||
| 136 | } | ||
| 137 | |||
| 138 | static void rcupreempt_trace_sum(struct rcupreempt_trace *sp) | ||
| 139 | { | ||
| 140 | struct rcupreempt_trace *cp; | ||
| 141 | int cpu; | ||
| 142 | |||
| 143 | memset(sp, 0, sizeof(*sp)); | ||
| 144 | for_each_possible_cpu(cpu) { | ||
| 145 | cp = rcupreempt_trace_cpu(cpu); | ||
| 146 | sp->next_length += cp->next_length; | ||
| 147 | sp->next_add += cp->next_add; | ||
| 148 | sp->wait_length += cp->wait_length; | ||
| 149 | sp->wait_add += cp->wait_add; | ||
| 150 | sp->done_length += cp->done_length; | ||
| 151 | sp->done_add += cp->done_add; | ||
| 152 | sp->done_remove += cp->done_remove; | ||
| 153 | atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked)); | ||
| 154 | sp->rcu_check_callbacks += cp->rcu_check_callbacks; | ||
| 155 | atomic_set(&sp->rcu_try_flip_1, | ||
| 156 | atomic_read(&cp->rcu_try_flip_1)); | ||
| 157 | atomic_set(&sp->rcu_try_flip_e1, | ||
| 158 | atomic_read(&cp->rcu_try_flip_e1)); | ||
| 159 | sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; | ||
| 160 | sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; | ||
| 161 | sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; | ||
| 162 | sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1; | ||
| 163 | sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1; | ||
| 164 | sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2; | ||
| 165 | sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1; | ||
| 166 | sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1; | ||
| 167 | sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2; | ||
| 168 | sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1; | ||
| 169 | sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1; | ||
| 170 | sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2; | ||
| 171 | } | ||
| 172 | } | ||
| 173 | |||
| 174 | static ssize_t rcustats_read(struct file *filp, char __user *buffer, | ||
| 175 | size_t count, loff_t *ppos) | ||
| 176 | { | ||
| 177 | struct rcupreempt_trace trace; | ||
| 178 | ssize_t bcount; | ||
| 179 | int cnt = 0; | ||
| 180 | |||
| 181 | rcupreempt_trace_sum(&trace); | ||
| 182 | mutex_lock(&rcupreempt_trace_mutex); | ||
| 183 | cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
| 184 | "ggp=%ld rcc=%ld\n", | ||
| 185 | rcu_batches_completed(), | ||
| 186 | trace.rcu_check_callbacks); | ||
| 187 | snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
| 188 | "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n" | ||
| 189 | "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n" | ||
| 190 | "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n", | ||
| 191 | |||
| 192 | trace.next_add, trace.next_length, | ||
| 193 | trace.wait_add, trace.wait_length, | ||
| 194 | trace.done_add, trace.done_length, | ||
| 195 | trace.done_remove, atomic_read(&trace.done_invoked), | ||
| 196 | atomic_read(&trace.rcu_try_flip_1), | ||
| 197 | atomic_read(&trace.rcu_try_flip_e1), | ||
| 198 | trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1, | ||
| 199 | trace.rcu_try_flip_g1, | ||
| 200 | trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1, | ||
| 201 | trace.rcu_try_flip_a2, | ||
| 202 | trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1, | ||
| 203 | trace.rcu_try_flip_z2, | ||
| 204 | trace.rcu_try_flip_m1, trace.rcu_try_flip_me1, | ||
| 205 | trace.rcu_try_flip_m2); | ||
| 206 | bcount = simple_read_from_buffer(buffer, count, ppos, | ||
| 207 | rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); | ||
| 208 | mutex_unlock(&rcupreempt_trace_mutex); | ||
| 209 | return bcount; | ||
| 210 | } | ||
| 211 | |||
| 212 | static ssize_t rcugp_read(struct file *filp, char __user *buffer, | ||
| 213 | size_t count, loff_t *ppos) | ||
| 214 | { | ||
| 215 | long oldgp = rcu_batches_completed(); | ||
| 216 | ssize_t bcount; | ||
| 217 | |||
| 218 | mutex_lock(&rcupreempt_trace_mutex); | ||
| 219 | synchronize_rcu(); | ||
| 220 | snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE, | ||
| 221 | "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed()); | ||
| 222 | bcount = simple_read_from_buffer(buffer, count, ppos, | ||
| 223 | rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); | ||
| 224 | mutex_unlock(&rcupreempt_trace_mutex); | ||
| 225 | return bcount; | ||
| 226 | } | ||
| 227 | |||
| 228 | static ssize_t rcuctrs_read(struct file *filp, char __user *buffer, | ||
| 229 | size_t count, loff_t *ppos) | ||
| 230 | { | ||
| 231 | int cnt = 0; | ||
| 232 | int cpu; | ||
| 233 | int f = rcu_batches_completed() & 0x1; | ||
| 234 | ssize_t bcount; | ||
| 235 | |||
| 236 | mutex_lock(&rcupreempt_trace_mutex); | ||
| 237 | |||
| 238 | cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE, | ||
| 239 | "CPU last cur F M\n"); | ||
| 240 | for_each_online_cpu(cpu) { | ||
| 241 | long *flipctr = rcupreempt_flipctr(cpu); | ||
| 242 | cnt += snprintf(&rcupreempt_trace_buf[cnt], | ||
| 243 | RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
| 244 | "%3d %4ld %3ld %d %d\n", | ||
| 245 | cpu, | ||
| 246 | flipctr[!f], | ||
| 247 | flipctr[f], | ||
| 248 | rcupreempt_flip_flag(cpu), | ||
| 249 | rcupreempt_mb_flag(cpu)); | ||
| 250 | } | ||
| 251 | cnt += snprintf(&rcupreempt_trace_buf[cnt], | ||
| 252 | RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
| 253 | "ggp = %ld, state = %s\n", | ||
| 254 | rcu_batches_completed(), | ||
| 255 | rcupreempt_try_flip_state_name()); | ||
| 256 | cnt += snprintf(&rcupreempt_trace_buf[cnt], | ||
| 257 | RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
| 258 | "\n"); | ||
| 259 | bcount = simple_read_from_buffer(buffer, count, ppos, | ||
| 260 | rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); | ||
| 261 | mutex_unlock(&rcupreempt_trace_mutex); | ||
| 262 | return bcount; | ||
| 263 | } | ||
| 264 | |||
| 265 | static struct file_operations rcustats_fops = { | ||
| 266 | .owner = THIS_MODULE, | ||
| 267 | .read = rcustats_read, | ||
| 268 | }; | ||
| 269 | |||
| 270 | static struct file_operations rcugp_fops = { | ||
| 271 | .owner = THIS_MODULE, | ||
| 272 | .read = rcugp_read, | ||
| 273 | }; | ||
| 274 | |||
| 275 | static struct file_operations rcuctrs_fops = { | ||
| 276 | .owner = THIS_MODULE, | ||
| 277 | .read = rcuctrs_read, | ||
| 278 | }; | ||
| 279 | |||
| 280 | static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir; | ||
| 281 | static int rcupreempt_debugfs_init(void) | ||
| 282 | { | ||
| 283 | rcudir = debugfs_create_dir("rcu", NULL); | ||
| 284 | if (!rcudir) | ||
| 285 | goto out; | ||
| 286 | statdir = debugfs_create_file("rcustats", 0444, rcudir, | ||
| 287 | NULL, &rcustats_fops); | ||
| 288 | if (!statdir) | ||
| 289 | goto free_out; | ||
| 290 | |||
| 291 | gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); | ||
| 292 | if (!gpdir) | ||
| 293 | goto free_out; | ||
| 294 | |||
| 295 | ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir, | ||
| 296 | NULL, &rcuctrs_fops); | ||
| 297 | if (!ctrsdir) | ||
| 298 | goto free_out; | ||
| 299 | return 0; | ||
| 300 | free_out: | ||
| 301 | if (statdir) | ||
| 302 | debugfs_remove(statdir); | ||
| 303 | if (gpdir) | ||
| 304 | debugfs_remove(gpdir); | ||
| 305 | debugfs_remove(rcudir); | ||
| 306 | out: | ||
| 307 | return 1; | ||
| 308 | } | ||
| 309 | |||
| 310 | static int __init rcupreempt_trace_init(void) | ||
| 311 | { | ||
| 312 | mutex_init(&rcupreempt_trace_mutex); | ||
| 313 | rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); | ||
| 314 | if (!rcupreempt_trace_buf) | ||
| 315 | return 1; | ||
| 316 | return rcupreempt_debugfs_init(); | ||
| 317 | } | ||
| 318 | |||
| 319 | static void __exit rcupreempt_trace_cleanup(void) | ||
| 320 | { | ||
| 321 | debugfs_remove(statdir); | ||
| 322 | debugfs_remove(gpdir); | ||
| 323 | debugfs_remove(ctrsdir); | ||
| 324 | debugfs_remove(rcudir); | ||
| 325 | kfree(rcupreempt_trace_buf); | ||
| 326 | } | ||
| 327 | |||
| 328 | |||
| 329 | module_init(rcupreempt_trace_init); | ||
| 330 | module_exit(rcupreempt_trace_cleanup); | ||
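The counters accumulated above are exposed through debugfs as rcu/rcustats, rcu/rcugp and rcu/rcuctrs. As a rough usage sketch (not part of the patch), a userspace reader might poll rcustats like this, assuming debugfs is mounted at the conventional /sys/kernel/debug and the tracing option is enabled:

```c
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	/* Path assumes debugfs is mounted at /sys/kernel/debug. */
	const char *path = "/sys/kernel/debug/rcu/rcustats";
	char buf[4096];		/* same size as RCUPREEMPT_TRACE_BUF_SIZE above */
	ssize_t n;
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n < 0) {
		perror("read");
		close(fd);
		return EXIT_FAILURE;
	}
	buf[n] = '\0';
	fputs(buf, stdout);	/* dumps the na=/wa=/da= style counters */
	close(fd);
	return EXIT_SUCCESS;
}
```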
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c3e165c2318f..fd599829e72a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
| @@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void) | |||
| 726 | cpumask_t tmp_mask = CPU_MASK_ALL; | 726 | cpumask_t tmp_mask = CPU_MASK_ALL; |
| 727 | int i; | 727 | int i; |
| 728 | 728 | ||
| 729 | lock_cpu_hotplug(); | 729 | get_online_cpus(); |
| 730 | 730 | ||
| 731 | /* No point in shuffling if there is only one online CPU (ex: UP) */ | 731 | /* No point in shuffling if there is only one online CPU (ex: UP) */ |
| 732 | if (num_online_cpus() == 1) { | 732 | if (num_online_cpus() == 1) { |
| 733 | unlock_cpu_hotplug(); | 733 | put_online_cpus(); |
| 734 | return; | 734 | return; |
| 735 | } | 735 | } |
| 736 | 736 | ||
| @@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void) | |||
| 762 | else | 762 | else |
| 763 | rcu_idle_cpu--; | 763 | rcu_idle_cpu--; |
| 764 | 764 | ||
| 765 | unlock_cpu_hotplug(); | 765 | put_online_cpus(); |
| 766 | } | 766 | } |
| 767 | 767 | ||
| 768 | /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the | 768 | /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the |
diff --git a/kernel/sched.c b/kernel/sched.c index e76b11ca6df3..524285e46fa7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -22,6 +22,8 @@ | |||
| 22 | * by Peter Williams | 22 | * by Peter Williams |
| 23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith | 23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith |
| 24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri | 24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri |
| 25 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, | ||
| 26 | * Thomas Gleixner, Mike Kravetz | ||
| 25 | */ | 27 | */ |
| 26 | 28 | ||
| 27 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
| @@ -63,6 +65,7 @@ | |||
| 63 | #include <linux/reciprocal_div.h> | 65 | #include <linux/reciprocal_div.h> |
| 64 | #include <linux/unistd.h> | 66 | #include <linux/unistd.h> |
| 65 | #include <linux/pagemap.h> | 67 | #include <linux/pagemap.h> |
| 68 | #include <linux/hrtimer.h> | ||
| 66 | 69 | ||
| 67 | #include <asm/tlb.h> | 70 | #include <asm/tlb.h> |
| 68 | #include <asm/irq_regs.h> | 71 | #include <asm/irq_regs.h> |
| @@ -96,10 +99,9 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
| 96 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | 99 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) |
| 97 | 100 | ||
| 98 | /* | 101 | /* |
| 99 | * Some helpers for converting nanosecond timing to jiffy resolution | 102 | * Helpers for converting nanosecond timing to jiffy resolution |
| 100 | */ | 103 | */ |
| 101 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | 104 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) |
| 102 | #define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ)) | ||
| 103 | 105 | ||
| 104 | #define NICE_0_LOAD SCHED_LOAD_SCALE | 106 | #define NICE_0_LOAD SCHED_LOAD_SCALE |
| 105 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | 107 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT |
| @@ -159,6 +161,8 @@ struct rt_prio_array { | |||
| 159 | 161 | ||
| 160 | struct cfs_rq; | 162 | struct cfs_rq; |
| 161 | 163 | ||
| 164 | static LIST_HEAD(task_groups); | ||
| 165 | |||
| 162 | /* task group related information */ | 166 | /* task group related information */ |
| 163 | struct task_group { | 167 | struct task_group { |
| 164 | #ifdef CONFIG_FAIR_CGROUP_SCHED | 168 | #ifdef CONFIG_FAIR_CGROUP_SCHED |
| @@ -168,10 +172,50 @@ struct task_group { | |||
| 168 | struct sched_entity **se; | 172 | struct sched_entity **se; |
| 169 | /* runqueue "owned" by this group on each cpu */ | 173 | /* runqueue "owned" by this group on each cpu */ |
| 170 | struct cfs_rq **cfs_rq; | 174 | struct cfs_rq **cfs_rq; |
| 175 | |||
| 176 | struct sched_rt_entity **rt_se; | ||
| 177 | struct rt_rq **rt_rq; | ||
| 178 | |||
| 179 | unsigned int rt_ratio; | ||
| 180 | |||
| 181 | /* | ||
| 182 | * The shares assigned to a task group govern how much of the cpu | ||
| 183 | * bandwidth is allocated to the group. The more shares a group has, | ||
| 184 | * the more cpu bandwidth is allocated to it. | ||
| 185 | * | ||
| 186 | * For example, let's say that there are three task groups, A, B and C which | ||
| 187 | * have been assigned shares 1000, 2000 and 3000 respectively. Then, | ||
| 188 | * cpu bandwidth allocated by the scheduler to task groups A, B and C | ||
| 189 | * should be: | ||
| 190 | * | ||
| 191 | * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% | ||
| 192 | * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% | ||
| 193 | * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% | ||
| 194 | * | ||
| 195 | * The weight assigned to a task group's schedulable entities on every | ||
| 196 | * cpu (task_group.se[a_cpu]->load.weight) is derived from the task | ||
| 197 | * group's shares. For example: let's say that task group A has been | ||
| 198 | * assigned shares of 1000 and there are two CPUs in a system. Then, | ||
| 199 | * | ||
| 200 | * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; | ||
| 201 | * | ||
| 202 | * Note: It's not necessary that each of a task group's schedulable | ||
| 203 | * entities has the same weight on all CPUs. If the group | ||
| 204 | * has 2 of its tasks on CPU0 and 1 task on CPU1, then a | ||
| 205 | * better distribution of weight could be: | ||
| 206 | * | ||
| 207 | * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 | ||
| 208 | * tg_A->se[1]->load.weight = 1/3 * 2000 = 667 | ||
| 209 | * | ||
| 210 | * rebalance_shares() is responsible for distributing the shares of a | ||
| 211 | * task group like this among the group's schedulable entities across | ||
| 212 | * cpus. | ||
| 213 | * | ||
| 214 | */ | ||
| 171 | unsigned long shares; | 215 | unsigned long shares; |
| 172 | /* spinlock to serialize modification to shares */ | 216 | |
| 173 | spinlock_t lock; | ||
| 174 | struct rcu_head rcu; | 217 | struct rcu_head rcu; |
| 218 | struct list_head list; | ||
| 175 | }; | 219 | }; |
| 176 | 220 | ||
| 177 | /* Default task group's sched entity on each cpu */ | 221 | /* Default task group's sched entity on each cpu */ |
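The bandwidth split described in the comment above is plain proportional arithmetic. A minimal userspace sketch (illustrative only, reusing the share values from the comment's example rather than any kernel state) reproduces the Bw() percentages and the 2-task/1-task per-CPU weight split:

```c
#include <stdio.h>

int main(void)
{
	/* Share values from the example in the comment: groups A, B, C. */
	const double shares[] = { 1000, 2000, 3000 };
	double total = 0;
	int i;

	for (i = 0; i < 3; i++)
		total += shares[i];

	/* Bw(X) = shares(X) / sum(shares) * 100 */
	for (i = 0; i < 3; i++)
		printf("Bw(%c) = %.2f%%\n", 'A' + i,
		       100.0 * shares[i] / total);

	/*
	 * Group A: shares = 1000 on a 2-CPU system, so its group-wide
	 * weight budget is 2 * 1000 = 2000.  With 2 tasks on CPU0 and
	 * 1 task on CPU1, rebalancing would split that budget 2/3 : 1/3.
	 */
	printf("se[0].weight ~= %.0f, se[1].weight ~= %.0f\n",
	       2.0 / 3.0 * 2000.0, 1.0 / 3.0 * 2000.0);
	return 0;
}
```

This prints roughly 16.67% / 33.33% / 50% for the three groups and 1333 / 667 for group A's per-CPU weights, matching the numbers worked out in the comment.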
| @@ -179,24 +223,51 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | |||
| 179 | /* Default task group's cfs_rq on each cpu */ | 223 | /* Default task group's cfs_rq on each cpu */ |
| 180 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 224 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
| 181 | 225 | ||
| 226 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | ||
| 227 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | ||
| 228 | |||
| 182 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; | 229 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; |
| 183 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; | 230 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; |
| 184 | 231 | ||
| 232 | static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; | ||
| 233 | static struct rt_rq *init_rt_rq_p[NR_CPUS]; | ||
| 234 | |||
| 235 | /* task_group_mutex serializes add/remove of task groups and also changes to | ||
| 236 | * a task group's cpu shares. | ||
| 237 | */ | ||
| 238 | static DEFINE_MUTEX(task_group_mutex); | ||
| 239 | |||
| 240 | /* doms_cur_mutex serializes access to doms_cur[] array */ | ||
| 241 | static DEFINE_MUTEX(doms_cur_mutex); | ||
| 242 | |||
| 243 | #ifdef CONFIG_SMP | ||
| 244 | /* kernel thread that runs rebalance_shares() periodically */ | ||
| 245 | static struct task_struct *lb_monitor_task; | ||
| 246 | static int load_balance_monitor(void *unused); | ||
| 247 | #endif | ||
| 248 | |||
| 249 | static void set_se_shares(struct sched_entity *se, unsigned long shares); | ||
| 250 | |||
| 185 | /* Default task group. | 251 | /* Default task group. |
| 186 | * Every task in the system belongs to this group at bootup. | 252 | * Every task in the system belongs to this group at bootup.
| 187 | */ | 253 | */ |
| 188 | struct task_group init_task_group = { | 254 | struct task_group init_task_group = { |
| 189 | .se = init_sched_entity_p, | 255 | .se = init_sched_entity_p, |
| 190 | .cfs_rq = init_cfs_rq_p, | 256 | .cfs_rq = init_cfs_rq_p, |
| 257 | |||
| 258 | .rt_se = init_sched_rt_entity_p, | ||
| 259 | .rt_rq = init_rt_rq_p, | ||
| 191 | }; | 260 | }; |
| 192 | 261 | ||
| 193 | #ifdef CONFIG_FAIR_USER_SCHED | 262 | #ifdef CONFIG_FAIR_USER_SCHED |
| 194 | # define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD | 263 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) |
| 195 | #else | 264 | #else |
| 196 | # define INIT_TASK_GRP_LOAD NICE_0_LOAD | 265 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
| 197 | #endif | 266 | #endif |
| 198 | 267 | ||
| 199 | static int init_task_group_load = INIT_TASK_GRP_LOAD; | 268 | #define MIN_GROUP_SHARES 2 |
| 269 | |||
| 270 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | ||
| 200 | 271 | ||
| 201 | /* return group to which a task belongs */ | 272 | /* return group to which a task belongs */ |
| 202 | static inline struct task_group *task_group(struct task_struct *p) | 273 | static inline struct task_group *task_group(struct task_struct *p) |
| @@ -215,15 +286,42 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
| 215 | } | 286 | } |
| 216 | 287 | ||
| 217 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 288 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
| 218 | static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) | 289 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) |
| 219 | { | 290 | { |
| 220 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; | 291 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; |
| 221 | p->se.parent = task_group(p)->se[cpu]; | 292 | p->se.parent = task_group(p)->se[cpu]; |
| 293 | |||
| 294 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; | ||
| 295 | p->rt.parent = task_group(p)->rt_se[cpu]; | ||
| 296 | } | ||
| 297 | |||
| 298 | static inline void lock_task_group_list(void) | ||
| 299 | { | ||
| 300 | mutex_lock(&task_group_mutex); | ||
| 301 | } | ||
| 302 | |||
| 303 | static inline void unlock_task_group_list(void) | ||
| 304 | { | ||
| 305 | mutex_unlock(&task_group_mutex); | ||
| 306 | } | ||
| 307 | |||
| 308 | static inline void lock_doms_cur(void) | ||
| 309 | { | ||
| 310 | mutex_lock(&doms_cur_mutex); | ||
| 311 | } | ||
| 312 | |||
| 313 | static inline void unlock_doms_cur(void) | ||
| 314 | { | ||
| 315 | mutex_unlock(&doms_cur_mutex); | ||
| 222 | } | 316 | } |
| 223 | 317 | ||
| 224 | #else | 318 | #else |
| 225 | 319 | ||
| 226 | static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } | 320 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
| 321 | static inline void lock_task_group_list(void) { } | ||
| 322 | static inline void unlock_task_group_list(void) { } | ||
| 323 | static inline void lock_doms_cur(void) { } | ||
| 324 | static inline void unlock_doms_cur(void) { } | ||
| 227 | 325 | ||
| 228 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 326 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 229 | 327 | ||
| @@ -264,10 +362,56 @@ struct cfs_rq { | |||
| 264 | /* Real-Time classes' related field in a runqueue: */ | 362 | /* Real-Time classes' related field in a runqueue: */ |
| 265 | struct rt_rq { | 363 | struct rt_rq { |
| 266 | struct rt_prio_array active; | 364 | struct rt_prio_array active; |
| 267 | int rt_load_balance_idx; | 365 | unsigned long rt_nr_running; |
| 268 | struct list_head *rt_load_balance_head, *rt_load_balance_curr; | 366 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED |
| 367 | int highest_prio; /* highest queued rt task prio */ | ||
| 368 | #endif | ||
| 369 | #ifdef CONFIG_SMP | ||
| 370 | unsigned long rt_nr_migratory; | ||
| 371 | int overloaded; | ||
| 372 | #endif | ||
| 373 | int rt_throttled; | ||
| 374 | u64 rt_time; | ||
| 375 | |||
| 376 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 377 | struct rq *rq; | ||
| 378 | struct list_head leaf_rt_rq_list; | ||
| 379 | struct task_group *tg; | ||
| 380 | struct sched_rt_entity *rt_se; | ||
| 381 | #endif | ||
| 269 | }; | 382 | }; |
| 270 | 383 | ||
| 384 | #ifdef CONFIG_SMP | ||
| 385 | |||
| 386 | /* | ||
| 387 | * We add the notion of a root-domain which will be used to define per-domain | ||
| 388 | * variables. Each exclusive cpuset essentially defines an island domain by | ||
| 389 | * fully partitioning the member cpus from any other cpuset. Whenever a new | ||
| 390 | * exclusive cpuset is created, we also create and attach a new root-domain | ||
| 391 | * object. | ||
| 392 | * | ||
| 393 | */ | ||
| 394 | struct root_domain { | ||
| 395 | atomic_t refcount; | ||
| 396 | cpumask_t span; | ||
| 397 | cpumask_t online; | ||
| 398 | |||
| 399 | /* | ||
| 400 | * The "RT overload" flag: it gets set if a CPU has more than | ||
| 401 | * one runnable RT task. | ||
| 402 | */ | ||
| 403 | cpumask_t rto_mask; | ||
| 404 | atomic_t rto_count; | ||
| 405 | }; | ||
| 406 | |||
| 407 | /* | ||
| 408 | * By default the system creates a single root-domain with all cpus as | ||
| 409 | * members (mimicking the global state we have today). | ||
| 410 | */ | ||
| 411 | static struct root_domain def_root_domain; | ||
| 412 | |||
| 413 | #endif | ||
| 414 | |||
| 271 | /* | 415 | /* |
| 272 | * This is the main, per-CPU runqueue data structure. | 416 | * This is the main, per-CPU runqueue data structure. |
| 273 | * | 417 | * |
| @@ -296,11 +440,15 @@ struct rq { | |||
| 296 | u64 nr_switches; | 440 | u64 nr_switches; |
| 297 | 441 | ||
| 298 | struct cfs_rq cfs; | 442 | struct cfs_rq cfs; |
| 443 | struct rt_rq rt; | ||
| 444 | u64 rt_period_expire; | ||
| 445 | int rt_throttled; | ||
| 446 | |||
| 299 | #ifdef CONFIG_FAIR_GROUP_SCHED | 447 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 300 | /* list of leaf cfs_rq on this cpu: */ | 448 | /* list of leaf cfs_rq on this cpu: */ |
| 301 | struct list_head leaf_cfs_rq_list; | 449 | struct list_head leaf_cfs_rq_list; |
| 450 | struct list_head leaf_rt_rq_list; | ||
| 302 | #endif | 451 | #endif |
| 303 | struct rt_rq rt; | ||
| 304 | 452 | ||
| 305 | /* | 453 | /* |
| 306 | * This is part of a global counter where only the total sum | 454 | * This is part of a global counter where only the total sum |
| @@ -317,7 +465,7 @@ struct rq { | |||
| 317 | u64 clock, prev_clock_raw; | 465 | u64 clock, prev_clock_raw; |
| 318 | s64 clock_max_delta; | 466 | s64 clock_max_delta; |
| 319 | 467 | ||
| 320 | unsigned int clock_warps, clock_overflows; | 468 | unsigned int clock_warps, clock_overflows, clock_underflows; |
| 321 | u64 idle_clock; | 469 | u64 idle_clock; |
| 322 | unsigned int clock_deep_idle_events; | 470 | unsigned int clock_deep_idle_events; |
| 323 | u64 tick_timestamp; | 471 | u64 tick_timestamp; |
| @@ -325,6 +473,7 @@ struct rq { | |||
| 325 | atomic_t nr_iowait; | 473 | atomic_t nr_iowait; |
| 326 | 474 | ||
| 327 | #ifdef CONFIG_SMP | 475 | #ifdef CONFIG_SMP |
| 476 | struct root_domain *rd; | ||
| 328 | struct sched_domain *sd; | 477 | struct sched_domain *sd; |
| 329 | 478 | ||
| 330 | /* For active balancing */ | 479 | /* For active balancing */ |
| @@ -337,6 +486,12 @@ struct rq { | |||
| 337 | struct list_head migration_queue; | 486 | struct list_head migration_queue; |
| 338 | #endif | 487 | #endif |
| 339 | 488 | ||
| 489 | #ifdef CONFIG_SCHED_HRTICK | ||
| 490 | unsigned long hrtick_flags; | ||
| 491 | ktime_t hrtick_expire; | ||
| 492 | struct hrtimer hrtick_timer; | ||
| 493 | #endif | ||
| 494 | |||
| 340 | #ifdef CONFIG_SCHEDSTATS | 495 | #ifdef CONFIG_SCHEDSTATS |
| 341 | /* latency stats */ | 496 | /* latency stats */ |
| 342 | struct sched_info rq_sched_info; | 497 | struct sched_info rq_sched_info; |
| @@ -363,7 +518,6 @@ struct rq { | |||
| 363 | }; | 518 | }; |
| 364 | 519 | ||
| 365 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 520 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
| 366 | static DEFINE_MUTEX(sched_hotcpu_mutex); | ||
| 367 | 521 | ||
| 368 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | 522 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) |
| 369 | { | 523 | { |
| @@ -441,6 +595,23 @@ static void update_rq_clock(struct rq *rq) | |||
| 441 | #define task_rq(p) cpu_rq(task_cpu(p)) | 595 | #define task_rq(p) cpu_rq(task_cpu(p)) |
| 442 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 596 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
| 443 | 597 | ||
| 598 | unsigned long rt_needs_cpu(int cpu) | ||
| 599 | { | ||
| 600 | struct rq *rq = cpu_rq(cpu); | ||
| 601 | u64 delta; | ||
| 602 | |||
| 603 | if (!rq->rt_throttled) | ||
| 604 | return 0; | ||
| 605 | |||
| 606 | if (rq->clock > rq->rt_period_expire) | ||
| 607 | return 1; | ||
| 608 | |||
| 609 | delta = rq->rt_period_expire - rq->clock; | ||
| 610 | do_div(delta, NSEC_PER_SEC / HZ); | ||
| 611 | |||
| 612 | return (unsigned long)delta; | ||
| 613 | } | ||
| 614 | |||
| 444 | /* | 615 | /* |
| 445 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | 616 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: |
| 446 | */ | 617 | */ |
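rt_needs_cpu() above reports, in ticks, how long a throttled runqueue still has to wait before its RT period expires. A small standalone sketch of the same conversion (the HZ and clock values are assumptions for the example, not real rq state; the real function also checks rq->rt_throttled first):

```c
#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL
#define HZ		1000	/* assumed tick rate for the example */

int main(void)
{
	/* Stand-ins for rq->clock and rq->rt_period_expire (nanoseconds). */
	unsigned long long clock = 5000000000ULL;
	unsigned long long rt_period_expire = 5004500000ULL;	/* +4.5 ms */
	unsigned long long delta;

	if (clock > rt_period_expire) {
		puts("period already expired: report 1 tick");
		return 0;
	}

	delta = rt_period_expire - clock;
	delta /= NSEC_PER_SEC / HZ;	/* ns -> ticks, as do_div() does */

	/* 4.5 ms at HZ=1000 -> 4 whole ticks remain */
	printf("%llu ticks until the RT period expires\n", delta);
	return 0;
}
```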
| @@ -459,6 +630,8 @@ enum { | |||
| 459 | SCHED_FEAT_START_DEBIT = 4, | 630 | SCHED_FEAT_START_DEBIT = 4, |
| 460 | SCHED_FEAT_TREE_AVG = 8, | 631 | SCHED_FEAT_TREE_AVG = 8, |
| 461 | SCHED_FEAT_APPROX_AVG = 16, | 632 | SCHED_FEAT_APPROX_AVG = 16, |
| 633 | SCHED_FEAT_HRTICK = 32, | ||
| 634 | SCHED_FEAT_DOUBLE_TICK = 64, | ||
| 462 | }; | 635 | }; |
| 463 | 636 | ||
| 464 | const_debug unsigned int sysctl_sched_features = | 637 | const_debug unsigned int sysctl_sched_features = |
| @@ -466,7 +639,9 @@ const_debug unsigned int sysctl_sched_features = | |||
| 466 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | | 639 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | |
| 467 | SCHED_FEAT_START_DEBIT * 1 | | 640 | SCHED_FEAT_START_DEBIT * 1 | |
| 468 | SCHED_FEAT_TREE_AVG * 0 | | 641 | SCHED_FEAT_TREE_AVG * 0 | |
| 469 | SCHED_FEAT_APPROX_AVG * 0; | 642 | SCHED_FEAT_APPROX_AVG * 0 | |
| 643 | SCHED_FEAT_HRTICK * 1 | | ||
| 644 | SCHED_FEAT_DOUBLE_TICK * 0; | ||
| 470 | 645 | ||
| 471 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) | 646 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) |
| 472 | 647 | ||
| @@ -477,6 +652,21 @@ const_debug unsigned int sysctl_sched_features = | |||
| 477 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 652 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
| 478 | 653 | ||
| 479 | /* | 654 | /* |
| 655 | * period over which we measure -rt task cpu usage in ms. | ||
| 656 | * default: 1s | ||
| 657 | */ | ||
| 658 | const_debug unsigned int sysctl_sched_rt_period = 1000; | ||
| 659 | |||
| 660 | #define SCHED_RT_FRAC_SHIFT 16 | ||
| 661 | #define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) | ||
| 662 | |||
| 663 | /* | ||
| 664 | * ratio of time -rt tasks may consume. | ||
| 665 | * default: 95% | ||
| 666 | */ | ||
| 667 | const_debug unsigned int sysctl_sched_rt_ratio = 62259; | ||
| 668 | |||
| 669 | /* | ||
| 480 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 670 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
| 481 | * clock constructed from sched_clock(): | 671 | * clock constructed from sched_clock(): |
| 482 | */ | 672 | */ |
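The default of 62259 for sysctl_sched_rt_ratio is simply 95% expressed as a fixed-point fraction of SCHED_RT_FRAC (1 << 16). A quick hedged check of that encoding, and of roughly how much of the default 1000 ms period such a ratio would leave to RT tasks (illustrative arithmetic only, not the scheduler's actual throttling path):

```c
#include <stdio.h>

#define SCHED_RT_FRAC_SHIFT	16
#define SCHED_RT_FRAC		(1UL << SCHED_RT_FRAC_SHIFT)

int main(void)
{
	/* 0.95 * 65536 = 62259.2, truncated to 62259 */
	unsigned long ratio = (unsigned long)(0.95 * SCHED_RT_FRAC);
	unsigned long period_ms = 1000;	/* sysctl_sched_rt_period default */

	printf("0.95 as a 16-bit fraction = %lu\n", ratio);

	/* prints 949: roughly 95%% of the 1000 ms period */
	printf("RT budget per period = %lu ms\n",
	       (period_ms * ratio) >> SCHED_RT_FRAC_SHIFT);
	return 0;
}
```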
| @@ -668,7 +858,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
| 668 | struct rq *rq = cpu_rq(smp_processor_id()); | 858 | struct rq *rq = cpu_rq(smp_processor_id()); |
| 669 | u64 now = sched_clock(); | 859 | u64 now = sched_clock(); |
| 670 | 860 | ||
| 671 | touch_softlockup_watchdog(); | ||
| 672 | rq->idle_clock += delta_ns; | 861 | rq->idle_clock += delta_ns; |
| 673 | /* | 862 | /* |
| 674 | * Override the previous timestamp and ignore all | 863 | * Override the previous timestamp and ignore all |
| @@ -680,9 +869,177 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
| 680 | rq->prev_clock_raw = now; | 869 | rq->prev_clock_raw = now; |
| 681 | rq->clock += delta_ns; | 870 | rq->clock += delta_ns; |
| 682 | spin_unlock(&rq->lock); | 871 | spin_unlock(&rq->lock); |
| 872 | touch_softlockup_watchdog(); | ||
| 683 | } | 873 | } |
| 684 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | 874 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); |
| 685 | 875 | ||
| 876 | static void __resched_task(struct task_struct *p, int tif_bit); | ||
| 877 | |||
| 878 | static inline void resched_task(struct task_struct *p) | ||
| 879 | { | ||
| 880 | __resched_task(p, TIF_NEED_RESCHED); | ||
| 881 | } | ||
| 882 | |||
| 883 | #ifdef CONFIG_SCHED_HRTICK | ||
| 884 | /* | ||
| 885 | * Use HR-timers to deliver accurate preemption points. | ||
| 886 | * | ||
| 887 | * It's all a bit involved since we cannot program an hrtimer while holding the | ||
| 888 | * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a | ||
| 889 | * reschedule event. | ||
| 890 | * | ||
| 891 | * When we get rescheduled we reprogram the hrtick_timer outside of the | ||
| 892 | * rq->lock. | ||
| 893 | */ | ||
| 894 | static inline void resched_hrt(struct task_struct *p) | ||
| 895 | { | ||
| 896 | __resched_task(p, TIF_HRTICK_RESCHED); | ||
| 897 | } | ||
| 898 | |||
| 899 | static inline void resched_rq(struct rq *rq) | ||
| 900 | { | ||
| 901 | unsigned long flags; | ||
| 902 | |||
| 903 | spin_lock_irqsave(&rq->lock, flags); | ||
| 904 | resched_task(rq->curr); | ||
| 905 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 906 | } | ||
| 907 | |||
| 908 | enum { | ||
| 909 | HRTICK_SET, /* re-program hrtick_timer */ | ||
| 910 | HRTICK_RESET, /* not a new slice */ | ||
| 911 | }; | ||
| 912 | |||
| 913 | /* | ||
| 914 | * Use hrtick when: | ||
| 915 | * - enabled by features | ||
| 916 | * - hrtimer is actually high res | ||
| 917 | */ | ||
| 918 | static inline int hrtick_enabled(struct rq *rq) | ||
| 919 | { | ||
| 920 | if (!sched_feat(HRTICK)) | ||
| 921 | return 0; | ||
| 922 | return hrtimer_is_hres_active(&rq->hrtick_timer); | ||
| 923 | } | ||
| 924 | |||
| 925 | /* | ||
| 926 | * Called to set the hrtick timer state. | ||
| 927 | * | ||
| 928 | * called with rq->lock held and irqs disabled | ||
| 929 | */ | ||
| 930 | static void hrtick_start(struct rq *rq, u64 delay, int reset) | ||
| 931 | { | ||
| 932 | assert_spin_locked(&rq->lock); | ||
| 933 | |||
| 934 | /* | ||
| 935 | * preempt at: now + delay | ||
| 936 | */ | ||
| 937 | rq->hrtick_expire = | ||
| 938 | ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); | ||
| 939 | /* | ||
| 940 | * indicate we need to program the timer | ||
| 941 | */ | ||
| 942 | __set_bit(HRTICK_SET, &rq->hrtick_flags); | ||
| 943 | if (reset) | ||
| 944 | __set_bit(HRTICK_RESET, &rq->hrtick_flags); | ||
| 945 | |||
| 946 | /* | ||
| 947 | * New slices are called from the schedule path and don't need a | ||
| 948 | * forced reschedule. | ||
| 949 | */ | ||
| 950 | if (reset) | ||
| 951 | resched_hrt(rq->curr); | ||
| 952 | } | ||
| 953 | |||
| 954 | static void hrtick_clear(struct rq *rq) | ||
| 955 | { | ||
| 956 | if (hrtimer_active(&rq->hrtick_timer)) | ||
| 957 | hrtimer_cancel(&rq->hrtick_timer); | ||
| 958 | } | ||
| 959 | |||
| 960 | /* | ||
| 961 | * Update the timer from the possible pending state. | ||
| 962 | */ | ||
| 963 | static void hrtick_set(struct rq *rq) | ||
| 964 | { | ||
| 965 | ktime_t time; | ||
| 966 | int set, reset; | ||
| 967 | unsigned long flags; | ||
| 968 | |||
| 969 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | ||
| 970 | |||
| 971 | spin_lock_irqsave(&rq->lock, flags); | ||
| 972 | set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); | ||
| 973 | reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); | ||
| 974 | time = rq->hrtick_expire; | ||
| 975 | clear_thread_flag(TIF_HRTICK_RESCHED); | ||
| 976 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 977 | |||
| 978 | if (set) { | ||
| 979 | hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); | ||
| 980 | if (reset && !hrtimer_active(&rq->hrtick_timer)) | ||
| 981 | resched_rq(rq); | ||
| 982 | } else | ||
| 983 | hrtick_clear(rq); | ||
| 984 | } | ||
| 985 | |||
| 986 | /* | ||
| 987 | * High-resolution timer tick. | ||
| 988 | * Runs from hardirq context with interrupts disabled. | ||
| 989 | */ | ||
| 990 | static enum hrtimer_restart hrtick(struct hrtimer *timer) | ||
| 991 | { | ||
| 992 | struct rq *rq = container_of(timer, struct rq, hrtick_timer); | ||
| 993 | |||
| 994 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | ||
| 995 | |||
| 996 | spin_lock(&rq->lock); | ||
| 997 | __update_rq_clock(rq); | ||
| 998 | rq->curr->sched_class->task_tick(rq, rq->curr, 1); | ||
| 999 | spin_unlock(&rq->lock); | ||
| 1000 | |||
| 1001 | return HRTIMER_NORESTART; | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | static inline void init_rq_hrtick(struct rq *rq) | ||
| 1005 | { | ||
| 1006 | rq->hrtick_flags = 0; | ||
| 1007 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 1008 | rq->hrtick_timer.function = hrtick; | ||
| 1009 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | ||
| 1010 | } | ||
| 1011 | |||
| 1012 | void hrtick_resched(void) | ||
| 1013 | { | ||
| 1014 | struct rq *rq; | ||
| 1015 | unsigned long flags; | ||
| 1016 | |||
| 1017 | if (!test_thread_flag(TIF_HRTICK_RESCHED)) | ||
| 1018 | return; | ||
| 1019 | |||
| 1020 | local_irq_save(flags); | ||
| 1021 | rq = cpu_rq(smp_processor_id()); | ||
| 1022 | hrtick_set(rq); | ||
| 1023 | local_irq_restore(flags); | ||
| 1024 | } | ||
| 1025 | #else | ||
| 1026 | static inline void hrtick_clear(struct rq *rq) | ||
| 1027 | { | ||
| 1028 | } | ||
| 1029 | |||
| 1030 | static inline void hrtick_set(struct rq *rq) | ||
| 1031 | { | ||
| 1032 | } | ||
| 1033 | |||
| 1034 | static inline void init_rq_hrtick(struct rq *rq) | ||
| 1035 | { | ||
| 1036 | } | ||
| 1037 | |||
| 1038 | void hrtick_resched(void) | ||
| 1039 | { | ||
| 1040 | } | ||
| 1041 | #endif | ||
| 1042 | |||
| 686 | /* | 1043 | /* |
| 687 | * resched_task - mark a task 'to be rescheduled now'. | 1044 | * resched_task - mark a task 'to be rescheduled now'. |
| 688 | * | 1045 | * |
| @@ -696,16 +1053,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | |||
| 696 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 1053 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) |
| 697 | #endif | 1054 | #endif |
| 698 | 1055 | ||
| 699 | static void resched_task(struct task_struct *p) | 1056 | static void __resched_task(struct task_struct *p, int tif_bit) |
| 700 | { | 1057 | { |
| 701 | int cpu; | 1058 | int cpu; |
| 702 | 1059 | ||
| 703 | assert_spin_locked(&task_rq(p)->lock); | 1060 | assert_spin_locked(&task_rq(p)->lock); |
| 704 | 1061 | ||
| 705 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) | 1062 | if (unlikely(test_tsk_thread_flag(p, tif_bit))) |
| 706 | return; | 1063 | return; |
| 707 | 1064 | ||
| 708 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); | 1065 | set_tsk_thread_flag(p, tif_bit); |
| 709 | 1066 | ||
| 710 | cpu = task_cpu(p); | 1067 | cpu = task_cpu(p); |
| 711 | if (cpu == smp_processor_id()) | 1068 | if (cpu == smp_processor_id()) |
| @@ -728,10 +1085,10 @@ static void resched_cpu(int cpu) | |||
| 728 | spin_unlock_irqrestore(&rq->lock, flags); | 1085 | spin_unlock_irqrestore(&rq->lock, flags); |
| 729 | } | 1086 | } |
| 730 | #else | 1087 | #else |
| 731 | static inline void resched_task(struct task_struct *p) | 1088 | static void __resched_task(struct task_struct *p, int tif_bit) |
| 732 | { | 1089 | { |
| 733 | assert_spin_locked(&task_rq(p)->lock); | 1090 | assert_spin_locked(&task_rq(p)->lock); |
| 734 | set_tsk_need_resched(p); | 1091 | set_tsk_thread_flag(p, tif_bit); |
| 735 | } | 1092 | } |
| 736 | #endif | 1093 | #endif |
| 737 | 1094 | ||
| @@ -871,6 +1228,23 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | |||
| 871 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | 1228 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} |
| 872 | #endif | 1229 | #endif |
| 873 | 1230 | ||
| 1231 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | ||
| 1232 | { | ||
| 1233 | update_load_add(&rq->load, load); | ||
| 1234 | } | ||
| 1235 | |||
| 1236 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | ||
| 1237 | { | ||
| 1238 | update_load_sub(&rq->load, load); | ||
| 1239 | } | ||
| 1240 | |||
| 1241 | #ifdef CONFIG_SMP | ||
| 1242 | static unsigned long source_load(int cpu, int type); | ||
| 1243 | static unsigned long target_load(int cpu, int type); | ||
| 1244 | static unsigned long cpu_avg_load_per_task(int cpu); | ||
| 1245 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
| 1246 | #endif /* CONFIG_SMP */ | ||
| 1247 | |||
| 874 | #include "sched_stats.h" | 1248 | #include "sched_stats.h" |
| 875 | #include "sched_idletask.c" | 1249 | #include "sched_idletask.c" |
| 876 | #include "sched_fair.c" | 1250 | #include "sched_fair.c" |
| @@ -881,41 +1255,14 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | |||
| 881 | 1255 | ||
| 882 | #define sched_class_highest (&rt_sched_class) | 1256 | #define sched_class_highest (&rt_sched_class) |
| 883 | 1257 | ||
| 884 | /* | ||
| 885 | * Update delta_exec, delta_fair fields for rq. | ||
| 886 | * | ||
| 887 | * delta_fair clock advances at a rate inversely proportional to | ||
| 888 | * total load (rq->load.weight) on the runqueue, while | ||
| 889 | * delta_exec advances at the same rate as wall-clock (provided | ||
| 890 | * cpu is not idle). | ||
| 891 | * | ||
| 892 | * delta_exec / delta_fair is a measure of the (smoothened) load on this | ||
| 893 | * runqueue over any given interval. This (smoothened) load is used | ||
| 894 | * during load balance. | ||
| 895 | * | ||
| 896 | * This function is called /before/ updating rq->load | ||
| 897 | * and when switching tasks. | ||
| 898 | */ | ||
| 899 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | ||
| 900 | { | ||
| 901 | update_load_add(&rq->load, p->se.load.weight); | ||
| 902 | } | ||
| 903 | |||
| 904 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | ||
| 905 | { | ||
| 906 | update_load_sub(&rq->load, p->se.load.weight); | ||
| 907 | } | ||
| 908 | |||
| 909 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | 1258 | static void inc_nr_running(struct task_struct *p, struct rq *rq) |
| 910 | { | 1259 | { |
| 911 | rq->nr_running++; | 1260 | rq->nr_running++; |
| 912 | inc_load(rq, p); | ||
| 913 | } | 1261 | } |
| 914 | 1262 | ||
| 915 | static void dec_nr_running(struct task_struct *p, struct rq *rq) | 1263 | static void dec_nr_running(struct task_struct *p, struct rq *rq) |
| 916 | { | 1264 | { |
| 917 | rq->nr_running--; | 1265 | rq->nr_running--; |
| 918 | dec_load(rq, p); | ||
| 919 | } | 1266 | } |
| 920 | 1267 | ||
| 921 | static void set_load_weight(struct task_struct *p) | 1268 | static void set_load_weight(struct task_struct *p) |
| @@ -1039,7 +1386,7 @@ unsigned long weighted_cpuload(const int cpu) | |||
| 1039 | 1386 | ||
| 1040 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1387 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
| 1041 | { | 1388 | { |
| 1042 | set_task_cfs_rq(p, cpu); | 1389 | set_task_rq(p, cpu); |
| 1043 | #ifdef CONFIG_SMP | 1390 | #ifdef CONFIG_SMP |
| 1044 | /* | 1391 | /* |
| 1045 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | 1392 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be |
| @@ -1051,12 +1398,24 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
| 1051 | #endif | 1398 | #endif |
| 1052 | } | 1399 | } |
| 1053 | 1400 | ||
| 1401 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | ||
| 1402 | const struct sched_class *prev_class, | ||
| 1403 | int oldprio, int running) | ||
| 1404 | { | ||
| 1405 | if (prev_class != p->sched_class) { | ||
| 1406 | if (prev_class->switched_from) | ||
| 1407 | prev_class->switched_from(rq, p, running); | ||
| 1408 | p->sched_class->switched_to(rq, p, running); | ||
| 1409 | } else | ||
| 1410 | p->sched_class->prio_changed(rq, p, oldprio, running); | ||
| 1411 | } | ||
| 1412 | |||
| 1054 | #ifdef CONFIG_SMP | 1413 | #ifdef CONFIG_SMP |
| 1055 | 1414 | ||
| 1056 | /* | 1415 | /* |
| 1057 | * Is this task likely cache-hot: | 1416 | * Is this task likely cache-hot: |
| 1058 | */ | 1417 | */ |
| 1059 | static inline int | 1418 | static int |
| 1060 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | 1419 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) |
| 1061 | { | 1420 | { |
| 1062 | s64 delta; | 1421 | s64 delta; |
| @@ -1281,7 +1640,7 @@ static unsigned long target_load(int cpu, int type) | |||
| 1281 | /* | 1640 | /* |
| 1282 | * Return the average load per task on the cpu's run queue | 1641 | * Return the average load per task on the cpu's run queue |
| 1283 | */ | 1642 | */ |
| 1284 | static inline unsigned long cpu_avg_load_per_task(int cpu) | 1643 | static unsigned long cpu_avg_load_per_task(int cpu) |
| 1285 | { | 1644 | { |
| 1286 | struct rq *rq = cpu_rq(cpu); | 1645 | struct rq *rq = cpu_rq(cpu); |
| 1287 | unsigned long total = weighted_cpuload(cpu); | 1646 | unsigned long total = weighted_cpuload(cpu); |
| @@ -1438,58 +1797,6 @@ static int sched_balance_self(int cpu, int flag) | |||
| 1438 | 1797 | ||
| 1439 | #endif /* CONFIG_SMP */ | 1798 | #endif /* CONFIG_SMP */ |
| 1440 | 1799 | ||
| 1441 | /* | ||
| 1442 | * wake_idle() will wake a task on an idle cpu if task->cpu is | ||
| 1443 | * not idle and an idle cpu is available. The span of cpus to | ||
| 1444 | * search starts with cpus closest then further out as needed, | ||
| 1445 | * so we always favor a closer, idle cpu. | ||
| 1446 | * | ||
| 1447 | * Returns the CPU we should wake onto. | ||
| 1448 | */ | ||
| 1449 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | ||
| 1450 | static int wake_idle(int cpu, struct task_struct *p) | ||
| 1451 | { | ||
| 1452 | cpumask_t tmp; | ||
| 1453 | struct sched_domain *sd; | ||
| 1454 | int i; | ||
| 1455 | |||
| 1456 | /* | ||
| 1457 | * If it is idle, then it is the best cpu to run this task. | ||
| 1458 | * | ||
| 1459 | * This cpu is also the best, if it has more than one task already. | ||
| 1460 | * Siblings must also be busy (in most cases) as they didn't already | ||
| 1461 | * pick up the extra load from this cpu and hence we need not check | ||
| 1462 | * sibling runqueue info. This will avoid the checks and cache miss | ||
| 1463 | * penalties associated with that. | ||
| 1464 | */ | ||
| 1465 | if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) | ||
| 1466 | return cpu; | ||
| 1467 | |||
| 1468 | for_each_domain(cpu, sd) { | ||
| 1469 | if (sd->flags & SD_WAKE_IDLE) { | ||
| 1470 | cpus_and(tmp, sd->span, p->cpus_allowed); | ||
| 1471 | for_each_cpu_mask(i, tmp) { | ||
| 1472 | if (idle_cpu(i)) { | ||
| 1473 | if (i != task_cpu(p)) { | ||
| 1474 | schedstat_inc(p, | ||
| 1475 | se.nr_wakeups_idle); | ||
| 1476 | } | ||
| 1477 | return i; | ||
| 1478 | } | ||
| 1479 | } | ||
| 1480 | } else { | ||
| 1481 | break; | ||
| 1482 | } | ||
| 1483 | } | ||
| 1484 | return cpu; | ||
| 1485 | } | ||
| 1486 | #else | ||
| 1487 | static inline int wake_idle(int cpu, struct task_struct *p) | ||
| 1488 | { | ||
| 1489 | return cpu; | ||
| 1490 | } | ||
| 1491 | #endif | ||
| 1492 | |||
| 1493 | /*** | 1800 | /*** |
| 1494 | * try_to_wake_up - wake up a thread | 1801 | * try_to_wake_up - wake up a thread |
| 1495 | * @p: the to-be-woken-up thread | 1802 | * @p: the to-be-woken-up thread |
| @@ -1510,11 +1817,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1510 | unsigned long flags; | 1817 | unsigned long flags; |
| 1511 | long old_state; | 1818 | long old_state; |
| 1512 | struct rq *rq; | 1819 | struct rq *rq; |
| 1513 | #ifdef CONFIG_SMP | ||
| 1514 | struct sched_domain *sd, *this_sd = NULL; | ||
| 1515 | unsigned long load, this_load; | ||
| 1516 | int new_cpu; | ||
| 1517 | #endif | ||
| 1518 | 1820 | ||
| 1519 | rq = task_rq_lock(p, &flags); | 1821 | rq = task_rq_lock(p, &flags); |
| 1520 | old_state = p->state; | 1822 | old_state = p->state; |
| @@ -1532,92 +1834,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1532 | if (unlikely(task_running(rq, p))) | 1834 | if (unlikely(task_running(rq, p))) |
| 1533 | goto out_activate; | 1835 | goto out_activate; |
| 1534 | 1836 | ||
| 1535 | new_cpu = cpu; | 1837 | cpu = p->sched_class->select_task_rq(p, sync); |
| 1536 | 1838 | if (cpu != orig_cpu) { | |
| 1537 | schedstat_inc(rq, ttwu_count); | 1839 | set_task_cpu(p, cpu); |
| 1538 | if (cpu == this_cpu) { | ||
| 1539 | schedstat_inc(rq, ttwu_local); | ||
| 1540 | goto out_set_cpu; | ||
| 1541 | } | ||
| 1542 | |||
| 1543 | for_each_domain(this_cpu, sd) { | ||
| 1544 | if (cpu_isset(cpu, sd->span)) { | ||
| 1545 | schedstat_inc(sd, ttwu_wake_remote); | ||
| 1546 | this_sd = sd; | ||
| 1547 | break; | ||
| 1548 | } | ||
| 1549 | } | ||
| 1550 | |||
| 1551 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
| 1552 | goto out_set_cpu; | ||
| 1553 | |||
| 1554 | /* | ||
| 1555 | * Check for affine wakeup and passive balancing possibilities. | ||
| 1556 | */ | ||
| 1557 | if (this_sd) { | ||
| 1558 | int idx = this_sd->wake_idx; | ||
| 1559 | unsigned int imbalance; | ||
| 1560 | |||
| 1561 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | ||
| 1562 | |||
| 1563 | load = source_load(cpu, idx); | ||
| 1564 | this_load = target_load(this_cpu, idx); | ||
| 1565 | |||
| 1566 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | ||
| 1567 | |||
| 1568 | if (this_sd->flags & SD_WAKE_AFFINE) { | ||
| 1569 | unsigned long tl = this_load; | ||
| 1570 | unsigned long tl_per_task; | ||
| 1571 | |||
| 1572 | /* | ||
| 1573 | * Attract cache-cold tasks on sync wakeups: | ||
| 1574 | */ | ||
| 1575 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
| 1576 | goto out_set_cpu; | ||
| 1577 | |||
| 1578 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
| 1579 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
| 1580 | |||
| 1581 | /* | ||
| 1582 | * If sync wakeup then subtract the (maximum possible) | ||
| 1583 | * effect of the currently running task from the load | ||
| 1584 | * of the current CPU: | ||
| 1585 | */ | ||
| 1586 | if (sync) | ||
| 1587 | tl -= current->se.load.weight; | ||
| 1588 | |||
| 1589 | if ((tl <= load && | ||
| 1590 | tl + target_load(cpu, idx) <= tl_per_task) || | ||
| 1591 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
| 1592 | /* | ||
| 1593 | * This domain has SD_WAKE_AFFINE and | ||
| 1594 | * p is cache cold in this domain, and | ||
| 1595 | * there is no bad imbalance. | ||
| 1596 | */ | ||
| 1597 | schedstat_inc(this_sd, ttwu_move_affine); | ||
| 1598 | schedstat_inc(p, se.nr_wakeups_affine); | ||
| 1599 | goto out_set_cpu; | ||
| 1600 | } | ||
| 1601 | } | ||
| 1602 | |||
| 1603 | /* | ||
| 1604 | * Start passive balancing when half the imbalance_pct | ||
| 1605 | * limit is reached. | ||
| 1606 | */ | ||
| 1607 | if (this_sd->flags & SD_WAKE_BALANCE) { | ||
| 1608 | if (imbalance*this_load <= 100*load) { | ||
| 1609 | schedstat_inc(this_sd, ttwu_move_balance); | ||
| 1610 | schedstat_inc(p, se.nr_wakeups_passive); | ||
| 1611 | goto out_set_cpu; | ||
| 1612 | } | ||
| 1613 | } | ||
| 1614 | } | ||
| 1615 | |||
| 1616 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | ||
| 1617 | out_set_cpu: | ||
| 1618 | new_cpu = wake_idle(new_cpu, p); | ||
| 1619 | if (new_cpu != cpu) { | ||
| 1620 | set_task_cpu(p, new_cpu); | ||
| 1621 | task_rq_unlock(rq, &flags); | 1840 | task_rq_unlock(rq, &flags); |
| 1622 | /* might preempt at this point */ | 1841 | /* might preempt at this point */ |
| 1623 | rq = task_rq_lock(p, &flags); | 1842 | rq = task_rq_lock(p, &flags); |
| @@ -1631,6 +1850,21 @@ out_set_cpu: | |||
| 1631 | cpu = task_cpu(p); | 1850 | cpu = task_cpu(p); |
| 1632 | } | 1851 | } |
| 1633 | 1852 | ||
| 1853 | #ifdef CONFIG_SCHEDSTATS | ||
| 1854 | schedstat_inc(rq, ttwu_count); | ||
| 1855 | if (cpu == this_cpu) | ||
| 1856 | schedstat_inc(rq, ttwu_local); | ||
| 1857 | else { | ||
| 1858 | struct sched_domain *sd; | ||
| 1859 | for_each_domain(this_cpu, sd) { | ||
| 1860 | if (cpu_isset(cpu, sd->span)) { | ||
| 1861 | schedstat_inc(sd, ttwu_wake_remote); | ||
| 1862 | break; | ||
| 1863 | } | ||
| 1864 | } | ||
| 1865 | } | ||
| 1866 | #endif | ||
| 1867 | |||
| 1634 | out_activate: | 1868 | out_activate: |
| 1635 | #endif /* CONFIG_SMP */ | 1869 | #endif /* CONFIG_SMP */ |
| 1636 | schedstat_inc(p, se.nr_wakeups); | 1870 | schedstat_inc(p, se.nr_wakeups); |
| @@ -1649,6 +1883,10 @@ out_activate: | |||
| 1649 | 1883 | ||
| 1650 | out_running: | 1884 | out_running: |
| 1651 | p->state = TASK_RUNNING; | 1885 | p->state = TASK_RUNNING; |
| 1886 | #ifdef CONFIG_SMP | ||
| 1887 | if (p->sched_class->task_wake_up) | ||
| 1888 | p->sched_class->task_wake_up(rq, p); | ||
| 1889 | #endif | ||
| 1652 | out: | 1890 | out: |
| 1653 | task_rq_unlock(rq, &flags); | 1891 | task_rq_unlock(rq, &flags); |
| 1654 | 1892 | ||
| @@ -1691,7 +1929,7 @@ static void __sched_fork(struct task_struct *p) | |||
| 1691 | p->se.wait_max = 0; | 1929 | p->se.wait_max = 0; |
| 1692 | #endif | 1930 | #endif |
| 1693 | 1931 | ||
| 1694 | INIT_LIST_HEAD(&p->run_list); | 1932 | INIT_LIST_HEAD(&p->rt.run_list); |
| 1695 | p->se.on_rq = 0; | 1933 | p->se.on_rq = 0; |
| 1696 | 1934 | ||
| 1697 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1935 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
| @@ -1771,6 +2009,10 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
| 1771 | inc_nr_running(p, rq); | 2009 | inc_nr_running(p, rq); |
| 1772 | } | 2010 | } |
| 1773 | check_preempt_curr(rq, p); | 2011 | check_preempt_curr(rq, p); |
| 2012 | #ifdef CONFIG_SMP | ||
| 2013 | if (p->sched_class->task_wake_up) | ||
| 2014 | p->sched_class->task_wake_up(rq, p); | ||
| 2015 | #endif | ||
| 1774 | task_rq_unlock(rq, &flags); | 2016 | task_rq_unlock(rq, &flags); |
| 1775 | } | 2017 | } |
| 1776 | 2018 | ||
| @@ -1891,6 +2133,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 1891 | prev_state = prev->state; | 2133 | prev_state = prev->state; |
| 1892 | finish_arch_switch(prev); | 2134 | finish_arch_switch(prev); |
| 1893 | finish_lock_switch(rq, prev); | 2135 | finish_lock_switch(rq, prev); |
| 2136 | #ifdef CONFIG_SMP | ||
| 2137 | if (current->sched_class->post_schedule) | ||
| 2138 | current->sched_class->post_schedule(rq); | ||
| 2139 | #endif | ||
| 2140 | |||
| 1894 | fire_sched_in_preempt_notifiers(current); | 2141 | fire_sched_in_preempt_notifiers(current); |
| 1895 | if (mm) | 2142 | if (mm) |
| 1896 | mmdrop(mm); | 2143 | mmdrop(mm); |
| @@ -2124,11 +2371,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
| 2124 | /* | 2371 | /* |
| 2125 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | 2372 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. |
| 2126 | */ | 2373 | */ |
| 2127 | static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | 2374 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest) |
| 2128 | __releases(this_rq->lock) | 2375 | __releases(this_rq->lock) |
| 2129 | __acquires(busiest->lock) | 2376 | __acquires(busiest->lock) |
| 2130 | __acquires(this_rq->lock) | 2377 | __acquires(this_rq->lock) |
| 2131 | { | 2378 | { |
| 2379 | int ret = 0; | ||
| 2380 | |||
| 2132 | if (unlikely(!irqs_disabled())) { | 2381 | if (unlikely(!irqs_disabled())) { |
| 2133 | /* printk() doesn't work good under rq->lock */ | 2382 | /* printk() doesn't work good under rq->lock */ |
| 2134 | spin_unlock(&this_rq->lock); | 2383 | spin_unlock(&this_rq->lock); |
| @@ -2139,9 +2388,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
| 2139 | spin_unlock(&this_rq->lock); | 2388 | spin_unlock(&this_rq->lock); |
| 2140 | spin_lock(&busiest->lock); | 2389 | spin_lock(&busiest->lock); |
| 2141 | spin_lock(&this_rq->lock); | 2390 | spin_lock(&this_rq->lock); |
| 2391 | ret = 1; | ||
| 2142 | } else | 2392 | } else |
| 2143 | spin_lock(&busiest->lock); | 2393 | spin_lock(&busiest->lock); |
| 2144 | } | 2394 | } |
| 2395 | return ret; | ||
| 2145 | } | 2396 | } |
| 2146 | 2397 | ||
| 2147 | /* | 2398 | /* |
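double_lock_balance() above avoids an AB-BA deadlock by always taking the lower-addressed rq->lock first, and the newly added return value tells the caller whether this_rq->lock had to be dropped along the way (so any state read under it may be stale). A rough standalone illustration of that ordering rule with pthread mutexes (the struct and function names are invented for the sketch and are not kernel API):

```c
#include <pthread.h>
#include <stdint.h>

struct runqueue {
	pthread_mutex_t lock;
	/* ... per-queue state ... */
};

/*
 * Lock 'busiest' while already holding 'this_rq'.  If the fast-path
 * trylock fails and the ordering rule (lower address first) would be
 * violated, drop this_rq->lock and re-acquire both in order; return 1
 * so the caller knows the lock was dropped.
 */
static int double_lock(struct runqueue *this_rq, struct runqueue *busiest)
{
	int dropped = 0;

	if (pthread_mutex_trylock(&busiest->lock) != 0) {
		if ((uintptr_t)busiest < (uintptr_t)this_rq) {
			pthread_mutex_unlock(&this_rq->lock);
			pthread_mutex_lock(&busiest->lock);
			pthread_mutex_lock(&this_rq->lock);
			dropped = 1;
		} else {
			pthread_mutex_lock(&busiest->lock);
		}
	}
	return dropped;
}

int main(void)
{
	struct runqueue a, b;

	pthread_mutex_init(&a.lock, NULL);
	pthread_mutex_init(&b.lock, NULL);

	pthread_mutex_lock(&a.lock);	/* caller already holds this_rq */
	(void)double_lock(&a, &b);	/* now both locks are held */
	pthread_mutex_unlock(&b.lock);
	pthread_mutex_unlock(&a.lock);
	return 0;
}
```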
| @@ -3485,12 +3736,14 @@ void scheduler_tick(void) | |||
| 3485 | /* | 3736 | /* |
| 3486 | * Let rq->clock advance by at least TICK_NSEC: | 3737 | * Let rq->clock advance by at least TICK_NSEC: |
| 3487 | */ | 3738 | */ |
| 3488 | if (unlikely(rq->clock < next_tick)) | 3739 | if (unlikely(rq->clock < next_tick)) { |
| 3489 | rq->clock = next_tick; | 3740 | rq->clock = next_tick; |
| 3741 | rq->clock_underflows++; | ||
| 3742 | } | ||
| 3490 | rq->tick_timestamp = rq->clock; | 3743 | rq->tick_timestamp = rq->clock; |
| 3491 | update_cpu_load(rq); | 3744 | update_cpu_load(rq); |
| 3492 | if (curr != rq->idle) /* FIXME: needed? */ | 3745 | curr->sched_class->task_tick(rq, curr, 0); |
| 3493 | curr->sched_class->task_tick(rq, curr); | 3746 | update_sched_rt_period(rq); |
| 3494 | spin_unlock(&rq->lock); | 3747 | spin_unlock(&rq->lock); |
| 3495 | 3748 | ||
| 3496 | #ifdef CONFIG_SMP | 3749 | #ifdef CONFIG_SMP |
| @@ -3636,6 +3889,8 @@ need_resched_nonpreemptible: | |||
| 3636 | 3889 | ||
| 3637 | schedule_debug(prev); | 3890 | schedule_debug(prev); |
| 3638 | 3891 | ||
| 3892 | hrtick_clear(rq); | ||
| 3893 | |||
| 3639 | /* | 3894 | /* |
| 3640 | * Do the rq-clock update outside the rq lock: | 3895 | * Do the rq-clock update outside the rq lock: |
| 3641 | */ | 3896 | */ |
| @@ -3654,6 +3909,11 @@ need_resched_nonpreemptible: | |||
| 3654 | switch_count = &prev->nvcsw; | 3909 | switch_count = &prev->nvcsw; |
| 3655 | } | 3910 | } |
| 3656 | 3911 | ||
| 3912 | #ifdef CONFIG_SMP | ||
| 3913 | if (prev->sched_class->pre_schedule) | ||
| 3914 | prev->sched_class->pre_schedule(rq, prev); | ||
| 3915 | #endif | ||
| 3916 | |||
| 3657 | if (unlikely(!rq->nr_running)) | 3917 | if (unlikely(!rq->nr_running)) |
| 3658 | idle_balance(cpu, rq); | 3918 | idle_balance(cpu, rq); |
| 3659 | 3919 | ||
| @@ -3668,14 +3928,20 @@ need_resched_nonpreemptible: | |||
| 3668 | ++*switch_count; | 3928 | ++*switch_count; |
| 3669 | 3929 | ||
| 3670 | context_switch(rq, prev, next); /* unlocks the rq */ | 3930 | context_switch(rq, prev, next); /* unlocks the rq */ |
| 3931 | /* | ||
| 3932 | * the context switch might have flipped the stack from under | ||
| 3933 | * us, hence refresh the local variables. | ||
| 3934 | */ | ||
| 3935 | cpu = smp_processor_id(); | ||
| 3936 | rq = cpu_rq(cpu); | ||
| 3671 | } else | 3937 | } else |
| 3672 | spin_unlock_irq(&rq->lock); | 3938 | spin_unlock_irq(&rq->lock); |
| 3673 | 3939 | ||
| 3674 | if (unlikely(reacquire_kernel_lock(current) < 0)) { | 3940 | hrtick_set(rq); |
| 3675 | cpu = smp_processor_id(); | 3941 | |
| 3676 | rq = cpu_rq(cpu); | 3942 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
| 3677 | goto need_resched_nonpreemptible; | 3943 | goto need_resched_nonpreemptible; |
| 3678 | } | 3944 | |
| 3679 | preempt_enable_no_resched(); | 3945 | preempt_enable_no_resched(); |
| 3680 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3946 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
| 3681 | goto need_resched; | 3947 | goto need_resched; |
| @@ -3691,10 +3957,9 @@ EXPORT_SYMBOL(schedule); | |||
| 3691 | asmlinkage void __sched preempt_schedule(void) | 3957 | asmlinkage void __sched preempt_schedule(void) |
| 3692 | { | 3958 | { |
| 3693 | struct thread_info *ti = current_thread_info(); | 3959 | struct thread_info *ti = current_thread_info(); |
| 3694 | #ifdef CONFIG_PREEMPT_BKL | ||
| 3695 | struct task_struct *task = current; | 3960 | struct task_struct *task = current; |
| 3696 | int saved_lock_depth; | 3961 | int saved_lock_depth; |
| 3697 | #endif | 3962 | |
| 3698 | /* | 3963 | /* |
| 3699 | * If there is a non-zero preempt_count or interrupts are disabled, | 3964 | * If there is a non-zero preempt_count or interrupts are disabled, |
| 3700 | * we do not want to preempt the current task. Just return.. | 3965 | * we do not want to preempt the current task. Just return.. |
| @@ -3710,14 +3975,10 @@ asmlinkage void __sched preempt_schedule(void) | |||
| 3710 | * clear ->lock_depth so that schedule() doesnt | 3975 | * clear ->lock_depth so that schedule() doesnt |
| 3711 | * auto-release the semaphore: | 3976 | * auto-release the semaphore: |
| 3712 | */ | 3977 | */ |
| 3713 | #ifdef CONFIG_PREEMPT_BKL | ||
| 3714 | saved_lock_depth = task->lock_depth; | 3978 | saved_lock_depth = task->lock_depth; |
| 3715 | task->lock_depth = -1; | 3979 | task->lock_depth = -1; |
| 3716 | #endif | ||
| 3717 | schedule(); | 3980 | schedule(); |
| 3718 | #ifdef CONFIG_PREEMPT_BKL | ||
| 3719 | task->lock_depth = saved_lock_depth; | 3981 | task->lock_depth = saved_lock_depth; |
| 3720 | #endif | ||
| 3721 | sub_preempt_count(PREEMPT_ACTIVE); | 3982 | sub_preempt_count(PREEMPT_ACTIVE); |
| 3722 | 3983 | ||
| 3723 | /* | 3984 | /* |
| @@ -3738,10 +3999,9 @@ EXPORT_SYMBOL(preempt_schedule); | |||
| 3738 | asmlinkage void __sched preempt_schedule_irq(void) | 3999 | asmlinkage void __sched preempt_schedule_irq(void) |
| 3739 | { | 4000 | { |
| 3740 | struct thread_info *ti = current_thread_info(); | 4001 | struct thread_info *ti = current_thread_info(); |
| 3741 | #ifdef CONFIG_PREEMPT_BKL | ||
| 3742 | struct task_struct *task = current; | 4002 | struct task_struct *task = current; |
| 3743 | int saved_lock_depth; | 4003 | int saved_lock_depth; |
| 3744 | #endif | 4004 | |
| 3745 | /* Catch callers which need to be fixed */ | 4005 | /* Catch callers which need to be fixed */ |
| 3746 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 4006 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
| 3747 | 4007 | ||
| @@ -3753,16 +4013,12 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
| 3753 | * clear ->lock_depth so that schedule() doesnt | 4013 | * clear ->lock_depth so that schedule() doesnt |
| 3754 | * auto-release the semaphore: | 4014 | * auto-release the semaphore: |
| 3755 | */ | 4015 | */ |
| 3756 | #ifdef CONFIG_PREEMPT_BKL | ||
| 3757 | saved_lock_depth = task->lock_depth; | 4016 | saved_lock_depth = task->lock_depth; |
| 3758 | task->lock_depth = -1; | 4017 | task->lock_depth = -1; |
| 3759 | #endif | ||
| 3760 | local_irq_enable(); | 4018 | local_irq_enable(); |
| 3761 | schedule(); | 4019 | schedule(); |
| 3762 | local_irq_disable(); | 4020 | local_irq_disable(); |
| 3763 | #ifdef CONFIG_PREEMPT_BKL | ||
| 3764 | task->lock_depth = saved_lock_depth; | 4021 | task->lock_depth = saved_lock_depth; |
| 3765 | #endif | ||
| 3766 | sub_preempt_count(PREEMPT_ACTIVE); | 4022 | sub_preempt_count(PREEMPT_ACTIVE); |
| 3767 | 4023 | ||
| 3768 | /* | 4024 | /* |
| @@ -4019,6 +4275,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 4019 | unsigned long flags; | 4275 | unsigned long flags; |
| 4020 | int oldprio, on_rq, running; | 4276 | int oldprio, on_rq, running; |
| 4021 | struct rq *rq; | 4277 | struct rq *rq; |
| 4278 | const struct sched_class *prev_class = p->sched_class; | ||
| 4022 | 4279 | ||
| 4023 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4280 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
| 4024 | 4281 | ||
| @@ -4044,18 +4301,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 4044 | if (on_rq) { | 4301 | if (on_rq) { |
| 4045 | if (running) | 4302 | if (running) |
| 4046 | p->sched_class->set_curr_task(rq); | 4303 | p->sched_class->set_curr_task(rq); |
| 4304 | |||
| 4047 | enqueue_task(rq, p, 0); | 4305 | enqueue_task(rq, p, 0); |
| 4048 | /* | 4306 | |
| 4049 | * Reschedule if we are currently running on this runqueue and | 4307 | check_class_changed(rq, p, prev_class, oldprio, running); |
| 4050 | * our priority decreased, or if we are not currently running on | ||
| 4051 | * this runqueue and our priority is higher than the current's | ||
| 4052 | */ | ||
| 4053 | if (running) { | ||
| 4054 | if (p->prio > oldprio) | ||
| 4055 | resched_task(rq->curr); | ||
| 4056 | } else { | ||
| 4057 | check_preempt_curr(rq, p); | ||
| 4058 | } | ||
| 4059 | } | 4308 | } |
| 4060 | task_rq_unlock(rq, &flags); | 4309 | task_rq_unlock(rq, &flags); |
| 4061 | } | 4310 | } |
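
Both rt_mutex_setprio() (here) and sched_setscheduler() (below) replace their open-coded reschedule decision with a call to check_class_changed(), a helper added elsewhere in this patch and not shown in this section. As a hedged sketch of the decision being consolidated, reconstructed from the removed lines; the real helper additionally invokes the class's switched_from()/switched_to()/prio_changed() methods, which are not modelled here:

    #include <stdio.h>

    struct task_toy { int prio; int running; };

    enum action { RESCHED_CURR, CHECK_PREEMPT, NOTHING };

    static enum action after_prio_change(const struct task_toy *p, int oldprio)
    {
        if (p->running)
            /* got numerically weaker while holding the CPU: give it up */
            return p->prio > oldprio ? RESCHED_CURR : NOTHING;
        /* not running: see whether it should now preempt the current task */
        return CHECK_PREEMPT;
    }

    int main(void)
    {
        struct task_toy p = { .prio = 120, .running = 1 };

        printf("action = %d\n", after_prio_change(&p, 100));  /* RESCHED_CURR */
        return 0;
    }
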
| @@ -4087,10 +4336,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 4087 | goto out_unlock; | 4336 | goto out_unlock; |
| 4088 | } | 4337 | } |
| 4089 | on_rq = p->se.on_rq; | 4338 | on_rq = p->se.on_rq; |
| 4090 | if (on_rq) { | 4339 | if (on_rq) |
| 4091 | dequeue_task(rq, p, 0); | 4340 | dequeue_task(rq, p, 0); |
| 4092 | dec_load(rq, p); | ||
| 4093 | } | ||
| 4094 | 4341 | ||
| 4095 | p->static_prio = NICE_TO_PRIO(nice); | 4342 | p->static_prio = NICE_TO_PRIO(nice); |
| 4096 | set_load_weight(p); | 4343 | set_load_weight(p); |
| @@ -4100,7 +4347,6 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 4100 | 4347 | ||
| 4101 | if (on_rq) { | 4348 | if (on_rq) { |
| 4102 | enqueue_task(rq, p, 0); | 4349 | enqueue_task(rq, p, 0); |
| 4103 | inc_load(rq, p); | ||
| 4104 | /* | 4350 | /* |
| 4105 | * If the task increased its priority or is running and | 4351 | * If the task increased its priority or is running and |
| 4106 | * lowered its priority, then reschedule its CPU: | 4352 | * lowered its priority, then reschedule its CPU: |
| @@ -4258,6 +4504,7 @@ int sched_setscheduler(struct task_struct *p, int policy, | |||
| 4258 | { | 4504 | { |
| 4259 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 4505 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
| 4260 | unsigned long flags; | 4506 | unsigned long flags; |
| 4507 | const struct sched_class *prev_class = p->sched_class; | ||
| 4261 | struct rq *rq; | 4508 | struct rq *rq; |
| 4262 | 4509 | ||
| 4263 | /* may grab non-irq protected spin_locks */ | 4510 | /* may grab non-irq protected spin_locks */ |
| @@ -4351,18 +4598,10 @@ recheck: | |||
| 4351 | if (on_rq) { | 4598 | if (on_rq) { |
| 4352 | if (running) | 4599 | if (running) |
| 4353 | p->sched_class->set_curr_task(rq); | 4600 | p->sched_class->set_curr_task(rq); |
| 4601 | |||
| 4354 | activate_task(rq, p, 0); | 4602 | activate_task(rq, p, 0); |
| 4355 | /* | 4603 | |
| 4356 | * Reschedule if we are currently running on this runqueue and | 4604 | check_class_changed(rq, p, prev_class, oldprio, running); |
| 4357 | * our priority decreased, or if we are not currently running on | ||
| 4358 | * this runqueue and our priority is higher than the current's | ||
| 4359 | */ | ||
| 4360 | if (running) { | ||
| 4361 | if (p->prio > oldprio) | ||
| 4362 | resched_task(rq->curr); | ||
| 4363 | } else { | ||
| 4364 | check_preempt_curr(rq, p); | ||
| 4365 | } | ||
| 4366 | } | 4605 | } |
| 4367 | __task_rq_unlock(rq); | 4606 | __task_rq_unlock(rq); |
| 4368 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4607 | spin_unlock_irqrestore(&p->pi_lock, flags); |
| @@ -4490,13 +4729,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
| 4490 | struct task_struct *p; | 4729 | struct task_struct *p; |
| 4491 | int retval; | 4730 | int retval; |
| 4492 | 4731 | ||
| 4493 | mutex_lock(&sched_hotcpu_mutex); | 4732 | get_online_cpus(); |
| 4494 | read_lock(&tasklist_lock); | 4733 | read_lock(&tasklist_lock); |
| 4495 | 4734 | ||
| 4496 | p = find_process_by_pid(pid); | 4735 | p = find_process_by_pid(pid); |
| 4497 | if (!p) { | 4736 | if (!p) { |
| 4498 | read_unlock(&tasklist_lock); | 4737 | read_unlock(&tasklist_lock); |
| 4499 | mutex_unlock(&sched_hotcpu_mutex); | 4738 | put_online_cpus(); |
| 4500 | return -ESRCH; | 4739 | return -ESRCH; |
| 4501 | } | 4740 | } |
| 4502 | 4741 | ||
| @@ -4536,7 +4775,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
| 4536 | } | 4775 | } |
| 4537 | out_unlock: | 4776 | out_unlock: |
| 4538 | put_task_struct(p); | 4777 | put_task_struct(p); |
| 4539 | mutex_unlock(&sched_hotcpu_mutex); | 4778 | put_online_cpus(); |
| 4540 | return retval; | 4779 | return retval; |
| 4541 | } | 4780 | } |
| 4542 | 4781 | ||
| @@ -4593,7 +4832,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) | |||
| 4593 | struct task_struct *p; | 4832 | struct task_struct *p; |
| 4594 | int retval; | 4833 | int retval; |
| 4595 | 4834 | ||
| 4596 | mutex_lock(&sched_hotcpu_mutex); | 4835 | get_online_cpus(); |
| 4597 | read_lock(&tasklist_lock); | 4836 | read_lock(&tasklist_lock); |
| 4598 | 4837 | ||
| 4599 | retval = -ESRCH; | 4838 | retval = -ESRCH; |
| @@ -4609,7 +4848,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) | |||
| 4609 | 4848 | ||
| 4610 | out_unlock: | 4849 | out_unlock: |
| 4611 | read_unlock(&tasklist_lock); | 4850 | read_unlock(&tasklist_lock); |
| 4612 | mutex_unlock(&sched_hotcpu_mutex); | 4851 | put_online_cpus(); |
| 4613 | 4852 | ||
| 4614 | return retval; | 4853 | return retval; |
| 4615 | } | 4854 | } |
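
sched_setaffinity() and sched_getaffinity() switch from the scheduler-private sched_hotcpu_mutex to the generic get_online_cpus()/put_online_cpus() pair, which lets any number of readers hold off CPU hotplug at once instead of serializing on one mutex. The real primitive is a refcount guarded by a mutex; a reader/writer lock gives a rough userspace analogue of the semantics only:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;

    /* Any number of tasks may pin the current set of online CPUs at once. */
    static void get_online_cpus_toy(void) { pthread_rwlock_rdlock(&hotplug_lock); }
    static void put_online_cpus_toy(void) { pthread_rwlock_unlock(&hotplug_lock); }

    /* The hotplug path itself takes the guard exclusively. */
    static void cpu_hotplug_begin_toy(void) { pthread_rwlock_wrlock(&hotplug_lock); }
    static void cpu_hotplug_done_toy(void)  { pthread_rwlock_unlock(&hotplug_lock); }

    int main(void)
    {
        get_online_cpus_toy();
        printf("walking the task list with hotplug held off\n");
        put_online_cpus_toy();

        cpu_hotplug_begin_toy();
        printf("taking a CPU down\n");
        cpu_hotplug_done_toy();
        return 0;
    }
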
| @@ -4683,7 +4922,8 @@ static void __cond_resched(void) | |||
| 4683 | } while (need_resched()); | 4922 | } while (need_resched()); |
| 4684 | } | 4923 | } |
| 4685 | 4924 | ||
| 4686 | int __sched cond_resched(void) | 4925 | #if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) |
| 4926 | int __sched _cond_resched(void) | ||
| 4687 | { | 4927 | { |
| 4688 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && | 4928 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && |
| 4689 | system_state == SYSTEM_RUNNING) { | 4929 | system_state == SYSTEM_RUNNING) { |
| @@ -4692,7 +4932,8 @@ int __sched cond_resched(void) | |||
| 4692 | } | 4932 | } |
| 4693 | return 0; | 4933 | return 0; |
| 4694 | } | 4934 | } |
| 4695 | EXPORT_SYMBOL(cond_resched); | 4935 | EXPORT_SYMBOL(_cond_resched); |
| 4936 | #endif | ||
| 4696 | 4937 | ||
| 4697 | /* | 4938 | /* |
| 4698 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 4939 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, |
| @@ -4890,7 +5131,7 @@ out_unlock: | |||
| 4890 | 5131 | ||
| 4891 | static const char stat_nam[] = "RSDTtZX"; | 5132 | static const char stat_nam[] = "RSDTtZX"; |
| 4892 | 5133 | ||
| 4893 | static void show_task(struct task_struct *p) | 5134 | void sched_show_task(struct task_struct *p) |
| 4894 | { | 5135 | { |
| 4895 | unsigned long free = 0; | 5136 | unsigned long free = 0; |
| 4896 | unsigned state; | 5137 | unsigned state; |
| @@ -4920,8 +5161,7 @@ static void show_task(struct task_struct *p) | |||
| 4920 | printk(KERN_CONT "%5lu %5d %6d\n", free, | 5161 | printk(KERN_CONT "%5lu %5d %6d\n", free, |
| 4921 | task_pid_nr(p), task_pid_nr(p->real_parent)); | 5162 | task_pid_nr(p), task_pid_nr(p->real_parent)); |
| 4922 | 5163 | ||
| 4923 | if (state != TASK_RUNNING) | 5164 | show_stack(p, NULL); |
| 4924 | show_stack(p, NULL); | ||
| 4925 | } | 5165 | } |
| 4926 | 5166 | ||
| 4927 | void show_state_filter(unsigned long state_filter) | 5167 | void show_state_filter(unsigned long state_filter) |
| @@ -4943,7 +5183,7 @@ void show_state_filter(unsigned long state_filter) | |||
| 4943 | */ | 5183 | */ |
| 4944 | touch_nmi_watchdog(); | 5184 | touch_nmi_watchdog(); |
| 4945 | if (!state_filter || (p->state & state_filter)) | 5185 | if (!state_filter || (p->state & state_filter)) |
| 4946 | show_task(p); | 5186 | sched_show_task(p); |
| 4947 | } while_each_thread(g, p); | 5187 | } while_each_thread(g, p); |
| 4948 | 5188 | ||
| 4949 | touch_all_softlockup_watchdogs(); | 5189 | touch_all_softlockup_watchdogs(); |
| @@ -4992,11 +5232,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
| 4992 | spin_unlock_irqrestore(&rq->lock, flags); | 5232 | spin_unlock_irqrestore(&rq->lock, flags); |
| 4993 | 5233 | ||
| 4994 | /* Set the preempt count _outside_ the spinlocks! */ | 5234 | /* Set the preempt count _outside_ the spinlocks! */ |
| 4995 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) | ||
| 4996 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | ||
| 4997 | #else | ||
| 4998 | task_thread_info(idle)->preempt_count = 0; | 5235 | task_thread_info(idle)->preempt_count = 0; |
| 4999 | #endif | 5236 | |
| 5000 | /* | 5237 | /* |
| 5001 | * The idle tasks have their own, simple scheduling class: | 5238 | * The idle tasks have their own, simple scheduling class: |
| 5002 | */ | 5239 | */ |
| @@ -5077,7 +5314,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | |||
| 5077 | goto out; | 5314 | goto out; |
| 5078 | } | 5315 | } |
| 5079 | 5316 | ||
| 5080 | p->cpus_allowed = new_mask; | 5317 | if (p->sched_class->set_cpus_allowed) |
| 5318 | p->sched_class->set_cpus_allowed(p, &new_mask); | ||
| 5319 | else { | ||
| 5320 | p->cpus_allowed = new_mask; | ||
| 5321 | p->rt.nr_cpus_allowed = cpus_weight(new_mask); | ||
| 5322 | } | ||
| 5323 | |||
| 5081 | /* Can the task run on the task's current CPU? If so, we're done */ | 5324 | /* Can the task run on the task's current CPU? If so, we're done */ |
| 5082 | if (cpu_isset(task_cpu(p), new_mask)) | 5325 | if (cpu_isset(task_cpu(p), new_mask)) |
| 5083 | goto out; | 5326 | goto out; |
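
set_cpus_allowed() now hands the new mask to a class-specific hook when one exists and otherwise caches cpus_weight(new_mask) in rt.nr_cpus_allowed, so the RT push/pull code can tell cheaply whether a task can migrate at all. A toy version of that cached-weight bookkeeping, with types and names invented for the example:

    #include <stdio.h>

    typedef unsigned long cpumask_toy_t;   /* one bit per CPU, illustrative only */

    struct task_toy {
        cpumask_toy_t cpus_allowed;
        int nr_cpus_allowed;               /* cached weight of cpus_allowed */
    };

    static int cpus_weight_toy(cpumask_toy_t m)
    {
        int n = 0;
        for (; m; m &= m - 1)              /* clear lowest set bit */
            n++;
        return n;
    }

    static void set_cpus_allowed_toy(struct task_toy *p, cpumask_toy_t new_mask)
    {
        p->cpus_allowed = new_mask;
        p->nr_cpus_allowed = cpus_weight_toy(new_mask);
    }

    int main(void)
    {
        struct task_toy p = { 0, 0 };

        set_cpus_allowed_toy(&p, 0xb);     /* CPUs 0, 1 and 3 */
        /* a weight above 1 means the RT push logic may migrate the task */
        printf("mask=%#lx weight=%d\n", p.cpus_allowed, p.nr_cpus_allowed);
        return 0;
    }
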
| @@ -5569,9 +5812,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 5569 | struct rq *rq; | 5812 | struct rq *rq; |
| 5570 | 5813 | ||
| 5571 | switch (action) { | 5814 | switch (action) { |
| 5572 | case CPU_LOCK_ACQUIRE: | ||
| 5573 | mutex_lock(&sched_hotcpu_mutex); | ||
| 5574 | break; | ||
| 5575 | 5815 | ||
| 5576 | case CPU_UP_PREPARE: | 5816 | case CPU_UP_PREPARE: |
| 5577 | case CPU_UP_PREPARE_FROZEN: | 5817 | case CPU_UP_PREPARE_FROZEN: |
| @@ -5590,6 +5830,15 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 5590 | case CPU_ONLINE_FROZEN: | 5830 | case CPU_ONLINE_FROZEN: |
| 5591 | /* Strictly unnecessary, as first user will wake it. */ | 5831 | /* Strictly unnecessary, as first user will wake it. */ |
| 5592 | wake_up_process(cpu_rq(cpu)->migration_thread); | 5832 | wake_up_process(cpu_rq(cpu)->migration_thread); |
| 5833 | |||
| 5834 | /* Update our root-domain */ | ||
| 5835 | rq = cpu_rq(cpu); | ||
| 5836 | spin_lock_irqsave(&rq->lock, flags); | ||
| 5837 | if (rq->rd) { | ||
| 5838 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | ||
| 5839 | cpu_set(cpu, rq->rd->online); | ||
| 5840 | } | ||
| 5841 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 5593 | break; | 5842 | break; |
| 5594 | 5843 | ||
| 5595 | #ifdef CONFIG_HOTPLUG_CPU | 5844 | #ifdef CONFIG_HOTPLUG_CPU |
| @@ -5640,10 +5889,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 5640 | } | 5889 | } |
| 5641 | spin_unlock_irq(&rq->lock); | 5890 | spin_unlock_irq(&rq->lock); |
| 5642 | break; | 5891 | break; |
| 5643 | #endif | 5892 | |
| 5644 | case CPU_LOCK_RELEASE: | 5893 | case CPU_DOWN_PREPARE: |
| 5645 | mutex_unlock(&sched_hotcpu_mutex); | 5894 | /* Update our root-domain */ |
| 5895 | rq = cpu_rq(cpu); | ||
| 5896 | spin_lock_irqsave(&rq->lock, flags); | ||
| 5897 | if (rq->rd) { | ||
| 5898 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | ||
| 5899 | cpu_clear(cpu, rq->rd->online); | ||
| 5900 | } | ||
| 5901 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 5646 | break; | 5902 | break; |
| 5903 | #endif | ||
| 5647 | } | 5904 | } |
| 5648 | return NOTIFY_OK; | 5905 | return NOTIFY_OK; |
| 5649 | } | 5906 | } |
| @@ -5831,11 +6088,76 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
| 5831 | return 1; | 6088 | return 1; |
| 5832 | } | 6089 | } |
| 5833 | 6090 | ||
| 6091 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) | ||
| 6092 | { | ||
| 6093 | unsigned long flags; | ||
| 6094 | const struct sched_class *class; | ||
| 6095 | |||
| 6096 | spin_lock_irqsave(&rq->lock, flags); | ||
| 6097 | |||
| 6098 | if (rq->rd) { | ||
| 6099 | struct root_domain *old_rd = rq->rd; | ||
| 6100 | |||
| 6101 | for (class = sched_class_highest; class; class = class->next) { | ||
| 6102 | if (class->leave_domain) | ||
| 6103 | class->leave_domain(rq); | ||
| 6104 | } | ||
| 6105 | |||
| 6106 | cpu_clear(rq->cpu, old_rd->span); | ||
| 6107 | cpu_clear(rq->cpu, old_rd->online); | ||
| 6108 | |||
| 6109 | if (atomic_dec_and_test(&old_rd->refcount)) | ||
| 6110 | kfree(old_rd); | ||
| 6111 | } | ||
| 6112 | |||
| 6113 | atomic_inc(&rd->refcount); | ||
| 6114 | rq->rd = rd; | ||
| 6115 | |||
| 6116 | cpu_set(rq->cpu, rd->span); | ||
| 6117 | if (cpu_isset(rq->cpu, cpu_online_map)) | ||
| 6118 | cpu_set(rq->cpu, rd->online); | ||
| 6119 | |||
| 6120 | for (class = sched_class_highest; class; class = class->next) { | ||
| 6121 | if (class->join_domain) | ||
| 6122 | class->join_domain(rq); | ||
| 6123 | } | ||
| 6124 | |||
| 6125 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 6126 | } | ||
| 6127 | |||
| 6128 | static void init_rootdomain(struct root_domain *rd) | ||
| 6129 | { | ||
| 6130 | memset(rd, 0, sizeof(*rd)); | ||
| 6131 | |||
| 6132 | cpus_clear(rd->span); | ||
| 6133 | cpus_clear(rd->online); | ||
| 6134 | } | ||
| 6135 | |||
| 6136 | static void init_defrootdomain(void) | ||
| 6137 | { | ||
| 6138 | init_rootdomain(&def_root_domain); | ||
| 6139 | atomic_set(&def_root_domain.refcount, 1); | ||
| 6140 | } | ||
| 6141 | |||
| 6142 | static struct root_domain *alloc_rootdomain(void) | ||
| 6143 | { | ||
| 6144 | struct root_domain *rd; | ||
| 6145 | |||
| 6146 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); | ||
| 6147 | if (!rd) | ||
| 6148 | return NULL; | ||
| 6149 | |||
| 6150 | init_rootdomain(rd); | ||
| 6151 | |||
| 6152 | return rd; | ||
| 6153 | } | ||
| 6154 | |||
| 5834 | /* | 6155 | /* |
| 5835 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 6156 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
| 5836 | * hold the hotplug lock. | 6157 | * hold the hotplug lock. |
| 5837 | */ | 6158 | */ |
| 5838 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) | 6159 | static void |
| 6160 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | ||
| 5839 | { | 6161 | { |
| 5840 | struct rq *rq = cpu_rq(cpu); | 6162 | struct rq *rq = cpu_rq(cpu); |
| 5841 | struct sched_domain *tmp; | 6163 | struct sched_domain *tmp; |
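
rq_attach_root() moves a runqueue between reference-counted root domains: clear the CPU out of the old domain's span and online masks and drop a reference, freeing the domain on the last put, then take a reference on the new domain and add the CPU to it. A standalone sketch of just the refcount and mask handling, leaving out the per-class join/leave callbacks and the rq->lock protection:

    #include <stdio.h>
    #include <stdlib.h>

    struct root_domain_toy {
        int refcount;                 /* the kernel uses atomic_t under rq->lock */
        unsigned long span, online;   /* toy cpumasks: one bit per CPU */
    };

    struct rq_toy {
        int cpu;
        struct root_domain_toy *rd;
    };

    static void rq_attach_root_toy(struct rq_toy *rq, struct root_domain_toy *rd,
                                   int cpu_is_online)
    {
        if (rq->rd) {
            struct root_domain_toy *old = rq->rd;

            old->span   &= ~(1UL << rq->cpu);
            old->online &= ~(1UL << rq->cpu);
            if (--old->refcount == 0)
                free(old);            /* last runqueue left: release the domain */
        }

        rd->refcount++;
        rq->rd = rd;
        rd->span |= 1UL << rq->cpu;
        if (cpu_is_online)
            rd->online |= 1UL << rq->cpu;
    }

    int main(void)
    {
        struct root_domain_toy def = { .refcount = 1 };   /* extra ref: never freed */
        struct root_domain_toy *rd = calloc(1, sizeof(*rd));
        struct rq_toy rq0 = { .cpu = 0, .rd = NULL };

        rq_attach_root_toy(&rq0, &def, 1);   /* boot: attach to the default domain */
        rq_attach_root_toy(&rq0, rd, 1);     /* domain rebuild: move to the new rd */
        printf("span=%#lx online=%#lx refs=%d\n", rd->span, rd->online, rd->refcount);
        free(rd);
        return 0;
    }
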
| @@ -5860,6 +6182,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) | |||
| 5860 | 6182 | ||
| 5861 | sched_domain_debug(sd, cpu); | 6183 | sched_domain_debug(sd, cpu); |
| 5862 | 6184 | ||
| 6185 | rq_attach_root(rq, rd); | ||
| 5863 | rcu_assign_pointer(rq->sd, sd); | 6186 | rcu_assign_pointer(rq->sd, sd); |
| 5864 | } | 6187 | } |
| 5865 | 6188 | ||
| @@ -6228,6 +6551,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 6228 | static int build_sched_domains(const cpumask_t *cpu_map) | 6551 | static int build_sched_domains(const cpumask_t *cpu_map) |
| 6229 | { | 6552 | { |
| 6230 | int i; | 6553 | int i; |
| 6554 | struct root_domain *rd; | ||
| 6231 | #ifdef CONFIG_NUMA | 6555 | #ifdef CONFIG_NUMA |
| 6232 | struct sched_group **sched_group_nodes = NULL; | 6556 | struct sched_group **sched_group_nodes = NULL; |
| 6233 | int sd_allnodes = 0; | 6557 | int sd_allnodes = 0; |
| @@ -6244,6 +6568,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6244 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 6568 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
| 6245 | #endif | 6569 | #endif |
| 6246 | 6570 | ||
| 6571 | rd = alloc_rootdomain(); | ||
| 6572 | if (!rd) { | ||
| 6573 | printk(KERN_WARNING "Cannot alloc root domain\n"); | ||
| 6574 | return -ENOMEM; | ||
| 6575 | } | ||
| 6576 | |||
| 6247 | /* | 6577 | /* |
| 6248 | * Set up domains for cpus specified by the cpu_map. | 6578 | * Set up domains for cpus specified by the cpu_map. |
| 6249 | */ | 6579 | */ |
| @@ -6460,7 +6790,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6460 | #else | 6790 | #else |
| 6461 | sd = &per_cpu(phys_domains, i); | 6791 | sd = &per_cpu(phys_domains, i); |
| 6462 | #endif | 6792 | #endif |
| 6463 | cpu_attach_domain(sd, i); | 6793 | cpu_attach_domain(sd, rd, i); |
| 6464 | } | 6794 | } |
| 6465 | 6795 | ||
| 6466 | return 0; | 6796 | return 0; |
| @@ -6518,7 +6848,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
| 6518 | unregister_sched_domain_sysctl(); | 6848 | unregister_sched_domain_sysctl(); |
| 6519 | 6849 | ||
| 6520 | for_each_cpu_mask(i, *cpu_map) | 6850 | for_each_cpu_mask(i, *cpu_map) |
| 6521 | cpu_attach_domain(NULL, i); | 6851 | cpu_attach_domain(NULL, &def_root_domain, i); |
| 6522 | synchronize_sched(); | 6852 | synchronize_sched(); |
| 6523 | arch_destroy_sched_domains(cpu_map); | 6853 | arch_destroy_sched_domains(cpu_map); |
| 6524 | } | 6854 | } |
| @@ -6548,6 +6878,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) | |||
| 6548 | { | 6878 | { |
| 6549 | int i, j; | 6879 | int i, j; |
| 6550 | 6880 | ||
| 6881 | lock_doms_cur(); | ||
| 6882 | |||
| 6551 | /* always unregister in case we don't destroy any domains */ | 6883 | /* always unregister in case we don't destroy any domains */ |
| 6552 | unregister_sched_domain_sysctl(); | 6884 | unregister_sched_domain_sysctl(); |
| 6553 | 6885 | ||
| @@ -6588,6 +6920,8 @@ match2: | |||
| 6588 | ndoms_cur = ndoms_new; | 6920 | ndoms_cur = ndoms_new; |
| 6589 | 6921 | ||
| 6590 | register_sched_domain_sysctl(); | 6922 | register_sched_domain_sysctl(); |
| 6923 | |||
| 6924 | unlock_doms_cur(); | ||
| 6591 | } | 6925 | } |
| 6592 | 6926 | ||
| 6593 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 6927 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
| @@ -6595,10 +6929,10 @@ static int arch_reinit_sched_domains(void) | |||
| 6595 | { | 6929 | { |
| 6596 | int err; | 6930 | int err; |
| 6597 | 6931 | ||
| 6598 | mutex_lock(&sched_hotcpu_mutex); | 6932 | get_online_cpus(); |
| 6599 | detach_destroy_domains(&cpu_online_map); | 6933 | detach_destroy_domains(&cpu_online_map); |
| 6600 | err = arch_init_sched_domains(&cpu_online_map); | 6934 | err = arch_init_sched_domains(&cpu_online_map); |
| 6601 | mutex_unlock(&sched_hotcpu_mutex); | 6935 | put_online_cpus(); |
| 6602 | 6936 | ||
| 6603 | return err; | 6937 | return err; |
| 6604 | } | 6938 | } |
| @@ -6709,12 +7043,12 @@ void __init sched_init_smp(void) | |||
| 6709 | { | 7043 | { |
| 6710 | cpumask_t non_isolated_cpus; | 7044 | cpumask_t non_isolated_cpus; |
| 6711 | 7045 | ||
| 6712 | mutex_lock(&sched_hotcpu_mutex); | 7046 | get_online_cpus(); |
| 6713 | arch_init_sched_domains(&cpu_online_map); | 7047 | arch_init_sched_domains(&cpu_online_map); |
| 6714 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); | 7048 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); |
| 6715 | if (cpus_empty(non_isolated_cpus)) | 7049 | if (cpus_empty(non_isolated_cpus)) |
| 6716 | cpu_set(smp_processor_id(), non_isolated_cpus); | 7050 | cpu_set(smp_processor_id(), non_isolated_cpus); |
| 6717 | mutex_unlock(&sched_hotcpu_mutex); | 7051 | put_online_cpus(); |
| 6718 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 7052 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
| 6719 | hotcpu_notifier(update_sched_domains, 0); | 7053 | hotcpu_notifier(update_sched_domains, 0); |
| 6720 | 7054 | ||
| @@ -6722,6 +7056,21 @@ void __init sched_init_smp(void) | |||
| 6722 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 7056 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
| 6723 | BUG(); | 7057 | BUG(); |
| 6724 | sched_init_granularity(); | 7058 | sched_init_granularity(); |
| 7059 | |||
| 7060 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 7061 | if (nr_cpu_ids == 1) | ||
| 7062 | return; | ||
| 7063 | |||
| 7064 | lb_monitor_task = kthread_create(load_balance_monitor, NULL, | ||
| 7065 | "group_balance"); | ||
| 7066 | if (!IS_ERR(lb_monitor_task)) { | ||
| 7067 | lb_monitor_task->flags |= PF_NOFREEZE; | ||
| 7068 | wake_up_process(lb_monitor_task); | ||
| 7069 | } else { | ||
| 7070 | printk(KERN_ERR "Could not create load balance monitor thread" | ||
| 7071 | "(error = %ld) \n", PTR_ERR(lb_monitor_task)); | ||
| 7072 | } | ||
| 7073 | #endif | ||
| 6725 | } | 7074 | } |
| 6726 | #else | 7075 | #else |
| 6727 | void __init sched_init_smp(void) | 7076 | void __init sched_init_smp(void) |
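
With CONFIG_FAIR_GROUP_SCHED on SMP, sched_init_smp() now spawns a group_balance kernel thread that periodically calls rebalance_shares(); a creation failure is logged but boot continues. A userspace analogue using pthreads rather than kthread_create()/wake_up_process():

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Stand-in for the shares rebalancing passes the real monitor performs. */
    static void *balance_monitor_toy(void *unused)
    {
        (void)unused;
        for (int i = 0; i < 3; i++) {
            printf("rebalancing group shares (pass %d)\n", i);
            sleep(1);
        }
        return NULL;
    }

    int main(void)
    {
        pthread_t tid;
        int err = pthread_create(&tid, NULL, balance_monitor_toy, NULL);

        if (err) {
            /* like the kernel, report the failure but carry on */
            fprintf(stderr, "could not create balance monitor (error = %d)\n", err);
            return 0;
        }
        pthread_join(tid, NULL);
        return 0;
    }
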
| @@ -6746,13 +7095,87 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | |||
| 6746 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 7095 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
| 6747 | } | 7096 | } |
| 6748 | 7097 | ||
| 7098 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | ||
| 7099 | { | ||
| 7100 | struct rt_prio_array *array; | ||
| 7101 | int i; | ||
| 7102 | |||
| 7103 | array = &rt_rq->active; | ||
| 7104 | for (i = 0; i < MAX_RT_PRIO; i++) { | ||
| 7105 | INIT_LIST_HEAD(array->queue + i); | ||
| 7106 | __clear_bit(i, array->bitmap); | ||
| 7107 | } | ||
| 7108 | /* delimiter for bitsearch: */ | ||
| 7109 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
| 7110 | |||
| 7111 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
| 7112 | rt_rq->highest_prio = MAX_RT_PRIO; | ||
| 7113 | #endif | ||
| 7114 | #ifdef CONFIG_SMP | ||
| 7115 | rt_rq->rt_nr_migratory = 0; | ||
| 7116 | rt_rq->overloaded = 0; | ||
| 7117 | #endif | ||
| 7118 | |||
| 7119 | rt_rq->rt_time = 0; | ||
| 7120 | rt_rq->rt_throttled = 0; | ||
| 7121 | |||
| 7122 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 7123 | rt_rq->rq = rq; | ||
| 7124 | #endif | ||
| 7125 | } | ||
| 7126 | |||
| 7127 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 7128 | static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | ||
| 7129 | struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
| 7130 | int cpu, int add) | ||
| 7131 | { | ||
| 7132 | tg->cfs_rq[cpu] = cfs_rq; | ||
| 7133 | init_cfs_rq(cfs_rq, rq); | ||
| 7134 | cfs_rq->tg = tg; | ||
| 7135 | if (add) | ||
| 7136 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
| 7137 | |||
| 7138 | tg->se[cpu] = se; | ||
| 7139 | se->cfs_rq = &rq->cfs; | ||
| 7140 | se->my_q = cfs_rq; | ||
| 7141 | se->load.weight = tg->shares; | ||
| 7142 | se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); | ||
| 7143 | se->parent = NULL; | ||
| 7144 | } | ||
| 7145 | |||
| 7146 | static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, | ||
| 7147 | struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, | ||
| 7148 | int cpu, int add) | ||
| 7149 | { | ||
| 7150 | tg->rt_rq[cpu] = rt_rq; | ||
| 7151 | init_rt_rq(rt_rq, rq); | ||
| 7152 | rt_rq->tg = tg; | ||
| 7153 | rt_rq->rt_se = rt_se; | ||
| 7154 | if (add) | ||
| 7155 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
| 7156 | |||
| 7157 | tg->rt_se[cpu] = rt_se; | ||
| 7158 | rt_se->rt_rq = &rq->rt; | ||
| 7159 | rt_se->my_q = rt_rq; | ||
| 7160 | rt_se->parent = NULL; | ||
| 7161 | INIT_LIST_HEAD(&rt_se->run_list); | ||
| 7162 | } | ||
| 7163 | #endif | ||
| 7164 | |||
| 6749 | void __init sched_init(void) | 7165 | void __init sched_init(void) |
| 6750 | { | 7166 | { |
| 6751 | int highest_cpu = 0; | 7167 | int highest_cpu = 0; |
| 6752 | int i, j; | 7168 | int i, j; |
| 6753 | 7169 | ||
| 7170 | #ifdef CONFIG_SMP | ||
| 7171 | init_defrootdomain(); | ||
| 7172 | #endif | ||
| 7173 | |||
| 7174 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 7175 | list_add(&init_task_group.list, &task_groups); | ||
| 7176 | #endif | ||
| 7177 | |||
| 6754 | for_each_possible_cpu(i) { | 7178 | for_each_possible_cpu(i) { |
| 6755 | struct rt_prio_array *array; | ||
| 6756 | struct rq *rq; | 7179 | struct rq *rq; |
| 6757 | 7180 | ||
| 6758 | rq = cpu_rq(i); | 7181 | rq = cpu_rq(i); |
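
init_rt_rq() factors out the RT priority-array setup that sched_init() used to open-code per runqueue: a list head per priority, a bitmap of non-empty priorities, and one extra delimiter bit set past MAX_RT_PRIO so a find-first-bit scan always terminates. A small sketch of the structure and of how the highest queued priority falls out of the bitmap; sizes are shrunk for clarity:

    #include <stdio.h>
    #include <string.h>

    #define MAX_RT_PRIO_TOY 8            /* the kernel uses 100; small for clarity */

    struct rt_prio_array_toy {
        unsigned int bitmap;             /* needs MAX_RT_PRIO_TOY + 1 bits */
        int nr_queued[MAX_RT_PRIO_TOY];  /* stands in for the per-prio list heads */
    };

    static void init_array(struct rt_prio_array_toy *a)
    {
        memset(a, 0, sizeof(*a));
        a->bitmap |= 1u << MAX_RT_PRIO_TOY;    /* delimiter for the bit search */
    }

    static void enqueue(struct rt_prio_array_toy *a, int prio)
    {
        a->nr_queued[prio]++;
        a->bitmap |= 1u << prio;
    }

    static int highest_prio(const struct rt_prio_array_toy *a)
    {
        /* find first set bit; hits the delimiter when nothing is queued */
        return __builtin_ctz(a->bitmap);
    }

    int main(void)
    {
        struct rt_prio_array_toy a;

        init_array(&a);
        printf("empty: first bit = %d (delimiter)\n", highest_prio(&a));
        enqueue(&a, 5);
        enqueue(&a, 2);
        printf("queued: highest priority = %d\n", highest_prio(&a));
        return 0;
    }
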
| @@ -6761,52 +7184,39 @@ void __init sched_init(void) | |||
| 6761 | rq->nr_running = 0; | 7184 | rq->nr_running = 0; |
| 6762 | rq->clock = 1; | 7185 | rq->clock = 1; |
| 6763 | init_cfs_rq(&rq->cfs, rq); | 7186 | init_cfs_rq(&rq->cfs, rq); |
| 7187 | init_rt_rq(&rq->rt, rq); | ||
| 6764 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7188 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 6765 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | ||
| 6766 | { | ||
| 6767 | struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); | ||
| 6768 | struct sched_entity *se = | ||
| 6769 | &per_cpu(init_sched_entity, i); | ||
| 6770 | |||
| 6771 | init_cfs_rq_p[i] = cfs_rq; | ||
| 6772 | init_cfs_rq(cfs_rq, rq); | ||
| 6773 | cfs_rq->tg = &init_task_group; | ||
| 6774 | list_add(&cfs_rq->leaf_cfs_rq_list, | ||
| 6775 | &rq->leaf_cfs_rq_list); | ||
| 6776 | |||
| 6777 | init_sched_entity_p[i] = se; | ||
| 6778 | se->cfs_rq = &rq->cfs; | ||
| 6779 | se->my_q = cfs_rq; | ||
| 6780 | se->load.weight = init_task_group_load; | ||
| 6781 | se->load.inv_weight = | ||
| 6782 | div64_64(1ULL<<32, init_task_group_load); | ||
| 6783 | se->parent = NULL; | ||
| 6784 | } | ||
| 6785 | init_task_group.shares = init_task_group_load; | 7189 | init_task_group.shares = init_task_group_load; |
| 6786 | spin_lock_init(&init_task_group.lock); | 7190 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
| 7191 | init_tg_cfs_entry(rq, &init_task_group, | ||
| 7192 | &per_cpu(init_cfs_rq, i), | ||
| 7193 | &per_cpu(init_sched_entity, i), i, 1); | ||
| 7194 | |||
| 7195 | init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ | ||
| 7196 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | ||
| 7197 | init_tg_rt_entry(rq, &init_task_group, | ||
| 7198 | &per_cpu(init_rt_rq, i), | ||
| 7199 | &per_cpu(init_sched_rt_entity, i), i, 1); | ||
| 6787 | #endif | 7200 | #endif |
| 7201 | rq->rt_period_expire = 0; | ||
| 7202 | rq->rt_throttled = 0; | ||
| 6788 | 7203 | ||
| 6789 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7204 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
| 6790 | rq->cpu_load[j] = 0; | 7205 | rq->cpu_load[j] = 0; |
| 6791 | #ifdef CONFIG_SMP | 7206 | #ifdef CONFIG_SMP |
| 6792 | rq->sd = NULL; | 7207 | rq->sd = NULL; |
| 7208 | rq->rd = NULL; | ||
| 6793 | rq->active_balance = 0; | 7209 | rq->active_balance = 0; |
| 6794 | rq->next_balance = jiffies; | 7210 | rq->next_balance = jiffies; |
| 6795 | rq->push_cpu = 0; | 7211 | rq->push_cpu = 0; |
| 6796 | rq->cpu = i; | 7212 | rq->cpu = i; |
| 6797 | rq->migration_thread = NULL; | 7213 | rq->migration_thread = NULL; |
| 6798 | INIT_LIST_HEAD(&rq->migration_queue); | 7214 | INIT_LIST_HEAD(&rq->migration_queue); |
| 7215 | rq_attach_root(rq, &def_root_domain); | ||
| 6799 | #endif | 7216 | #endif |
| 7217 | init_rq_hrtick(rq); | ||
| 6800 | atomic_set(&rq->nr_iowait, 0); | 7218 | atomic_set(&rq->nr_iowait, 0); |
| 6801 | |||
| 6802 | array = &rq->rt.active; | ||
| 6803 | for (j = 0; j < MAX_RT_PRIO; j++) { | ||
| 6804 | INIT_LIST_HEAD(array->queue + j); | ||
| 6805 | __clear_bit(j, array->bitmap); | ||
| 6806 | } | ||
| 6807 | highest_cpu = i; | 7219 | highest_cpu = i; |
| 6808 | /* delimiter for bitsearch: */ | ||
| 6809 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
| 6810 | } | 7220 | } |
| 6811 | 7221 | ||
| 6812 | set_load_weight(&init_task); | 7222 | set_load_weight(&init_task); |
| @@ -6975,12 +7385,187 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
| 6975 | 7385 | ||
| 6976 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7386 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 6977 | 7387 | ||
| 7388 | #ifdef CONFIG_SMP | ||
| 7389 | /* | ||
| 7390 | * distribute shares of all task groups among their schedulable entities, | ||
| 7391 | * to reflect load distribution across cpus. | ||
| 7392 | */ | ||
| 7393 | static int rebalance_shares(struct sched_domain *sd, int this_cpu) | ||
| 7394 | { | ||
| 7395 | struct cfs_rq *cfs_rq; | ||
| 7396 | struct rq *rq = cpu_rq(this_cpu); | ||
| 7397 | cpumask_t sdspan = sd->span; | ||
| 7398 | int balanced = 1; | ||
| 7399 | |||
| 7400 | /* Walk thr' all the task groups that we have */ | ||
| 7401 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
| 7402 | int i; | ||
| 7403 | unsigned long total_load = 0, total_shares; | ||
| 7404 | struct task_group *tg = cfs_rq->tg; | ||
| 7405 | |||
| 7406 | /* Gather total task load of this group across cpus */ | ||
| 7407 | for_each_cpu_mask(i, sdspan) | ||
| 7408 | total_load += tg->cfs_rq[i]->load.weight; | ||
| 7409 | |||
| 7410 | /* Nothing to do if this group has no load */ | ||
| 7411 | if (!total_load) | ||
| 7412 | continue; | ||
| 7413 | |||
| 7414 | /* | ||
| 7415 | * tg->shares represents the number of cpu shares the task group | ||
| 7416 | * is eligible to hold on a single cpu. On N cpus, it is | ||
| 7417 | * eligible to hold (N * tg->shares) number of cpu shares. | ||
| 7418 | */ | ||
| 7419 | total_shares = tg->shares * cpus_weight(sdspan); | ||
| 7420 | |||
| 7421 | /* | ||
| 7422 | * redistribute total_shares across cpus as per the task load | ||
| 7423 | * distribution. | ||
| 7424 | */ | ||
| 7425 | for_each_cpu_mask(i, sdspan) { | ||
| 7426 | unsigned long local_load, local_shares; | ||
| 7427 | |||
| 7428 | local_load = tg->cfs_rq[i]->load.weight; | ||
| 7429 | local_shares = (local_load * total_shares) / total_load; | ||
| 7430 | if (!local_shares) | ||
| 7431 | local_shares = MIN_GROUP_SHARES; | ||
| 7432 | if (local_shares == tg->se[i]->load.weight) | ||
| 7433 | continue; | ||
| 7434 | |||
| 7435 | spin_lock_irq(&cpu_rq(i)->lock); | ||
| 7436 | set_se_shares(tg->se[i], local_shares); | ||
| 7437 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
| 7438 | balanced = 0; | ||
| 7439 | } | ||
| 7440 | } | ||
| 7441 | |||
| 7442 | return balanced; | ||
| 7443 | } | ||
| 7444 | |||
| 7445 | /* | ||
| 7446 | * How frequently should we rebalance_shares() across cpus? | ||
| 7447 | * | ||
| 7448 | * The more frequently we rebalance shares, the more accurate is the fairness | ||
| 7449 | * of cpu bandwidth distribution between task groups. However higher frequency | ||
| 7450 | * also implies increased scheduling overhead. | ||
| 7451 | * | ||
| 7452 | * sysctl_sched_min_bal_int_shares represents the minimum interval between | ||
| 7453 | * consecutive calls to rebalance_shares() in the same sched domain. | ||
| 7454 | * | ||
| 7455 | * sysctl_sched_max_bal_int_shares represents the maximum interval between | ||
| 7456 | * consecutive calls to rebalance_shares() in the same sched domain. | ||
| 7457 | * | ||
| 7458 | * These settings allows for the appropriate trade-off between accuracy of | ||
| 7459 | * fairness and the associated overhead. | ||
| 7460 | * | ||
| 7461 | */ | ||
| 7462 | |||
| 7463 | /* default: 8ms, units: milliseconds */ | ||
| 7464 | const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; | ||
| 7465 | |||
| 7466 | /* default: 128ms, units: milliseconds */ | ||
| 7467 | const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; | ||
| 7468 | |||
| 7469 | /* kernel thread that runs rebalance_shares() periodically */ | ||
| 7470 | static int load_balance_monitor(void *unused) | ||
| 7471 | { | ||
| 7472 | unsigned int timeout = sysctl_sched_min_bal_int_shares; | ||
| 7473 | struct sched_param schedparm; | ||
| 7474 | int ret; | ||
| 7475 | |||
| 7476 | /* | ||
| 7477 | * We don't want this thread's execution to be limited by the shares | ||
| 7478 | * assigned to default group (init_task_group). Hence make it run | ||
| 7479 | * as a SCHED_RR RT task at the lowest priority. | ||
| 7480 | */ | ||
| 7481 | schedparm.sched_priority = 1; | ||
| 7482 | ret = sched_setscheduler(current, SCHED_RR, &schedparm); | ||
| 7483 | if (ret) | ||
| 7484 | printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" | ||
| 7485 | " monitor thread (error = %d) \n", ret); | ||
| 7486 | |||
| 7487 | while (!kthread_should_stop()) { | ||
| 7488 | int i, cpu, balanced = 1; | ||
| 7489 | |||
| 7490 | /* Prevent cpus going down or coming up */ | ||
| 7491 | get_online_cpus(); | ||
| 7492 | /* lockout changes to doms_cur[] array */ | ||
| 7493 | lock_doms_cur(); | ||
| 7494 | /* | ||
| 7495 | * Enter a rcu read-side critical section to safely walk rq->sd | ||
| 7496 | * chain on various cpus and to walk task group list | ||
| 7497 | * (rq->leaf_cfs_rq_list) in rebalance_shares(). | ||
| 7498 | */ | ||
| 7499 | rcu_read_lock(); | ||
| 7500 | |||
| 7501 | for (i = 0; i < ndoms_cur; i++) { | ||
| 7502 | cpumask_t cpumap = doms_cur[i]; | ||
| 7503 | struct sched_domain *sd = NULL, *sd_prev = NULL; | ||
| 7504 | |||
| 7505 | cpu = first_cpu(cpumap); | ||
| 7506 | |||
| 7507 | /* Find the highest domain at which to balance shares */ | ||
| 7508 | for_each_domain(cpu, sd) { | ||
| 7509 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
| 7510 | continue; | ||
| 7511 | sd_prev = sd; | ||
| 7512 | } | ||
| 7513 | |||
| 7514 | sd = sd_prev; | ||
| 7515 | /* sd == NULL? No load balance reqd in this domain */ | ||
| 7516 | if (!sd) | ||
| 7517 | continue; | ||
| 7518 | |||
| 7519 | balanced &= rebalance_shares(sd, cpu); | ||
| 7520 | } | ||
| 7521 | |||
| 7522 | rcu_read_unlock(); | ||
| 7523 | |||
| 7524 | unlock_doms_cur(); | ||
| 7525 | put_online_cpus(); | ||
| 7526 | |||
| 7527 | if (!balanced) | ||
| 7528 | timeout = sysctl_sched_min_bal_int_shares; | ||
| 7529 | else if (timeout < sysctl_sched_max_bal_int_shares) | ||
| 7530 | timeout *= 2; | ||
| 7531 | |||
| 7532 | msleep_interruptible(timeout); | ||
| 7533 | } | ||
| 7534 | |||
| 7535 | return 0; | ||
| 7536 | } | ||
| 7537 | #endif /* CONFIG_SMP */ | ||
| 7538 | |||
| 7539 | static void free_sched_group(struct task_group *tg) | ||
| 7540 | { | ||
| 7541 | int i; | ||
| 7542 | |||
| 7543 | for_each_possible_cpu(i) { | ||
| 7544 | if (tg->cfs_rq) | ||
| 7545 | kfree(tg->cfs_rq[i]); | ||
| 7546 | if (tg->se) | ||
| 7547 | kfree(tg->se[i]); | ||
| 7548 | if (tg->rt_rq) | ||
| 7549 | kfree(tg->rt_rq[i]); | ||
| 7550 | if (tg->rt_se) | ||
| 7551 | kfree(tg->rt_se[i]); | ||
| 7552 | } | ||
| 7553 | |||
| 7554 | kfree(tg->cfs_rq); | ||
| 7555 | kfree(tg->se); | ||
| 7556 | kfree(tg->rt_rq); | ||
| 7557 | kfree(tg->rt_se); | ||
| 7558 | kfree(tg); | ||
| 7559 | } | ||
| 7560 | |||
| 6978 | /* allocate runqueue etc for a new task group */ | 7561 | /* allocate runqueue etc for a new task group */ |
| 6979 | struct task_group *sched_create_group(void) | 7562 | struct task_group *sched_create_group(void) |
| 6980 | { | 7563 | { |
| 6981 | struct task_group *tg; | 7564 | struct task_group *tg; |
| 6982 | struct cfs_rq *cfs_rq; | 7565 | struct cfs_rq *cfs_rq; |
| 6983 | struct sched_entity *se; | 7566 | struct sched_entity *se; |
| 7567 | struct rt_rq *rt_rq; | ||
| 7568 | struct sched_rt_entity *rt_se; | ||
| 6984 | struct rq *rq; | 7569 | struct rq *rq; |
| 6985 | int i; | 7570 | int i; |
| 6986 | 7571 | ||
| @@ -6994,97 +7579,89 @@ struct task_group *sched_create_group(void) | |||
| 6994 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); | 7579 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); |
| 6995 | if (!tg->se) | 7580 | if (!tg->se) |
| 6996 | goto err; | 7581 | goto err; |
| 7582 | tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); | ||
| 7583 | if (!tg->rt_rq) | ||
| 7584 | goto err; | ||
| 7585 | tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); | ||
| 7586 | if (!tg->rt_se) | ||
| 7587 | goto err; | ||
| 7588 | |||
| 7589 | tg->shares = NICE_0_LOAD; | ||
| 7590 | tg->rt_ratio = 0; /* XXX */ | ||
| 6997 | 7591 | ||
| 6998 | for_each_possible_cpu(i) { | 7592 | for_each_possible_cpu(i) { |
| 6999 | rq = cpu_rq(i); | 7593 | rq = cpu_rq(i); |
| 7000 | 7594 | ||
| 7001 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, | 7595 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), |
| 7002 | cpu_to_node(i)); | 7596 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
| 7003 | if (!cfs_rq) | 7597 | if (!cfs_rq) |
| 7004 | goto err; | 7598 | goto err; |
| 7005 | 7599 | ||
| 7006 | se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, | 7600 | se = kmalloc_node(sizeof(struct sched_entity), |
| 7007 | cpu_to_node(i)); | 7601 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
| 7008 | if (!se) | 7602 | if (!se) |
| 7009 | goto err; | 7603 | goto err; |
| 7010 | 7604 | ||
| 7011 | memset(cfs_rq, 0, sizeof(struct cfs_rq)); | 7605 | rt_rq = kmalloc_node(sizeof(struct rt_rq), |
| 7012 | memset(se, 0, sizeof(struct sched_entity)); | 7606 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
| 7607 | if (!rt_rq) | ||
| 7608 | goto err; | ||
| 7013 | 7609 | ||
| 7014 | tg->cfs_rq[i] = cfs_rq; | 7610 | rt_se = kmalloc_node(sizeof(struct sched_rt_entity), |
| 7015 | init_cfs_rq(cfs_rq, rq); | 7611 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
| 7016 | cfs_rq->tg = tg; | 7612 | if (!rt_se) |
| 7613 | goto err; | ||
| 7017 | 7614 | ||
| 7018 | tg->se[i] = se; | 7615 | init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); |
| 7019 | se->cfs_rq = &rq->cfs; | 7616 | init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); |
| 7020 | se->my_q = cfs_rq; | ||
| 7021 | se->load.weight = NICE_0_LOAD; | ||
| 7022 | se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); | ||
| 7023 | se->parent = NULL; | ||
| 7024 | } | 7617 | } |
| 7025 | 7618 | ||
| 7619 | lock_task_group_list(); | ||
| 7026 | for_each_possible_cpu(i) { | 7620 | for_each_possible_cpu(i) { |
| 7027 | rq = cpu_rq(i); | 7621 | rq = cpu_rq(i); |
| 7028 | cfs_rq = tg->cfs_rq[i]; | 7622 | cfs_rq = tg->cfs_rq[i]; |
| 7029 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 7623 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
| 7624 | rt_rq = tg->rt_rq[i]; | ||
| 7625 | list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
| 7030 | } | 7626 | } |
| 7031 | 7627 | list_add_rcu(&tg->list, &task_groups); | |
| 7032 | tg->shares = NICE_0_LOAD; | 7628 | unlock_task_group_list(); |
| 7033 | spin_lock_init(&tg->lock); | ||
| 7034 | 7629 | ||
| 7035 | return tg; | 7630 | return tg; |
| 7036 | 7631 | ||
| 7037 | err: | 7632 | err: |
| 7038 | for_each_possible_cpu(i) { | 7633 | free_sched_group(tg); |
| 7039 | if (tg->cfs_rq) | ||
| 7040 | kfree(tg->cfs_rq[i]); | ||
| 7041 | if (tg->se) | ||
| 7042 | kfree(tg->se[i]); | ||
| 7043 | } | ||
| 7044 | kfree(tg->cfs_rq); | ||
| 7045 | kfree(tg->se); | ||
| 7046 | kfree(tg); | ||
| 7047 | |||
| 7048 | return ERR_PTR(-ENOMEM); | 7634 | return ERR_PTR(-ENOMEM); |
| 7049 | } | 7635 | } |
| 7050 | 7636 | ||
| 7051 | /* rcu callback to free various structures associated with a task group */ | 7637 | /* rcu callback to free various structures associated with a task group */ |
| 7052 | static void free_sched_group(struct rcu_head *rhp) | 7638 | static void free_sched_group_rcu(struct rcu_head *rhp) |
| 7053 | { | 7639 | { |
| 7054 | struct task_group *tg = container_of(rhp, struct task_group, rcu); | ||
| 7055 | struct cfs_rq *cfs_rq; | ||
| 7056 | struct sched_entity *se; | ||
| 7057 | int i; | ||
| 7058 | |||
| 7059 | /* now it should be safe to free those cfs_rqs */ | 7640 | /* now it should be safe to free those cfs_rqs */ |
| 7060 | for_each_possible_cpu(i) { | 7641 | free_sched_group(container_of(rhp, struct task_group, rcu)); |
| 7061 | cfs_rq = tg->cfs_rq[i]; | ||
| 7062 | kfree(cfs_rq); | ||
| 7063 | |||
| 7064 | se = tg->se[i]; | ||
| 7065 | kfree(se); | ||
| 7066 | } | ||
| 7067 | |||
| 7068 | kfree(tg->cfs_rq); | ||
| 7069 | kfree(tg->se); | ||
| 7070 | kfree(tg); | ||
| 7071 | } | 7642 | } |
| 7072 | 7643 | ||
| 7073 | /* Destroy runqueue etc associated with a task group */ | 7644 | /* Destroy runqueue etc associated with a task group */ |
| 7074 | void sched_destroy_group(struct task_group *tg) | 7645 | void sched_destroy_group(struct task_group *tg) |
| 7075 | { | 7646 | { |
| 7076 | struct cfs_rq *cfs_rq = NULL; | 7647 | struct cfs_rq *cfs_rq = NULL; |
| 7648 | struct rt_rq *rt_rq = NULL; | ||
| 7077 | int i; | 7649 | int i; |
| 7078 | 7650 | ||
| 7651 | lock_task_group_list(); | ||
| 7079 | for_each_possible_cpu(i) { | 7652 | for_each_possible_cpu(i) { |
| 7080 | cfs_rq = tg->cfs_rq[i]; | 7653 | cfs_rq = tg->cfs_rq[i]; |
| 7081 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | 7654 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); |
| 7655 | rt_rq = tg->rt_rq[i]; | ||
| 7656 | list_del_rcu(&rt_rq->leaf_rt_rq_list); | ||
| 7082 | } | 7657 | } |
| 7658 | list_del_rcu(&tg->list); | ||
| 7659 | unlock_task_group_list(); | ||
| 7083 | 7660 | ||
| 7084 | BUG_ON(!cfs_rq); | 7661 | BUG_ON(!cfs_rq); |
| 7085 | 7662 | ||
| 7086 | /* wait for possible concurrent references to cfs_rqs complete */ | 7663 | /* wait for possible concurrent references to cfs_rqs complete */ |
| 7087 | call_rcu(&tg->rcu, free_sched_group); | 7664 | call_rcu(&tg->rcu, free_sched_group_rcu); |
| 7088 | } | 7665 | } |
| 7089 | 7666 | ||
| 7090 | /* change task's runqueue when it moves between groups. | 7667 | /* change task's runqueue when it moves between groups. |
| @@ -7100,11 +7677,6 @@ void sched_move_task(struct task_struct *tsk) | |||
| 7100 | 7677 | ||
| 7101 | rq = task_rq_lock(tsk, &flags); | 7678 | rq = task_rq_lock(tsk, &flags); |
| 7102 | 7679 | ||
| 7103 | if (tsk->sched_class != &fair_sched_class) { | ||
| 7104 | set_task_cfs_rq(tsk, task_cpu(tsk)); | ||
| 7105 | goto done; | ||
| 7106 | } | ||
| 7107 | |||
| 7108 | update_rq_clock(rq); | 7680 | update_rq_clock(rq); |
| 7109 | 7681 | ||
| 7110 | running = task_current(rq, tsk); | 7682 | running = task_current(rq, tsk); |
| @@ -7116,7 +7688,7 @@ void sched_move_task(struct task_struct *tsk) | |||
| 7116 | tsk->sched_class->put_prev_task(rq, tsk); | 7688 | tsk->sched_class->put_prev_task(rq, tsk); |
| 7117 | } | 7689 | } |
| 7118 | 7690 | ||
| 7119 | set_task_cfs_rq(tsk, task_cpu(tsk)); | 7691 | set_task_rq(tsk, task_cpu(tsk)); |
| 7120 | 7692 | ||
| 7121 | if (on_rq) { | 7693 | if (on_rq) { |
| 7122 | if (unlikely(running)) | 7694 | if (unlikely(running)) |
| @@ -7124,53 +7696,82 @@ void sched_move_task(struct task_struct *tsk) | |||
| 7124 | enqueue_task(rq, tsk, 0); | 7696 | enqueue_task(rq, tsk, 0); |
| 7125 | } | 7697 | } |
| 7126 | 7698 | ||
| 7127 | done: | ||
| 7128 | task_rq_unlock(rq, &flags); | 7699 | task_rq_unlock(rq, &flags); |
| 7129 | } | 7700 | } |
| 7130 | 7701 | ||
| 7702 | /* rq->lock to be locked by caller */ | ||
| 7131 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 7703 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
| 7132 | { | 7704 | { |
| 7133 | struct cfs_rq *cfs_rq = se->cfs_rq; | 7705 | struct cfs_rq *cfs_rq = se->cfs_rq; |
| 7134 | struct rq *rq = cfs_rq->rq; | 7706 | struct rq *rq = cfs_rq->rq; |
| 7135 | int on_rq; | 7707 | int on_rq; |
| 7136 | 7708 | ||
| 7137 | spin_lock_irq(&rq->lock); | 7709 | if (!shares) |
| 7710 | shares = MIN_GROUP_SHARES; | ||
| 7138 | 7711 | ||
| 7139 | on_rq = se->on_rq; | 7712 | on_rq = se->on_rq; |
| 7140 | if (on_rq) | 7713 | if (on_rq) { |
| 7141 | dequeue_entity(cfs_rq, se, 0); | 7714 | dequeue_entity(cfs_rq, se, 0); |
| 7715 | dec_cpu_load(rq, se->load.weight); | ||
| 7716 | } | ||
| 7142 | 7717 | ||
| 7143 | se->load.weight = shares; | 7718 | se->load.weight = shares; |
| 7144 | se->load.inv_weight = div64_64((1ULL<<32), shares); | 7719 | se->load.inv_weight = div64_64((1ULL<<32), shares); |
| 7145 | 7720 | ||
| 7146 | if (on_rq) | 7721 | if (on_rq) { |
| 7147 | enqueue_entity(cfs_rq, se, 0); | 7722 | enqueue_entity(cfs_rq, se, 0); |
| 7148 | 7723 | inc_cpu_load(rq, se->load.weight); | |
| 7149 | spin_unlock_irq(&rq->lock); | 7724 | } |
| 7150 | } | 7725 | } |
| 7151 | 7726 | ||
| 7152 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 7727 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
| 7153 | { | 7728 | { |
| 7154 | int i; | 7729 | int i; |
| 7730 | struct cfs_rq *cfs_rq; | ||
| 7731 | struct rq *rq; | ||
| 7732 | |||
| 7733 | lock_task_group_list(); | ||
| 7734 | if (tg->shares == shares) | ||
| 7735 | goto done; | ||
| 7736 | |||
| 7737 | if (shares < MIN_GROUP_SHARES) | ||
| 7738 | shares = MIN_GROUP_SHARES; | ||
| 7155 | 7739 | ||
| 7156 | /* | 7740 | /* |
| 7157 | * A weight of 0 or 1 can cause arithmetics problems. | 7741 | * Prevent any load balance activity (rebalance_shares, |
| 7158 | * (The default weight is 1024 - so there's no practical | 7742 | * load_balance_fair) from referring to this group first, |
| 7159 | * limitation from this.) | 7743 | * by taking it off the rq->leaf_cfs_rq_list on each cpu. |
| 7160 | */ | 7744 | */ |
| 7161 | if (shares < 2) | 7745 | for_each_possible_cpu(i) { |
| 7162 | shares = 2; | 7746 | cfs_rq = tg->cfs_rq[i]; |
| 7747 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | ||
| 7748 | } | ||
| 7163 | 7749 | ||
| 7164 | spin_lock(&tg->lock); | 7750 | /* wait for any ongoing reference to this group to finish */ |
| 7165 | if (tg->shares == shares) | 7751 | synchronize_sched(); |
| 7166 | goto done; | ||
| 7167 | 7752 | ||
| 7753 | /* | ||
| 7754 | * Now we are free to modify the group's share on each cpu | ||
| 7755 | * w/o tripping rebalance_share or load_balance_fair. | ||
| 7756 | */ | ||
| 7168 | tg->shares = shares; | 7757 | tg->shares = shares; |
| 7169 | for_each_possible_cpu(i) | 7758 | for_each_possible_cpu(i) { |
| 7759 | spin_lock_irq(&cpu_rq(i)->lock); | ||
| 7170 | set_se_shares(tg->se[i], shares); | 7760 | set_se_shares(tg->se[i], shares); |
| 7761 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
| 7762 | } | ||
| 7171 | 7763 | ||
| 7764 | /* | ||
| 7765 | * Enable load balance activity on this group, by inserting it back on | ||
| 7766 | * each cpu's rq->leaf_cfs_rq_list. | ||
| 7767 | */ | ||
| 7768 | for_each_possible_cpu(i) { | ||
| 7769 | rq = cpu_rq(i); | ||
| 7770 | cfs_rq = tg->cfs_rq[i]; | ||
| 7771 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
| 7772 | } | ||
| 7172 | done: | 7773 | done: |
| 7173 | spin_unlock(&tg->lock); | 7774 | unlock_task_group_list(); |
| 7174 | return 0; | 7775 | return 0; |
| 7175 | } | 7776 | } |
| 7176 | 7777 | ||
| @@ -7179,6 +7780,31 @@ unsigned long sched_group_shares(struct task_group *tg) | |||
| 7179 | return tg->shares; | 7780 | return tg->shares; |
| 7180 | } | 7781 | } |
| 7181 | 7782 | ||
| 7783 | /* | ||
| 7784 | * Ensure the total rt_ratio <= sysctl_sched_rt_ratio | ||
| 7785 | */ | ||
| 7786 | int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) | ||
| 7787 | { | ||
| 7788 | struct task_group *tgi; | ||
| 7789 | unsigned long total = 0; | ||
| 7790 | |||
| 7791 | rcu_read_lock(); | ||
| 7792 | list_for_each_entry_rcu(tgi, &task_groups, list) | ||
| 7793 | total += tgi->rt_ratio; | ||
| 7794 | rcu_read_unlock(); | ||
| 7795 | |||
| 7796 | if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) | ||
| 7797 | return -EINVAL; | ||
| 7798 | |||
| 7799 | tg->rt_ratio = rt_ratio; | ||
| 7800 | return 0; | ||
| 7801 | } | ||
| 7802 | |||
| 7803 | unsigned long sched_group_rt_ratio(struct task_group *tg) | ||
| 7804 | { | ||
| 7805 | return tg->rt_ratio; | ||
| 7806 | } | ||
| 7807 | |||
| 7182 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7808 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 7183 | 7809 | ||
| 7184 | #ifdef CONFIG_FAIR_CGROUP_SCHED | 7810 | #ifdef CONFIG_FAIR_CGROUP_SCHED |
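
sched_group_set_rt_ratio() is an admission check: total up the ratios already granted to every group, substitute the requested value for this group's current one, and refuse if the sum would exceed sysctl_sched_rt_ratio. The same check in a self-contained form; locking and the RCU list walk are omitted:

    #include <stdio.h>

    /* Returns 0 if group `idx` may take `new_ratio`, -1 otherwise. */
    static int set_rt_ratio_toy(unsigned long *ratios, int ngroups, int idx,
                                unsigned long new_ratio, unsigned long global_cap)
    {
        unsigned long total = 0;

        for (int i = 0; i < ngroups; i++)
            total += ratios[i];

        /* replace this group's current ratio with the requested one */
        if (total + new_ratio - ratios[idx] > global_cap)
            return -1;

        ratios[idx] = new_ratio;
        return 0;
    }

    int main(void)
    {
        unsigned long ratios[3] = { 200, 100, 0 };   /* per-group RT ratios */

        printf("grant 400: %d\n", set_rt_ratio_toy(ratios, 3, 2, 400, 800));
        printf("grant 600: %d\n", set_rt_ratio_toy(ratios, 3, 2, 600, 800));
        return 0;
    }
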
| @@ -7254,12 +7880,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
| 7254 | return (u64) tg->shares; | 7880 | return (u64) tg->shares; |
| 7255 | } | 7881 | } |
| 7256 | 7882 | ||
| 7883 | static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, | ||
| 7884 | u64 rt_ratio_val) | ||
| 7885 | { | ||
| 7886 | return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); | ||
| 7887 | } | ||
| 7888 | |||
| 7889 | static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) | ||
| 7890 | { | ||
| 7891 | struct task_group *tg = cgroup_tg(cgrp); | ||
| 7892 | |||
| 7893 | return (u64) tg->rt_ratio; | ||
| 7894 | } | ||
| 7895 | |||
| 7257 | static struct cftype cpu_files[] = { | 7896 | static struct cftype cpu_files[] = { |
| 7258 | { | 7897 | { |
| 7259 | .name = "shares", | 7898 | .name = "shares", |
| 7260 | .read_uint = cpu_shares_read_uint, | 7899 | .read_uint = cpu_shares_read_uint, |
| 7261 | .write_uint = cpu_shares_write_uint, | 7900 | .write_uint = cpu_shares_write_uint, |
| 7262 | }, | 7901 | }, |
| 7902 | { | ||
| 7903 | .name = "rt_ratio", | ||
| 7904 | .read_uint = cpu_rt_ratio_read_uint, | ||
| 7905 | .write_uint = cpu_rt_ratio_write_uint, | ||
| 7906 | }, | ||
| 7263 | }; | 7907 | }; |
| 7264 | 7908 | ||
| 7265 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 7909 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 80fbbfc04290..4b5e24cf2f4a 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
| @@ -179,6 +179,7 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
| 179 | PN(prev_clock_raw); | 179 | PN(prev_clock_raw); |
| 180 | P(clock_warps); | 180 | P(clock_warps); |
| 181 | P(clock_overflows); | 181 | P(clock_overflows); |
| 182 | P(clock_underflows); | ||
| 182 | P(clock_deep_idle_events); | 183 | P(clock_deep_idle_events); |
| 183 | PN(clock_max_delta); | 184 | PN(clock_max_delta); |
| 184 | P(cpu_load[0]); | 185 | P(cpu_load[0]); |
| @@ -299,6 +300,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 299 | PN(se.exec_max); | 300 | PN(se.exec_max); |
| 300 | PN(se.slice_max); | 301 | PN(se.slice_max); |
| 301 | PN(se.wait_max); | 302 | PN(se.wait_max); |
| 303 | PN(se.wait_sum); | ||
| 304 | P(se.wait_count); | ||
| 302 | P(sched_info.bkl_count); | 305 | P(sched_info.bkl_count); |
| 303 | P(se.nr_migrations); | 306 | P(se.nr_migrations); |
| 304 | P(se.nr_migrations_cold); | 307 | P(se.nr_migrations_cold); |
| @@ -366,6 +369,8 @@ void proc_sched_set_task(struct task_struct *p) | |||
| 366 | { | 369 | { |
| 367 | #ifdef CONFIG_SCHEDSTATS | 370 | #ifdef CONFIG_SCHEDSTATS |
| 368 | p->se.wait_max = 0; | 371 | p->se.wait_max = 0; |
| 372 | p->se.wait_sum = 0; | ||
| 373 | p->se.wait_count = 0; | ||
| 369 | p->se.sleep_max = 0; | 374 | p->se.sleep_max = 0; |
| 370 | p->se.sum_sleep_runtime = 0; | 375 | p->se.sum_sleep_runtime = 0; |
| 371 | p->se.block_max = 0; | 376 | p->se.block_max = 0; |
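
On the debug side, wait_sum and wait_count now sit next to wait_max, so an average wait time can be derived alongside the worst case, and proc_sched_set_task() resets all three together. The accounting reduces to:

    #include <stdio.h>

    typedef unsigned long long u64;

    struct wait_stats {
        u64 wait_max;    /* worst single wait, ns */
        u64 wait_sum;    /* total time spent waiting, ns */
        u64 wait_count;  /* number of waits */
    };

    static void account_wait(struct wait_stats *w, u64 delta)
    {
        if (delta > w->wait_max)
            w->wait_max = delta;
        w->wait_sum += delta;
        w->wait_count++;
    }

    int main(void)
    {
        struct wait_stats w = { 0, 0, 0 };
        u64 samples[] = { 1200, 300, 5000, 800 };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
            account_wait(&w, samples[i]);
        printf("max=%llu avg=%llu over %llu waits\n",
               w.wait_max, w.wait_sum / w.wait_count, w.wait_count);
        return 0;
    }
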
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index da7c061e7206..72e25c7a3a18 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -20,6 +20,8 @@ | |||
| 20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
| 21 | */ | 21 | */ |
| 22 | 22 | ||
| 23 | #include <linux/latencytop.h> | ||
| 24 | |||
| 23 | /* | 25 | /* |
| 24 | * Targeted preemption latency for CPU-bound tasks: | 26 | * Targeted preemption latency for CPU-bound tasks: |
| 25 | * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) | 27 | * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) |
| @@ -248,8 +250,8 @@ static u64 __sched_period(unsigned long nr_running) | |||
| 248 | unsigned long nr_latency = sched_nr_latency; | 250 | unsigned long nr_latency = sched_nr_latency; |
| 249 | 251 | ||
| 250 | if (unlikely(nr_running > nr_latency)) { | 252 | if (unlikely(nr_running > nr_latency)) { |
| 253 | period = sysctl_sched_min_granularity; | ||
| 251 | period *= nr_running; | 254 | period *= nr_running; |
| 252 | do_div(period, nr_latency); | ||
| 253 | } | 255 | } |
| 254 | 256 | ||
| 255 | return period; | 257 | return period; |
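The hunk above changes __sched_period() so that, once nr_running exceeds sched_nr_latency, the period grows by a full minimum granularity per runnable task instead of scaling the latency target. A stand-alone sketch of that arithmetic is below; the 20ms/4ms/5 figures are illustrative defaults, not values taken from this patch.

```c
/* Minimal user-space sketch of the reworked __sched_period() logic. */
#include <stdio.h>

static unsigned long long sysctl_sched_latency = 20000000ULL;         /* 20 ms, illustrative */
static unsigned long long sysctl_sched_min_granularity = 4000000ULL;  /* 4 ms, illustrative */
static unsigned long sched_nr_latency = 5;                            /* illustrative */

static unsigned long long sched_period(unsigned long nr_running)
{
	unsigned long long period = sysctl_sched_latency;

	/* New behaviour: one full minimum granularity per task once
	 * the latency target can no longer be divided up fairly. */
	if (nr_running > sched_nr_latency)
		period = sysctl_sched_min_granularity * nr_running;

	return period;
}

int main(void)
{
	printf("%llu\n", sched_period(3));   /* 20000000: below the nr_latency threshold */
	printf("%llu\n", sched_period(10));  /* 40000000: 10 tasks * 4ms granularity */
	return 0;
}
```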
| @@ -383,6 +385,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 383 | { | 385 | { |
| 384 | schedstat_set(se->wait_max, max(se->wait_max, | 386 | schedstat_set(se->wait_max, max(se->wait_max, |
| 385 | rq_of(cfs_rq)->clock - se->wait_start)); | 387 | rq_of(cfs_rq)->clock - se->wait_start)); |
| 388 | schedstat_set(se->wait_count, se->wait_count + 1); | ||
| 389 | schedstat_set(se->wait_sum, se->wait_sum + | ||
| 390 | rq_of(cfs_rq)->clock - se->wait_start); | ||
| 386 | schedstat_set(se->wait_start, 0); | 391 | schedstat_set(se->wait_start, 0); |
| 387 | } | 392 | } |
| 388 | 393 | ||
| @@ -434,6 +439,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 434 | #ifdef CONFIG_SCHEDSTATS | 439 | #ifdef CONFIG_SCHEDSTATS |
| 435 | if (se->sleep_start) { | 440 | if (se->sleep_start) { |
| 436 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; | 441 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; |
| 442 | struct task_struct *tsk = task_of(se); | ||
| 437 | 443 | ||
| 438 | if ((s64)delta < 0) | 444 | if ((s64)delta < 0) |
| 439 | delta = 0; | 445 | delta = 0; |
| @@ -443,9 +449,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 443 | 449 | ||
| 444 | se->sleep_start = 0; | 450 | se->sleep_start = 0; |
| 445 | se->sum_sleep_runtime += delta; | 451 | se->sum_sleep_runtime += delta; |
| 452 | |||
| 453 | account_scheduler_latency(tsk, delta >> 10, 1); | ||
| 446 | } | 454 | } |
| 447 | if (se->block_start) { | 455 | if (se->block_start) { |
| 448 | u64 delta = rq_of(cfs_rq)->clock - se->block_start; | 456 | u64 delta = rq_of(cfs_rq)->clock - se->block_start; |
| 457 | struct task_struct *tsk = task_of(se); | ||
| 449 | 458 | ||
| 450 | if ((s64)delta < 0) | 459 | if ((s64)delta < 0) |
| 451 | delta = 0; | 460 | delta = 0; |
| @@ -462,11 +471,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 462 | * time that the task spent sleeping: | 471 | * time that the task spent sleeping: |
| 463 | */ | 472 | */ |
| 464 | if (unlikely(prof_on == SLEEP_PROFILING)) { | 473 | if (unlikely(prof_on == SLEEP_PROFILING)) { |
| 465 | struct task_struct *tsk = task_of(se); | ||
| 466 | 474 | ||
| 467 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), | 475 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), |
| 468 | delta >> 20); | 476 | delta >> 20); |
| 469 | } | 477 | } |
| 478 | account_scheduler_latency(tsk, delta >> 10, 0); | ||
| 470 | } | 479 | } |
| 471 | #endif | 480 | #endif |
| 472 | } | 481 | } |
| @@ -642,13 +651,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
| 642 | cfs_rq->curr = NULL; | 651 | cfs_rq->curr = NULL; |
| 643 | } | 652 | } |
| 644 | 653 | ||
| 645 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 654 | static void |
| 655 | entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | ||
| 646 | { | 656 | { |
| 647 | /* | 657 | /* |
| 648 | * Update run-time statistics of the 'current'. | 658 | * Update run-time statistics of the 'current'. |
| 649 | */ | 659 | */ |
| 650 | update_curr(cfs_rq); | 660 | update_curr(cfs_rq); |
| 651 | 661 | ||
| 662 | #ifdef CONFIG_SCHED_HRTICK | ||
| 663 | /* | ||
| 664 | * queued ticks are scheduled to match the slice, so don't bother | ||
| 665 | * validating it and just reschedule. | ||
| 666 | */ | ||
| 667 | if (queued) | ||
| 668 | return resched_task(rq_of(cfs_rq)->curr); | ||
| 669 | /* | ||
| 670 | * don't let the period tick interfere with the hrtick preemption | ||
| 671 | */ | ||
| 672 | if (!sched_feat(DOUBLE_TICK) && | ||
| 673 | hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) | ||
| 674 | return; | ||
| 675 | #endif | ||
| 676 | |||
| 652 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) | 677 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) |
| 653 | check_preempt_tick(cfs_rq, curr); | 678 | check_preempt_tick(cfs_rq, curr); |
| 654 | } | 679 | } |
| @@ -690,7 +715,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
| 690 | 715 | ||
| 691 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | 716 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ |
| 692 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 717 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
| 693 | list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 718 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) |
| 694 | 719 | ||
| 695 | /* Do the two (enqueued) entities belong to the same group ? */ | 720 | /* Do the two (enqueued) entities belong to the same group ? */ |
| 696 | static inline int | 721 | static inline int |
| @@ -707,6 +732,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
| 707 | return se->parent; | 732 | return se->parent; |
| 708 | } | 733 | } |
| 709 | 734 | ||
| 735 | #define GROUP_IMBALANCE_PCT 20 | ||
| 736 | |||
| 710 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 737 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
| 711 | 738 | ||
| 712 | #define for_each_sched_entity(se) \ | 739 | #define for_each_sched_entity(se) \ |
| @@ -752,6 +779,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
| 752 | 779 | ||
| 753 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 780 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 754 | 781 | ||
| 782 | #ifdef CONFIG_SCHED_HRTICK | ||
| 783 | static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | ||
| 784 | { | ||
| 785 | int requeue = rq->curr == p; | ||
| 786 | struct sched_entity *se = &p->se; | ||
| 787 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
| 788 | |||
| 789 | WARN_ON(task_rq(p) != rq); | ||
| 790 | |||
| 791 | if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { | ||
| 792 | u64 slice = sched_slice(cfs_rq, se); | ||
| 793 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; | ||
| 794 | s64 delta = slice - ran; | ||
| 795 | |||
| 796 | if (delta < 0) { | ||
| 797 | if (rq->curr == p) | ||
| 798 | resched_task(p); | ||
| 799 | return; | ||
| 800 | } | ||
| 801 | |||
| 802 | /* | ||
| 803 | * Don't schedule slices shorter than 10000ns, that just | ||
| 804 | * doesn't make sense. Rely on vruntime for fairness. | ||
| 805 | */ | ||
| 806 | if (!requeue) | ||
| 807 | delta = max(10000LL, delta); | ||
| 808 | |||
| 809 | hrtick_start(rq, delta, requeue); | ||
| 810 | } | ||
| 811 | } | ||
| 812 | #else | ||
| 813 | static inline void | ||
| 814 | hrtick_start_fair(struct rq *rq, struct task_struct *p) | ||
| 815 | { | ||
| 816 | } | ||
| 817 | #endif | ||
| 818 | |||
| 755 | /* | 819 | /* |
| 756 | * The enqueue_task method is called before nr_running is | 820 | * The enqueue_task method is called before nr_running is |
| 757 | * increased. Here we update the fair scheduling stats and | 821 | * increased. Here we update the fair scheduling stats and |
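The hrtick_start_fair() helper added above arms an hrtimer for the remainder of the current slice. A small sketch of just the delta computation follows, with made-up nanosecond figures; a return of 0 here stands in for the "slice exhausted, reschedule now" path in the real code.

```c
/* Sketch of the hrtick delta logic: how long until the slice timer should fire. */
#include <stdio.h>
#include <stdint.h>

static int64_t hrtick_delta(uint64_t slice, uint64_t sum_exec,
			    uint64_t prev_sum_exec, int requeue)
{
	uint64_t ran = sum_exec - prev_sum_exec;
	int64_t delta = (int64_t)(slice - ran);

	if (delta < 0)
		return 0;		/* slice already used up: reschedule immediately */

	/* Don't arm the timer for less than 10us unless we are merely
	 * requeueing the currently running task. */
	if (!requeue && delta < 10000)
		delta = 10000;

	return delta;
}

int main(void)
{
	/* 4ms slice, 1.5ms consumed -> timer fires in 2.5ms */
	printf("%lld\n", (long long)hrtick_delta(4000000, 1500000, 0, 0));
	/* almost done -> clamped to the 10000ns minimum */
	printf("%lld\n", (long long)hrtick_delta(4000000, 3999500, 0, 0));
	return 0;
}
```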
| @@ -760,15 +824,28 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
| 760 | static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | 824 | static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) |
| 761 | { | 825 | { |
| 762 | struct cfs_rq *cfs_rq; | 826 | struct cfs_rq *cfs_rq; |
| 763 | struct sched_entity *se = &p->se; | 827 | struct sched_entity *se = &p->se, |
| 828 | *topse = NULL; /* Highest schedulable entity */ | ||
| 829 | int incload = 1; | ||
| 764 | 830 | ||
| 765 | for_each_sched_entity(se) { | 831 | for_each_sched_entity(se) { |
| 766 | if (se->on_rq) | 832 | topse = se; |
| 833 | if (se->on_rq) { | ||
| 834 | incload = 0; | ||
| 767 | break; | 835 | break; |
| 836 | } | ||
| 768 | cfs_rq = cfs_rq_of(se); | 837 | cfs_rq = cfs_rq_of(se); |
| 769 | enqueue_entity(cfs_rq, se, wakeup); | 838 | enqueue_entity(cfs_rq, se, wakeup); |
| 770 | wakeup = 1; | 839 | wakeup = 1; |
| 771 | } | 840 | } |
| 841 | /* Increment cpu load if we just enqueued the first task of a group on | ||
| 842 | * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs | ||
| 843 | * at the highest grouping level. | ||
| 844 | */ | ||
| 845 | if (incload) | ||
| 846 | inc_cpu_load(rq, topse->load.weight); | ||
| 847 | |||
| 848 | hrtick_start_fair(rq, rq->curr); | ||
| 772 | } | 849 | } |
| 773 | 850 | ||
| 774 | /* | 851 | /* |
| @@ -779,16 +856,30 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | |||
| 779 | static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) | 856 | static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) |
| 780 | { | 857 | { |
| 781 | struct cfs_rq *cfs_rq; | 858 | struct cfs_rq *cfs_rq; |
| 782 | struct sched_entity *se = &p->se; | 859 | struct sched_entity *se = &p->se, |
| 860 | *topse = NULL; /* Highest schedulable entity */ | ||
| 861 | int decload = 1; | ||
| 783 | 862 | ||
| 784 | for_each_sched_entity(se) { | 863 | for_each_sched_entity(se) { |
| 864 | topse = se; | ||
| 785 | cfs_rq = cfs_rq_of(se); | 865 | cfs_rq = cfs_rq_of(se); |
| 786 | dequeue_entity(cfs_rq, se, sleep); | 866 | dequeue_entity(cfs_rq, se, sleep); |
| 787 | /* Don't dequeue parent if it has other entities besides us */ | 867 | /* Don't dequeue parent if it has other entities besides us */ |
| 788 | if (cfs_rq->load.weight) | 868 | if (cfs_rq->load.weight) { |
| 869 | if (parent_entity(se)) | ||
| 870 | decload = 0; | ||
| 789 | break; | 871 | break; |
| 872 | } | ||
| 790 | sleep = 1; | 873 | sleep = 1; |
| 791 | } | 874 | } |
| 875 | /* Decrement cpu load if we just dequeued the last task of a group on | ||
| 876 | * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs | ||
| 877 | * at the highest grouping level. | ||
| 878 | */ | ||
| 879 | if (decload) | ||
| 880 | dec_cpu_load(rq, topse->load.weight); | ||
| 881 | |||
| 882 | hrtick_start_fair(rq, rq->curr); | ||
| 792 | } | 883 | } |
| 793 | 884 | ||
| 794 | /* | 885 | /* |
| @@ -836,6 +927,154 @@ static void yield_task_fair(struct rq *rq) | |||
| 836 | } | 927 | } |
| 837 | 928 | ||
| 838 | /* | 929 | /* |
| 930 | * wake_idle() will wake a task on an idle cpu if task->cpu is | ||
| 931 | * not idle and an idle cpu is available. The span of cpus to | ||
| 932 | * search starts with cpus closest then further out as needed, | ||
| 933 | * so we always favor a closer, idle cpu. | ||
| 934 | * | ||
| 935 | * Returns the CPU we should wake onto. | ||
| 936 | */ | ||
| 937 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | ||
| 938 | static int wake_idle(int cpu, struct task_struct *p) | ||
| 939 | { | ||
| 940 | cpumask_t tmp; | ||
| 941 | struct sched_domain *sd; | ||
| 942 | int i; | ||
| 943 | |||
| 944 | /* | ||
| 945 | * If it is idle, then it is the best cpu to run this task. | ||
| 946 | * | ||
| 947 | * This cpu is also the best, if it has more than one task already. | ||
| 948 | * Siblings must be also busy(in most cases) as they didn't already | ||
| 949 | * pickup the extra load from this cpu and hence we need not check | ||
| 950 | * sibling runqueue info. This will avoid the checks and cache miss | ||
| 951 | * penalties associated with that. | ||
| 952 | */ | ||
| 953 | if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) | ||
| 954 | return cpu; | ||
| 955 | |||
| 956 | for_each_domain(cpu, sd) { | ||
| 957 | if (sd->flags & SD_WAKE_IDLE) { | ||
| 958 | cpus_and(tmp, sd->span, p->cpus_allowed); | ||
| 959 | for_each_cpu_mask(i, tmp) { | ||
| 960 | if (idle_cpu(i)) { | ||
| 961 | if (i != task_cpu(p)) { | ||
| 962 | schedstat_inc(p, | ||
| 963 | se.nr_wakeups_idle); | ||
| 964 | } | ||
| 965 | return i; | ||
| 966 | } | ||
| 967 | } | ||
| 968 | } else { | ||
| 969 | break; | ||
| 970 | } | ||
| 971 | } | ||
| 972 | return cpu; | ||
| 973 | } | ||
| 974 | #else | ||
| 975 | static inline int wake_idle(int cpu, struct task_struct *p) | ||
| 976 | { | ||
| 977 | return cpu; | ||
| 978 | } | ||
| 979 | #endif | ||
| 980 | |||
| 981 | #ifdef CONFIG_SMP | ||
| 982 | static int select_task_rq_fair(struct task_struct *p, int sync) | ||
| 983 | { | ||
| 984 | int cpu, this_cpu; | ||
| 985 | struct rq *rq; | ||
| 986 | struct sched_domain *sd, *this_sd = NULL; | ||
| 987 | int new_cpu; | ||
| 988 | |||
| 989 | cpu = task_cpu(p); | ||
| 990 | rq = task_rq(p); | ||
| 991 | this_cpu = smp_processor_id(); | ||
| 992 | new_cpu = cpu; | ||
| 993 | |||
| 994 | if (cpu == this_cpu) | ||
| 995 | goto out_set_cpu; | ||
| 996 | |||
| 997 | for_each_domain(this_cpu, sd) { | ||
| 998 | if (cpu_isset(cpu, sd->span)) { | ||
| 999 | this_sd = sd; | ||
| 1000 | break; | ||
| 1001 | } | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
| 1005 | goto out_set_cpu; | ||
| 1006 | |||
| 1007 | /* | ||
| 1008 | * Check for affine wakeup and passive balancing possibilities. | ||
| 1009 | */ | ||
| 1010 | if (this_sd) { | ||
| 1011 | int idx = this_sd->wake_idx; | ||
| 1012 | unsigned int imbalance; | ||
| 1013 | unsigned long load, this_load; | ||
| 1014 | |||
| 1015 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | ||
| 1016 | |||
| 1017 | load = source_load(cpu, idx); | ||
| 1018 | this_load = target_load(this_cpu, idx); | ||
| 1019 | |||
| 1020 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | ||
| 1021 | |||
| 1022 | if (this_sd->flags & SD_WAKE_AFFINE) { | ||
| 1023 | unsigned long tl = this_load; | ||
| 1024 | unsigned long tl_per_task; | ||
| 1025 | |||
| 1026 | /* | ||
| 1027 | * Attract cache-cold tasks on sync wakeups: | ||
| 1028 | */ | ||
| 1029 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
| 1030 | goto out_set_cpu; | ||
| 1031 | |||
| 1032 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
| 1033 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
| 1034 | |||
| 1035 | /* | ||
| 1036 | * If sync wakeup then subtract the (maximum possible) | ||
| 1037 | * effect of the currently running task from the load | ||
| 1038 | * of the current CPU: | ||
| 1039 | */ | ||
| 1040 | if (sync) | ||
| 1041 | tl -= current->se.load.weight; | ||
| 1042 | |||
| 1043 | if ((tl <= load && | ||
| 1044 | tl + target_load(cpu, idx) <= tl_per_task) || | ||
| 1045 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
| 1046 | /* | ||
| 1047 | * This domain has SD_WAKE_AFFINE and | ||
| 1048 | * p is cache cold in this domain, and | ||
| 1049 | * there is no bad imbalance. | ||
| 1050 | */ | ||
| 1051 | schedstat_inc(this_sd, ttwu_move_affine); | ||
| 1052 | schedstat_inc(p, se.nr_wakeups_affine); | ||
| 1053 | goto out_set_cpu; | ||
| 1054 | } | ||
| 1055 | } | ||
| 1056 | |||
| 1057 | /* | ||
| 1058 | * Start passive balancing when half the imbalance_pct | ||
| 1059 | * limit is reached. | ||
| 1060 | */ | ||
| 1061 | if (this_sd->flags & SD_WAKE_BALANCE) { | ||
| 1062 | if (imbalance*this_load <= 100*load) { | ||
| 1063 | schedstat_inc(this_sd, ttwu_move_balance); | ||
| 1064 | schedstat_inc(p, se.nr_wakeups_passive); | ||
| 1065 | goto out_set_cpu; | ||
| 1066 | } | ||
| 1067 | } | ||
| 1068 | } | ||
| 1069 | |||
| 1070 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | ||
| 1071 | out_set_cpu: | ||
| 1072 | return wake_idle(new_cpu, p); | ||
| 1073 | } | ||
| 1074 | #endif /* CONFIG_SMP */ | ||
| 1075 | |||
| 1076 | |||
| 1077 | /* | ||
| 839 | * Preempt the current task with a newly woken task if needed: | 1078 | * Preempt the current task with a newly woken task if needed: |
| 840 | */ | 1079 | */ |
| 841 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | 1080 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) |
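The select_task_rq_fair() code above decides between an affine wakeup and leaving the task on its previous CPU using an imbalance percentage. The sketch below reproduces that test with arbitrary load figures; it collapses source_load()/target_load() into single numbers and the imbalance_pct of 125 is an assumed domain setting, not something defined in this diff.

```c
/* Sketch of the wake-affine criterion with illustrative load values. */
#include <stdio.h>

static int wake_affine_ok(unsigned long this_load, unsigned long remote_load,
			  unsigned long task_weight, unsigned long tl_per_task,
			  unsigned int imbalance_pct)
{
	/* half of the domain imbalance limit, as in the patch */
	unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;

	/* Either this CPU is lightly loaded enough for both queues ... */
	if (this_load <= remote_load &&
	    this_load + remote_load <= tl_per_task)
		return 1;

	/* ... or pulling the waking task creates no bad imbalance. */
	return 100 * (this_load + task_weight) <= imbalance * remote_load;
}

int main(void)
{
	/* waking CPU lightly loaded, previous CPU busy -> affine wakeup */
	printf("%d\n", wake_affine_ok(1024, 4096, 1024, 2048, 125));
	/* waking CPU already heavily loaded -> keep the task where it was */
	printf("%d\n", wake_affine_ok(8192, 1024, 1024, 2048, 125));
	return 0;
}
```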
| @@ -876,6 +1115,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
| 876 | 1115 | ||
| 877 | static struct task_struct *pick_next_task_fair(struct rq *rq) | 1116 | static struct task_struct *pick_next_task_fair(struct rq *rq) |
| 878 | { | 1117 | { |
| 1118 | struct task_struct *p; | ||
| 879 | struct cfs_rq *cfs_rq = &rq->cfs; | 1119 | struct cfs_rq *cfs_rq = &rq->cfs; |
| 880 | struct sched_entity *se; | 1120 | struct sched_entity *se; |
| 881 | 1121 | ||
| @@ -887,7 +1127,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) | |||
| 887 | cfs_rq = group_cfs_rq(se); | 1127 | cfs_rq = group_cfs_rq(se); |
| 888 | } while (cfs_rq); | 1128 | } while (cfs_rq); |
| 889 | 1129 | ||
| 890 | return task_of(se); | 1130 | p = task_of(se); |
| 1131 | hrtick_start_fair(rq, p); | ||
| 1132 | |||
| 1133 | return p; | ||
| 891 | } | 1134 | } |
| 892 | 1135 | ||
| 893 | /* | 1136 | /* |
| @@ -944,25 +1187,6 @@ static struct task_struct *load_balance_next_fair(void *arg) | |||
| 944 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); | 1187 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); |
| 945 | } | 1188 | } |
| 946 | 1189 | ||
| 947 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 948 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | ||
| 949 | { | ||
| 950 | struct sched_entity *curr; | ||
| 951 | struct task_struct *p; | ||
| 952 | |||
| 953 | if (!cfs_rq->nr_running) | ||
| 954 | return MAX_PRIO; | ||
| 955 | |||
| 956 | curr = cfs_rq->curr; | ||
| 957 | if (!curr) | ||
| 958 | curr = __pick_next_entity(cfs_rq); | ||
| 959 | |||
| 960 | p = task_of(curr); | ||
| 961 | |||
| 962 | return p->prio; | ||
| 963 | } | ||
| 964 | #endif | ||
| 965 | |||
| 966 | static unsigned long | 1190 | static unsigned long |
| 967 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1191 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 968 | unsigned long max_load_move, | 1192 | unsigned long max_load_move, |
| @@ -972,28 +1196,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 972 | struct cfs_rq *busy_cfs_rq; | 1196 | struct cfs_rq *busy_cfs_rq; |
| 973 | long rem_load_move = max_load_move; | 1197 | long rem_load_move = max_load_move; |
| 974 | struct rq_iterator cfs_rq_iterator; | 1198 | struct rq_iterator cfs_rq_iterator; |
| 1199 | unsigned long load_moved; | ||
| 975 | 1200 | ||
| 976 | cfs_rq_iterator.start = load_balance_start_fair; | 1201 | cfs_rq_iterator.start = load_balance_start_fair; |
| 977 | cfs_rq_iterator.next = load_balance_next_fair; | 1202 | cfs_rq_iterator.next = load_balance_next_fair; |
| 978 | 1203 | ||
| 979 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 1204 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { |
| 980 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1205 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 981 | struct cfs_rq *this_cfs_rq; | 1206 | struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu]; |
| 982 | long imbalance; | 1207 | unsigned long maxload, task_load, group_weight; |
| 983 | unsigned long maxload; | 1208 | unsigned long thisload, per_task_load; |
| 1209 | struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu]; | ||
| 1210 | |||
| 1211 | task_load = busy_cfs_rq->load.weight; | ||
| 1212 | group_weight = se->load.weight; | ||
| 984 | 1213 | ||
| 985 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); | 1214 | /* |
| 1215 | * 'group_weight' is contributed by tasks of total weight | ||
| 1216 | * 'task_load'. To move 'rem_load_move' worth of weight only, | ||
| 1217 | * we need to move a maximum task load of: | ||
| 1218 | * | ||
| 1219 | * maxload = (remload / group_weight) * task_load; | ||
| 1220 | */ | ||
| 1221 | maxload = (rem_load_move * task_load) / group_weight; | ||
| 986 | 1222 | ||
| 987 | imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; | 1223 | if (!maxload || !task_load) |
| 988 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | ||
| 989 | if (imbalance <= 0) | ||
| 990 | continue; | 1224 | continue; |
| 991 | 1225 | ||
| 992 | /* Don't pull more than imbalance/2 */ | 1226 | per_task_load = task_load / busy_cfs_rq->nr_running; |
| 993 | imbalance /= 2; | 1227 | /* |
| 994 | maxload = min(rem_load_move, imbalance); | 1228 | * balance_tasks will try to forcibly move atleast one task if |
| 1229 | * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if | ||
| 1230 | * maxload is less than GROUP_IMBALANCE_PCT% of the per_task_load. | ||
| 1231 | */ | ||
| 1232 | if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load) | ||
| 1233 | continue; | ||
| 995 | 1234 | ||
| 996 | *this_best_prio = cfs_rq_best_prio(this_cfs_rq); | 1235 | /* Disable priority-based load balance */ |
| 1236 | *this_best_prio = 0; | ||
| 1237 | thisload = this_cfs_rq->load.weight; | ||
| 997 | #else | 1238 | #else |
| 998 | # define maxload rem_load_move | 1239 | # define maxload rem_load_move |
| 999 | #endif | 1240 | #endif |
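The comment in the hunk above gives the group-aware limit, maxload = (remload / group_weight) * task_load, plus the GROUP_IMBALANCE_PCT filter. A worked example with arbitrary weights:

```c
/* Worked example of the group-aware maxload computation. */
#include <stdio.h>

int main(void)
{
	unsigned long rem_load_move = 512;   /* group weight still to be moved */
	unsigned long task_load     = 3072;  /* sum of task weights on busy_cfs_rq */
	unsigned long group_weight  = 1024;  /* weight of the group's se on the busy CPU */
	unsigned long nr_running    = 3;

	/* maxload = (remload / group_weight) * task_load */
	unsigned long maxload = (rem_load_move * task_load) / group_weight;
	unsigned long per_task_load = task_load / nr_running;

	printf("maxload = %lu\n", maxload);              /* 1536 */
	printf("per_task_load = %lu\n", per_task_load);  /* 1024 */

	/* Skip the group if moving maxload is below 20% of one task's load */
	if (100 * maxload < 20 * per_task_load)
		printf("skip this group\n");
	else
		printf("balance up to %lu of task load\n", maxload);
	return 0;
}
```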
| @@ -1002,11 +1243,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 1002 | * load_balance_[start|next]_fair iterators | 1243 | * load_balance_[start|next]_fair iterators |
| 1003 | */ | 1244 | */ |
| 1004 | cfs_rq_iterator.arg = busy_cfs_rq; | 1245 | cfs_rq_iterator.arg = busy_cfs_rq; |
| 1005 | rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, | 1246 | load_moved = balance_tasks(this_rq, this_cpu, busiest, |
| 1006 | maxload, sd, idle, all_pinned, | 1247 | maxload, sd, idle, all_pinned, |
| 1007 | this_best_prio, | 1248 | this_best_prio, |
| 1008 | &cfs_rq_iterator); | 1249 | &cfs_rq_iterator); |
| 1009 | 1250 | ||
| 1251 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1252 | /* | ||
| 1253 | * load_moved holds the task load that was moved. The | ||
| 1254 | * effective (group) weight moved would be: | ||
| 1255 | * load_moved_eff = load_moved/task_load * group_weight; | ||
| 1256 | */ | ||
| 1257 | load_moved = (group_weight * load_moved) / task_load; | ||
| 1258 | |||
| 1259 | /* Adjust shares on both cpus to reflect load_moved */ | ||
| 1260 | group_weight -= load_moved; | ||
| 1261 | set_se_shares(se, group_weight); | ||
| 1262 | |||
| 1263 | se = busy_cfs_rq->tg->se[this_cpu]; | ||
| 1264 | if (!thisload) | ||
| 1265 | group_weight = load_moved; | ||
| 1266 | else | ||
| 1267 | group_weight = se->load.weight + load_moved; | ||
| 1268 | set_se_shares(se, group_weight); | ||
| 1269 | #endif | ||
| 1270 | |||
| 1271 | rem_load_move -= load_moved; | ||
| 1272 | |||
| 1010 | if (rem_load_move <= 0) | 1273 | if (rem_load_move <= 0) |
| 1011 | break; | 1274 | break; |
| 1012 | } | 1275 | } |
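After a successful pull, the hunk above converts the moved task load back into group weight and rebalances the shares on both CPUs via set_se_shares(). Continuing the illustrative numbers from the previous sketch:

```c
/* Sketch of the post-pull share adjustment (illustrative values only). */
#include <stdio.h>

int main(void)
{
	unsigned long task_load    = 3072;  /* task weight in the busy group rq */
	unsigned long group_weight = 1024;  /* group se weight on the busy CPU */
	unsigned long this_weight  = 256;   /* group se weight on this CPU */
	unsigned long load_moved   = 1024;  /* task weight actually pulled over */

	/* load_moved_eff = load_moved / task_load * group_weight */
	unsigned long moved_eff = (group_weight * load_moved) / task_load;

	printf("effective group weight moved: %lu\n", moved_eff);   /* 341 */
	printf("busy cpu share: %lu -> %lu\n",
	       group_weight, group_weight - moved_eff);             /* 1024 -> 683 */
	printf("this cpu share: %lu -> %lu\n",
	       this_weight, this_weight + moved_eff);               /* 256 -> 597 */
	return 0;
}
```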
| @@ -1042,14 +1305,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 1042 | /* | 1305 | /* |
| 1043 | * scheduler tick hitting a task of our scheduling class: | 1306 | * scheduler tick hitting a task of our scheduling class: |
| 1044 | */ | 1307 | */ |
| 1045 | static void task_tick_fair(struct rq *rq, struct task_struct *curr) | 1308 | static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) |
| 1046 | { | 1309 | { |
| 1047 | struct cfs_rq *cfs_rq; | 1310 | struct cfs_rq *cfs_rq; |
| 1048 | struct sched_entity *se = &curr->se; | 1311 | struct sched_entity *se = &curr->se; |
| 1049 | 1312 | ||
| 1050 | for_each_sched_entity(se) { | 1313 | for_each_sched_entity(se) { |
| 1051 | cfs_rq = cfs_rq_of(se); | 1314 | cfs_rq = cfs_rq_of(se); |
| 1052 | entity_tick(cfs_rq, se); | 1315 | entity_tick(cfs_rq, se, queued); |
| 1053 | } | 1316 | } |
| 1054 | } | 1317 | } |
| 1055 | 1318 | ||
| @@ -1087,6 +1350,42 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) | |||
| 1087 | resched_task(rq->curr); | 1350 | resched_task(rq->curr); |
| 1088 | } | 1351 | } |
| 1089 | 1352 | ||
| 1353 | /* | ||
| 1354 | * Priority of the task has changed. Check to see if we preempt | ||
| 1355 | * the current task. | ||
| 1356 | */ | ||
| 1357 | static void prio_changed_fair(struct rq *rq, struct task_struct *p, | ||
| 1358 | int oldprio, int running) | ||
| 1359 | { | ||
| 1360 | /* | ||
| 1361 | * Reschedule if we are currently running on this runqueue and | ||
| 1362 | * our priority decreased, or if we are not currently running on | ||
| 1363 | * this runqueue and our priority is higher than the current's | ||
| 1364 | */ | ||
| 1365 | if (running) { | ||
| 1366 | if (p->prio > oldprio) | ||
| 1367 | resched_task(rq->curr); | ||
| 1368 | } else | ||
| 1369 | check_preempt_curr(rq, p); | ||
| 1370 | } | ||
| 1371 | |||
| 1372 | /* | ||
| 1373 | * We switched to the sched_fair class. | ||
| 1374 | */ | ||
| 1375 | static void switched_to_fair(struct rq *rq, struct task_struct *p, | ||
| 1376 | int running) | ||
| 1377 | { | ||
| 1378 | /* | ||
| 1379 | * We were most likely switched from sched_rt, so | ||
| 1380 | * kick off the schedule if running, otherwise just see | ||
| 1381 | * if we can still preempt the current task. | ||
| 1382 | */ | ||
| 1383 | if (running) | ||
| 1384 | resched_task(rq->curr); | ||
| 1385 | else | ||
| 1386 | check_preempt_curr(rq, p); | ||
| 1387 | } | ||
| 1388 | |||
| 1090 | /* Account for a task changing its policy or group. | 1389 | /* Account for a task changing its policy or group. |
| 1091 | * | 1390 | * |
| 1092 | * This routine is mostly called to set cfs_rq->curr field when a task | 1391 | * This routine is mostly called to set cfs_rq->curr field when a task |
| @@ -1108,6 +1407,9 @@ static const struct sched_class fair_sched_class = { | |||
| 1108 | .enqueue_task = enqueue_task_fair, | 1407 | .enqueue_task = enqueue_task_fair, |
| 1109 | .dequeue_task = dequeue_task_fair, | 1408 | .dequeue_task = dequeue_task_fair, |
| 1110 | .yield_task = yield_task_fair, | 1409 | .yield_task = yield_task_fair, |
| 1410 | #ifdef CONFIG_SMP | ||
| 1411 | .select_task_rq = select_task_rq_fair, | ||
| 1412 | #endif /* CONFIG_SMP */ | ||
| 1111 | 1413 | ||
| 1112 | .check_preempt_curr = check_preempt_wakeup, | 1414 | .check_preempt_curr = check_preempt_wakeup, |
| 1113 | 1415 | ||
| @@ -1122,6 +1424,9 @@ static const struct sched_class fair_sched_class = { | |||
| 1122 | .set_curr_task = set_curr_task_fair, | 1424 | .set_curr_task = set_curr_task_fair, |
| 1123 | .task_tick = task_tick_fair, | 1425 | .task_tick = task_tick_fair, |
| 1124 | .task_new = task_new_fair, | 1426 | .task_new = task_new_fair, |
| 1427 | |||
| 1428 | .prio_changed = prio_changed_fair, | ||
| 1429 | .switched_to = switched_to_fair, | ||
| 1125 | }; | 1430 | }; |
| 1126 | 1431 | ||
| 1127 | #ifdef CONFIG_SCHED_DEBUG | 1432 | #ifdef CONFIG_SCHED_DEBUG |
| @@ -1132,7 +1437,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu) | |||
| 1132 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1437 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 1133 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); | 1438 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); |
| 1134 | #endif | 1439 | #endif |
| 1440 | rcu_read_lock(); | ||
| 1135 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 1441 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) |
| 1136 | print_cfs_rq(m, cpu, cfs_rq); | 1442 | print_cfs_rq(m, cpu, cfs_rq); |
| 1443 | rcu_read_unlock(); | ||
| 1137 | } | 1444 | } |
| 1138 | #endif | 1445 | #endif |
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index bf9c25c15b8b..2bcafa375633 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
| @@ -5,6 +5,12 @@ | |||
| 5 | * handled in sched_fair.c) | 5 | * handled in sched_fair.c) |
| 6 | */ | 6 | */ |
| 7 | 7 | ||
| 8 | #ifdef CONFIG_SMP | ||
| 9 | static int select_task_rq_idle(struct task_struct *p, int sync) | ||
| 10 | { | ||
| 11 | return task_cpu(p); /* IDLE tasks are never migrated */ | ||
| 12 | } | ||
| 13 | #endif /* CONFIG_SMP */ | ||
| 8 | /* | 14 | /* |
| 9 | * Idle tasks are unconditionally rescheduled: | 15 | * Idle tasks are unconditionally rescheduled: |
| 10 | */ | 16 | */ |
| @@ -55,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 55 | } | 61 | } |
| 56 | #endif | 62 | #endif |
| 57 | 63 | ||
| 58 | static void task_tick_idle(struct rq *rq, struct task_struct *curr) | 64 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) |
| 59 | { | 65 | { |
| 60 | } | 66 | } |
| 61 | 67 | ||
| @@ -63,6 +69,33 @@ static void set_curr_task_idle(struct rq *rq) | |||
| 63 | { | 69 | { |
| 64 | } | 70 | } |
| 65 | 71 | ||
| 72 | static void switched_to_idle(struct rq *rq, struct task_struct *p, | ||
| 73 | int running) | ||
| 74 | { | ||
| 75 | /* Can this actually happen?? */ | ||
| 76 | if (running) | ||
| 77 | resched_task(rq->curr); | ||
| 78 | else | ||
| 79 | check_preempt_curr(rq, p); | ||
| 80 | } | ||
| 81 | |||
| 82 | static void prio_changed_idle(struct rq *rq, struct task_struct *p, | ||
| 83 | int oldprio, int running) | ||
| 84 | { | ||
| 85 | /* This can happen for hot plug CPUs */ | ||
| 86 | |||
| 87 | /* | ||
| 88 | * Reschedule if we are currently running on this runqueue and | ||
| 89 | * our priority decreased, or if we are not currently running on | ||
| 90 | * this runqueue and our priority is higher than the current's | ||
| 91 | */ | ||
| 92 | if (running) { | ||
| 93 | if (p->prio > oldprio) | ||
| 94 | resched_task(rq->curr); | ||
| 95 | } else | ||
| 96 | check_preempt_curr(rq, p); | ||
| 97 | } | ||
| 98 | |||
| 66 | /* | 99 | /* |
| 67 | * Simple, special scheduling class for the per-CPU idle tasks: | 100 | * Simple, special scheduling class for the per-CPU idle tasks: |
| 68 | */ | 101 | */ |
| @@ -72,6 +105,9 @@ const struct sched_class idle_sched_class = { | |||
| 72 | 105 | ||
| 73 | /* dequeue is not valid, we print a debug message there: */ | 106 | /* dequeue is not valid, we print a debug message there: */ |
| 74 | .dequeue_task = dequeue_task_idle, | 107 | .dequeue_task = dequeue_task_idle, |
| 108 | #ifdef CONFIG_SMP | ||
| 109 | .select_task_rq = select_task_rq_idle, | ||
| 110 | #endif /* CONFIG_SMP */ | ||
| 75 | 111 | ||
| 76 | .check_preempt_curr = check_preempt_curr_idle, | 112 | .check_preempt_curr = check_preempt_curr_idle, |
| 77 | 113 | ||
| @@ -85,5 +121,9 @@ const struct sched_class idle_sched_class = { | |||
| 85 | 121 | ||
| 86 | .set_curr_task = set_curr_task_idle, | 122 | .set_curr_task = set_curr_task_idle, |
| 87 | .task_tick = task_tick_idle, | 123 | .task_tick = task_tick_idle, |
| 124 | |||
| 125 | .prio_changed = prio_changed_idle, | ||
| 126 | .switched_to = switched_to_idle, | ||
| 127 | |||
| 88 | /* no .task_new for idle tasks */ | 128 | /* no .task_new for idle tasks */ |
| 89 | }; | 129 | }; |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 9ba3daa03475..274b40d7bef2 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -3,6 +3,217 @@ | |||
| 3 | * policies) | 3 | * policies) |
| 4 | */ | 4 | */ |
| 5 | 5 | ||
| 6 | #ifdef CONFIG_SMP | ||
| 7 | |||
| 8 | static inline int rt_overloaded(struct rq *rq) | ||
| 9 | { | ||
| 10 | return atomic_read(&rq->rd->rto_count); | ||
| 11 | } | ||
| 12 | |||
| 13 | static inline void rt_set_overload(struct rq *rq) | ||
| 14 | { | ||
| 15 | cpu_set(rq->cpu, rq->rd->rto_mask); | ||
| 16 | /* | ||
| 17 | * Make sure the mask is visible before we set | ||
| 18 | * the overload count. That is checked to determine | ||
| 19 | * if we should look at the mask. It would be a shame | ||
| 20 | * if we looked at the mask, but the mask was not | ||
| 21 | * updated yet. | ||
| 22 | */ | ||
| 23 | wmb(); | ||
| 24 | atomic_inc(&rq->rd->rto_count); | ||
| 25 | } | ||
| 26 | |||
| 27 | static inline void rt_clear_overload(struct rq *rq) | ||
| 28 | { | ||
| 29 | /* the order here really doesn't matter */ | ||
| 30 | atomic_dec(&rq->rd->rto_count); | ||
| 31 | cpu_clear(rq->cpu, rq->rd->rto_mask); | ||
| 32 | } | ||
| 33 | |||
| 34 | static void update_rt_migration(struct rq *rq) | ||
| 35 | { | ||
| 36 | if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { | ||
| 37 | if (!rq->rt.overloaded) { | ||
| 38 | rt_set_overload(rq); | ||
| 39 | rq->rt.overloaded = 1; | ||
| 40 | } | ||
| 41 | } else if (rq->rt.overloaded) { | ||
| 42 | rt_clear_overload(rq); | ||
| 43 | rq->rt.overloaded = 0; | ||
| 44 | } | ||
| 45 | } | ||
| 46 | #endif /* CONFIG_SMP */ | ||
| 47 | |||
| 48 | static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) | ||
| 49 | { | ||
| 50 | return container_of(rt_se, struct task_struct, rt); | ||
| 51 | } | ||
| 52 | |||
| 53 | static inline int on_rt_rq(struct sched_rt_entity *rt_se) | ||
| 54 | { | ||
| 55 | return !list_empty(&rt_se->run_list); | ||
| 56 | } | ||
| 57 | |||
| 58 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 59 | |||
| 60 | static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) | ||
| 61 | { | ||
| 62 | if (!rt_rq->tg) | ||
| 63 | return SCHED_RT_FRAC; | ||
| 64 | |||
| 65 | return rt_rq->tg->rt_ratio; | ||
| 66 | } | ||
| 67 | |||
| 68 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | ||
| 69 | list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) | ||
| 70 | |||
| 71 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) | ||
| 72 | { | ||
| 73 | return rt_rq->rq; | ||
| 74 | } | ||
| 75 | |||
| 76 | static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | ||
| 77 | { | ||
| 78 | return rt_se->rt_rq; | ||
| 79 | } | ||
| 80 | |||
| 81 | #define for_each_sched_rt_entity(rt_se) \ | ||
| 82 | for (; rt_se; rt_se = rt_se->parent) | ||
| 83 | |||
| 84 | static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | ||
| 85 | { | ||
| 86 | return rt_se->my_q; | ||
| 87 | } | ||
| 88 | |||
| 89 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se); | ||
| 90 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | ||
| 91 | |||
| 92 | static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) | ||
| 93 | { | ||
| 94 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | ||
| 95 | |||
| 96 | if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { | ||
| 97 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | ||
| 98 | |||
| 99 | enqueue_rt_entity(rt_se); | ||
| 100 | if (rt_rq->highest_prio < curr->prio) | ||
| 101 | resched_task(curr); | ||
| 102 | } | ||
| 103 | } | ||
| 104 | |||
| 105 | static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) | ||
| 106 | { | ||
| 107 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | ||
| 108 | |||
| 109 | if (rt_se && on_rt_rq(rt_se)) | ||
| 110 | dequeue_rt_entity(rt_se); | ||
| 111 | } | ||
| 112 | |||
| 113 | #else | ||
| 114 | |||
| 115 | static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) | ||
| 116 | { | ||
| 117 | return sysctl_sched_rt_ratio; | ||
| 118 | } | ||
| 119 | |||
| 120 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | ||
| 121 | for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | ||
| 122 | |||
| 123 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) | ||
| 124 | { | ||
| 125 | return container_of(rt_rq, struct rq, rt); | ||
| 126 | } | ||
| 127 | |||
| 128 | static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | ||
| 129 | { | ||
| 130 | struct task_struct *p = rt_task_of(rt_se); | ||
| 131 | struct rq *rq = task_rq(p); | ||
| 132 | |||
| 133 | return &rq->rt; | ||
| 134 | } | ||
| 135 | |||
| 136 | #define for_each_sched_rt_entity(rt_se) \ | ||
| 137 | for (; rt_se; rt_se = NULL) | ||
| 138 | |||
| 139 | static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | ||
| 140 | { | ||
| 141 | return NULL; | ||
| 142 | } | ||
| 143 | |||
| 144 | static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) | ||
| 145 | { | ||
| 146 | } | ||
| 147 | |||
| 148 | static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) | ||
| 149 | { | ||
| 150 | } | ||
| 151 | |||
| 152 | #endif | ||
| 153 | |||
| 154 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) | ||
| 155 | { | ||
| 156 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 157 | struct rt_rq *rt_rq = group_rt_rq(rt_se); | ||
| 158 | |||
| 159 | if (rt_rq) | ||
| 160 | return rt_rq->highest_prio; | ||
| 161 | #endif | ||
| 162 | |||
| 163 | return rt_task_of(rt_se)->prio; | ||
| 164 | } | ||
| 165 | |||
| 166 | static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) | ||
| 167 | { | ||
| 168 | unsigned int rt_ratio = sched_rt_ratio(rt_rq); | ||
| 169 | u64 period, ratio; | ||
| 170 | |||
| 171 | if (rt_ratio == SCHED_RT_FRAC) | ||
| 172 | return 0; | ||
| 173 | |||
| 174 | if (rt_rq->rt_throttled) | ||
| 175 | return 1; | ||
| 176 | |||
| 177 | period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; | ||
| 178 | ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; | ||
| 179 | |||
| 180 | if (rt_rq->rt_time > ratio) { | ||
| 181 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
| 182 | |||
| 183 | rq->rt_throttled = 1; | ||
| 184 | rt_rq->rt_throttled = 1; | ||
| 185 | |||
| 186 | sched_rt_ratio_dequeue(rt_rq); | ||
| 187 | return 1; | ||
| 188 | } | ||
| 189 | |||
| 190 | return 0; | ||
| 191 | } | ||
| 192 | |||
| 193 | static void update_sched_rt_period(struct rq *rq) | ||
| 194 | { | ||
| 195 | struct rt_rq *rt_rq; | ||
| 196 | u64 period; | ||
| 197 | |||
| 198 | while (rq->clock > rq->rt_period_expire) { | ||
| 199 | period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; | ||
| 200 | rq->rt_period_expire += period; | ||
| 201 | |||
| 202 | for_each_leaf_rt_rq(rt_rq, rq) { | ||
| 203 | unsigned long rt_ratio = sched_rt_ratio(rt_rq); | ||
| 204 | u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; | ||
| 205 | |||
| 206 | rt_rq->rt_time -= min(rt_rq->rt_time, ratio); | ||
| 207 | if (rt_rq->rt_throttled) { | ||
| 208 | rt_rq->rt_throttled = 0; | ||
| 209 | sched_rt_ratio_enqueue(rt_rq); | ||
| 210 | } | ||
| 211 | } | ||
| 212 | |||
| 213 | rq->rt_throttled = 0; | ||
| 214 | } | ||
| 215 | } | ||
| 216 | |||
| 6 | /* | 217 | /* |
| 7 | * Update the current task's runtime statistics. Skip current tasks that | 218 | * Update the current task's runtime statistics. Skip current tasks that |
| 8 | * are not in our scheduling class. | 219 | * are not in our scheduling class. |
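The sched_rt_ratio_exceeded() / update_sched_rt_period() pair added above throttles an rt_rq once its accumulated runtime exceeds the ratio of the period it is allowed. A user-space sketch of that budget check is below; the 16-bit shift, the 1000ms period and the ~95% ratio are illustrative stand-ins for constants defined elsewhere in this patch.

```c
/* Sketch of the rt_ratio throttling budget check. */
#include <stdio.h>
#include <stdint.h>

#define FRAC_SHIFT	16              /* illustrative SCHED_RT_FRAC_SHIFT */
#define NSEC_PER_MSEC	1000000ULL

int main(void)
{
	uint64_t rt_period_ms = 1000;                     /* assumed sysctl_sched_rt_period */
	uint64_t rt_ratio = (95ULL << FRAC_SHIFT) / 100;  /* ~95% of the period */

	uint64_t period = rt_period_ms * NSEC_PER_MSEC;
	uint64_t budget = (period * rt_ratio) >> FRAC_SHIFT;

	uint64_t rt_time = 960 * NSEC_PER_MSEC;           /* RT runtime used this period */

	printf("budget = %llu ns\n", (unsigned long long)budget);
	if (rt_time > budget)
		printf("throttle the rt_rq until the period rolls over\n");
	else
		printf("%llu ns of RT budget left\n",
		       (unsigned long long)(budget - rt_time));
	return 0;
}
```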
| @@ -10,6 +221,8 @@ | |||
| 10 | static void update_curr_rt(struct rq *rq) | 221 | static void update_curr_rt(struct rq *rq) |
| 11 | { | 222 | { |
| 12 | struct task_struct *curr = rq->curr; | 223 | struct task_struct *curr = rq->curr; |
| 224 | struct sched_rt_entity *rt_se = &curr->rt; | ||
| 225 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | ||
| 13 | u64 delta_exec; | 226 | u64 delta_exec; |
| 14 | 227 | ||
| 15 | if (!task_has_rt_policy(curr)) | 228 | if (!task_has_rt_policy(curr)) |
| @@ -24,47 +237,228 @@ static void update_curr_rt(struct rq *rq) | |||
| 24 | curr->se.sum_exec_runtime += delta_exec; | 237 | curr->se.sum_exec_runtime += delta_exec; |
| 25 | curr->se.exec_start = rq->clock; | 238 | curr->se.exec_start = rq->clock; |
| 26 | cpuacct_charge(curr, delta_exec); | 239 | cpuacct_charge(curr, delta_exec); |
| 240 | |||
| 241 | rt_rq->rt_time += delta_exec; | ||
| 242 | /* | ||
| 243 | * might make it a tad more accurate: | ||
| 244 | * | ||
| 245 | * update_sched_rt_period(rq); | ||
| 246 | */ | ||
| 247 | if (sched_rt_ratio_exceeded(rt_rq)) | ||
| 248 | resched_task(curr); | ||
| 27 | } | 249 | } |
| 28 | 250 | ||
| 29 | static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | 251 | static inline |
| 252 | void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | ||
| 253 | { | ||
| 254 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); | ||
| 255 | rt_rq->rt_nr_running++; | ||
| 256 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
| 257 | if (rt_se_prio(rt_se) < rt_rq->highest_prio) | ||
| 258 | rt_rq->highest_prio = rt_se_prio(rt_se); | ||
| 259 | #endif | ||
| 260 | #ifdef CONFIG_SMP | ||
| 261 | if (rt_se->nr_cpus_allowed > 1) { | ||
| 262 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
| 263 | rq->rt.rt_nr_migratory++; | ||
| 264 | } | ||
| 265 | |||
| 266 | update_rt_migration(rq_of_rt_rq(rt_rq)); | ||
| 267 | #endif | ||
| 268 | } | ||
| 269 | |||
| 270 | static inline | ||
| 271 | void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | ||
| 272 | { | ||
| 273 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); | ||
| 274 | WARN_ON(!rt_rq->rt_nr_running); | ||
| 275 | rt_rq->rt_nr_running--; | ||
| 276 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
| 277 | if (rt_rq->rt_nr_running) { | ||
| 278 | struct rt_prio_array *array; | ||
| 279 | |||
| 280 | WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio); | ||
| 281 | if (rt_se_prio(rt_se) == rt_rq->highest_prio) { | ||
| 282 | /* recalculate */ | ||
| 283 | array = &rt_rq->active; | ||
| 284 | rt_rq->highest_prio = | ||
| 285 | sched_find_first_bit(array->bitmap); | ||
| 286 | } /* otherwise leave rt_rq->highest_prio alone */ | ||
| 287 | } else | ||
| 288 | rt_rq->highest_prio = MAX_RT_PRIO; | ||
| 289 | #endif | ||
| 290 | #ifdef CONFIG_SMP | ||
| 291 | if (rt_se->nr_cpus_allowed > 1) { | ||
| 292 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
| 293 | rq->rt.rt_nr_migratory--; | ||
| 294 | } | ||
| 295 | |||
| 296 | update_rt_migration(rq_of_rt_rq(rt_rq)); | ||
| 297 | #endif /* CONFIG_SMP */ | ||
| 298 | } | ||
| 299 | |||
| 300 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se) | ||
| 301 | { | ||
| 302 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | ||
| 303 | struct rt_prio_array *array = &rt_rq->active; | ||
| 304 | struct rt_rq *group_rq = group_rt_rq(rt_se); | ||
| 305 | |||
| 306 | if (group_rq && group_rq->rt_throttled) | ||
| 307 | return; | ||
| 308 | |||
| 309 | list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); | ||
| 310 | __set_bit(rt_se_prio(rt_se), array->bitmap); | ||
| 311 | |||
| 312 | inc_rt_tasks(rt_se, rt_rq); | ||
| 313 | } | ||
| 314 | |||
| 315 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | ||
| 30 | { | 316 | { |
| 31 | struct rt_prio_array *array = &rq->rt.active; | 317 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
| 318 | struct rt_prio_array *array = &rt_rq->active; | ||
| 319 | |||
| 320 | list_del_init(&rt_se->run_list); | ||
| 321 | if (list_empty(array->queue + rt_se_prio(rt_se))) | ||
| 322 | __clear_bit(rt_se_prio(rt_se), array->bitmap); | ||
| 32 | 323 | ||
| 33 | list_add_tail(&p->run_list, array->queue + p->prio); | 324 | dec_rt_tasks(rt_se, rt_rq); |
| 34 | __set_bit(p->prio, array->bitmap); | 325 | } |
| 326 | |||
| 327 | /* | ||
| 328 | * Because the prio of an upper entry depends on the lower | ||
| 329 | * entries, we must remove entries top - down. | ||
| 330 | * | ||
| 331 | * XXX: O(1/2 h^2) because we can only walk up, not down the chain. | ||
| 332 | * doesn't matter much for now, as h=2 for GROUP_SCHED. | ||
| 333 | */ | ||
| 334 | static void dequeue_rt_stack(struct task_struct *p) | ||
| 335 | { | ||
| 336 | struct sched_rt_entity *rt_se, *top_se; | ||
| 337 | |||
| 338 | /* | ||
| 339 | * dequeue all, top - down. | ||
| 340 | */ | ||
| 341 | do { | ||
| 342 | rt_se = &p->rt; | ||
| 343 | top_se = NULL; | ||
| 344 | for_each_sched_rt_entity(rt_se) { | ||
| 345 | if (on_rt_rq(rt_se)) | ||
| 346 | top_se = rt_se; | ||
| 347 | } | ||
| 348 | if (top_se) | ||
| 349 | dequeue_rt_entity(top_se); | ||
| 350 | } while (top_se); | ||
| 35 | } | 351 | } |
| 36 | 352 | ||
| 37 | /* | 353 | /* |
| 38 | * Adding/removing a task to/from a priority array: | 354 | * Adding/removing a task to/from a priority array: |
| 39 | */ | 355 | */ |
| 356 | static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | ||
| 357 | { | ||
| 358 | struct sched_rt_entity *rt_se = &p->rt; | ||
| 359 | |||
| 360 | if (wakeup) | ||
| 361 | rt_se->timeout = 0; | ||
| 362 | |||
| 363 | dequeue_rt_stack(p); | ||
| 364 | |||
| 365 | /* | ||
| 366 | * enqueue everybody, bottom - up. | ||
| 367 | */ | ||
| 368 | for_each_sched_rt_entity(rt_se) | ||
| 369 | enqueue_rt_entity(rt_se); | ||
| 370 | |||
| 371 | inc_cpu_load(rq, p->se.load.weight); | ||
| 372 | } | ||
| 373 | |||
| 40 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | 374 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) |
| 41 | { | 375 | { |
| 42 | struct rt_prio_array *array = &rq->rt.active; | 376 | struct sched_rt_entity *rt_se = &p->rt; |
| 377 | struct rt_rq *rt_rq; | ||
| 43 | 378 | ||
| 44 | update_curr_rt(rq); | 379 | update_curr_rt(rq); |
| 45 | 380 | ||
| 46 | list_del(&p->run_list); | 381 | dequeue_rt_stack(p); |
| 47 | if (list_empty(array->queue + p->prio)) | 382 | |
| 48 | __clear_bit(p->prio, array->bitmap); | 383 | /* |
| 384 | * re-enqueue all non-empty rt_rq entities. | ||
| 385 | */ | ||
| 386 | for_each_sched_rt_entity(rt_se) { | ||
| 387 | rt_rq = group_rt_rq(rt_se); | ||
| 388 | if (rt_rq && rt_rq->rt_nr_running) | ||
| 389 | enqueue_rt_entity(rt_se); | ||
| 390 | } | ||
| 391 | |||
| 392 | dec_cpu_load(rq, p->se.load.weight); | ||
| 49 | } | 393 | } |
| 50 | 394 | ||
| 51 | /* | 395 | /* |
| 52 | * Put task to the end of the run list without the overhead of dequeue | 396 | * Put task to the end of the run list without the overhead of dequeue |
| 53 | * followed by enqueue. | 397 | * followed by enqueue. |
| 54 | */ | 398 | */ |
| 399 | static | ||
| 400 | void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) | ||
| 401 | { | ||
| 402 | struct rt_prio_array *array = &rt_rq->active; | ||
| 403 | |||
| 404 | list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); | ||
| 405 | } | ||
| 406 | |||
| 55 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) | 407 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) |
| 56 | { | 408 | { |
| 57 | struct rt_prio_array *array = &rq->rt.active; | 409 | struct sched_rt_entity *rt_se = &p->rt; |
| 410 | struct rt_rq *rt_rq; | ||
| 58 | 411 | ||
| 59 | list_move_tail(&p->run_list, array->queue + p->prio); | 412 | for_each_sched_rt_entity(rt_se) { |
| 413 | rt_rq = rt_rq_of_se(rt_se); | ||
| 414 | requeue_rt_entity(rt_rq, rt_se); | ||
| 415 | } | ||
| 60 | } | 416 | } |
| 61 | 417 | ||
| 62 | static void | 418 | static void yield_task_rt(struct rq *rq) |
| 63 | yield_task_rt(struct rq *rq) | ||
| 64 | { | 419 | { |
| 65 | requeue_task_rt(rq, rq->curr); | 420 | requeue_task_rt(rq, rq->curr); |
| 66 | } | 421 | } |
| 67 | 422 | ||
| 423 | #ifdef CONFIG_SMP | ||
| 424 | static int find_lowest_rq(struct task_struct *task); | ||
| 425 | |||
| 426 | static int select_task_rq_rt(struct task_struct *p, int sync) | ||
| 427 | { | ||
| 428 | struct rq *rq = task_rq(p); | ||
| 429 | |||
| 430 | /* | ||
| 431 | * If the current task is an RT task, then | ||
| 432 | * try to see if we can wake this RT task up on another | ||
| 433 | * runqueue. Otherwise simply start this RT task | ||
| 434 | * on its current runqueue. | ||
| 435 | * | ||
| 436 | * We want to avoid overloading runqueues, even if | ||
| 437 | * the RT task is of higher priority than the current RT task. | ||
| 438 | * RT tasks behave differently than other tasks. If | ||
| 439 | * one gets preempted, we try to push it off to another queue. | ||
| 440 | * So trying to keep a preempting RT task on the same | ||
| 441 | * cache hot CPU will force the running RT task to | ||
| 442 | * a cold CPU. So we waste all the cache for the lower | ||
| 443 | * RT task in hopes of saving some cache for an RT task | ||
| 444 | * that is just being woken and probably will have | ||
| 445 | * cold cache anyway. | ||
| 446 | */ | ||
| 447 | if (unlikely(rt_task(rq->curr)) && | ||
| 448 | (p->rt.nr_cpus_allowed > 1)) { | ||
| 449 | int cpu = find_lowest_rq(p); | ||
| 450 | |||
| 451 | return (cpu == -1) ? task_cpu(p) : cpu; | ||
| 452 | } | ||
| 453 | |||
| 454 | /* | ||
| 455 | * Otherwise, just let it ride on the affined RQ and the | ||
| 456 | * post-schedule router will push the preempted task away | ||
| 457 | */ | ||
| 458 | return task_cpu(p); | ||
| 459 | } | ||
| 460 | #endif /* CONFIG_SMP */ | ||
| 461 | |||
| 68 | /* | 462 | /* |
| 69 | * Preempt the current task with a newly woken task if needed: | 463 | * Preempt the current task with a newly woken task if needed: |
| 70 | */ | 464 | */ |
| @@ -74,25 +468,48 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) | |||
| 74 | resched_task(rq->curr); | 468 | resched_task(rq->curr); |
| 75 | } | 469 | } |
| 76 | 470 | ||
| 77 | static struct task_struct *pick_next_task_rt(struct rq *rq) | 471 | static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, |
| 472 | struct rt_rq *rt_rq) | ||
| 78 | { | 473 | { |
| 79 | struct rt_prio_array *array = &rq->rt.active; | 474 | struct rt_prio_array *array = &rt_rq->active; |
| 80 | struct task_struct *next; | 475 | struct sched_rt_entity *next = NULL; |
| 81 | struct list_head *queue; | 476 | struct list_head *queue; |
| 82 | int idx; | 477 | int idx; |
| 83 | 478 | ||
| 84 | idx = sched_find_first_bit(array->bitmap); | 479 | idx = sched_find_first_bit(array->bitmap); |
| 85 | if (idx >= MAX_RT_PRIO) | 480 | BUG_ON(idx >= MAX_RT_PRIO); |
| 86 | return NULL; | ||
| 87 | 481 | ||
| 88 | queue = array->queue + idx; | 482 | queue = array->queue + idx; |
| 89 | next = list_entry(queue->next, struct task_struct, run_list); | 483 | next = list_entry(queue->next, struct sched_rt_entity, run_list); |
| 90 | |||
| 91 | next->se.exec_start = rq->clock; | ||
| 92 | 484 | ||
| 93 | return next; | 485 | return next; |
| 94 | } | 486 | } |
| 95 | 487 | ||
| 488 | static struct task_struct *pick_next_task_rt(struct rq *rq) | ||
| 489 | { | ||
| 490 | struct sched_rt_entity *rt_se; | ||
| 491 | struct task_struct *p; | ||
| 492 | struct rt_rq *rt_rq; | ||
| 493 | |||
| 494 | rt_rq = &rq->rt; | ||
| 495 | |||
| 496 | if (unlikely(!rt_rq->rt_nr_running)) | ||
| 497 | return NULL; | ||
| 498 | |||
| 499 | if (sched_rt_ratio_exceeded(rt_rq)) | ||
| 500 | return NULL; | ||
| 501 | |||
| 502 | do { | ||
| 503 | rt_se = pick_next_rt_entity(rq, rt_rq); | ||
| 504 | BUG_ON(!rt_se); | ||
| 505 | rt_rq = group_rt_rq(rt_se); | ||
| 506 | } while (rt_rq); | ||
| 507 | |||
| 508 | p = rt_task_of(rt_se); | ||
| 509 | p->se.exec_start = rq->clock; | ||
| 510 | return p; | ||
| 511 | } | ||
| 512 | |||
| 96 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | 513 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) |
| 97 | { | 514 | { |
| 98 | update_curr_rt(rq); | 515 | update_curr_rt(rq); |
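pick_next_rt_entity() above selects the highest-priority non-empty queue by finding the lowest set bit in the per-rt_rq priority bitmap. The toy below illustrates that selection; it is a sketch, not the kernel's sched_find_first_bit() implementation.

```c
/* Toy illustration of the priority-bitmap pick used by the RT scheduler. */
#include <stdio.h>

#define MAX_RT_PRIO 100

static int find_first_set(const unsigned char *bitmap)
{
	int i;

	/* lowest set bit == numerically lowest prio == highest priority */
	for (i = 0; i < MAX_RT_PRIO; i++)
		if (bitmap[i / 8] & (1u << (i % 8)))
			return i;
	return MAX_RT_PRIO;
}

int main(void)
{
	unsigned char bitmap[(MAX_RT_PRIO + 7) / 8] = { 0 };

	/* tasks queued at prio 10 and prio 54 */
	bitmap[10 / 8] |= 1u << (10 % 8);
	bitmap[54 / 8] |= 1u << (54 % 8);

	printf("next RT queue: prio %d\n", find_first_set(bitmap)); /* 10 */
	return 0;
}
```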
| @@ -100,76 +517,448 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
| 100 | } | 517 | } |
| 101 | 518 | ||
| 102 | #ifdef CONFIG_SMP | 519 | #ifdef CONFIG_SMP |
| 103 | /* | 520 | |
| 104 | * Load-balancing iterator. Note: while the runqueue stays locked | 521 | /* Only try algorithms three times */ |
| 105 | * during the whole iteration, the current task might be | 522 | #define RT_MAX_TRIES 3 |
| 106 | * dequeued so the iterator has to be dequeue-safe. Here we | 523 | |
| 107 | * achieve that by always pre-iterating before returning | 524 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest); |
| 108 | * the current task: | 525 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); |
| 109 | */ | 526 | |
| 110 | static struct task_struct *load_balance_start_rt(void *arg) | 527 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
| 111 | { | 528 | { |
| 112 | struct rq *rq = arg; | 529 | if (!task_running(rq, p) && |
| 113 | struct rt_prio_array *array = &rq->rt.active; | 530 | (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && |
| 114 | struct list_head *head, *curr; | 531 | (p->rt.nr_cpus_allowed > 1)) |
| 115 | struct task_struct *p; | 532 | return 1; |
| 533 | return 0; | ||
| 534 | } | ||
| 535 | |||
| 536 | /* Return the second highest RT task, NULL otherwise */ | ||
| 537 | static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) | ||
| 538 | { | ||
| 539 | struct task_struct *next = NULL; | ||
| 540 | struct sched_rt_entity *rt_se; | ||
| 541 | struct rt_prio_array *array; | ||
| 542 | struct rt_rq *rt_rq; | ||
| 116 | int idx; | 543 | int idx; |
| 117 | 544 | ||
| 118 | idx = sched_find_first_bit(array->bitmap); | 545 | for_each_leaf_rt_rq(rt_rq, rq) { |
| 119 | if (idx >= MAX_RT_PRIO) | 546 | array = &rt_rq->active; |
| 120 | return NULL; | 547 | idx = sched_find_first_bit(array->bitmap); |
| 548 | next_idx: | ||
| 549 | if (idx >= MAX_RT_PRIO) | ||
| 550 | continue; | ||
| 551 | if (next && next->prio < idx) | ||
| 552 | continue; | ||
| 553 | list_for_each_entry(rt_se, array->queue + idx, run_list) { | ||
| 554 | struct task_struct *p = rt_task_of(rt_se); | ||
| 555 | if (pick_rt_task(rq, p, cpu)) { | ||
| 556 | next = p; | ||
| 557 | break; | ||
| 558 | } | ||
| 559 | } | ||
| 560 | if (!next) { | ||
| 561 | idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); | ||
| 562 | goto next_idx; | ||
| 563 | } | ||
| 564 | } | ||
| 121 | 565 | ||
| 122 | head = array->queue + idx; | 566 | return next; |
| 123 | curr = head->prev; | 567 | } |
| 124 | 568 | ||
| 125 | p = list_entry(curr, struct task_struct, run_list); | 569 | static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); |
| 126 | 570 | ||
| 127 | curr = curr->prev; | 571 | static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) |
| 572 | { | ||
| 573 | int lowest_prio = -1; | ||
| 574 | int lowest_cpu = -1; | ||
| 575 | int count = 0; | ||
| 576 | int cpu; | ||
| 128 | 577 | ||
| 129 | rq->rt.rt_load_balance_idx = idx; | 578 | cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed); |
| 130 | rq->rt.rt_load_balance_head = head; | ||
| 131 | rq->rt.rt_load_balance_curr = curr; | ||
| 132 | 579 | ||
| 133 | return p; | 580 | /* |
| 581 | * Scan each rq for the lowest prio. | ||
| 582 | */ | ||
| 583 | for_each_cpu_mask(cpu, *lowest_mask) { | ||
| 584 | struct rq *rq = cpu_rq(cpu); | ||
| 585 | |||
| 586 | /* We look for lowest RT prio or non-rt CPU */ | ||
| 587 | if (rq->rt.highest_prio >= MAX_RT_PRIO) { | ||
| 588 | /* | ||
| 589 | * if we already found a low RT queue | ||
| 590 | * and now we found this non-rt queue | ||
| 591 | * clear the mask and set our bit. | ||
| 592 | * Otherwise just return the queue as is | ||
| 593 | * and the count==1 will cause the algorithm | ||
| 594 | * to use the first bit found. | ||
| 595 | */ | ||
| 596 | if (lowest_cpu != -1) { | ||
| 597 | cpus_clear(*lowest_mask); | ||
| 598 | cpu_set(rq->cpu, *lowest_mask); | ||
| 599 | } | ||
| 600 | return 1; | ||
| 601 | } | ||
| 602 | |||
| 603 | /* no locking for now */ | ||
| 604 | if ((rq->rt.highest_prio > task->prio) | ||
| 605 | && (rq->rt.highest_prio >= lowest_prio)) { | ||
| 606 | if (rq->rt.highest_prio > lowest_prio) { | ||
| 607 | /* new low - clear old data */ | ||
| 608 | lowest_prio = rq->rt.highest_prio; | ||
| 609 | lowest_cpu = cpu; | ||
| 610 | count = 0; | ||
| 611 | } | ||
| 612 | count++; | ||
| 613 | } else | ||
| 614 | cpu_clear(cpu, *lowest_mask); | ||
| 615 | } | ||
| 616 | |||
| 617 | /* | ||
| 618 | * Clear out all the set bits that represent | ||
| 619 | * runqueues that were of higher prio than | ||
| 620 | * the lowest_prio. | ||
| 621 | */ | ||
| 622 | if (lowest_cpu > 0) { | ||
| 623 | /* | ||
| 624 | * Perhaps we could add another cpumask op to | ||
| 625 | * zero out bits. Like cpu_zero_bits(cpumask, nrbits); | ||
| 626 | * Then that could be optimized to use memset and such. | ||
| 627 | */ | ||
| 628 | for_each_cpu_mask(cpu, *lowest_mask) { | ||
| 629 | if (cpu >= lowest_cpu) | ||
| 630 | break; | ||
| 631 | cpu_clear(cpu, *lowest_mask); | ||
| 632 | } | ||
| 633 | } | ||
| 634 | |||
| 635 | return count; | ||
| 134 | } | 636 | } |
| 135 | 637 | ||
| 136 | static struct task_struct *load_balance_next_rt(void *arg) | 638 | static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) |
| 137 | { | 639 | { |
| 138 | struct rq *rq = arg; | 640 | int first; |
| 139 | struct rt_prio_array *array = &rq->rt.active; | 641 | |
| 140 | struct list_head *head, *curr; | 642 | /* "this_cpu" is cheaper to preempt than a remote processor */ |
| 141 | struct task_struct *p; | 643 | if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) |
| 142 | int idx; | 644 | return this_cpu; |
| 645 | |||
| 646 | first = first_cpu(*mask); | ||
| 647 | if (first != NR_CPUS) | ||
| 648 | return first; | ||
| 649 | |||
| 650 | return -1; | ||
| 651 | } | ||
| 652 | |||
| 653 | static int find_lowest_rq(struct task_struct *task) | ||
| 654 | { | ||
| 655 | struct sched_domain *sd; | ||
| 656 | cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); | ||
| 657 | int this_cpu = smp_processor_id(); | ||
| 658 | int cpu = task_cpu(task); | ||
| 659 | int count = find_lowest_cpus(task, lowest_mask); | ||
| 143 | 660 | ||
| 144 | idx = rq->rt.rt_load_balance_idx; | 661 | if (!count) |
| 145 | head = rq->rt.rt_load_balance_head; | 662 | return -1; /* No targets found */ |
| 146 | curr = rq->rt.rt_load_balance_curr; | ||
| 147 | 663 | ||
| 148 | /* | 664 | /* |
| 149 | * If we arrived back to the head again then | 665 | * There is no sense in performing an optimal search if only one |
| 150 | * iterate to the next queue (if any): | 666 | * target is found. |
| 151 | */ | 667 | */ |
| 152 | if (unlikely(head == curr)) { | 668 | if (count == 1) |
| 153 | int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); | 669 | return first_cpu(*lowest_mask); |
| 154 | 670 | ||
| 155 | if (next_idx >= MAX_RT_PRIO) | 671 | /* |
| 156 | return NULL; | 672 | * At this point we have built a mask of cpus representing the |
| 673 | * lowest priority tasks in the system. Now we want to elect | ||
| 674 | * the best one based on our affinity and topology. | ||
| 675 | * | ||
| 676 | * We prioritize the last cpu that the task executed on since | ||
| 677 | * it is most likely cache-hot in that location. | ||
| 678 | */ | ||
| 679 | if (cpu_isset(cpu, *lowest_mask)) | ||
| 680 | return cpu; | ||
| 681 | |||
| 682 | /* | ||
| 683 | * Otherwise, we consult the sched_domains span maps to figure | ||
| 684 | * out which cpu is logically closest to our hot cache data. | ||
| 685 | */ | ||
| 686 | if (this_cpu == cpu) | ||
| 687 | this_cpu = -1; /* Skip this_cpu opt if the same */ | ||
| 688 | |||
| 689 | for_each_domain(cpu, sd) { | ||
| 690 | if (sd->flags & SD_WAKE_AFFINE) { | ||
| 691 | cpumask_t domain_mask; | ||
| 692 | int best_cpu; | ||
| 157 | 693 | ||
| 158 | idx = next_idx; | 694 | cpus_and(domain_mask, sd->span, *lowest_mask); |
| 159 | head = array->queue + idx; | ||
| 160 | curr = head->prev; | ||
| 161 | 695 | ||
| 162 | rq->rt.rt_load_balance_idx = idx; | 696 | best_cpu = pick_optimal_cpu(this_cpu, |
| 163 | rq->rt.rt_load_balance_head = head; | 697 | &domain_mask); |
| 698 | if (best_cpu != -1) | ||
| 699 | return best_cpu; | ||
| 700 | } | ||
| 164 | } | 701 | } |
| 165 | 702 | ||
| 166 | p = list_entry(curr, struct task_struct, run_list); | 703 | /* |
| 704 | * And finally, if there were no matches within the domains | ||
| 705 | * just give the caller *something* to work with from the compatible | ||
| 706 | * locations. | ||
| 707 | */ | ||
| 708 | return pick_optimal_cpu(this_cpu, lowest_mask); | ||
| 709 | } | ||
| 167 | 710 | ||
| 168 | curr = curr->prev; | 711 | /* Will lock the rq it finds */ |
| 712 | static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | ||
| 713 | { | ||
| 714 | struct rq *lowest_rq = NULL; | ||
| 715 | int tries; | ||
| 716 | int cpu; | ||
| 169 | 717 | ||
| 170 | rq->rt.rt_load_balance_curr = curr; | 718 | for (tries = 0; tries < RT_MAX_TRIES; tries++) { |
| 719 | cpu = find_lowest_rq(task); | ||
| 171 | 720 | ||
| 172 | return p; | 721 | if ((cpu == -1) || (cpu == rq->cpu)) |
| 722 | break; | ||
| 723 | |||
| 724 | lowest_rq = cpu_rq(cpu); | ||
| 725 | |||
| 726 | /* if the prio of this runqueue changed, try again */ | ||
| 727 | if (double_lock_balance(rq, lowest_rq)) { | ||
| 728 | /* | ||
| 729 | * We had to unlock the run queue. In | ||
| 730 | * the meantime, the task could have | ||
| 731 | * migrated already or had its affinity changed. | ||
| 732 | * Also make sure that it wasn't scheduled on its rq. | ||
| 733 | */ | ||
| 734 | if (unlikely(task_rq(task) != rq || | ||
| 735 | !cpu_isset(lowest_rq->cpu, | ||
| 736 | task->cpus_allowed) || | ||
| 737 | task_running(rq, task) || | ||
| 738 | !task->se.on_rq)) { | ||
| 739 | |||
| 740 | spin_unlock(&lowest_rq->lock); | ||
| 741 | lowest_rq = NULL; | ||
| 742 | break; | ||
| 743 | } | ||
| 744 | } | ||
| 745 | |||
| 746 | /* If this rq is still suitable use it. */ | ||
| 747 | if (lowest_rq->rt.highest_prio > task->prio) | ||
| 748 | break; | ||
| 749 | |||
| 750 | /* try again */ | ||
| 751 | spin_unlock(&lowest_rq->lock); | ||
| 752 | lowest_rq = NULL; | ||
| 753 | } | ||
| 754 | |||
| 755 | return lowest_rq; | ||
| 756 | } | ||
| 757 | |||
| 758 | /* | ||
| 759 | * If the current CPU has more than one RT task, see if the | ||
| 760 | * non-running task can migrate over to a CPU that is running a task | ||
| 761 | * of lesser priority. | ||
| 762 | */ | ||
| 763 | static int push_rt_task(struct rq *rq) | ||
| 764 | { | ||
| 765 | struct task_struct *next_task; | ||
| 766 | struct rq *lowest_rq; | ||
| 767 | int ret = 0; | ||
| 768 | int paranoid = RT_MAX_TRIES; | ||
| 769 | |||
| 770 | if (!rq->rt.overloaded) | ||
| 771 | return 0; | ||
| 772 | |||
| 773 | next_task = pick_next_highest_task_rt(rq, -1); | ||
| 774 | if (!next_task) | ||
| 775 | return 0; | ||
| 776 | |||
| 777 | retry: | ||
| 778 | if (unlikely(next_task == rq->curr)) { | ||
| 779 | WARN_ON(1); | ||
| 780 | return 0; | ||
| 781 | } | ||
| 782 | |||
| 783 | /* | ||
| 784 | * It's possible that next_task slipped in with a | ||
| 785 | * higher priority than current. If that's the case, | ||
| 786 | * just reschedule current. | ||
| 787 | */ | ||
| 788 | if (unlikely(next_task->prio < rq->curr->prio)) { | ||
| 789 | resched_task(rq->curr); | ||
| 790 | return 0; | ||
| 791 | } | ||
| 792 | |||
| 793 | /* We might release rq lock */ | ||
| 794 | get_task_struct(next_task); | ||
| 795 | |||
| 796 | /* find_lock_lowest_rq locks the rq if found */ | ||
| 797 | lowest_rq = find_lock_lowest_rq(next_task, rq); | ||
| 798 | if (!lowest_rq) { | ||
| 799 | struct task_struct *task; | ||
| 800 | /* | ||
| 801 | * find_lock_lowest_rq releases rq->lock | ||
| 802 | * so it is possible that next_task has changed. | ||
| 803 | * If it has, then try again. | ||
| 804 | */ | ||
| 805 | task = pick_next_highest_task_rt(rq, -1); | ||
| 806 | if (unlikely(task != next_task) && task && paranoid--) { | ||
| 807 | put_task_struct(next_task); | ||
| 808 | next_task = task; | ||
| 809 | goto retry; | ||
| 810 | } | ||
| 811 | goto out; | ||
| 812 | } | ||
| 813 | |||
| 814 | deactivate_task(rq, next_task, 0); | ||
| 815 | set_task_cpu(next_task, lowest_rq->cpu); | ||
| 816 | activate_task(lowest_rq, next_task, 0); | ||
| 817 | |||
| 818 | resched_task(lowest_rq->curr); | ||
| 819 | |||
| 820 | spin_unlock(&lowest_rq->lock); | ||
| 821 | |||
| 822 | ret = 1; | ||
| 823 | out: | ||
| 824 | put_task_struct(next_task); | ||
| 825 | |||
| 826 | return ret; | ||
| 827 | } | ||
| 828 | |||
| 829 | /* | ||
| 830 | * TODO: Currently we just use the second highest prio task on | ||
| 831 | * the queue, and stop when it can't migrate (or there's | ||
| 832 | * no more RT tasks). There may be a case where a lower | ||
| 833 | * priority RT task has a different affinity than the | ||
| 834 | * higher RT task. In this case the lower RT task could | ||
| 835 | * possibly be able to migrate whereas the higher priority | ||
| 836 | * RT task could not. We currently ignore this issue. | ||
| 837 | * Enhancements are welcome! | ||
| 838 | */ | ||
| 839 | static void push_rt_tasks(struct rq *rq) | ||
| 840 | { | ||
| 841 | /* push_rt_task will return true if it moved an RT */ | ||
| 842 | while (push_rt_task(rq)) | ||
| 843 | ; | ||
| 844 | } | ||
| 845 | |||
| 846 | static int pull_rt_task(struct rq *this_rq) | ||
| 847 | { | ||
| 848 | int this_cpu = this_rq->cpu, ret = 0, cpu; | ||
| 849 | struct task_struct *p, *next; | ||
| 850 | struct rq *src_rq; | ||
| 851 | |||
| 852 | if (likely(!rt_overloaded(this_rq))) | ||
| 853 | return 0; | ||
| 854 | |||
| 855 | next = pick_next_task_rt(this_rq); | ||
| 856 | |||
| 857 | for_each_cpu_mask(cpu, this_rq->rd->rto_mask) { | ||
| 858 | if (this_cpu == cpu) | ||
| 859 | continue; | ||
| 860 | |||
| 861 | src_rq = cpu_rq(cpu); | ||
| 862 | /* | ||
| 863 | * We can potentially drop this_rq's lock in | ||
| 864 | * double_lock_balance, and another CPU could | ||
| 865 | * steal our next task - hence we must cause | ||
| 866 | * the caller to recalculate the next task | ||
| 867 | * in that case: | ||
| 868 | */ | ||
| 869 | if (double_lock_balance(this_rq, src_rq)) { | ||
| 870 | struct task_struct *old_next = next; | ||
| 871 | |||
| 872 | next = pick_next_task_rt(this_rq); | ||
| 873 | if (next != old_next) | ||
| 874 | ret = 1; | ||
| 875 | } | ||
| 876 | |||
| 877 | /* | ||
| 878 | * Are there still pullable RT tasks? | ||
| 879 | */ | ||
| 880 | if (src_rq->rt.rt_nr_running <= 1) | ||
| 881 | goto skip; | ||
| 882 | |||
| 883 | p = pick_next_highest_task_rt(src_rq, this_cpu); | ||
| 884 | |||
| 885 | /* | ||
| 886 | * Do we have an RT task that preempts | ||
| 887 | * the to-be-scheduled task? | ||
| 888 | */ | ||
| 889 | if (p && (!next || (p->prio < next->prio))) { | ||
| 890 | WARN_ON(p == src_rq->curr); | ||
| 891 | WARN_ON(!p->se.on_rq); | ||
| 892 | |||
| 893 | /* | ||
| 894 | * There's a chance that p is higher in priority | ||
| 895 | * than what's currently running on its cpu. | ||
| 896 | * This is just that p is waking up and hasn't | ||
| 897 | * had a chance to schedule. We only pull | ||
| 898 | * p if it is lower in priority than the | ||
| 899 | * current task on the run queue or | ||
| 900 | * this_rq next task is lower in prio than | ||
| 901 | * the current task on that rq. | ||
| 902 | */ | ||
| 903 | if (p->prio < src_rq->curr->prio || | ||
| 904 | (next && next->prio < src_rq->curr->prio)) | ||
| 905 | goto skip; | ||
| 906 | |||
| 907 | ret = 1; | ||
| 908 | |||
| 909 | deactivate_task(src_rq, p, 0); | ||
| 910 | set_task_cpu(p, this_cpu); | ||
| 911 | activate_task(this_rq, p, 0); | ||
| 912 | /* | ||
| 913 | * We continue with the search, just in | ||
| 914 | * case there's an even higher prio task | ||
| 915 | * in another runqueue. (low likelihood | ||
| 916 | * but possible) | ||
| 917 | * | ||
| 918 | * Update next so that we won't pick a task | ||
| 919 | * on another cpu with a priority lower than (or equal to) | ||
| 920 | * the one we just picked. | ||
| 921 | */ | ||
| 922 | next = p; | ||
| 923 | |||
| 924 | } | ||
| 925 | skip: | ||
| 926 | spin_unlock(&src_rq->lock); | ||
| 927 | } | ||
| 928 | |||
| 929 | return ret; | ||
| 930 | } | ||
| 931 | |||
| 932 | static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) | ||
| 933 | { | ||
| 934 | /* Try to pull RT tasks here if we lower this rq's prio */ | ||
| 935 | if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) | ||
| 936 | pull_rt_task(rq); | ||
| 937 | } | ||
| 938 | |||
| 939 | static void post_schedule_rt(struct rq *rq) | ||
| 940 | { | ||
| 941 | /* | ||
| 942 | * If we have more than one rt_task queued, then | ||
| 943 | * see if we can push the other rt_tasks off to other CPUs. | ||
| 944 | * Note we may release the rq lock, and since | ||
| 945 | * the lock was owned by prev, we need to release it | ||
| 946 | * first via finish_lock_switch and then reacquire it here. | ||
| 947 | */ | ||
| 948 | if (unlikely(rq->rt.overloaded)) { | ||
| 949 | spin_lock_irq(&rq->lock); | ||
| 950 | push_rt_tasks(rq); | ||
| 951 | spin_unlock_irq(&rq->lock); | ||
| 952 | } | ||
| 953 | } | ||
| 954 | |||
| 955 | |||
| 956 | static void task_wake_up_rt(struct rq *rq, struct task_struct *p) | ||
| 957 | { | ||
| 958 | if (!task_running(rq, p) && | ||
| 959 | (p->prio >= rq->rt.highest_prio) && | ||
| 960 | rq->rt.overloaded) | ||
| 961 | push_rt_tasks(rq); | ||
| 173 | } | 962 | } |
| 174 | 963 | ||
| 175 | static unsigned long | 964 | static unsigned long |
| @@ -178,38 +967,170 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 178 | struct sched_domain *sd, enum cpu_idle_type idle, | 967 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 179 | int *all_pinned, int *this_best_prio) | 968 | int *all_pinned, int *this_best_prio) |
| 180 | { | 969 | { |
| 181 | struct rq_iterator rt_rq_iterator; | 970 | /* don't touch RT tasks */ |
| 182 | 971 | return 0; | |
| 183 | rt_rq_iterator.start = load_balance_start_rt; | ||
| 184 | rt_rq_iterator.next = load_balance_next_rt; | ||
| 185 | /* pass 'busiest' rq argument into | ||
| 186 | * load_balance_[start|next]_rt iterators | ||
| 187 | */ | ||
| 188 | rt_rq_iterator.arg = busiest; | ||
| 189 | |||
| 190 | return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd, | ||
| 191 | idle, all_pinned, this_best_prio, &rt_rq_iterator); | ||
| 192 | } | 972 | } |
| 193 | 973 | ||
| 194 | static int | 974 | static int |
| 195 | move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | 975 | move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 196 | struct sched_domain *sd, enum cpu_idle_type idle) | 976 | struct sched_domain *sd, enum cpu_idle_type idle) |
| 197 | { | 977 | { |
| 198 | struct rq_iterator rt_rq_iterator; | 978 | /* don't touch RT tasks */ |
| 979 | return 0; | ||
| 980 | } | ||
| 981 | |||
| 982 | static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) | ||
| 983 | { | ||
| 984 | int weight = cpus_weight(*new_mask); | ||
| 985 | |||
| 986 | BUG_ON(!rt_task(p)); | ||
| 199 | 987 | ||
| 200 | rt_rq_iterator.start = load_balance_start_rt; | 988 | /* |
| 201 | rt_rq_iterator.next = load_balance_next_rt; | 989 | * Update the migration status of the RQ if we have an RT task |
| 202 | rt_rq_iterator.arg = busiest; | 990 | * which is running AND changing its weight value. |
| 991 | */ | ||
| 992 | if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { | ||
| 993 | struct rq *rq = task_rq(p); | ||
| 994 | |||
| 995 | if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { | ||
| 996 | rq->rt.rt_nr_migratory++; | ||
| 997 | } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { | ||
| 998 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
| 999 | rq->rt.rt_nr_migratory--; | ||
| 1000 | } | ||
| 1001 | |||
| 1002 | update_rt_migration(rq); | ||
| 1003 | } | ||
| 203 | 1004 | ||
| 204 | return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, | 1005 | p->cpus_allowed = *new_mask; |
| 205 | &rt_rq_iterator); | 1006 | p->rt.nr_cpus_allowed = weight; |
| 206 | } | 1007 | } |
| 207 | #endif | ||
| 208 | 1008 | ||
| 209 | static void task_tick_rt(struct rq *rq, struct task_struct *p) | 1009 | /* Assumes rq->lock is held */ |
| 1010 | static void join_domain_rt(struct rq *rq) | ||
| 1011 | { | ||
| 1012 | if (rq->rt.overloaded) | ||
| 1013 | rt_set_overload(rq); | ||
| 1014 | } | ||
| 1015 | |||
| 1016 | /* Assumes rq->lock is held */ | ||
| 1017 | static void leave_domain_rt(struct rq *rq) | ||
| 1018 | { | ||
| 1019 | if (rq->rt.overloaded) | ||
| 1020 | rt_clear_overload(rq); | ||
| 1021 | } | ||
| 1022 | |||
| 1023 | /* | ||
| 1024 | * When switching from the rt queue, we bring ourselves to a position | ||
| 1025 | * where we might want to pull RT tasks from other runqueues. | ||
| 1026 | */ | ||
| 1027 | static void switched_from_rt(struct rq *rq, struct task_struct *p, | ||
| 1028 | int running) | ||
| 1029 | { | ||
| 1030 | /* | ||
| 1031 | * If there are other RT tasks then we will reschedule | ||
| 1032 | * and the scheduling of the other RT tasks will handle | ||
| 1033 | * the balancing. But if we are the last RT task | ||
| 1034 | * we may need to handle the pulling of RT tasks | ||
| 1035 | * now. | ||
| 1036 | */ | ||
| 1037 | if (!rq->rt.rt_nr_running) | ||
| 1038 | pull_rt_task(rq); | ||
| 1039 | } | ||
| 1040 | #endif /* CONFIG_SMP */ | ||
| 1041 | |||
| 1042 | /* | ||
| 1043 | * When switching a task to RT, we may overload the runqueue | ||
| 1044 | * with RT tasks. In this case we try to push them off to | ||
| 1045 | * other runqueues. | ||
| 1046 | */ | ||
| 1047 | static void switched_to_rt(struct rq *rq, struct task_struct *p, | ||
| 1048 | int running) | ||
| 1049 | { | ||
| 1050 | int check_resched = 1; | ||
| 1051 | |||
| 1052 | /* | ||
| 1053 | * If we are already running, then there's nothing | ||
| 1054 | * that needs to be done. But if we are not running | ||
| 1055 | * we may need to preempt the current running task. | ||
| 1056 | * If that current running task is also an RT task | ||
| 1057 | * then see if we can move to another run queue. | ||
| 1058 | */ | ||
| 1059 | if (!running) { | ||
| 1060 | #ifdef CONFIG_SMP | ||
| 1061 | if (rq->rt.overloaded && push_rt_task(rq) && | ||
| 1062 | /* Don't resched if we changed runqueues */ | ||
| 1063 | rq != task_rq(p)) | ||
| 1064 | check_resched = 0; | ||
| 1065 | #endif /* CONFIG_SMP */ | ||
| 1066 | if (check_resched && p->prio < rq->curr->prio) | ||
| 1067 | resched_task(rq->curr); | ||
| 1068 | } | ||
| 1069 | } | ||
| 1070 | |||
| 1071 | /* | ||
| 1072 | * Priority of the task has changed. This may cause | ||
| 1073 | * us to initiate a push or pull. | ||
| 1074 | */ | ||
| 1075 | static void prio_changed_rt(struct rq *rq, struct task_struct *p, | ||
| 1076 | int oldprio, int running) | ||
| 1077 | { | ||
| 1078 | if (running) { | ||
| 1079 | #ifdef CONFIG_SMP | ||
| 1080 | /* | ||
| 1081 | * If our priority decreases while running, we | ||
| 1082 | * may need to pull tasks to this runqueue. | ||
| 1083 | */ | ||
| 1084 | if (oldprio < p->prio) | ||
| 1085 | pull_rt_task(rq); | ||
| 1086 | /* | ||
| 1087 | * If there's a higher priority task waiting to run | ||
| 1088 | * then reschedule. | ||
| 1089 | */ | ||
| 1090 | if (p->prio > rq->rt.highest_prio) | ||
| 1091 | resched_task(p); | ||
| 1092 | #else | ||
| 1093 | /* For UP simply resched on drop of prio */ | ||
| 1094 | if (oldprio < p->prio) | ||
| 1095 | resched_task(p); | ||
| 1096 | #endif /* CONFIG_SMP */ | ||
| 1097 | } else { | ||
| 1098 | /* | ||
| 1099 | * This task is not running, but if its priority is | ||
| 1100 | * higher than the current running task's | ||
| 1101 | * then reschedule. | ||
| 1102 | */ | ||
| 1103 | if (p->prio < rq->curr->prio) | ||
| 1104 | resched_task(rq->curr); | ||
| 1105 | } | ||
| 1106 | } | ||
| 1107 | |||
| 1108 | static void watchdog(struct rq *rq, struct task_struct *p) | ||
| 1109 | { | ||
| 1110 | unsigned long soft, hard; | ||
| 1111 | |||
| 1112 | if (!p->signal) | ||
| 1113 | return; | ||
| 1114 | |||
| 1115 | soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; | ||
| 1116 | hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; | ||
| 1117 | |||
| 1118 | if (soft != RLIM_INFINITY) { | ||
| 1119 | unsigned long next; | ||
| 1120 | |||
| 1121 | p->rt.timeout++; | ||
| 1122 | next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); | ||
| 1123 | if (p->rt.timeout > next) | ||
| 1124 | p->it_sched_expires = p->se.sum_exec_runtime; | ||
| 1125 | } | ||
| 1126 | } | ||
| 1127 | |||
| 1128 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | ||
| 210 | { | 1129 | { |
| 211 | update_curr_rt(rq); | 1130 | update_curr_rt(rq); |
| 212 | 1131 | ||
| 1132 | watchdog(rq, p); | ||
| 1133 | |||
| 213 | /* | 1134 | /* |
| 214 | * RR tasks need a special form of timeslice management. | 1135 | * RR tasks need a special form of timeslice management. |
| 215 | * FIFO tasks have no timeslices. | 1136 | * FIFO tasks have no timeslices. |
| @@ -217,16 +1138,16 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) | |||
| 217 | if (p->policy != SCHED_RR) | 1138 | if (p->policy != SCHED_RR) |
| 218 | return; | 1139 | return; |
| 219 | 1140 | ||
| 220 | if (--p->time_slice) | 1141 | if (--p->rt.time_slice) |
| 221 | return; | 1142 | return; |
| 222 | 1143 | ||
| 223 | p->time_slice = DEF_TIMESLICE; | 1144 | p->rt.time_slice = DEF_TIMESLICE; |
| 224 | 1145 | ||
| 225 | /* | 1146 | /* |
| 226 | * Requeue to the end of queue if we are not the only element | 1147 | * Requeue to the end of queue if we are not the only element |
| 227 | * on the queue: | 1148 | * on the queue: |
| 228 | */ | 1149 | */ |
| 229 | if (p->run_list.prev != p->run_list.next) { | 1150 | if (p->rt.run_list.prev != p->rt.run_list.next) { |
| 230 | requeue_task_rt(rq, p); | 1151 | requeue_task_rt(rq, p); |
| 231 | set_tsk_need_resched(p); | 1152 | set_tsk_need_resched(p); |
| 232 | } | 1153 | } |
| @@ -244,6 +1165,9 @@ const struct sched_class rt_sched_class = { | |||
| 244 | .enqueue_task = enqueue_task_rt, | 1165 | .enqueue_task = enqueue_task_rt, |
| 245 | .dequeue_task = dequeue_task_rt, | 1166 | .dequeue_task = dequeue_task_rt, |
| 246 | .yield_task = yield_task_rt, | 1167 | .yield_task = yield_task_rt, |
| 1168 | #ifdef CONFIG_SMP | ||
| 1169 | .select_task_rq = select_task_rq_rt, | ||
| 1170 | #endif /* CONFIG_SMP */ | ||
| 247 | 1171 | ||
| 248 | .check_preempt_curr = check_preempt_curr_rt, | 1172 | .check_preempt_curr = check_preempt_curr_rt, |
| 249 | 1173 | ||
| @@ -253,8 +1177,18 @@ const struct sched_class rt_sched_class = { | |||
| 253 | #ifdef CONFIG_SMP | 1177 | #ifdef CONFIG_SMP |
| 254 | .load_balance = load_balance_rt, | 1178 | .load_balance = load_balance_rt, |
| 255 | .move_one_task = move_one_task_rt, | 1179 | .move_one_task = move_one_task_rt, |
| 1180 | .set_cpus_allowed = set_cpus_allowed_rt, | ||
| 1181 | .join_domain = join_domain_rt, | ||
| 1182 | .leave_domain = leave_domain_rt, | ||
| 1183 | .pre_schedule = pre_schedule_rt, | ||
| 1184 | .post_schedule = post_schedule_rt, | ||
| 1185 | .task_wake_up = task_wake_up_rt, | ||
| 1186 | .switched_from = switched_from_rt, | ||
| 256 | #endif | 1187 | #endif |
| 257 | 1188 | ||
| 258 | .set_curr_task = set_curr_task_rt, | 1189 | .set_curr_task = set_curr_task_rt, |
| 259 | .task_tick = task_tick_rt, | 1190 | .task_tick = task_tick_rt, |
| 1191 | |||
| 1192 | .prio_changed = prio_changed_rt, | ||
| 1193 | .switched_to = switched_to_rt, | ||
| 260 | }; | 1194 | }; |
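
The sched_rt.c hunks above replace the old passive load-balancing iterators with an active push/pull model: find_lowest_rq() keeps only the CPUs whose highest queued RT priority is lower than the pushed task's, returns immediately if it finds a CPU with no RT work at all, and prefers the CPU the task last ran on (likely cache-hot) before walking the sched domains. Below is a simplified user-space sketch of just that selection policy — an illustration under reduced assumptions (no locking, no overload mask, no sched-domain walk), not the kernel code itself.

```c
/*
 * Illustrative sketch only -- not the kernel code above.  It models the
 * core policy of find_lowest_cpus()/find_lowest_rq(): among the CPUs the
 * task may run on, pick one whose highest queued RT priority is lower
 * than the task's (numerically greater, since smaller numbers mean
 * higher priority), preferring a CPU with no RT work at all and, on a
 * tie, the CPU the task last ran on.
 */
#include <stdio.h>

#define NR_CPUS     4
#define MAX_RT_PRIO 100     /* >= MAX_RT_PRIO means "no RT task queued" */

static int find_lowest_cpu(const int highest_prio[NR_CPUS],
                           const int allowed[NR_CPUS],
                           int task_prio, int last_cpu)
{
        int best_cpu = -1, best_prio = -1, cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                if (!allowed[cpu])
                        continue;

                /* A CPU running no RT task is always good enough. */
                if (highest_prio[cpu] >= MAX_RT_PRIO)
                        return cpu;

                /* Skip CPUs whose top RT task is not lower prio than ours. */
                if (highest_prio[cpu] <= task_prio)
                        continue;

                if (highest_prio[cpu] > best_prio) {
                        best_prio = highest_prio[cpu];
                        best_cpu = cpu;
                } else if (highest_prio[cpu] == best_prio && cpu == last_cpu) {
                        best_cpu = cpu;     /* prefer the cache-hot CPU */
                }
        }
        return best_cpu;
}

int main(void)
{
        int highest_prio[NR_CPUS] = { 10, 40, 40, 5 };
        int allowed[NR_CPUS]      = {  1,  1,  1, 1 };

        /* A prio-20 task that last ran on CPU 2 should be pushed there. */
        printf("target cpu: %d\n",
               find_lowest_cpu(highest_prio, allowed, 20, 2));
        return 0;
}
```
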
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 11df812263c8..c1d76552446e 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
| @@ -8,6 +8,7 @@ | |||
| 8 | */ | 8 | */ |
| 9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
| 10 | #include <linux/cpu.h> | 10 | #include <linux/cpu.h> |
| 11 | #include <linux/nmi.h> | ||
| 11 | #include <linux/init.h> | 12 | #include <linux/init.h> |
| 12 | #include <linux/delay.h> | 13 | #include <linux/delay.h> |
| 13 | #include <linux/freezer.h> | 14 | #include <linux/freezer.h> |
| @@ -23,8 +24,8 @@ static DEFINE_PER_CPU(unsigned long, touch_timestamp); | |||
| 23 | static DEFINE_PER_CPU(unsigned long, print_timestamp); | 24 | static DEFINE_PER_CPU(unsigned long, print_timestamp); |
| 24 | static DEFINE_PER_CPU(struct task_struct *, watchdog_task); | 25 | static DEFINE_PER_CPU(struct task_struct *, watchdog_task); |
| 25 | 26 | ||
| 26 | static int did_panic; | 27 | static int __read_mostly did_panic; |
| 27 | int softlockup_thresh = 10; | 28 | unsigned long __read_mostly softlockup_thresh = 60; |
| 28 | 29 | ||
| 29 | static int | 30 | static int |
| 30 | softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) | 31 | softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) |
| @@ -45,7 +46,7 @@ static struct notifier_block panic_block = { | |||
| 45 | */ | 46 | */ |
| 46 | static unsigned long get_timestamp(int this_cpu) | 47 | static unsigned long get_timestamp(int this_cpu) |
| 47 | { | 48 | { |
| 48 | return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */ | 49 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ |
| 49 | } | 50 | } |
| 50 | 51 | ||
| 51 | void touch_softlockup_watchdog(void) | 52 | void touch_softlockup_watchdog(void) |
| @@ -100,11 +101,7 @@ void softlockup_tick(void) | |||
| 100 | 101 | ||
| 101 | now = get_timestamp(this_cpu); | 102 | now = get_timestamp(this_cpu); |
| 102 | 103 | ||
| 103 | /* Wake up the high-prio watchdog task every second: */ | 104 | /* Warn about unreasonable delays: */ |
| 104 | if (now > (touch_timestamp + 1)) | ||
| 105 | wake_up_process(per_cpu(watchdog_task, this_cpu)); | ||
| 106 | |||
| 107 | /* Warn about unreasonable 10+ seconds delays: */ | ||
| 108 | if (now <= (touch_timestamp + softlockup_thresh)) | 105 | if (now <= (touch_timestamp + softlockup_thresh)) |
| 109 | return; | 106 | return; |
| 110 | 107 | ||
| @@ -122,11 +119,93 @@ void softlockup_tick(void) | |||
| 122 | } | 119 | } |
| 123 | 120 | ||
| 124 | /* | 121 | /* |
| 122 | * Have a reasonable limit on the number of tasks checked: | ||
| 123 | */ | ||
| 124 | unsigned long __read_mostly sysctl_hung_task_check_count = 1024; | ||
| 125 | |||
| 126 | /* | ||
| 127 | * Zero means infinite timeout - no checking done: | ||
| 128 | */ | ||
| 129 | unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; | ||
| 130 | |||
| 131 | unsigned long __read_mostly sysctl_hung_task_warnings = 10; | ||
| 132 | |||
| 133 | /* | ||
| 134 | * Only do the hung-tasks check on one CPU: | ||
| 135 | */ | ||
| 136 | static int check_cpu __read_mostly = -1; | ||
| 137 | |||
| 138 | static void check_hung_task(struct task_struct *t, unsigned long now) | ||
| 139 | { | ||
| 140 | unsigned long switch_count = t->nvcsw + t->nivcsw; | ||
| 141 | |||
| 142 | if (t->flags & PF_FROZEN) | ||
| 143 | return; | ||
| 144 | |||
| 145 | if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { | ||
| 146 | t->last_switch_count = switch_count; | ||
| 147 | t->last_switch_timestamp = now; | ||
| 148 | return; | ||
| 149 | } | ||
| 150 | if ((long)(now - t->last_switch_timestamp) < | ||
| 151 | sysctl_hung_task_timeout_secs) | ||
| 152 | return; | ||
| 153 | if (sysctl_hung_task_warnings < 0) | ||
| 154 | return; | ||
| 155 | sysctl_hung_task_warnings--; | ||
| 156 | |||
| 157 | /* | ||
| 158 | * Ok, the task did not get scheduled for more than 2 minutes, | ||
| 159 | * complain: | ||
| 160 | */ | ||
| 161 | printk(KERN_ERR "INFO: task %s:%d blocked for more than " | ||
| 162 | "%ld seconds.\n", t->comm, t->pid, | ||
| 163 | sysctl_hung_task_timeout_secs); | ||
| 164 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | ||
| 165 | " disables this message.\n"); | ||
| 166 | sched_show_task(t); | ||
| 167 | __debug_show_held_locks(t); | ||
| 168 | |||
| 169 | t->last_switch_timestamp = now; | ||
| 170 | touch_nmi_watchdog(); | ||
| 171 | } | ||
| 172 | |||
| 173 | /* | ||
| 174 | * Check whether a TASK_UNINTERRUPTIBLE task does not get woken up for | ||
| 175 | * a really long time (120 seconds). If that happens, print out | ||
| 176 | * a warning. | ||
| 177 | */ | ||
| 178 | static void check_hung_uninterruptible_tasks(int this_cpu) | ||
| 179 | { | ||
| 180 | int max_count = sysctl_hung_task_check_count; | ||
| 181 | unsigned long now = get_timestamp(this_cpu); | ||
| 182 | struct task_struct *g, *t; | ||
| 183 | |||
| 184 | /* | ||
| 185 | * If the system crashed already then all bets are off, | ||
| 186 | * do not report extra hung tasks: | ||
| 187 | */ | ||
| 188 | if ((tainted & TAINT_DIE) || did_panic) | ||
| 189 | return; | ||
| 190 | |||
| 191 | read_lock(&tasklist_lock); | ||
| 192 | do_each_thread(g, t) { | ||
| 193 | if (!--max_count) | ||
| 194 | break; | ||
| 195 | if (t->state & TASK_UNINTERRUPTIBLE) | ||
| 196 | check_hung_task(t, now); | ||
| 197 | } while_each_thread(g, t); | ||
| 198 | |||
| 199 | read_unlock(&tasklist_lock); | ||
| 200 | } | ||
| 201 | |||
| 202 | /* | ||
| 125 | * The watchdog thread - runs every second and touches the timestamp. | 203 | * The watchdog thread - runs every second and touches the timestamp. |
| 126 | */ | 204 | */ |
| 127 | static int watchdog(void *__bind_cpu) | 205 | static int watchdog(void *__bind_cpu) |
| 128 | { | 206 | { |
| 129 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 207 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
| 208 | int this_cpu = (long)__bind_cpu; | ||
| 130 | 209 | ||
| 131 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 210 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
| 132 | 211 | ||
| @@ -135,13 +214,18 @@ static int watchdog(void *__bind_cpu) | |||
| 135 | 214 | ||
| 136 | /* | 215 | /* |
| 137 | * Run briefly once per second to reset the softlockup timestamp. | 216 | * Run briefly once per second to reset the softlockup timestamp. |
| 138 | * If this gets delayed for more than 10 seconds then the | 217 | * If this gets delayed for more than 60 seconds then the |
| 139 | * debug-printout triggers in softlockup_tick(). | 218 | * debug-printout triggers in softlockup_tick(). |
| 140 | */ | 219 | */ |
| 141 | while (!kthread_should_stop()) { | 220 | while (!kthread_should_stop()) { |
| 142 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 143 | touch_softlockup_watchdog(); | 221 | touch_softlockup_watchdog(); |
| 144 | schedule(); | 222 | msleep_interruptible(10000); |
| 223 | |||
| 224 | if (this_cpu != check_cpu) | ||
| 225 | continue; | ||
| 226 | |||
| 227 | if (sysctl_hung_task_timeout_secs) | ||
| 228 | check_hung_uninterruptible_tasks(this_cpu); | ||
| 145 | } | 229 | } |
| 146 | 230 | ||
| 147 | return 0; | 231 | return 0; |
| @@ -171,6 +255,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 171 | break; | 255 | break; |
| 172 | case CPU_ONLINE: | 256 | case CPU_ONLINE: |
| 173 | case CPU_ONLINE_FROZEN: | 257 | case CPU_ONLINE_FROZEN: |
| 258 | check_cpu = any_online_cpu(cpu_online_map); | ||
| 174 | wake_up_process(per_cpu(watchdog_task, hotcpu)); | 259 | wake_up_process(per_cpu(watchdog_task, hotcpu)); |
| 175 | break; | 260 | break; |
| 176 | #ifdef CONFIG_HOTPLUG_CPU | 261 | #ifdef CONFIG_HOTPLUG_CPU |
| @@ -181,6 +266,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 181 | /* Unbind so it can run. Fall thru. */ | 266 | /* Unbind so it can run. Fall thru. */ |
| 182 | kthread_bind(per_cpu(watchdog_task, hotcpu), | 267 | kthread_bind(per_cpu(watchdog_task, hotcpu), |
| 183 | any_online_cpu(cpu_online_map)); | 268 | any_online_cpu(cpu_online_map)); |
| 269 | case CPU_DOWN_PREPARE: | ||
| 270 | case CPU_DOWN_PREPARE_FROZEN: | ||
| 271 | if (hotcpu == check_cpu) { | ||
| 272 | cpumask_t temp_cpu_online_map = cpu_online_map; | ||
| 273 | |||
| 274 | cpu_clear(hotcpu, temp_cpu_online_map); | ||
| 275 | check_cpu = any_online_cpu(temp_cpu_online_map); | ||
| 276 | } | ||
| 277 | break; | ||
| 184 | case CPU_DEAD: | 278 | case CPU_DEAD: |
| 185 | case CPU_DEAD_FROZEN: | 279 | case CPU_DEAD_FROZEN: |
| 186 | p = per_cpu(watchdog_task, hotcpu); | 280 | p = per_cpu(watchdog_task, hotcpu); |
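
The softlockup.c changes add a hung-task detector next to the existing soft-lockup watchdog: every ten seconds one designated CPU walks the task list and warns about any TASK_UNINTERRUPTIBLE task whose context-switch count has not moved for longer than hung_task_timeout_secs. The following stand-alone sketch shows just that core test, with plain user-space stand-ins for the task fields (not the kernel structures).

```c
/*
 * Illustration of the hung-task test above: a task counts as hung when
 * its total context-switch count (voluntary + involuntary) has not
 * changed for longer than the configured timeout.  The fields mirror
 * t->last_switch_count / t->last_switch_timestamp but are stand-ins.
 */
#include <stdio.h>
#include <stdbool.h>

struct task_sample {
        unsigned long nvcsw, nivcsw;            /* context-switch counters */
        unsigned long last_switch_count;        /* snapshot of their sum   */
        unsigned long last_switch_timestamp;    /* seconds                 */
};

static bool task_is_hung(struct task_sample *t, unsigned long now,
                         unsigned long timeout_secs)
{
        unsigned long switch_count = t->nvcsw + t->nivcsw;

        /* The task has been scheduled since the last check: resample. */
        if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
                t->last_switch_count = switch_count;
                t->last_switch_timestamp = now;
                return false;
        }

        /* A timeout of zero means "no checking at all". */
        if (!timeout_secs)
                return false;

        return (long)(now - t->last_switch_timestamp) >= (long)timeout_secs;
}

int main(void)
{
        struct task_sample t = { .nvcsw = 7, .nivcsw = 3 };

        task_is_hung(&t, 100, 120);             /* first sample            */
        printf("hung after 130s without a switch: %d\n",
               task_is_hung(&t, 230, 120));     /* prints 1                */
        return 0;
}
```
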
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 319821ef78af..51b5ee53571a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) | |||
| 203 | int ret; | 203 | int ret; |
| 204 | 204 | ||
| 205 | /* No CPUs can come up or down during this. */ | 205 | /* No CPUs can come up or down during this. */ |
| 206 | lock_cpu_hotplug(); | 206 | get_online_cpus(); |
| 207 | p = __stop_machine_run(fn, data, cpu); | 207 | p = __stop_machine_run(fn, data, cpu); |
| 208 | if (!IS_ERR(p)) | 208 | if (!IS_ERR(p)) |
| 209 | ret = kthread_stop(p); | 209 | ret = kthread_stop(p); |
| 210 | else | 210 | else |
| 211 | ret = PTR_ERR(p); | 211 | ret = PTR_ERR(p); |
| 212 | unlock_cpu_hotplug(); | 212 | put_online_cpus(); |
| 213 | 213 | ||
| 214 | return ret; | 214 | return ret; |
| 215 | } | 215 | } |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c68f68dcc605..8e96558cb8f3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -81,6 +81,7 @@ extern int compat_log; | |||
| 81 | extern int maps_protect; | 81 | extern int maps_protect; |
| 82 | extern int sysctl_stat_interval; | 82 | extern int sysctl_stat_interval; |
| 83 | extern int audit_argv_kb; | 83 | extern int audit_argv_kb; |
| 84 | extern int latencytop_enabled; | ||
| 84 | 85 | ||
| 85 | /* Constants used for minimum and maximum */ | 86 | /* Constants used for minimum and maximum */ |
| 86 | #ifdef CONFIG_DETECT_SOFTLOCKUP | 87 | #ifdef CONFIG_DETECT_SOFTLOCKUP |
| @@ -306,9 +307,43 @@ static struct ctl_table kern_table[] = { | |||
| 306 | .procname = "sched_nr_migrate", | 307 | .procname = "sched_nr_migrate", |
| 307 | .data = &sysctl_sched_nr_migrate, | 308 | .data = &sysctl_sched_nr_migrate, |
| 308 | .maxlen = sizeof(unsigned int), | 309 | .maxlen = sizeof(unsigned int), |
| 309 | .mode = 644, | 310 | .mode = 0644, |
| 311 | .proc_handler = &proc_dointvec, | ||
| 312 | }, | ||
| 313 | { | ||
| 314 | .ctl_name = CTL_UNNUMBERED, | ||
| 315 | .procname = "sched_rt_period_ms", | ||
| 316 | .data = &sysctl_sched_rt_period, | ||
| 317 | .maxlen = sizeof(unsigned int), | ||
| 318 | .mode = 0644, | ||
| 310 | .proc_handler = &proc_dointvec, | 319 | .proc_handler = &proc_dointvec, |
| 311 | }, | 320 | }, |
| 321 | { | ||
| 322 | .ctl_name = CTL_UNNUMBERED, | ||
| 323 | .procname = "sched_rt_ratio", | ||
| 324 | .data = &sysctl_sched_rt_ratio, | ||
| 325 | .maxlen = sizeof(unsigned int), | ||
| 326 | .mode = 0644, | ||
| 327 | .proc_handler = &proc_dointvec, | ||
| 328 | }, | ||
| 329 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | ||
| 330 | { | ||
| 331 | .ctl_name = CTL_UNNUMBERED, | ||
| 332 | .procname = "sched_min_bal_int_shares", | ||
| 333 | .data = &sysctl_sched_min_bal_int_shares, | ||
| 334 | .maxlen = sizeof(unsigned int), | ||
| 335 | .mode = 0644, | ||
| 336 | .proc_handler = &proc_dointvec, | ||
| 337 | }, | ||
| 338 | { | ||
| 339 | .ctl_name = CTL_UNNUMBERED, | ||
| 340 | .procname = "sched_max_bal_int_shares", | ||
| 341 | .data = &sysctl_sched_max_bal_int_shares, | ||
| 342 | .maxlen = sizeof(unsigned int), | ||
| 343 | .mode = 0644, | ||
| 344 | .proc_handler = &proc_dointvec, | ||
| 345 | }, | ||
| 346 | #endif | ||
| 312 | #endif | 347 | #endif |
| 313 | { | 348 | { |
| 314 | .ctl_name = CTL_UNNUMBERED, | 349 | .ctl_name = CTL_UNNUMBERED, |
| @@ -382,6 +417,15 @@ static struct ctl_table kern_table[] = { | |||
| 382 | .proc_handler = &proc_dointvec_taint, | 417 | .proc_handler = &proc_dointvec_taint, |
| 383 | }, | 418 | }, |
| 384 | #endif | 419 | #endif |
| 420 | #ifdef CONFIG_LATENCYTOP | ||
| 421 | { | ||
| 422 | .procname = "latencytop", | ||
| 423 | .data = &latencytop_enabled, | ||
| 424 | .maxlen = sizeof(int), | ||
| 425 | .mode = 0644, | ||
| 426 | .proc_handler = &proc_dointvec, | ||
| 427 | }, | ||
| 428 | #endif | ||
| 385 | #ifdef CONFIG_SECURITY_CAPABILITIES | 429 | #ifdef CONFIG_SECURITY_CAPABILITIES |
| 386 | { | 430 | { |
| 387 | .procname = "cap-bound", | 431 | .procname = "cap-bound", |
| @@ -728,13 +772,40 @@ static struct ctl_table kern_table[] = { | |||
| 728 | .ctl_name = CTL_UNNUMBERED, | 772 | .ctl_name = CTL_UNNUMBERED, |
| 729 | .procname = "softlockup_thresh", | 773 | .procname = "softlockup_thresh", |
| 730 | .data = &softlockup_thresh, | 774 | .data = &softlockup_thresh, |
| 731 | .maxlen = sizeof(int), | 775 | .maxlen = sizeof(unsigned long), |
| 732 | .mode = 0644, | 776 | .mode = 0644, |
| 733 | .proc_handler = &proc_dointvec_minmax, | 777 | .proc_handler = &proc_doulongvec_minmax, |
| 734 | .strategy = &sysctl_intvec, | 778 | .strategy = &sysctl_intvec, |
| 735 | .extra1 = &one, | 779 | .extra1 = &one, |
| 736 | .extra2 = &sixty, | 780 | .extra2 = &sixty, |
| 737 | }, | 781 | }, |
| 782 | { | ||
| 783 | .ctl_name = CTL_UNNUMBERED, | ||
| 784 | .procname = "hung_task_check_count", | ||
| 785 | .data = &sysctl_hung_task_check_count, | ||
| 786 | .maxlen = sizeof(unsigned long), | ||
| 787 | .mode = 0644, | ||
| 788 | .proc_handler = &proc_doulongvec_minmax, | ||
| 789 | .strategy = &sysctl_intvec, | ||
| 790 | }, | ||
| 791 | { | ||
| 792 | .ctl_name = CTL_UNNUMBERED, | ||
| 793 | .procname = "hung_task_timeout_secs", | ||
| 794 | .data = &sysctl_hung_task_timeout_secs, | ||
| 795 | .maxlen = sizeof(unsigned long), | ||
| 796 | .mode = 0644, | ||
| 797 | .proc_handler = &proc_doulongvec_minmax, | ||
| 798 | .strategy = &sysctl_intvec, | ||
| 799 | }, | ||
| 800 | { | ||
| 801 | .ctl_name = CTL_UNNUMBERED, | ||
| 802 | .procname = "hung_task_warnings", | ||
| 803 | .data = &sysctl_hung_task_warnings, | ||
| 804 | .maxlen = sizeof(unsigned long), | ||
| 805 | .mode = 0644, | ||
| 806 | .proc_handler = &proc_doulongvec_minmax, | ||
| 807 | .strategy = &sysctl_intvec, | ||
| 808 | }, | ||
| 738 | #endif | 809 | #endif |
| 739 | #ifdef CONFIG_COMPAT | 810 | #ifdef CONFIG_COMPAT |
| 740 | { | 811 | { |
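
The new kern_table entries above surface as files under /proc/sys/kernel (hung_task_timeout_secs, hung_task_check_count, hung_task_warnings, and so on). A minimal sketch of reading one of them from user space — it assumes a kernel built with the corresponding options, and error handling is trimmed to the essentials:

```c
/* Read the hung-task timeout exported by the sysctl entry added above. */
#include <stdio.h>

int main(void)
{
        unsigned long secs;
        FILE *f = fopen("/proc/sys/kernel/hung_task_timeout_secs", "r");

        if (!f) {
                perror("hung_task_timeout_secs");
                return 1;
        }
        if (fscanf(f, "%lu", &secs) == 1)
                printf("hung task timeout: %lu seconds\n", secs);
        fclose(f);
        return 0;
}
```
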
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cb89fa8db110..1a21b6fdb674 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -153,6 +153,7 @@ void tick_nohz_update_jiffies(void) | |||
| 153 | void tick_nohz_stop_sched_tick(void) | 153 | void tick_nohz_stop_sched_tick(void) |
| 154 | { | 154 | { |
| 155 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; | 155 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; |
| 156 | unsigned long rt_jiffies; | ||
| 156 | struct tick_sched *ts; | 157 | struct tick_sched *ts; |
| 157 | ktime_t last_update, expires, now, delta; | 158 | ktime_t last_update, expires, now, delta; |
| 158 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 159 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
| @@ -216,6 +217,10 @@ void tick_nohz_stop_sched_tick(void) | |||
| 216 | next_jiffies = get_next_timer_interrupt(last_jiffies); | 217 | next_jiffies = get_next_timer_interrupt(last_jiffies); |
| 217 | delta_jiffies = next_jiffies - last_jiffies; | 218 | delta_jiffies = next_jiffies - last_jiffies; |
| 218 | 219 | ||
| 220 | rt_jiffies = rt_needs_cpu(cpu); | ||
| 221 | if (rt_jiffies && rt_jiffies < delta_jiffies) | ||
| 222 | delta_jiffies = rt_jiffies; | ||
| 223 | |||
| 219 | if (rcu_needs_cpu(cpu)) | 224 | if (rcu_needs_cpu(cpu)) |
| 220 | delta_jiffies = 1; | 225 | delta_jiffies = 1; |
| 221 | /* | 226 | /* |
| @@ -509,7 +514,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
| 509 | { | 514 | { |
| 510 | struct tick_sched *ts = | 515 | struct tick_sched *ts = |
| 511 | container_of(timer, struct tick_sched, sched_timer); | 516 | container_of(timer, struct tick_sched, sched_timer); |
| 512 | struct hrtimer_cpu_base *base = timer->base->cpu_base; | ||
| 513 | struct pt_regs *regs = get_irq_regs(); | 517 | struct pt_regs *regs = get_irq_regs(); |
| 514 | ktime_t now = ktime_get(); | 518 | ktime_t now = ktime_get(); |
| 515 | int cpu = smp_processor_id(); | 519 | int cpu = smp_processor_id(); |
| @@ -547,15 +551,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
| 547 | touch_softlockup_watchdog(); | 551 | touch_softlockup_watchdog(); |
| 548 | ts->idle_jiffies++; | 552 | ts->idle_jiffies++; |
| 549 | } | 553 | } |
| 550 | /* | ||
| 551 | * update_process_times() might take tasklist_lock, hence | ||
| 552 | * drop the base lock. sched-tick hrtimers are per-CPU and | ||
| 553 | * never accessible by userspace APIs, so this is safe to do. | ||
| 554 | */ | ||
| 555 | spin_unlock(&base->lock); | ||
| 556 | update_process_times(user_mode(regs)); | 554 | update_process_times(user_mode(regs)); |
| 557 | profile_tick(CPU_PROFILING); | 555 | profile_tick(CPU_PROFILING); |
| 558 | spin_lock(&base->lock); | ||
| 559 | } | 556 | } |
| 560 | 557 | ||
| 561 | /* Do not restart, when we are in the idle loop */ | 558 | /* Do not restart, when we are in the idle loop */ |
diff --git a/kernel/timer.c b/kernel/timer.c index 2a00c22203f3..f739dfb539ce 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -896,7 +896,7 @@ static void run_timer_softirq(struct softirq_action *h) | |||
| 896 | { | 896 | { |
| 897 | tvec_base_t *base = __get_cpu_var(tvec_bases); | 897 | tvec_base_t *base = __get_cpu_var(tvec_bases); |
| 898 | 898 | ||
| 899 | hrtimer_run_queues(); | 899 | hrtimer_run_pending(); |
| 900 | 900 | ||
| 901 | if (time_after_eq(jiffies, base->timer_jiffies)) | 901 | if (time_after_eq(jiffies, base->timer_jiffies)) |
| 902 | __run_timers(base); | 902 | __run_timers(base); |
| @@ -907,6 +907,7 @@ static void run_timer_softirq(struct softirq_action *h) | |||
| 907 | */ | 907 | */ |
| 908 | void run_local_timers(void) | 908 | void run_local_timers(void) |
| 909 | { | 909 | { |
| 910 | hrtimer_run_queues(); | ||
| 910 | raise_softirq(TIMER_SOFTIRQ); | 911 | raise_softirq(TIMER_SOFTIRQ); |
| 911 | softlockup_tick(); | 912 | softlockup_tick(); |
| 912 | } | 913 | } |
diff --git a/kernel/user.c b/kernel/user.c index ab4fd706993b..bc1c48d35cb3 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
| @@ -319,7 +319,7 @@ void free_uid(struct user_struct *up) | |||
| 319 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | 319 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) |
| 320 | { | 320 | { |
| 321 | struct hlist_head *hashent = uidhashentry(ns, uid); | 321 | struct hlist_head *hashent = uidhashentry(ns, uid); |
| 322 | struct user_struct *up; | 322 | struct user_struct *up, *new; |
| 323 | 323 | ||
| 324 | /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() | 324 | /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() |
| 325 | * atomic. | 325 | * atomic. |
| @@ -331,13 +331,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
| 331 | spin_unlock_irq(&uidhash_lock); | 331 | spin_unlock_irq(&uidhash_lock); |
| 332 | 332 | ||
| 333 | if (!up) { | 333 | if (!up) { |
| 334 | struct user_struct *new; | ||
| 335 | |||
| 336 | new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); | 334 | new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); |
| 337 | if (!new) { | 335 | if (!new) |
| 338 | uids_mutex_unlock(); | 336 | goto out_unlock; |
| 339 | return NULL; | ||
| 340 | } | ||
| 341 | 337 | ||
| 342 | new->uid = uid; | 338 | new->uid = uid; |
| 343 | atomic_set(&new->__count, 1); | 339 | atomic_set(&new->__count, 1); |
| @@ -353,28 +349,14 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
| 353 | #endif | 349 | #endif |
| 354 | new->locked_shm = 0; | 350 | new->locked_shm = 0; |
| 355 | 351 | ||
| 356 | if (alloc_uid_keyring(new, current) < 0) { | 352 | if (alloc_uid_keyring(new, current) < 0) |
| 357 | kmem_cache_free(uid_cachep, new); | 353 | goto out_free_user; |
| 358 | uids_mutex_unlock(); | ||
| 359 | return NULL; | ||
| 360 | } | ||
| 361 | 354 | ||
| 362 | if (sched_create_user(new) < 0) { | 355 | if (sched_create_user(new) < 0) |
| 363 | key_put(new->uid_keyring); | 356 | goto out_put_keys; |
| 364 | key_put(new->session_keyring); | ||
| 365 | kmem_cache_free(uid_cachep, new); | ||
| 366 | uids_mutex_unlock(); | ||
| 367 | return NULL; | ||
| 368 | } | ||
| 369 | 357 | ||
| 370 | if (uids_user_create(new)) { | 358 | if (uids_user_create(new)) |
| 371 | sched_destroy_user(new); | 359 | goto out_destoy_sched; |
| 372 | key_put(new->uid_keyring); | ||
| 373 | key_put(new->session_keyring); | ||
| 374 | kmem_cache_free(uid_cachep, new); | ||
| 375 | uids_mutex_unlock(); | ||
| 376 | return NULL; | ||
| 377 | } | ||
| 378 | 360 | ||
| 379 | /* | 361 | /* |
| 380 | * Before adding this, check whether we raced | 362 | * Before adding this, check whether we raced |
| @@ -402,6 +384,17 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
| 402 | uids_mutex_unlock(); | 384 | uids_mutex_unlock(); |
| 403 | 385 | ||
| 404 | return up; | 386 | return up; |
| 387 | |||
| 388 | out_destoy_sched: | ||
| 389 | sched_destroy_user(new); | ||
| 390 | out_put_keys: | ||
| 391 | key_put(new->uid_keyring); | ||
| 392 | key_put(new->session_keyring); | ||
| 393 | out_free_user: | ||
| 394 | kmem_cache_free(uid_cachep, new); | ||
| 395 | out_unlock: | ||
| 396 | uids_mutex_unlock(); | ||
| 397 | return NULL; | ||
| 405 | } | 398 | } |
| 406 | 399 | ||
| 407 | void switch_uid(struct user_struct *new_user) | 400 | void switch_uid(struct user_struct *new_user) |
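
The alloc_uid() rework is a pure cleanup: instead of repeating the key_put()/kmem_cache_free()/uids_mutex_unlock() sequence in every failure branch, each failure now jumps to a chain of labels that unwinds whatever was set up, in reverse order. A generic sketch of the same pattern with hypothetical resources (plain malloc() here, not the uid code):

```c
/*
 * Generic sketch of the goto-based error unwinding used above.  Each
 * failure jumps to the label that releases everything acquired so far,
 * in reverse order, so the cleanup code exists exactly once.
 */
#include <stdio.h>
#include <stdlib.h>

static int setup(void)
{
        char *a, *b, *c;

        a = malloc(16);
        if (!a)
                goto out;

        b = malloc(16);
        if (!b)
                goto out_free_a;

        c = malloc(16);
        if (!c)
                goto out_free_b;

        /* ... use a, b and c, then release them on the success path ... */
        free(c);
        free(b);
        free(a);
        return 0;

out_free_b:
        free(b);
out_free_a:
        free(a);
out:
        return -1;
}

int main(void)
{
        return setup() ? EXIT_FAILURE : EXIT_SUCCESS;
}
```
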
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8db0b597509e..52db48e7f6e7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -67,9 +67,8 @@ struct workqueue_struct { | |||
| 67 | #endif | 67 | #endif |
| 68 | }; | 68 | }; |
| 69 | 69 | ||
| 70 | /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove | 70 | /* Serializes the accesses to the list of workqueues. */ |
| 71 | threads to each one as cpus come/go. */ | 71 | static DEFINE_SPINLOCK(workqueue_lock); |
| 72 | static DEFINE_MUTEX(workqueue_mutex); | ||
| 73 | static LIST_HEAD(workqueues); | 72 | static LIST_HEAD(workqueues); |
| 74 | 73 | ||
| 75 | static int singlethread_cpu __read_mostly; | 74 | static int singlethread_cpu __read_mostly; |
| @@ -592,8 +591,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on); | |||
| 592 | * Returns zero on success. | 591 | * Returns zero on success. |
| 593 | * Returns -ve errno on failure. | 592 | * Returns -ve errno on failure. |
| 594 | * | 593 | * |
| 595 | * Appears to be racy against CPU hotplug. | ||
| 596 | * | ||
| 597 | * schedule_on_each_cpu() is very slow. | 594 | * schedule_on_each_cpu() is very slow. |
| 598 | */ | 595 | */ |
| 599 | int schedule_on_each_cpu(work_func_t func) | 596 | int schedule_on_each_cpu(work_func_t func) |
| @@ -605,7 +602,7 @@ int schedule_on_each_cpu(work_func_t func) | |||
| 605 | if (!works) | 602 | if (!works) |
| 606 | return -ENOMEM; | 603 | return -ENOMEM; |
| 607 | 604 | ||
| 608 | preempt_disable(); /* CPU hotplug */ | 605 | get_online_cpus(); |
| 609 | for_each_online_cpu(cpu) { | 606 | for_each_online_cpu(cpu) { |
| 610 | struct work_struct *work = per_cpu_ptr(works, cpu); | 607 | struct work_struct *work = per_cpu_ptr(works, cpu); |
| 611 | 608 | ||
| @@ -613,8 +610,8 @@ int schedule_on_each_cpu(work_func_t func) | |||
| 613 | set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); | 610 | set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); |
| 614 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); | 611 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); |
| 615 | } | 612 | } |
| 616 | preempt_enable(); | ||
| 617 | flush_workqueue(keventd_wq); | 613 | flush_workqueue(keventd_wq); |
| 614 | put_online_cpus(); | ||
| 618 | free_percpu(works); | 615 | free_percpu(works); |
| 619 | return 0; | 616 | return 0; |
| 620 | } | 617 | } |
| @@ -750,8 +747,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name, | |||
| 750 | err = create_workqueue_thread(cwq, singlethread_cpu); | 747 | err = create_workqueue_thread(cwq, singlethread_cpu); |
| 751 | start_workqueue_thread(cwq, -1); | 748 | start_workqueue_thread(cwq, -1); |
| 752 | } else { | 749 | } else { |
| 753 | mutex_lock(&workqueue_mutex); | 750 | get_online_cpus(); |
| 751 | spin_lock(&workqueue_lock); | ||
| 754 | list_add(&wq->list, &workqueues); | 752 | list_add(&wq->list, &workqueues); |
| 753 | spin_unlock(&workqueue_lock); | ||
| 755 | 754 | ||
| 756 | for_each_possible_cpu(cpu) { | 755 | for_each_possible_cpu(cpu) { |
| 757 | cwq = init_cpu_workqueue(wq, cpu); | 756 | cwq = init_cpu_workqueue(wq, cpu); |
| @@ -760,7 +759,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, | |||
| 760 | err = create_workqueue_thread(cwq, cpu); | 759 | err = create_workqueue_thread(cwq, cpu); |
| 761 | start_workqueue_thread(cwq, cpu); | 760 | start_workqueue_thread(cwq, cpu); |
| 762 | } | 761 | } |
| 763 | mutex_unlock(&workqueue_mutex); | 762 | put_online_cpus(); |
| 764 | } | 763 | } |
| 765 | 764 | ||
| 766 | if (err) { | 765 | if (err) { |
| @@ -775,7 +774,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | |||
| 775 | { | 774 | { |
| 776 | /* | 775 | /* |
| 777 | * Our caller is either destroy_workqueue() or CPU_DEAD, | 776 | * Our caller is either destroy_workqueue() or CPU_DEAD, |
| 778 | * workqueue_mutex protects cwq->thread | 777 | * get_online_cpus() protects cwq->thread. |
| 779 | */ | 778 | */ |
| 780 | if (cwq->thread == NULL) | 779 | if (cwq->thread == NULL) |
| 781 | return; | 780 | return; |
| @@ -810,9 +809,11 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
| 810 | struct cpu_workqueue_struct *cwq; | 809 | struct cpu_workqueue_struct *cwq; |
| 811 | int cpu; | 810 | int cpu; |
| 812 | 811 | ||
| 813 | mutex_lock(&workqueue_mutex); | 812 | get_online_cpus(); |
| 813 | spin_lock(&workqueue_lock); | ||
| 814 | list_del(&wq->list); | 814 | list_del(&wq->list); |
| 815 | mutex_unlock(&workqueue_mutex); | 815 | spin_unlock(&workqueue_lock); |
| 816 | put_online_cpus(); | ||
| 816 | 817 | ||
| 817 | for_each_cpu_mask(cpu, *cpu_map) { | 818 | for_each_cpu_mask(cpu, *cpu_map) { |
| 818 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); | 819 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
| @@ -835,13 +836,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
| 835 | action &= ~CPU_TASKS_FROZEN; | 836 | action &= ~CPU_TASKS_FROZEN; |
| 836 | 837 | ||
| 837 | switch (action) { | 838 | switch (action) { |
| 838 | case CPU_LOCK_ACQUIRE: | ||
| 839 | mutex_lock(&workqueue_mutex); | ||
| 840 | return NOTIFY_OK; | ||
| 841 | |||
| 842 | case CPU_LOCK_RELEASE: | ||
| 843 | mutex_unlock(&workqueue_mutex); | ||
| 844 | return NOTIFY_OK; | ||
| 845 | 839 | ||
| 846 | case CPU_UP_PREPARE: | 840 | case CPU_UP_PREPARE: |
| 847 | cpu_set(cpu, cpu_populated_map); | 841 | cpu_set(cpu, cpu_populated_map); |
| @@ -854,7 +848,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
| 854 | case CPU_UP_PREPARE: | 848 | case CPU_UP_PREPARE: |
| 855 | if (!create_workqueue_thread(cwq, cpu)) | 849 | if (!create_workqueue_thread(cwq, cpu)) |
| 856 | break; | 850 | break; |
| 857 | printk(KERN_ERR "workqueue for %i failed\n", cpu); | 851 | printk(KERN_ERR "workqueue [%s] for %i failed\n", |
| 852 | wq->name, cpu); | ||
| 858 | return NOTIFY_BAD; | 853 | return NOTIFY_BAD; |
| 859 | 854 | ||
| 860 | case CPU_ONLINE: | 855 | case CPU_ONLINE: |
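
The workqueue.c conversion splits the old all-purpose workqueue_mutex in two: get_online_cpus()/put_online_cpus() keeps CPUs from coming or going across the (sleeping) thread setup, while a small workqueue_lock spinlock guards only the workqueues list. A rough user-space analogy of that split is sketched below; the reader/writer lock merely models hotplug exclusion and is not the kernel API (build with -pthread).

```c
/*
 * Analogy only: get_online_cpus()/put_online_cpus() act like the read
 * side of a lock that CPU hotplug takes for writing, while a separate,
 * narrow lock protects just the workqueue list.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t cpu_hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t  workqueue_lock   = PTHREAD_MUTEX_INITIALIZER;

static int nr_online_cpus = 4;              /* stand-in for cpu_online_map */
static int nr_workqueues;                   /* stand-in for the list       */

static void create_workqueue_sketch(void)
{
        /* Block "hotplug" for the whole, possibly sleeping, setup... */
        pthread_rwlock_rdlock(&cpu_hotplug_lock);

        /* ...but hold the list lock only around the list update. */
        pthread_mutex_lock(&workqueue_lock);
        nr_workqueues++;
        pthread_mutex_unlock(&workqueue_lock);

        for (int cpu = 0; cpu < nr_online_cpus; cpu++)
                printf("create worker thread for cpu %d\n", cpu);

        pthread_rwlock_unlock(&cpu_hotplug_lock);
}

int main(void)
{
        create_workqueue_sketch();
        return 0;
}
```

In the real code the "read side" is a refcount taken by get_online_cpus(), but the effect is the same: hotplug is excluded for the whole section while the list lock stays narrow.
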
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a60109307d32..14fb355e3caa 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug | |||
| @@ -517,4 +517,18 @@ config FAULT_INJECTION_STACKTRACE_FILTER | |||
| 517 | help | 517 | help |
| 518 | Provide stacktrace filter for fault-injection capabilities | 518 | Provide stacktrace filter for fault-injection capabilities |
| 519 | 519 | ||
| 520 | config LATENCYTOP | ||
| 521 | bool "Latency measuring infrastructure" | ||
| 522 | select FRAME_POINTER if !MIPS | ||
| 523 | select KALLSYMS | ||
| 524 | select KALLSYMS_ALL | ||
| 525 | select STACKTRACE | ||
| 526 | select SCHEDSTATS | ||
| 527 | select SCHED_DEBUG | ||
| 528 | depends on X86 || X86_64 | ||
| 529 | help | ||
| 530 | Enable this option if you want to use the LatencyTOP tool | ||
| 531 | to find out which userspace code is blocking on what kernel operations. | ||
| 532 | |||
| 533 | |||
| 520 | source "samples/Kconfig" | 534 | source "samples/Kconfig" |
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c index f73e2f8c308f..812dbf00844b 100644 --- a/lib/kernel_lock.c +++ b/lib/kernel_lock.c | |||
| @@ -9,7 +9,6 @@ | |||
| 9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
| 10 | #include <linux/kallsyms.h> | 10 | #include <linux/kallsyms.h> |
| 11 | 11 | ||
| 12 | #ifdef CONFIG_PREEMPT_BKL | ||
| 13 | /* | 12 | /* |
| 14 | * The 'big kernel semaphore' | 13 | * The 'big kernel semaphore' |
| 15 | * | 14 | * |
| @@ -86,128 +85,6 @@ void __lockfunc unlock_kernel(void) | |||
| 86 | up(&kernel_sem); | 85 | up(&kernel_sem); |
| 87 | } | 86 | } |
| 88 | 87 | ||
| 89 | #else | ||
| 90 | |||
| 91 | /* | ||
| 92 | * The 'big kernel lock' | ||
| 93 | * | ||
| 94 | * This spinlock is taken and released recursively by lock_kernel() | ||
| 95 | * and unlock_kernel(). It is transparently dropped and reacquired | ||
| 96 | * over schedule(). It is used to protect legacy code that hasn't | ||
| 97 | * been migrated to a proper locking design yet. | ||
| 98 | * | ||
| 99 | * Don't use in new code. | ||
| 100 | */ | ||
| 101 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag); | ||
| 102 | |||
| 103 | |||
| 104 | /* | ||
| 105 | * Acquire/release the underlying lock from the scheduler. | ||
| 106 | * | ||
| 107 | * This is called with preemption disabled, and should | ||
| 108 | * return an error value if it cannot get the lock and | ||
| 109 | * TIF_NEED_RESCHED gets set. | ||
| 110 | * | ||
| 111 | * If it successfully gets the lock, it should increment | ||
| 112 | * the preemption count like any spinlock does. | ||
| 113 | * | ||
| 114 | * (This works on UP too - _raw_spin_trylock will never | ||
| 115 | * return false in that case) | ||
| 116 | */ | ||
| 117 | int __lockfunc __reacquire_kernel_lock(void) | ||
| 118 | { | ||
| 119 | while (!_raw_spin_trylock(&kernel_flag)) { | ||
| 120 | if (test_thread_flag(TIF_NEED_RESCHED)) | ||
| 121 | return -EAGAIN; | ||
| 122 | cpu_relax(); | ||
| 123 | } | ||
| 124 | preempt_disable(); | ||
| 125 | return 0; | ||
| 126 | } | ||
| 127 | |||
| 128 | void __lockfunc __release_kernel_lock(void) | ||
| 129 | { | ||
| 130 | _raw_spin_unlock(&kernel_flag); | ||
| 131 | preempt_enable_no_resched(); | ||
| 132 | } | ||
| 133 | |||
| 134 | /* | ||
| 135 | * These are the BKL spinlocks - we try to be polite about preemption. | ||
| 136 | * If SMP is not on (ie UP preemption), this all goes away because the | ||
| 137 | * _raw_spin_trylock() will always succeed. | ||
| 138 | */ | ||
| 139 | #ifdef CONFIG_PREEMPT | ||
| 140 | static inline void __lock_kernel(void) | ||
| 141 | { | ||
| 142 | preempt_disable(); | ||
| 143 | if (unlikely(!_raw_spin_trylock(&kernel_flag))) { | ||
| 144 | /* | ||
| 145 | * If preemption was disabled even before this | ||
| 146 | * was called, there's nothing we can be polite | ||
| 147 | * about - just spin. | ||
| 148 | */ | ||
| 149 | if (preempt_count() > 1) { | ||
| 150 | _raw_spin_lock(&kernel_flag); | ||
| 151 | return; | ||
| 152 | } | ||
| 153 | |||
| 154 | /* | ||
| 155 | * Otherwise, let's wait for the kernel lock | ||
| 156 | * with preemption enabled.. | ||
| 157 | */ | ||
| 158 | do { | ||
| 159 | preempt_enable(); | ||
| 160 | while (spin_is_locked(&kernel_flag)) | ||
| 161 | cpu_relax(); | ||
| 162 | preempt_disable(); | ||
| 163 | } while (!_raw_spin_trylock(&kernel_flag)); | ||
| 164 | } | ||
| 165 | } | ||
| 166 | |||
| 167 | #else | ||
| 168 | |||
| 169 | /* | ||
| 170 | * Non-preemption case - just get the spinlock | ||
| 171 | */ | ||
| 172 | static inline void __lock_kernel(void) | ||
| 173 | { | ||
| 174 | _raw_spin_lock(&kernel_flag); | ||
| 175 | } | ||
| 176 | #endif | ||
| 177 | |||
| 178 | static inline void __unlock_kernel(void) | ||
| 179 | { | ||
| 180 | /* | ||
| 181 | * the BKL is not covered by lockdep, so we open-code the | ||
| 182 | * unlocking sequence (and thus avoid the dep-chain ops): | ||
| 183 | */ | ||
| 184 | _raw_spin_unlock(&kernel_flag); | ||
| 185 | preempt_enable(); | ||
| 186 | } | ||
| 187 | |||
| 188 | /* | ||
| 189 | * Getting the big kernel lock. | ||
| 190 | * | ||
| 191 | * This cannot happen asynchronously, so we only need to | ||
| 192 | * worry about other CPU's. | ||
| 193 | */ | ||
| 194 | void __lockfunc lock_kernel(void) | ||
| 195 | { | ||
| 196 | int depth = current->lock_depth+1; | ||
| 197 | if (likely(!depth)) | ||
| 198 | __lock_kernel(); | ||
| 199 | current->lock_depth = depth; | ||
| 200 | } | ||
| 201 | |||
| 202 | void __lockfunc unlock_kernel(void) | ||
| 203 | { | ||
| 204 | BUG_ON(current->lock_depth < 0); | ||
| 205 | if (likely(--current->lock_depth < 0)) | ||
| 206 | __unlock_kernel(); | ||
| 207 | } | ||
| 208 | |||
| 209 | #endif | ||
| 210 | |||
| 211 | EXPORT_SYMBOL(lock_kernel); | 88 | EXPORT_SYMBOL(lock_kernel); |
| 212 | EXPORT_SYMBOL(unlock_kernel); | 89 | EXPORT_SYMBOL(unlock_kernel); |
| 213 | 90 | ||
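
With the spinlock-based BKL gone, what remains in lib/kernel_lock.c is the semaphore-based variant, which (like the removed code) tracks recursion with a per-task lock depth so that only the outermost lock_kernel()/unlock_kernel() pair touches the underlying lock. A user-space sketch of that depth-counting idea — illustrative only, with a pthread mutex standing in for the kernel semaphore:

```c
/*
 * Depth-counted recursive locking in miniature: nested acquisitions
 * only bump a per-thread counter (mirroring current->lock_depth), and
 * the real lock is taken/released only on the outermost transition.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
static __thread int lock_depth = -1;        /* -1 means "not held" */

static void lock_big(void)
{
        if (++lock_depth == 0)              /* outermost acquisition */
                pthread_mutex_lock(&big_lock);
}

static void unlock_big(void)
{
        if (--lock_depth < 0)               /* outermost release */
                pthread_mutex_unlock(&big_lock);
}

int main(void)
{
        lock_big();
        lock_big();                         /* nested: no deadlock */
        unlock_big();
        unlock_big();
        printf("lock depth back to %d\n", lock_depth);
        return 0;
}
```
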
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 91a081a82f55..96473b482099 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -286,7 +286,7 @@ static void __oom_kill_task(struct task_struct *p, int verbose) | |||
| 286 | * all the memory it needs. That way it should be able to | 286 | * all the memory it needs. That way it should be able to |
| 287 | * exit() and clear out its resources quickly... | 287 | * exit() and clear out its resources quickly... |
| 288 | */ | 288 | */ |
| 289 | p->time_slice = HZ; | 289 | p->rt.time_slice = HZ; |
| 290 | set_tsk_thread_flag(p, TIF_MEMDIE); | 290 | set_tsk_thread_flag(p, TIF_MEMDIE); |
| 291 | 291 | ||
| 292 | force_sig(SIGKILL, p); | 292 | force_sig(SIGKILL, p); |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
| @@ -730,8 +730,7 @@ static inline void init_lock_keys(void) | |||
| 730 | #endif | 730 | #endif |
| 731 | 731 | ||
| 732 | /* | 732 | /* |
| 733 | * 1. Guard access to the cache-chain. | 733 | * Guard access to the cache-chain. |
| 734 | * 2. Protect sanity of cpu_online_map against cpu hotplug events | ||
| 735 | */ | 734 | */ |
| 736 | static DEFINE_MUTEX(cache_chain_mutex); | 735 | static DEFINE_MUTEX(cache_chain_mutex); |
| 737 | static struct list_head cache_chain; | 736 | static struct list_head cache_chain; |
| @@ -1331,12 +1330,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
| 1331 | int err = 0; | 1330 | int err = 0; |
| 1332 | 1331 | ||
| 1333 | switch (action) { | 1332 | switch (action) { |
| 1334 | case CPU_LOCK_ACQUIRE: | ||
| 1335 | mutex_lock(&cache_chain_mutex); | ||
| 1336 | break; | ||
| 1337 | case CPU_UP_PREPARE: | 1333 | case CPU_UP_PREPARE: |
| 1338 | case CPU_UP_PREPARE_FROZEN: | 1334 | case CPU_UP_PREPARE_FROZEN: |
| 1335 | mutex_lock(&cache_chain_mutex); | ||
| 1339 | err = cpuup_prepare(cpu); | 1336 | err = cpuup_prepare(cpu); |
| 1337 | mutex_unlock(&cache_chain_mutex); | ||
| 1340 | break; | 1338 | break; |
| 1341 | case CPU_ONLINE: | 1339 | case CPU_ONLINE: |
| 1342 | case CPU_ONLINE_FROZEN: | 1340 | case CPU_ONLINE_FROZEN: |
| @@ -1373,9 +1371,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
| 1373 | #endif | 1371 | #endif |
| 1374 | case CPU_UP_CANCELED: | 1372 | case CPU_UP_CANCELED: |
| 1375 | case CPU_UP_CANCELED_FROZEN: | 1373 | case CPU_UP_CANCELED_FROZEN: |
| 1374 | mutex_lock(&cache_chain_mutex); | ||
| 1376 | cpuup_canceled(cpu); | 1375 | cpuup_canceled(cpu); |
| 1377 | break; | ||
| 1378 | case CPU_LOCK_RELEASE: | ||
| 1379 | mutex_unlock(&cache_chain_mutex); | 1376 | mutex_unlock(&cache_chain_mutex); |
| 1380 | break; | 1377 | break; |
| 1381 | } | 1378 | } |
| @@ -2170,6 +2167,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 2170 | * We use cache_chain_mutex to ensure a consistent view of | 2167 | * We use cache_chain_mutex to ensure a consistent view of |
| 2171 | * cpu_online_map as well. Please see cpuup_callback | 2168 | * cpu_online_map as well. Please see cpuup_callback |
| 2172 | */ | 2169 | */ |
| 2170 | get_online_cpus(); | ||
| 2173 | mutex_lock(&cache_chain_mutex); | 2171 | mutex_lock(&cache_chain_mutex); |
| 2174 | 2172 | ||
| 2175 | list_for_each_entry(pc, &cache_chain, next) { | 2173 | list_for_each_entry(pc, &cache_chain, next) { |
| @@ -2396,6 +2394,7 @@ oops: | |||
| 2396 | panic("kmem_cache_create(): failed to create slab `%s'\n", | 2394 | panic("kmem_cache_create(): failed to create slab `%s'\n", |
| 2397 | name); | 2395 | name); |
| 2398 | mutex_unlock(&cache_chain_mutex); | 2396 | mutex_unlock(&cache_chain_mutex); |
| 2397 | put_online_cpus(); | ||
| 2399 | return cachep; | 2398 | return cachep; |
| 2400 | } | 2399 | } |
| 2401 | EXPORT_SYMBOL(kmem_cache_create); | 2400 | EXPORT_SYMBOL(kmem_cache_create); |
| @@ -2547,9 +2546,11 @@ int kmem_cache_shrink(struct kmem_cache *cachep) | |||
| 2547 | int ret; | 2546 | int ret; |
| 2548 | BUG_ON(!cachep || in_interrupt()); | 2547 | BUG_ON(!cachep || in_interrupt()); |
| 2549 | 2548 | ||
| 2549 | get_online_cpus(); | ||
| 2550 | mutex_lock(&cache_chain_mutex); | 2550 | mutex_lock(&cache_chain_mutex); |
| 2551 | ret = __cache_shrink(cachep); | 2551 | ret = __cache_shrink(cachep); |
| 2552 | mutex_unlock(&cache_chain_mutex); | 2552 | mutex_unlock(&cache_chain_mutex); |
| 2553 | put_online_cpus(); | ||
| 2553 | return ret; | 2554 | return ret; |
| 2554 | } | 2555 | } |
| 2555 | EXPORT_SYMBOL(kmem_cache_shrink); | 2556 | EXPORT_SYMBOL(kmem_cache_shrink); |
| @@ -2575,6 +2576,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) | |||
| 2575 | BUG_ON(!cachep || in_interrupt()); | 2576 | BUG_ON(!cachep || in_interrupt()); |
| 2576 | 2577 | ||
| 2577 | /* Find the cache in the chain of caches. */ | 2578 | /* Find the cache in the chain of caches. */ |
| 2579 | get_online_cpus(); | ||
| 2578 | mutex_lock(&cache_chain_mutex); | 2580 | mutex_lock(&cache_chain_mutex); |
| 2579 | /* | 2581 | /* |
| 2580 | * the chain is never empty, cache_cache is never destroyed | 2582 | * the chain is never empty, cache_cache is never destroyed |
| @@ -2584,6 +2586,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) | |||
| 2584 | slab_error(cachep, "Can't free all objects"); | 2586 | slab_error(cachep, "Can't free all objects"); |
| 2585 | list_add(&cachep->next, &cache_chain); | 2587 | list_add(&cachep->next, &cache_chain); |
| 2586 | mutex_unlock(&cache_chain_mutex); | 2588 | mutex_unlock(&cache_chain_mutex); |
| 2589 | put_online_cpus(); | ||
| 2587 | return; | 2590 | return; |
| 2588 | } | 2591 | } |
| 2589 | 2592 | ||
| @@ -2592,6 +2595,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) | |||
| 2592 | 2595 | ||
| 2593 | __kmem_cache_destroy(cachep); | 2596 | __kmem_cache_destroy(cachep); |
| 2594 | mutex_unlock(&cache_chain_mutex); | 2597 | mutex_unlock(&cache_chain_mutex); |
| 2598 | put_online_cpus(); | ||
| 2595 | } | 2599 | } |
| 2596 | EXPORT_SYMBOL(kmem_cache_destroy); | 2600 | EXPORT_SYMBOL(kmem_cache_destroy); |
| 2597 | 2601 | ||
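In the slab hunks above, the notifier-driven CPU_LOCK_ACQUIRE/CPU_LOCK_RELEASE bracketing disappears: cpuup_callback now takes cache_chain_mutex only around the work that needs it, and the cache-wide operations (kmem_cache_create, kmem_cache_shrink, kmem_cache_destroy) pin the online-CPU map themselves with get_online_cpus()/put_online_cpus() before taking the mutex. A condensed kernel-style sketch of the resulting pattern follows; it is illustrative only, not the mm/slab.c code: example_chain_mutex and my_cache_walk() are made-up names, while get_online_cpus(), put_online_cpus() and the mutex API are the real interfaces used in the hunks.

/* Sketch of the new hotplug-safe locking order, for illustration only. */
#include <linux/cpu.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(example_chain_mutex);	/* stand-in for cache_chain_mutex */

static void my_cache_walk(void)
{
	get_online_cpus();			/* pin cpu_online_map against hotplug */
	mutex_lock(&example_chain_mutex);	/* then serialize users of the chain */

	/* ... create/shrink/destroy work that relies on a stable CPU set ... */

	mutex_unlock(&example_chain_mutex);
	put_online_cpus();			/* drop in reverse order, as in the hunks */
}

The ordering mirrors the hunks: the hotplug reference is taken first and dropped last, so the chain mutex is never held while waiting on CPU hotplug.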
diff --git a/net/core/flow.c b/net/core/flow.c index 3ed2b4b1d6d4..6489f4e24ecf 100644 --- a/net/core/flow.c +++ b/net/core/flow.c | |||
| @@ -293,7 +293,7 @@ void flow_cache_flush(void) | |||
| 293 | static DEFINE_MUTEX(flow_flush_sem); | 293 | static DEFINE_MUTEX(flow_flush_sem); |
| 294 | 294 | ||
| 295 | /* Don't want cpus going down or up during this. */ | 295 | /* Don't want cpus going down or up during this. */ |
| 296 | lock_cpu_hotplug(); | 296 | get_online_cpus(); |
| 297 | mutex_lock(&flow_flush_sem); | 297 | mutex_lock(&flow_flush_sem); |
| 298 | atomic_set(&info.cpuleft, num_online_cpus()); | 298 | atomic_set(&info.cpuleft, num_online_cpus()); |
| 299 | init_completion(&info.completion); | 299 | init_completion(&info.completion); |
| @@ -305,7 +305,7 @@ void flow_cache_flush(void) | |||
| 305 | 305 | ||
| 306 | wait_for_completion(&info.completion); | 306 | wait_for_completion(&info.completion); |
| 307 | mutex_unlock(&flow_flush_sem); | 307 | mutex_unlock(&flow_flush_sem); |
| 308 | unlock_cpu_hotplug(); | 308 | put_online_cpus(); |
| 309 | } | 309 | } |
| 310 | 310 | ||
| 311 | static void __devinit flow_cache_cpu_prepare(int cpu) | 311 | static void __devinit flow_cache_cpu_prepare(int cpu) |
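Besides switching from lock_cpu_hotplug()/unlock_cpu_hotplug() to get_online_cpus()/put_online_cpus(), the flow_cache_flush() hunk shows its fan-out bookkeeping: info.cpuleft is set to the number of online CPUs and the caller blocks in wait_for_completion() until the per-CPU flushers have all checked in. A rough user-space analogue of that "last one out signals the waiter" idea is sketched below; pthreads plus a C11 atomic countdown stand in for the per-CPU work and struct completion, and flush_one() and NWORKERS are invented for the example.

/* Illustrative countdown-and-complete pattern, not the net/core/flow.c code. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NWORKERS 4

static atomic_int cpuleft;			/* like info.cpuleft */
static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done_cond = PTHREAD_COND_INITIALIZER;
static int done;				/* like struct completion */

static void *flush_one(void *arg)
{
	(void)arg;
	/* ... per-worker flush work would go here ... */
	if (atomic_fetch_sub(&cpuleft, 1) == 1) {	/* last worker completes */
		pthread_mutex_lock(&done_lock);
		done = 1;
		pthread_cond_signal(&done_cond);
		pthread_mutex_unlock(&done_lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t tid[NWORKERS];
	int i;

	atomic_init(&cpuleft, NWORKERS);
	for (i = 0; i < NWORKERS; i++)
		pthread_create(&tid[i], NULL, flush_one, NULL);

	pthread_mutex_lock(&done_lock);		/* the wait_for_completion() step */
	while (!done)
		pthread_cond_wait(&done_cond, &done_lock);
	pthread_mutex_unlock(&done_lock);

	for (i = 0; i < NWORKERS; i++)
		pthread_join(tid[i], NULL);
	printf("all %d workers flushed\n", NWORKERS);
	return 0;
}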
