aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/auditsc.c3
-rw-r--r--kernel/capability.c21
-rw-r--r--kernel/cgroup.c5
-rw-r--r--kernel/cpuset.c349
-rw-r--r--kernel/dma-coherent.c2
-rw-r--r--kernel/exit.c100
-rw-r--r--kernel/hrtimer.c95
-rw-r--r--kernel/kexec.c74
-rw-r--r--kernel/kgdb.c13
-rw-r--r--kernel/lockdep.c14
-rw-r--r--kernel/lockdep_internals.h13
-rw-r--r--kernel/lockdep_proc.c15
-rw-r--r--kernel/module.c2
-rw-r--r--kernel/nsproxy.c1
-rw-r--r--kernel/pid_namespace.c3
-rw-r--r--kernel/pm_qos_params.c25
-rw-r--r--kernel/posix-timers.c2
-rw-r--r--kernel/power/disk.c13
-rw-r--r--kernel/power/main.c5
-rw-r--r--kernel/power/swap.c1
-rw-r--r--kernel/ptrace.c5
-rw-r--r--kernel/rcupdate.c1
-rw-r--r--kernel/resource.c88
-rw-r--r--kernel/sched.c144
-rw-r--r--kernel/sched_clock.c84
-rw-r--r--kernel/sched_features.h2
-rw-r--r--kernel/sched_rt.c16
-rw-r--r--kernel/signal.c5
-rw-r--r--kernel/smp.c10
-rw-r--r--kernel/softlockup.c3
-rw-r--r--kernel/spinlock.c3
-rw-r--r--kernel/sys.c10
-rw-r--r--kernel/sysctl.c1
-rw-r--r--kernel/time/clockevents.c15
-rw-r--r--kernel/time/ntp.c2
-rw-r--r--kernel/time/tick-broadcast.c99
-rw-r--r--kernel/time/tick-common.c15
-rw-r--r--kernel/time/tick-internal.h11
-rw-r--r--kernel/time/tick-oneshot.c44
-rw-r--r--kernel/time/tick-sched.c22
-rw-r--r--kernel/trace/trace_sysprof.c2
-rw-r--r--kernel/user_namespace.c1
-rw-r--r--kernel/utsname.c1
-rw-r--r--kernel/utsname_sysctl.c1
44 files changed, 863 insertions, 478 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 972f8e61d36a..59cedfb040e7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -243,10 +243,11 @@ static inline int open_arg(int flags, int mask)
243 243
244static int audit_match_perm(struct audit_context *ctx, int mask) 244static int audit_match_perm(struct audit_context *ctx, int mask)
245{ 245{
246 unsigned n;
246 if (unlikely(!ctx)) 247 if (unlikely(!ctx))
247 return 0; 248 return 0;
248 249
249 unsigned n = ctx->major; 250 n = ctx->major;
250 switch (audit_classify_syscall(ctx->arch, n)) { 251 switch (audit_classify_syscall(ctx->arch, n)) {
251 case 0: /* native */ 252 case 0: /* native */
252 if ((mask & AUDIT_PERM_WRITE) && 253 if ((mask & AUDIT_PERM_WRITE) &&
diff --git a/kernel/capability.c b/kernel/capability.c
index 0101e847603e..33e51e78c2d8 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -486,17 +486,22 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
486 return ret; 486 return ret;
487} 487}
488 488
489int __capable(struct task_struct *t, int cap) 489/**
490 * capable - Determine if the current task has a superior capability in effect
491 * @cap: The capability to be tested for
492 *
493 * Return true if the current task has the given superior capability currently
494 * available for use, false if not.
495 *
496 * This sets PF_SUPERPRIV on the task if the capability is available on the
497 * assumption that it's about to be used.
498 */
499int capable(int cap)
490{ 500{
491 if (security_capable(t, cap) == 0) { 501 if (has_capability(current, cap)) {
492 t->flags |= PF_SUPERPRIV; 502 current->flags |= PF_SUPERPRIV;
493 return 1; 503 return 1;
494 } 504 }
495 return 0; 505 return 0;
496} 506}
497
498int capable(int cap)
499{
500 return __capable(current, cap);
501}
502EXPORT_SYMBOL(capable); 507EXPORT_SYMBOL(capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 13932abde159..a0123d75ec9a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2738,14 +2738,15 @@ void cgroup_fork_callbacks(struct task_struct *child)
2738 */ 2738 */
2739void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) 2739void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2740{ 2740{
2741 struct cgroup *oldcgrp, *newcgrp; 2741 struct cgroup *oldcgrp, *newcgrp = NULL;
2742 2742
2743 if (need_mm_owner_callback) { 2743 if (need_mm_owner_callback) {
2744 int i; 2744 int i;
2745 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2745 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2746 struct cgroup_subsys *ss = subsys[i]; 2746 struct cgroup_subsys *ss = subsys[i];
2747 oldcgrp = task_cgroup(old, ss->subsys_id); 2747 oldcgrp = task_cgroup(old, ss->subsys_id);
2748 newcgrp = task_cgroup(new, ss->subsys_id); 2748 if (new)
2749 newcgrp = task_cgroup(new, ss->subsys_id);
2749 if (oldcgrp == newcgrp) 2750 if (oldcgrp == newcgrp)
2750 continue; 2751 continue;
2751 if (ss->mm_owner_changed) 2752 if (ss->mm_owner_changed)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5ab79cf516d..827cd9adccb2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@
14 * 2003-10-22 Updates by Stephen Hemminger. 14 * 2003-10-22 Updates by Stephen Hemminger.
15 * 2004 May-July Rework by Paul Jackson. 15 * 2004 May-July Rework by Paul Jackson.
16 * 2006 Rework by Paul Menage to use generic cgroups 16 * 2006 Rework by Paul Menage to use generic cgroups
17 * 2008 Rework of the scheduler domains and CPU hotplug handling
18 * by Max Krasnyansky
17 * 19 *
18 * This file is subject to the terms and conditions of the GNU General Public 20 * This file is subject to the terms and conditions of the GNU General Public
19 * License. See the file COPYING in the main directory of the Linux 21 * License. See the file COPYING in the main directory of the Linux
@@ -236,9 +238,11 @@ static struct cpuset top_cpuset = {
236 238
237static DEFINE_MUTEX(callback_mutex); 239static DEFINE_MUTEX(callback_mutex);
238 240
239/* This is ugly, but preserves the userspace API for existing cpuset 241/*
242 * This is ugly, but preserves the userspace API for existing cpuset
240 * users. If someone tries to mount the "cpuset" filesystem, we 243 * users. If someone tries to mount the "cpuset" filesystem, we
241 * silently switch it to mount "cgroup" instead */ 244 * silently switch it to mount "cgroup" instead
245 */
242static int cpuset_get_sb(struct file_system_type *fs_type, 246static int cpuset_get_sb(struct file_system_type *fs_type,
243 int flags, const char *unused_dev_name, 247 int flags, const char *unused_dev_name,
244 void *data, struct vfsmount *mnt) 248 void *data, struct vfsmount *mnt)
@@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
473} 477}
474 478
475/* 479/*
476 * Helper routine for rebuild_sched_domains(). 480 * Helper routine for generate_sched_domains().
477 * Do cpusets a, b have overlapping cpus_allowed masks? 481 * Do cpusets a, b have overlapping cpus_allowed masks?
478 */ 482 */
479
480static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 483static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
481{ 484{
482 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 485 return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
518} 521}
519 522
520/* 523/*
521 * rebuild_sched_domains() 524 * generate_sched_domains()
522 * 525 *
523 * This routine will be called to rebuild the scheduler's dynamic 526 * This function builds a partial partition of the systems CPUs
524 * sched domains: 527 * A 'partial partition' is a set of non-overlapping subsets whose
525 * - if the flag 'sched_load_balance' of any cpuset with non-empty 528 * union is a subset of that set.
526 * 'cpus' changes, 529 * The output of this function needs to be passed to kernel/sched.c
527 * - or if the 'cpus' allowed changes in any cpuset which has that 530 * partition_sched_domains() routine, which will rebuild the scheduler's
528 * flag enabled, 531 * load balancing domains (sched domains) as specified by that partial
529 * - or if the 'sched_relax_domain_level' of any cpuset which has 532 * partition.
530 * that flag enabled and with non-empty 'cpus' changes,
531 * - or if any cpuset with non-empty 'cpus' is removed,
532 * - or if a cpu gets offlined.
533 *
534 * This routine builds a partial partition of the systems CPUs
535 * (the set of non-overlappping cpumask_t's in the array 'part'
536 * below), and passes that partial partition to the kernel/sched.c
537 * partition_sched_domains() routine, which will rebuild the
538 * schedulers load balancing domains (sched domains) as specified
539 * by that partial partition. A 'partial partition' is a set of
540 * non-overlapping subsets whose union is a subset of that set.
541 * 533 *
542 * See "What is sched_load_balance" in Documentation/cpusets.txt 534 * See "What is sched_load_balance" in Documentation/cpusets.txt
543 * for a background explanation of this. 535 * for a background explanation of this.
@@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
547 * domains when operating in the severe memory shortage situations 539 * domains when operating in the severe memory shortage situations
548 * that could cause allocation failures below. 540 * that could cause allocation failures below.
549 * 541 *
550 * Call with cgroup_mutex held. May take callback_mutex during 542 * Must be called with cgroup_lock held.
551 * call due to the kfifo_alloc() and kmalloc() calls. May nest
552 * a call to the get_online_cpus()/put_online_cpus() pair.
553 * Must not be called holding callback_mutex, because we must not
554 * call get_online_cpus() while holding callback_mutex. Elsewhere
555 * the kernel nests callback_mutex inside get_online_cpus() calls.
556 * So the reverse nesting would risk an ABBA deadlock.
557 * 543 *
558 * The three key local variables below are: 544 * The three key local variables below are:
559 * q - a linked-list queue of cpuset pointers, used to implement a 545 * q - a linked-list queue of cpuset pointers, used to implement a
@@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
588 * element of the partition (one sched domain) to be passed to 574 * element of the partition (one sched domain) to be passed to
589 * partition_sched_domains(). 575 * partition_sched_domains().
590 */ 576 */
591 577static int generate_sched_domains(cpumask_t **domains,
592void rebuild_sched_domains(void) 578 struct sched_domain_attr **attributes)
593{ 579{
594 LIST_HEAD(q); /* queue of cpusets to be scanned*/ 580 LIST_HEAD(q); /* queue of cpusets to be scanned */
595 struct cpuset *cp; /* scans q */ 581 struct cpuset *cp; /* scans q */
596 struct cpuset **csa; /* array of all cpuset ptrs */ 582 struct cpuset **csa; /* array of all cpuset ptrs */
597 int csn; /* how many cpuset ptrs in csa so far */ 583 int csn; /* how many cpuset ptrs in csa so far */
@@ -601,23 +587,26 @@ void rebuild_sched_domains(void)
601 int ndoms; /* number of sched domains in result */ 587 int ndoms; /* number of sched domains in result */
602 int nslot; /* next empty doms[] cpumask_t slot */ 588 int nslot; /* next empty doms[] cpumask_t slot */
603 589
604 csa = NULL; 590 ndoms = 0;
605 doms = NULL; 591 doms = NULL;
606 dattr = NULL; 592 dattr = NULL;
593 csa = NULL;
607 594
608 /* Special case for the 99% of systems with one, full, sched domain */ 595 /* Special case for the 99% of systems with one, full, sched domain */
609 if (is_sched_load_balance(&top_cpuset)) { 596 if (is_sched_load_balance(&top_cpuset)) {
610 ndoms = 1;
611 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 597 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
612 if (!doms) 598 if (!doms)
613 goto rebuild; 599 goto done;
600
614 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); 601 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
615 if (dattr) { 602 if (dattr) {
616 *dattr = SD_ATTR_INIT; 603 *dattr = SD_ATTR_INIT;
617 update_domain_attr_tree(dattr, &top_cpuset); 604 update_domain_attr_tree(dattr, &top_cpuset);
618 } 605 }
619 *doms = top_cpuset.cpus_allowed; 606 *doms = top_cpuset.cpus_allowed;
620 goto rebuild; 607
608 ndoms = 1;
609 goto done;
621 } 610 }
622 611
623 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); 612 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
@@ -680,61 +669,141 @@ restart:
680 } 669 }
681 } 670 }
682 671
683 /* Convert <csn, csa> to <ndoms, doms> */ 672 /*
673 * Now we know how many domains to create.
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */
684 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
685 if (!doms) 677 if (!doms) {
686 goto rebuild; 678 ndoms = 0;
679 goto done;
680 }
681
682 /*
683 * The rest of the code, including the scheduler, can deal with
684 * dattr==NULL case. No need to abort if alloc fails.
685 */
687 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); 686 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
688 687
689 for (nslot = 0, i = 0; i < csn; i++) { 688 for (nslot = 0, i = 0; i < csn; i++) {
690 struct cpuset *a = csa[i]; 689 struct cpuset *a = csa[i];
690 cpumask_t *dp;
691 int apn = a->pn; 691 int apn = a->pn;
692 692
693 if (apn >= 0) { 693 if (apn < 0) {
694 cpumask_t *dp = doms + nslot; 694 /* Skip completed partitions */
695 695 continue;
696 if (nslot == ndoms) { 696 }
697 static int warnings = 10; 697
698 if (warnings) { 698 dp = doms + nslot;
699 printk(KERN_WARNING 699
700 "rebuild_sched_domains confused:" 700 if (nslot == ndoms) {
701 " nslot %d, ndoms %d, csn %d, i %d," 701 static int warnings = 10;
702 " apn %d\n", 702 if (warnings) {
703 nslot, ndoms, csn, i, apn); 703 printk(KERN_WARNING
704 warnings--; 704 "rebuild_sched_domains confused:"
705 } 705 " nslot %d, ndoms %d, csn %d, i %d,"
706 continue; 706 " apn %d\n",
707 nslot, ndoms, csn, i, apn);
708 warnings--;
707 } 709 }
710 continue;
711 }
708 712
709 cpus_clear(*dp); 713 cpus_clear(*dp);
710 if (dattr) 714 if (dattr)
711 *(dattr + nslot) = SD_ATTR_INIT; 715 *(dattr + nslot) = SD_ATTR_INIT;
712 for (j = i; j < csn; j++) { 716 for (j = i; j < csn; j++) {
713 struct cpuset *b = csa[j]; 717 struct cpuset *b = csa[j];
714 718
715 if (apn == b->pn) { 719 if (apn == b->pn) {
716 cpus_or(*dp, *dp, b->cpus_allowed); 720 cpus_or(*dp, *dp, b->cpus_allowed);
717 b->pn = -1; 721 if (dattr)
718 if (dattr) 722 update_domain_attr_tree(dattr + nslot, b);
719 update_domain_attr_tree(dattr 723
720 + nslot, b); 724 /* Done with this partition */
721 } 725 b->pn = -1;
722 } 726 }
723 nslot++;
724 } 727 }
728 nslot++;
725 } 729 }
726 BUG_ON(nslot != ndoms); 730 BUG_ON(nslot != ndoms);
727 731
728rebuild: 732done:
729 /* Have scheduler rebuild sched domains */ 733 kfree(csa);
734
735 *domains = doms;
736 *attributes = dattr;
737 return ndoms;
738}
739
740/*
741 * Rebuild scheduler domains.
742 *
743 * Call with neither cgroup_mutex held nor within get_online_cpus().
744 * Takes both cgroup_mutex and get_online_cpus().
745 *
746 * Cannot be directly called from cpuset code handling changes
747 * to the cpuset pseudo-filesystem, because it cannot be called
748 * from code that already holds cgroup_mutex.
749 */
750static void do_rebuild_sched_domains(struct work_struct *unused)
751{
752 struct sched_domain_attr *attr;
753 cpumask_t *doms;
754 int ndoms;
755
730 get_online_cpus(); 756 get_online_cpus();
731 partition_sched_domains(ndoms, doms, dattr); 757
758 /* Generate domain masks and attrs */
759 cgroup_lock();
760 ndoms = generate_sched_domains(&doms, &attr);
761 cgroup_unlock();
762
763 /* Have scheduler rebuild the domains */
764 partition_sched_domains(ndoms, doms, attr);
765
732 put_online_cpus(); 766 put_online_cpus();
767}
733 768
734done: 769static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
735 kfree(csa); 770
736 /* Don't kfree(doms) -- partition_sched_domains() does that. */ 771/*
737 /* Don't kfree(dattr) -- partition_sched_domains() does that. */ 772 * Rebuild scheduler domains, asynchronously via workqueue.
773 *
774 * If the flag 'sched_load_balance' of any cpuset with non-empty
775 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
776 * which has that flag enabled, or if any cpuset with a non-empty
777 * 'cpus' is removed, then call this routine to rebuild the
778 * scheduler's dynamic sched domains.
779 *
780 * The rebuild_sched_domains() and partition_sched_domains()
781 * routines must nest cgroup_lock() inside get_online_cpus(),
782 * but such cpuset changes as these must nest that locking the
783 * other way, holding cgroup_lock() for much of the code.
784 *
785 * So in order to avoid an ABBA deadlock, the cpuset code handling
786 * these user changes delegates the actual sched domain rebuilding
787 * to a separate workqueue thread, which ends up processing the
788 * above do_rebuild_sched_domains() function.
789 */
790static void async_rebuild_sched_domains(void)
791{
792 schedule_work(&rebuild_sched_domains_work);
793}
794
795/*
796 * Accomplishes the same scheduler domain rebuild as the above
797 * async_rebuild_sched_domains(), however it directly calls the
798 * rebuild routine synchronously rather than calling it via an
799 * asynchronous work thread.
800 *
801 * This can only be called from code that is not holding
802 * cgroup_mutex (not nested in a cgroup_lock() call.)
803 */
804void rebuild_sched_domains(void)
805{
806 do_rebuild_sched_domains(NULL);
738} 807}
739 808
740/** 809/**
@@ -774,37 +843,25 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
774/** 843/**
775 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 844 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
776 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 845 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
846 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
777 * 847 *
778 * Called with cgroup_mutex held 848 * Called with cgroup_mutex held
779 * 849 *
780 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 850 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
781 * calling callback functions for each. 851 * calling callback functions for each.
782 * 852 *
783 * Return 0 if successful, -errno if not. 853 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
854 * if @heap != NULL.
784 */ 855 */
785static int update_tasks_cpumask(struct cpuset *cs) 856static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
786{ 857{
787 struct cgroup_scanner scan; 858 struct cgroup_scanner scan;
788 struct ptr_heap heap;
789 int retval;
790
791 /*
792 * cgroup_scan_tasks() will initialize heap->gt for us.
793 * heap_init() is still needed here for we should not change
794 * cs->cpus_allowed when heap_init() fails.
795 */
796 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
797 if (retval)
798 return retval;
799 859
800 scan.cg = cs->css.cgroup; 860 scan.cg = cs->css.cgroup;
801 scan.test_task = cpuset_test_cpumask; 861 scan.test_task = cpuset_test_cpumask;
802 scan.process_task = cpuset_change_cpumask; 862 scan.process_task = cpuset_change_cpumask;
803 scan.heap = &heap; 863 scan.heap = heap;
804 retval = cgroup_scan_tasks(&scan); 864 cgroup_scan_tasks(&scan);
805
806 heap_free(&heap);
807 return retval;
808} 865}
809 866
810/** 867/**
@@ -814,6 +871,7 @@ static int update_tasks_cpumask(struct cpuset *cs)
814 */ 871 */
815static int update_cpumask(struct cpuset *cs, const char *buf) 872static int update_cpumask(struct cpuset *cs, const char *buf)
816{ 873{
874 struct ptr_heap heap;
817 struct cpuset trialcs; 875 struct cpuset trialcs;
818 int retval; 876 int retval;
819 int is_load_balanced; 877 int is_load_balanced;
@@ -848,6 +906,10 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
848 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 906 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
849 return 0; 907 return 0;
850 908
909 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
910 if (retval)
911 return retval;
912
851 is_load_balanced = is_sched_load_balance(&trialcs); 913 is_load_balanced = is_sched_load_balance(&trialcs);
852 914
853 mutex_lock(&callback_mutex); 915 mutex_lock(&callback_mutex);
@@ -858,12 +920,12 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
858 * Scan tasks in the cpuset, and update the cpumasks of any 920 * Scan tasks in the cpuset, and update the cpumasks of any
859 * that need an update. 921 * that need an update.
860 */ 922 */
861 retval = update_tasks_cpumask(cs); 923 update_tasks_cpumask(cs, &heap);
862 if (retval < 0) 924
863 return retval; 925 heap_free(&heap);
864 926
865 if (is_load_balanced) 927 if (is_load_balanced)
866 rebuild_sched_domains(); 928 async_rebuild_sched_domains();
867 return 0; 929 return 0;
868} 930}
869 931
@@ -1090,7 +1152,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1090 if (val != cs->relax_domain_level) { 1152 if (val != cs->relax_domain_level) {
1091 cs->relax_domain_level = val; 1153 cs->relax_domain_level = val;
1092 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) 1154 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
1093 rebuild_sched_domains(); 1155 async_rebuild_sched_domains();
1094 } 1156 }
1095 1157
1096 return 0; 1158 return 0;
@@ -1131,7 +1193,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1131 mutex_unlock(&callback_mutex); 1193 mutex_unlock(&callback_mutex);
1132 1194
1133 if (cpus_nonempty && balance_flag_changed) 1195 if (cpus_nonempty && balance_flag_changed)
1134 rebuild_sched_domains(); 1196 async_rebuild_sched_domains();
1135 1197
1136 return 0; 1198 return 0;
1137} 1199}
@@ -1492,6 +1554,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1492 default: 1554 default:
1493 BUG(); 1555 BUG();
1494 } 1556 }
1557
1558 /* Unreachable but makes gcc happy */
1559 return 0;
1495} 1560}
1496 1561
1497static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) 1562static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@@ -1504,6 +1569,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1504 default: 1569 default:
1505 BUG(); 1570 BUG();
1506 } 1571 }
1572
1573 /* Unrechable but makes gcc happy */
1574 return 0;
1507} 1575}
1508 1576
1509 1577
@@ -1692,15 +1760,9 @@ static struct cgroup_subsys_state *cpuset_create(
1692} 1760}
1693 1761
1694/* 1762/*
1695 * Locking note on the strange update_flag() call below:
1696 *
1697 * If the cpuset being removed has its flag 'sched_load_balance' 1763 * If the cpuset being removed has its flag 'sched_load_balance'
1698 * enabled, then simulate turning sched_load_balance off, which 1764 * enabled, then simulate turning sched_load_balance off, which
1699 * will call rebuild_sched_domains(). The get_online_cpus() 1765 * will call async_rebuild_sched_domains().
1700 * call in rebuild_sched_domains() must not be made while holding
1701 * callback_mutex. Elsewhere the kernel nests callback_mutex inside
1702 * get_online_cpus() calls. So the reverse nesting would risk an
1703 * ABBA deadlock.
1704 */ 1766 */
1705 1767
1706static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 1768static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1719,7 +1781,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1719struct cgroup_subsys cpuset_subsys = { 1781struct cgroup_subsys cpuset_subsys = {
1720 .name = "cpuset", 1782 .name = "cpuset",
1721 .create = cpuset_create, 1783 .create = cpuset_create,
1722 .destroy = cpuset_destroy, 1784 .destroy = cpuset_destroy,
1723 .can_attach = cpuset_can_attach, 1785 .can_attach = cpuset_can_attach,
1724 .attach = cpuset_attach, 1786 .attach = cpuset_attach,
1725 .populate = cpuset_populate, 1787 .populate = cpuset_populate,
@@ -1811,7 +1873,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1811} 1873}
1812 1874
1813/* 1875/*
1814 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 1876 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
1815 * or memory nodes, we need to walk over the cpuset hierarchy, 1877 * or memory nodes, we need to walk over the cpuset hierarchy,
1816 * removing that CPU or node from all cpusets. If this removes the 1878 * removing that CPU or node from all cpusets. If this removes the
1817 * last CPU or node from a cpuset, then move the tasks in the empty 1879 * last CPU or node from a cpuset, then move the tasks in the empty
@@ -1896,42 +1958,13 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
1896 nodes_empty(cp->mems_allowed)) 1958 nodes_empty(cp->mems_allowed))
1897 remove_tasks_in_empty_cpuset(cp); 1959 remove_tasks_in_empty_cpuset(cp);
1898 else { 1960 else {
1899 update_tasks_cpumask(cp); 1961 update_tasks_cpumask(cp, NULL);
1900 update_tasks_nodemask(cp, &oldmems); 1962 update_tasks_nodemask(cp, &oldmems);
1901 } 1963 }
1902 } 1964 }
1903} 1965}
1904 1966
1905/* 1967/*
1906 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1907 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
1908 * track what's online after any CPU or memory node hotplug or unplug event.
1909 *
1910 * Since there are two callers of this routine, one for CPU hotplug
1911 * events and one for memory node hotplug events, we could have coded
1912 * two separate routines here. We code it as a single common routine
1913 * in order to minimize text size.
1914 */
1915
1916static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
1917{
1918 cgroup_lock();
1919
1920 top_cpuset.cpus_allowed = cpu_online_map;
1921 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1922 scan_for_empty_cpusets(&top_cpuset);
1923
1924 /*
1925 * Scheduler destroys domains on hotplug events.
1926 * Rebuild them based on the current settings.
1927 */
1928 if (rebuild_sd)
1929 rebuild_sched_domains();
1930
1931 cgroup_unlock();
1932}
1933
1934/*
1935 * The top_cpuset tracks what CPUs and Memory Nodes are online, 1968 * The top_cpuset tracks what CPUs and Memory Nodes are online,
1936 * period. This is necessary in order to make cpusets transparent 1969 * period. This is necessary in order to make cpusets transparent
1937 * (of no affect) on systems that are actively using CPU hotplug 1970 * (of no affect) on systems that are actively using CPU hotplug
@@ -1939,40 +1972,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
1939 * 1972 *
1940 * This routine ensures that top_cpuset.cpus_allowed tracks 1973 * This routine ensures that top_cpuset.cpus_allowed tracks
1941 * cpu_online_map on each CPU hotplug (cpuhp) event. 1974 * cpu_online_map on each CPU hotplug (cpuhp) event.
1975 *
1976 * Called within get_online_cpus(). Needs to call cgroup_lock()
1977 * before calling generate_sched_domains().
1942 */ 1978 */
1943 1979static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1944static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
1945 unsigned long phase, void *unused_cpu) 1980 unsigned long phase, void *unused_cpu)
1946{ 1981{
1982 struct sched_domain_attr *attr;
1983 cpumask_t *doms;
1984 int ndoms;
1985
1947 switch (phase) { 1986 switch (phase) {
1948 case CPU_UP_CANCELED:
1949 case CPU_UP_CANCELED_FROZEN:
1950 case CPU_DOWN_FAILED:
1951 case CPU_DOWN_FAILED_FROZEN:
1952 case CPU_ONLINE: 1987 case CPU_ONLINE:
1953 case CPU_ONLINE_FROZEN: 1988 case CPU_ONLINE_FROZEN:
1954 case CPU_DEAD: 1989 case CPU_DEAD:
1955 case CPU_DEAD_FROZEN: 1990 case CPU_DEAD_FROZEN:
1956 common_cpu_mem_hotplug_unplug(1);
1957 break; 1991 break;
1992
1958 default: 1993 default:
1959 return NOTIFY_DONE; 1994 return NOTIFY_DONE;
1960 } 1995 }
1961 1996
1997 cgroup_lock();
1998 top_cpuset.cpus_allowed = cpu_online_map;
1999 scan_for_empty_cpusets(&top_cpuset);
2000 ndoms = generate_sched_domains(&doms, &attr);
2001 cgroup_unlock();
2002
2003 /* Have scheduler rebuild the domains */
2004 partition_sched_domains(ndoms, doms, attr);
2005
1962 return NOTIFY_OK; 2006 return NOTIFY_OK;
1963} 2007}
1964 2008
1965#ifdef CONFIG_MEMORY_HOTPLUG 2009#ifdef CONFIG_MEMORY_HOTPLUG
1966/* 2010/*
1967 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2011 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
1968 * Call this routine anytime after you change 2012 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
1969 * node_states[N_HIGH_MEMORY]. 2013 * See also the previous routine cpuset_track_online_cpus().
1970 * See also the previous routine cpuset_handle_cpuhp().
1971 */ 2014 */
1972
1973void cpuset_track_online_nodes(void) 2015void cpuset_track_online_nodes(void)
1974{ 2016{
1975 common_cpu_mem_hotplug_unplug(0); 2017 cgroup_lock();
2018 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2019 scan_for_empty_cpusets(&top_cpuset);
2020 cgroup_unlock();
1976} 2021}
1977#endif 2022#endif
1978 2023
@@ -1987,7 +2032,7 @@ void __init cpuset_init_smp(void)
1987 top_cpuset.cpus_allowed = cpu_online_map; 2032 top_cpuset.cpus_allowed = cpu_online_map;
1988 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2033 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1989 2034
1990 hotcpu_notifier(cpuset_handle_cpuhp, 0); 2035 hotcpu_notifier(cpuset_track_online_cpus, 0);
1991} 2036}
1992 2037
1993/** 2038/**
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index c1d4d5b4c61c..f013a0c2e111 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -124,6 +124,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
124 } 124 }
125 return (mem != NULL); 125 return (mem != NULL);
126} 126}
127EXPORT_SYMBOL(dma_alloc_from_coherent);
127 128
128/** 129/**
129 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool 130 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
@@ -151,3 +152,4 @@ int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
151 } 152 }
152 return 0; 153 return 0;
153} 154}
155EXPORT_SYMBOL(dma_release_from_coherent);
diff --git a/kernel/exit.c b/kernel/exit.c
index 38ec40630149..85a83c831856 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -112,9 +112,9 @@ static void __exit_signal(struct task_struct *tsk)
112 * We won't ever get here for the group leader, since it 112 * We won't ever get here for the group leader, since it
113 * will have been the last reference on the signal_struct. 113 * will have been the last reference on the signal_struct.
114 */ 114 */
115 sig->utime = cputime_add(sig->utime, tsk->utime); 115 sig->utime = cputime_add(sig->utime, task_utime(tsk));
116 sig->stime = cputime_add(sig->stime, tsk->stime); 116 sig->stime = cputime_add(sig->stime, task_stime(tsk));
117 sig->gtime = cputime_add(sig->gtime, tsk->gtime); 117 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
118 sig->min_flt += tsk->min_flt; 118 sig->min_flt += tsk->min_flt;
119 sig->maj_flt += tsk->maj_flt; 119 sig->maj_flt += tsk->maj_flt;
120 sig->nvcsw += tsk->nvcsw; 120 sig->nvcsw += tsk->nvcsw;
@@ -583,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
583 * If there are other users of the mm and the owner (us) is exiting 583 * If there are other users of the mm and the owner (us) is exiting
584 * we need to find a new owner to take on the responsibility. 584 * we need to find a new owner to take on the responsibility.
585 */ 585 */
586 if (!mm)
587 return 0;
588 if (atomic_read(&mm->mm_users) <= 1) 586 if (atomic_read(&mm->mm_users) <= 1)
589 return 0; 587 return 0;
590 if (mm->owner != p) 588 if (mm->owner != p)
@@ -627,6 +625,16 @@ retry:
627 } while_each_thread(g, c); 625 } while_each_thread(g, c);
628 626
629 read_unlock(&tasklist_lock); 627 read_unlock(&tasklist_lock);
628 /*
629 * We found no owner yet mm_users > 1: this implies that we are
630 * most likely racing with swapoff (try_to_unuse()) or /proc or
631 * ptrace or page migration (get_task_mm()). Mark owner as NULL,
632 * so that subsystems can understand the callback and take action.
633 */
634 down_write(&mm->mmap_sem);
635 cgroup_mm_owner_callbacks(mm->owner, NULL);
636 mm->owner = NULL;
637 up_write(&mm->mmap_sem);
630 return; 638 return;
631 639
632assign_new_owner: 640assign_new_owner:
@@ -831,26 +839,50 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
831 * the child reaper process (ie "init") in our pid 839 * the child reaper process (ie "init") in our pid
832 * space. 840 * space.
833 */ 841 */
842static struct task_struct *find_new_reaper(struct task_struct *father)
843{
844 struct pid_namespace *pid_ns = task_active_pid_ns(father);
845 struct task_struct *thread;
846
847 thread = father;
848 while_each_thread(father, thread) {
849 if (thread->flags & PF_EXITING)
850 continue;
851 if (unlikely(pid_ns->child_reaper == father))
852 pid_ns->child_reaper = thread;
853 return thread;
854 }
855
856 if (unlikely(pid_ns->child_reaper == father)) {
857 write_unlock_irq(&tasklist_lock);
858 if (unlikely(pid_ns == &init_pid_ns))
859 panic("Attempted to kill init!");
860
861 zap_pid_ns_processes(pid_ns);
862 write_lock_irq(&tasklist_lock);
863 /*
864 * We can not clear ->child_reaper or leave it alone.
865 * There may by stealth EXIT_DEAD tasks on ->children,
866 * forget_original_parent() must move them somewhere.
867 */
868 pid_ns->child_reaper = init_pid_ns.child_reaper;
869 }
870
871 return pid_ns->child_reaper;
872}
873
834static void forget_original_parent(struct task_struct *father) 874static void forget_original_parent(struct task_struct *father)
835{ 875{
836 struct task_struct *p, *n, *reaper = father; 876 struct task_struct *p, *n, *reaper;
837 LIST_HEAD(ptrace_dead); 877 LIST_HEAD(ptrace_dead);
838 878
839 write_lock_irq(&tasklist_lock); 879 write_lock_irq(&tasklist_lock);
840 880 reaper = find_new_reaper(father);
841 /* 881 /*
842 * First clean up ptrace if we were using it. 882 * First clean up ptrace if we were using it.
843 */ 883 */
844 ptrace_exit(father, &ptrace_dead); 884 ptrace_exit(father, &ptrace_dead);
845 885
846 do {
847 reaper = next_thread(reaper);
848 if (reaper == father) {
849 reaper = task_child_reaper(father);
850 break;
851 }
852 } while (reaper->flags & PF_EXITING);
853
854 list_for_each_entry_safe(p, n, &father->children, sibling) { 886 list_for_each_entry_safe(p, n, &father->children, sibling) {
855 p->real_parent = reaper; 887 p->real_parent = reaper;
856 if (p->parent == father) { 888 if (p->parent == father) {
@@ -918,8 +950,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
918 950
919 /* mt-exec, de_thread() is waiting for us */ 951 /* mt-exec, de_thread() is waiting for us */
920 if (thread_group_leader(tsk) && 952 if (thread_group_leader(tsk) &&
921 tsk->signal->notify_count < 0 && 953 tsk->signal->group_exit_task &&
922 tsk->signal->group_exit_task) 954 tsk->signal->notify_count < 0)
923 wake_up_process(tsk->signal->group_exit_task); 955 wake_up_process(tsk->signal->group_exit_task);
924 956
925 write_unlock_irq(&tasklist_lock); 957 write_unlock_irq(&tasklist_lock);
@@ -959,39 +991,6 @@ static void check_stack_usage(void)
959static inline void check_stack_usage(void) {} 991static inline void check_stack_usage(void) {}
960#endif 992#endif
961 993
962static inline void exit_child_reaper(struct task_struct *tsk)
963{
964 if (likely(tsk->group_leader != task_child_reaper(tsk)))
965 return;
966
967 if (tsk->nsproxy->pid_ns == &init_pid_ns)
968 panic("Attempted to kill init!");
969
970 /*
971 * @tsk is the last thread in the 'cgroup-init' and is exiting.
972 * Terminate all remaining processes in the namespace and reap them
973 * before exiting @tsk.
974 *
975 * Note that @tsk (last thread of cgroup-init) may not necessarily
976 * be the child-reaper (i.e main thread of cgroup-init) of the
977 * namespace i.e the child_reaper may have already exited.
978 *
979 * Even after a child_reaper exits, we let it inherit orphaned children,
980 * because, pid_ns->child_reaper remains valid as long as there is
981 * at least one living sub-thread in the cgroup init.
982
983 * This living sub-thread of the cgroup-init will be notified when
984 * a child inherited by the 'child-reaper' exits (do_notify_parent()
985 * uses __group_send_sig_info()). Further, when reaping child processes,
986 * do_wait() iterates over children of all living sub threads.
987
988 * i.e even though 'child_reaper' thread is listed as the parent of the
989 * orphaned children, any living sub-thread in the cgroup-init can
990 * perform the role of the child_reaper.
991 */
992 zap_pid_ns_processes(tsk->nsproxy->pid_ns);
993}
994
995NORET_TYPE void do_exit(long code) 994NORET_TYPE void do_exit(long code)
996{ 995{
997 struct task_struct *tsk = current; 996 struct task_struct *tsk = current;
@@ -1051,7 +1050,6 @@ NORET_TYPE void do_exit(long code)
1051 } 1050 }
1052 group_dead = atomic_dec_and_test(&tsk->signal->live); 1051 group_dead = atomic_dec_and_test(&tsk->signal->live);
1053 if (group_dead) { 1052 if (group_dead) {
1054 exit_child_reaper(tsk);
1055 hrtimer_cancel(&tsk->signal->real_timer); 1053 hrtimer_cancel(&tsk->signal->real_timer);
1056 exit_itimers(tsk->signal); 1054 exit_itimers(tsk->signal);
1057 } 1055 }
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b8e4dce80a74..cdec83e722fa 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
672 */ 672 */
673 BUG_ON(timer->function(timer) != HRTIMER_NORESTART); 673 BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
674 return 1; 674 return 1;
675 case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: 675 case HRTIMER_CB_IRQSAFE_PERCPU:
676 case HRTIMER_CB_IRQSAFE_UNLOCKED:
676 /* 677 /*
677 * This is solely for the sched tick emulation with 678 * This is solely for the sched tick emulation with
678 * dynamic tick support to ensure that we do not 679 * dynamic tick support to ensure that we do not
679 * restart the tick right on the edge and end up with 680 * restart the tick right on the edge and end up with
680 * the tick timer in the softirq ! The calling site 681 * the tick timer in the softirq ! The calling site
681 * takes care of this. 682 * takes care of this. Also used for hrtimer sleeper !
682 */ 683 */
683 debug_hrtimer_deactivate(timer); 684 debug_hrtimer_deactivate(timer);
684 return 1; 685 return 1;
@@ -1245,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer)
1245 timer_stats_account_hrtimer(timer); 1246 timer_stats_account_hrtimer(timer);
1246 1247
1247 fn = timer->function; 1248 fn = timer->function;
1248 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { 1249 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
1250 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
1249 /* 1251 /*
1250 * Used for scheduler timers, avoid lock inversion with 1252 * Used for scheduler timers, avoid lock inversion with
1251 * rq->lock and tasklist_lock. 1253 * rq->lock and tasklist_lock.
@@ -1452,7 +1454,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1452 sl->timer.function = hrtimer_wakeup; 1454 sl->timer.function = hrtimer_wakeup;
1453 sl->task = task; 1455 sl->task = task;
1454#ifdef CONFIG_HIGH_RES_TIMERS 1456#ifdef CONFIG_HIGH_RES_TIMERS
1455 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 1457 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
1456#endif 1458#endif
1457} 1459}
1458 1460
@@ -1591,29 +1593,95 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1591 1593
1592#ifdef CONFIG_HOTPLUG_CPU 1594#ifdef CONFIG_HOTPLUG_CPU
1593 1595
1594static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, 1596static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1595 struct hrtimer_clock_base *new_base) 1597 struct hrtimer_clock_base *new_base, int dcpu)
1596{ 1598{
1597 struct hrtimer *timer; 1599 struct hrtimer *timer;
1598 struct rb_node *node; 1600 struct rb_node *node;
1601 int raise = 0;
1599 1602
1600 while ((node = rb_first(&old_base->active))) { 1603 while ((node = rb_first(&old_base->active))) {
1601 timer = rb_entry(node, struct hrtimer, node); 1604 timer = rb_entry(node, struct hrtimer, node);
1602 BUG_ON(hrtimer_callback_running(timer)); 1605 BUG_ON(hrtimer_callback_running(timer));
1603 debug_hrtimer_deactivate(timer); 1606 debug_hrtimer_deactivate(timer);
1604 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); 1607
1608 /*
1609 * Should not happen. Per CPU timers should be
1610 * canceled _before_ the migration code is called
1611 */
1612 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
1613 __remove_hrtimer(timer, old_base,
1614 HRTIMER_STATE_INACTIVE, 0);
1615 WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
1616 timer, timer->function, dcpu);
1617 continue;
1618 }
1619
1620 /*
1621 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
1622 * timer could be seen as !active and just vanish away
1623 * under us on another CPU
1624 */
1625 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1605 timer->base = new_base; 1626 timer->base = new_base;
1606 /* 1627 /*
1607 * Enqueue the timer. Allow reprogramming of the event device 1628 * Enqueue the timer. Allow reprogramming of the event device
1608 */ 1629 */
1609 enqueue_hrtimer(timer, new_base, 1); 1630 enqueue_hrtimer(timer, new_base, 1);
1631
1632#ifdef CONFIG_HIGH_RES_TIMERS
1633 /*
1634 * Happens with high res enabled when the timer was
1635 * already expired and the callback mode is
1636 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
1637 * enqueue code does not move them to the soft irq
1638 * pending list for performance/latency reasons, but
1639 * in the migration state, we need to do that
1640 * otherwise we end up with a stale timer.
1641 */
1642 if (timer->state == HRTIMER_STATE_MIGRATE) {
1643 timer->state = HRTIMER_STATE_PENDING;
1644 list_add_tail(&timer->cb_entry,
1645 &new_base->cpu_base->cb_pending);
1646 raise = 1;
1647 }
1648#endif
1649 /* Clear the migration state bit */
1650 timer->state &= ~HRTIMER_STATE_MIGRATE;
1651 }
1652 return raise;
1653}
1654
1655#ifdef CONFIG_HIGH_RES_TIMERS
1656static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1657 struct hrtimer_cpu_base *new_base)
1658{
1659 struct hrtimer *timer;
1660 int raise = 0;
1661
1662 while (!list_empty(&old_base->cb_pending)) {
1663 timer = list_entry(old_base->cb_pending.next,
1664 struct hrtimer, cb_entry);
1665
1666 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
1667 timer->base = &new_base->clock_base[timer->base->index];
1668 list_add_tail(&timer->cb_entry, &new_base->cb_pending);
1669 raise = 1;
1610 } 1670 }
1671 return raise;
1672}
1673#else
1674static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1675 struct hrtimer_cpu_base *new_base)
1676{
1677 return 0;
1611} 1678}
1679#endif
1612 1680
1613static void migrate_hrtimers(int cpu) 1681static void migrate_hrtimers(int cpu)
1614{ 1682{
1615 struct hrtimer_cpu_base *old_base, *new_base; 1683 struct hrtimer_cpu_base *old_base, *new_base;
1616 int i; 1684 int i, raise = 0;
1617 1685
1618 BUG_ON(cpu_online(cpu)); 1686 BUG_ON(cpu_online(cpu));
1619 old_base = &per_cpu(hrtimer_bases, cpu); 1687 old_base = &per_cpu(hrtimer_bases, cpu);
@@ -1626,14 +1694,21 @@ static void migrate_hrtimers(int cpu)
1626 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1694 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1627 1695
1628 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1696 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1629 migrate_hrtimer_list(&old_base->clock_base[i], 1697 if (migrate_hrtimer_list(&old_base->clock_base[i],
1630 &new_base->clock_base[i]); 1698 &new_base->clock_base[i], cpu))
1699 raise = 1;
1631 } 1700 }
1632 1701
1702 if (migrate_hrtimer_pending(old_base, new_base))
1703 raise = 1;
1704
1633 spin_unlock(&old_base->lock); 1705 spin_unlock(&old_base->lock);
1634 spin_unlock(&new_base->lock); 1706 spin_unlock(&new_base->lock);
1635 local_irq_enable(); 1707 local_irq_enable();
1636 put_cpu_var(hrtimer_bases); 1708 put_cpu_var(hrtimer_bases);
1709
1710 if (raise)
1711 hrtimer_raise_softirq();
1637} 1712}
1638#endif /* CONFIG_HOTPLUG_CPU */ 1713#endif /* CONFIG_HOTPLUG_CPU */
1639 1714
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c8a4370e2a34..aef265325cd3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -12,7 +12,7 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/kexec.h> 14#include <linux/kexec.h>
15#include <linux/spinlock.h> 15#include <linux/mutex.h>
16#include <linux/list.h> 16#include <linux/list.h>
17#include <linux/highmem.h> 17#include <linux/highmem.h>
18#include <linux/syscalls.h> 18#include <linux/syscalls.h>
@@ -77,7 +77,7 @@ int kexec_should_crash(struct task_struct *p)
77 * 77 *
78 * The code for the transition from the current kernel to the 78 * The code for the transition from the current kernel to the
79 * the new kernel is placed in the control_code_buffer, whose size 79 * the new kernel is placed in the control_code_buffer, whose size
80 * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single 80 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
81 * page of memory is necessary, but some architectures require more. 81 * page of memory is necessary, but some architectures require more.
82 * Because this memory must be identity mapped in the transition from 82 * Because this memory must be identity mapped in the transition from
83 * virtual to physical addresses it must live in the range 83 * virtual to physical addresses it must live in the range
@@ -242,7 +242,7 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
242 */ 242 */
243 result = -ENOMEM; 243 result = -ENOMEM;
244 image->control_code_page = kimage_alloc_control_pages(image, 244 image->control_code_page = kimage_alloc_control_pages(image,
245 get_order(KEXEC_CONTROL_CODE_SIZE)); 245 get_order(KEXEC_CONTROL_PAGE_SIZE));
246 if (!image->control_code_page) { 246 if (!image->control_code_page) {
247 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 247 printk(KERN_ERR "Could not allocate control_code_buffer\n");
248 goto out; 248 goto out;
@@ -317,7 +317,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
317 */ 317 */
318 result = -ENOMEM; 318 result = -ENOMEM;
319 image->control_code_page = kimage_alloc_control_pages(image, 319 image->control_code_page = kimage_alloc_control_pages(image,
320 get_order(KEXEC_CONTROL_CODE_SIZE)); 320 get_order(KEXEC_CONTROL_PAGE_SIZE));
321 if (!image->control_code_page) { 321 if (!image->control_code_page) {
322 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 322 printk(KERN_ERR "Could not allocate control_code_buffer\n");
323 goto out; 323 goto out;
@@ -753,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image,
753 *old = addr | (*old & ~PAGE_MASK); 753 *old = addr | (*old & ~PAGE_MASK);
754 754
755 /* The old page I have found cannot be a 755 /* The old page I have found cannot be a
756 * destination page, so return it. 756 * destination page, so return it if it's
757 * gfp_flags honor the ones passed in.
757 */ 758 */
759 if (!(gfp_mask & __GFP_HIGHMEM) &&
760 PageHighMem(old_page)) {
761 kimage_free_pages(old_page);
762 continue;
763 }
758 addr = old_addr; 764 addr = old_addr;
759 page = old_page; 765 page = old_page;
760 break; 766 break;
@@ -924,19 +930,14 @@ static int kimage_load_segment(struct kimage *image,
924 */ 930 */
925struct kimage *kexec_image; 931struct kimage *kexec_image;
926struct kimage *kexec_crash_image; 932struct kimage *kexec_crash_image;
927/* 933
928 * A home grown binary mutex. 934static DEFINE_MUTEX(kexec_mutex);
929 * Nothing can wait so this mutex is safe to use
930 * in interrupt context :)
931 */
932static int kexec_lock;
933 935
934asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 936asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
935 struct kexec_segment __user *segments, 937 struct kexec_segment __user *segments,
936 unsigned long flags) 938 unsigned long flags)
937{ 939{
938 struct kimage **dest_image, *image; 940 struct kimage **dest_image, *image;
939 int locked;
940 int result; 941 int result;
941 942
942 /* We only trust the superuser with rebooting the system. */ 943 /* We only trust the superuser with rebooting the system. */
@@ -972,8 +973,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
972 * 973 *
973 * KISS: always take the mutex. 974 * KISS: always take the mutex.
974 */ 975 */
975 locked = xchg(&kexec_lock, 1); 976 if (!mutex_trylock(&kexec_mutex))
976 if (locked)
977 return -EBUSY; 977 return -EBUSY;
978 978
979 dest_image = &kexec_image; 979 dest_image = &kexec_image;
@@ -1015,8 +1015,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
1015 image = xchg(dest_image, image); 1015 image = xchg(dest_image, image);
1016 1016
1017out: 1017out:
1018 locked = xchg(&kexec_lock, 0); /* Release the mutex */ 1018 mutex_unlock(&kexec_mutex);
1019 BUG_ON(!locked);
1020 kimage_free(image); 1019 kimage_free(image);
1021 1020
1022 return result; 1021 return result;
@@ -1063,10 +1062,7 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
1063 1062
1064void crash_kexec(struct pt_regs *regs) 1063void crash_kexec(struct pt_regs *regs)
1065{ 1064{
1066 int locked; 1065 /* Take the kexec_mutex here to prevent sys_kexec_load
1067
1068
1069 /* Take the kexec_lock here to prevent sys_kexec_load
1070 * running on one cpu from replacing the crash kernel 1066 * running on one cpu from replacing the crash kernel
1071 * we are using after a panic on a different cpu. 1067 * we are using after a panic on a different cpu.
1072 * 1068 *
@@ -1074,8 +1070,7 @@ void crash_kexec(struct pt_regs *regs)
1074 * of memory the xchg(&kexec_crash_image) would be 1070 * of memory the xchg(&kexec_crash_image) would be
1075 * sufficient. But since I reuse the memory... 1071 * sufficient. But since I reuse the memory...
1076 */ 1072 */
1077 locked = xchg(&kexec_lock, 1); 1073 if (mutex_trylock(&kexec_mutex)) {
1078 if (!locked) {
1079 if (kexec_crash_image) { 1074 if (kexec_crash_image) {
1080 struct pt_regs fixed_regs; 1075 struct pt_regs fixed_regs;
1081 crash_setup_regs(&fixed_regs, regs); 1076 crash_setup_regs(&fixed_regs, regs);
@@ -1083,8 +1078,7 @@ void crash_kexec(struct pt_regs *regs)
1083 machine_crash_shutdown(&fixed_regs); 1078 machine_crash_shutdown(&fixed_regs);
1084 machine_kexec(kexec_crash_image); 1079 machine_kexec(kexec_crash_image);
1085 } 1080 }
1086 locked = xchg(&kexec_lock, 0); 1081 mutex_unlock(&kexec_mutex);
1087 BUG_ON(!locked);
1088 } 1082 }
1089} 1083}
1090 1084
@@ -1426,25 +1420,23 @@ static int __init crash_save_vmcoreinfo_init(void)
1426 1420
1427module_init(crash_save_vmcoreinfo_init) 1421module_init(crash_save_vmcoreinfo_init)
1428 1422
1429/** 1423/*
1430 * kernel_kexec - reboot the system 1424 * Move into place and start executing a preloaded standalone
1431 * 1425 * executable. If nothing was preloaded return an error.
1432 * Move into place and start executing a preloaded standalone
1433 * executable. If nothing was preloaded return an error.
1434 */ 1426 */
1435int kernel_kexec(void) 1427int kernel_kexec(void)
1436{ 1428{
1437 int error = 0; 1429 int error = 0;
1438 1430
1439 if (xchg(&kexec_lock, 1)) 1431 if (!mutex_trylock(&kexec_mutex))
1440 return -EBUSY; 1432 return -EBUSY;
1441 if (!kexec_image) { 1433 if (!kexec_image) {
1442 error = -EINVAL; 1434 error = -EINVAL;
1443 goto Unlock; 1435 goto Unlock;
1444 } 1436 }
1445 1437
1446 if (kexec_image->preserve_context) {
1447#ifdef CONFIG_KEXEC_JUMP 1438#ifdef CONFIG_KEXEC_JUMP
1439 if (kexec_image->preserve_context) {
1448 mutex_lock(&pm_mutex); 1440 mutex_lock(&pm_mutex);
1449 pm_prepare_console(); 1441 pm_prepare_console();
1450 error = freeze_processes(); 1442 error = freeze_processes();
@@ -1459,6 +1451,7 @@ int kernel_kexec(void)
1459 error = disable_nonboot_cpus(); 1451 error = disable_nonboot_cpus();
1460 if (error) 1452 if (error)
1461 goto Resume_devices; 1453 goto Resume_devices;
1454 device_pm_lock();
1462 local_irq_disable(); 1455 local_irq_disable();
1463 /* At this point, device_suspend() has been called, 1456 /* At this point, device_suspend() has been called,
1464 * but *not* device_power_down(). We *must* 1457 * but *not* device_power_down(). We *must*
@@ -1470,26 +1463,22 @@ int kernel_kexec(void)
1470 error = device_power_down(PMSG_FREEZE); 1463 error = device_power_down(PMSG_FREEZE);
1471 if (error) 1464 if (error)
1472 goto Enable_irqs; 1465 goto Enable_irqs;
1473 save_processor_state(); 1466 } else
1474#endif 1467#endif
1475 } else { 1468 {
1476 blocking_notifier_call_chain(&reboot_notifier_list, 1469 kernel_restart_prepare(NULL);
1477 SYS_RESTART, NULL);
1478 system_state = SYSTEM_RESTART;
1479 device_shutdown();
1480 sysdev_shutdown();
1481 printk(KERN_EMERG "Starting new kernel\n"); 1470 printk(KERN_EMERG "Starting new kernel\n");
1482 machine_shutdown(); 1471 machine_shutdown();
1483 } 1472 }
1484 1473
1485 machine_kexec(kexec_image); 1474 machine_kexec(kexec_image);
1486 1475
1487 if (kexec_image->preserve_context) {
1488#ifdef CONFIG_KEXEC_JUMP 1476#ifdef CONFIG_KEXEC_JUMP
1489 restore_processor_state(); 1477 if (kexec_image->preserve_context) {
1490 device_power_up(PMSG_RESTORE); 1478 device_power_up(PMSG_RESTORE);
1491 Enable_irqs: 1479 Enable_irqs:
1492 local_irq_enable(); 1480 local_irq_enable();
1481 device_pm_unlock();
1493 enable_nonboot_cpus(); 1482 enable_nonboot_cpus();
1494 Resume_devices: 1483 Resume_devices:
1495 device_resume(PMSG_RESTORE); 1484 device_resume(PMSG_RESTORE);
@@ -1499,11 +1488,10 @@ int kernel_kexec(void)
1499 Restore_console: 1488 Restore_console:
1500 pm_restore_console(); 1489 pm_restore_console();
1501 mutex_unlock(&pm_mutex); 1490 mutex_unlock(&pm_mutex);
1502#endif
1503 } 1491 }
1492#endif
1504 1493
1505 Unlock: 1494 Unlock:
1506 xchg(&kexec_lock, 0); 1495 mutex_unlock(&kexec_mutex);
1507
1508 return error; 1496 return error;
1509} 1497}
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index eaa21fc9ad1d..e4dcfb2272a4 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -488,7 +488,7 @@ static int write_mem_msg(int binary)
488 if (err) 488 if (err)
489 return err; 489 return err;
490 if (CACHE_FLUSH_IS_SAFE) 490 if (CACHE_FLUSH_IS_SAFE)
491 flush_icache_range(addr, addr + length + 1); 491 flush_icache_range(addr, addr + length);
492 return 0; 492 return 0;
493 } 493 }
494 494
@@ -590,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs)
590 590
591 /* Signal the primary CPU that we are done: */ 591 /* Signal the primary CPU that we are done: */
592 atomic_set(&cpu_in_kgdb[cpu], 0); 592 atomic_set(&cpu_in_kgdb[cpu], 0);
593 touch_softlockup_watchdog();
593 clocksource_touch_watchdog(); 594 clocksource_touch_watchdog();
594 local_irq_restore(flags); 595 local_irq_restore(flags);
595} 596}
@@ -1432,6 +1433,7 @@ acquirelock:
1432 atomic_read(&kgdb_cpu_doing_single_step) != cpu) { 1433 atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
1433 1434
1434 atomic_set(&kgdb_active, -1); 1435 atomic_set(&kgdb_active, -1);
1436 touch_softlockup_watchdog();
1435 clocksource_touch_watchdog(); 1437 clocksource_touch_watchdog();
1436 local_irq_restore(flags); 1438 local_irq_restore(flags);
1437 1439
@@ -1462,7 +1464,7 @@ acquirelock:
1462 * Get the passive CPU lock which will hold all the non-primary 1464 * Get the passive CPU lock which will hold all the non-primary
1463 * CPU in a spin state while the debugger is active 1465 * CPU in a spin state while the debugger is active
1464 */ 1466 */
1465 if (!kgdb_single_step || !kgdb_contthread) { 1467 if (!kgdb_single_step) {
1466 for (i = 0; i < NR_CPUS; i++) 1468 for (i = 0; i < NR_CPUS; i++)
1467 atomic_set(&passive_cpu_wait[i], 1); 1469 atomic_set(&passive_cpu_wait[i], 1);
1468 } 1470 }
@@ -1475,7 +1477,7 @@ acquirelock:
1475 1477
1476#ifdef CONFIG_SMP 1478#ifdef CONFIG_SMP
1477 /* Signal the other CPUs to enter kgdb_wait() */ 1479 /* Signal the other CPUs to enter kgdb_wait() */
1478 if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) 1480 if ((!kgdb_single_step) && kgdb_do_roundup)
1479 kgdb_roundup_cpus(flags); 1481 kgdb_roundup_cpus(flags);
1480#endif 1482#endif
1481 1483
@@ -1494,7 +1496,7 @@ acquirelock:
1494 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); 1496 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
1495 kgdb_deactivate_sw_breakpoints(); 1497 kgdb_deactivate_sw_breakpoints();
1496 kgdb_single_step = 0; 1498 kgdb_single_step = 0;
1497 kgdb_contthread = NULL; 1499 kgdb_contthread = current;
1498 exception_level = 0; 1500 exception_level = 0;
1499 1501
1500 /* Talk to debugger with gdbserial protocol */ 1502 /* Talk to debugger with gdbserial protocol */
@@ -1508,7 +1510,7 @@ acquirelock:
1508 kgdb_info[ks->cpu].task = NULL; 1510 kgdb_info[ks->cpu].task = NULL;
1509 atomic_set(&cpu_in_kgdb[ks->cpu], 0); 1511 atomic_set(&cpu_in_kgdb[ks->cpu], 0);
1510 1512
1511 if (!kgdb_single_step || !kgdb_contthread) { 1513 if (!kgdb_single_step) {
1512 for (i = NR_CPUS-1; i >= 0; i--) 1514 for (i = NR_CPUS-1; i >= 0; i--)
1513 atomic_set(&passive_cpu_wait[i], 0); 1515 atomic_set(&passive_cpu_wait[i], 0);
1514 /* 1516 /*
@@ -1524,6 +1526,7 @@ acquirelock:
1524kgdb_restore: 1526kgdb_restore:
1525 /* Free kgdb_active */ 1527 /* Free kgdb_active */
1526 atomic_set(&kgdb_active, -1); 1528 atomic_set(&kgdb_active, -1);
1529 touch_softlockup_watchdog();
1527 clocksource_touch_watchdog(); 1530 clocksource_touch_watchdog();
1528 local_irq_restore(flags); 1531 local_irq_restore(flags);
1529 1532
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1aa91fd6b06e..dbda475b13bd 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -875,11 +875,11 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
875 if (!entry) 875 if (!entry)
876 return 0; 876 return 0;
877 877
878 entry->class = this;
879 entry->distance = distance;
880 if (!save_trace(&entry->trace)) 878 if (!save_trace(&entry->trace))
881 return 0; 879 return 0;
882 880
881 entry->class = this;
882 entry->distance = distance;
883 /* 883 /*
884 * Since we never remove from the dependency list, the list can 884 * Since we never remove from the dependency list, the list can
885 * be walked lockless by other CPUs, it's only allocation 885 * be walked lockless by other CPUs, it's only allocation
@@ -1759,11 +1759,10 @@ static void check_chain_key(struct task_struct *curr)
1759 hlock = curr->held_locks + i; 1759 hlock = curr->held_locks + i;
1760 if (chain_key != hlock->prev_chain_key) { 1760 if (chain_key != hlock->prev_chain_key) {
1761 debug_locks_off(); 1761 debug_locks_off();
1762 printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n", 1762 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
1763 curr->lockdep_depth, i, 1763 curr->lockdep_depth, i,
1764 (unsigned long long)chain_key, 1764 (unsigned long long)chain_key,
1765 (unsigned long long)hlock->prev_chain_key); 1765 (unsigned long long)hlock->prev_chain_key);
1766 WARN_ON(1);
1767 return; 1766 return;
1768 } 1767 }
1769 id = hlock->class_idx - 1; 1768 id = hlock->class_idx - 1;
@@ -1778,11 +1777,10 @@ static void check_chain_key(struct task_struct *curr)
1778 } 1777 }
1779 if (chain_key != curr->curr_chain_key) { 1778 if (chain_key != curr->curr_chain_key) {
1780 debug_locks_off(); 1779 debug_locks_off();
1781 printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n", 1780 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
1782 curr->lockdep_depth, i, 1781 curr->lockdep_depth, i,
1783 (unsigned long long)chain_key, 1782 (unsigned long long)chain_key,
1784 (unsigned long long)curr->curr_chain_key); 1783 (unsigned long long)curr->curr_chain_key);
1785 WARN_ON(1);
1786 } 1784 }
1787#endif 1785#endif
1788} 1786}
@@ -2584,7 +2582,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2584 hlock->trylock = trylock; 2582 hlock->trylock = trylock;
2585 hlock->read = read; 2583 hlock->read = read;
2586 hlock->check = check; 2584 hlock->check = check;
2587 hlock->hardirqs_off = hardirqs_off; 2585 hlock->hardirqs_off = !!hardirqs_off;
2588#ifdef CONFIG_LOCK_STAT 2586#ifdef CONFIG_LOCK_STAT
2589 hlock->waittime_stamp = 0; 2587 hlock->waittime_stamp = 0;
2590 hlock->holdtime_stamp = sched_clock(); 2588 hlock->holdtime_stamp = sched_clock();
@@ -3031,7 +3029,7 @@ found_it:
3031 3029
3032 stats = get_lock_stats(hlock_class(hlock)); 3030 stats = get_lock_stats(hlock_class(hlock));
3033 if (point < ARRAY_SIZE(stats->contention_point)) 3031 if (point < ARRAY_SIZE(stats->contention_point))
3034 stats->contention_point[i]++; 3032 stats->contention_point[point]++;
3035 if (lock->cpu != smp_processor_id()) 3033 if (lock->cpu != smp_processor_id())
3036 stats->bounces[bounce_contended + !!hlock->read]++; 3034 stats->bounces[bounce_contended + !!hlock->read]++;
3037 put_lock_stats(stats); 3035 put_lock_stats(stats);
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 55db193d366d..56b196932c08 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -50,8 +50,21 @@ extern unsigned int nr_process_chains;
50extern unsigned int max_lockdep_depth; 50extern unsigned int max_lockdep_depth;
51extern unsigned int max_recursion_depth; 51extern unsigned int max_recursion_depth;
52 52
53#ifdef CONFIG_PROVE_LOCKING
53extern unsigned long lockdep_count_forward_deps(struct lock_class *); 54extern unsigned long lockdep_count_forward_deps(struct lock_class *);
54extern unsigned long lockdep_count_backward_deps(struct lock_class *); 55extern unsigned long lockdep_count_backward_deps(struct lock_class *);
56#else
57static inline unsigned long
58lockdep_count_forward_deps(struct lock_class *class)
59{
60 return 0;
61}
62static inline unsigned long
63lockdep_count_backward_deps(struct lock_class *class)
64{
65 return 0;
66}
67#endif
55 68
56#ifdef CONFIG_DEBUG_LOCKDEP 69#ifdef CONFIG_DEBUG_LOCKDEP
57/* 70/*
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index fa19aee604c2..20dbcbf9c7dd 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -82,7 +82,6 @@ static void print_name(struct seq_file *m, struct lock_class *class)
82 82
83static int l_show(struct seq_file *m, void *v) 83static int l_show(struct seq_file *m, void *v)
84{ 84{
85 unsigned long nr_forward_deps, nr_backward_deps;
86 struct lock_class *class = v; 85 struct lock_class *class = v;
87 struct lock_list *entry; 86 struct lock_list *entry;
88 char c1, c2, c3, c4; 87 char c1, c2, c3, c4;
@@ -96,11 +95,10 @@ static int l_show(struct seq_file *m, void *v)
96#ifdef CONFIG_DEBUG_LOCKDEP 95#ifdef CONFIG_DEBUG_LOCKDEP
97 seq_printf(m, " OPS:%8ld", class->ops); 96 seq_printf(m, " OPS:%8ld", class->ops);
98#endif 97#endif
99 nr_forward_deps = lockdep_count_forward_deps(class); 98#ifdef CONFIG_PROVE_LOCKING
100 seq_printf(m, " FD:%5ld", nr_forward_deps); 99 seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
101 100 seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
102 nr_backward_deps = lockdep_count_backward_deps(class); 101#endif
103 seq_printf(m, " BD:%5ld", nr_backward_deps);
104 102
105 get_usage_chars(class, &c1, &c2, &c3, &c4); 103 get_usage_chars(class, &c1, &c2, &c3, &c4);
106 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); 104 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
@@ -325,7 +323,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
325 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) 323 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
326 nr_hardirq_read_unsafe++; 324 nr_hardirq_read_unsafe++;
327 325
326#ifdef CONFIG_PROVE_LOCKING
328 sum_forward_deps += lockdep_count_forward_deps(class); 327 sum_forward_deps += lockdep_count_forward_deps(class);
328#endif
329 } 329 }
330#ifdef CONFIG_DEBUG_LOCKDEP 330#ifdef CONFIG_DEBUG_LOCKDEP
331 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); 331 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
@@ -472,8 +472,9 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
472{ 472{
473 unsigned long rem; 473 unsigned long rem;
474 474
475 nr += 5; /* for display rounding */
475 rem = do_div(nr, 1000); /* XXX: do_div_signed */ 476 rem = do_div(nr, 1000); /* XXX: do_div_signed */
476 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10); 477 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10);
477} 478}
478 479
479static void seq_time(struct seq_file *m, s64 time) 480static void seq_time(struct seq_file *m, s64 time)
diff --git a/kernel/module.c b/kernel/module.c
index 08864d257eb0..9db11911e04b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1799,7 +1799,7 @@ static void *module_alloc_update_bounds(unsigned long size)
1799 1799
1800/* Allocate and load the module: note that size of section 0 is always 1800/* Allocate and load the module: note that size of section 0 is always
1801 zero, and we rely on this for optional sections. */ 1801 zero, and we rely on this for optional sections. */
1802static struct module *load_module(void __user *umod, 1802static noinline struct module *load_module(void __user *umod,
1803 unsigned long len, 1803 unsigned long len,
1804 const char __user *uargs) 1804 const char __user *uargs)
1805{ 1805{
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 21575fc46d05..1d3ef29a2583 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,7 +14,6 @@
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/version.h>
18#include <linux/nsproxy.h> 17#include <linux/nsproxy.h>
19#include <linux/init_task.h> 18#include <linux/init_task.h>
20#include <linux/mnt_namespace.h> 19#include <linux/mnt_namespace.h>
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index ea567b78d1aa..fab8ea86fac3 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -179,9 +179,6 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
179 rc = sys_wait4(-1, NULL, __WALL, NULL); 179 rc = sys_wait4(-1, NULL, __WALL, NULL);
180 } while (rc != -ECHILD); 180 } while (rc != -ECHILD);
181 181
182
183 /* Child reaper for the pid namespace is going away */
184 pid_ns->child_reaper = NULL;
185 acct_exit_ns(pid_ns); 182 acct_exit_ns(pid_ns);
186 return; 183 return;
187} 184}
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index da9c2dda6a4e..dfdec524d1b7 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -43,7 +43,7 @@
43#include <linux/uaccess.h> 43#include <linux/uaccess.h>
44 44
45/* 45/*
46 * locking rule: all changes to target_value or requirements or notifiers lists 46 * locking rule: all changes to requirements or notifiers lists
47 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 47 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
48 * held, taken with _irqsave. One lock to rule them all 48 * held, taken with _irqsave. One lock to rule them all
49 */ 49 */
@@ -66,7 +66,7 @@ struct pm_qos_object {
66 struct miscdevice pm_qos_power_miscdev; 66 struct miscdevice pm_qos_power_miscdev;
67 char *name; 67 char *name;
68 s32 default_value; 68 s32 default_value;
69 s32 target_value; 69 atomic_t target_value;
70 s32 (*comparitor)(s32, s32); 70 s32 (*comparitor)(s32, s32);
71}; 71};
72 72
@@ -77,7 +77,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
77 .notifiers = &cpu_dma_lat_notifier, 77 .notifiers = &cpu_dma_lat_notifier,
78 .name = "cpu_dma_latency", 78 .name = "cpu_dma_latency",
79 .default_value = 2000 * USEC_PER_SEC, 79 .default_value = 2000 * USEC_PER_SEC,
80 .target_value = 2000 * USEC_PER_SEC, 80 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
81 .comparitor = min_compare 81 .comparitor = min_compare
82}; 82};
83 83
@@ -87,7 +87,7 @@ static struct pm_qos_object network_lat_pm_qos = {
87 .notifiers = &network_lat_notifier, 87 .notifiers = &network_lat_notifier,
88 .name = "network_latency", 88 .name = "network_latency",
89 .default_value = 2000 * USEC_PER_SEC, 89 .default_value = 2000 * USEC_PER_SEC,
90 .target_value = 2000 * USEC_PER_SEC, 90 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
91 .comparitor = min_compare 91 .comparitor = min_compare
92}; 92};
93 93
@@ -99,7 +99,7 @@ static struct pm_qos_object network_throughput_pm_qos = {
99 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput", 100 .name = "network_throughput",
101 .default_value = 0, 101 .default_value = 0,
102 .target_value = 0, 102 .target_value = ATOMIC_INIT(0),
103 .comparitor = max_compare 103 .comparitor = max_compare
104}; 104};
105 105
@@ -150,11 +150,11 @@ static void update_target(int target)
150 extreme_value = pm_qos_array[target]->comparitor( 150 extreme_value = pm_qos_array[target]->comparitor(
151 extreme_value, node->value); 151 extreme_value, node->value);
152 } 152 }
153 if (pm_qos_array[target]->target_value != extreme_value) { 153 if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) {
154 call_notifier = 1; 154 call_notifier = 1;
155 pm_qos_array[target]->target_value = extreme_value; 155 atomic_set(&pm_qos_array[target]->target_value, extreme_value);
156 pr_debug(KERN_ERR "new target for qos %d is %d\n", target, 156 pr_debug(KERN_ERR "new target for qos %d is %d\n", target,
157 pm_qos_array[target]->target_value); 157 atomic_read(&pm_qos_array[target]->target_value));
158 } 158 }
159 spin_unlock_irqrestore(&pm_qos_lock, flags); 159 spin_unlock_irqrestore(&pm_qos_lock, flags);
160 160
@@ -193,14 +193,7 @@ static int find_pm_qos_object_by_minor(int minor)
193 */ 193 */
194int pm_qos_requirement(int pm_qos_class) 194int pm_qos_requirement(int pm_qos_class)
195{ 195{
196 int ret_val; 196 return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
197 unsigned long flags;
198
199 spin_lock_irqsave(&pm_qos_lock, flags);
200 ret_val = pm_qos_array[pm_qos_class]->target_value;
201 spin_unlock_irqrestore(&pm_qos_lock, flags);
202
203 return ret_val;
204} 197}
205EXPORT_SYMBOL_GPL(pm_qos_requirement); 198EXPORT_SYMBOL_GPL(pm_qos_requirement);
206 199
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e36d5798cbff..5131e5471169 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -441,7 +441,7 @@ static struct k_itimer * alloc_posix_timer(void)
441 return tmr; 441 return tmr;
442 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { 442 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
443 kmem_cache_free(posix_timers_cache, tmr); 443 kmem_cache_free(posix_timers_cache, tmr);
444 tmr = NULL; 444 return NULL;
445 } 445 }
446 memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); 446 memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
447 return tmr; 447 return tmr;
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f011e0870b52..bbd85c60f741 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -21,6 +21,7 @@
21#include <linux/console.h> 21#include <linux/console.h>
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/freezer.h> 23#include <linux/freezer.h>
24#include <linux/ftrace.h>
24 25
25#include "power.h" 26#include "power.h"
26 27
@@ -255,7 +256,7 @@ static int create_image(int platform_mode)
255 256
256int hibernation_snapshot(int platform_mode) 257int hibernation_snapshot(int platform_mode)
257{ 258{
258 int error; 259 int error, ftrace_save;
259 260
260 /* Free memory before shutting down devices. */ 261 /* Free memory before shutting down devices. */
261 error = swsusp_shrink_memory(); 262 error = swsusp_shrink_memory();
@@ -267,6 +268,7 @@ int hibernation_snapshot(int platform_mode)
267 goto Close; 268 goto Close;
268 269
269 suspend_console(); 270 suspend_console();
271 ftrace_save = __ftrace_enabled_save();
270 error = device_suspend(PMSG_FREEZE); 272 error = device_suspend(PMSG_FREEZE);
271 if (error) 273 if (error)
272 goto Recover_platform; 274 goto Recover_platform;
@@ -296,6 +298,7 @@ int hibernation_snapshot(int platform_mode)
296 Resume_devices: 298 Resume_devices:
297 device_resume(in_suspend ? 299 device_resume(in_suspend ?
298 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 300 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
301 __ftrace_enabled_restore(ftrace_save);
299 resume_console(); 302 resume_console();
300 Close: 303 Close:
301 platform_end(platform_mode); 304 platform_end(platform_mode);
@@ -366,10 +369,11 @@ static int resume_target_kernel(void)
366 369
367int hibernation_restore(int platform_mode) 370int hibernation_restore(int platform_mode)
368{ 371{
369 int error; 372 int error, ftrace_save;
370 373
371 pm_prepare_console(); 374 pm_prepare_console();
372 suspend_console(); 375 suspend_console();
376 ftrace_save = __ftrace_enabled_save();
373 error = device_suspend(PMSG_QUIESCE); 377 error = device_suspend(PMSG_QUIESCE);
374 if (error) 378 if (error)
375 goto Finish; 379 goto Finish;
@@ -384,6 +388,7 @@ int hibernation_restore(int platform_mode)
384 platform_restore_cleanup(platform_mode); 388 platform_restore_cleanup(platform_mode);
385 device_resume(PMSG_RECOVER); 389 device_resume(PMSG_RECOVER);
386 Finish: 390 Finish:
391 __ftrace_enabled_restore(ftrace_save);
387 resume_console(); 392 resume_console();
388 pm_restore_console(); 393 pm_restore_console();
389 return error; 394 return error;
@@ -396,7 +401,7 @@ int hibernation_restore(int platform_mode)
396 401
397int hibernation_platform_enter(void) 402int hibernation_platform_enter(void)
398{ 403{
399 int error; 404 int error, ftrace_save;
400 405
401 if (!hibernation_ops) 406 if (!hibernation_ops)
402 return -ENOSYS; 407 return -ENOSYS;
@@ -411,6 +416,7 @@ int hibernation_platform_enter(void)
411 goto Close; 416 goto Close;
412 417
413 suspend_console(); 418 suspend_console();
419 ftrace_save = __ftrace_enabled_save();
414 error = device_suspend(PMSG_HIBERNATE); 420 error = device_suspend(PMSG_HIBERNATE);
415 if (error) { 421 if (error) {
416 if (hibernation_ops->recover) 422 if (hibernation_ops->recover)
@@ -445,6 +451,7 @@ int hibernation_platform_enter(void)
445 hibernation_ops->finish(); 451 hibernation_ops->finish();
446 Resume_devices: 452 Resume_devices:
447 device_resume(PMSG_RESTORE); 453 device_resume(PMSG_RESTORE);
454 __ftrace_enabled_restore(ftrace_save);
448 resume_console(); 455 resume_console();
449 Close: 456 Close:
450 hibernation_ops->end(); 457 hibernation_ops->end();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0b7476f5d2a6..540b16b68565 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -21,6 +21,7 @@
21#include <linux/freezer.h> 21#include <linux/freezer.h>
22#include <linux/vmstat.h> 22#include <linux/vmstat.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/ftrace.h>
24 25
25#include "power.h" 26#include "power.h"
26 27
@@ -310,7 +311,7 @@ static int suspend_enter(suspend_state_t state)
310 */ 311 */
311int suspend_devices_and_enter(suspend_state_t state) 312int suspend_devices_and_enter(suspend_state_t state)
312{ 313{
313 int error; 314 int error, ftrace_save;
314 315
315 if (!suspend_ops) 316 if (!suspend_ops)
316 return -ENOSYS; 317 return -ENOSYS;
@@ -321,6 +322,7 @@ int suspend_devices_and_enter(suspend_state_t state)
321 goto Close; 322 goto Close;
322 } 323 }
323 suspend_console(); 324 suspend_console();
325 ftrace_save = __ftrace_enabled_save();
324 suspend_test_start(); 326 suspend_test_start();
325 error = device_suspend(PMSG_SUSPEND); 327 error = device_suspend(PMSG_SUSPEND);
326 if (error) { 328 if (error) {
@@ -352,6 +354,7 @@ int suspend_devices_and_enter(suspend_state_t state)
352 suspend_test_start(); 354 suspend_test_start();
353 device_resume(PMSG_RESUME); 355 device_resume(PMSG_RESUME);
354 suspend_test_finish("resume devices"); 356 suspend_test_finish("resume devices");
357 __ftrace_enabled_restore(ftrace_save);
355 resume_console(); 358 resume_console();
356 Close: 359 Close:
357 if (suspend_ops->end) 360 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a0abf9a463f9..80ccac849e46 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -14,7 +14,6 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/file.h> 15#include <linux/file.h>
16#include <linux/utsname.h> 16#include <linux/utsname.h>
17#include <linux/version.h>
18#include <linux/delay.h> 17#include <linux/delay.h>
19#include <linux/bitops.h> 18#include <linux/bitops.h>
20#include <linux/genhd.h> 19#include <linux/genhd.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 082b3fcb32a0..356699a96d56 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -140,7 +140,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
140 if (!dumpable && !capable(CAP_SYS_PTRACE)) 140 if (!dumpable && !capable(CAP_SYS_PTRACE))
141 return -EPERM; 141 return -EPERM;
142 142
143 return security_ptrace(current, task, mode); 143 return security_ptrace_may_access(task, mode);
144} 144}
145 145
146bool ptrace_may_access(struct task_struct *task, unsigned int mode) 146bool ptrace_may_access(struct task_struct *task, unsigned int mode)
@@ -499,8 +499,7 @@ repeat:
499 goto repeat; 499 goto repeat;
500 } 500 }
501 501
502 ret = security_ptrace(current->parent, current, 502 ret = security_ptrace_traceme(current->parent);
503 PTRACE_MODE_ATTACH);
504 503
505 /* 504 /*
506 * Set the ptrace bit in the process ptrace flags. 505 * Set the ptrace bit in the process ptrace flags.
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f14f372cf6f5..467d5940f624 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -77,6 +77,7 @@ void wakeme_after_rcu(struct rcu_head *head)
77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
78 * and may be nested. 78 * and may be nested.
79 */ 79 */
80void synchronize_rcu(void); /* Makes kernel-doc tools happy */
80synchronize_rcu_xxx(synchronize_rcu, call_rcu) 81synchronize_rcu_xxx(synchronize_rcu, call_rcu)
81EXPORT_SYMBOL_GPL(synchronize_rcu); 82EXPORT_SYMBOL_GPL(synchronize_rcu);
82 83
diff --git a/kernel/resource.c b/kernel/resource.c
index f5b518eabefe..03d796c1b2e9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -362,35 +362,21 @@ int allocate_resource(struct resource *root, struct resource *new,
362 362
363EXPORT_SYMBOL(allocate_resource); 363EXPORT_SYMBOL(allocate_resource);
364 364
365/** 365/*
366 * insert_resource - Inserts a resource in the resource tree 366 * Insert a resource into the resource tree. If successful, return NULL,
367 * @parent: parent of the new resource 367 * otherwise return the conflicting resource (compare to __request_resource())
368 * @new: new resource to insert
369 *
370 * Returns 0 on success, -EBUSY if the resource can't be inserted.
371 *
372 * This function is equivalent to request_resource when no conflict
373 * happens. If a conflict happens, and the conflicting resources
374 * entirely fit within the range of the new resource, then the new
375 * resource is inserted and the conflicting resources become children of
376 * the new resource.
377 */ 368 */
378int insert_resource(struct resource *parent, struct resource *new) 369static struct resource * __insert_resource(struct resource *parent, struct resource *new)
379{ 370{
380 int result;
381 struct resource *first, *next; 371 struct resource *first, *next;
382 372
383 write_lock(&resource_lock);
384
385 for (;; parent = first) { 373 for (;; parent = first) {
386 result = 0;
387 first = __request_resource(parent, new); 374 first = __request_resource(parent, new);
388 if (!first) 375 if (!first)
389 goto out; 376 return first;
390 377
391 result = -EBUSY;
392 if (first == parent) 378 if (first == parent)
393 goto out; 379 return first;
394 380
395 if ((first->start > new->start) || (first->end < new->end)) 381 if ((first->start > new->start) || (first->end < new->end))
396 break; 382 break;
@@ -401,15 +387,13 @@ int insert_resource(struct resource *parent, struct resource *new)
401 for (next = first; ; next = next->sibling) { 387 for (next = first; ; next = next->sibling) {
402 /* Partial overlap? Bad, and unfixable */ 388 /* Partial overlap? Bad, and unfixable */
403 if (next->start < new->start || next->end > new->end) 389 if (next->start < new->start || next->end > new->end)
404 goto out; 390 return next;
405 if (!next->sibling) 391 if (!next->sibling)
406 break; 392 break;
407 if (next->sibling->start > new->end) 393 if (next->sibling->start > new->end)
408 break; 394 break;
409 } 395 }
410 396
411 result = 0;
412
413 new->parent = parent; 397 new->parent = parent;
414 new->sibling = next->sibling; 398 new->sibling = next->sibling;
415 new->child = first; 399 new->child = first;
@@ -426,10 +410,64 @@ int insert_resource(struct resource *parent, struct resource *new)
426 next = next->sibling; 410 next = next->sibling;
427 next->sibling = new; 411 next->sibling = new;
428 } 412 }
413 return NULL;
414}
429 415
430 out: 416/**
417 * insert_resource - Inserts a resource in the resource tree
418 * @parent: parent of the new resource
419 * @new: new resource to insert
420 *
421 * Returns 0 on success, -EBUSY if the resource can't be inserted.
422 *
423 * This function is equivalent to request_resource when no conflict
424 * happens. If a conflict happens, and the conflicting resources
425 * entirely fit within the range of the new resource, then the new
426 * resource is inserted and the conflicting resources become children of
427 * the new resource.
428 */
429int insert_resource(struct resource *parent, struct resource *new)
430{
431 struct resource *conflict;
432
433 write_lock(&resource_lock);
434 conflict = __insert_resource(parent, new);
435 write_unlock(&resource_lock);
436 return conflict ? -EBUSY : 0;
437}
438
439/**
440 * insert_resource_expand_to_fit - Insert a resource into the resource tree
441 * @root: root resource descriptor
442 * @new: new resource to insert
443 *
444 * Insert a resource into the resource tree, possibly expanding it in order
445 * to make it encompass any conflicting resources.
446 */
447void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
448{
449 if (new->parent)
450 return;
451
452 write_lock(&resource_lock);
453 for (;;) {
454 struct resource *conflict;
455
456 conflict = __insert_resource(root, new);
457 if (!conflict)
458 break;
459 if (conflict == root)
460 break;
461
462 /* Ok, expand resource to cover the conflict, then try again .. */
463 if (conflict->start < new->start)
464 new->start = conflict->start;
465 if (conflict->end > new->end)
466 new->end = conflict->end;
467
468 printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
469 }
431 write_unlock(&resource_lock); 470 write_unlock(&resource_lock);
432 return result;
433} 471}
434 472
435/** 473/**
diff --git a/kernel/sched.c b/kernel/sched.c
index d601fb0406ca..ad1962dc0aa2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -201,7 +201,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
201 hrtimer_init(&rt_b->rt_period_timer, 201 hrtimer_init(&rt_b->rt_period_timer,
202 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 202 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
203 rt_b->rt_period_timer.function = sched_rt_period_timer; 203 rt_b->rt_period_timer.function = sched_rt_period_timer;
204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
205} 205}
206 206
207static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 207static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
@@ -808,9 +808,9 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
808 808
809/* 809/*
810 * ratelimit for updating the group shares. 810 * ratelimit for updating the group shares.
811 * default: 0.5ms 811 * default: 0.25ms
812 */ 812 */
813const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; 813unsigned int sysctl_sched_shares_ratelimit = 250000;
814 814
815/* 815/*
816 * period over which we measure -rt task cpu usage in us. 816 * period over which we measure -rt task cpu usage in us.
@@ -1087,7 +1087,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1087 return NOTIFY_DONE; 1087 return NOTIFY_DONE;
1088} 1088}
1089 1089
1090static void init_hrtick(void) 1090static __init void init_hrtick(void)
1091{ 1091{
1092 hotcpu_notifier(hotplug_hrtick, 0); 1092 hotcpu_notifier(hotplug_hrtick, 0);
1093} 1093}
@@ -1119,7 +1119,7 @@ static void init_rq_hrtick(struct rq *rq)
1119 1119
1120 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1120 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1121 rq->hrtick_timer.function = hrtick; 1121 rq->hrtick_timer.function = hrtick;
1122 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 1122 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1123} 1123}
1124#else 1124#else
1125static inline void hrtick_clear(struct rq *rq) 1125static inline void hrtick_clear(struct rq *rq)
@@ -4179,6 +4179,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
4179} 4179}
4180 4180
4181/* 4181/*
4182 * Use precise platform statistics if available:
4183 */
4184#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4185cputime_t task_utime(struct task_struct *p)
4186{
4187 return p->utime;
4188}
4189
4190cputime_t task_stime(struct task_struct *p)
4191{
4192 return p->stime;
4193}
4194#else
4195cputime_t task_utime(struct task_struct *p)
4196{
4197 clock_t utime = cputime_to_clock_t(p->utime),
4198 total = utime + cputime_to_clock_t(p->stime);
4199 u64 temp;
4200
4201 /*
4202 * Use CFS's precise accounting:
4203 */
4204 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4205
4206 if (total) {
4207 temp *= utime;
4208 do_div(temp, total);
4209 }
4210 utime = (clock_t)temp;
4211
4212 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4213 return p->prev_utime;
4214}
4215
4216cputime_t task_stime(struct task_struct *p)
4217{
4218 clock_t stime;
4219
4220 /*
4221 * Use CFS's precise accounting. (we subtract utime from
4222 * the total, to make sure the total observed by userspace
4223 * grows monotonically - apps rely on that):
4224 */
4225 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4226 cputime_to_clock_t(task_utime(p));
4227
4228 if (stime >= 0)
4229 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4230
4231 return p->prev_stime;
4232}
4233#endif
4234
4235inline cputime_t task_gtime(struct task_struct *p)
4236{
4237 return p->gtime;
4238}
4239
4240/*
4182 * This function gets called by the timer code, with HZ frequency. 4241 * This function gets called by the timer code, with HZ frequency.
4183 * We call it with interrupts disabled. 4242 * We call it with interrupts disabled.
4184 * 4243 *
@@ -4669,6 +4728,52 @@ int __sched wait_for_completion_killable(struct completion *x)
4669} 4728}
4670EXPORT_SYMBOL(wait_for_completion_killable); 4729EXPORT_SYMBOL(wait_for_completion_killable);
4671 4730
4731/**
4732 * try_wait_for_completion - try to decrement a completion without blocking
4733 * @x: completion structure
4734 *
4735 * Returns: 0 if a decrement cannot be done without blocking
4736 * 1 if a decrement succeeded.
4737 *
4738 * If a completion is being used as a counting completion,
4739 * attempt to decrement the counter without blocking. This
4740 * enables us to avoid waiting if the resource the completion
4741 * is protecting is not available.
4742 */
4743bool try_wait_for_completion(struct completion *x)
4744{
4745 int ret = 1;
4746
4747 spin_lock_irq(&x->wait.lock);
4748 if (!x->done)
4749 ret = 0;
4750 else
4751 x->done--;
4752 spin_unlock_irq(&x->wait.lock);
4753 return ret;
4754}
4755EXPORT_SYMBOL(try_wait_for_completion);
4756
4757/**
4758 * completion_done - Test to see if a completion has any waiters
4759 * @x: completion structure
4760 *
4761 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4762 * 1 if there are no waiters.
4763 *
4764 */
4765bool completion_done(struct completion *x)
4766{
4767 int ret = 1;
4768
4769 spin_lock_irq(&x->wait.lock);
4770 if (!x->done)
4771 ret = 0;
4772 spin_unlock_irq(&x->wait.lock);
4773 return ret;
4774}
4775EXPORT_SYMBOL(completion_done);
4776
4672static long __sched 4777static long __sched
4673sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4778sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4674{ 4779{
@@ -5740,6 +5845,8 @@ static inline void sched_init_granularity(void)
5740 sysctl_sched_latency = limit; 5845 sysctl_sched_latency = limit;
5741 5846
5742 sysctl_sched_wakeup_granularity *= factor; 5847 sysctl_sched_wakeup_granularity *= factor;
5848
5849 sysctl_sched_shares_ratelimit *= factor;
5743} 5850}
5744 5851
5745#ifdef CONFIG_SMP 5852#ifdef CONFIG_SMP
@@ -7589,24 +7696,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7589 * and partition_sched_domains() will fallback to the single partition 7696 * and partition_sched_domains() will fallback to the single partition
7590 * 'fallback_doms', it also forces the domains to be rebuilt. 7697 * 'fallback_doms', it also forces the domains to be rebuilt.
7591 * 7698 *
7699 * If doms_new==NULL it will be replaced with cpu_online_map.
7700 * ndoms_new==0 is a special case for destroying existing domains.
7701 * It will not create the default domain.
7702 *
7592 * Call with hotplug lock held 7703 * Call with hotplug lock held
7593 */ 7704 */
7594void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7705void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7595 struct sched_domain_attr *dattr_new) 7706 struct sched_domain_attr *dattr_new)
7596{ 7707{
7597 int i, j; 7708 int i, j, n;
7598 7709
7599 mutex_lock(&sched_domains_mutex); 7710 mutex_lock(&sched_domains_mutex);
7600 7711
7601 /* always unregister in case we don't destroy any domains */ 7712 /* always unregister in case we don't destroy any domains */
7602 unregister_sched_domain_sysctl(); 7713 unregister_sched_domain_sysctl();
7603 7714
7604 if (doms_new == NULL) 7715 n = doms_new ? ndoms_new : 0;
7605 ndoms_new = 0;
7606 7716
7607 /* Destroy deleted domains */ 7717 /* Destroy deleted domains */
7608 for (i = 0; i < ndoms_cur; i++) { 7718 for (i = 0; i < ndoms_cur; i++) {
7609 for (j = 0; j < ndoms_new; j++) { 7719 for (j = 0; j < n; j++) {
7610 if (cpus_equal(doms_cur[i], doms_new[j]) 7720 if (cpus_equal(doms_cur[i], doms_new[j])
7611 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7721 && dattrs_equal(dattr_cur, i, dattr_new, j))
7612 goto match1; 7722 goto match1;
@@ -7619,7 +7729,6 @@ match1:
7619 7729
7620 if (doms_new == NULL) { 7730 if (doms_new == NULL) {
7621 ndoms_cur = 0; 7731 ndoms_cur = 0;
7622 ndoms_new = 1;
7623 doms_new = &fallback_doms; 7732 doms_new = &fallback_doms;
7624 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7733 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7625 dattr_new = NULL; 7734 dattr_new = NULL;
@@ -7656,8 +7765,13 @@ match2:
7656int arch_reinit_sched_domains(void) 7765int arch_reinit_sched_domains(void)
7657{ 7766{
7658 get_online_cpus(); 7767 get_online_cpus();
7768
7769 /* Destroy domains first to force the rebuild */
7770 partition_sched_domains(0, NULL, NULL);
7771
7659 rebuild_sched_domains(); 7772 rebuild_sched_domains();
7660 put_online_cpus(); 7773 put_online_cpus();
7774
7661 return 0; 7775 return 0;
7662} 7776}
7663 7777
@@ -7741,7 +7855,7 @@ static int update_sched_domains(struct notifier_block *nfb,
7741 case CPU_ONLINE_FROZEN: 7855 case CPU_ONLINE_FROZEN:
7742 case CPU_DEAD: 7856 case CPU_DEAD:
7743 case CPU_DEAD_FROZEN: 7857 case CPU_DEAD_FROZEN:
7744 partition_sched_domains(0, NULL, NULL); 7858 partition_sched_domains(1, NULL, NULL);
7745 return NOTIFY_OK; 7859 return NOTIFY_OK;
7746 7860
7747 default: 7861 default:
@@ -8462,8 +8576,8 @@ struct task_group *sched_create_group(struct task_group *parent)
8462 WARN_ON(!parent); /* root should already exist */ 8576 WARN_ON(!parent); /* root should already exist */
8463 8577
8464 tg->parent = parent; 8578 tg->parent = parent;
8465 list_add_rcu(&tg->siblings, &parent->children);
8466 INIT_LIST_HEAD(&tg->children); 8579 INIT_LIST_HEAD(&tg->children);
8580 list_add_rcu(&tg->siblings, &parent->children);
8467 spin_unlock_irqrestore(&task_group_lock, flags); 8581 spin_unlock_irqrestore(&task_group_lock, flags);
8468 8582
8469 return tg; 8583 return tg;
@@ -8795,6 +8909,9 @@ static int sched_rt_global_constraints(void)
8795 u64 rt_runtime, rt_period; 8909 u64 rt_runtime, rt_period;
8796 int ret = 0; 8910 int ret = 0;
8797 8911
8912 if (sysctl_sched_rt_period <= 0)
8913 return -EINVAL;
8914
8798 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8915 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8799 rt_runtime = tg->rt_bandwidth.rt_runtime; 8916 rt_runtime = tg->rt_bandwidth.rt_runtime;
8800 8917
@@ -8811,6 +8928,9 @@ static int sched_rt_global_constraints(void)
8811 unsigned long flags; 8928 unsigned long flags;
8812 int i; 8929 int i;
8813 8930
8931 if (sysctl_sched_rt_period <= 0)
8932 return -EINVAL;
8933
8814 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 8934 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8815 for_each_possible_cpu(i) { 8935 for_each_possible_cpu(i) {
8816 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 8936 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 204991a0bfa7..e8ab096ddfe3 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -12,19 +12,17 @@
12 * 12 *
13 * Create a semi stable clock from a mixture of other events, including: 13 * Create a semi stable clock from a mixture of other events, including:
14 * - gtod 14 * - gtod
15 * - jiffies
16 * - sched_clock() 15 * - sched_clock()
17 * - explicit idle events 16 * - explicit idle events
18 * 17 *
19 * We use gtod as base and the unstable clock deltas. The deltas are filtered, 18 * We use gtod as base and the unstable clock deltas. The deltas are filtered,
20 * making it monotonic and keeping it within an expected window. This window 19 * making it monotonic and keeping it within an expected window.
21 * is set up using jiffies.
22 * 20 *
23 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 21 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
24 * that is otherwise invisible (TSC gets stopped). 22 * that is otherwise invisible (TSC gets stopped).
25 * 23 *
26 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat 24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
27 * consistent between cpus (never more than 1 jiffies difference). 25 * consistent between cpus (never more than 2 jiffies difference).
28 */ 26 */
29#include <linux/sched.h> 27#include <linux/sched.h>
30#include <linux/percpu.h> 28#include <linux/percpu.h>
@@ -54,7 +52,6 @@ struct sched_clock_data {
54 */ 52 */
55 raw_spinlock_t lock; 53 raw_spinlock_t lock;
56 54
57 unsigned long tick_jiffies;
58 u64 tick_raw; 55 u64 tick_raw;
59 u64 tick_gtod; 56 u64 tick_gtod;
60 u64 clock; 57 u64 clock;
@@ -75,14 +72,12 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
75void sched_clock_init(void) 72void sched_clock_init(void)
76{ 73{
77 u64 ktime_now = ktime_to_ns(ktime_get()); 74 u64 ktime_now = ktime_to_ns(ktime_get());
78 unsigned long now_jiffies = jiffies;
79 int cpu; 75 int cpu;
80 76
81 for_each_possible_cpu(cpu) { 77 for_each_possible_cpu(cpu) {
82 struct sched_clock_data *scd = cpu_sdc(cpu); 78 struct sched_clock_data *scd = cpu_sdc(cpu);
83 79
84 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 80 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
85 scd->tick_jiffies = now_jiffies;
86 scd->tick_raw = 0; 81 scd->tick_raw = 0;
87 scd->tick_gtod = ktime_now; 82 scd->tick_gtod = ktime_now;
88 scd->clock = ktime_now; 83 scd->clock = ktime_now;
@@ -92,46 +87,51 @@ void sched_clock_init(void)
92} 87}
93 88
94/* 89/*
90 * min,max except they take wrapping into account
91 */
92
93static inline u64 wrap_min(u64 x, u64 y)
94{
95 return (s64)(x - y) < 0 ? x : y;
96}
97
98static inline u64 wrap_max(u64 x, u64 y)
99{
100 return (s64)(x - y) > 0 ? x : y;
101}
102
103/*
95 * update the percpu scd from the raw @now value 104 * update the percpu scd from the raw @now value
96 * 105 *
97 * - filter out backward motion 106 * - filter out backward motion
98 * - use jiffies to generate a min,max window to clip the raw values 107 * - use the GTOD tick value to create a window to filter crazy TSC values
99 */ 108 */
100static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) 109static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
101{ 110{
102 unsigned long now_jiffies = jiffies;
103 long delta_jiffies = now_jiffies - scd->tick_jiffies;
104 u64 clock = scd->clock;
105 u64 min_clock, max_clock;
106 s64 delta = now - scd->tick_raw; 111 s64 delta = now - scd->tick_raw;
112 u64 clock, min_clock, max_clock;
107 113
108 WARN_ON_ONCE(!irqs_disabled()); 114 WARN_ON_ONCE(!irqs_disabled());
109 min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
110 115
111 if (unlikely(delta < 0)) { 116 if (unlikely(delta < 0))
112 clock++; 117 delta = 0;
113 goto out;
114 }
115 118
116 max_clock = min_clock + TICK_NSEC; 119 /*
120 * scd->clock = clamp(scd->tick_gtod + delta,
121 * max(scd->tick_gtod, scd->clock),
122 * scd->tick_gtod + TICK_NSEC);
123 */
117 124
118 if (unlikely(clock + delta > max_clock)) { 125 clock = scd->tick_gtod + delta;
119 if (clock < max_clock) 126 min_clock = wrap_max(scd->tick_gtod, scd->clock);
120 clock = max_clock; 127 max_clock = scd->tick_gtod + TICK_NSEC;
121 else
122 clock++;
123 } else {
124 clock += delta;
125 }
126 128
127 out: 129 clock = wrap_max(clock, min_clock);
128 if (unlikely(clock < min_clock)) 130 clock = wrap_min(clock, max_clock);
129 clock = min_clock;
130 131
131 scd->tick_jiffies = now_jiffies;
132 scd->clock = clock; 132 scd->clock = clock;
133 133
134 return clock; 134 return scd->clock;
135} 135}
136 136
137static void lock_double_clock(struct sched_clock_data *data1, 137static void lock_double_clock(struct sched_clock_data *data1,
@@ -171,7 +171,7 @@ u64 sched_clock_cpu(int cpu)
171 * larger time as the latest time for both 171 * larger time as the latest time for both
172 * runqueues. (this creates monotonic movement) 172 * runqueues. (this creates monotonic movement)
173 */ 173 */
174 if (likely(remote_clock < this_clock)) { 174 if (likely((s64)(remote_clock - this_clock) < 0)) {
175 clock = this_clock; 175 clock = this_clock;
176 scd->clock = clock; 176 scd->clock = clock;
177 } else { 177 } else {
@@ -207,14 +207,9 @@ void sched_clock_tick(void)
207 now = sched_clock(); 207 now = sched_clock();
208 208
209 __raw_spin_lock(&scd->lock); 209 __raw_spin_lock(&scd->lock);
210 __update_sched_clock(scd, now);
211 /*
212 * update tick_gtod after __update_sched_clock() because that will
213 * already observe 1 new jiffy; adding a new tick_gtod to that would
214 * increase the clock 2 jiffies.
215 */
216 scd->tick_raw = now; 210 scd->tick_raw = now;
217 scd->tick_gtod = now_gtod; 211 scd->tick_gtod = now_gtod;
212 __update_sched_clock(scd, now);
218 __raw_spin_unlock(&scd->lock); 213 __raw_spin_unlock(&scd->lock);
219} 214}
220 215
@@ -232,18 +227,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
232 */ 227 */
233void sched_clock_idle_wakeup_event(u64 delta_ns) 228void sched_clock_idle_wakeup_event(u64 delta_ns)
234{ 229{
235 struct sched_clock_data *scd = this_scd(); 230 sched_clock_tick();
236
237 /*
238 * Override the previous timestamp and ignore all
239 * sched_clock() deltas that occured while we idled,
240 * and use the PM-provided delta_ns to advance the
241 * rq clock:
242 */
243 __raw_spin_lock(&scd->lock);
244 scd->clock += delta_ns;
245 __raw_spin_unlock(&scd->lock);
246
247 touch_softlockup_watchdog(); 231 touch_softlockup_watchdog();
248} 232}
249EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 233EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 862b06bd560a..9353ca78154e 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -8,6 +8,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
8SCHED_FEAT(HRTICK, 1) 8SCHED_FEAT(HRTICK, 1)
9SCHED_FEAT(DOUBLE_TICK, 0) 9SCHED_FEAT(DOUBLE_TICK, 0)
10SCHED_FEAT(ASYM_GRAN, 1) 10SCHED_FEAT(ASYM_GRAN, 1)
11SCHED_FEAT(LB_BIAS, 0) 11SCHED_FEAT(LB_BIAS, 1)
12SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 12SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1) 13SCHED_FEAT(ASYM_EFF_LOAD, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 6163e4cf885b..1113157b2058 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -199,6 +199,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
199 199
200static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 if (rt_rq->rt_nr_running)
203 resched_task(rq_of_rt_rq(rt_rq)->curr);
202} 204}
203 205
204static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 206static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -298,7 +300,7 @@ static void __disable_runtime(struct rq *rq)
298 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 300 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
299 s64 diff; 301 s64 diff;
300 302
301 if (iter == rt_rq) 303 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
302 continue; 304 continue;
303 305
304 spin_lock(&iter->rt_runtime_lock); 306 spin_lock(&iter->rt_runtime_lock);
@@ -348,6 +350,7 @@ static void __enable_runtime(struct rq *rq)
348 spin_lock(&rt_rq->rt_runtime_lock); 350 spin_lock(&rt_rq->rt_runtime_lock);
349 rt_rq->rt_runtime = rt_b->rt_runtime; 351 rt_rq->rt_runtime = rt_b->rt_runtime;
350 rt_rq->rt_time = 0; 352 rt_rq->rt_time = 0;
353 rt_rq->rt_throttled = 0;
351 spin_unlock(&rt_rq->rt_runtime_lock); 354 spin_unlock(&rt_rq->rt_runtime_lock);
352 spin_unlock(&rt_b->rt_runtime_lock); 355 spin_unlock(&rt_b->rt_runtime_lock);
353 } 356 }
@@ -438,9 +441,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
438{ 441{
439 u64 runtime = sched_rt_runtime(rt_rq); 442 u64 runtime = sched_rt_runtime(rt_rq);
440 443
441 if (runtime == RUNTIME_INF)
442 return 0;
443
444 if (rt_rq->rt_throttled) 444 if (rt_rq->rt_throttled)
445 return rt_rq_throttled(rt_rq); 445 return rt_rq_throttled(rt_rq);
446 446
@@ -491,9 +491,11 @@ static void update_curr_rt(struct rq *rq)
491 rt_rq = rt_rq_of_se(rt_se); 491 rt_rq = rt_rq_of_se(rt_se);
492 492
493 spin_lock(&rt_rq->rt_runtime_lock); 493 spin_lock(&rt_rq->rt_runtime_lock);
494 rt_rq->rt_time += delta_exec; 494 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
495 if (sched_rt_runtime_exceeded(rt_rq)) 495 rt_rq->rt_time += delta_exec;
496 resched_task(curr); 496 if (sched_rt_runtime_exceeded(rt_rq))
497 resched_task(curr);
498 }
497 spin_unlock(&rt_rq->rt_runtime_lock); 499 spin_unlock(&rt_rq->rt_runtime_lock);
498 } 500 }
499} 501}
diff --git a/kernel/signal.c b/kernel/signal.c
index c539f60c6f41..e661b01d340f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1338,6 +1338,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1338 struct siginfo info; 1338 struct siginfo info;
1339 unsigned long flags; 1339 unsigned long flags;
1340 struct sighand_struct *psig; 1340 struct sighand_struct *psig;
1341 int ret = sig;
1341 1342
1342 BUG_ON(sig == -1); 1343 BUG_ON(sig == -1);
1343 1344
@@ -1402,7 +1403,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1402 * is implementation-defined: we do (if you don't want 1403 * is implementation-defined: we do (if you don't want
1403 * it, just use SIG_IGN instead). 1404 * it, just use SIG_IGN instead).
1404 */ 1405 */
1405 tsk->exit_signal = -1; 1406 ret = tsk->exit_signal = -1;
1406 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) 1407 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
1407 sig = -1; 1408 sig = -1;
1408 } 1409 }
@@ -1411,7 +1412,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1411 __wake_up_parent(tsk, tsk->parent); 1412 __wake_up_parent(tsk, tsk->parent);
1412 spin_unlock_irqrestore(&psig->siglock, flags); 1413 spin_unlock_irqrestore(&psig->siglock, flags);
1413 1414
1414 return sig; 1415 return ret;
1415} 1416}
1416 1417
1417static void do_notify_parent_cldstop(struct task_struct *tsk, int why) 1418static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
diff --git a/kernel/smp.c b/kernel/smp.c
index 782e2b93e465..f362a8553777 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -210,8 +210,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
210{ 210{
211 struct call_single_data d; 211 struct call_single_data d;
212 unsigned long flags; 212 unsigned long flags;
213 /* prevent preemption and reschedule on another processor */ 213 /* prevent preemption and reschedule on another processor,
214 as well as CPU removal */
214 int me = get_cpu(); 215 int me = get_cpu();
216 int err = 0;
215 217
216 /* Can deadlock when called with interrupts disabled */ 218 /* Can deadlock when called with interrupts disabled */
217 WARN_ON(irqs_disabled()); 219 WARN_ON(irqs_disabled());
@@ -220,7 +222,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
220 local_irq_save(flags); 222 local_irq_save(flags);
221 func(info); 223 func(info);
222 local_irq_restore(flags); 224 local_irq_restore(flags);
223 } else { 225 } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) {
224 struct call_single_data *data = NULL; 226 struct call_single_data *data = NULL;
225 227
226 if (!wait) { 228 if (!wait) {
@@ -236,10 +238,12 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
236 data->func = func; 238 data->func = func;
237 data->info = info; 239 data->info = info;
238 generic_exec_single(cpu, data); 240 generic_exec_single(cpu, data);
241 } else {
242 err = -ENXIO; /* CPU not online */
239 } 243 }
240 244
241 put_cpu(); 245 put_cpu();
242 return 0; 246 return err;
243} 247}
244EXPORT_SYMBOL(smp_call_function_single); 248EXPORT_SYMBOL(smp_call_function_single);
245 249
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index b75b492fbfcf..cb838ee93a82 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -233,7 +233,8 @@ static void check_hung_uninterruptible_tasks(int this_cpu)
233 do_each_thread(g, t) { 233 do_each_thread(g, t) {
234 if (!--max_count) 234 if (!--max_count)
235 goto unlock; 235 goto unlock;
236 if (t->state & TASK_UNINTERRUPTIBLE) 236 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
237 if (t->state == TASK_UNINTERRUPTIBLE)
237 check_hung_task(t, now); 238 check_hung_task(t, now);
238 } while_each_thread(g, t); 239 } while_each_thread(g, t);
239 unlock: 240 unlock:
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 44baeea94ab9..29ab20749dd3 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -290,7 +290,6 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
290 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 290 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
291 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); 291 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
292} 292}
293
294EXPORT_SYMBOL(_spin_lock_nested); 293EXPORT_SYMBOL(_spin_lock_nested);
295 294
296unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) 295unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
@@ -312,7 +311,6 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
312#endif 311#endif
313 return flags; 312 return flags;
314} 313}
315
316EXPORT_SYMBOL(_spin_lock_irqsave_nested); 314EXPORT_SYMBOL(_spin_lock_irqsave_nested);
317 315
318void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, 316void __lockfunc _spin_lock_nest_lock(spinlock_t *lock,
@@ -322,7 +320,6 @@ void __lockfunc _spin_lock_nest_lock(spinlock_t *lock,
322 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); 320 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
323 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); 321 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
324} 322}
325
326EXPORT_SYMBOL(_spin_lock_nest_lock); 323EXPORT_SYMBOL(_spin_lock_nest_lock);
327 324
328#endif 325#endif
diff --git a/kernel/sys.c b/kernel/sys.c
index c01858090a98..038a7bc0901d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -169,9 +169,9 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
169 pgrp = find_vpid(who); 169 pgrp = find_vpid(who);
170 else 170 else
171 pgrp = task_pgrp(current); 171 pgrp = task_pgrp(current);
172 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 172 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
173 error = set_one_prio(p, niceval, error); 173 error = set_one_prio(p, niceval, error);
174 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 174 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
175 break; 175 break;
176 case PRIO_USER: 176 case PRIO_USER:
177 user = current->user; 177 user = current->user;
@@ -229,11 +229,11 @@ asmlinkage long sys_getpriority(int which, int who)
229 pgrp = find_vpid(who); 229 pgrp = find_vpid(who);
230 else 230 else
231 pgrp = task_pgrp(current); 231 pgrp = task_pgrp(current);
232 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 232 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
233 niceval = 20 - task_nice(p); 233 niceval = 20 - task_nice(p);
234 if (niceval > retval) 234 if (niceval > retval)
235 retval = niceval; 235 retval = niceval;
236 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 236 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
237 break; 237 break;
238 case PRIO_USER: 238 case PRIO_USER:
239 user = current->user; 239 user = current->user;
@@ -274,7 +274,7 @@ void emergency_restart(void)
274} 274}
275EXPORT_SYMBOL_GPL(emergency_restart); 275EXPORT_SYMBOL_GPL(emergency_restart);
276 276
277static void kernel_restart_prepare(char *cmd) 277void kernel_restart_prepare(char *cmd)
278{ 278{
279 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 279 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
280 system_state = SYSTEM_RESTART; 280 system_state = SYSTEM_RESTART;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index fe4713347275..50ec0886fa3d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -159,6 +159,7 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *
159static struct ctl_table root_table[]; 159static struct ctl_table root_table[];
160static struct ctl_table_root sysctl_table_root; 160static struct ctl_table_root sysctl_table_root;
161static struct ctl_table_header root_table_header = { 161static struct ctl_table_header root_table_header = {
162 .count = 1,
162 .ctl_table = root_table, 163 .ctl_table = root_table,
163 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), 164 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
164 .root = &sysctl_table_root, 165 .root = &sysctl_table_root,
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 3d1e3e1a1971..f8d968063cea 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,6 +72,16 @@ void clockevents_set_mode(struct clock_event_device *dev,
72} 72}
73 73
74/** 74/**
75 * clockevents_shutdown - shutdown the device and clear next_event
76 * @dev: device to shutdown
77 */
78void clockevents_shutdown(struct clock_event_device *dev)
79{
80 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
81 dev->next_event.tv64 = KTIME_MAX;
82}
83
84/**
75 * clockevents_program_event - Reprogram the clock event device. 85 * clockevents_program_event - Reprogram the clock event device.
76 * @expires: absolute expiry time (monotonic clock) 86 * @expires: absolute expiry time (monotonic clock)
77 * 87 *
@@ -177,7 +187,7 @@ void clockevents_register_device(struct clock_event_device *dev)
177/* 187/*
178 * Noop handler when we shut down an event device 188 * Noop handler when we shut down an event device
179 */ 189 */
180static void clockevents_handle_noop(struct clock_event_device *dev) 190void clockevents_handle_noop(struct clock_event_device *dev)
181{ 191{
182} 192}
183 193
@@ -199,7 +209,6 @@ void clockevents_exchange_device(struct clock_event_device *old,
199 * released list and do a notify add later. 209 * released list and do a notify add later.
200 */ 210 */
201 if (old) { 211 if (old) {
202 old->event_handler = clockevents_handle_noop;
203 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); 212 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
204 list_del(&old->list); 213 list_del(&old->list);
205 list_add(&old->list, &clockevents_released); 214 list_add(&old->list, &clockevents_released);
@@ -207,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
207 216
208 if (new) { 217 if (new) {
209 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); 218 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
210 clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); 219 clockevents_shutdown(new);
211 } 220 }
212 local_irq_restore(flags); 221 local_irq_restore(flags);
213} 222}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5125ddd8196b..1ad46f3df6e7 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -245,7 +245,7 @@ static void sync_cmos_clock(unsigned long dummy)
245 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) 245 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
246 fail = update_persistent_clock(now); 246 fail = update_persistent_clock(now);
247 247
248 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; 248 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
249 if (next.tv_nsec <= 0) 249 if (next.tv_nsec <= 0)
250 next.tv_nsec += NSEC_PER_SEC; 250 next.tv_nsec += NSEC_PER_SEC;
251 251
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 31463d370b94..cb01cd8f919b 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -175,6 +175,8 @@ static void tick_do_periodic_broadcast(void)
175 */ 175 */
176static void tick_handle_periodic_broadcast(struct clock_event_device *dev) 176static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
177{ 177{
178 ktime_t next;
179
178 tick_do_periodic_broadcast(); 180 tick_do_periodic_broadcast();
179 181
180 /* 182 /*
@@ -185,10 +187,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
185 187
186 /* 188 /*
187 * Setup the next period for devices, which do not have 189 * Setup the next period for devices, which do not have
188 * periodic mode: 190 * periodic mode. We read dev->next_event first and add to it
191 * when the event alrady expired. clockevents_program_event()
192 * sets dev->next_event only when the event is really
193 * programmed to the device.
189 */ 194 */
190 for (;;) { 195 for (next = dev->next_event; ;) {
191 ktime_t next = ktime_add(dev->next_event, tick_period); 196 next = ktime_add(next, tick_period);
192 197
193 if (!clockevents_program_event(dev, next, ktime_get())) 198 if (!clockevents_program_event(dev, next, ktime_get()))
194 return; 199 return;
@@ -205,7 +210,7 @@ static void tick_do_broadcast_on_off(void *why)
205 struct clock_event_device *bc, *dev; 210 struct clock_event_device *bc, *dev;
206 struct tick_device *td; 211 struct tick_device *td;
207 unsigned long flags, *reason = why; 212 unsigned long flags, *reason = why;
208 int cpu; 213 int cpu, bc_stopped;
209 214
210 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 spin_lock_irqsave(&tick_broadcast_lock, flags);
211 216
@@ -223,14 +228,16 @@ static void tick_do_broadcast_on_off(void *why)
223 if (!tick_device_is_functional(dev)) 228 if (!tick_device_is_functional(dev))
224 goto out; 229 goto out;
225 230
231 bc_stopped = cpus_empty(tick_broadcast_mask);
232
226 switch (*reason) { 233 switch (*reason) {
227 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 234 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
228 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
229 if (!cpu_isset(cpu, tick_broadcast_mask)) { 236 if (!cpu_isset(cpu, tick_broadcast_mask)) {
230 cpu_set(cpu, tick_broadcast_mask); 237 cpu_set(cpu, tick_broadcast_mask);
231 if (td->mode == TICKDEV_MODE_PERIODIC) 238 if (tick_broadcast_device.mode ==
232 clockevents_set_mode(dev, 239 TICKDEV_MODE_PERIODIC)
233 CLOCK_EVT_MODE_SHUTDOWN); 240 clockevents_shutdown(dev);
234 } 241 }
235 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) 242 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
236 tick_broadcast_force = 1; 243 tick_broadcast_force = 1;
@@ -239,15 +246,17 @@ static void tick_do_broadcast_on_off(void *why)
239 if (!tick_broadcast_force && 246 if (!tick_broadcast_force &&
240 cpu_isset(cpu, tick_broadcast_mask)) { 247 cpu_isset(cpu, tick_broadcast_mask)) {
241 cpu_clear(cpu, tick_broadcast_mask); 248 cpu_clear(cpu, tick_broadcast_mask);
242 if (td->mode == TICKDEV_MODE_PERIODIC) 249 if (tick_broadcast_device.mode ==
250 TICKDEV_MODE_PERIODIC)
243 tick_setup_periodic(dev, 0); 251 tick_setup_periodic(dev, 0);
244 } 252 }
245 break; 253 break;
246 } 254 }
247 255
248 if (cpus_empty(tick_broadcast_mask)) 256 if (cpus_empty(tick_broadcast_mask)) {
249 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 257 if (!bc_stopped)
250 else { 258 clockevents_shutdown(bc);
259 } else if (bc_stopped) {
251 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 260 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
252 tick_broadcast_start_periodic(bc); 261 tick_broadcast_start_periodic(bc);
253 else 262 else
@@ -298,7 +307,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
298 307
299 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
300 if (bc && cpus_empty(tick_broadcast_mask)) 309 if (bc && cpus_empty(tick_broadcast_mask))
301 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 310 clockevents_shutdown(bc);
302 } 311 }
303 312
304 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 313 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -313,7 +322,7 @@ void tick_suspend_broadcast(void)
313 322
314 bc = tick_broadcast_device.evtdev; 323 bc = tick_broadcast_device.evtdev;
315 if (bc) 324 if (bc)
316 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 325 clockevents_shutdown(bc);
317 326
318 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 327 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
319} 328}
@@ -364,16 +373,8 @@ cpumask_t *tick_get_broadcast_oneshot_mask(void)
364static int tick_broadcast_set_event(ktime_t expires, int force) 373static int tick_broadcast_set_event(ktime_t expires, int force)
365{ 374{
366 struct clock_event_device *bc = tick_broadcast_device.evtdev; 375 struct clock_event_device *bc = tick_broadcast_device.evtdev;
367 ktime_t now = ktime_get(); 376
368 int res; 377 return tick_dev_program_event(bc, expires, force);
369
370 for(;;) {
371 res = clockevents_program_event(bc, expires, now);
372 if (!res || !force)
373 return res;
374 now = ktime_get();
375 expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
376 }
377} 378}
378 379
379int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 380int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
@@ -491,14 +492,52 @@ static void tick_broadcast_clear_oneshot(int cpu)
491 cpu_clear(cpu, tick_broadcast_oneshot_mask); 492 cpu_clear(cpu, tick_broadcast_oneshot_mask);
492} 493}
493 494
495static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires)
496{
497 struct tick_device *td;
498 int cpu;
499
500 for_each_cpu_mask_nr(cpu, *mask) {
501 td = &per_cpu(tick_cpu_device, cpu);
502 if (td->evtdev)
503 td->evtdev->next_event = expires;
504 }
505}
506
494/** 507/**
495 * tick_broadcast_setup_oneshot - setup the broadcast device 508 * tick_broadcast_setup_oneshot - setup the broadcast device
496 */ 509 */
497void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 510void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
498{ 511{
499 bc->event_handler = tick_handle_oneshot_broadcast; 512 /* Set it up only once ! */
500 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 513 if (bc->event_handler != tick_handle_oneshot_broadcast) {
501 bc->next_event.tv64 = KTIME_MAX; 514 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
515 int cpu = smp_processor_id();
516 cpumask_t mask;
517
518 bc->event_handler = tick_handle_oneshot_broadcast;
519 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
520
521 /* Take the do_timer update */
522 tick_do_timer_cpu = cpu;
523
524 /*
525 * We must be careful here. There might be other CPUs
526 * waiting for periodic broadcast. We need to set the
527 * oneshot_mask bits for those and program the
528 * broadcast device to fire.
529 */
530 mask = tick_broadcast_mask;
531 cpu_clear(cpu, mask);
532 cpus_or(tick_broadcast_oneshot_mask,
533 tick_broadcast_oneshot_mask, mask);
534
535 if (was_periodic && !cpus_empty(mask)) {
536 tick_broadcast_init_next_event(&mask, tick_next_period);
537 tick_broadcast_set_event(tick_next_period, 1);
538 } else
539 bc->next_event.tv64 = KTIME_MAX;
540 }
502} 541}
503 542
504/* 543/*
@@ -538,4 +577,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
538 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 577 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
539} 578}
540 579
580/*
581 * Check, whether the broadcast device is in one shot mode
582 */
583int tick_broadcast_oneshot_active(void)
584{
585 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
586}
587
541#endif 588#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 80c4336f4188..df12434b43ca 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
33 */ 33 */
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36int tick_do_timer_cpu __read_mostly = -1; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
37DEFINE_SPINLOCK(tick_device_lock); 37DEFINE_SPINLOCK(tick_device_lock);
38 38
39/* 39/*
@@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
109 if (!tick_device_is_functional(dev)) 109 if (!tick_device_is_functional(dev))
110 return; 110 return;
111 111
112 if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { 112 if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
113 !tick_broadcast_oneshot_active()) {
113 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); 114 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
114 } else { 115 } else {
115 unsigned long seq; 116 unsigned long seq;
@@ -148,7 +149,7 @@ static void tick_setup_device(struct tick_device *td,
148 * If no cpu took the do_timer update, assign it to 149 * If no cpu took the do_timer update, assign it to
149 * this cpu: 150 * this cpu:
150 */ 151 */
151 if (tick_do_timer_cpu == -1) { 152 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
152 tick_do_timer_cpu = cpu; 153 tick_do_timer_cpu = cpu;
153 tick_next_period = ktime_get(); 154 tick_next_period = ktime_get();
154 tick_period = ktime_set(0, NSEC_PER_SEC / HZ); 155 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
@@ -161,6 +162,7 @@ static void tick_setup_device(struct tick_device *td,
161 } else { 162 } else {
162 handler = td->evtdev->event_handler; 163 handler = td->evtdev->event_handler;
163 next_event = td->evtdev->next_event; 164 next_event = td->evtdev->next_event;
165 td->evtdev->event_handler = clockevents_handle_noop;
164 } 166 }
165 167
166 td->evtdev = newdev; 168 td->evtdev = newdev;
@@ -248,7 +250,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
248 * not give it back to the clockevents layer ! 250 * not give it back to the clockevents layer !
249 */ 251 */
250 if (tick_is_broadcast_device(curdev)) { 252 if (tick_is_broadcast_device(curdev)) {
251 clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); 253 clockevents_shutdown(curdev);
252 curdev = NULL; 254 curdev = NULL;
253 } 255 }
254 clockevents_exchange_device(curdev, newdev); 256 clockevents_exchange_device(curdev, newdev);
@@ -299,7 +301,8 @@ static void tick_shutdown(unsigned int *cpup)
299 if (*cpup == tick_do_timer_cpu) { 301 if (*cpup == tick_do_timer_cpu) {
300 int cpu = first_cpu(cpu_online_map); 302 int cpu = first_cpu(cpu_online_map);
301 303
302 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; 304 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
305 TICK_DO_TIMER_NONE;
303 } 306 }
304 spin_unlock_irqrestore(&tick_device_lock, flags); 307 spin_unlock_irqrestore(&tick_device_lock, flags);
305} 308}
@@ -310,7 +313,7 @@ static void tick_suspend(void)
310 unsigned long flags; 313 unsigned long flags;
311 314
312 spin_lock_irqsave(&tick_device_lock, flags); 315 spin_lock_irqsave(&tick_device_lock, flags);
313 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN); 316 clockevents_shutdown(td->evtdev);
314 spin_unlock_irqrestore(&tick_device_lock, flags); 317 spin_unlock_irqrestore(&tick_device_lock, flags);
315} 318}
316 319
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f13f2b7f4fd4..469248782c23 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
1/* 1/*
2 * tick internal variable and functions used by low/high res code 2 * tick internal variable and functions used by low/high res code
3 */ 3 */
4
5#define TICK_DO_TIMER_NONE -1
6#define TICK_DO_TIMER_BOOT -2
7
4DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 8DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
5extern spinlock_t tick_device_lock; 9extern spinlock_t tick_device_lock;
6extern ktime_t tick_next_period; 10extern ktime_t tick_next_period;
@@ -10,6 +14,8 @@ extern int tick_do_timer_cpu __read_mostly;
10extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); 14extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
11extern void tick_handle_periodic(struct clock_event_device *dev); 15extern void tick_handle_periodic(struct clock_event_device *dev);
12 16
17extern void clockevents_shutdown(struct clock_event_device *dev);
18
13/* 19/*
14 * NO_HZ / high resolution timer shared code 20 * NO_HZ / high resolution timer shared code
15 */ 21 */
@@ -17,6 +23,8 @@ extern void tick_handle_periodic(struct clock_event_device *dev);
17extern void tick_setup_oneshot(struct clock_event_device *newdev, 23extern void tick_setup_oneshot(struct clock_event_device *newdev,
18 void (*handler)(struct clock_event_device *), 24 void (*handler)(struct clock_event_device *),
19 ktime_t nextevt); 25 ktime_t nextevt);
26extern int tick_dev_program_event(struct clock_event_device *dev,
27 ktime_t expires, int force);
20extern int tick_program_event(ktime_t expires, int force); 28extern int tick_program_event(ktime_t expires, int force);
21extern void tick_oneshot_notify(void); 29extern void tick_oneshot_notify(void);
22extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); 30extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
@@ -27,6 +35,7 @@ extern void tick_broadcast_oneshot_control(unsigned long reason);
27extern void tick_broadcast_switch_to_oneshot(void); 35extern void tick_broadcast_switch_to_oneshot(void);
28extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 36extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
29extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 37extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
38extern int tick_broadcast_oneshot_active(void);
30# else /* BROADCAST */ 39# else /* BROADCAST */
31static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
32{ 41{
@@ -35,6 +44,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
35static inline void tick_broadcast_oneshot_control(unsigned long reason) { } 44static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
36static inline void tick_broadcast_switch_to_oneshot(void) { } 45static inline void tick_broadcast_switch_to_oneshot(void) { }
37static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 46static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
47static inline int tick_broadcast_oneshot_active(void) { return 0; }
38# endif /* !BROADCAST */ 48# endif /* !BROADCAST */
39 49
40#else /* !ONESHOT */ 50#else /* !ONESHOT */
@@ -64,6 +74,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
64{ 74{
65 return 0; 75 return 0;
66} 76}
77static inline int tick_broadcast_oneshot_active(void) { return 0; }
67#endif /* !TICK_ONESHOT */ 78#endif /* !TICK_ONESHOT */
68 79
69/* 80/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 450c04935b66..2e8de678e767 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -23,24 +23,56 @@
23#include "tick-internal.h" 23#include "tick-internal.h"
24 24
25/** 25/**
26 * tick_program_event 26 * tick_program_event internal worker function
27 */ 27 */
28int tick_program_event(ktime_t expires, int force) 28int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
29 int force)
29{ 30{
30 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
31 ktime_t now = ktime_get(); 31 ktime_t now = ktime_get();
32 int i;
32 33
33 while (1) { 34 for (i = 0;;) {
34 int ret = clockevents_program_event(dev, expires, now); 35 int ret = clockevents_program_event(dev, expires, now);
35 36
36 if (!ret || !force) 37 if (!ret || !force)
37 return ret; 38 return ret;
39
40 /*
41 * We tried 2 times to program the device with the given
42 * min_delta_ns. If that's not working then we double it
43 * and emit a warning.
44 */
45 if (++i > 2) {
46 /* Increase the min. delta and try again */
47 if (!dev->min_delta_ns)
48 dev->min_delta_ns = 5000;
49 else
50 dev->min_delta_ns += dev->min_delta_ns >> 1;
51
52 printk(KERN_WARNING
53 "CE: %s increasing min_delta_ns to %lu nsec\n",
54 dev->name ? dev->name : "?",
55 dev->min_delta_ns << 1);
56
57 i = 0;
58 }
59
38 now = ktime_get(); 60 now = ktime_get();
39 expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); 61 expires = ktime_add_ns(now, dev->min_delta_ns);
40 } 62 }
41} 63}
42 64
43/** 65/**
66 * tick_program_event
67 */
68int tick_program_event(ktime_t expires, int force)
69{
70 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
71
72 return tick_dev_program_event(dev, expires, force);
73}
74
75/**
44 * tick_resume_onshot - resume oneshot mode 76 * tick_resume_onshot - resume oneshot mode
45 */ 77 */
46void tick_resume_oneshot(void) 78void tick_resume_oneshot(void)
@@ -61,7 +93,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
61{ 93{
62 newdev->event_handler = handler; 94 newdev->event_handler = handler;
63 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); 95 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
64 clockevents_program_event(newdev, next_event, ktime_get()); 96 tick_dev_program_event(newdev, next_event, 1);
65} 97}
66 98
67/** 99/**
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f5da526424a9..cb02324bdb88 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -75,6 +75,9 @@ static void tick_do_update_jiffies64(ktime_t now)
75 incr * ticks); 75 incr * ticks);
76 } 76 }
77 do_timer(++ticks); 77 do_timer(++ticks);
78
79 /* Keep the tick_next_period variable up to date */
80 tick_next_period = ktime_add(last_jiffies_update, tick_period);
78 } 81 }
79 write_sequnlock(&xtime_lock); 82 write_sequnlock(&xtime_lock);
80} 83}
@@ -162,6 +165,8 @@ void tick_nohz_stop_idle(int cpu)
162 ts->idle_lastupdate = now; 165 ts->idle_lastupdate = now;
163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); 166 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
164 ts->idle_active = 0; 167 ts->idle_active = 0;
168
169 sched_clock_idle_wakeup_event(0);
165 } 170 }
166} 171}
167 172
@@ -177,6 +182,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
177 } 182 }
178 ts->idle_entrytime = now; 183 ts->idle_entrytime = now;
179 ts->idle_active = 1; 184 ts->idle_active = 1;
185 sched_clock_idle_sleep_event();
180 return now; 186 return now;
181} 187}
182 188
@@ -218,7 +224,7 @@ void tick_nohz_stop_sched_tick(int inidle)
218 */ 224 */
219 if (unlikely(!cpu_online(cpu))) { 225 if (unlikely(!cpu_online(cpu))) {
220 if (cpu == tick_do_timer_cpu) 226 if (cpu == tick_do_timer_cpu)
221 tick_do_timer_cpu = -1; 227 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
222 } 228 }
223 229
224 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 230 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
@@ -300,7 +306,7 @@ void tick_nohz_stop_sched_tick(int inidle)
300 * invoked. 306 * invoked.
301 */ 307 */
302 if (cpu == tick_do_timer_cpu) 308 if (cpu == tick_do_timer_cpu)
303 tick_do_timer_cpu = -1; 309 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
304 310
305 ts->idle_sleeps++; 311 ts->idle_sleeps++;
306 312
@@ -465,7 +471,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
465 * this duty, then the jiffies update is still serialized by 471 * this duty, then the jiffies update is still serialized by
466 * xtime_lock. 472 * xtime_lock.
467 */ 473 */
468 if (unlikely(tick_do_timer_cpu == -1)) 474 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
469 tick_do_timer_cpu = cpu; 475 tick_do_timer_cpu = cpu;
470 476
471 /* Check, if the jiffies need an update */ 477 /* Check, if the jiffies need an update */
@@ -567,7 +573,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
567 * this duty, then the jiffies update is still serialized by 573 * this duty, then the jiffies update is still serialized by
568 * xtime_lock. 574 * xtime_lock.
569 */ 575 */
570 if (unlikely(tick_do_timer_cpu == -1)) 576 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
571 tick_do_timer_cpu = cpu; 577 tick_do_timer_cpu = cpu;
572#endif 578#endif
573 579
@@ -619,7 +625,7 @@ void tick_setup_sched_timer(void)
619 */ 625 */
620 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 626 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
621 ts->sched_timer.function = tick_sched_timer; 627 ts->sched_timer.function = tick_sched_timer;
622 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 628 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
623 629
624 /* Get the next period (per cpu) */ 630 /* Get the next period (per cpu) */
625 ts->sched_timer.expires = tick_init_jiffy_update(); 631 ts->sched_timer.expires = tick_init_jiffy_update();
@@ -643,17 +649,21 @@ void tick_setup_sched_timer(void)
643 ts->nohz_mode = NOHZ_MODE_HIGHRES; 649 ts->nohz_mode = NOHZ_MODE_HIGHRES;
644#endif 650#endif
645} 651}
652#endif /* HIGH_RES_TIMERS */
646 653
654#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS
647void tick_cancel_sched_timer(int cpu) 655void tick_cancel_sched_timer(int cpu)
648{ 656{
649 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 657 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
650 658
659# ifdef CONFIG_HIGH_RES_TIMERS
651 if (ts->sched_timer.base) 660 if (ts->sched_timer.base)
652 hrtimer_cancel(&ts->sched_timer); 661 hrtimer_cancel(&ts->sched_timer);
662# endif
653 663
654 ts->nohz_mode = NOHZ_MODE_INACTIVE; 664 ts->nohz_mode = NOHZ_MODE_INACTIVE;
655} 665}
656#endif /* HIGH_RES_TIMERS */ 666#endif
657 667
658/** 668/**
659 * Async notification about clocksource changes 669 * Async notification about clocksource changes
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index bb948e52ce20..db58fb66a135 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -202,7 +202,7 @@ static void start_stack_timer(int cpu)
202 202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
206 206
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
208} 208}
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index a9ab0596de44..532858fa5b88 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -6,7 +6,6 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/version.h>
10#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
11#include <linux/slab.h> 10#include <linux/slab.h>
12#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 64d398f12444..815237a55af8 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -12,7 +12,6 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/version.h>
16#include <linux/err.h> 15#include <linux/err.h>
17#include <linux/slab.h> 16#include <linux/slab.h>
18 17
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index fe3a56c2256d..4ab9659d269e 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -12,7 +12,6 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/version.h>
16#include <linux/sysctl.h> 15#include <linux/sysctl.h>
17 16
18static void *get_uts(ctl_table *table, int write) 17static void *get_uts(ctl_table *table, int write)