aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/auditsc.c3
-rw-r--r--kernel/cpuset.c312
-rw-r--r--kernel/exit.c88
-rw-r--r--kernel/lockdep.c6
-rw-r--r--kernel/lockdep_proc.c3
-rw-r--r--kernel/module.c2
-rw-r--r--kernel/nsproxy.c1
-rw-r--r--kernel/pid_namespace.c3
-rw-r--r--kernel/pm_qos_params.c25
-rw-r--r--kernel/power/disk.c13
-rw-r--r--kernel/power/main.c5
-rw-r--r--kernel/power/swap.c1
-rw-r--r--kernel/rcupdate.c1
-rw-r--r--kernel/resource.c88
-rw-r--r--kernel/sched.c78
-rw-r--r--kernel/sched_clock.c84
-rw-r--r--kernel/sched_features.h2
-rw-r--r--kernel/sched_rt.c13
-rw-r--r--kernel/smp.c10
-rw-r--r--kernel/softlockup.c3
-rw-r--r--kernel/sysctl.c1
-rw-r--r--kernel/time/clockevents.c3
-rw-r--r--kernel/time/ntp.c2
-rw-r--r--kernel/time/tick-broadcast.c78
-rw-r--r--kernel/time/tick-common.c1
-rw-r--r--kernel/time/tick-internal.h2
-rw-r--r--kernel/time/tick-oneshot.c46
-rw-r--r--kernel/time/tick-sched.c9
-rw-r--r--kernel/user_namespace.c1
-rw-r--r--kernel/utsname.c1
-rw-r--r--kernel/utsname_sysctl.c1
31 files changed, 551 insertions, 335 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 972f8e61d36a..59cedfb040e7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -243,10 +243,11 @@ static inline int open_arg(int flags, int mask)
243 243
244static int audit_match_perm(struct audit_context *ctx, int mask) 244static int audit_match_perm(struct audit_context *ctx, int mask)
245{ 245{
246 unsigned n;
246 if (unlikely(!ctx)) 247 if (unlikely(!ctx))
247 return 0; 248 return 0;
248 249
249 unsigned n = ctx->major; 250 n = ctx->major;
250 switch (audit_classify_syscall(ctx->arch, n)) { 251 switch (audit_classify_syscall(ctx->arch, n)) {
251 case 0: /* native */ 252 case 0: /* native */
252 if ((mask & AUDIT_PERM_WRITE) && 253 if ((mask & AUDIT_PERM_WRITE) &&
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5ab79cf516d..f227bc172690 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@
14 * 2003-10-22 Updates by Stephen Hemminger. 14 * 2003-10-22 Updates by Stephen Hemminger.
15 * 2004 May-July Rework by Paul Jackson. 15 * 2004 May-July Rework by Paul Jackson.
16 * 2006 Rework by Paul Menage to use generic cgroups 16 * 2006 Rework by Paul Menage to use generic cgroups
17 * 2008 Rework of the scheduler domains and CPU hotplug handling
18 * by Max Krasnyansky
17 * 19 *
18 * This file is subject to the terms and conditions of the GNU General Public 20 * This file is subject to the terms and conditions of the GNU General Public
19 * License. See the file COPYING in the main directory of the Linux 21 * License. See the file COPYING in the main directory of the Linux
@@ -236,9 +238,11 @@ static struct cpuset top_cpuset = {
236 238
237static DEFINE_MUTEX(callback_mutex); 239static DEFINE_MUTEX(callback_mutex);
238 240
239/* This is ugly, but preserves the userspace API for existing cpuset 241/*
242 * This is ugly, but preserves the userspace API for existing cpuset
240 * users. If someone tries to mount the "cpuset" filesystem, we 243 * users. If someone tries to mount the "cpuset" filesystem, we
241 * silently switch it to mount "cgroup" instead */ 244 * silently switch it to mount "cgroup" instead
245 */
242static int cpuset_get_sb(struct file_system_type *fs_type, 246static int cpuset_get_sb(struct file_system_type *fs_type,
243 int flags, const char *unused_dev_name, 247 int flags, const char *unused_dev_name,
244 void *data, struct vfsmount *mnt) 248 void *data, struct vfsmount *mnt)
@@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
473} 477}
474 478
475/* 479/*
476 * Helper routine for rebuild_sched_domains(). 480 * Helper routine for generate_sched_domains().
477 * Do cpusets a, b have overlapping cpus_allowed masks? 481 * Do cpusets a, b have overlapping cpus_allowed masks?
478 */ 482 */
479
480static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 483static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
481{ 484{
482 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 485 return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
518} 521}
519 522
520/* 523/*
521 * rebuild_sched_domains() 524 * generate_sched_domains()
522 * 525 *
523 * This routine will be called to rebuild the scheduler's dynamic 526 * This function builds a partial partition of the systems CPUs
524 * sched domains: 527 * A 'partial partition' is a set of non-overlapping subsets whose
525 * - if the flag 'sched_load_balance' of any cpuset with non-empty 528 * union is a subset of that set.
526 * 'cpus' changes, 529 * The output of this function needs to be passed to kernel/sched.c
527 * - or if the 'cpus' allowed changes in any cpuset which has that 530 * partition_sched_domains() routine, which will rebuild the scheduler's
528 * flag enabled, 531 * load balancing domains (sched domains) as specified by that partial
529 * - or if the 'sched_relax_domain_level' of any cpuset which has 532 * partition.
530 * that flag enabled and with non-empty 'cpus' changes,
531 * - or if any cpuset with non-empty 'cpus' is removed,
532 * - or if a cpu gets offlined.
533 *
534 * This routine builds a partial partition of the systems CPUs
535 * (the set of non-overlappping cpumask_t's in the array 'part'
536 * below), and passes that partial partition to the kernel/sched.c
537 * partition_sched_domains() routine, which will rebuild the
538 * schedulers load balancing domains (sched domains) as specified
539 * by that partial partition. A 'partial partition' is a set of
540 * non-overlapping subsets whose union is a subset of that set.
541 * 533 *
542 * See "What is sched_load_balance" in Documentation/cpusets.txt 534 * See "What is sched_load_balance" in Documentation/cpusets.txt
543 * for a background explanation of this. 535 * for a background explanation of this.
@@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
547 * domains when operating in the severe memory shortage situations 539 * domains when operating in the severe memory shortage situations
548 * that could cause allocation failures below. 540 * that could cause allocation failures below.
549 * 541 *
550 * Call with cgroup_mutex held. May take callback_mutex during 542 * Must be called with cgroup_lock held.
551 * call due to the kfifo_alloc() and kmalloc() calls. May nest
552 * a call to the get_online_cpus()/put_online_cpus() pair.
553 * Must not be called holding callback_mutex, because we must not
554 * call get_online_cpus() while holding callback_mutex. Elsewhere
555 * the kernel nests callback_mutex inside get_online_cpus() calls.
556 * So the reverse nesting would risk an ABBA deadlock.
557 * 543 *
558 * The three key local variables below are: 544 * The three key local variables below are:
559 * q - a linked-list queue of cpuset pointers, used to implement a 545 * q - a linked-list queue of cpuset pointers, used to implement a
@@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
588 * element of the partition (one sched domain) to be passed to 574 * element of the partition (one sched domain) to be passed to
589 * partition_sched_domains(). 575 * partition_sched_domains().
590 */ 576 */
591 577static int generate_sched_domains(cpumask_t **domains,
592void rebuild_sched_domains(void) 578 struct sched_domain_attr **attributes)
593{ 579{
594 LIST_HEAD(q); /* queue of cpusets to be scanned*/ 580 LIST_HEAD(q); /* queue of cpusets to be scanned */
595 struct cpuset *cp; /* scans q */ 581 struct cpuset *cp; /* scans q */
596 struct cpuset **csa; /* array of all cpuset ptrs */ 582 struct cpuset **csa; /* array of all cpuset ptrs */
597 int csn; /* how many cpuset ptrs in csa so far */ 583 int csn; /* how many cpuset ptrs in csa so far */
@@ -601,23 +587,26 @@ void rebuild_sched_domains(void)
601 int ndoms; /* number of sched domains in result */ 587 int ndoms; /* number of sched domains in result */
602 int nslot; /* next empty doms[] cpumask_t slot */ 588 int nslot; /* next empty doms[] cpumask_t slot */
603 589
604 csa = NULL; 590 ndoms = 0;
605 doms = NULL; 591 doms = NULL;
606 dattr = NULL; 592 dattr = NULL;
593 csa = NULL;
607 594
608 /* Special case for the 99% of systems with one, full, sched domain */ 595 /* Special case for the 99% of systems with one, full, sched domain */
609 if (is_sched_load_balance(&top_cpuset)) { 596 if (is_sched_load_balance(&top_cpuset)) {
610 ndoms = 1;
611 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 597 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
612 if (!doms) 598 if (!doms)
613 goto rebuild; 599 goto done;
600
614 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); 601 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
615 if (dattr) { 602 if (dattr) {
616 *dattr = SD_ATTR_INIT; 603 *dattr = SD_ATTR_INIT;
617 update_domain_attr_tree(dattr, &top_cpuset); 604 update_domain_attr_tree(dattr, &top_cpuset);
618 } 605 }
619 *doms = top_cpuset.cpus_allowed; 606 *doms = top_cpuset.cpus_allowed;
620 goto rebuild; 607
608 ndoms = 1;
609 goto done;
621 } 610 }
622 611
623 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); 612 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
@@ -680,61 +669,141 @@ restart:
680 } 669 }
681 } 670 }
682 671
683 /* Convert <csn, csa> to <ndoms, doms> */ 672 /*
673 * Now we know how many domains to create.
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */
684 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
685 if (!doms) 677 if (!doms) {
686 goto rebuild; 678 ndoms = 0;
679 goto done;
680 }
681
682 /*
683 * The rest of the code, including the scheduler, can deal with
684 * dattr==NULL case. No need to abort if alloc fails.
685 */
687 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); 686 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
688 687
689 for (nslot = 0, i = 0; i < csn; i++) { 688 for (nslot = 0, i = 0; i < csn; i++) {
690 struct cpuset *a = csa[i]; 689 struct cpuset *a = csa[i];
690 cpumask_t *dp;
691 int apn = a->pn; 691 int apn = a->pn;
692 692
693 if (apn >= 0) { 693 if (apn < 0) {
694 cpumask_t *dp = doms + nslot; 694 /* Skip completed partitions */
695 695 continue;
696 if (nslot == ndoms) { 696 }
697 static int warnings = 10; 697
698 if (warnings) { 698 dp = doms + nslot;
699 printk(KERN_WARNING 699
700 "rebuild_sched_domains confused:" 700 if (nslot == ndoms) {
701 " nslot %d, ndoms %d, csn %d, i %d," 701 static int warnings = 10;
702 " apn %d\n", 702 if (warnings) {
703 nslot, ndoms, csn, i, apn); 703 printk(KERN_WARNING
704 warnings--; 704 "rebuild_sched_domains confused:"
705 } 705 " nslot %d, ndoms %d, csn %d, i %d,"
706 continue; 706 " apn %d\n",
707 nslot, ndoms, csn, i, apn);
708 warnings--;
707 } 709 }
710 continue;
711 }
708 712
709 cpus_clear(*dp); 713 cpus_clear(*dp);
710 if (dattr) 714 if (dattr)
711 *(dattr + nslot) = SD_ATTR_INIT; 715 *(dattr + nslot) = SD_ATTR_INIT;
712 for (j = i; j < csn; j++) { 716 for (j = i; j < csn; j++) {
713 struct cpuset *b = csa[j]; 717 struct cpuset *b = csa[j];
714 718
715 if (apn == b->pn) { 719 if (apn == b->pn) {
716 cpus_or(*dp, *dp, b->cpus_allowed); 720 cpus_or(*dp, *dp, b->cpus_allowed);
717 b->pn = -1; 721 if (dattr)
718 if (dattr) 722 update_domain_attr_tree(dattr + nslot, b);
719 update_domain_attr_tree(dattr 723
720 + nslot, b); 724 /* Done with this partition */
721 } 725 b->pn = -1;
722 } 726 }
723 nslot++;
724 } 727 }
728 nslot++;
725 } 729 }
726 BUG_ON(nslot != ndoms); 730 BUG_ON(nslot != ndoms);
727 731
728rebuild: 732done:
729 /* Have scheduler rebuild sched domains */ 733 kfree(csa);
734
735 *domains = doms;
736 *attributes = dattr;
737 return ndoms;
738}
739
740/*
741 * Rebuild scheduler domains.
742 *
743 * Call with neither cgroup_mutex held nor within get_online_cpus().
744 * Takes both cgroup_mutex and get_online_cpus().
745 *
746 * Cannot be directly called from cpuset code handling changes
747 * to the cpuset pseudo-filesystem, because it cannot be called
748 * from code that already holds cgroup_mutex.
749 */
750static void do_rebuild_sched_domains(struct work_struct *unused)
751{
752 struct sched_domain_attr *attr;
753 cpumask_t *doms;
754 int ndoms;
755
730 get_online_cpus(); 756 get_online_cpus();
731 partition_sched_domains(ndoms, doms, dattr); 757
758 /* Generate domain masks and attrs */
759 cgroup_lock();
760 ndoms = generate_sched_domains(&doms, &attr);
761 cgroup_unlock();
762
763 /* Have scheduler rebuild the domains */
764 partition_sched_domains(ndoms, doms, attr);
765
732 put_online_cpus(); 766 put_online_cpus();
767}
733 768
734done: 769static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
735 kfree(csa); 770
736 /* Don't kfree(doms) -- partition_sched_domains() does that. */ 771/*
737 /* Don't kfree(dattr) -- partition_sched_domains() does that. */ 772 * Rebuild scheduler domains, asynchronously via workqueue.
773 *
774 * If the flag 'sched_load_balance' of any cpuset with non-empty
775 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
776 * which has that flag enabled, or if any cpuset with a non-empty
777 * 'cpus' is removed, then call this routine to rebuild the
778 * scheduler's dynamic sched domains.
779 *
780 * The rebuild_sched_domains() and partition_sched_domains()
781 * routines must nest cgroup_lock() inside get_online_cpus(),
782 * but such cpuset changes as these must nest that locking the
783 * other way, holding cgroup_lock() for much of the code.
784 *
785 * So in order to avoid an ABBA deadlock, the cpuset code handling
786 * these user changes delegates the actual sched domain rebuilding
787 * to a separate workqueue thread, which ends up processing the
788 * above do_rebuild_sched_domains() function.
789 */
790static void async_rebuild_sched_domains(void)
791{
792 schedule_work(&rebuild_sched_domains_work);
793}
794
795/*
796 * Accomplishes the same scheduler domain rebuild as the above
797 * async_rebuild_sched_domains(), however it directly calls the
798 * rebuild routine synchronously rather than calling it via an
799 * asynchronous work thread.
800 *
801 * This can only be called from code that is not holding
802 * cgroup_mutex (not nested in a cgroup_lock() call.)
803 */
804void rebuild_sched_domains(void)
805{
806 do_rebuild_sched_domains(NULL);
738} 807}
739 808
740/** 809/**
@@ -863,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
863 return retval; 932 return retval;
864 933
865 if (is_load_balanced) 934 if (is_load_balanced)
866 rebuild_sched_domains(); 935 async_rebuild_sched_domains();
867 return 0; 936 return 0;
868} 937}
869 938
@@ -1090,7 +1159,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1090 if (val != cs->relax_domain_level) { 1159 if (val != cs->relax_domain_level) {
1091 cs->relax_domain_level = val; 1160 cs->relax_domain_level = val;
1092 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) 1161 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
1093 rebuild_sched_domains(); 1162 async_rebuild_sched_domains();
1094 } 1163 }
1095 1164
1096 return 0; 1165 return 0;
@@ -1131,7 +1200,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1131 mutex_unlock(&callback_mutex); 1200 mutex_unlock(&callback_mutex);
1132 1201
1133 if (cpus_nonempty && balance_flag_changed) 1202 if (cpus_nonempty && balance_flag_changed)
1134 rebuild_sched_domains(); 1203 async_rebuild_sched_domains();
1135 1204
1136 return 0; 1205 return 0;
1137} 1206}
@@ -1492,6 +1561,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1492 default: 1561 default:
1493 BUG(); 1562 BUG();
1494 } 1563 }
1564
1565 /* Unreachable but makes gcc happy */
1566 return 0;
1495} 1567}
1496 1568
1497static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) 1569static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@@ -1504,6 +1576,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1504 default: 1576 default:
1505 BUG(); 1577 BUG();
1506 } 1578 }
1579
1580 /* Unrechable but makes gcc happy */
1581 return 0;
1507} 1582}
1508 1583
1509 1584
@@ -1692,15 +1767,9 @@ static struct cgroup_subsys_state *cpuset_create(
1692} 1767}
1693 1768
1694/* 1769/*
1695 * Locking note on the strange update_flag() call below:
1696 *
1697 * If the cpuset being removed has its flag 'sched_load_balance' 1770 * If the cpuset being removed has its flag 'sched_load_balance'
1698 * enabled, then simulate turning sched_load_balance off, which 1771 * enabled, then simulate turning sched_load_balance off, which
1699 * will call rebuild_sched_domains(). The get_online_cpus() 1772 * will call async_rebuild_sched_domains().
1700 * call in rebuild_sched_domains() must not be made while holding
1701 * callback_mutex. Elsewhere the kernel nests callback_mutex inside
1702 * get_online_cpus() calls. So the reverse nesting would risk an
1703 * ABBA deadlock.
1704 */ 1773 */
1705 1774
1706static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 1775static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1719,7 +1788,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1719struct cgroup_subsys cpuset_subsys = { 1788struct cgroup_subsys cpuset_subsys = {
1720 .name = "cpuset", 1789 .name = "cpuset",
1721 .create = cpuset_create, 1790 .create = cpuset_create,
1722 .destroy = cpuset_destroy, 1791 .destroy = cpuset_destroy,
1723 .can_attach = cpuset_can_attach, 1792 .can_attach = cpuset_can_attach,
1724 .attach = cpuset_attach, 1793 .attach = cpuset_attach,
1725 .populate = cpuset_populate, 1794 .populate = cpuset_populate,
@@ -1811,7 +1880,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1811} 1880}
1812 1881
1813/* 1882/*
1814 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 1883 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
1815 * or memory nodes, we need to walk over the cpuset hierarchy, 1884 * or memory nodes, we need to walk over the cpuset hierarchy,
1816 * removing that CPU or node from all cpusets. If this removes the 1885 * removing that CPU or node from all cpusets. If this removes the
1817 * last CPU or node from a cpuset, then move the tasks in the empty 1886 * last CPU or node from a cpuset, then move the tasks in the empty
@@ -1903,35 +1972,6 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
1903} 1972}
1904 1973
1905/* 1974/*
1906 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1907 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
1908 * track what's online after any CPU or memory node hotplug or unplug event.
1909 *
1910 * Since there are two callers of this routine, one for CPU hotplug
1911 * events and one for memory node hotplug events, we could have coded
1912 * two separate routines here. We code it as a single common routine
1913 * in order to minimize text size.
1914 */
1915
1916static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
1917{
1918 cgroup_lock();
1919
1920 top_cpuset.cpus_allowed = cpu_online_map;
1921 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1922 scan_for_empty_cpusets(&top_cpuset);
1923
1924 /*
1925 * Scheduler destroys domains on hotplug events.
1926 * Rebuild them based on the current settings.
1927 */
1928 if (rebuild_sd)
1929 rebuild_sched_domains();
1930
1931 cgroup_unlock();
1932}
1933
1934/*
1935 * The top_cpuset tracks what CPUs and Memory Nodes are online, 1975 * The top_cpuset tracks what CPUs and Memory Nodes are online,
1936 * period. This is necessary in order to make cpusets transparent 1976 * period. This is necessary in order to make cpusets transparent
1937 * (of no affect) on systems that are actively using CPU hotplug 1977 * (of no affect) on systems that are actively using CPU hotplug
@@ -1939,40 +1979,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
1939 * 1979 *
1940 * This routine ensures that top_cpuset.cpus_allowed tracks 1980 * This routine ensures that top_cpuset.cpus_allowed tracks
1941 * cpu_online_map on each CPU hotplug (cpuhp) event. 1981 * cpu_online_map on each CPU hotplug (cpuhp) event.
1982 *
1983 * Called within get_online_cpus(). Needs to call cgroup_lock()
1984 * before calling generate_sched_domains().
1942 */ 1985 */
1943 1986static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1944static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
1945 unsigned long phase, void *unused_cpu) 1987 unsigned long phase, void *unused_cpu)
1946{ 1988{
1989 struct sched_domain_attr *attr;
1990 cpumask_t *doms;
1991 int ndoms;
1992
1947 switch (phase) { 1993 switch (phase) {
1948 case CPU_UP_CANCELED:
1949 case CPU_UP_CANCELED_FROZEN:
1950 case CPU_DOWN_FAILED:
1951 case CPU_DOWN_FAILED_FROZEN:
1952 case CPU_ONLINE: 1994 case CPU_ONLINE:
1953 case CPU_ONLINE_FROZEN: 1995 case CPU_ONLINE_FROZEN:
1954 case CPU_DEAD: 1996 case CPU_DEAD:
1955 case CPU_DEAD_FROZEN: 1997 case CPU_DEAD_FROZEN:
1956 common_cpu_mem_hotplug_unplug(1);
1957 break; 1998 break;
1999
1958 default: 2000 default:
1959 return NOTIFY_DONE; 2001 return NOTIFY_DONE;
1960 } 2002 }
1961 2003
2004 cgroup_lock();
2005 top_cpuset.cpus_allowed = cpu_online_map;
2006 scan_for_empty_cpusets(&top_cpuset);
2007 ndoms = generate_sched_domains(&doms, &attr);
2008 cgroup_unlock();
2009
2010 /* Have scheduler rebuild the domains */
2011 partition_sched_domains(ndoms, doms, attr);
2012
1962 return NOTIFY_OK; 2013 return NOTIFY_OK;
1963} 2014}
1964 2015
1965#ifdef CONFIG_MEMORY_HOTPLUG 2016#ifdef CONFIG_MEMORY_HOTPLUG
1966/* 2017/*
1967 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2018 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
1968 * Call this routine anytime after you change 2019 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
1969 * node_states[N_HIGH_MEMORY]. 2020 * See also the previous routine cpuset_track_online_cpus().
1970 * See also the previous routine cpuset_handle_cpuhp().
1971 */ 2021 */
1972
1973void cpuset_track_online_nodes(void) 2022void cpuset_track_online_nodes(void)
1974{ 2023{
1975 common_cpu_mem_hotplug_unplug(0); 2024 cgroup_lock();
2025 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2026 scan_for_empty_cpusets(&top_cpuset);
2027 cgroup_unlock();
1976} 2028}
1977#endif 2029#endif
1978 2030
@@ -1987,7 +2039,7 @@ void __init cpuset_init_smp(void)
1987 top_cpuset.cpus_allowed = cpu_online_map; 2039 top_cpuset.cpus_allowed = cpu_online_map;
1988 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2040 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1989 2041
1990 hotcpu_notifier(cpuset_handle_cpuhp, 0); 2042 hotcpu_notifier(cpuset_track_online_cpus, 0);
1991} 2043}
1992 2044
1993/** 2045/**
diff --git a/kernel/exit.c b/kernel/exit.c
index 38ec40630149..16395644a98f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -112,9 +112,9 @@ static void __exit_signal(struct task_struct *tsk)
112 * We won't ever get here for the group leader, since it 112 * We won't ever get here for the group leader, since it
113 * will have been the last reference on the signal_struct. 113 * will have been the last reference on the signal_struct.
114 */ 114 */
115 sig->utime = cputime_add(sig->utime, tsk->utime); 115 sig->utime = cputime_add(sig->utime, task_utime(tsk));
116 sig->stime = cputime_add(sig->stime, tsk->stime); 116 sig->stime = cputime_add(sig->stime, task_stime(tsk));
117 sig->gtime = cputime_add(sig->gtime, tsk->gtime); 117 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
118 sig->min_flt += tsk->min_flt; 118 sig->min_flt += tsk->min_flt;
119 sig->maj_flt += tsk->maj_flt; 119 sig->maj_flt += tsk->maj_flt;
120 sig->nvcsw += tsk->nvcsw; 120 sig->nvcsw += tsk->nvcsw;
@@ -831,26 +831,50 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
831 * the child reaper process (ie "init") in our pid 831 * the child reaper process (ie "init") in our pid
832 * space. 832 * space.
833 */ 833 */
834static struct task_struct *find_new_reaper(struct task_struct *father)
835{
836 struct pid_namespace *pid_ns = task_active_pid_ns(father);
837 struct task_struct *thread;
838
839 thread = father;
840 while_each_thread(father, thread) {
841 if (thread->flags & PF_EXITING)
842 continue;
843 if (unlikely(pid_ns->child_reaper == father))
844 pid_ns->child_reaper = thread;
845 return thread;
846 }
847
848 if (unlikely(pid_ns->child_reaper == father)) {
849 write_unlock_irq(&tasklist_lock);
850 if (unlikely(pid_ns == &init_pid_ns))
851 panic("Attempted to kill init!");
852
853 zap_pid_ns_processes(pid_ns);
854 write_lock_irq(&tasklist_lock);
855 /*
856 * We can not clear ->child_reaper or leave it alone.
857 * There may by stealth EXIT_DEAD tasks on ->children,
858 * forget_original_parent() must move them somewhere.
859 */
860 pid_ns->child_reaper = init_pid_ns.child_reaper;
861 }
862
863 return pid_ns->child_reaper;
864}
865
834static void forget_original_parent(struct task_struct *father) 866static void forget_original_parent(struct task_struct *father)
835{ 867{
836 struct task_struct *p, *n, *reaper = father; 868 struct task_struct *p, *n, *reaper;
837 LIST_HEAD(ptrace_dead); 869 LIST_HEAD(ptrace_dead);
838 870
839 write_lock_irq(&tasklist_lock); 871 write_lock_irq(&tasklist_lock);
840 872 reaper = find_new_reaper(father);
841 /* 873 /*
842 * First clean up ptrace if we were using it. 874 * First clean up ptrace if we were using it.
843 */ 875 */
844 ptrace_exit(father, &ptrace_dead); 876 ptrace_exit(father, &ptrace_dead);
845 877
846 do {
847 reaper = next_thread(reaper);
848 if (reaper == father) {
849 reaper = task_child_reaper(father);
850 break;
851 }
852 } while (reaper->flags & PF_EXITING);
853
854 list_for_each_entry_safe(p, n, &father->children, sibling) { 878 list_for_each_entry_safe(p, n, &father->children, sibling) {
855 p->real_parent = reaper; 879 p->real_parent = reaper;
856 if (p->parent == father) { 880 if (p->parent == father) {
@@ -918,8 +942,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
918 942
919 /* mt-exec, de_thread() is waiting for us */ 943 /* mt-exec, de_thread() is waiting for us */
920 if (thread_group_leader(tsk) && 944 if (thread_group_leader(tsk) &&
921 tsk->signal->notify_count < 0 && 945 tsk->signal->group_exit_task &&
922 tsk->signal->group_exit_task) 946 tsk->signal->notify_count < 0)
923 wake_up_process(tsk->signal->group_exit_task); 947 wake_up_process(tsk->signal->group_exit_task);
924 948
925 write_unlock_irq(&tasklist_lock); 949 write_unlock_irq(&tasklist_lock);
@@ -959,39 +983,6 @@ static void check_stack_usage(void)
959static inline void check_stack_usage(void) {} 983static inline void check_stack_usage(void) {}
960#endif 984#endif
961 985
962static inline void exit_child_reaper(struct task_struct *tsk)
963{
964 if (likely(tsk->group_leader != task_child_reaper(tsk)))
965 return;
966
967 if (tsk->nsproxy->pid_ns == &init_pid_ns)
968 panic("Attempted to kill init!");
969
970 /*
971 * @tsk is the last thread in the 'cgroup-init' and is exiting.
972 * Terminate all remaining processes in the namespace and reap them
973 * before exiting @tsk.
974 *
975 * Note that @tsk (last thread of cgroup-init) may not necessarily
976 * be the child-reaper (i.e main thread of cgroup-init) of the
977 * namespace i.e the child_reaper may have already exited.
978 *
979 * Even after a child_reaper exits, we let it inherit orphaned children,
980 * because, pid_ns->child_reaper remains valid as long as there is
981 * at least one living sub-thread in the cgroup init.
982
983 * This living sub-thread of the cgroup-init will be notified when
984 * a child inherited by the 'child-reaper' exits (do_notify_parent()
985 * uses __group_send_sig_info()). Further, when reaping child processes,
986 * do_wait() iterates over children of all living sub threads.
987
988 * i.e even though 'child_reaper' thread is listed as the parent of the
989 * orphaned children, any living sub-thread in the cgroup-init can
990 * perform the role of the child_reaper.
991 */
992 zap_pid_ns_processes(tsk->nsproxy->pid_ns);
993}
994
995NORET_TYPE void do_exit(long code) 986NORET_TYPE void do_exit(long code)
996{ 987{
997 struct task_struct *tsk = current; 988 struct task_struct *tsk = current;
@@ -1051,7 +1042,6 @@ NORET_TYPE void do_exit(long code)
1051 } 1042 }
1052 group_dead = atomic_dec_and_test(&tsk->signal->live); 1043 group_dead = atomic_dec_and_test(&tsk->signal->live);
1053 if (group_dead) { 1044 if (group_dead) {
1054 exit_child_reaper(tsk);
1055 hrtimer_cancel(&tsk->signal->real_timer); 1045 hrtimer_cancel(&tsk->signal->real_timer);
1056 exit_itimers(tsk->signal); 1046 exit_itimers(tsk->signal);
1057 } 1047 }
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3bfb1877a003..dbda475b13bd 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -875,11 +875,11 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
875 if (!entry) 875 if (!entry)
876 return 0; 876 return 0;
877 877
878 entry->class = this;
879 entry->distance = distance;
880 if (!save_trace(&entry->trace)) 878 if (!save_trace(&entry->trace))
881 return 0; 879 return 0;
882 880
881 entry->class = this;
882 entry->distance = distance;
883 /* 883 /*
884 * Since we never remove from the dependency list, the list can 884 * Since we never remove from the dependency list, the list can
885 * be walked lockless by other CPUs, it's only allocation 885 * be walked lockless by other CPUs, it's only allocation
@@ -3029,7 +3029,7 @@ found_it:
3029 3029
3030 stats = get_lock_stats(hlock_class(hlock)); 3030 stats = get_lock_stats(hlock_class(hlock));
3031 if (point < ARRAY_SIZE(stats->contention_point)) 3031 if (point < ARRAY_SIZE(stats->contention_point))
3032 stats->contention_point[i]++; 3032 stats->contention_point[point]++;
3033 if (lock->cpu != smp_processor_id()) 3033 if (lock->cpu != smp_processor_id())
3034 stats->bounces[bounce_contended + !!hlock->read]++; 3034 stats->bounces[bounce_contended + !!hlock->read]++;
3035 put_lock_stats(stats); 3035 put_lock_stats(stats);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 4b194d34d77f..20dbcbf9c7dd 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -472,8 +472,9 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
472{ 472{
473 unsigned long rem; 473 unsigned long rem;
474 474
475 nr += 5; /* for display rounding */
475 rem = do_div(nr, 1000); /* XXX: do_div_signed */ 476 rem = do_div(nr, 1000); /* XXX: do_div_signed */
476 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10); 477 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10);
477} 478}
478 479
479static void seq_time(struct seq_file *m, s64 time) 480static void seq_time(struct seq_file *m, s64 time)
diff --git a/kernel/module.c b/kernel/module.c
index 08864d257eb0..9db11911e04b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1799,7 +1799,7 @@ static void *module_alloc_update_bounds(unsigned long size)
1799 1799
1800/* Allocate and load the module: note that size of section 0 is always 1800/* Allocate and load the module: note that size of section 0 is always
1801 zero, and we rely on this for optional sections. */ 1801 zero, and we rely on this for optional sections. */
1802static struct module *load_module(void __user *umod, 1802static noinline struct module *load_module(void __user *umod,
1803 unsigned long len, 1803 unsigned long len,
1804 const char __user *uargs) 1804 const char __user *uargs)
1805{ 1805{
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 21575fc46d05..1d3ef29a2583 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,7 +14,6 @@
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/version.h>
18#include <linux/nsproxy.h> 17#include <linux/nsproxy.h>
19#include <linux/init_task.h> 18#include <linux/init_task.h>
20#include <linux/mnt_namespace.h> 19#include <linux/mnt_namespace.h>
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index ea567b78d1aa..fab8ea86fac3 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -179,9 +179,6 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
179 rc = sys_wait4(-1, NULL, __WALL, NULL); 179 rc = sys_wait4(-1, NULL, __WALL, NULL);
180 } while (rc != -ECHILD); 180 } while (rc != -ECHILD);
181 181
182
183 /* Child reaper for the pid namespace is going away */
184 pid_ns->child_reaper = NULL;
185 acct_exit_ns(pid_ns); 182 acct_exit_ns(pid_ns);
186 return; 183 return;
187} 184}
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index da9c2dda6a4e..dfdec524d1b7 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -43,7 +43,7 @@
43#include <linux/uaccess.h> 43#include <linux/uaccess.h>
44 44
45/* 45/*
46 * locking rule: all changes to target_value or requirements or notifiers lists 46 * locking rule: all changes to requirements or notifiers lists
47 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 47 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
48 * held, taken with _irqsave. One lock to rule them all 48 * held, taken with _irqsave. One lock to rule them all
49 */ 49 */
@@ -66,7 +66,7 @@ struct pm_qos_object {
66 struct miscdevice pm_qos_power_miscdev; 66 struct miscdevice pm_qos_power_miscdev;
67 char *name; 67 char *name;
68 s32 default_value; 68 s32 default_value;
69 s32 target_value; 69 atomic_t target_value;
70 s32 (*comparitor)(s32, s32); 70 s32 (*comparitor)(s32, s32);
71}; 71};
72 72
@@ -77,7 +77,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
77 .notifiers = &cpu_dma_lat_notifier, 77 .notifiers = &cpu_dma_lat_notifier,
78 .name = "cpu_dma_latency", 78 .name = "cpu_dma_latency",
79 .default_value = 2000 * USEC_PER_SEC, 79 .default_value = 2000 * USEC_PER_SEC,
80 .target_value = 2000 * USEC_PER_SEC, 80 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
81 .comparitor = min_compare 81 .comparitor = min_compare
82}; 82};
83 83
@@ -87,7 +87,7 @@ static struct pm_qos_object network_lat_pm_qos = {
87 .notifiers = &network_lat_notifier, 87 .notifiers = &network_lat_notifier,
88 .name = "network_latency", 88 .name = "network_latency",
89 .default_value = 2000 * USEC_PER_SEC, 89 .default_value = 2000 * USEC_PER_SEC,
90 .target_value = 2000 * USEC_PER_SEC, 90 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
91 .comparitor = min_compare 91 .comparitor = min_compare
92}; 92};
93 93
@@ -99,7 +99,7 @@ static struct pm_qos_object network_throughput_pm_qos = {
99 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput", 100 .name = "network_throughput",
101 .default_value = 0, 101 .default_value = 0,
102 .target_value = 0, 102 .target_value = ATOMIC_INIT(0),
103 .comparitor = max_compare 103 .comparitor = max_compare
104}; 104};
105 105
@@ -150,11 +150,11 @@ static void update_target(int target)
150 extreme_value = pm_qos_array[target]->comparitor( 150 extreme_value = pm_qos_array[target]->comparitor(
151 extreme_value, node->value); 151 extreme_value, node->value);
152 } 152 }
153 if (pm_qos_array[target]->target_value != extreme_value) { 153 if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) {
154 call_notifier = 1; 154 call_notifier = 1;
155 pm_qos_array[target]->target_value = extreme_value; 155 atomic_set(&pm_qos_array[target]->target_value, extreme_value);
156 pr_debug(KERN_ERR "new target for qos %d is %d\n", target, 156 pr_debug(KERN_ERR "new target for qos %d is %d\n", target,
157 pm_qos_array[target]->target_value); 157 atomic_read(&pm_qos_array[target]->target_value));
158 } 158 }
159 spin_unlock_irqrestore(&pm_qos_lock, flags); 159 spin_unlock_irqrestore(&pm_qos_lock, flags);
160 160
@@ -193,14 +193,7 @@ static int find_pm_qos_object_by_minor(int minor)
193 */ 193 */
194int pm_qos_requirement(int pm_qos_class) 194int pm_qos_requirement(int pm_qos_class)
195{ 195{
196 int ret_val; 196 return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
197 unsigned long flags;
198
199 spin_lock_irqsave(&pm_qos_lock, flags);
200 ret_val = pm_qos_array[pm_qos_class]->target_value;
201 spin_unlock_irqrestore(&pm_qos_lock, flags);
202
203 return ret_val;
204} 197}
205EXPORT_SYMBOL_GPL(pm_qos_requirement); 198EXPORT_SYMBOL_GPL(pm_qos_requirement);
206 199
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f011e0870b52..bbd85c60f741 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -21,6 +21,7 @@
21#include <linux/console.h> 21#include <linux/console.h>
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/freezer.h> 23#include <linux/freezer.h>
24#include <linux/ftrace.h>
24 25
25#include "power.h" 26#include "power.h"
26 27
@@ -255,7 +256,7 @@ static int create_image(int platform_mode)
255 256
256int hibernation_snapshot(int platform_mode) 257int hibernation_snapshot(int platform_mode)
257{ 258{
258 int error; 259 int error, ftrace_save;
259 260
260 /* Free memory before shutting down devices. */ 261 /* Free memory before shutting down devices. */
261 error = swsusp_shrink_memory(); 262 error = swsusp_shrink_memory();
@@ -267,6 +268,7 @@ int hibernation_snapshot(int platform_mode)
267 goto Close; 268 goto Close;
268 269
269 suspend_console(); 270 suspend_console();
271 ftrace_save = __ftrace_enabled_save();
270 error = device_suspend(PMSG_FREEZE); 272 error = device_suspend(PMSG_FREEZE);
271 if (error) 273 if (error)
272 goto Recover_platform; 274 goto Recover_platform;
@@ -296,6 +298,7 @@ int hibernation_snapshot(int platform_mode)
296 Resume_devices: 298 Resume_devices:
297 device_resume(in_suspend ? 299 device_resume(in_suspend ?
298 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 300 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
301 __ftrace_enabled_restore(ftrace_save);
299 resume_console(); 302 resume_console();
300 Close: 303 Close:
301 platform_end(platform_mode); 304 platform_end(platform_mode);
@@ -366,10 +369,11 @@ static int resume_target_kernel(void)
366 369
367int hibernation_restore(int platform_mode) 370int hibernation_restore(int platform_mode)
368{ 371{
369 int error; 372 int error, ftrace_save;
370 373
371 pm_prepare_console(); 374 pm_prepare_console();
372 suspend_console(); 375 suspend_console();
376 ftrace_save = __ftrace_enabled_save();
373 error = device_suspend(PMSG_QUIESCE); 377 error = device_suspend(PMSG_QUIESCE);
374 if (error) 378 if (error)
375 goto Finish; 379 goto Finish;
@@ -384,6 +388,7 @@ int hibernation_restore(int platform_mode)
384 platform_restore_cleanup(platform_mode); 388 platform_restore_cleanup(platform_mode);
385 device_resume(PMSG_RECOVER); 389 device_resume(PMSG_RECOVER);
386 Finish: 390 Finish:
391 __ftrace_enabled_restore(ftrace_save);
387 resume_console(); 392 resume_console();
388 pm_restore_console(); 393 pm_restore_console();
389 return error; 394 return error;
@@ -396,7 +401,7 @@ int hibernation_restore(int platform_mode)
396 401
397int hibernation_platform_enter(void) 402int hibernation_platform_enter(void)
398{ 403{
399 int error; 404 int error, ftrace_save;
400 405
401 if (!hibernation_ops) 406 if (!hibernation_ops)
402 return -ENOSYS; 407 return -ENOSYS;
@@ -411,6 +416,7 @@ int hibernation_platform_enter(void)
411 goto Close; 416 goto Close;
412 417
413 suspend_console(); 418 suspend_console();
419 ftrace_save = __ftrace_enabled_save();
414 error = device_suspend(PMSG_HIBERNATE); 420 error = device_suspend(PMSG_HIBERNATE);
415 if (error) { 421 if (error) {
416 if (hibernation_ops->recover) 422 if (hibernation_ops->recover)
@@ -445,6 +451,7 @@ int hibernation_platform_enter(void)
445 hibernation_ops->finish(); 451 hibernation_ops->finish();
446 Resume_devices: 452 Resume_devices:
447 device_resume(PMSG_RESTORE); 453 device_resume(PMSG_RESTORE);
454 __ftrace_enabled_restore(ftrace_save);
448 resume_console(); 455 resume_console();
449 Close: 456 Close:
450 hibernation_ops->end(); 457 hibernation_ops->end();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0b7476f5d2a6..540b16b68565 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -21,6 +21,7 @@
21#include <linux/freezer.h> 21#include <linux/freezer.h>
22#include <linux/vmstat.h> 22#include <linux/vmstat.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/ftrace.h>
24 25
25#include "power.h" 26#include "power.h"
26 27
@@ -310,7 +311,7 @@ static int suspend_enter(suspend_state_t state)
310 */ 311 */
311int suspend_devices_and_enter(suspend_state_t state) 312int suspend_devices_and_enter(suspend_state_t state)
312{ 313{
313 int error; 314 int error, ftrace_save;
314 315
315 if (!suspend_ops) 316 if (!suspend_ops)
316 return -ENOSYS; 317 return -ENOSYS;
@@ -321,6 +322,7 @@ int suspend_devices_and_enter(suspend_state_t state)
321 goto Close; 322 goto Close;
322 } 323 }
323 suspend_console(); 324 suspend_console();
325 ftrace_save = __ftrace_enabled_save();
324 suspend_test_start(); 326 suspend_test_start();
325 error = device_suspend(PMSG_SUSPEND); 327 error = device_suspend(PMSG_SUSPEND);
326 if (error) { 328 if (error) {
@@ -352,6 +354,7 @@ int suspend_devices_and_enter(suspend_state_t state)
352 suspend_test_start(); 354 suspend_test_start();
353 device_resume(PMSG_RESUME); 355 device_resume(PMSG_RESUME);
354 suspend_test_finish("resume devices"); 356 suspend_test_finish("resume devices");
357 __ftrace_enabled_restore(ftrace_save);
355 resume_console(); 358 resume_console();
356 Close: 359 Close:
357 if (suspend_ops->end) 360 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a0abf9a463f9..80ccac849e46 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -14,7 +14,6 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/file.h> 15#include <linux/file.h>
16#include <linux/utsname.h> 16#include <linux/utsname.h>
17#include <linux/version.h>
18#include <linux/delay.h> 17#include <linux/delay.h>
19#include <linux/bitops.h> 18#include <linux/bitops.h>
20#include <linux/genhd.h> 19#include <linux/genhd.h>
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f14f372cf6f5..467d5940f624 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -77,6 +77,7 @@ void wakeme_after_rcu(struct rcu_head *head)
77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
78 * and may be nested. 78 * and may be nested.
79 */ 79 */
80void synchronize_rcu(void); /* Makes kernel-doc tools happy */
80synchronize_rcu_xxx(synchronize_rcu, call_rcu) 81synchronize_rcu_xxx(synchronize_rcu, call_rcu)
81EXPORT_SYMBOL_GPL(synchronize_rcu); 82EXPORT_SYMBOL_GPL(synchronize_rcu);
82 83
diff --git a/kernel/resource.c b/kernel/resource.c
index f5b518eabefe..03d796c1b2e9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -362,35 +362,21 @@ int allocate_resource(struct resource *root, struct resource *new,
362 362
363EXPORT_SYMBOL(allocate_resource); 363EXPORT_SYMBOL(allocate_resource);
364 364
365/** 365/*
366 * insert_resource - Inserts a resource in the resource tree 366 * Insert a resource into the resource tree. If successful, return NULL,
367 * @parent: parent of the new resource 367 * otherwise return the conflicting resource (compare to __request_resource())
368 * @new: new resource to insert
369 *
370 * Returns 0 on success, -EBUSY if the resource can't be inserted.
371 *
372 * This function is equivalent to request_resource when no conflict
373 * happens. If a conflict happens, and the conflicting resources
374 * entirely fit within the range of the new resource, then the new
375 * resource is inserted and the conflicting resources become children of
376 * the new resource.
377 */ 368 */
378int insert_resource(struct resource *parent, struct resource *new) 369static struct resource * __insert_resource(struct resource *parent, struct resource *new)
379{ 370{
380 int result;
381 struct resource *first, *next; 371 struct resource *first, *next;
382 372
383 write_lock(&resource_lock);
384
385 for (;; parent = first) { 373 for (;; parent = first) {
386 result = 0;
387 first = __request_resource(parent, new); 374 first = __request_resource(parent, new);
388 if (!first) 375 if (!first)
389 goto out; 376 return first;
390 377
391 result = -EBUSY;
392 if (first == parent) 378 if (first == parent)
393 goto out; 379 return first;
394 380
395 if ((first->start > new->start) || (first->end < new->end)) 381 if ((first->start > new->start) || (first->end < new->end))
396 break; 382 break;
@@ -401,15 +387,13 @@ int insert_resource(struct resource *parent, struct resource *new)
401 for (next = first; ; next = next->sibling) { 387 for (next = first; ; next = next->sibling) {
402 /* Partial overlap? Bad, and unfixable */ 388 /* Partial overlap? Bad, and unfixable */
403 if (next->start < new->start || next->end > new->end) 389 if (next->start < new->start || next->end > new->end)
404 goto out; 390 return next;
405 if (!next->sibling) 391 if (!next->sibling)
406 break; 392 break;
407 if (next->sibling->start > new->end) 393 if (next->sibling->start > new->end)
408 break; 394 break;
409 } 395 }
410 396
411 result = 0;
412
413 new->parent = parent; 397 new->parent = parent;
414 new->sibling = next->sibling; 398 new->sibling = next->sibling;
415 new->child = first; 399 new->child = first;
@@ -426,10 +410,64 @@ int insert_resource(struct resource *parent, struct resource *new)
426 next = next->sibling; 410 next = next->sibling;
427 next->sibling = new; 411 next->sibling = new;
428 } 412 }
413 return NULL;
414}
429 415
430 out: 416/**
417 * insert_resource - Inserts a resource in the resource tree
418 * @parent: parent of the new resource
419 * @new: new resource to insert
420 *
421 * Returns 0 on success, -EBUSY if the resource can't be inserted.
422 *
423 * This function is equivalent to request_resource when no conflict
424 * happens. If a conflict happens, and the conflicting resources
425 * entirely fit within the range of the new resource, then the new
426 * resource is inserted and the conflicting resources become children of
427 * the new resource.
428 */
429int insert_resource(struct resource *parent, struct resource *new)
430{
431 struct resource *conflict;
432
433 write_lock(&resource_lock);
434 conflict = __insert_resource(parent, new);
435 write_unlock(&resource_lock);
436 return conflict ? -EBUSY : 0;
437}
438
439/**
440 * insert_resource_expand_to_fit - Insert a resource into the resource tree
441 * @root: root resource descriptor
442 * @new: new resource to insert
443 *
444 * Insert a resource into the resource tree, possibly expanding it in order
445 * to make it encompass any conflicting resources.
446 */
447void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
448{
449 if (new->parent)
450 return;
451
452 write_lock(&resource_lock);
453 for (;;) {
454 struct resource *conflict;
455
456 conflict = __insert_resource(root, new);
457 if (!conflict)
458 break;
459 if (conflict == root)
460 break;
461
462 /* Ok, expand resource to cover the conflict, then try again .. */
463 if (conflict->start < new->start)
464 new->start = conflict->start;
465 if (conflict->end > new->end)
466 new->end = conflict->end;
467
468 printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
469 }
431 write_unlock(&resource_lock); 470 write_unlock(&resource_lock);
432 return result;
433} 471}
434 472
435/** 473/**
diff --git a/kernel/sched.c b/kernel/sched.c
index 9a1ddb84e26d..cc1f81b50b82 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4179,6 +4179,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
4179} 4179}
4180 4180
4181/* 4181/*
4182 * Use precise platform statistics if available:
4183 */
4184#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4185cputime_t task_utime(struct task_struct *p)
4186{
4187 return p->utime;
4188}
4189
4190cputime_t task_stime(struct task_struct *p)
4191{
4192 return p->stime;
4193}
4194#else
4195cputime_t task_utime(struct task_struct *p)
4196{
4197 clock_t utime = cputime_to_clock_t(p->utime),
4198 total = utime + cputime_to_clock_t(p->stime);
4199 u64 temp;
4200
4201 /*
4202 * Use CFS's precise accounting:
4203 */
4204 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4205
4206 if (total) {
4207 temp *= utime;
4208 do_div(temp, total);
4209 }
4210 utime = (clock_t)temp;
4211
4212 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4213 return p->prev_utime;
4214}
4215
4216cputime_t task_stime(struct task_struct *p)
4217{
4218 clock_t stime;
4219
4220 /*
4221 * Use CFS's precise accounting. (we subtract utime from
4222 * the total, to make sure the total observed by userspace
4223 * grows monotonically - apps rely on that):
4224 */
4225 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4226 cputime_to_clock_t(task_utime(p));
4227
4228 if (stime >= 0)
4229 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4230
4231 return p->prev_stime;
4232}
4233#endif
4234
4235inline cputime_t task_gtime(struct task_struct *p)
4236{
4237 return p->gtime;
4238}
4239
4240/*
4182 * This function gets called by the timer code, with HZ frequency. 4241 * This function gets called by the timer code, with HZ frequency.
4183 * We call it with interrupts disabled. 4242 * We call it with interrupts disabled.
4184 * 4243 *
@@ -7637,24 +7696,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7637 * and partition_sched_domains() will fallback to the single partition 7696 * and partition_sched_domains() will fallback to the single partition
7638 * 'fallback_doms', it also forces the domains to be rebuilt. 7697 * 'fallback_doms', it also forces the domains to be rebuilt.
7639 * 7698 *
7699 * If doms_new==NULL it will be replaced with cpu_online_map.
7700 * ndoms_new==0 is a special case for destroying existing domains.
7701 * It will not create the default domain.
7702 *
7640 * Call with hotplug lock held 7703 * Call with hotplug lock held
7641 */ 7704 */
7642void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7705void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7643 struct sched_domain_attr *dattr_new) 7706 struct sched_domain_attr *dattr_new)
7644{ 7707{
7645 int i, j; 7708 int i, j, n;
7646 7709
7647 mutex_lock(&sched_domains_mutex); 7710 mutex_lock(&sched_domains_mutex);
7648 7711
7649 /* always unregister in case we don't destroy any domains */ 7712 /* always unregister in case we don't destroy any domains */
7650 unregister_sched_domain_sysctl(); 7713 unregister_sched_domain_sysctl();
7651 7714
7652 if (doms_new == NULL) 7715 n = doms_new ? ndoms_new : 0;
7653 ndoms_new = 0;
7654 7716
7655 /* Destroy deleted domains */ 7717 /* Destroy deleted domains */
7656 for (i = 0; i < ndoms_cur; i++) { 7718 for (i = 0; i < ndoms_cur; i++) {
7657 for (j = 0; j < ndoms_new; j++) { 7719 for (j = 0; j < n; j++) {
7658 if (cpus_equal(doms_cur[i], doms_new[j]) 7720 if (cpus_equal(doms_cur[i], doms_new[j])
7659 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7721 && dattrs_equal(dattr_cur, i, dattr_new, j))
7660 goto match1; 7722 goto match1;
@@ -7667,7 +7729,6 @@ match1:
7667 7729
7668 if (doms_new == NULL) { 7730 if (doms_new == NULL) {
7669 ndoms_cur = 0; 7731 ndoms_cur = 0;
7670 ndoms_new = 1;
7671 doms_new = &fallback_doms; 7732 doms_new = &fallback_doms;
7672 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7733 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7673 dattr_new = NULL; 7734 dattr_new = NULL;
@@ -7704,8 +7765,13 @@ match2:
7704int arch_reinit_sched_domains(void) 7765int arch_reinit_sched_domains(void)
7705{ 7766{
7706 get_online_cpus(); 7767 get_online_cpus();
7768
7769 /* Destroy domains first to force the rebuild */
7770 partition_sched_domains(0, NULL, NULL);
7771
7707 rebuild_sched_domains(); 7772 rebuild_sched_domains();
7708 put_online_cpus(); 7773 put_online_cpus();
7774
7709 return 0; 7775 return 0;
7710} 7776}
7711 7777
@@ -7789,7 +7855,7 @@ static int update_sched_domains(struct notifier_block *nfb,
7789 case CPU_ONLINE_FROZEN: 7855 case CPU_ONLINE_FROZEN:
7790 case CPU_DEAD: 7856 case CPU_DEAD:
7791 case CPU_DEAD_FROZEN: 7857 case CPU_DEAD_FROZEN:
7792 partition_sched_domains(0, NULL, NULL); 7858 partition_sched_domains(1, NULL, NULL);
7793 return NOTIFY_OK; 7859 return NOTIFY_OK;
7794 7860
7795 default: 7861 default:
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 204991a0bfa7..e8ab096ddfe3 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -12,19 +12,17 @@
12 * 12 *
13 * Create a semi stable clock from a mixture of other events, including: 13 * Create a semi stable clock from a mixture of other events, including:
14 * - gtod 14 * - gtod
15 * - jiffies
16 * - sched_clock() 15 * - sched_clock()
17 * - explicit idle events 16 * - explicit idle events
18 * 17 *
19 * We use gtod as base and the unstable clock deltas. The deltas are filtered, 18 * We use gtod as base and the unstable clock deltas. The deltas are filtered,
20 * making it monotonic and keeping it within an expected window. This window 19 * making it monotonic and keeping it within an expected window.
21 * is set up using jiffies.
22 * 20 *
23 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 21 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
24 * that is otherwise invisible (TSC gets stopped). 22 * that is otherwise invisible (TSC gets stopped).
25 * 23 *
26 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat 24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
27 * consistent between cpus (never more than 1 jiffies difference). 25 * consistent between cpus (never more than 2 jiffies difference).
28 */ 26 */
29#include <linux/sched.h> 27#include <linux/sched.h>
30#include <linux/percpu.h> 28#include <linux/percpu.h>
@@ -54,7 +52,6 @@ struct sched_clock_data {
54 */ 52 */
55 raw_spinlock_t lock; 53 raw_spinlock_t lock;
56 54
57 unsigned long tick_jiffies;
58 u64 tick_raw; 55 u64 tick_raw;
59 u64 tick_gtod; 56 u64 tick_gtod;
60 u64 clock; 57 u64 clock;
@@ -75,14 +72,12 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
75void sched_clock_init(void) 72void sched_clock_init(void)
76{ 73{
77 u64 ktime_now = ktime_to_ns(ktime_get()); 74 u64 ktime_now = ktime_to_ns(ktime_get());
78 unsigned long now_jiffies = jiffies;
79 int cpu; 75 int cpu;
80 76
81 for_each_possible_cpu(cpu) { 77 for_each_possible_cpu(cpu) {
82 struct sched_clock_data *scd = cpu_sdc(cpu); 78 struct sched_clock_data *scd = cpu_sdc(cpu);
83 79
84 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 80 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
85 scd->tick_jiffies = now_jiffies;
86 scd->tick_raw = 0; 81 scd->tick_raw = 0;
87 scd->tick_gtod = ktime_now; 82 scd->tick_gtod = ktime_now;
88 scd->clock = ktime_now; 83 scd->clock = ktime_now;
@@ -92,46 +87,51 @@ void sched_clock_init(void)
92} 87}
93 88
94/* 89/*
90 * min,max except they take wrapping into account
91 */
92
93static inline u64 wrap_min(u64 x, u64 y)
94{
95 return (s64)(x - y) < 0 ? x : y;
96}
97
98static inline u64 wrap_max(u64 x, u64 y)
99{
100 return (s64)(x - y) > 0 ? x : y;
101}
102
103/*
95 * update the percpu scd from the raw @now value 104 * update the percpu scd from the raw @now value
96 * 105 *
97 * - filter out backward motion 106 * - filter out backward motion
98 * - use jiffies to generate a min,max window to clip the raw values 107 * - use the GTOD tick value to create a window to filter crazy TSC values
99 */ 108 */
100static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) 109static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
101{ 110{
102 unsigned long now_jiffies = jiffies;
103 long delta_jiffies = now_jiffies - scd->tick_jiffies;
104 u64 clock = scd->clock;
105 u64 min_clock, max_clock;
106 s64 delta = now - scd->tick_raw; 111 s64 delta = now - scd->tick_raw;
112 u64 clock, min_clock, max_clock;
107 113
108 WARN_ON_ONCE(!irqs_disabled()); 114 WARN_ON_ONCE(!irqs_disabled());
109 min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
110 115
111 if (unlikely(delta < 0)) { 116 if (unlikely(delta < 0))
112 clock++; 117 delta = 0;
113 goto out;
114 }
115 118
116 max_clock = min_clock + TICK_NSEC; 119 /*
120 * scd->clock = clamp(scd->tick_gtod + delta,
121 * max(scd->tick_gtod, scd->clock),
122 * scd->tick_gtod + TICK_NSEC);
123 */
117 124
118 if (unlikely(clock + delta > max_clock)) { 125 clock = scd->tick_gtod + delta;
119 if (clock < max_clock) 126 min_clock = wrap_max(scd->tick_gtod, scd->clock);
120 clock = max_clock; 127 max_clock = scd->tick_gtod + TICK_NSEC;
121 else
122 clock++;
123 } else {
124 clock += delta;
125 }
126 128
127 out: 129 clock = wrap_max(clock, min_clock);
128 if (unlikely(clock < min_clock)) 130 clock = wrap_min(clock, max_clock);
129 clock = min_clock;
130 131
131 scd->tick_jiffies = now_jiffies;
132 scd->clock = clock; 132 scd->clock = clock;
133 133
134 return clock; 134 return scd->clock;
135} 135}
136 136
137static void lock_double_clock(struct sched_clock_data *data1, 137static void lock_double_clock(struct sched_clock_data *data1,
@@ -171,7 +171,7 @@ u64 sched_clock_cpu(int cpu)
171 * larger time as the latest time for both 171 * larger time as the latest time for both
172 * runqueues. (this creates monotonic movement) 172 * runqueues. (this creates monotonic movement)
173 */ 173 */
174 if (likely(remote_clock < this_clock)) { 174 if (likely((s64)(remote_clock - this_clock) < 0)) {
175 clock = this_clock; 175 clock = this_clock;
176 scd->clock = clock; 176 scd->clock = clock;
177 } else { 177 } else {
@@ -207,14 +207,9 @@ void sched_clock_tick(void)
207 now = sched_clock(); 207 now = sched_clock();
208 208
209 __raw_spin_lock(&scd->lock); 209 __raw_spin_lock(&scd->lock);
210 __update_sched_clock(scd, now);
211 /*
212 * update tick_gtod after __update_sched_clock() because that will
213 * already observe 1 new jiffy; adding a new tick_gtod to that would
214 * increase the clock 2 jiffies.
215 */
216 scd->tick_raw = now; 210 scd->tick_raw = now;
217 scd->tick_gtod = now_gtod; 211 scd->tick_gtod = now_gtod;
212 __update_sched_clock(scd, now);
218 __raw_spin_unlock(&scd->lock); 213 __raw_spin_unlock(&scd->lock);
219} 214}
220 215
@@ -232,18 +227,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
232 */ 227 */
233void sched_clock_idle_wakeup_event(u64 delta_ns) 228void sched_clock_idle_wakeup_event(u64 delta_ns)
234{ 229{
235 struct sched_clock_data *scd = this_scd(); 230 sched_clock_tick();
236
237 /*
238 * Override the previous timestamp and ignore all
239 * sched_clock() deltas that occured while we idled,
240 * and use the PM-provided delta_ns to advance the
241 * rq clock:
242 */
243 __raw_spin_lock(&scd->lock);
244 scd->clock += delta_ns;
245 __raw_spin_unlock(&scd->lock);
246
247 touch_softlockup_watchdog(); 231 touch_softlockup_watchdog();
248} 232}
249EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 233EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 862b06bd560a..9353ca78154e 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -8,6 +8,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
8SCHED_FEAT(HRTICK, 1) 8SCHED_FEAT(HRTICK, 1)
9SCHED_FEAT(DOUBLE_TICK, 0) 9SCHED_FEAT(DOUBLE_TICK, 0)
10SCHED_FEAT(ASYM_GRAN, 1) 10SCHED_FEAT(ASYM_GRAN, 1)
11SCHED_FEAT(LB_BIAS, 0) 11SCHED_FEAT(LB_BIAS, 1)
12SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 12SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1) 13SCHED_FEAT(ASYM_EFF_LOAD, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 998ba54b4543..552310798dad 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -199,6 +199,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
199 199
200static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 if (rt_rq->rt_nr_running)
203 resched_task(rq_of_rt_rq(rt_rq)->curr);
202} 204}
203 205
204static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 206static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -438,9 +440,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
438{ 440{
439 u64 runtime = sched_rt_runtime(rt_rq); 441 u64 runtime = sched_rt_runtime(rt_rq);
440 442
441 if (runtime == RUNTIME_INF)
442 return 0;
443
444 if (rt_rq->rt_throttled) 443 if (rt_rq->rt_throttled)
445 return rt_rq_throttled(rt_rq); 444 return rt_rq_throttled(rt_rq);
446 445
@@ -491,9 +490,11 @@ static void update_curr_rt(struct rq *rq)
491 rt_rq = rt_rq_of_se(rt_se); 490 rt_rq = rt_rq_of_se(rt_se);
492 491
493 spin_lock(&rt_rq->rt_runtime_lock); 492 spin_lock(&rt_rq->rt_runtime_lock);
494 rt_rq->rt_time += delta_exec; 493 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
495 if (sched_rt_runtime_exceeded(rt_rq)) 494 rt_rq->rt_time += delta_exec;
496 resched_task(curr); 495 if (sched_rt_runtime_exceeded(rt_rq))
496 resched_task(curr);
497 }
497 spin_unlock(&rt_rq->rt_runtime_lock); 498 spin_unlock(&rt_rq->rt_runtime_lock);
498 } 499 }
499} 500}
diff --git a/kernel/smp.c b/kernel/smp.c
index 782e2b93e465..f362a8553777 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -210,8 +210,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
210{ 210{
211 struct call_single_data d; 211 struct call_single_data d;
212 unsigned long flags; 212 unsigned long flags;
213 /* prevent preemption and reschedule on another processor */ 213 /* prevent preemption and reschedule on another processor,
214 as well as CPU removal */
214 int me = get_cpu(); 215 int me = get_cpu();
216 int err = 0;
215 217
216 /* Can deadlock when called with interrupts disabled */ 218 /* Can deadlock when called with interrupts disabled */
217 WARN_ON(irqs_disabled()); 219 WARN_ON(irqs_disabled());
@@ -220,7 +222,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
220 local_irq_save(flags); 222 local_irq_save(flags);
221 func(info); 223 func(info);
222 local_irq_restore(flags); 224 local_irq_restore(flags);
223 } else { 225 } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) {
224 struct call_single_data *data = NULL; 226 struct call_single_data *data = NULL;
225 227
226 if (!wait) { 228 if (!wait) {
@@ -236,10 +238,12 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
236 data->func = func; 238 data->func = func;
237 data->info = info; 239 data->info = info;
238 generic_exec_single(cpu, data); 240 generic_exec_single(cpu, data);
241 } else {
242 err = -ENXIO; /* CPU not online */
239 } 243 }
240 244
241 put_cpu(); 245 put_cpu();
242 return 0; 246 return err;
243} 247}
244EXPORT_SYMBOL(smp_call_function_single); 248EXPORT_SYMBOL(smp_call_function_single);
245 249
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index b75b492fbfcf..cb838ee93a82 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -233,7 +233,8 @@ static void check_hung_uninterruptible_tasks(int this_cpu)
233 do_each_thread(g, t) { 233 do_each_thread(g, t) {
234 if (!--max_count) 234 if (!--max_count)
235 goto unlock; 235 goto unlock;
236 if (t->state & TASK_UNINTERRUPTIBLE) 236 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
237 if (t->state == TASK_UNINTERRUPTIBLE)
237 check_hung_task(t, now); 238 check_hung_task(t, now);
238 } while_each_thread(g, t); 239 } while_each_thread(g, t);
239 unlock: 240 unlock:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index fe4713347275..50ec0886fa3d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -159,6 +159,7 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *
159static struct ctl_table root_table[]; 159static struct ctl_table root_table[];
160static struct ctl_table_root sysctl_table_root; 160static struct ctl_table_root sysctl_table_root;
161static struct ctl_table_header root_table_header = { 161static struct ctl_table_header root_table_header = {
162 .count = 1,
162 .ctl_table = root_table, 163 .ctl_table = root_table,
163 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), 164 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
164 .root = &sysctl_table_root, 165 .root = &sysctl_table_root,
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 3d1e3e1a1971..1876b526c778 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -177,7 +177,7 @@ void clockevents_register_device(struct clock_event_device *dev)
177/* 177/*
178 * Noop handler when we shut down an event device 178 * Noop handler when we shut down an event device
179 */ 179 */
180static void clockevents_handle_noop(struct clock_event_device *dev) 180void clockevents_handle_noop(struct clock_event_device *dev)
181{ 181{
182} 182}
183 183
@@ -199,7 +199,6 @@ void clockevents_exchange_device(struct clock_event_device *old,
199 * released list and do a notify add later. 199 * released list and do a notify add later.
200 */ 200 */
201 if (old) { 201 if (old) {
202 old->event_handler = clockevents_handle_noop;
203 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); 202 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
204 list_del(&old->list); 203 list_del(&old->list);
205 list_add(&old->list, &clockevents_released); 204 list_add(&old->list, &clockevents_released);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5125ddd8196b..1ad46f3df6e7 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -245,7 +245,7 @@ static void sync_cmos_clock(unsigned long dummy)
245 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) 245 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
246 fail = update_persistent_clock(now); 246 fail = update_persistent_clock(now);
247 247
248 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; 248 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
249 if (next.tv_nsec <= 0) 249 if (next.tv_nsec <= 0)
250 next.tv_nsec += NSEC_PER_SEC; 250 next.tv_nsec += NSEC_PER_SEC;
251 251
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 31463d370b94..2f5a38294bf9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -175,6 +175,8 @@ static void tick_do_periodic_broadcast(void)
175 */ 175 */
176static void tick_handle_periodic_broadcast(struct clock_event_device *dev) 176static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
177{ 177{
178 ktime_t next;
179
178 tick_do_periodic_broadcast(); 180 tick_do_periodic_broadcast();
179 181
180 /* 182 /*
@@ -185,10 +187,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
185 187
186 /* 188 /*
187 * Setup the next period for devices, which do not have 189 * Setup the next period for devices, which do not have
188 * periodic mode: 190 * periodic mode. We read dev->next_event first and add to it
191 * when the event alrady expired. clockevents_program_event()
192 * sets dev->next_event only when the event is really
193 * programmed to the device.
189 */ 194 */
190 for (;;) { 195 for (next = dev->next_event; ;) {
191 ktime_t next = ktime_add(dev->next_event, tick_period); 196 next = ktime_add(next, tick_period);
192 197
193 if (!clockevents_program_event(dev, next, ktime_get())) 198 if (!clockevents_program_event(dev, next, ktime_get()))
194 return; 199 return;
@@ -205,7 +210,7 @@ static void tick_do_broadcast_on_off(void *why)
205 struct clock_event_device *bc, *dev; 210 struct clock_event_device *bc, *dev;
206 struct tick_device *td; 211 struct tick_device *td;
207 unsigned long flags, *reason = why; 212 unsigned long flags, *reason = why;
208 int cpu; 213 int cpu, bc_stopped;
209 214
210 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 spin_lock_irqsave(&tick_broadcast_lock, flags);
211 216
@@ -223,6 +228,8 @@ static void tick_do_broadcast_on_off(void *why)
223 if (!tick_device_is_functional(dev)) 228 if (!tick_device_is_functional(dev))
224 goto out; 229 goto out;
225 230
231 bc_stopped = cpus_empty(tick_broadcast_mask);
232
226 switch (*reason) { 233 switch (*reason) {
227 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 234 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
228 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
@@ -245,9 +252,10 @@ static void tick_do_broadcast_on_off(void *why)
245 break; 252 break;
246 } 253 }
247 254
248 if (cpus_empty(tick_broadcast_mask)) 255 if (cpus_empty(tick_broadcast_mask)) {
249 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 256 if (!bc_stopped)
250 else { 257 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
258 } else if (bc_stopped) {
251 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 259 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
252 tick_broadcast_start_periodic(bc); 260 tick_broadcast_start_periodic(bc);
253 else 261 else
@@ -364,16 +372,8 @@ cpumask_t *tick_get_broadcast_oneshot_mask(void)
364static int tick_broadcast_set_event(ktime_t expires, int force) 372static int tick_broadcast_set_event(ktime_t expires, int force)
365{ 373{
366 struct clock_event_device *bc = tick_broadcast_device.evtdev; 374 struct clock_event_device *bc = tick_broadcast_device.evtdev;
367 ktime_t now = ktime_get(); 375
368 int res; 376 return tick_dev_program_event(bc, expires, force);
369
370 for(;;) {
371 res = clockevents_program_event(bc, expires, now);
372 if (!res || !force)
373 return res;
374 now = ktime_get();
375 expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
376 }
377} 377}
378 378
379int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 379int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
@@ -491,14 +491,52 @@ static void tick_broadcast_clear_oneshot(int cpu)
491 cpu_clear(cpu, tick_broadcast_oneshot_mask); 491 cpu_clear(cpu, tick_broadcast_oneshot_mask);
492} 492}
493 493
494static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires)
495{
496 struct tick_device *td;
497 int cpu;
498
499 for_each_cpu_mask_nr(cpu, *mask) {
500 td = &per_cpu(tick_cpu_device, cpu);
501 if (td->evtdev)
502 td->evtdev->next_event = expires;
503 }
504}
505
494/** 506/**
495 * tick_broadcast_setup_oneshot - setup the broadcast device 507 * tick_broadcast_setup_oneshot - setup the broadcast device
496 */ 508 */
497void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 509void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
498{ 510{
499 bc->event_handler = tick_handle_oneshot_broadcast; 511 /* Set it up only once ! */
500 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 512 if (bc->event_handler != tick_handle_oneshot_broadcast) {
501 bc->next_event.tv64 = KTIME_MAX; 513 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
514 int cpu = smp_processor_id();
515 cpumask_t mask;
516
517 bc->event_handler = tick_handle_oneshot_broadcast;
518 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
519
520 /* Take the do_timer update */
521 tick_do_timer_cpu = cpu;
522
523 /*
524 * We must be careful here. There might be other CPUs
525 * waiting for periodic broadcast. We need to set the
526 * oneshot_mask bits for those and program the
527 * broadcast device to fire.
528 */
529 mask = tick_broadcast_mask;
530 cpu_clear(cpu, mask);
531 cpus_or(tick_broadcast_oneshot_mask,
532 tick_broadcast_oneshot_mask, mask);
533
534 if (was_periodic && !cpus_empty(mask)) {
535 tick_broadcast_init_next_event(&mask, tick_next_period);
536 tick_broadcast_set_event(tick_next_period, 1);
537 } else
538 bc->next_event.tv64 = KTIME_MAX;
539 }
502} 540}
503 541
504/* 542/*
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 80c4336f4188..c4777193d567 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -161,6 +161,7 @@ static void tick_setup_device(struct tick_device *td,
161 } else { 161 } else {
162 handler = td->evtdev->event_handler; 162 handler = td->evtdev->event_handler;
163 next_event = td->evtdev->next_event; 163 next_event = td->evtdev->next_event;
164 td->evtdev->event_handler = clockevents_handle_noop;
164 } 165 }
165 166
166 td->evtdev = newdev; 167 td->evtdev = newdev;
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f13f2b7f4fd4..0ffc2918ea6f 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -17,6 +17,8 @@ extern void tick_handle_periodic(struct clock_event_device *dev);
17extern void tick_setup_oneshot(struct clock_event_device *newdev, 17extern void tick_setup_oneshot(struct clock_event_device *newdev,
18 void (*handler)(struct clock_event_device *), 18 void (*handler)(struct clock_event_device *),
19 ktime_t nextevt); 19 ktime_t nextevt);
20extern int tick_dev_program_event(struct clock_event_device *dev,
21 ktime_t expires, int force);
20extern int tick_program_event(ktime_t expires, int force); 22extern int tick_program_event(ktime_t expires, int force);
21extern void tick_oneshot_notify(void); 23extern void tick_oneshot_notify(void);
22extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); 24extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 450c04935b66..2e35501e61dd 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -23,24 +23,58 @@
23#include "tick-internal.h" 23#include "tick-internal.h"
24 24
25/** 25/**
26 * tick_program_event 26 * tick_program_event internal worker function
27 */ 27 */
28int tick_program_event(ktime_t expires, int force) 28int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
29 int force)
29{ 30{
30 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
31 ktime_t now = ktime_get(); 31 ktime_t now = ktime_get();
32 int i;
32 33
33 while (1) { 34 for (i = 0;;) {
34 int ret = clockevents_program_event(dev, expires, now); 35 int ret = clockevents_program_event(dev, expires, now);
35 36
36 if (!ret || !force) 37 if (!ret || !force)
37 return ret; 38 return ret;
39
40 /*
41 * We tried 2 times to program the device with the given
42 * min_delta_ns. If that's not working then we double it
43 * and emit a warning.
44 */
45 if (++i > 2) {
46 printk(KERN_WARNING "CE: __tick_program_event of %s is "
47 "stuck %llx %llx\n", dev->name ? dev->name : "?",
48 now.tv64, expires.tv64);
49 printk(KERN_WARNING
50 "CE: increasing min_delta_ns %ld to %ld nsec\n",
51 dev->min_delta_ns, dev->min_delta_ns << 1);
52 WARN_ON(1);
53
54 /* Double the min. delta and try again */
55 if (!dev->min_delta_ns)
56 dev->min_delta_ns = 5000;
57 else
58 dev->min_delta_ns <<= 1;
59 i = 0;
60 }
61
38 now = ktime_get(); 62 now = ktime_get();
39 expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); 63 expires = ktime_add_ns(now, dev->min_delta_ns);
40 } 64 }
41} 65}
42 66
43/** 67/**
68 * tick_program_event
69 */
70int tick_program_event(ktime_t expires, int force)
71{
72 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
73
74 return tick_dev_program_event(dev, expires, force);
75}
76
77/**
44 * tick_resume_onshot - resume oneshot mode 78 * tick_resume_onshot - resume oneshot mode
45 */ 79 */
46void tick_resume_oneshot(void) 80void tick_resume_oneshot(void)
@@ -61,7 +95,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
61{ 95{
62 newdev->event_handler = handler; 96 newdev->event_handler = handler;
63 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); 97 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
64 clockevents_program_event(newdev, next_event, ktime_get()); 98 tick_dev_program_event(newdev, next_event, 1);
65} 99}
66 100
67/** 101/**
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f5da526424a9..a87b0468568b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -162,6 +162,8 @@ void tick_nohz_stop_idle(int cpu)
162 ts->idle_lastupdate = now; 162 ts->idle_lastupdate = now;
163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); 163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
164 ts->idle_active = 0; 164 ts->idle_active = 0;
165
166 sched_clock_idle_wakeup_event(0);
165 } 167 }
166} 168}
167 169
@@ -177,6 +179,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
177 } 179 }
178 ts->idle_entrytime = now; 180 ts->idle_entrytime = now;
179 ts->idle_active = 1; 181 ts->idle_active = 1;
182 sched_clock_idle_sleep_event();
180 return now; 183 return now;
181} 184}
182 185
@@ -643,17 +646,21 @@ void tick_setup_sched_timer(void)
643 ts->nohz_mode = NOHZ_MODE_HIGHRES; 646 ts->nohz_mode = NOHZ_MODE_HIGHRES;
644#endif 647#endif
645} 648}
649#endif /* HIGH_RES_TIMERS */
646 650
651#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS
647void tick_cancel_sched_timer(int cpu) 652void tick_cancel_sched_timer(int cpu)
648{ 653{
649 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 654 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
650 655
656# ifdef CONFIG_HIGH_RES_TIMERS
651 if (ts->sched_timer.base) 657 if (ts->sched_timer.base)
652 hrtimer_cancel(&ts->sched_timer); 658 hrtimer_cancel(&ts->sched_timer);
659# endif
653 660
654 ts->nohz_mode = NOHZ_MODE_INACTIVE; 661 ts->nohz_mode = NOHZ_MODE_INACTIVE;
655} 662}
656#endif /* HIGH_RES_TIMERS */ 663#endif
657 664
658/** 665/**
659 * Async notification about clocksource changes 666 * Async notification about clocksource changes
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index a9ab0596de44..532858fa5b88 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -6,7 +6,6 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/version.h>
10#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
11#include <linux/slab.h> 10#include <linux/slab.h>
12#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 64d398f12444..815237a55af8 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -12,7 +12,6 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/version.h>
16#include <linux/err.h> 15#include <linux/err.h>
17#include <linux/slab.h> 16#include <linux/slab.h>
18 17
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index fe3a56c2256d..4ab9659d269e 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -12,7 +12,6 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/version.h>
16#include <linux/sysctl.h> 15#include <linux/sysctl.h>
17 16
18static void *get_uts(ctl_table *table, int write) 17static void *get_uts(ctl_table *table, int write)