diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cpuset.c | 312 | ||||
-rw-r--r-- | kernel/exit.c | 6 | ||||
-rw-r--r-- | kernel/sched.c | 78 | ||||
-rw-r--r-- | kernel/sysctl.c | 1 | ||||
-rw-r--r-- | kernel/time/clockevents.c | 3 | ||||
-rw-r--r-- | kernel/time/ntp.c | 2 | ||||
-rw-r--r-- | kernel/time/tick-broadcast.c | 78 | ||||
-rw-r--r-- | kernel/time/tick-common.c | 1 | ||||
-rw-r--r-- | kernel/time/tick-internal.h | 2 | ||||
-rw-r--r-- | kernel/time/tick-oneshot.c | 44 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 3 |
11 files changed, 362 insertions, 168 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d5ab79cf516d..f227bc172690 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -14,6 +14,8 @@ | |||
14 | * 2003-10-22 Updates by Stephen Hemminger. | 14 | * 2003-10-22 Updates by Stephen Hemminger. |
15 | * 2004 May-July Rework by Paul Jackson. | 15 | * 2004 May-July Rework by Paul Jackson. |
16 | * 2006 Rework by Paul Menage to use generic cgroups | 16 | * 2006 Rework by Paul Menage to use generic cgroups |
17 | * 2008 Rework of the scheduler domains and CPU hotplug handling | ||
18 | * by Max Krasnyansky | ||
17 | * | 19 | * |
18 | * This file is subject to the terms and conditions of the GNU General Public | 20 | * This file is subject to the terms and conditions of the GNU General Public |
19 | * License. See the file COPYING in the main directory of the Linux | 21 | * License. See the file COPYING in the main directory of the Linux |
@@ -236,9 +238,11 @@ static struct cpuset top_cpuset = { | |||
236 | 238 | ||
237 | static DEFINE_MUTEX(callback_mutex); | 239 | static DEFINE_MUTEX(callback_mutex); |
238 | 240 | ||
239 | /* This is ugly, but preserves the userspace API for existing cpuset | 241 | /* |
242 | * This is ugly, but preserves the userspace API for existing cpuset | ||
240 | * users. If someone tries to mount the "cpuset" filesystem, we | 243 | * users. If someone tries to mount the "cpuset" filesystem, we |
241 | * silently switch it to mount "cgroup" instead */ | 244 | * silently switch it to mount "cgroup" instead |
245 | */ | ||
242 | static int cpuset_get_sb(struct file_system_type *fs_type, | 246 | static int cpuset_get_sb(struct file_system_type *fs_type, |
243 | int flags, const char *unused_dev_name, | 247 | int flags, const char *unused_dev_name, |
244 | void *data, struct vfsmount *mnt) | 248 | void *data, struct vfsmount *mnt) |
@@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
473 | } | 477 | } |
474 | 478 | ||
475 | /* | 479 | /* |
476 | * Helper routine for rebuild_sched_domains(). | 480 | * Helper routine for generate_sched_domains(). |
477 | * Do cpusets a, b have overlapping cpus_allowed masks? | 481 | * Do cpusets a, b have overlapping cpus_allowed masks? |
478 | */ | 482 | */ |
479 | |||
480 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | 483 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) |
481 | { | 484 | { |
482 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); | 485 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); |
@@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
518 | } | 521 | } |
519 | 522 | ||
520 | /* | 523 | /* |
521 | * rebuild_sched_domains() | 524 | * generate_sched_domains() |
522 | * | 525 | * |
523 | * This routine will be called to rebuild the scheduler's dynamic | 526 | * This function builds a partial partition of the system's CPUs |
524 | * sched domains: | 527 | * A 'partial partition' is a set of non-overlapping subsets whose |
525 | * - if the flag 'sched_load_balance' of any cpuset with non-empty | 528 | * union is a subset of that set. |
526 | * 'cpus' changes, | 529 | * The output of this function needs to be passed to kernel/sched.c |
527 | * - or if the 'cpus' allowed changes in any cpuset which has that | 530 | * partition_sched_domains() routine, which will rebuild the scheduler's |
528 | * flag enabled, | 531 | * load balancing domains (sched domains) as specified by that partial |
529 | * - or if the 'sched_relax_domain_level' of any cpuset which has | 532 | * partition. |
530 | * that flag enabled and with non-empty 'cpus' changes, | ||
531 | * - or if any cpuset with non-empty 'cpus' is removed, | ||
532 | * - or if a cpu gets offlined. | ||
533 | * | ||
534 | * This routine builds a partial partition of the systems CPUs | ||
535 | * (the set of non-overlappping cpumask_t's in the array 'part' | ||
536 | * below), and passes that partial partition to the kernel/sched.c | ||
537 | * partition_sched_domains() routine, which will rebuild the | ||
538 | * schedulers load balancing domains (sched domains) as specified | ||
539 | * by that partial partition. A 'partial partition' is a set of | ||
540 | * non-overlapping subsets whose union is a subset of that set. | ||
541 | * | 533 | * |
542 | * See "What is sched_load_balance" in Documentation/cpusets.txt | 534 | * See "What is sched_load_balance" in Documentation/cpusets.txt |
543 | * for a background explanation of this. | 535 | * for a background explanation of this. |
@@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
547 | * domains when operating in the severe memory shortage situations | 539 | * domains when operating in the severe memory shortage situations |
548 | * that could cause allocation failures below. | 540 | * that could cause allocation failures below. |
549 | * | 541 | * |
550 | * Call with cgroup_mutex held. May take callback_mutex during | 542 | * Must be called with cgroup_lock held. |
551 | * call due to the kfifo_alloc() and kmalloc() calls. May nest | ||
552 | * a call to the get_online_cpus()/put_online_cpus() pair. | ||
553 | * Must not be called holding callback_mutex, because we must not | ||
554 | * call get_online_cpus() while holding callback_mutex. Elsewhere | ||
555 | * the kernel nests callback_mutex inside get_online_cpus() calls. | ||
556 | * So the reverse nesting would risk an ABBA deadlock. | ||
557 | * | 543 | * |
558 | * The three key local variables below are: | 544 | * The three key local variables below are: |
559 | * q - a linked-list queue of cpuset pointers, used to implement a | 545 | * q - a linked-list queue of cpuset pointers, used to implement a |
@@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
588 | * element of the partition (one sched domain) to be passed to | 574 | * element of the partition (one sched domain) to be passed to |
589 | * partition_sched_domains(). | 575 | * partition_sched_domains(). |
590 | */ | 576 | */ |
591 | 577 | static int generate_sched_domains(cpumask_t **domains, | |
592 | void rebuild_sched_domains(void) | 578 | struct sched_domain_attr **attributes) |
593 | { | 579 | { |
594 | LIST_HEAD(q); /* queue of cpusets to be scanned*/ | 580 | LIST_HEAD(q); /* queue of cpusets to be scanned */ |
595 | struct cpuset *cp; /* scans q */ | 581 | struct cpuset *cp; /* scans q */ |
596 | struct cpuset **csa; /* array of all cpuset ptrs */ | 582 | struct cpuset **csa; /* array of all cpuset ptrs */ |
597 | int csn; /* how many cpuset ptrs in csa so far */ | 583 | int csn; /* how many cpuset ptrs in csa so far */ |
@@ -601,23 +587,26 @@ void rebuild_sched_domains(void) | |||
601 | int ndoms; /* number of sched domains in result */ | 587 | int ndoms; /* number of sched domains in result */ |
602 | int nslot; /* next empty doms[] cpumask_t slot */ | 588 | int nslot; /* next empty doms[] cpumask_t slot */ |
603 | 589 | ||
604 | csa = NULL; | 590 | ndoms = 0; |
605 | doms = NULL; | 591 | doms = NULL; |
606 | dattr = NULL; | 592 | dattr = NULL; |
593 | csa = NULL; | ||
607 | 594 | ||
608 | /* Special case for the 99% of systems with one, full, sched domain */ | 595 | /* Special case for the 99% of systems with one, full, sched domain */ |
609 | if (is_sched_load_balance(&top_cpuset)) { | 596 | if (is_sched_load_balance(&top_cpuset)) { |
610 | ndoms = 1; | ||
611 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | 597 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); |
612 | if (!doms) | 598 | if (!doms) |
613 | goto rebuild; | 599 | goto done; |
600 | |||
614 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); | 601 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); |
615 | if (dattr) { | 602 | if (dattr) { |
616 | *dattr = SD_ATTR_INIT; | 603 | *dattr = SD_ATTR_INIT; |
617 | update_domain_attr_tree(dattr, &top_cpuset); | 604 | update_domain_attr_tree(dattr, &top_cpuset); |
618 | } | 605 | } |
619 | *doms = top_cpuset.cpus_allowed; | 606 | *doms = top_cpuset.cpus_allowed; |
620 | goto rebuild; | 607 | |
608 | ndoms = 1; | ||
609 | goto done; | ||
621 | } | 610 | } |
622 | 611 | ||
623 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); | 612 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); |
@@ -680,61 +669,141 @@ restart: | |||
680 | } | 669 | } |
681 | } | 670 | } |
682 | 671 | ||
683 | /* Convert <csn, csa> to <ndoms, doms> */ | 672 | /* |
673 | * Now we know how many domains to create. | ||
674 | * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. | ||
675 | */ | ||
684 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); | 676 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); |
685 | if (!doms) | 677 | if (!doms) { |
686 | goto rebuild; | 678 | ndoms = 0; |
679 | goto done; | ||
680 | } | ||
681 | |||
682 | /* | ||
683 | * The rest of the code, including the scheduler, can deal with | ||
684 | * dattr==NULL case. No need to abort if alloc fails. | ||
685 | */ | ||
687 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); | 686 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); |
688 | 687 | ||
689 | for (nslot = 0, i = 0; i < csn; i++) { | 688 | for (nslot = 0, i = 0; i < csn; i++) { |
690 | struct cpuset *a = csa[i]; | 689 | struct cpuset *a = csa[i]; |
690 | cpumask_t *dp; | ||
691 | int apn = a->pn; | 691 | int apn = a->pn; |
692 | 692 | ||
693 | if (apn >= 0) { | 693 | if (apn < 0) { |
694 | cpumask_t *dp = doms + nslot; | 694 | /* Skip completed partitions */ |
695 | 695 | continue; | |
696 | if (nslot == ndoms) { | 696 | } |
697 | static int warnings = 10; | 697 | |
698 | if (warnings) { | 698 | dp = doms + nslot; |
699 | printk(KERN_WARNING | 699 | |
700 | "rebuild_sched_domains confused:" | 700 | if (nslot == ndoms) { |
701 | " nslot %d, ndoms %d, csn %d, i %d," | 701 | static int warnings = 10; |
702 | " apn %d\n", | 702 | if (warnings) { |
703 | nslot, ndoms, csn, i, apn); | 703 | printk(KERN_WARNING |
704 | warnings--; | 704 | "rebuild_sched_domains confused:" |
705 | } | 705 | " nslot %d, ndoms %d, csn %d, i %d," |
706 | continue; | 706 | " apn %d\n", |
707 | nslot, ndoms, csn, i, apn); | ||
708 | warnings--; | ||
707 | } | 709 | } |
710 | continue; | ||
711 | } | ||
708 | 712 | ||
709 | cpus_clear(*dp); | 713 | cpus_clear(*dp); |
710 | if (dattr) | 714 | if (dattr) |
711 | *(dattr + nslot) = SD_ATTR_INIT; | 715 | *(dattr + nslot) = SD_ATTR_INIT; |
712 | for (j = i; j < csn; j++) { | 716 | for (j = i; j < csn; j++) { |
713 | struct cpuset *b = csa[j]; | 717 | struct cpuset *b = csa[j]; |
714 | 718 | ||
715 | if (apn == b->pn) { | 719 | if (apn == b->pn) { |
716 | cpus_or(*dp, *dp, b->cpus_allowed); | 720 | cpus_or(*dp, *dp, b->cpus_allowed); |
717 | b->pn = -1; | 721 | if (dattr) |
718 | if (dattr) | 722 | update_domain_attr_tree(dattr + nslot, b); |
719 | update_domain_attr_tree(dattr | 723 | |
720 | + nslot, b); | 724 | /* Done with this partition */ |
721 | } | 725 | b->pn = -1; |
722 | } | 726 | } |
723 | nslot++; | ||
724 | } | 727 | } |
728 | nslot++; | ||
725 | } | 729 | } |
726 | BUG_ON(nslot != ndoms); | 730 | BUG_ON(nslot != ndoms); |
727 | 731 | ||
728 | rebuild: | 732 | done: |
729 | /* Have scheduler rebuild sched domains */ | 733 | kfree(csa); |
734 | |||
735 | *domains = doms; | ||
736 | *attributes = dattr; | ||
737 | return ndoms; | ||
738 | } | ||
739 | |||
740 | /* | ||
741 | * Rebuild scheduler domains. | ||
742 | * | ||
743 | * Call with neither cgroup_mutex held nor within get_online_cpus(). | ||
744 | * Takes both cgroup_mutex and get_online_cpus(). | ||
745 | * | ||
746 | * Cannot be directly called from cpuset code handling changes | ||
747 | * to the cpuset pseudo-filesystem, because it cannot be called | ||
748 | * from code that already holds cgroup_mutex. | ||
749 | */ | ||
750 | static void do_rebuild_sched_domains(struct work_struct *unused) | ||
751 | { | ||
752 | struct sched_domain_attr *attr; | ||
753 | cpumask_t *doms; | ||
754 | int ndoms; | ||
755 | |||
730 | get_online_cpus(); | 756 | get_online_cpus(); |
731 | partition_sched_domains(ndoms, doms, dattr); | 757 | |
758 | /* Generate domain masks and attrs */ | ||
759 | cgroup_lock(); | ||
760 | ndoms = generate_sched_domains(&doms, &attr); | ||
761 | cgroup_unlock(); | ||
762 | |||
763 | /* Have scheduler rebuild the domains */ | ||
764 | partition_sched_domains(ndoms, doms, attr); | ||
765 | |||
732 | put_online_cpus(); | 766 | put_online_cpus(); |
767 | } | ||
733 | 768 | ||
734 | done: | 769 | static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); |
735 | kfree(csa); | 770 | |
736 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ | 771 | /* |
737 | /* Don't kfree(dattr) -- partition_sched_domains() does that. */ | 772 | * Rebuild scheduler domains, asynchronously via workqueue. |
773 | * | ||
774 | * If the flag 'sched_load_balance' of any cpuset with non-empty | ||
775 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset | ||
776 | * which has that flag enabled, or if any cpuset with a non-empty | ||
777 | * 'cpus' is removed, then call this routine to rebuild the | ||
778 | * scheduler's dynamic sched domains. | ||
779 | * | ||
780 | * The rebuild_sched_domains() and partition_sched_domains() | ||
781 | * routines must nest cgroup_lock() inside get_online_cpus(), | ||
782 | * but such cpuset changes as these must nest that locking the | ||
783 | * other way, holding cgroup_lock() for much of the code. | ||
784 | * | ||
785 | * So in order to avoid an ABBA deadlock, the cpuset code handling | ||
786 | * these user changes delegates the actual sched domain rebuilding | ||
787 | * to a separate workqueue thread, which ends up processing the | ||
788 | * above do_rebuild_sched_domains() function. | ||
789 | */ | ||
790 | static void async_rebuild_sched_domains(void) | ||
791 | { | ||
792 | schedule_work(&rebuild_sched_domains_work); | ||
793 | } | ||
794 | |||
795 | /* | ||
796 | * Accomplishes the same scheduler domain rebuild as the above | ||
797 | * async_rebuild_sched_domains(), however it directly calls the | ||
798 | * rebuild routine synchronously rather than calling it via an | ||
799 | * asynchronous work thread. | ||
800 | * | ||
801 | * This can only be called from code that is not holding | ||
802 | * cgroup_mutex (not nested in a cgroup_lock() call.) | ||
803 | */ | ||
804 | void rebuild_sched_domains(void) | ||
805 | { | ||
806 | do_rebuild_sched_domains(NULL); | ||
738 | } | 807 | } |
739 | 808 | ||
740 | /** | 809 | /** |
@@ -863,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf) | |||
863 | return retval; | 932 | return retval; |
864 | 933 | ||
865 | if (is_load_balanced) | 934 | if (is_load_balanced) |
866 | rebuild_sched_domains(); | 935 | async_rebuild_sched_domains(); |
867 | return 0; | 936 | return 0; |
868 | } | 937 | } |
869 | 938 | ||
@@ -1090,7 +1159,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
1090 | if (val != cs->relax_domain_level) { | 1159 | if (val != cs->relax_domain_level) { |
1091 | cs->relax_domain_level = val; | 1160 | cs->relax_domain_level = val; |
1092 | if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) | 1161 | if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) |
1093 | rebuild_sched_domains(); | 1162 | async_rebuild_sched_domains(); |
1094 | } | 1163 | } |
1095 | 1164 | ||
1096 | return 0; | 1165 | return 0; |
@@ -1131,7 +1200,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
1131 | mutex_unlock(&callback_mutex); | 1200 | mutex_unlock(&callback_mutex); |
1132 | 1201 | ||
1133 | if (cpus_nonempty && balance_flag_changed) | 1202 | if (cpus_nonempty && balance_flag_changed) |
1134 | rebuild_sched_domains(); | 1203 | async_rebuild_sched_domains(); |
1135 | 1204 | ||
1136 | return 0; | 1205 | return 0; |
1137 | } | 1206 | } |
@@ -1492,6 +1561,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) | |||
1492 | default: | 1561 | default: |
1493 | BUG(); | 1562 | BUG(); |
1494 | } | 1563 | } |
1564 | |||
1565 | /* Unreachable but makes gcc happy */ | ||
1566 | return 0; | ||
1495 | } | 1567 | } |
1496 | 1568 | ||
1497 | static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) | 1569 | static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) |
@@ -1504,6 +1576,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) | |||
1504 | default: | 1576 | default: |
1505 | BUG(); | 1577 | BUG(); |
1506 | } | 1578 | } |
1579 | |||
1580 | /* Unreachable but makes gcc happy */ | ||
1581 | return 0; | ||
1507 | } | 1582 | } |
1508 | 1583 | ||
1509 | 1584 | ||
@@ -1692,15 +1767,9 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1692 | } | 1767 | } |
1693 | 1768 | ||
1694 | /* | 1769 | /* |
1695 | * Locking note on the strange update_flag() call below: | ||
1696 | * | ||
1697 | * If the cpuset being removed has its flag 'sched_load_balance' | 1770 | * If the cpuset being removed has its flag 'sched_load_balance' |
1698 | * enabled, then simulate turning sched_load_balance off, which | 1771 | * enabled, then simulate turning sched_load_balance off, which |
1699 | * will call rebuild_sched_domains(). The get_online_cpus() | 1772 | * will call async_rebuild_sched_domains(). |
1700 | * call in rebuild_sched_domains() must not be made while holding | ||
1701 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside | ||
1702 | * get_online_cpus() calls. So the reverse nesting would risk an | ||
1703 | * ABBA deadlock. | ||
1704 | */ | 1773 | */ |
1705 | 1774 | ||
1706 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | 1775 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) |
@@ -1719,7 +1788,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1719 | struct cgroup_subsys cpuset_subsys = { | 1788 | struct cgroup_subsys cpuset_subsys = { |
1720 | .name = "cpuset", | 1789 | .name = "cpuset", |
1721 | .create = cpuset_create, | 1790 | .create = cpuset_create, |
1722 | .destroy = cpuset_destroy, | 1791 | .destroy = cpuset_destroy, |
1723 | .can_attach = cpuset_can_attach, | 1792 | .can_attach = cpuset_can_attach, |
1724 | .attach = cpuset_attach, | 1793 | .attach = cpuset_attach, |
1725 | .populate = cpuset_populate, | 1794 | .populate = cpuset_populate, |
@@ -1811,7 +1880,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) | |||
1811 | } | 1880 | } |
1812 | 1881 | ||
1813 | /* | 1882 | /* |
1814 | * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs | 1883 | * If CPU and/or memory hotplug handlers, below, unplug any CPUs |
1815 | * or memory nodes, we need to walk over the cpuset hierarchy, | 1884 | * or memory nodes, we need to walk over the cpuset hierarchy, |
1816 | * removing that CPU or node from all cpusets. If this removes the | 1885 | * removing that CPU or node from all cpusets. If this removes the |
1817 | * last CPU or node from a cpuset, then move the tasks in the empty | 1886 | * last CPU or node from a cpuset, then move the tasks in the empty |
@@ -1903,35 +1972,6 @@ static void scan_for_empty_cpusets(const struct cpuset *root) | |||
1903 | } | 1972 | } |
1904 | 1973 | ||
1905 | /* | 1974 | /* |
1906 | * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track | ||
1907 | * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to | ||
1908 | * track what's online after any CPU or memory node hotplug or unplug event. | ||
1909 | * | ||
1910 | * Since there are two callers of this routine, one for CPU hotplug | ||
1911 | * events and one for memory node hotplug events, we could have coded | ||
1912 | * two separate routines here. We code it as a single common routine | ||
1913 | * in order to minimize text size. | ||
1914 | */ | ||
1915 | |||
1916 | static void common_cpu_mem_hotplug_unplug(int rebuild_sd) | ||
1917 | { | ||
1918 | cgroup_lock(); | ||
1919 | |||
1920 | top_cpuset.cpus_allowed = cpu_online_map; | ||
1921 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | ||
1922 | scan_for_empty_cpusets(&top_cpuset); | ||
1923 | |||
1924 | /* | ||
1925 | * Scheduler destroys domains on hotplug events. | ||
1926 | * Rebuild them based on the current settings. | ||
1927 | */ | ||
1928 | if (rebuild_sd) | ||
1929 | rebuild_sched_domains(); | ||
1930 | |||
1931 | cgroup_unlock(); | ||
1932 | } | ||
1933 | |||
1934 | /* | ||
1935 | * The top_cpuset tracks what CPUs and Memory Nodes are online, | 1975 | * The top_cpuset tracks what CPUs and Memory Nodes are online, |
1936 | * period. This is necessary in order to make cpusets transparent | 1976 | * period. This is necessary in order to make cpusets transparent |
1937 | * (of no effect) on systems that are actively using CPU hotplug | 1977 | * (of no effect) on systems that are actively using CPU hotplug |
@@ -1939,40 +1979,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd) | |||
1939 | * | 1979 | * |
1940 | * This routine ensures that top_cpuset.cpus_allowed tracks | 1980 | * This routine ensures that top_cpuset.cpus_allowed tracks |
1941 | * cpu_online_map on each CPU hotplug (cpuhp) event. | 1981 | * cpu_online_map on each CPU hotplug (cpuhp) event. |
1982 | * | ||
1983 | * Called within get_online_cpus(). Needs to call cgroup_lock() | ||
1984 | * before calling generate_sched_domains(). | ||
1942 | */ | 1985 | */ |
1943 | 1986 | static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | |
1944 | static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, | ||
1945 | unsigned long phase, void *unused_cpu) | 1987 | unsigned long phase, void *unused_cpu) |
1946 | { | 1988 | { |
1989 | struct sched_domain_attr *attr; | ||
1990 | cpumask_t *doms; | ||
1991 | int ndoms; | ||
1992 | |||
1947 | switch (phase) { | 1993 | switch (phase) { |
1948 | case CPU_UP_CANCELED: | ||
1949 | case CPU_UP_CANCELED_FROZEN: | ||
1950 | case CPU_DOWN_FAILED: | ||
1951 | case CPU_DOWN_FAILED_FROZEN: | ||
1952 | case CPU_ONLINE: | 1994 | case CPU_ONLINE: |
1953 | case CPU_ONLINE_FROZEN: | 1995 | case CPU_ONLINE_FROZEN: |
1954 | case CPU_DEAD: | 1996 | case CPU_DEAD: |
1955 | case CPU_DEAD_FROZEN: | 1997 | case CPU_DEAD_FROZEN: |
1956 | common_cpu_mem_hotplug_unplug(1); | ||
1957 | break; | 1998 | break; |
1999 | |||
1958 | default: | 2000 | default: |
1959 | return NOTIFY_DONE; | 2001 | return NOTIFY_DONE; |
1960 | } | 2002 | } |
1961 | 2003 | ||
2004 | cgroup_lock(); | ||
2005 | top_cpuset.cpus_allowed = cpu_online_map; | ||
2006 | scan_for_empty_cpusets(&top_cpuset); | ||
2007 | ndoms = generate_sched_domains(&doms, &attr); | ||
2008 | cgroup_unlock(); | ||
2009 | |||
2010 | /* Have scheduler rebuild the domains */ | ||
2011 | partition_sched_domains(ndoms, doms, attr); | ||
2012 | |||
1962 | return NOTIFY_OK; | 2013 | return NOTIFY_OK; |
1963 | } | 2014 | } |
1964 | 2015 | ||
1965 | #ifdef CONFIG_MEMORY_HOTPLUG | 2016 | #ifdef CONFIG_MEMORY_HOTPLUG |
1966 | /* | 2017 | /* |
1967 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. | 2018 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. |
1968 | * Call this routine anytime after you change | 2019 | * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. |
1969 | * node_states[N_HIGH_MEMORY]. | 2020 | * See also the previous routine cpuset_track_online_cpus(). |
1970 | * See also the previous routine cpuset_handle_cpuhp(). | ||
1971 | */ | 2021 | */ |
1972 | |||
1973 | void cpuset_track_online_nodes(void) | 2022 | void cpuset_track_online_nodes(void) |
1974 | { | 2023 | { |
1975 | common_cpu_mem_hotplug_unplug(0); | 2024 | cgroup_lock(); |
2025 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | ||
2026 | scan_for_empty_cpusets(&top_cpuset); | ||
2027 | cgroup_unlock(); | ||
1976 | } | 2028 | } |
1977 | #endif | 2029 | #endif |
1978 | 2030 | ||
@@ -1987,7 +2039,7 @@ void __init cpuset_init_smp(void) | |||
1987 | top_cpuset.cpus_allowed = cpu_online_map; | 2039 | top_cpuset.cpus_allowed = cpu_online_map; |
1988 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2040 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
1989 | 2041 | ||
1990 | hotcpu_notifier(cpuset_handle_cpuhp, 0); | 2042 | hotcpu_notifier(cpuset_track_online_cpus, 0); |
1991 | } | 2043 | } |
1992 | 2044 | ||
1993 | /** | 2045 | /** |
diff --git a/kernel/exit.c b/kernel/exit.c index 25ed2ad986df..16395644a98f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -112,9 +112,9 @@ static void __exit_signal(struct task_struct *tsk) | |||
112 | * We won't ever get here for the group leader, since it | 112 | * We won't ever get here for the group leader, since it |
113 | * will have been the last reference on the signal_struct. | 113 | * will have been the last reference on the signal_struct. |
114 | */ | 114 | */ |
115 | sig->utime = cputime_add(sig->utime, tsk->utime); | 115 | sig->utime = cputime_add(sig->utime, task_utime(tsk)); |
116 | sig->stime = cputime_add(sig->stime, tsk->stime); | 116 | sig->stime = cputime_add(sig->stime, task_stime(tsk)); |
117 | sig->gtime = cputime_add(sig->gtime, tsk->gtime); | 117 | sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); |
118 | sig->min_flt += tsk->min_flt; | 118 | sig->min_flt += tsk->min_flt; |
119 | sig->maj_flt += tsk->maj_flt; | 119 | sig->maj_flt += tsk->maj_flt; |
120 | sig->nvcsw += tsk->nvcsw; | 120 | sig->nvcsw += tsk->nvcsw; |
diff --git a/kernel/sched.c b/kernel/sched.c index 9a1ddb84e26d..cc1f81b50b82 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -4179,6 +4179,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
4179 | } | 4179 | } |
4180 | 4180 | ||
4181 | /* | 4181 | /* |
4182 | * Use precise platform statistics if available: | ||
4183 | */ | ||
4184 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
4185 | cputime_t task_utime(struct task_struct *p) | ||
4186 | { | ||
4187 | return p->utime; | ||
4188 | } | ||
4189 | |||
4190 | cputime_t task_stime(struct task_struct *p) | ||
4191 | { | ||
4192 | return p->stime; | ||
4193 | } | ||
4194 | #else | ||
4195 | cputime_t task_utime(struct task_struct *p) | ||
4196 | { | ||
4197 | clock_t utime = cputime_to_clock_t(p->utime), | ||
4198 | total = utime + cputime_to_clock_t(p->stime); | ||
4199 | u64 temp; | ||
4200 | |||
4201 | /* | ||
4202 | * Use CFS's precise accounting: | ||
4203 | */ | ||
4204 | temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); | ||
4205 | |||
4206 | if (total) { | ||
4207 | temp *= utime; | ||
4208 | do_div(temp, total); | ||
4209 | } | ||
4210 | utime = (clock_t)temp; | ||
4211 | |||
4212 | p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); | ||
4213 | return p->prev_utime; | ||
4214 | } | ||
4215 | |||
4216 | cputime_t task_stime(struct task_struct *p) | ||
4217 | { | ||
4218 | clock_t stime; | ||
4219 | |||
4220 | /* | ||
4221 | * Use CFS's precise accounting. (we subtract utime from | ||
4222 | * the total, to make sure the total observed by userspace | ||
4223 | * grows monotonically - apps rely on that): | ||
4224 | */ | ||
4225 | stime = nsec_to_clock_t(p->se.sum_exec_runtime) - | ||
4226 | cputime_to_clock_t(task_utime(p)); | ||
4227 | |||
4228 | if (stime >= 0) | ||
4229 | p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); | ||
4230 | |||
4231 | return p->prev_stime; | ||
4232 | } | ||
4233 | #endif | ||
4234 | |||
4235 | inline cputime_t task_gtime(struct task_struct *p) | ||
4236 | { | ||
4237 | return p->gtime; | ||
4238 | } | ||
4239 | |||
4240 | /* | ||
4182 | * This function gets called by the timer code, with HZ frequency. | 4241 | * This function gets called by the timer code, with HZ frequency. |
4183 | * We call it with interrupts disabled. | 4242 | * We call it with interrupts disabled. |
4184 | * | 4243 | * |
@@ -7637,24 +7696,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
7637 | * and partition_sched_domains() will fallback to the single partition | 7696 | * and partition_sched_domains() will fallback to the single partition |
7638 | * 'fallback_doms', it also forces the domains to be rebuilt. | 7697 | * 'fallback_doms', it also forces the domains to be rebuilt. |
7639 | * | 7698 | * |
7699 | * If doms_new==NULL it will be replaced with cpu_online_map. | ||
7700 | * ndoms_new==0 is a special case for destroying existing domains. | ||
7701 | * It will not create the default domain. | ||
7702 | * | ||
7640 | * Call with hotplug lock held | 7703 | * Call with hotplug lock held |
7641 | */ | 7704 | */ |
7642 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, | 7705 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, |
7643 | struct sched_domain_attr *dattr_new) | 7706 | struct sched_domain_attr *dattr_new) |
7644 | { | 7707 | { |
7645 | int i, j; | 7708 | int i, j, n; |
7646 | 7709 | ||
7647 | mutex_lock(&sched_domains_mutex); | 7710 | mutex_lock(&sched_domains_mutex); |
7648 | 7711 | ||
7649 | /* always unregister in case we don't destroy any domains */ | 7712 | /* always unregister in case we don't destroy any domains */ |
7650 | unregister_sched_domain_sysctl(); | 7713 | unregister_sched_domain_sysctl(); |
7651 | 7714 | ||
7652 | if (doms_new == NULL) | 7715 | n = doms_new ? ndoms_new : 0; |
7653 | ndoms_new = 0; | ||
7654 | 7716 | ||
7655 | /* Destroy deleted domains */ | 7717 | /* Destroy deleted domains */ |
7656 | for (i = 0; i < ndoms_cur; i++) { | 7718 | for (i = 0; i < ndoms_cur; i++) { |
7657 | for (j = 0; j < ndoms_new; j++) { | 7719 | for (j = 0; j < n; j++) { |
7658 | if (cpus_equal(doms_cur[i], doms_new[j]) | 7720 | if (cpus_equal(doms_cur[i], doms_new[j]) |
7659 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | 7721 | && dattrs_equal(dattr_cur, i, dattr_new, j)) |
7660 | goto match1; | 7722 | goto match1; |
@@ -7667,7 +7729,6 @@ match1: | |||
7667 | 7729 | ||
7668 | if (doms_new == NULL) { | 7730 | if (doms_new == NULL) { |
7669 | ndoms_cur = 0; | 7731 | ndoms_cur = 0; |
7670 | ndoms_new = 1; | ||
7671 | doms_new = &fallback_doms; | 7732 | doms_new = &fallback_doms; |
7672 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); | 7733 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); |
7673 | dattr_new = NULL; | 7734 | dattr_new = NULL; |
@@ -7704,8 +7765,13 @@ match2: | |||
7704 | int arch_reinit_sched_domains(void) | 7765 | int arch_reinit_sched_domains(void) |
7705 | { | 7766 | { |
7706 | get_online_cpus(); | 7767 | get_online_cpus(); |
7768 | |||
7769 | /* Destroy domains first to force the rebuild */ | ||
7770 | partition_sched_domains(0, NULL, NULL); | ||
7771 | |||
7707 | rebuild_sched_domains(); | 7772 | rebuild_sched_domains(); |
7708 | put_online_cpus(); | 7773 | put_online_cpus(); |
7774 | |||
7709 | return 0; | 7775 | return 0; |
7710 | } | 7776 | } |
7711 | 7777 | ||
@@ -7789,7 +7855,7 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
7789 | case CPU_ONLINE_FROZEN: | 7855 | case CPU_ONLINE_FROZEN: |
7790 | case CPU_DEAD: | 7856 | case CPU_DEAD: |
7791 | case CPU_DEAD_FROZEN: | 7857 | case CPU_DEAD_FROZEN: |
7792 | partition_sched_domains(0, NULL, NULL); | 7858 | partition_sched_domains(1, NULL, NULL); |
7793 | return NOTIFY_OK; | 7859 | return NOTIFY_OK; |
7794 | 7860 | ||
7795 | default: | 7861 | default: |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fe4713347275..50ec0886fa3d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -159,6 +159,7 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file * | |||
159 | static struct ctl_table root_table[]; | 159 | static struct ctl_table root_table[]; |
160 | static struct ctl_table_root sysctl_table_root; | 160 | static struct ctl_table_root sysctl_table_root; |
161 | static struct ctl_table_header root_table_header = { | 161 | static struct ctl_table_header root_table_header = { |
162 | .count = 1, | ||
162 | .ctl_table = root_table, | 163 | .ctl_table = root_table, |
163 | .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), | 164 | .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), |
164 | .root = &sysctl_table_root, | 165 | .root = &sysctl_table_root, |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 3d1e3e1a1971..1876b526c778 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -177,7 +177,7 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
177 | /* | 177 | /* |
178 | * Noop handler when we shut down an event device | 178 | * Noop handler when we shut down an event device |
179 | */ | 179 | */ |
180 | static void clockevents_handle_noop(struct clock_event_device *dev) | 180 | void clockevents_handle_noop(struct clock_event_device *dev) |
181 | { | 181 | { |
182 | } | 182 | } |
183 | 183 | ||
@@ -199,7 +199,6 @@ void clockevents_exchange_device(struct clock_event_device *old, | |||
199 | * released list and do a notify add later. | 199 | * released list and do a notify add later. |
200 | */ | 200 | */ |
201 | if (old) { | 201 | if (old) { |
202 | old->event_handler = clockevents_handle_noop; | ||
203 | clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); | 202 | clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); |
204 | list_del(&old->list); | 203 | list_del(&old->list); |
205 | list_add(&old->list, &clockevents_released); | 204 | list_add(&old->list, &clockevents_released); |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5125ddd8196b..1ad46f3df6e7 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -245,7 +245,7 @@ static void sync_cmos_clock(unsigned long dummy) | |||
245 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) | 245 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) |
246 | fail = update_persistent_clock(now); | 246 | fail = update_persistent_clock(now); |
247 | 247 | ||
248 | next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; | 248 | next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); |
249 | if (next.tv_nsec <= 0) | 249 | if (next.tv_nsec <= 0) |
250 | next.tv_nsec += NSEC_PER_SEC; | 250 | next.tv_nsec += NSEC_PER_SEC; |
251 | 251 | ||
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 31463d370b94..2f5a38294bf9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -175,6 +175,8 @@ static void tick_do_periodic_broadcast(void) | |||
175 | */ | 175 | */ |
176 | static void tick_handle_periodic_broadcast(struct clock_event_device *dev) | 176 | static void tick_handle_periodic_broadcast(struct clock_event_device *dev) |
177 | { | 177 | { |
178 | ktime_t next; | ||
179 | |||
178 | tick_do_periodic_broadcast(); | 180 | tick_do_periodic_broadcast(); |
179 | 181 | ||
180 | /* | 182 | /* |
@@ -185,10 +187,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) | |||
185 | 187 | ||
186 | /* | 188 | /* |
187 | * Setup the next period for devices, which do not have | 189 | * Setup the next period for devices, which do not have |
188 | * periodic mode: | 190 | * periodic mode. We read dev->next_event first and add to it |
191 | * when the event already expired. clockevents_program_event() | ||
192 | * sets dev->next_event only when the event is really | ||
193 | * programmed to the device. | ||
189 | */ | 194 | */ |
190 | for (;;) { | 195 | for (next = dev->next_event; ;) { |
191 | ktime_t next = ktime_add(dev->next_event, tick_period); | 196 | next = ktime_add(next, tick_period); |
192 | 197 | ||
193 | if (!clockevents_program_event(dev, next, ktime_get())) | 198 | if (!clockevents_program_event(dev, next, ktime_get())) |
194 | return; | 199 | return; |
@@ -205,7 +210,7 @@ static void tick_do_broadcast_on_off(void *why) | |||
205 | struct clock_event_device *bc, *dev; | 210 | struct clock_event_device *bc, *dev; |
206 | struct tick_device *td; | 211 | struct tick_device *td; |
207 | unsigned long flags, *reason = why; | 212 | unsigned long flags, *reason = why; |
208 | int cpu; | 213 | int cpu, bc_stopped; |
209 | 214 | ||
210 | spin_lock_irqsave(&tick_broadcast_lock, flags); | 215 | spin_lock_irqsave(&tick_broadcast_lock, flags); |
211 | 216 | ||
@@ -223,6 +228,8 @@ static void tick_do_broadcast_on_off(void *why) | |||
223 | if (!tick_device_is_functional(dev)) | 228 | if (!tick_device_is_functional(dev)) |
224 | goto out; | 229 | goto out; |
225 | 230 | ||
231 | bc_stopped = cpus_empty(tick_broadcast_mask); | ||
232 | |||
226 | switch (*reason) { | 233 | switch (*reason) { |
227 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | 234 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: |
228 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | 235 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: |
@@ -245,9 +252,10 @@ static void tick_do_broadcast_on_off(void *why) | |||
245 | break; | 252 | break; |
246 | } | 253 | } |
247 | 254 | ||
248 | if (cpus_empty(tick_broadcast_mask)) | 255 | if (cpus_empty(tick_broadcast_mask)) { |
249 | clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); | 256 | if (!bc_stopped) |
250 | else { | 257 | clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); |
258 | } else if (bc_stopped) { | ||
251 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) | 259 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) |
252 | tick_broadcast_start_periodic(bc); | 260 | tick_broadcast_start_periodic(bc); |
253 | else | 261 | else |
@@ -364,16 +372,8 @@ cpumask_t *tick_get_broadcast_oneshot_mask(void) | |||
364 | static int tick_broadcast_set_event(ktime_t expires, int force) | 372 | static int tick_broadcast_set_event(ktime_t expires, int force) |
365 | { | 373 | { |
366 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | 374 | struct clock_event_device *bc = tick_broadcast_device.evtdev; |
367 | ktime_t now = ktime_get(); | 375 | |
368 | int res; | 376 | return tick_dev_program_event(bc, expires, force); |
369 | |||
370 | for(;;) { | ||
371 | res = clockevents_program_event(bc, expires, now); | ||
372 | if (!res || !force) | ||
373 | return res; | ||
374 | now = ktime_get(); | ||
375 | expires = ktime_add(now, ktime_set(0, bc->min_delta_ns)); | ||
376 | } | ||
377 | } | 377 | } |
378 | 378 | ||
379 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | 379 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) |
@@ -491,14 +491,52 @@ static void tick_broadcast_clear_oneshot(int cpu) | |||
491 | cpu_clear(cpu, tick_broadcast_oneshot_mask); | 491 | cpu_clear(cpu, tick_broadcast_oneshot_mask); |
492 | } | 492 | } |
493 | 493 | ||
494 | static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires) | ||
495 | { | ||
496 | struct tick_device *td; | ||
497 | int cpu; | ||
498 | |||
499 | for_each_cpu_mask_nr(cpu, *mask) { | ||
500 | td = &per_cpu(tick_cpu_device, cpu); | ||
501 | if (td->evtdev) | ||
502 | td->evtdev->next_event = expires; | ||
503 | } | ||
504 | } | ||
505 | |||
494 | /** | 506 | /** |
495 | * tick_broadcast_setup_oneshot - setup the broadcast device | 507 | * tick_broadcast_setup_oneshot - setup the broadcast device |
496 | */ | 508 | */ |
497 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 509 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) |
498 | { | 510 | { |
499 | bc->event_handler = tick_handle_oneshot_broadcast; | 511 | /* Set it up only once ! */ |
500 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 512 | if (bc->event_handler != tick_handle_oneshot_broadcast) { |
501 | bc->next_event.tv64 = KTIME_MAX; | 513 | int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; |
514 | int cpu = smp_processor_id(); | ||
515 | cpumask_t mask; | ||
516 | |||
517 | bc->event_handler = tick_handle_oneshot_broadcast; | ||
518 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | ||
519 | |||
520 | /* Take the do_timer update */ | ||
521 | tick_do_timer_cpu = cpu; | ||
522 | |||
523 | /* | ||
524 | * We must be careful here. There might be other CPUs | ||
525 | * waiting for periodic broadcast. We need to set the | ||
526 | * oneshot_mask bits for those and program the | ||
527 | * broadcast device to fire. | ||
528 | */ | ||
529 | mask = tick_broadcast_mask; | ||
530 | cpu_clear(cpu, mask); | ||
531 | cpus_or(tick_broadcast_oneshot_mask, | ||
532 | tick_broadcast_oneshot_mask, mask); | ||
533 | |||
534 | if (was_periodic && !cpus_empty(mask)) { | ||
535 | tick_broadcast_init_next_event(&mask, tick_next_period); | ||
536 | tick_broadcast_set_event(tick_next_period, 1); | ||
537 | } else | ||
538 | bc->next_event.tv64 = KTIME_MAX; | ||
539 | } | ||
502 | } | 540 | } |
503 | 541 | ||
504 | /* | 542 | /* |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 80c4336f4188..c4777193d567 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -161,6 +161,7 @@ static void tick_setup_device(struct tick_device *td, | |||
161 | } else { | 161 | } else { |
162 | handler = td->evtdev->event_handler; | 162 | handler = td->evtdev->event_handler; |
163 | next_event = td->evtdev->next_event; | 163 | next_event = td->evtdev->next_event; |
164 | td->evtdev->event_handler = clockevents_handle_noop; | ||
164 | } | 165 | } |
165 | 166 | ||
166 | td->evtdev = newdev; | 167 | td->evtdev = newdev; |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f13f2b7f4fd4..0ffc2918ea6f 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -17,6 +17,8 @@ extern void tick_handle_periodic(struct clock_event_device *dev); | |||
17 | extern void tick_setup_oneshot(struct clock_event_device *newdev, | 17 | extern void tick_setup_oneshot(struct clock_event_device *newdev, |
18 | void (*handler)(struct clock_event_device *), | 18 | void (*handler)(struct clock_event_device *), |
19 | ktime_t nextevt); | 19 | ktime_t nextevt); |
20 | extern int tick_dev_program_event(struct clock_event_device *dev, | ||
21 | ktime_t expires, int force); | ||
20 | extern int tick_program_event(ktime_t expires, int force); | 22 | extern int tick_program_event(ktime_t expires, int force); |
21 | extern void tick_oneshot_notify(void); | 23 | extern void tick_oneshot_notify(void); |
22 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); | 24 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); |
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 450c04935b66..2e8de678e767 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
@@ -23,24 +23,56 @@ | |||
23 | #include "tick-internal.h" | 23 | #include "tick-internal.h" |
24 | 24 | ||
25 | /** | 25 | /** |
26 | * tick_program_event | 26 | * tick_program_event internal worker function |
27 | */ | 27 | */ |
28 | int tick_program_event(ktime_t expires, int force) | 28 | int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, |
29 | int force) | ||
29 | { | 30 | { |
30 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | ||
31 | ktime_t now = ktime_get(); | 31 | ktime_t now = ktime_get(); |
32 | int i; | ||
32 | 33 | ||
33 | while (1) { | 34 | for (i = 0;;) { |
34 | int ret = clockevents_program_event(dev, expires, now); | 35 | int ret = clockevents_program_event(dev, expires, now); |
35 | 36 | ||
36 | if (!ret || !force) | 37 | if (!ret || !force) |
37 | return ret; | 38 | return ret; |
39 | |||
40 | /* | ||
41 | * We tried 3 times to program the device with the given | ||
42 | * min_delta_ns. If that's not working then we increase it | ||
43 | * and emit a warning. | ||
44 | */ | ||
45 | if (++i > 2) { | ||
46 | /* Increase the min. delta and try again */ | ||
47 | if (!dev->min_delta_ns) | ||
48 | dev->min_delta_ns = 5000; | ||
49 | else | ||
50 | dev->min_delta_ns += dev->min_delta_ns >> 1; | ||
51 | |||
52 | printk(KERN_WARNING | ||
53 | "CE: %s increasing min_delta_ns to %lu nsec\n", | ||
54 | dev->name ? dev->name : "?", | ||
55 | dev->min_delta_ns << 1); | ||
56 | |||
57 | i = 0; | ||
58 | } | ||
59 | |||
38 | now = ktime_get(); | 60 | now = ktime_get(); |
39 | expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); | 61 | expires = ktime_add_ns(now, dev->min_delta_ns); |
40 | } | 62 | } |
41 | } | 63 | } |
42 | 64 | ||
43 | /** | 65 | /** |
66 | * tick_program_event | ||
67 | */ | ||
68 | int tick_program_event(ktime_t expires, int force) | ||
69 | { | ||
70 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | ||
71 | |||
72 | return tick_dev_program_event(dev, expires, force); | ||
73 | } | ||
74 | |||
75 | /** | ||
44 | * tick_resume_oneshot - resume oneshot mode | 76 | * tick_resume_oneshot - resume oneshot mode |
45 | */ | 77 | */ |
46 | void tick_resume_oneshot(void) | 78 | void tick_resume_oneshot(void) |
@@ -61,7 +93,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, | |||
61 | { | 93 | { |
62 | newdev->event_handler = handler; | 94 | newdev->event_handler = handler; |
63 | clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); | 95 | clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); |
64 | clockevents_program_event(newdev, next_event, ktime_get()); | 96 | tick_dev_program_event(newdev, next_event, 1); |
65 | } | 97 | } |
66 | 98 | ||
67 | /** | 99 | /** |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7a46bde78c66..a87b0468568b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -162,6 +162,8 @@ void tick_nohz_stop_idle(int cpu) | |||
162 | ts->idle_lastupdate = now; | 162 | ts->idle_lastupdate = now; |
163 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | 163 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); |
164 | ts->idle_active = 0; | 164 | ts->idle_active = 0; |
165 | |||
166 | sched_clock_idle_wakeup_event(0); | ||
165 | } | 167 | } |
166 | } | 168 | } |
167 | 169 | ||
@@ -177,6 +179,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) | |||
177 | } | 179 | } |
178 | ts->idle_entrytime = now; | 180 | ts->idle_entrytime = now; |
179 | ts->idle_active = 1; | 181 | ts->idle_active = 1; |
182 | sched_clock_idle_sleep_event(); | ||
180 | return now; | 183 | return now; |
181 | } | 184 | } |
182 | 185 | ||