Diffstat (limited to 'kernel')
34 files changed, 2497 insertions, 775 deletions
diff --git a/kernel/compat.c b/kernel/compat.c
index 9c48abfcd4a5..e1ef04870c2a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
| @@ -445,7 +445,7 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, | |||
| 445 | if (retval) | 445 | if (retval) |
| 446 | return retval; | 446 | return retval; |
| 447 | 447 | ||
| 448 | return sched_setaffinity(pid, new_mask); | 448 | return sched_setaffinity(pid, &new_mask); |
| 449 | } | 449 | } |
| 450 | 450 | ||
| 451 | asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, | 451 | asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, |
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2eff3f63abed..2011ad8d2697 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
| @@ -232,9 +232,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 232 | 232 | ||
| 233 | /* Ensure that we are not runnable on dying cpu */ | 233 | /* Ensure that we are not runnable on dying cpu */ |
| 234 | old_allowed = current->cpus_allowed; | 234 | old_allowed = current->cpus_allowed; |
| 235 | tmp = CPU_MASK_ALL; | 235 | cpus_setall(tmp); |
| 236 | cpu_clear(cpu, tmp); | 236 | cpu_clear(cpu, tmp); |
| 237 | set_cpus_allowed(current, tmp); | 237 | set_cpus_allowed_ptr(current, &tmp); |
| 238 | 238 | ||
| 239 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); | 239 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); |
| 240 | 240 | ||
| @@ -268,7 +268,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 268 | out_thread: | 268 | out_thread: |
| 269 | err = kthread_stop(p); | 269 | err = kthread_stop(p); |
| 270 | out_allowed: | 270 | out_allowed: |
| 271 | set_cpus_allowed(current, old_allowed); | 271 | set_cpus_allowed_ptr(current, &old_allowed); |
| 272 | out_release: | 272 | out_release: |
| 273 | cpu_hotplug_done(); | 273 | cpu_hotplug_done(); |
| 274 | return err; | 274 | return err; |
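The compat.c and cpu.c hunks above (and the kmod.c, rcutorture.c and cpuset.c hunks below) are mechanical conversions from the by-value set_cpus_allowed()/sched_setaffinity() calls to the by-pointer forms. A minimal sketch of the resulting calling convention; the helper name is hypothetical and not part of this series:

/*
 * Build the cpumask on the caller's stack and pass a pointer down,
 * instead of copying a whole cpumask_t by value.
 */
static int pin_current_away_from(unsigned int dying_cpu)
{
	cpumask_t mask;

	cpus_setall(mask);		/* allow every CPU...      */
	cpu_clear(dying_cpu, mask);	/* ...except the dying one */

	return set_cpus_allowed_ptr(current, &mask);
}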
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a1b61f414228..8b35fbd8292f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
| @@ -98,6 +98,9 @@ struct cpuset { | |||
| 98 | /* partition number for rebuild_sched_domains() */ | 98 | /* partition number for rebuild_sched_domains() */ |
| 99 | int pn; | 99 | int pn; |
| 100 | 100 | ||
| 101 | /* for custom sched domain */ | ||
| 102 | int relax_domain_level; | ||
| 103 | |||
| 101 | /* used for walking a cpuset heirarchy */ | 104 | /* used for walking a cpuset heirarchy */ |
| 102 | struct list_head stack_list; | 105 | struct list_head stack_list; |
| 103 | }; | 106 | }; |
| @@ -478,6 +481,16 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | |||
| 478 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); | 481 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); |
| 479 | } | 482 | } |
| 480 | 483 | ||
| 484 | static void | ||
| 485 | update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | ||
| 486 | { | ||
| 487 | if (!dattr) | ||
| 488 | return; | ||
| 489 | if (dattr->relax_domain_level < c->relax_domain_level) | ||
| 490 | dattr->relax_domain_level = c->relax_domain_level; | ||
| 491 | return; | ||
| 492 | } | ||
| 493 | |||
| 481 | /* | 494 | /* |
| 482 | * rebuild_sched_domains() | 495 | * rebuild_sched_domains() |
| 483 | * | 496 | * |
| @@ -553,12 +566,14 @@ static void rebuild_sched_domains(void) | |||
| 553 | int csn; /* how many cpuset ptrs in csa so far */ | 566 | int csn; /* how many cpuset ptrs in csa so far */ |
| 554 | int i, j, k; /* indices for partition finding loops */ | 567 | int i, j, k; /* indices for partition finding loops */ |
| 555 | cpumask_t *doms; /* resulting partition; i.e. sched domains */ | 568 | cpumask_t *doms; /* resulting partition; i.e. sched domains */ |
| 569 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | ||
| 556 | int ndoms; /* number of sched domains in result */ | 570 | int ndoms; /* number of sched domains in result */ |
| 557 | int nslot; /* next empty doms[] cpumask_t slot */ | 571 | int nslot; /* next empty doms[] cpumask_t slot */ |
| 558 | 572 | ||
| 559 | q = NULL; | 573 | q = NULL; |
| 560 | csa = NULL; | 574 | csa = NULL; |
| 561 | doms = NULL; | 575 | doms = NULL; |
| 576 | dattr = NULL; | ||
| 562 | 577 | ||
| 563 | /* Special case for the 99% of systems with one, full, sched domain */ | 578 | /* Special case for the 99% of systems with one, full, sched domain */ |
| 564 | if (is_sched_load_balance(&top_cpuset)) { | 579 | if (is_sched_load_balance(&top_cpuset)) { |
| @@ -566,6 +581,11 @@ static void rebuild_sched_domains(void) | |||
| 566 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | 581 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); |
| 567 | if (!doms) | 582 | if (!doms) |
| 568 | goto rebuild; | 583 | goto rebuild; |
| 584 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); | ||
| 585 | if (dattr) { | ||
| 586 | *dattr = SD_ATTR_INIT; | ||
| 587 | update_domain_attr(dattr, &top_cpuset); | ||
| 588 | } | ||
| 569 | *doms = top_cpuset.cpus_allowed; | 589 | *doms = top_cpuset.cpus_allowed; |
| 570 | goto rebuild; | 590 | goto rebuild; |
| 571 | } | 591 | } |
| @@ -622,6 +642,7 @@ restart: | |||
| 622 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); | 642 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); |
| 623 | if (!doms) | 643 | if (!doms) |
| 624 | goto rebuild; | 644 | goto rebuild; |
| 645 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); | ||
| 625 | 646 | ||
| 626 | for (nslot = 0, i = 0; i < csn; i++) { | 647 | for (nslot = 0, i = 0; i < csn; i++) { |
| 627 | struct cpuset *a = csa[i]; | 648 | struct cpuset *a = csa[i]; |
| @@ -644,12 +665,15 @@ restart: | |||
| 644 | } | 665 | } |
| 645 | 666 | ||
| 646 | cpus_clear(*dp); | 667 | cpus_clear(*dp); |
| 668 | if (dattr) | ||
| 669 | *(dattr + nslot) = SD_ATTR_INIT; | ||
| 647 | for (j = i; j < csn; j++) { | 670 | for (j = i; j < csn; j++) { |
| 648 | struct cpuset *b = csa[j]; | 671 | struct cpuset *b = csa[j]; |
| 649 | 672 | ||
| 650 | if (apn == b->pn) { | 673 | if (apn == b->pn) { |
| 651 | cpus_or(*dp, *dp, b->cpus_allowed); | 674 | cpus_or(*dp, *dp, b->cpus_allowed); |
| 652 | b->pn = -1; | 675 | b->pn = -1; |
| 676 | update_domain_attr(dattr, b); | ||
| 653 | } | 677 | } |
| 654 | } | 678 | } |
| 655 | nslot++; | 679 | nslot++; |
| @@ -660,7 +684,7 @@ restart: | |||
| 660 | rebuild: | 684 | rebuild: |
| 661 | /* Have scheduler rebuild sched domains */ | 685 | /* Have scheduler rebuild sched domains */ |
| 662 | get_online_cpus(); | 686 | get_online_cpus(); |
| 663 | partition_sched_domains(ndoms, doms); | 687 | partition_sched_domains(ndoms, doms, dattr); |
| 664 | put_online_cpus(); | 688 | put_online_cpus(); |
| 665 | 689 | ||
| 666 | done: | 690 | done: |
| @@ -668,6 +692,7 @@ done: | |||
| 668 | kfifo_free(q); | 692 | kfifo_free(q); |
| 669 | kfree(csa); | 693 | kfree(csa); |
| 670 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ | 694 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ |
| 695 | /* Don't kfree(dattr) -- partition_sched_domains() does that. */ | ||
| 671 | } | 696 | } |
| 672 | 697 | ||
| 673 | static inline int started_after_time(struct task_struct *t1, | 698 | static inline int started_after_time(struct task_struct *t1, |
| @@ -729,7 +754,7 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) | |||
| 729 | */ | 754 | */ |
| 730 | void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) | 755 | void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) |
| 731 | { | 756 | { |
| 732 | set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed); | 757 | set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); |
| 733 | } | 758 | } |
| 734 | 759 | ||
| 735 | /** | 760 | /** |
| @@ -1011,6 +1036,21 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | |||
| 1011 | return 0; | 1036 | return 0; |
| 1012 | } | 1037 | } |
| 1013 | 1038 | ||
| 1039 | static int update_relax_domain_level(struct cpuset *cs, char *buf) | ||
| 1040 | { | ||
| 1041 | int val = simple_strtol(buf, NULL, 10); | ||
| 1042 | |||
| 1043 | if (val < 0) | ||
| 1044 | val = -1; | ||
| 1045 | |||
| 1046 | if (val != cs->relax_domain_level) { | ||
| 1047 | cs->relax_domain_level = val; | ||
| 1048 | rebuild_sched_domains(); | ||
| 1049 | } | ||
| 1050 | |||
| 1051 | return 0; | ||
| 1052 | } | ||
| 1053 | |||
| 1014 | /* | 1054 | /* |
| 1015 | * update_flag - read a 0 or a 1 in a file and update associated flag | 1055 | * update_flag - read a 0 or a 1 in a file and update associated flag |
| 1016 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, | 1056 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, |
| @@ -1178,7 +1218,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, | |||
| 1178 | 1218 | ||
| 1179 | mutex_lock(&callback_mutex); | 1219 | mutex_lock(&callback_mutex); |
| 1180 | guarantee_online_cpus(cs, &cpus); | 1220 | guarantee_online_cpus(cs, &cpus); |
| 1181 | set_cpus_allowed(tsk, cpus); | 1221 | set_cpus_allowed_ptr(tsk, &cpus); |
| 1182 | mutex_unlock(&callback_mutex); | 1222 | mutex_unlock(&callback_mutex); |
| 1183 | 1223 | ||
| 1184 | from = oldcs->mems_allowed; | 1224 | from = oldcs->mems_allowed; |
| @@ -1202,6 +1242,7 @@ typedef enum { | |||
| 1202 | FILE_CPU_EXCLUSIVE, | 1242 | FILE_CPU_EXCLUSIVE, |
| 1203 | FILE_MEM_EXCLUSIVE, | 1243 | FILE_MEM_EXCLUSIVE, |
| 1204 | FILE_SCHED_LOAD_BALANCE, | 1244 | FILE_SCHED_LOAD_BALANCE, |
| 1245 | FILE_SCHED_RELAX_DOMAIN_LEVEL, | ||
| 1205 | FILE_MEMORY_PRESSURE_ENABLED, | 1246 | FILE_MEMORY_PRESSURE_ENABLED, |
| 1206 | FILE_MEMORY_PRESSURE, | 1247 | FILE_MEMORY_PRESSURE, |
| 1207 | FILE_SPREAD_PAGE, | 1248 | FILE_SPREAD_PAGE, |
| @@ -1256,6 +1297,9 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, | |||
| 1256 | case FILE_SCHED_LOAD_BALANCE: | 1297 | case FILE_SCHED_LOAD_BALANCE: |
| 1257 | retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); | 1298 | retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); |
| 1258 | break; | 1299 | break; |
| 1300 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | ||
| 1301 | retval = update_relax_domain_level(cs, buffer); | ||
| 1302 | break; | ||
| 1259 | case FILE_MEMORY_MIGRATE: | 1303 | case FILE_MEMORY_MIGRATE: |
| 1260 | retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); | 1304 | retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); |
| 1261 | break; | 1305 | break; |
| @@ -1354,6 +1398,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont, | |||
| 1354 | case FILE_SCHED_LOAD_BALANCE: | 1398 | case FILE_SCHED_LOAD_BALANCE: |
| 1355 | *s++ = is_sched_load_balance(cs) ? '1' : '0'; | 1399 | *s++ = is_sched_load_balance(cs) ? '1' : '0'; |
| 1356 | break; | 1400 | break; |
| 1401 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | ||
| 1402 | s += sprintf(s, "%d", cs->relax_domain_level); | ||
| 1403 | break; | ||
| 1357 | case FILE_MEMORY_MIGRATE: | 1404 | case FILE_MEMORY_MIGRATE: |
| 1358 | *s++ = is_memory_migrate(cs) ? '1' : '0'; | 1405 | *s++ = is_memory_migrate(cs) ? '1' : '0'; |
| 1359 | break; | 1406 | break; |
| @@ -1424,6 +1471,13 @@ static struct cftype cft_sched_load_balance = { | |||
| 1424 | .private = FILE_SCHED_LOAD_BALANCE, | 1471 | .private = FILE_SCHED_LOAD_BALANCE, |
| 1425 | }; | 1472 | }; |
| 1426 | 1473 | ||
| 1474 | static struct cftype cft_sched_relax_domain_level = { | ||
| 1475 | .name = "sched_relax_domain_level", | ||
| 1476 | .read = cpuset_common_file_read, | ||
| 1477 | .write = cpuset_common_file_write, | ||
| 1478 | .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, | ||
| 1479 | }; | ||
| 1480 | |||
| 1427 | static struct cftype cft_memory_migrate = { | 1481 | static struct cftype cft_memory_migrate = { |
| 1428 | .name = "memory_migrate", | 1482 | .name = "memory_migrate", |
| 1429 | .read = cpuset_common_file_read, | 1483 | .read = cpuset_common_file_read, |
| @@ -1475,6 +1529,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 1475 | return err; | 1529 | return err; |
| 1476 | if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) | 1530 | if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) |
| 1477 | return err; | 1531 | return err; |
| 1532 | if ((err = cgroup_add_file(cont, ss, | ||
| 1533 | &cft_sched_relax_domain_level)) < 0) | ||
| 1534 | return err; | ||
| 1478 | if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) | 1535 | if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) |
| 1479 | return err; | 1536 | return err; |
| 1480 | if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) | 1537 | if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) |
| @@ -1555,10 +1612,11 @@ static struct cgroup_subsys_state *cpuset_create( | |||
| 1555 | if (is_spread_slab(parent)) | 1612 | if (is_spread_slab(parent)) |
| 1556 | set_bit(CS_SPREAD_SLAB, &cs->flags); | 1613 | set_bit(CS_SPREAD_SLAB, &cs->flags); |
| 1557 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | 1614 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); |
| 1558 | cs->cpus_allowed = CPU_MASK_NONE; | 1615 | cpus_clear(cs->cpus_allowed); |
| 1559 | cs->mems_allowed = NODE_MASK_NONE; | 1616 | nodes_clear(cs->mems_allowed); |
| 1560 | cs->mems_generation = cpuset_mems_generation++; | 1617 | cs->mems_generation = cpuset_mems_generation++; |
| 1561 | fmeter_init(&cs->fmeter); | 1618 | fmeter_init(&cs->fmeter); |
| 1619 | cs->relax_domain_level = -1; | ||
| 1562 | 1620 | ||
| 1563 | cs->parent = parent; | 1621 | cs->parent = parent; |
| 1564 | number_of_cpusets++; | 1622 | number_of_cpusets++; |
| @@ -1625,12 +1683,13 @@ int __init cpuset_init(void) | |||
| 1625 | { | 1683 | { |
| 1626 | int err = 0; | 1684 | int err = 0; |
| 1627 | 1685 | ||
| 1628 | top_cpuset.cpus_allowed = CPU_MASK_ALL; | 1686 | cpus_setall(top_cpuset.cpus_allowed); |
| 1629 | top_cpuset.mems_allowed = NODE_MASK_ALL; | 1687 | nodes_setall(top_cpuset.mems_allowed); |
| 1630 | 1688 | ||
| 1631 | fmeter_init(&top_cpuset.fmeter); | 1689 | fmeter_init(&top_cpuset.fmeter); |
| 1632 | top_cpuset.mems_generation = cpuset_mems_generation++; | 1690 | top_cpuset.mems_generation = cpuset_mems_generation++; |
| 1633 | set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); | 1691 | set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); |
| 1692 | top_cpuset.relax_domain_level = -1; | ||
| 1634 | 1693 | ||
| 1635 | err = register_filesystem(&cpuset_fs_type); | 1694 | err = register_filesystem(&cpuset_fs_type); |
| 1636 | if (err < 0) | 1695 | if (err < 0) |
| @@ -1844,6 +1903,7 @@ void __init cpuset_init_smp(void) | |||
| 1844 | 1903 | ||
| 1845 | * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. | 1904 | * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. |
| 1846 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. | 1905 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. |
| 1906 | * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. | ||
| 1847 | * | 1907 | * |
| 1848 | * Description: Returns the cpumask_t cpus_allowed of the cpuset | 1908 | * Description: Returns the cpumask_t cpus_allowed of the cpuset |
| 1849 | * attached to the specified @tsk. Guaranteed to return some non-empty | 1909 | * attached to the specified @tsk. Guaranteed to return some non-empty |
| @@ -1851,35 +1911,27 @@ void __init cpuset_init_smp(void) | |||
| 1851 | * tasks cpuset. | 1911 | * tasks cpuset. |
| 1852 | **/ | 1912 | **/ |
| 1853 | 1913 | ||
| 1854 | cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) | 1914 | void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) |
| 1855 | { | 1915 | { |
| 1856 | cpumask_t mask; | ||
| 1857 | |||
| 1858 | mutex_lock(&callback_mutex); | 1916 | mutex_lock(&callback_mutex); |
| 1859 | mask = cpuset_cpus_allowed_locked(tsk); | 1917 | cpuset_cpus_allowed_locked(tsk, pmask); |
| 1860 | mutex_unlock(&callback_mutex); | 1918 | mutex_unlock(&callback_mutex); |
| 1861 | |||
| 1862 | return mask; | ||
| 1863 | } | 1919 | } |
| 1864 | 1920 | ||
| 1865 | /** | 1921 | /** |
| 1866 | * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. | 1922 | * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. |
| 1867 | * Must be called with callback_mutex held. | 1923 | * Must be called with callback_mutex held. |
| 1868 | **/ | 1924 | **/ |
| 1869 | cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) | 1925 | void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) |
| 1870 | { | 1926 | { |
| 1871 | cpumask_t mask; | ||
| 1872 | |||
| 1873 | task_lock(tsk); | 1927 | task_lock(tsk); |
| 1874 | guarantee_online_cpus(task_cs(tsk), &mask); | 1928 | guarantee_online_cpus(task_cs(tsk), pmask); |
| 1875 | task_unlock(tsk); | 1929 | task_unlock(tsk); |
| 1876 | |||
| 1877 | return mask; | ||
| 1878 | } | 1930 | } |
| 1879 | 1931 | ||
| 1880 | void cpuset_init_current_mems_allowed(void) | 1932 | void cpuset_init_current_mems_allowed(void) |
| 1881 | { | 1933 | { |
| 1882 | current->mems_allowed = NODE_MASK_ALL; | 1934 | nodes_setall(current->mems_allowed); |
| 1883 | } | 1935 | } |
| 1884 | 1936 | ||
| 1885 | /** | 1937 | /** |
| @@ -2261,8 +2313,16 @@ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) | |||
| 2261 | m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, | 2313 | m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, |
| 2262 | task->cpus_allowed); | 2314 | task->cpus_allowed); |
| 2263 | seq_printf(m, "\n"); | 2315 | seq_printf(m, "\n"); |
| 2316 | seq_printf(m, "Cpus_allowed_list:\t"); | ||
| 2317 | m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, | ||
| 2318 | task->cpus_allowed); | ||
| 2319 | seq_printf(m, "\n"); | ||
| 2264 | seq_printf(m, "Mems_allowed:\t"); | 2320 | seq_printf(m, "Mems_allowed:\t"); |
| 2265 | m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, | 2321 | m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, |
| 2266 | task->mems_allowed); | 2322 | task->mems_allowed); |
| 2267 | seq_printf(m, "\n"); | 2323 | seq_printf(m, "\n"); |
| 2324 | seq_printf(m, "Mems_allowed_list:\t"); | ||
| 2325 | m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, | ||
| 2326 | task->mems_allowed); | ||
| 2327 | seq_printf(m, "\n"); | ||
| 2268 | } | 2328 | } |
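Two changes in the cpuset.c hunks above are worth calling out: a new per-cpuset control file, sched_relax_domain_level (update_relax_domain_level() stores any negative write as -1, the default), and a changed in-kernel API, cpuset_cpus_allowed(), which now fills a caller-supplied mask instead of returning a cpumask_t by value. A small sketch of a caller of the new form; the function name is hypothetical, not taken from this commit:

static void move_task_to_cpuset_mask(struct task_struct *p)
{
	cpumask_t mask;

	/* fills mask from p's cpuset; guaranteed to be non-empty */
	cpuset_cpus_allowed(p, &mask);
	set_cpus_allowed_ptr(p, &mask);
}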
diff --git a/kernel/exit.c b/kernel/exit.c
index 073005b1cfb2..cece89f80ab4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
| @@ -521,7 +521,7 @@ void reset_files_struct(struct task_struct *tsk, struct files_struct *files) | |||
| 521 | } | 521 | } |
| 522 | EXPORT_SYMBOL(reset_files_struct); | 522 | EXPORT_SYMBOL(reset_files_struct); |
| 523 | 523 | ||
| 524 | static void __exit_files(struct task_struct *tsk) | 524 | void exit_files(struct task_struct *tsk) |
| 525 | { | 525 | { |
| 526 | struct files_struct * files = tsk->files; | 526 | struct files_struct * files = tsk->files; |
| 527 | 527 | ||
| @@ -533,12 +533,7 @@ static void __exit_files(struct task_struct *tsk) | |||
| 533 | } | 533 | } |
| 534 | } | 534 | } |
| 535 | 535 | ||
| 536 | void exit_files(struct task_struct *tsk) | 536 | void put_fs_struct(struct fs_struct *fs) |
| 537 | { | ||
| 538 | __exit_files(tsk); | ||
| 539 | } | ||
| 540 | |||
| 541 | static void __put_fs_struct(struct fs_struct *fs) | ||
| 542 | { | 537 | { |
| 543 | /* No need to hold fs->lock if we are killing it */ | 538 | /* No need to hold fs->lock if we are killing it */ |
| 544 | if (atomic_dec_and_test(&fs->count)) { | 539 | if (atomic_dec_and_test(&fs->count)) { |
| @@ -550,12 +545,7 @@ static void __put_fs_struct(struct fs_struct *fs) | |||
| 550 | } | 545 | } |
| 551 | } | 546 | } |
| 552 | 547 | ||
| 553 | void put_fs_struct(struct fs_struct *fs) | 548 | void exit_fs(struct task_struct *tsk) |
| 554 | { | ||
| 555 | __put_fs_struct(fs); | ||
| 556 | } | ||
| 557 | |||
| 558 | static void __exit_fs(struct task_struct *tsk) | ||
| 559 | { | 549 | { |
| 560 | struct fs_struct * fs = tsk->fs; | 550 | struct fs_struct * fs = tsk->fs; |
| 561 | 551 | ||
| @@ -563,15 +553,10 @@ static void __exit_fs(struct task_struct *tsk) | |||
| 563 | task_lock(tsk); | 553 | task_lock(tsk); |
| 564 | tsk->fs = NULL; | 554 | tsk->fs = NULL; |
| 565 | task_unlock(tsk); | 555 | task_unlock(tsk); |
| 566 | __put_fs_struct(fs); | 556 | put_fs_struct(fs); |
| 567 | } | 557 | } |
| 568 | } | 558 | } |
| 569 | 559 | ||
| 570 | void exit_fs(struct task_struct *tsk) | ||
| 571 | { | ||
| 572 | __exit_fs(tsk); | ||
| 573 | } | ||
| 574 | |||
| 575 | EXPORT_SYMBOL_GPL(exit_fs); | 560 | EXPORT_SYMBOL_GPL(exit_fs); |
| 576 | 561 | ||
| 577 | /* | 562 | /* |
| @@ -967,8 +952,8 @@ NORET_TYPE void do_exit(long code) | |||
| 967 | if (group_dead) | 952 | if (group_dead) |
| 968 | acct_process(); | 953 | acct_process(); |
| 969 | exit_sem(tsk); | 954 | exit_sem(tsk); |
| 970 | __exit_files(tsk); | 955 | exit_files(tsk); |
| 971 | __exit_fs(tsk); | 956 | exit_fs(tsk); |
| 972 | check_stack_usage(); | 957 | check_stack_usage(); |
| 973 | exit_thread(); | 958 | exit_thread(); |
| 974 | cgroup_exit(tsk, 1); | 959 | cgroup_exit(tsk, 1); |
diff --git a/kernel/fork.c b/kernel/fork.c
index 9c042f901570..89fe414645e9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
| @@ -132,6 +132,14 @@ void __put_task_struct(struct task_struct *tsk) | |||
| 132 | free_task(tsk); | 132 | free_task(tsk); |
| 133 | } | 133 | } |
| 134 | 134 | ||
| 135 | /* | ||
| 136 | * macro override instead of weak attribute alias, to workaround | ||
| 137 | * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. | ||
| 138 | */ | ||
| 139 | #ifndef arch_task_cache_init | ||
| 140 | #define arch_task_cache_init() | ||
| 141 | #endif | ||
| 142 | |||
| 135 | void __init fork_init(unsigned long mempages) | 143 | void __init fork_init(unsigned long mempages) |
| 136 | { | 144 | { |
| 137 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 145 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR |
| @@ -144,6 +152,9 @@ void __init fork_init(unsigned long mempages) | |||
| 144 | ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); | 152 | ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); |
| 145 | #endif | 153 | #endif |
| 146 | 154 | ||
| 155 | /* do the arch specific task caches init */ | ||
| 156 | arch_task_cache_init(); | ||
| 157 | |||
| 147 | /* | 158 | /* |
| 148 | * The default maximum number of threads is set to a safe | 159 | * The default maximum number of threads is set to a safe |
| 149 | * value: the thread structures can take up at most half | 160 | * value: the thread structures can take up at most half |
| @@ -163,6 +174,13 @@ void __init fork_init(unsigned long mempages) | |||
| 163 | init_task.signal->rlim[RLIMIT_NPROC]; | 174 | init_task.signal->rlim[RLIMIT_NPROC]; |
| 164 | } | 175 | } |
| 165 | 176 | ||
| 177 | int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, | ||
| 178 | struct task_struct *src) | ||
| 179 | { | ||
| 180 | *dst = *src; | ||
| 181 | return 0; | ||
| 182 | } | ||
| 183 | |||
| 166 | static struct task_struct *dup_task_struct(struct task_struct *orig) | 184 | static struct task_struct *dup_task_struct(struct task_struct *orig) |
| 167 | { | 185 | { |
| 168 | struct task_struct *tsk; | 186 | struct task_struct *tsk; |
| @@ -181,15 +199,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 181 | return NULL; | 199 | return NULL; |
| 182 | } | 200 | } |
| 183 | 201 | ||
| 184 | *tsk = *orig; | 202 | err = arch_dup_task_struct(tsk, orig); |
| 203 | if (err) | ||
| 204 | goto out; | ||
| 205 | |||
| 185 | tsk->stack = ti; | 206 | tsk->stack = ti; |
| 186 | 207 | ||
| 187 | err = prop_local_init_single(&tsk->dirties); | 208 | err = prop_local_init_single(&tsk->dirties); |
| 188 | if (err) { | 209 | if (err) |
| 189 | free_thread_info(ti); | 210 | goto out; |
| 190 | free_task_struct(tsk); | ||
| 191 | return NULL; | ||
| 192 | } | ||
| 193 | 211 | ||
| 194 | setup_thread_stack(tsk, orig); | 212 | setup_thread_stack(tsk, orig); |
| 195 | 213 | ||
| @@ -205,6 +223,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 205 | #endif | 223 | #endif |
| 206 | tsk->splice_pipe = NULL; | 224 | tsk->splice_pipe = NULL; |
| 207 | return tsk; | 225 | return tsk; |
| 226 | |||
| 227 | out: | ||
| 228 | free_thread_info(ti); | ||
| 229 | free_task_struct(tsk); | ||
| 230 | return NULL; | ||
| 208 | } | 231 | } |
| 209 | 232 | ||
| 210 | #ifdef CONFIG_MMU | 233 | #ifdef CONFIG_MMU |
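The weak arch_dup_task_struct() added above simply copies the whole task_struct; an architecture can override it when part of the task needs special handling during fork. A rough sketch of what an override might look like; the copy-then-fixup split is an assumption, not code from this series:

int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	*dst = *src;	/* generic copy, same as the weak default */

	/*
	 * Arch-specific fixups would go here, e.g. giving dst its own
	 * copy of lazily-saved register state rather than sharing src's.
	 */
	return 0;
}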
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index c642ef75069f..f78777abe769 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
| @@ -1238,51 +1238,50 @@ void hrtimer_run_pending(void) | |||
| 1238 | /* | 1238 | /* |
| 1239 | * Called from hardirq context every jiffy | 1239 | * Called from hardirq context every jiffy |
| 1240 | */ | 1240 | */ |
| 1241 | static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, | 1241 | void hrtimer_run_queues(void) |
| 1242 | int index) | ||
| 1243 | { | 1242 | { |
| 1244 | struct rb_node *node; | 1243 | struct rb_node *node; |
| 1245 | struct hrtimer_clock_base *base = &cpu_base->clock_base[index]; | 1244 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
| 1245 | struct hrtimer_clock_base *base; | ||
| 1246 | int index, gettime = 1; | ||
| 1246 | 1247 | ||
| 1247 | if (!base->first) | 1248 | if (hrtimer_hres_active()) |
| 1248 | return; | 1249 | return; |
| 1249 | 1250 | ||
| 1250 | if (base->get_softirq_time) | 1251 | for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { |
| 1251 | base->softirq_time = base->get_softirq_time(); | 1252 | base = &cpu_base->clock_base[index]; |
| 1252 | |||
| 1253 | spin_lock(&cpu_base->lock); | ||
| 1254 | 1253 | ||
| 1255 | while ((node = base->first)) { | 1254 | if (!base->first) |
| 1256 | struct hrtimer *timer; | ||
| 1257 | |||
| 1258 | timer = rb_entry(node, struct hrtimer, node); | ||
| 1259 | if (base->softirq_time.tv64 <= timer->expires.tv64) | ||
| 1260 | break; | ||
| 1261 | |||
| 1262 | if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { | ||
| 1263 | __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0); | ||
| 1264 | list_add_tail(&timer->cb_entry, | ||
| 1265 | &base->cpu_base->cb_pending); | ||
| 1266 | continue; | 1255 | continue; |
| 1256 | |||
| 1257 | if (base->get_softirq_time) | ||
| 1258 | base->softirq_time = base->get_softirq_time(); | ||
| 1259 | else if (gettime) { | ||
| 1260 | hrtimer_get_softirq_time(cpu_base); | ||
| 1261 | gettime = 0; | ||
| 1267 | } | 1262 | } |
| 1268 | 1263 | ||
| 1269 | __run_hrtimer(timer); | 1264 | spin_lock(&cpu_base->lock); |
| 1270 | } | ||
| 1271 | spin_unlock(&cpu_base->lock); | ||
| 1272 | } | ||
| 1273 | 1265 | ||
| 1274 | void hrtimer_run_queues(void) | 1266 | while ((node = base->first)) { |
| 1275 | { | 1267 | struct hrtimer *timer; |
| 1276 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
| 1277 | int i; | ||
| 1278 | 1268 | ||
| 1279 | if (hrtimer_hres_active()) | 1269 | timer = rb_entry(node, struct hrtimer, node); |
| 1280 | return; | 1270 | if (base->softirq_time.tv64 <= timer->expires.tv64) |
| 1271 | break; | ||
| 1281 | 1272 | ||
| 1282 | hrtimer_get_softirq_time(cpu_base); | 1273 | if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { |
| 1274 | __remove_hrtimer(timer, base, | ||
| 1275 | HRTIMER_STATE_PENDING, 0); | ||
| 1276 | list_add_tail(&timer->cb_entry, | ||
| 1277 | &base->cpu_base->cb_pending); | ||
| 1278 | continue; | ||
| 1279 | } | ||
| 1283 | 1280 | ||
| 1284 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1281 | __run_hrtimer(timer); |
| 1285 | run_hrtimer_queue(cpu_base, i); | 1282 | } |
| 1283 | spin_unlock(&cpu_base->lock); | ||
| 1284 | } | ||
| 1286 | } | 1285 | } |
| 1287 | 1286 | ||
| 1288 | /* | 1287 | /* |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index fdb3fbe2b0c4..964964baefa2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
| @@ -47,7 +47,7 @@ void dynamic_irq_init(unsigned int irq) | |||
| 47 | desc->irq_count = 0; | 47 | desc->irq_count = 0; |
| 48 | desc->irqs_unhandled = 0; | 48 | desc->irqs_unhandled = 0; |
| 49 | #ifdef CONFIG_SMP | 49 | #ifdef CONFIG_SMP |
| 50 | desc->affinity = CPU_MASK_ALL; | 50 | cpus_setall(desc->affinity); |
| 51 | #endif | 51 | #endif |
| 52 | spin_unlock_irqrestore(&desc->lock, flags); | 52 | spin_unlock_irqrestore(&desc->lock, flags); |
| 53 | } | 53 | } |
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 06a0e2775651..6782dce93d01 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
| @@ -29,7 +29,6 @@ | |||
| 29 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
| 30 | #include <asm/io.h> | 30 | #include <asm/io.h> |
| 31 | #include <asm/system.h> | 31 | #include <asm/system.h> |
| 32 | #include <asm/semaphore.h> | ||
| 33 | #include <asm/sections.h> | 32 | #include <asm/sections.h> |
| 34 | 33 | ||
| 35 | /* Per cpu memory for storing cpu states in case of system crash. */ | 34 | /* Per cpu memory for storing cpu states in case of system crash. */ |
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 22be3ff3f363..e2764047ec03 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
| @@ -165,7 +165,7 @@ static int ____call_usermodehelper(void *data) | |||
| 165 | } | 165 | } |
| 166 | 166 | ||
| 167 | /* We can run anywhere, unlike our parent keventd(). */ | 167 | /* We can run anywhere, unlike our parent keventd(). */ |
| 168 | set_cpus_allowed(current, CPU_MASK_ALL); | 168 | set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); |
| 169 | 169 | ||
| 170 | /* | 170 | /* |
| 171 | * Our parent is keventd, which runs with elevated scheduling priority. | 171 | * Our parent is keventd, which runs with elevated scheduling priority. |
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 0ac887882f90..92cf6930ab51 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
| @@ -13,7 +13,6 @@ | |||
| 13 | #include <linux/file.h> | 13 | #include <linux/file.h> |
| 14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| 15 | #include <linux/mutex.h> | 15 | #include <linux/mutex.h> |
| 16 | #include <asm/semaphore.h> | ||
| 17 | 16 | ||
| 18 | #define KTHREAD_NICE_LEVEL (-5) | 17 | #define KTHREAD_NICE_LEVEL (-5) |
| 19 | 18 | ||
| @@ -180,6 +179,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) | |||
| 180 | wait_task_inactive(k); | 179 | wait_task_inactive(k); |
| 181 | set_task_cpu(k, cpu); | 180 | set_task_cpu(k, cpu); |
| 182 | k->cpus_allowed = cpumask_of_cpu(cpu); | 181 | k->cpus_allowed = cpumask_of_cpu(cpu); |
| 182 | k->rt.nr_cpus_allowed = 1; | ||
| 183 | } | 183 | } |
| 184 | EXPORT_SYMBOL(kthread_bind); | 184 | EXPORT_SYMBOL(kthread_bind); |
| 185 | 185 | ||
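kthread_bind() now also sets rt.nr_cpus_allowed to 1, so the RT scheduler's push/pull logic treats the thread as pinned. For context, the usual create/bind/wake pattern around this helper; the wrapper below is hypothetical, not part of this commit:

static struct task_struct *start_pinned_worker(int (*fn)(void *), int cpu)
{
	struct task_struct *k;

	k = kthread_create(fn, NULL, "worker/%d", cpu);
	if (IS_ERR(k))
		return k;

	kthread_bind(k, cpu);	/* pin before the thread first runs */
	wake_up_process(k);
	return k;
}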
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index b4e3c85abe74..7c74dab0d21b 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
| @@ -64,8 +64,8 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record | |||
| 64 | return; | 64 | return; |
| 65 | 65 | ||
| 66 | for (i = 0; i < MAXLR; i++) { | 66 | for (i = 0; i < MAXLR; i++) { |
| 67 | int q; | 67 | int q, same = 1; |
| 68 | int same = 1; | 68 | |
| 69 | /* Nothing stored: */ | 69 | /* Nothing stored: */ |
| 70 | if (!latency_record[i].backtrace[0]) { | 70 | if (!latency_record[i].backtrace[0]) { |
| 71 | if (firstnonnull > i) | 71 | if (firstnonnull > i) |
| @@ -73,12 +73,15 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record | |||
| 73 | continue; | 73 | continue; |
| 74 | } | 74 | } |
| 75 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | 75 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { |
| 76 | if (latency_record[i].backtrace[q] != | 76 | unsigned long record = lat->backtrace[q]; |
| 77 | lat->backtrace[q]) | 77 | |
| 78 | if (latency_record[i].backtrace[q] != record) { | ||
| 78 | same = 0; | 79 | same = 0; |
| 79 | if (same && lat->backtrace[q] == 0) | ||
| 80 | break; | 80 | break; |
| 81 | if (same && lat->backtrace[q] == ULONG_MAX) | 81 | } |
| 82 | |||
| 83 | /* 0 and ULONG_MAX entries mean end of backtrace: */ | ||
| 84 | if (record == 0 || record == ULONG_MAX) | ||
| 82 | break; | 85 | break; |
| 83 | } | 86 | } |
| 84 | if (same) { | 87 | if (same) { |
| @@ -143,14 +146,18 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | |||
| 143 | for (i = 0; i < LT_SAVECOUNT ; i++) { | 146 | for (i = 0; i < LT_SAVECOUNT ; i++) { |
| 144 | struct latency_record *mylat; | 147 | struct latency_record *mylat; |
| 145 | int same = 1; | 148 | int same = 1; |
| 149 | |||
| 146 | mylat = &tsk->latency_record[i]; | 150 | mylat = &tsk->latency_record[i]; |
| 147 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | 151 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { |
| 148 | if (mylat->backtrace[q] != | 152 | unsigned long record = lat.backtrace[q]; |
| 149 | lat.backtrace[q]) | 153 | |
| 154 | if (mylat->backtrace[q] != record) { | ||
| 150 | same = 0; | 155 | same = 0; |
| 151 | if (same && lat.backtrace[q] == 0) | ||
| 152 | break; | 156 | break; |
| 153 | if (same && lat.backtrace[q] == ULONG_MAX) | 157 | } |
| 158 | |||
| 159 | /* 0 and ULONG_MAX entries mean end of backtrace: */ | ||
| 160 | if (record == 0 || record == ULONG_MAX) | ||
| 154 | break; | 161 | break; |
| 155 | } | 162 | } |
| 156 | if (same) { | 163 | if (same) { |
diff --git a/kernel/module.c b/kernel/module.c
index 5d437bffd8dc..8d6cccc6c3cf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
| @@ -43,7 +43,6 @@ | |||
| 43 | #include <linux/mutex.h> | 43 | #include <linux/mutex.h> |
| 44 | #include <linux/unwind.h> | 44 | #include <linux/unwind.h> |
| 45 | #include <asm/uaccess.h> | 45 | #include <asm/uaccess.h> |
| 46 | #include <asm/semaphore.h> | ||
| 47 | #include <asm/cacheflush.h> | 46 | #include <asm/cacheflush.h> |
| 48 | #include <linux/license.h> | 47 | #include <linux/license.h> |
| 49 | #include <asm/sections.h> | 48 | #include <asm/sections.h> |
| @@ -664,7 +663,7 @@ static void free_module(struct module *mod); | |||
| 664 | 663 | ||
| 665 | static void wait_for_zero_refcount(struct module *mod) | 664 | static void wait_for_zero_refcount(struct module *mod) |
| 666 | { | 665 | { |
| 667 | /* Since we might sleep for some time, drop the semaphore first */ | 666 | /* Since we might sleep for some time, release the mutex first */ |
| 668 | mutex_unlock(&module_mutex); | 667 | mutex_unlock(&module_mutex); |
| 669 | for (;;) { | 668 | for (;;) { |
| 670 | DEBUGP("Looking at refcount...\n"); | 669 | DEBUGP("Looking at refcount...\n"); |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index a9b04203a66d..8476956ffd92 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
| @@ -37,7 +37,6 @@ | |||
| 37 | #include <linux/mutex.h> | 37 | #include <linux/mutex.h> |
| 38 | 38 | ||
| 39 | #include <asm/uaccess.h> | 39 | #include <asm/uaccess.h> |
| 40 | #include <asm/semaphore.h> | ||
| 41 | #include <linux/list.h> | 40 | #include <linux/list.h> |
| 42 | #include <linux/init.h> | 41 | #include <linux/init.h> |
| 43 | #include <linux/compiler.h> | 42 | #include <linux/compiler.h> |
diff --git a/kernel/profile.c b/kernel/profile.c
index 3b7a1b055122..606d7387265c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
| @@ -23,7 +23,6 @@ | |||
| 23 | #include <linux/highmem.h> | 23 | #include <linux/highmem.h> |
| 24 | #include <linux/mutex.h> | 24 | #include <linux/mutex.h> |
| 25 | #include <asm/sections.h> | 25 | #include <asm/sections.h> |
| 26 | #include <asm/semaphore.h> | ||
| 27 | #include <asm/irq_regs.h> | 26 | #include <asm/irq_regs.h> |
| 28 | #include <asm/ptrace.h> | 27 | #include <asm/ptrace.h> |
| 29 | 28 | ||
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index fdb34e86f923..67e392ed5496 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
| @@ -323,9 +323,8 @@ static int ptrace_setoptions(struct task_struct *child, long data) | |||
| 323 | return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; | 323 | return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; |
| 324 | } | 324 | } |
| 325 | 325 | ||
| 326 | static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) | 326 | static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) |
| 327 | { | 327 | { |
| 328 | siginfo_t lastinfo; | ||
| 329 | int error = -ESRCH; | 328 | int error = -ESRCH; |
| 330 | 329 | ||
| 331 | read_lock(&tasklist_lock); | 330 | read_lock(&tasklist_lock); |
| @@ -333,31 +332,25 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) | |||
| 333 | error = -EINVAL; | 332 | error = -EINVAL; |
| 334 | spin_lock_irq(&child->sighand->siglock); | 333 | spin_lock_irq(&child->sighand->siglock); |
| 335 | if (likely(child->last_siginfo != NULL)) { | 334 | if (likely(child->last_siginfo != NULL)) { |
| 336 | lastinfo = *child->last_siginfo; | 335 | *info = *child->last_siginfo; |
| 337 | error = 0; | 336 | error = 0; |
| 338 | } | 337 | } |
| 339 | spin_unlock_irq(&child->sighand->siglock); | 338 | spin_unlock_irq(&child->sighand->siglock); |
| 340 | } | 339 | } |
| 341 | read_unlock(&tasklist_lock); | 340 | read_unlock(&tasklist_lock); |
| 342 | if (!error) | ||
| 343 | return copy_siginfo_to_user(data, &lastinfo); | ||
| 344 | return error; | 341 | return error; |
| 345 | } | 342 | } |
| 346 | 343 | ||
| 347 | static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data) | 344 | static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) |
| 348 | { | 345 | { |
| 349 | siginfo_t newinfo; | ||
| 350 | int error = -ESRCH; | 346 | int error = -ESRCH; |
| 351 | 347 | ||
| 352 | if (copy_from_user(&newinfo, data, sizeof (siginfo_t))) | ||
| 353 | return -EFAULT; | ||
| 354 | |||
| 355 | read_lock(&tasklist_lock); | 348 | read_lock(&tasklist_lock); |
| 356 | if (likely(child->sighand != NULL)) { | 349 | if (likely(child->sighand != NULL)) { |
| 357 | error = -EINVAL; | 350 | error = -EINVAL; |
| 358 | spin_lock_irq(&child->sighand->siglock); | 351 | spin_lock_irq(&child->sighand->siglock); |
| 359 | if (likely(child->last_siginfo != NULL)) { | 352 | if (likely(child->last_siginfo != NULL)) { |
| 360 | *child->last_siginfo = newinfo; | 353 | *child->last_siginfo = *info; |
| 361 | error = 0; | 354 | error = 0; |
| 362 | } | 355 | } |
| 363 | spin_unlock_irq(&child->sighand->siglock); | 356 | spin_unlock_irq(&child->sighand->siglock); |
| @@ -424,6 +417,7 @@ int ptrace_request(struct task_struct *child, long request, | |||
| 424 | long addr, long data) | 417 | long addr, long data) |
| 425 | { | 418 | { |
| 426 | int ret = -EIO; | 419 | int ret = -EIO; |
| 420 | siginfo_t siginfo; | ||
| 427 | 421 | ||
| 428 | switch (request) { | 422 | switch (request) { |
| 429 | case PTRACE_PEEKTEXT: | 423 | case PTRACE_PEEKTEXT: |
| @@ -442,12 +436,22 @@ int ptrace_request(struct task_struct *child, long request, | |||
| 442 | case PTRACE_GETEVENTMSG: | 436 | case PTRACE_GETEVENTMSG: |
| 443 | ret = put_user(child->ptrace_message, (unsigned long __user *) data); | 437 | ret = put_user(child->ptrace_message, (unsigned long __user *) data); |
| 444 | break; | 438 | break; |
| 439 | |||
| 445 | case PTRACE_GETSIGINFO: | 440 | case PTRACE_GETSIGINFO: |
| 446 | ret = ptrace_getsiginfo(child, (siginfo_t __user *) data); | 441 | ret = ptrace_getsiginfo(child, &siginfo); |
| 442 | if (!ret) | ||
| 443 | ret = copy_siginfo_to_user((siginfo_t __user *) data, | ||
| 444 | &siginfo); | ||
| 447 | break; | 445 | break; |
| 446 | |||
| 448 | case PTRACE_SETSIGINFO: | 447 | case PTRACE_SETSIGINFO: |
| 449 | ret = ptrace_setsiginfo(child, (siginfo_t __user *) data); | 448 | if (copy_from_user(&siginfo, (siginfo_t __user *) data, |
| 449 | sizeof siginfo)) | ||
| 450 | ret = -EFAULT; | ||
| 451 | else | ||
| 452 | ret = ptrace_setsiginfo(child, &siginfo); | ||
| 450 | break; | 453 | break; |
| 454 | |||
| 451 | case PTRACE_DETACH: /* detach a process that was attached. */ | 455 | case PTRACE_DETACH: /* detach a process that was attached. */ |
| 452 | ret = ptrace_detach(child, data); | 456 | ret = ptrace_detach(child, data); |
| 453 | break; | 457 | break; |
| @@ -616,6 +620,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
| 616 | { | 620 | { |
| 617 | compat_ulong_t __user *datap = compat_ptr(data); | 621 | compat_ulong_t __user *datap = compat_ptr(data); |
| 618 | compat_ulong_t word; | 622 | compat_ulong_t word; |
| 623 | siginfo_t siginfo; | ||
| 619 | int ret; | 624 | int ret; |
| 620 | 625 | ||
| 621 | switch (request) { | 626 | switch (request) { |
| @@ -638,6 +643,23 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
| 638 | ret = put_user((compat_ulong_t) child->ptrace_message, datap); | 643 | ret = put_user((compat_ulong_t) child->ptrace_message, datap); |
| 639 | break; | 644 | break; |
| 640 | 645 | ||
| 646 | case PTRACE_GETSIGINFO: | ||
| 647 | ret = ptrace_getsiginfo(child, &siginfo); | ||
| 648 | if (!ret) | ||
| 649 | ret = copy_siginfo_to_user32( | ||
| 650 | (struct compat_siginfo __user *) datap, | ||
| 651 | &siginfo); | ||
| 652 | break; | ||
| 653 | |||
| 654 | case PTRACE_SETSIGINFO: | ||
| 655 | memset(&siginfo, 0, sizeof siginfo); | ||
| 656 | if (copy_siginfo_from_user32( | ||
| 657 | &siginfo, (struct compat_siginfo __user *) datap)) | ||
| 658 | ret = -EFAULT; | ||
| 659 | else | ||
| 660 | ret = ptrace_setsiginfo(child, &siginfo); | ||
| 661 | break; | ||
| 662 | |||
| 641 | default: | 663 | default: |
| 642 | ret = ptrace_request(child, request, addr, data); | 664 | ret = ptrace_request(child, request, addr, data); |
| 643 | } | 665 | } |
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index e9517014b57c..e1cdf196a515 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
| @@ -1007,10 +1007,10 @@ void __synchronize_sched(void) | |||
| 1007 | if (sched_getaffinity(0, &oldmask) < 0) | 1007 | if (sched_getaffinity(0, &oldmask) < 0) |
| 1008 | oldmask = cpu_possible_map; | 1008 | oldmask = cpu_possible_map; |
| 1009 | for_each_online_cpu(cpu) { | 1009 | for_each_online_cpu(cpu) { |
| 1010 | sched_setaffinity(0, cpumask_of_cpu(cpu)); | 1010 | sched_setaffinity(0, &cpumask_of_cpu(cpu)); |
| 1011 | schedule(); | 1011 | schedule(); |
| 1012 | } | 1012 | } |
| 1013 | sched_setaffinity(0, oldmask); | 1013 | sched_setaffinity(0, &oldmask); |
| 1014 | } | 1014 | } |
| 1015 | EXPORT_SYMBOL_GPL(__synchronize_sched); | 1015 | EXPORT_SYMBOL_GPL(__synchronize_sched); |
| 1016 | 1016 | ||
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index fd599829e72a..47894f919d4e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
| @@ -723,9 +723,10 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ | |||
| 723 | */ | 723 | */ |
| 724 | static void rcu_torture_shuffle_tasks(void) | 724 | static void rcu_torture_shuffle_tasks(void) |
| 725 | { | 725 | { |
| 726 | cpumask_t tmp_mask = CPU_MASK_ALL; | 726 | cpumask_t tmp_mask; |
| 727 | int i; | 727 | int i; |
| 728 | 728 | ||
| 729 | cpus_setall(tmp_mask); | ||
| 729 | get_online_cpus(); | 730 | get_online_cpus(); |
| 730 | 731 | ||
| 731 | /* No point in shuffling if there is only one online CPU (ex: UP) */ | 732 | /* No point in shuffling if there is only one online CPU (ex: UP) */ |
| @@ -737,25 +738,27 @@ static void rcu_torture_shuffle_tasks(void) | |||
| 737 | if (rcu_idle_cpu != -1) | 738 | if (rcu_idle_cpu != -1) |
| 738 | cpu_clear(rcu_idle_cpu, tmp_mask); | 739 | cpu_clear(rcu_idle_cpu, tmp_mask); |
| 739 | 740 | ||
| 740 | set_cpus_allowed(current, tmp_mask); | 741 | set_cpus_allowed_ptr(current, &tmp_mask); |
| 741 | 742 | ||
| 742 | if (reader_tasks) { | 743 | if (reader_tasks) { |
| 743 | for (i = 0; i < nrealreaders; i++) | 744 | for (i = 0; i < nrealreaders; i++) |
| 744 | if (reader_tasks[i]) | 745 | if (reader_tasks[i]) |
| 745 | set_cpus_allowed(reader_tasks[i], tmp_mask); | 746 | set_cpus_allowed_ptr(reader_tasks[i], |
| 747 | &tmp_mask); | ||
| 746 | } | 748 | } |
| 747 | 749 | ||
| 748 | if (fakewriter_tasks) { | 750 | if (fakewriter_tasks) { |
| 749 | for (i = 0; i < nfakewriters; i++) | 751 | for (i = 0; i < nfakewriters; i++) |
| 750 | if (fakewriter_tasks[i]) | 752 | if (fakewriter_tasks[i]) |
| 751 | set_cpus_allowed(fakewriter_tasks[i], tmp_mask); | 753 | set_cpus_allowed_ptr(fakewriter_tasks[i], |
| 754 | &tmp_mask); | ||
| 752 | } | 755 | } |
| 753 | 756 | ||
| 754 | if (writer_task) | 757 | if (writer_task) |
| 755 | set_cpus_allowed(writer_task, tmp_mask); | 758 | set_cpus_allowed_ptr(writer_task, &tmp_mask); |
| 756 | 759 | ||
| 757 | if (stats_task) | 760 | if (stats_task) |
| 758 | set_cpus_allowed(stats_task, tmp_mask); | 761 | set_cpus_allowed_ptr(stats_task, &tmp_mask); |
| 759 | 762 | ||
| 760 | if (rcu_idle_cpu == -1) | 763 | if (rcu_idle_cpu == -1) |
| 761 | rcu_idle_cpu = num_online_cpus() - 1; | 764 | rcu_idle_cpu = num_online_cpus() - 1; |
diff --git a/kernel/resource.c b/kernel/resource.c
index 82aea814d409..cee12cc47cab 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
| @@ -486,6 +486,24 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t | |||
| 486 | 486 | ||
| 487 | EXPORT_SYMBOL(adjust_resource); | 487 | EXPORT_SYMBOL(adjust_resource); |
| 488 | 488 | ||
| 489 | /** | ||
| 490 | * resource_alignment - calculate resource's alignment | ||
| 491 | * @res: resource pointer | ||
| 492 | * | ||
| 493 | * Returns alignment on success, 0 (invalid alignment) on failure. | ||
| 494 | */ | ||
| 495 | resource_size_t resource_alignment(struct resource *res) | ||
| 496 | { | ||
| 497 | switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { | ||
| 498 | case IORESOURCE_SIZEALIGN: | ||
| 499 | return res->end - res->start + 1; | ||
| 500 | case IORESOURCE_STARTALIGN: | ||
| 501 | return res->start; | ||
| 502 | default: | ||
| 503 | return 0; | ||
| 504 | } | ||
| 505 | } | ||
| 506 | |||
| 489 | /* | 507 | /* |
| 490 | * This is compatibility stuff for IO resources. | 508 | * This is compatibility stuff for IO resources. |
| 491 | * | 509 | * |
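resource_alignment() above returns the resource's size for IORESOURCE_SIZEALIGN, its start for IORESOURCE_STARTALIGN, and 0 when neither flag is set. A small hypothetical caller (not part of this commit) showing how the "0 = no alignment information" case is meant to be handled:

static resource_size_t choose_alignment(struct resource *res,
					resource_size_t min_align)
{
	resource_size_t align = resource_alignment(res);

	/* 0 means res->flags carries no usable alignment information */
	if (!align)
		return min_align;

	return max(align, min_align);
}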
diff --git a/kernel/sched.c b/kernel/sched.c
index 8dcdec6fe0fe..0014b03adaca 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
| @@ -66,6 +66,10 @@ | |||
| 66 | #include <linux/unistd.h> | 66 | #include <linux/unistd.h> |
| 67 | #include <linux/pagemap.h> | 67 | #include <linux/pagemap.h> |
| 68 | #include <linux/hrtimer.h> | 68 | #include <linux/hrtimer.h> |
| 69 | #include <linux/tick.h> | ||
| 70 | #include <linux/bootmem.h> | ||
| 71 | #include <linux/debugfs.h> | ||
| 72 | #include <linux/ctype.h> | ||
| 69 | 73 | ||
| 70 | #include <asm/tlb.h> | 74 | #include <asm/tlb.h> |
| 71 | #include <asm/irq_regs.h> | 75 | #include <asm/irq_regs.h> |
| @@ -114,6 +118,11 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
| 114 | */ | 118 | */ |
| 115 | #define DEF_TIMESLICE (100 * HZ / 1000) | 119 | #define DEF_TIMESLICE (100 * HZ / 1000) |
| 116 | 120 | ||
| 121 | /* | ||
| 122 | * single value that denotes runtime == period, ie unlimited time. | ||
| 123 | */ | ||
| 124 | #define RUNTIME_INF ((u64)~0ULL) | ||
| 125 | |||
| 117 | #ifdef CONFIG_SMP | 126 | #ifdef CONFIG_SMP |
| 118 | /* | 127 | /* |
| 119 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) | 128 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) |
| @@ -155,6 +164,84 @@ struct rt_prio_array { | |||
| 155 | struct list_head queue[MAX_RT_PRIO]; | 164 | struct list_head queue[MAX_RT_PRIO]; |
| 156 | }; | 165 | }; |
| 157 | 166 | ||
| 167 | struct rt_bandwidth { | ||
| 168 | /* nests inside the rq lock: */ | ||
| 169 | spinlock_t rt_runtime_lock; | ||
| 170 | ktime_t rt_period; | ||
| 171 | u64 rt_runtime; | ||
| 172 | struct hrtimer rt_period_timer; | ||
| 173 | }; | ||
| 174 | |||
| 175 | static struct rt_bandwidth def_rt_bandwidth; | ||
| 176 | |||
| 177 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); | ||
| 178 | |||
| 179 | static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) | ||
| 180 | { | ||
| 181 | struct rt_bandwidth *rt_b = | ||
| 182 | container_of(timer, struct rt_bandwidth, rt_period_timer); | ||
| 183 | ktime_t now; | ||
| 184 | int overrun; | ||
| 185 | int idle = 0; | ||
| 186 | |||
| 187 | for (;;) { | ||
| 188 | now = hrtimer_cb_get_time(timer); | ||
| 189 | overrun = hrtimer_forward(timer, now, rt_b->rt_period); | ||
| 190 | |||
| 191 | if (!overrun) | ||
| 192 | break; | ||
| 193 | |||
| 194 | idle = do_sched_rt_period_timer(rt_b, overrun); | ||
| 195 | } | ||
| 196 | |||
| 197 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
| 198 | } | ||
| 199 | |||
| 200 | static | ||
| 201 | void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | ||
| 202 | { | ||
| 203 | rt_b->rt_period = ns_to_ktime(period); | ||
| 204 | rt_b->rt_runtime = runtime; | ||
| 205 | |||
| 206 | spin_lock_init(&rt_b->rt_runtime_lock); | ||
| 207 | |||
| 208 | hrtimer_init(&rt_b->rt_period_timer, | ||
| 209 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 210 | rt_b->rt_period_timer.function = sched_rt_period_timer; | ||
| 211 | rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | ||
| 212 | } | ||
| 213 | |||
| 214 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
| 215 | { | ||
| 216 | ktime_t now; | ||
| 217 | |||
| 218 | if (rt_b->rt_runtime == RUNTIME_INF) | ||
| 219 | return; | ||
| 220 | |||
| 221 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
| 222 | return; | ||
| 223 | |||
| 224 | spin_lock(&rt_b->rt_runtime_lock); | ||
| 225 | for (;;) { | ||
| 226 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
| 227 | break; | ||
| 228 | |||
| 229 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); | ||
| 230 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); | ||
| 231 | hrtimer_start(&rt_b->rt_period_timer, | ||
| 232 | rt_b->rt_period_timer.expires, | ||
| 233 | HRTIMER_MODE_ABS); | ||
| 234 | } | ||
| 235 | spin_unlock(&rt_b->rt_runtime_lock); | ||
| 236 | } | ||
| 237 | |||
| 238 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 239 | static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
| 240 | { | ||
| 241 | hrtimer_cancel(&rt_b->rt_period_timer); | ||
| 242 | } | ||
| 243 | #endif | ||
| 244 | |||
| 158 | #ifdef CONFIG_GROUP_SCHED | 245 | #ifdef CONFIG_GROUP_SCHED |
| 159 | 246 | ||
| 160 | #include <linux/cgroup.h> | 247 | #include <linux/cgroup.h> |
| @@ -181,29 +268,39 @@ struct task_group { | |||
| 181 | struct sched_rt_entity **rt_se; | 268 | struct sched_rt_entity **rt_se; |
| 182 | struct rt_rq **rt_rq; | 269 | struct rt_rq **rt_rq; |
| 183 | 270 | ||
| 184 | u64 rt_runtime; | 271 | struct rt_bandwidth rt_bandwidth; |
| 185 | #endif | 272 | #endif |
| 186 | 273 | ||
| 187 | struct rcu_head rcu; | 274 | struct rcu_head rcu; |
| 188 | struct list_head list; | 275 | struct list_head list; |
| 276 | |||
| 277 | struct task_group *parent; | ||
| 278 | struct list_head siblings; | ||
| 279 | struct list_head children; | ||
| 189 | }; | 280 | }; |
| 190 | 281 | ||
| 282 | #ifdef CONFIG_USER_SCHED | ||
| 283 | |||
| 284 | /* | ||
| 285 | * Root task group. | ||
| 286 | * Every UID task group (including init_task_group aka UID-0) will | ||
| 287 | * be a child to this group. | ||
| 288 | */ | ||
| 289 | struct task_group root_task_group; | ||
| 290 | |||
| 191 | #ifdef CONFIG_FAIR_GROUP_SCHED | 291 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 192 | /* Default task group's sched entity on each cpu */ | 292 | /* Default task group's sched entity on each cpu */ |
| 193 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 293 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
| 194 | /* Default task group's cfs_rq on each cpu */ | 294 | /* Default task group's cfs_rq on each cpu */ |
| 195 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 295 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
| 196 | |||
| 197 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; | ||
| 198 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; | ||
| 199 | #endif | 296 | #endif |
| 200 | 297 | ||
| 201 | #ifdef CONFIG_RT_GROUP_SCHED | 298 | #ifdef CONFIG_RT_GROUP_SCHED |
| 202 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 299 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
| 203 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 300 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; |
| 204 | 301 | #endif | |
| 205 | static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; | 302 | #else |
| 206 | static struct rt_rq *init_rt_rq_p[NR_CPUS]; | 303 | #define root_task_group init_task_group |
| 207 | #endif | 304 | #endif |
| 208 | 305 | ||
| 209 | /* task_group_lock serializes add/remove of task groups and also changes to | 306 | /* task_group_lock serializes add/remove of task groups and also changes to |
| @@ -221,23 +318,15 @@ static DEFINE_MUTEX(doms_cur_mutex); | |||
| 221 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 318 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
| 222 | #endif | 319 | #endif |
| 223 | 320 | ||
| 321 | #define MIN_SHARES 2 | ||
| 322 | |||
| 224 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | 323 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; |
| 225 | #endif | 324 | #endif |
| 226 | 325 | ||
| 227 | /* Default task group. | 326 | /* Default task group. |
| 228 | * Every task in system belong to this group at bootup. | 327 | * Every task in system belong to this group at bootup. |
| 229 | */ | 328 | */ |
| 230 | struct task_group init_task_group = { | 329 | struct task_group init_task_group; |
| 231 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 232 | .se = init_sched_entity_p, | ||
| 233 | .cfs_rq = init_cfs_rq_p, | ||
| 234 | #endif | ||
| 235 | |||
| 236 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 237 | .rt_se = init_sched_rt_entity_p, | ||
| 238 | .rt_rq = init_rt_rq_p, | ||
| 239 | #endif | ||
| 240 | }; | ||
| 241 | 330 | ||
| 242 | /* return group to which a task belongs */ | 331 | /* return group to which a task belongs */ |
| 243 | static inline struct task_group *task_group(struct task_struct *p) | 332 | static inline struct task_group *task_group(struct task_struct *p) |
| @@ -297,8 +386,12 @@ struct cfs_rq { | |||
| 297 | 386 | ||
| 298 | struct rb_root tasks_timeline; | 387 | struct rb_root tasks_timeline; |
| 299 | struct rb_node *rb_leftmost; | 388 | struct rb_node *rb_leftmost; |
| 300 | struct rb_node *rb_load_balance_curr; | 389 | |
| 301 | /* 'curr' points to currently running entity on this cfs_rq. | 390 | struct list_head tasks; |
| 391 | struct list_head *balance_iterator; | ||
| 392 | |||
| 393 | /* | ||
| 394 | * 'curr' points to currently running entity on this cfs_rq. | ||
| 302 | * It is set to NULL otherwise (i.e when none are currently running). | 395 | * It is set to NULL otherwise (i.e when none are currently running). |
| 303 | */ | 396 | */ |
| 304 | struct sched_entity *curr, *next; | 397 | struct sched_entity *curr, *next; |
| @@ -318,6 +411,43 @@ struct cfs_rq { | |||
| 318 | */ | 411 | */ |
| 319 | struct list_head leaf_cfs_rq_list; | 412 | struct list_head leaf_cfs_rq_list; |
| 320 | struct task_group *tg; /* group that "owns" this runqueue */ | 413 | struct task_group *tg; /* group that "owns" this runqueue */ |
| 414 | |||
| 415 | #ifdef CONFIG_SMP | ||
| 416 | unsigned long task_weight; | ||
| 417 | unsigned long shares; | ||
| 418 | /* | ||
| 419 | * We need space to build a sched_domain wide view of the full task | ||
| 420 | * group tree, in order to avoid depending on dynamic memory allocation | ||
| 421 | * during the load balancing we place this in the per cpu task group | ||
| 422 | * hierarchy. This limits the load balancing to one instance per cpu, | ||
| 423 | * but more should not be needed anyway. | ||
| 424 | */ | ||
| 425 | struct aggregate_struct { | ||
| 426 | /* | ||
| 427 | * load = weight(cpus) * f(tg) | ||
| 428 | * | ||
| 429 | * Where f(tg) is the recursive weight fraction assigned to | ||
| 430 | * this group. | ||
| 431 | */ | ||
| 432 | unsigned long load; | ||
| 433 | |||
| 434 | /* | ||
| 435 | * part of the group weight distributed to this span. | ||
| 436 | */ | ||
| 437 | unsigned long shares; | ||
| 438 | |||
| 439 | /* | ||
| 440 | * The sum of all runqueue weights within this span. | ||
| 441 | */ | ||
| 442 | unsigned long rq_weight; | ||
| 443 | |||
| 444 | /* | ||
| 445 | * Weight contributed by tasks; this is the part we can | ||
| 446 | * influence by moving tasks around. | ||
| 447 | */ | ||
| 448 | unsigned long task_weight; | ||
| 449 | } aggregate; | ||
| 450 | #endif | ||
| 321 | #endif | 451 | #endif |
| 322 | }; | 452 | }; |
| 323 | 453 | ||
| @@ -334,6 +464,9 @@ struct rt_rq { | |||
| 334 | #endif | 464 | #endif |
| 335 | int rt_throttled; | 465 | int rt_throttled; |
| 336 | u64 rt_time; | 466 | u64 rt_time; |
| 467 | u64 rt_runtime; | ||
| 468 | /* Nests inside the rq lock: */ | ||
| 469 | spinlock_t rt_runtime_lock; | ||
| 337 | 470 | ||
| 338 | #ifdef CONFIG_RT_GROUP_SCHED | 471 | #ifdef CONFIG_RT_GROUP_SCHED |
| 339 | unsigned long rt_nr_boosted; | 472 | unsigned long rt_nr_boosted; |
| @@ -396,6 +529,7 @@ struct rq { | |||
| 396 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 529 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
| 397 | unsigned char idle_at_tick; | 530 | unsigned char idle_at_tick; |
| 398 | #ifdef CONFIG_NO_HZ | 531 | #ifdef CONFIG_NO_HZ |
| 532 | unsigned long last_tick_seen; | ||
| 399 | unsigned char in_nohz_recently; | 533 | unsigned char in_nohz_recently; |
| 400 | #endif | 534 | #endif |
| 401 | /* capture load from *all* tasks on this cpu: */ | 535 | /* capture load from *all* tasks on this cpu: */ |
| @@ -405,8 +539,6 @@ struct rq { | |||
| 405 | 539 | ||
| 406 | struct cfs_rq cfs; | 540 | struct cfs_rq cfs; |
| 407 | struct rt_rq rt; | 541 | struct rt_rq rt; |
| 408 | u64 rt_period_expire; | ||
| 409 | int rt_throttled; | ||
| 410 | 542 | ||
| 411 | #ifdef CONFIG_FAIR_GROUP_SCHED | 543 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 412 | /* list of leaf cfs_rq on this cpu: */ | 544 | /* list of leaf cfs_rq on this cpu: */ |
| @@ -499,6 +631,32 @@ static inline int cpu_of(struct rq *rq) | |||
| 499 | #endif | 631 | #endif |
| 500 | } | 632 | } |
| 501 | 633 | ||
| 634 | #ifdef CONFIG_NO_HZ | ||
| 635 | static inline bool nohz_on(int cpu) | ||
| 636 | { | ||
| 637 | return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE; | ||
| 638 | } | ||
| 639 | |||
| 640 | static inline u64 max_skipped_ticks(struct rq *rq) | ||
| 641 | { | ||
| 642 | return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1; | ||
| 643 | } | ||
| 644 | |||
| 645 | static inline void update_last_tick_seen(struct rq *rq) | ||
| 646 | { | ||
| 647 | rq->last_tick_seen = jiffies; | ||
| 648 | } | ||
| 649 | #else | ||
| 650 | static inline u64 max_skipped_ticks(struct rq *rq) | ||
| 651 | { | ||
| 652 | return 1; | ||
| 653 | } | ||
| 654 | |||
| 655 | static inline void update_last_tick_seen(struct rq *rq) | ||
| 656 | { | ||
| 657 | } | ||
| 658 | #endif | ||
| 659 | |||
| 502 | /* | 660 | /* |
| 503 | * Update the per-runqueue clock, as finegrained as the platform can give | 661 | * Update the per-runqueue clock, as finegrained as the platform can give |
| 504 | * us, but without assuming monotonicity, etc.: | 662 | * us, but without assuming monotonicity, etc.: |
| @@ -523,9 +681,12 @@ static void __update_rq_clock(struct rq *rq) | |||
| 523 | /* | 681 | /* |
| 524 | * Catch too large forward jumps too: | 682 | * Catch too large forward jumps too: |
| 525 | */ | 683 | */ |
| 526 | if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { | 684 | u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC; |
| 527 | if (clock < rq->tick_timestamp + TICK_NSEC) | 685 | u64 max_time = rq->tick_timestamp + max_jump; |
| 528 | clock = rq->tick_timestamp + TICK_NSEC; | 686 | |
| 687 | if (unlikely(clock + delta > max_time)) { | ||
| 688 | if (clock < max_time) | ||
| 689 | clock = max_time; | ||
| 529 | else | 690 | else |
| 530 | clock++; | 691 | clock++; |
| 531 | rq->clock_overflows++; | 692 | rq->clock_overflows++; |
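The clamp above widens the maximum accepted forward clock jump from one tick to the number of ticks NO_HZ may legitimately have skipped (jiffies - last_tick_seen + 2). A minimal userspace model of just that clamp, with a made-up TICK_NSEC, no locking, and the normal "clock += delta" path folded into the return:

#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL		/* illustrative 1ms tick */

static uint64_t clamp_clock(uint64_t clock, uint64_t delta,
			    uint64_t tick_timestamp, uint64_t max_ticks)
{
	uint64_t max_time = tick_timestamp + max_ticks * TICK_NSEC;

	if (clock + delta > max_time)		/* too large a forward jump */
		return clock < max_time ? max_time : clock + 1;
	return clock + delta;
}

int main(void)
{
	/* a 10-tick jump is accepted when 12 ticks were skipped (+2 slack)... */
	printf("%llu\n", (unsigned long long)clamp_clock(0, 10 * TICK_NSEC, 0, 12 + 2));
	/* ...but clamped to a single tick when none were skipped */
	printf("%llu\n", (unsigned long long)clamp_clock(0, 10 * TICK_NSEC, 0, 1));
	return 0;
}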
| @@ -561,23 +722,6 @@ static void update_rq_clock(struct rq *rq) | |||
| 561 | #define task_rq(p) cpu_rq(task_cpu(p)) | 722 | #define task_rq(p) cpu_rq(task_cpu(p)) |
| 562 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 723 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
| 563 | 724 | ||
| 564 | unsigned long rt_needs_cpu(int cpu) | ||
| 565 | { | ||
| 566 | struct rq *rq = cpu_rq(cpu); | ||
| 567 | u64 delta; | ||
| 568 | |||
| 569 | if (!rq->rt_throttled) | ||
| 570 | return 0; | ||
| 571 | |||
| 572 | if (rq->clock > rq->rt_period_expire) | ||
| 573 | return 1; | ||
| 574 | |||
| 575 | delta = rq->rt_period_expire - rq->clock; | ||
| 576 | do_div(delta, NSEC_PER_SEC / HZ); | ||
| 577 | |||
| 578 | return (unsigned long)delta; | ||
| 579 | } | ||
| 580 | |||
| 581 | /* | 725 | /* |
| 582 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | 726 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: |
| 583 | */ | 727 | */ |
| @@ -590,22 +734,137 @@ unsigned long rt_needs_cpu(int cpu) | |||
| 590 | /* | 734 | /* |
| 591 | * Debugging: various feature bits | 735 | * Debugging: various feature bits |
| 592 | */ | 736 | */ |
| 737 | |||
| 738 | #define SCHED_FEAT(name, enabled) \ | ||
| 739 | __SCHED_FEAT_##name , | ||
| 740 | |||
| 593 | enum { | 741 | enum { |
| 594 | SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, | 742 | #include "sched_features.h" |
| 595 | SCHED_FEAT_WAKEUP_PREEMPT = 2, | ||
| 596 | SCHED_FEAT_START_DEBIT = 4, | ||
| 597 | SCHED_FEAT_HRTICK = 8, | ||
| 598 | SCHED_FEAT_DOUBLE_TICK = 16, | ||
| 599 | }; | 743 | }; |
| 600 | 744 | ||
| 745 | #undef SCHED_FEAT | ||
| 746 | |||
| 747 | #define SCHED_FEAT(name, enabled) \ | ||
| 748 | (1UL << __SCHED_FEAT_##name) * enabled | | ||
| 749 | |||
| 601 | const_debug unsigned int sysctl_sched_features = | 750 | const_debug unsigned int sysctl_sched_features = |
| 602 | SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | | 751 | #include "sched_features.h" |
| 603 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | | 752 | 0; |
| 604 | SCHED_FEAT_START_DEBIT * 1 | | 753 | |
| 605 | SCHED_FEAT_HRTICK * 1 | | 754 | #undef SCHED_FEAT |
| 606 | SCHED_FEAT_DOUBLE_TICK * 0; | 755 | |
| 756 | #ifdef CONFIG_SCHED_DEBUG | ||
| 757 | #define SCHED_FEAT(name, enabled) \ | ||
| 758 | #name , | ||
| 759 | |||
| 760 | __read_mostly char *sched_feat_names[] = { | ||
| 761 | #include "sched_features.h" | ||
| 762 | NULL | ||
| 763 | }; | ||
| 764 | |||
| 765 | #undef SCHED_FEAT | ||
| 766 | |||
| 767 | int sched_feat_open(struct inode *inode, struct file *filp) | ||
| 768 | { | ||
| 769 | filp->private_data = inode->i_private; | ||
| 770 | return 0; | ||
| 771 | } | ||
| 772 | |||
| 773 | static ssize_t | ||
| 774 | sched_feat_read(struct file *filp, char __user *ubuf, | ||
| 775 | size_t cnt, loff_t *ppos) | ||
| 776 | { | ||
| 777 | char *buf; | ||
| 778 | int r = 0; | ||
| 779 | int len = 0; | ||
| 780 | int i; | ||
| 607 | 781 | ||
| 608 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) | 782 | for (i = 0; sched_feat_names[i]; i++) { |
| 783 | len += strlen(sched_feat_names[i]); | ||
| 784 | len += 4; | ||
| 785 | } | ||
| 786 | |||
| 787 | buf = kmalloc(len + 2, GFP_KERNEL); | ||
| 788 | if (!buf) | ||
| 789 | return -ENOMEM; | ||
| 790 | |||
| 791 | for (i = 0; sched_feat_names[i]; i++) { | ||
| 792 | if (sysctl_sched_features & (1UL << i)) | ||
| 793 | r += sprintf(buf + r, "%s ", sched_feat_names[i]); | ||
| 794 | else | ||
| 795 | r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); | ||
| 796 | } | ||
| 797 | |||
| 798 | r += sprintf(buf + r, "\n"); | ||
| 799 | WARN_ON(r >= len + 2); | ||
| 800 | |||
| 801 | r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
| 802 | |||
| 803 | kfree(buf); | ||
| 804 | |||
| 805 | return r; | ||
| 806 | } | ||
| 807 | |||
| 808 | static ssize_t | ||
| 809 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
| 810 | size_t cnt, loff_t *ppos) | ||
| 811 | { | ||
| 812 | char buf[64]; | ||
| 813 | char *cmp = buf; | ||
| 814 | int neg = 0; | ||
| 815 | int i; | ||
| 816 | |||
| 817 | if (cnt > 63) | ||
| 818 | cnt = 63; | ||
| 819 | |||
| 820 | if (copy_from_user(&buf, ubuf, cnt)) | ||
| 821 | return -EFAULT; | ||
| 822 | |||
| 823 | buf[cnt] = 0; | ||
| 824 | |||
| 825 | if (strncmp(buf, "NO_", 3) == 0) { | ||
| 826 | neg = 1; | ||
| 827 | cmp += 3; | ||
| 828 | } | ||
| 829 | |||
| 830 | for (i = 0; sched_feat_names[i]; i++) { | ||
| 831 | int len = strlen(sched_feat_names[i]); | ||
| 832 | |||
| 833 | if (strncmp(cmp, sched_feat_names[i], len) == 0) { | ||
| 834 | if (neg) | ||
| 835 | sysctl_sched_features &= ~(1UL << i); | ||
| 836 | else | ||
| 837 | sysctl_sched_features |= (1UL << i); | ||
| 838 | break; | ||
| 839 | } | ||
| 840 | } | ||
| 841 | |||
| 842 | if (!sched_feat_names[i]) | ||
| 843 | return -EINVAL; | ||
| 844 | |||
| 845 | filp->f_pos += cnt; | ||
| 846 | |||
| 847 | return cnt; | ||
| 848 | } | ||
| 849 | |||
| 850 | static struct file_operations sched_feat_fops = { | ||
| 851 | .open = sched_feat_open, | ||
| 852 | .read = sched_feat_read, | ||
| 853 | .write = sched_feat_write, | ||
| 854 | }; | ||
| 855 | |||
| 856 | static __init int sched_init_debug(void) | ||
| 857 | { | ||
| 858 | debugfs_create_file("sched_features", 0644, NULL, NULL, | ||
| 859 | &sched_feat_fops); | ||
| 860 | |||
| 861 | return 0; | ||
| 862 | } | ||
| 863 | late_initcall(sched_init_debug); | ||
| 864 | |||
| 865 | #endif | ||
| 866 | |||
| 867 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | ||
| 609 | 868 | ||
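The block above generates the feature-bit enum, the default sysctl_sched_features mask and (under CONFIG_SCHED_DEBUG) the name table from one list by re-including sched_features.h with three different SCHED_FEAT() definitions. That header is not part of this hunk, so the sketch below folds an assumed list (taken from the enum this hunk removes) into a single macro to show the same trick in a self-contained, compilable form:

#include <stdio.h>

#define SCHED_FEATURES(F)		\
	F(NEW_FAIR_SLEEPERS, 1)		\
	F(WAKEUP_PREEMPT, 1)		\
	F(START_DEBIT, 1)		\
	F(HRTICK, 1)			\
	F(DOUBLE_TICK, 0)

/* 1st expansion: bit indices */
enum {
#define F(name, enabled) __FEAT_##name,
	SCHED_FEATURES(F)
#undef F
};

/* 2nd expansion: default feature bitmask */
static unsigned int features =
#define F(name, enabled) (1U << __FEAT_##name) * (enabled) |
	SCHED_FEATURES(F)
#undef F
	0;

/* 3rd expansion: name table, as used by the debugfs dump */
static const char *feat_names[] = {
#define F(name, enabled) #name,
	SCHED_FEATURES(F)
#undef F
	NULL
};

int main(void)
{
	int i;

	for (i = 0; feat_names[i]; i++)
		printf("%s%s ", (features & (1U << i)) ? "" : "NO_", feat_names[i]);
	printf("\n");
	return 0;
}

With the debugfs file registered by sched_init_debug(), individual bits can presumably be flipped at run time with something like "echo NO_HRTICK > /sys/kernel/debug/sched_features" (assuming debugfs is mounted there); the NO_ prefix handling in sched_feat_write() exists for exactly that.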
| 610 | /* | 869 | /* |
| 611 | * Number of tasks to iterate in a single balance run. | 870 | * Number of tasks to iterate in a single balance run. |
| @@ -627,16 +886,52 @@ static __read_mostly int scheduler_running; | |||
| 627 | */ | 886 | */ |
| 628 | int sysctl_sched_rt_runtime = 950000; | 887 | int sysctl_sched_rt_runtime = 950000; |
| 629 | 888 | ||
| 630 | /* | 889 | static inline u64 global_rt_period(void) |
| 631 | * single value that denotes runtime == period, ie unlimited time. | 890 | { |
| 632 | */ | 891 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; |
| 633 | #define RUNTIME_INF ((u64)~0ULL) | 892 | } |
| 893 | |||
| 894 | static inline u64 global_rt_runtime(void) | ||
| 895 | { | ||
| 896 | if (sysctl_sched_rt_runtime < 0) | ||
| 897 | return RUNTIME_INF; | ||
| 898 | |||
| 899 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | ||
| 900 | } | ||
| 901 | |||
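For scale: the 950000us runtime default above, taken against the usual 1s rt period (the period default is not visible in this hunk, so treat it as an assumption), caps realtime tasks at 95% of each period; a negative runtime maps to RUNTIME_INF, i.e. no cap. A quick arithmetic check:

#include <stdio.h>

#define NSEC_PER_USEC 1000LL

int main(void)
{
	long long rt_period_us  = 1000000;	/* assumed default period */
	long long rt_runtime_us = 950000;	/* sysctl_sched_rt_runtime above */

	printf("%lld ns of rt runtime per %lld ns period (%lld%%)\n",
	       rt_runtime_us * NSEC_PER_USEC, rt_period_us * NSEC_PER_USEC,
	       rt_runtime_us * 100 / rt_period_us);
	return 0;
}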
| 902 | static const unsigned long long time_sync_thresh = 100000; | ||
| 903 | |||
| 904 | static DEFINE_PER_CPU(unsigned long long, time_offset); | ||
| 905 | static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); | ||
| 634 | 906 | ||
| 635 | /* | 907 | /* |
| 636 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 908 | * Global lock which we take every now and then to synchronize |
| 637 | * clock constructed from sched_clock(): | 909 | * the CPUs' time. This method is not warp-safe, but it's good |
| 910 | * enough to synchronize slowly diverging time sources and thus | ||
| 911 | * it's good enough for tracing: | ||
| 638 | */ | 912 | */ |
| 639 | unsigned long long cpu_clock(int cpu) | 913 | static DEFINE_SPINLOCK(time_sync_lock); |
| 914 | static unsigned long long prev_global_time; | ||
| 915 | |||
| 916 | static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) | ||
| 917 | { | ||
| 918 | unsigned long flags; | ||
| 919 | |||
| 920 | spin_lock_irqsave(&time_sync_lock, flags); | ||
| 921 | |||
| 922 | if (time < prev_global_time) { | ||
| 923 | per_cpu(time_offset, cpu) += prev_global_time - time; | ||
| 924 | time = prev_global_time; | ||
| 925 | } else { | ||
| 926 | prev_global_time = time; | ||
| 927 | } | ||
| 928 | |||
| 929 | spin_unlock_irqrestore(&time_sync_lock, flags); | ||
| 930 | |||
| 931 | return time; | ||
| 932 | } | ||
| 933 | |||
| 934 | static unsigned long long __cpu_clock(int cpu) | ||
| 640 | { | 935 | { |
| 641 | unsigned long long now; | 936 | unsigned long long now; |
| 642 | unsigned long flags; | 937 | unsigned long flags; |
| @@ -657,6 +952,24 @@ unsigned long long cpu_clock(int cpu) | |||
| 657 | 952 | ||
| 658 | return now; | 953 | return now; |
| 659 | } | 954 | } |
| 955 | |||
| 956 | /* | ||
| 957 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | ||
| 958 | * clock constructed from sched_clock(): | ||
| 959 | */ | ||
| 960 | unsigned long long cpu_clock(int cpu) | ||
| 961 | { | ||
| 962 | unsigned long long prev_cpu_time, time, delta_time; | ||
| 963 | |||
| 964 | prev_cpu_time = per_cpu(prev_cpu_time, cpu); | ||
| 965 | time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); | ||
| 966 | delta_time = time-prev_cpu_time; | ||
| 967 | |||
| 968 | if (unlikely(delta_time > time_sync_thresh)) | ||
| 969 | time = __sync_cpu_clock(time, cpu); | ||
| 970 | |||
| 971 | return time; | ||
| 972 | } | ||
| 660 | EXPORT_SYMBOL_GPL(cpu_clock); | 973 | EXPORT_SYMBOL_GPL(cpu_clock); |
| 661 | 974 | ||
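The per-cpu offset plus global high-water mark above keeps cpu_clock() values from different CPUs roughly in step: a lagging CPU is pushed forward and remembers the correction in time_offset, a leading CPU raises prev_global_time. A single-threaded userspace model of that rule (assumption: plain variables stand in for the irq-safe spinlock, two fake CPUs, arbitrary numbers):

#include <stdio.h>

#define NCPUS 2
static unsigned long long time_offset[NCPUS];
static unsigned long long prev_global;

static unsigned long long sync_clock(unsigned long long t, int cpu)
{
	if (t < prev_global) {		/* this cpu lags: push it forward */
		time_offset[cpu] += prev_global - t;
		t = prev_global;
	} else {			/* this cpu leads: raise the mark */
		prev_global = t;
	}
	return t;
}

int main(void)
{
	/* cpu0's clock reads 1000; cpu1's only reads 400 */
	printf("%llu\n", sync_clock(1000 + time_offset[0], 0));	/* 1000 */
	printf("%llu\n", sync_clock(400 + time_offset[1], 1));	/* 1000, offset[1] now 600 */
	return 0;
}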
| 662 | #ifndef prepare_arch_switch | 975 | #ifndef prepare_arch_switch |
| @@ -1116,6 +1429,9 @@ static void __resched_task(struct task_struct *p, int tif_bit) | |||
| 1116 | */ | 1429 | */ |
| 1117 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | 1430 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) |
| 1118 | 1431 | ||
| 1432 | /* | ||
| 1433 | * delta *= weight / lw | ||
| 1434 | */ | ||
| 1119 | static unsigned long | 1435 | static unsigned long |
| 1120 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | 1436 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, |
| 1121 | struct load_weight *lw) | 1437 | struct load_weight *lw) |
| @@ -1138,12 +1454,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
| 1138 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | 1454 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); |
| 1139 | } | 1455 | } |
| 1140 | 1456 | ||
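SRR() in the context above is "shift right with rounding to nearest", which calc_delta_mine() relies on when scaling delta by weight/lw in fixed point. A tiny demo of the rounding behaviour:

#include <stdio.h>

#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))

int main(void)
{
	/* 1000/8 = 125 exactly; 1001/8 = 125.125 -> 125; 1004/8 = 125.5 -> 126 */
	printf("%lu %lu %lu\n", SRR(1000UL, 3), SRR(1001UL, 3), SRR(1004UL, 3));
	return 0;
}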
| 1141 | static inline unsigned long | ||
| 1142 | calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) | ||
| 1143 | { | ||
| 1144 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); | ||
| 1145 | } | ||
| 1146 | |||
| 1147 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 1457 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
| 1148 | { | 1458 | { |
| 1149 | lw->weight += inc; | 1459 | lw->weight += inc; |
| @@ -1241,11 +1551,390 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | |||
| 1241 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | 1551 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} |
| 1242 | #endif | 1552 | #endif |
| 1243 | 1553 | ||
| 1554 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | ||
| 1555 | { | ||
| 1556 | update_load_add(&rq->load, load); | ||
| 1557 | } | ||
| 1558 | |||
| 1559 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | ||
| 1560 | { | ||
| 1561 | update_load_sub(&rq->load, load); | ||
| 1562 | } | ||
| 1563 | |||
| 1244 | #ifdef CONFIG_SMP | 1564 | #ifdef CONFIG_SMP |
| 1245 | static unsigned long source_load(int cpu, int type); | 1565 | static unsigned long source_load(int cpu, int type); |
| 1246 | static unsigned long target_load(int cpu, int type); | 1566 | static unsigned long target_load(int cpu, int type); |
| 1247 | static unsigned long cpu_avg_load_per_task(int cpu); | 1567 | static unsigned long cpu_avg_load_per_task(int cpu); |
| 1248 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | 1568 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); |
| 1569 | |||
| 1570 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1571 | |||
| 1572 | /* | ||
| 1573 | * Group load balancing. | ||
| 1574 | * | ||
| 1575 | * We calculate a few balance-domain-wide aggregate numbers: load and weight. | ||
| 1576 | * Given the pictures below, and assuming each item has equal weight: | ||
| 1577 | * | ||
| 1578 | * root 1 - thread | ||
| 1579 | * / | \ A - group | ||
| 1580 | * A 1 B | ||
| 1581 | * /|\ / \ | ||
| 1582 | * C 2 D 3 4 | ||
| 1583 | * | | | ||
| 1584 | * 5 6 | ||
| 1585 | * | ||
| 1586 | * load: | ||
| 1587 | * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, | ||
| 1588 | * which equals 1/9-th of the total load. | ||
| 1589 | * | ||
| 1590 | * shares: | ||
| 1591 | * The weight of this group on the selected cpus. | ||
| 1592 | * | ||
| 1593 | * rq_weight: | ||
| 1594 | * Direct sum of all the cpus' rq weights, e.g. A would get 3 while | ||
| 1595 | * B would get 2. | ||
| 1596 | * | ||
| 1597 | * task_weight: | ||
| 1598 | * Part of the rq_weight contributed by tasks; all groups except B would | ||
| 1599 | * get 1, B gets 2. | ||
| 1600 | */ | ||
| 1601 | |||
| 1602 | static inline struct aggregate_struct * | ||
| 1603 | aggregate(struct task_group *tg, struct sched_domain *sd) | ||
| 1604 | { | ||
| 1605 | return &tg->cfs_rq[sd->first_cpu]->aggregate; | ||
| 1606 | } | ||
| 1607 | |||
| 1608 | typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); | ||
| 1609 | |||
| 1610 | /* | ||
| 1611 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
| 1612 | * leaving it for the final time. | ||
| 1613 | */ | ||
| 1614 | static | ||
| 1615 | void aggregate_walk_tree(aggregate_func down, aggregate_func up, | ||
| 1616 | struct sched_domain *sd) | ||
| 1617 | { | ||
| 1618 | struct task_group *parent, *child; | ||
| 1619 | |||
| 1620 | rcu_read_lock(); | ||
| 1621 | parent = &root_task_group; | ||
| 1622 | down: | ||
| 1623 | (*down)(parent, sd); | ||
| 1624 | list_for_each_entry_rcu(child, &parent->children, siblings) { | ||
| 1625 | parent = child; | ||
| 1626 | goto down; | ||
| 1627 | |||
| 1628 | up: | ||
| 1629 | continue; | ||
| 1630 | } | ||
| 1631 | (*up)(parent, sd); | ||
| 1632 | |||
| 1633 | child = parent; | ||
| 1634 | parent = parent->parent; | ||
| 1635 | if (parent) | ||
| 1636 | goto up; | ||
| 1637 | rcu_read_unlock(); | ||
| 1638 | } | ||
| 1639 | |||
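The goto-based loop above is an iterative pre/post-order walk over the task_group tree under RCU, calling down() on first entry to a node and up() when leaving it for the final time, without consuming kernel stack on recursion. For readability, the equivalent recursive shape (hypothetical node type, illustration only) is:

#include <stdio.h>
#include <stddef.h>

struct node {
	const char *name;
	struct node *child;		/* first child */
	struct node *sibling;		/* next sibling */
};

static void down(struct node *n) { printf("down %s\n", n->name); }
static void up(struct node *n)   { printf("up   %s\n", n->name); }

static void walk(struct node *n)
{
	struct node *c;

	down(n);			/* first time we enter the node */
	for (c = n->child; c; c = c->sibling)
		walk(c);
	up(n);				/* leaving it for the final time */
}

int main(void)
{
	struct node leaf = { "C", NULL, NULL };
	struct node a    = { "A", &leaf, NULL };
	struct node root = { "root", &a, NULL };

	walk(&root);	/* down root, down A, down C, up C, up A, up root */
	return 0;
}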
| 1640 | /* | ||
| 1641 | * Calculate the aggregate runqueue weight. | ||
| 1642 | */ | ||
| 1643 | static | ||
| 1644 | void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) | ||
| 1645 | { | ||
| 1646 | unsigned long rq_weight = 0; | ||
| 1647 | unsigned long task_weight = 0; | ||
| 1648 | int i; | ||
| 1649 | |||
| 1650 | for_each_cpu_mask(i, sd->span) { | ||
| 1651 | rq_weight += tg->cfs_rq[i]->load.weight; | ||
| 1652 | task_weight += tg->cfs_rq[i]->task_weight; | ||
| 1653 | } | ||
| 1654 | |||
| 1655 | aggregate(tg, sd)->rq_weight = rq_weight; | ||
| 1656 | aggregate(tg, sd)->task_weight = task_weight; | ||
| 1657 | } | ||
| 1658 | |||
| 1659 | /* | ||
| 1660 | * Redistribute tg->shares amongst all tg->cfs_rq[]s. | ||
| 1661 | */ | ||
| 1662 | static void __aggregate_redistribute_shares(struct task_group *tg) | ||
| 1663 | { | ||
| 1664 | int i, max_cpu = smp_processor_id(); | ||
| 1665 | unsigned long rq_weight = 0; | ||
| 1666 | unsigned long shares, max_shares = 0, shares_rem = tg->shares; | ||
| 1667 | |||
| 1668 | for_each_possible_cpu(i) | ||
| 1669 | rq_weight += tg->cfs_rq[i]->load.weight; | ||
| 1670 | |||
| 1671 | for_each_possible_cpu(i) { | ||
| 1672 | /* | ||
| 1673 | * divide shares proportional to the rq_weights. | ||
| 1674 | */ | ||
| 1675 | shares = tg->shares * tg->cfs_rq[i]->load.weight; | ||
| 1676 | shares /= rq_weight + 1; | ||
| 1677 | |||
| 1678 | tg->cfs_rq[i]->shares = shares; | ||
| 1679 | |||
| 1680 | if (shares > max_shares) { | ||
| 1681 | max_shares = shares; | ||
| 1682 | max_cpu = i; | ||
| 1683 | } | ||
| 1684 | shares_rem -= shares; | ||
| 1685 | } | ||
| 1686 | |||
| 1687 | /* | ||
| 1688 | * Ensure it all adds up to tg->shares; we can lose a few | ||
| 1689 | * due to rounding down when computing the per-cpu shares. | ||
| 1690 | */ | ||
| 1691 | if (shares_rem) | ||
| 1692 | tg->cfs_rq[max_cpu]->shares += shares_rem; | ||
| 1693 | } | ||
| 1694 | |||
| 1695 | /* | ||
| 1696 | * Compute the weight of this group on the given cpus. | ||
| 1697 | */ | ||
| 1698 | static | ||
| 1699 | void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) | ||
| 1700 | { | ||
| 1701 | unsigned long shares = 0; | ||
| 1702 | int i; | ||
| 1703 | |||
| 1704 | again: | ||
| 1705 | for_each_cpu_mask(i, sd->span) | ||
| 1706 | shares += tg->cfs_rq[i]->shares; | ||
| 1707 | |||
| 1708 | /* | ||
| 1709 | * When the span doesn't have any shares assigned, but does have | ||
| 1710 | * tasks to run, do a machine-wide rebalance (should be rare). | ||
| 1711 | */ | ||
| 1712 | if (unlikely(!shares && aggregate(tg, sd)->rq_weight)) { | ||
| 1713 | __aggregate_redistribute_shares(tg); | ||
| 1714 | goto again; | ||
| 1715 | } | ||
| 1716 | |||
| 1717 | aggregate(tg, sd)->shares = shares; | ||
| 1718 | } | ||
| 1719 | |||
| 1720 | /* | ||
| 1721 | * Compute the load fraction assigned to this group, relies on the aggregate | ||
| 1722 | * weight and this group's parent's load, i.e. top-down. | ||
| 1723 | */ | ||
| 1724 | static | ||
| 1725 | void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) | ||
| 1726 | { | ||
| 1727 | unsigned long load; | ||
| 1728 | |||
| 1729 | if (!tg->parent) { | ||
| 1730 | int i; | ||
| 1731 | |||
| 1732 | load = 0; | ||
| 1733 | for_each_cpu_mask(i, sd->span) | ||
| 1734 | load += cpu_rq(i)->load.weight; | ||
| 1735 | |||
| 1736 | } else { | ||
| 1737 | load = aggregate(tg->parent, sd)->load; | ||
| 1738 | |||
| 1739 | /* | ||
| 1740 | * shares is our weight in the parent's rq so | ||
| 1741 | * shares/parent->rq_weight gives our fraction of the load | ||
| 1742 | */ | ||
| 1743 | load *= aggregate(tg, sd)->shares; | ||
| 1744 | load /= aggregate(tg->parent, sd)->rq_weight + 1; | ||
| 1745 | } | ||
| 1746 | |||
| 1747 | aggregate(tg, sd)->load = load; | ||
| 1748 | } | ||
| 1749 | |||
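Plugging the example tree from the big comment block into aggregate_group_load(): with equal nice-0 weights the shares/rq_weight ratios reduce to entity counts, so A ends up with 1/3 of the root load and C with 1/9 (the real code also adds +1 to the divisor, purely as a divide-by-zero guard). With an assumed total load of 900 on the span:

#include <stdio.h>

/* load = parent_load * shares / parent_rq_weight */
static unsigned long group_load(unsigned long parent_load,
				unsigned long shares,
				unsigned long parent_rq_weight)
{
	return parent_load * shares / parent_rq_weight;
}

int main(void)
{
	unsigned long root_load = 900;		/* assumed total load on the span */

	/* A is 1 of 3 equal entities on the root rqs -> 1/3 of the load */
	unsigned long a_load = group_load(root_load, 1, 3);
	/* C is 1 of 3 equal entities on A's rqs -> 1/9 of the total */
	unsigned long c_load = group_load(a_load, 1, 3);

	printf("A: %lu, C: %lu\n", a_load, c_load);	/* A: 300, C: 100 */
	return 0;
}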
| 1750 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
| 1751 | |||
| 1752 | /* | ||
| 1753 | * Calculate and set the cpu's group shares. | ||
| 1754 | */ | ||
| 1755 | static void | ||
| 1756 | __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, | ||
| 1757 | int tcpu) | ||
| 1758 | { | ||
| 1759 | int boost = 0; | ||
| 1760 | unsigned long shares; | ||
| 1761 | unsigned long rq_weight; | ||
| 1762 | |||
| 1763 | if (!tg->se[tcpu]) | ||
| 1764 | return; | ||
| 1765 | |||
| 1766 | rq_weight = tg->cfs_rq[tcpu]->load.weight; | ||
| 1767 | |||
| 1768 | /* | ||
| 1769 | * If there are currently no tasks on the cpu, pretend there is one of | ||
| 1770 | * average load so that when a new task gets to run here it will not | ||
| 1771 | * get delayed by group starvation. | ||
| 1772 | */ | ||
| 1773 | if (!rq_weight) { | ||
| 1774 | boost = 1; | ||
| 1775 | rq_weight = NICE_0_LOAD; | ||
| 1776 | } | ||
| 1777 | |||
| 1778 | /* | ||
| 1779 | * \Sum shares * rq_weight | ||
| 1780 | * shares = ----------------------- | ||
| 1781 | * \Sum rq_weight | ||
| 1782 | * | ||
| 1783 | */ | ||
| 1784 | shares = aggregate(tg, sd)->shares * rq_weight; | ||
| 1785 | shares /= aggregate(tg, sd)->rq_weight + 1; | ||
| 1786 | |||
| 1787 | /* | ||
| 1788 | * record the actual number of shares, not the boosted amount. | ||
| 1789 | */ | ||
| 1790 | tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; | ||
| 1791 | |||
| 1792 | if (shares < MIN_SHARES) | ||
| 1793 | shares = MIN_SHARES; | ||
| 1794 | |||
| 1795 | __set_se_shares(tg->se[tcpu], shares); | ||
| 1796 | } | ||
| 1797 | |||
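The \Sum formula above splits the group's aggregate shares across the span in proportion to each cpu's group-rq weight; MIN_SHARES and the empty-rq boost are refinements on top of that ratio. With assumed numbers (1024 shares spread over rq weights 2048 and 1024, and the +1 guard left out for round figures):

#include <stdio.h>

int main(void)
{
	unsigned long tg_shares = 1024;			/* aggregate shares on this span */
	unsigned long rq_weight[2] = { 2048, 1024 };	/* per-cpu group-rq weights */
	unsigned long total = rq_weight[0] + rq_weight[1];
	int i;

	for (i = 0; i < 2; i++)
		printf("cpu%d shares = %lu\n", i,
		       tg_shares * rq_weight[i] / total);	/* 682 and 341 */
	return 0;
}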
| 1798 | /* | ||
| 1799 | * Re-adjust the weights on the cpu the task came from and on the cpu the | ||
| 1800 | * task went to. | ||
| 1801 | */ | ||
| 1802 | static void | ||
| 1803 | __move_group_shares(struct task_group *tg, struct sched_domain *sd, | ||
| 1804 | int scpu, int dcpu) | ||
| 1805 | { | ||
| 1806 | unsigned long shares; | ||
| 1807 | |||
| 1808 | shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; | ||
| 1809 | |||
| 1810 | __update_group_shares_cpu(tg, sd, scpu); | ||
| 1811 | __update_group_shares_cpu(tg, sd, dcpu); | ||
| 1812 | |||
| 1813 | /* | ||
| 1814 | * ensure we never lose shares due to rounding errors in the | ||
| 1815 | * above redistribution. | ||
| 1816 | */ | ||
| 1817 | shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; | ||
| 1818 | if (shares) | ||
| 1819 | tg->cfs_rq[dcpu]->shares += shares; | ||
| 1820 | } | ||
| 1821 | |||
| 1822 | /* | ||
| 1823 | * Because changing a group's shares changes the weight of the super-group, | ||
| 1824 | * we need to walk up the tree and change all shares until we hit the root. | ||
| 1825 | */ | ||
| 1826 | static void | ||
| 1827 | move_group_shares(struct task_group *tg, struct sched_domain *sd, | ||
| 1828 | int scpu, int dcpu) | ||
| 1829 | { | ||
| 1830 | while (tg) { | ||
| 1831 | __move_group_shares(tg, sd, scpu, dcpu); | ||
| 1832 | tg = tg->parent; | ||
| 1833 | } | ||
| 1834 | } | ||
| 1835 | |||
| 1836 | static | ||
| 1837 | void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) | ||
| 1838 | { | ||
| 1839 | unsigned long shares = aggregate(tg, sd)->shares; | ||
| 1840 | int i; | ||
| 1841 | |||
| 1842 | for_each_cpu_mask(i, sd->span) { | ||
| 1843 | struct rq *rq = cpu_rq(i); | ||
| 1844 | unsigned long flags; | ||
| 1845 | |||
| 1846 | spin_lock_irqsave(&rq->lock, flags); | ||
| 1847 | __update_group_shares_cpu(tg, sd, i); | ||
| 1848 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 1849 | } | ||
| 1850 | |||
| 1851 | aggregate_group_shares(tg, sd); | ||
| 1852 | |||
| 1853 | /* | ||
| 1854 | * ensure we never lose shares due to rounding errors in the | ||
| 1855 | * above redistribution. | ||
| 1856 | */ | ||
| 1857 | shares -= aggregate(tg, sd)->shares; | ||
| 1858 | if (shares) { | ||
| 1859 | tg->cfs_rq[sd->first_cpu]->shares += shares; | ||
| 1860 | aggregate(tg, sd)->shares += shares; | ||
| 1861 | } | ||
| 1862 | } | ||
| 1863 | |||
| 1864 | /* | ||
| 1865 | * Calculate the accumulative weight and recursive load of each task group | ||
| 1866 | * while walking down the tree. | ||
| 1867 | */ | ||
| 1868 | static | ||
| 1869 | void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) | ||
| 1870 | { | ||
| 1871 | aggregate_group_weight(tg, sd); | ||
| 1872 | aggregate_group_shares(tg, sd); | ||
| 1873 | aggregate_group_load(tg, sd); | ||
| 1874 | } | ||
| 1875 | |||
| 1876 | /* | ||
| 1877 | * Rebalance the cpu shares while walking back up the tree. | ||
| 1878 | */ | ||
| 1879 | static | ||
| 1880 | void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) | ||
| 1881 | { | ||
| 1882 | aggregate_group_set_shares(tg, sd); | ||
| 1883 | } | ||
| 1884 | |||
| 1885 | static DEFINE_PER_CPU(spinlock_t, aggregate_lock); | ||
| 1886 | |||
| 1887 | static void __init init_aggregate(void) | ||
| 1888 | { | ||
| 1889 | int i; | ||
| 1890 | |||
| 1891 | for_each_possible_cpu(i) | ||
| 1892 | spin_lock_init(&per_cpu(aggregate_lock, i)); | ||
| 1893 | } | ||
| 1894 | |||
| 1895 | static int get_aggregate(struct sched_domain *sd) | ||
| 1896 | { | ||
| 1897 | if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) | ||
| 1898 | return 0; | ||
| 1899 | |||
| 1900 | aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); | ||
| 1901 | return 1; | ||
| 1902 | } | ||
| 1903 | |||
| 1904 | static void put_aggregate(struct sched_domain *sd) | ||
| 1905 | { | ||
| 1906 | spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); | ||
| 1907 | } | ||
| 1908 | |||
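get_aggregate()/put_aggregate() implement a trylock-or-skip scheme: the first balancer on a domain recomputes the aggregates, and concurrent ones simply proceed with the existing values rather than wait. The same pattern in a userspace sketch (a pthread mutex stands in for the per-cpu spinlock):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t aggregate_lock = PTHREAD_MUTEX_INITIALIZER;

static int get_aggregate(void)
{
	if (pthread_mutex_trylock(&aggregate_lock) != 0)
		return 0;		/* someone else is already updating */
	/* ... recompute the per-domain aggregates here ... */
	return 1;
}

static void put_aggregate(void)
{
	pthread_mutex_unlock(&aggregate_lock);
}

int main(void)
{
	int locked = get_aggregate();	/* 1: we own the update */

	printf("updated aggregates: %d\n", locked);
	if (locked)
		put_aggregate();
	return 0;
}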
| 1909 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | ||
| 1910 | { | ||
| 1911 | cfs_rq->shares = shares; | ||
| 1912 | } | ||
| 1913 | |||
| 1914 | #else | ||
| 1915 | |||
| 1916 | static inline void init_aggregate(void) | ||
| 1917 | { | ||
| 1918 | } | ||
| 1919 | |||
| 1920 | static inline int get_aggregate(struct sched_domain *sd) | ||
| 1921 | { | ||
| 1922 | return 0; | ||
| 1923 | } | ||
| 1924 | |||
| 1925 | static inline void put_aggregate(struct sched_domain *sd) | ||
| 1926 | { | ||
| 1927 | } | ||
| 1928 | #endif | ||
| 1929 | |||
| 1930 | #else /* CONFIG_SMP */ | ||
| 1931 | |||
| 1932 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1933 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | ||
| 1934 | { | ||
| 1935 | } | ||
| 1936 | #endif | ||
| 1937 | |||
| 1249 | #endif /* CONFIG_SMP */ | 1938 | #endif /* CONFIG_SMP */ |
| 1250 | 1939 | ||
| 1251 | #include "sched_stats.h" | 1940 | #include "sched_stats.h" |
| @@ -1258,26 +1947,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | |||
| 1258 | 1947 | ||
| 1259 | #define sched_class_highest (&rt_sched_class) | 1948 | #define sched_class_highest (&rt_sched_class) |
| 1260 | 1949 | ||
| 1261 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | 1950 | static void inc_nr_running(struct rq *rq) |
| 1262 | { | ||
| 1263 | update_load_add(&rq->load, p->se.load.weight); | ||
| 1264 | } | ||
| 1265 | |||
| 1266 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | ||
| 1267 | { | ||
| 1268 | update_load_sub(&rq->load, p->se.load.weight); | ||
| 1269 | } | ||
| 1270 | |||
| 1271 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | ||
| 1272 | { | 1951 | { |
| 1273 | rq->nr_running++; | 1952 | rq->nr_running++; |
| 1274 | inc_load(rq, p); | ||
| 1275 | } | 1953 | } |
| 1276 | 1954 | ||
| 1277 | static void dec_nr_running(struct task_struct *p, struct rq *rq) | 1955 | static void dec_nr_running(struct rq *rq) |
| 1278 | { | 1956 | { |
| 1279 | rq->nr_running--; | 1957 | rq->nr_running--; |
| 1280 | dec_load(rq, p); | ||
| 1281 | } | 1958 | } |
| 1282 | 1959 | ||
| 1283 | static void set_load_weight(struct task_struct *p) | 1960 | static void set_load_weight(struct task_struct *p) |
| @@ -1369,7 +2046,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
| 1369 | rq->nr_uninterruptible--; | 2046 | rq->nr_uninterruptible--; |
| 1370 | 2047 | ||
| 1371 | enqueue_task(rq, p, wakeup); | 2048 | enqueue_task(rq, p, wakeup); |
| 1372 | inc_nr_running(p, rq); | 2049 | inc_nr_running(rq); |
| 1373 | } | 2050 | } |
| 1374 | 2051 | ||
| 1375 | /* | 2052 | /* |
| @@ -1381,7 +2058,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | |||
| 1381 | rq->nr_uninterruptible++; | 2058 | rq->nr_uninterruptible++; |
| 1382 | 2059 | ||
| 1383 | dequeue_task(rq, p, sleep); | 2060 | dequeue_task(rq, p, sleep); |
| 1384 | dec_nr_running(p, rq); | 2061 | dec_nr_running(rq); |
| 1385 | } | 2062 | } |
| 1386 | 2063 | ||
| 1387 | /** | 2064 | /** |
| @@ -1438,7 +2115,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
| 1438 | /* | 2115 | /* |
| 1439 | * Buddy candidates are cache hot: | 2116 | * Buddy candidates are cache hot: |
| 1440 | */ | 2117 | */ |
| 1441 | if (&p->se == cfs_rq_of(&p->se)->next) | 2118 | if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) |
| 1442 | return 1; | 2119 | return 1; |
| 1443 | 2120 | ||
| 1444 | if (p->sched_class != &fair_sched_class) | 2121 | if (p->sched_class != &fair_sched_class) |
| @@ -1728,17 +2405,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
| 1728 | * find_idlest_cpu - find the idlest cpu among the cpus in group. | 2405 | * find_idlest_cpu - find the idlest cpu among the cpus in group. |
| 1729 | */ | 2406 | */ |
| 1730 | static int | 2407 | static int |
| 1731 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 2408 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, |
| 2409 | cpumask_t *tmp) | ||
| 1732 | { | 2410 | { |
| 1733 | cpumask_t tmp; | ||
| 1734 | unsigned long load, min_load = ULONG_MAX; | 2411 | unsigned long load, min_load = ULONG_MAX; |
| 1735 | int idlest = -1; | 2412 | int idlest = -1; |
| 1736 | int i; | 2413 | int i; |
| 1737 | 2414 | ||
| 1738 | /* Traverse only the allowed CPUs */ | 2415 | /* Traverse only the allowed CPUs */ |
| 1739 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | 2416 | cpus_and(*tmp, group->cpumask, p->cpus_allowed); |
| 1740 | 2417 | ||
| 1741 | for_each_cpu_mask(i, tmp) { | 2418 | for_each_cpu_mask(i, *tmp) { |
| 1742 | load = weighted_cpuload(i); | 2419 | load = weighted_cpuload(i); |
| 1743 | 2420 | ||
| 1744 | if (load < min_load || (load == min_load && i == this_cpu)) { | 2421 | if (load < min_load || (load == min_load && i == this_cpu)) { |
| @@ -1777,7 +2454,7 @@ static int sched_balance_self(int cpu, int flag) | |||
| 1777 | } | 2454 | } |
| 1778 | 2455 | ||
| 1779 | while (sd) { | 2456 | while (sd) { |
| 1780 | cpumask_t span; | 2457 | cpumask_t span, tmpmask; |
| 1781 | struct sched_group *group; | 2458 | struct sched_group *group; |
| 1782 | int new_cpu, weight; | 2459 | int new_cpu, weight; |
| 1783 | 2460 | ||
| @@ -1793,7 +2470,7 @@ static int sched_balance_self(int cpu, int flag) | |||
| 1793 | continue; | 2470 | continue; |
| 1794 | } | 2471 | } |
| 1795 | 2472 | ||
| 1796 | new_cpu = find_idlest_cpu(group, t, cpu); | 2473 | new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); |
| 1797 | if (new_cpu == -1 || new_cpu == cpu) { | 2474 | if (new_cpu == -1 || new_cpu == cpu) { |
| 1798 | /* Now try balancing at a lower domain level of cpu */ | 2475 | /* Now try balancing at a lower domain level of cpu */ |
| 1799 | sd = sd->child; | 2476 | sd = sd->child; |
| @@ -1839,6 +2516,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1839 | long old_state; | 2516 | long old_state; |
| 1840 | struct rq *rq; | 2517 | struct rq *rq; |
| 1841 | 2518 | ||
| 2519 | if (!sched_feat(SYNC_WAKEUPS)) | ||
| 2520 | sync = 0; | ||
| 2521 | |||
| 1842 | smp_wmb(); | 2522 | smp_wmb(); |
| 1843 | rq = task_rq_lock(p, &flags); | 2523 | rq = task_rq_lock(p, &flags); |
| 1844 | old_state = p->state; | 2524 | old_state = p->state; |
| @@ -1955,6 +2635,7 @@ static void __sched_fork(struct task_struct *p) | |||
| 1955 | 2635 | ||
| 1956 | INIT_LIST_HEAD(&p->rt.run_list); | 2636 | INIT_LIST_HEAD(&p->rt.run_list); |
| 1957 | p->se.on_rq = 0; | 2637 | p->se.on_rq = 0; |
| 2638 | INIT_LIST_HEAD(&p->se.group_node); | ||
| 1958 | 2639 | ||
| 1959 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2640 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
| 1960 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2641 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
| @@ -2030,7 +2711,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
| 2030 | * management (if any): | 2711 | * management (if any): |
| 2031 | */ | 2712 | */ |
| 2032 | p->sched_class->task_new(rq, p); | 2713 | p->sched_class->task_new(rq, p); |
| 2033 | inc_nr_running(p, rq); | 2714 | inc_nr_running(rq); |
| 2034 | } | 2715 | } |
| 2035 | check_preempt_curr(rq, p); | 2716 | check_preempt_curr(rq, p); |
| 2036 | #ifdef CONFIG_SMP | 2717 | #ifdef CONFIG_SMP |
| @@ -2674,7 +3355,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 2674 | static struct sched_group * | 3355 | static struct sched_group * |
| 2675 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 3356 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
| 2676 | unsigned long *imbalance, enum cpu_idle_type idle, | 3357 | unsigned long *imbalance, enum cpu_idle_type idle, |
| 2677 | int *sd_idle, cpumask_t *cpus, int *balance) | 3358 | int *sd_idle, const cpumask_t *cpus, int *balance) |
| 2678 | { | 3359 | { |
| 2679 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 3360 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
| 2680 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 3361 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
| @@ -2975,7 +3656,7 @@ ret: | |||
| 2975 | */ | 3656 | */ |
| 2976 | static struct rq * | 3657 | static struct rq * |
| 2977 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | 3658 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, |
| 2978 | unsigned long imbalance, cpumask_t *cpus) | 3659 | unsigned long imbalance, const cpumask_t *cpus) |
| 2979 | { | 3660 | { |
| 2980 | struct rq *busiest = NULL, *rq; | 3661 | struct rq *busiest = NULL, *rq; |
| 2981 | unsigned long max_load = 0; | 3662 | unsigned long max_load = 0; |
| @@ -3014,14 +3695,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
| 3014 | */ | 3695 | */ |
| 3015 | static int load_balance(int this_cpu, struct rq *this_rq, | 3696 | static int load_balance(int this_cpu, struct rq *this_rq, |
| 3016 | struct sched_domain *sd, enum cpu_idle_type idle, | 3697 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 3017 | int *balance) | 3698 | int *balance, cpumask_t *cpus) |
| 3018 | { | 3699 | { |
| 3019 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 3700 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; |
| 3020 | struct sched_group *group; | 3701 | struct sched_group *group; |
| 3021 | unsigned long imbalance; | 3702 | unsigned long imbalance; |
| 3022 | struct rq *busiest; | 3703 | struct rq *busiest; |
| 3023 | cpumask_t cpus = CPU_MASK_ALL; | ||
| 3024 | unsigned long flags; | 3704 | unsigned long flags; |
| 3705 | int unlock_aggregate; | ||
| 3706 | |||
| 3707 | cpus_setall(*cpus); | ||
| 3708 | |||
| 3709 | unlock_aggregate = get_aggregate(sd); | ||
| 3025 | 3710 | ||
| 3026 | /* | 3711 | /* |
| 3027 | * When power savings policy is enabled for the parent domain, idle | 3712 | * When power savings policy is enabled for the parent domain, idle |
| @@ -3037,7 +3722,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 3037 | 3722 | ||
| 3038 | redo: | 3723 | redo: |
| 3039 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3724 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
| 3040 | &cpus, balance); | 3725 | cpus, balance); |
| 3041 | 3726 | ||
| 3042 | if (*balance == 0) | 3727 | if (*balance == 0) |
| 3043 | goto out_balanced; | 3728 | goto out_balanced; |
| @@ -3047,7 +3732,7 @@ redo: | |||
| 3047 | goto out_balanced; | 3732 | goto out_balanced; |
| 3048 | } | 3733 | } |
| 3049 | 3734 | ||
| 3050 | busiest = find_busiest_queue(group, idle, imbalance, &cpus); | 3735 | busiest = find_busiest_queue(group, idle, imbalance, cpus); |
| 3051 | if (!busiest) { | 3736 | if (!busiest) { |
| 3052 | schedstat_inc(sd, lb_nobusyq[idle]); | 3737 | schedstat_inc(sd, lb_nobusyq[idle]); |
| 3053 | goto out_balanced; | 3738 | goto out_balanced; |
| @@ -3080,8 +3765,8 @@ redo: | |||
| 3080 | 3765 | ||
| 3081 | /* All tasks on this runqueue were pinned by CPU affinity */ | 3766 | /* All tasks on this runqueue were pinned by CPU affinity */ |
| 3082 | if (unlikely(all_pinned)) { | 3767 | if (unlikely(all_pinned)) { |
| 3083 | cpu_clear(cpu_of(busiest), cpus); | 3768 | cpu_clear(cpu_of(busiest), *cpus); |
| 3084 | if (!cpus_empty(cpus)) | 3769 | if (!cpus_empty(*cpus)) |
| 3085 | goto redo; | 3770 | goto redo; |
| 3086 | goto out_balanced; | 3771 | goto out_balanced; |
| 3087 | } | 3772 | } |
| @@ -3138,8 +3823,9 @@ redo: | |||
| 3138 | 3823 | ||
| 3139 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3824 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
| 3140 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3825 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
| 3141 | return -1; | 3826 | ld_moved = -1; |
| 3142 | return ld_moved; | 3827 | |
| 3828 | goto out; | ||
| 3143 | 3829 | ||
| 3144 | out_balanced: | 3830 | out_balanced: |
| 3145 | schedstat_inc(sd, lb_balanced[idle]); | 3831 | schedstat_inc(sd, lb_balanced[idle]); |
| @@ -3154,8 +3840,13 @@ out_one_pinned: | |||
| 3154 | 3840 | ||
| 3155 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3841 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
| 3156 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3842 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
| 3157 | return -1; | 3843 | ld_moved = -1; |
| 3158 | return 0; | 3844 | else |
| 3845 | ld_moved = 0; | ||
| 3846 | out: | ||
| 3847 | if (unlock_aggregate) | ||
| 3848 | put_aggregate(sd); | ||
| 3849 | return ld_moved; | ||
| 3159 | } | 3850 | } |
| 3160 | 3851 | ||
| 3161 | /* | 3852 | /* |
| @@ -3166,7 +3857,8 @@ out_one_pinned: | |||
| 3166 | * this_rq is locked. | 3857 | * this_rq is locked. |
| 3167 | */ | 3858 | */ |
| 3168 | static int | 3859 | static int |
| 3169 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | 3860 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, |
| 3861 | cpumask_t *cpus) | ||
| 3170 | { | 3862 | { |
| 3171 | struct sched_group *group; | 3863 | struct sched_group *group; |
| 3172 | struct rq *busiest = NULL; | 3864 | struct rq *busiest = NULL; |
| @@ -3174,7 +3866,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
| 3174 | int ld_moved = 0; | 3866 | int ld_moved = 0; |
| 3175 | int sd_idle = 0; | 3867 | int sd_idle = 0; |
| 3176 | int all_pinned = 0; | 3868 | int all_pinned = 0; |
| 3177 | cpumask_t cpus = CPU_MASK_ALL; | 3869 | |
| 3870 | cpus_setall(*cpus); | ||
| 3178 | 3871 | ||
| 3179 | /* | 3872 | /* |
| 3180 | * When power savings policy is enabled for the parent domain, idle | 3873 | * When power savings policy is enabled for the parent domain, idle |
| @@ -3189,14 +3882,13 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
| 3189 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); | 3882 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); |
| 3190 | redo: | 3883 | redo: |
| 3191 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | 3884 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, |
| 3192 | &sd_idle, &cpus, NULL); | 3885 | &sd_idle, cpus, NULL); |
| 3193 | if (!group) { | 3886 | if (!group) { |
| 3194 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); | 3887 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); |
| 3195 | goto out_balanced; | 3888 | goto out_balanced; |
| 3196 | } | 3889 | } |
| 3197 | 3890 | ||
| 3198 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, | 3891 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); |
| 3199 | &cpus); | ||
| 3200 | if (!busiest) { | 3892 | if (!busiest) { |
| 3201 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); | 3893 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); |
| 3202 | goto out_balanced; | 3894 | goto out_balanced; |
| @@ -3218,8 +3910,8 @@ redo: | |||
| 3218 | spin_unlock(&busiest->lock); | 3910 | spin_unlock(&busiest->lock); |
| 3219 | 3911 | ||
| 3220 | if (unlikely(all_pinned)) { | 3912 | if (unlikely(all_pinned)) { |
| 3221 | cpu_clear(cpu_of(busiest), cpus); | 3913 | cpu_clear(cpu_of(busiest), *cpus); |
| 3222 | if (!cpus_empty(cpus)) | 3914 | if (!cpus_empty(*cpus)) |
| 3223 | goto redo; | 3915 | goto redo; |
| 3224 | } | 3916 | } |
| 3225 | } | 3917 | } |
| @@ -3253,6 +3945,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 3253 | struct sched_domain *sd; | 3945 | struct sched_domain *sd; |
| 3254 | int pulled_task = -1; | 3946 | int pulled_task = -1; |
| 3255 | unsigned long next_balance = jiffies + HZ; | 3947 | unsigned long next_balance = jiffies + HZ; |
| 3948 | cpumask_t tmpmask; | ||
| 3256 | 3949 | ||
| 3257 | for_each_domain(this_cpu, sd) { | 3950 | for_each_domain(this_cpu, sd) { |
| 3258 | unsigned long interval; | 3951 | unsigned long interval; |
| @@ -3262,8 +3955,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 3262 | 3955 | ||
| 3263 | if (sd->flags & SD_BALANCE_NEWIDLE) | 3956 | if (sd->flags & SD_BALANCE_NEWIDLE) |
| 3264 | /* If we've pulled tasks over stop searching: */ | 3957 | /* If we've pulled tasks over stop searching: */ |
| 3265 | pulled_task = load_balance_newidle(this_cpu, | 3958 | pulled_task = load_balance_newidle(this_cpu, this_rq, |
| 3266 | this_rq, sd); | 3959 | sd, &tmpmask); |
| 3267 | 3960 | ||
| 3268 | interval = msecs_to_jiffies(sd->balance_interval); | 3961 | interval = msecs_to_jiffies(sd->balance_interval); |
| 3269 | if (time_after(next_balance, sd->last_balance + interval)) | 3962 | if (time_after(next_balance, sd->last_balance + interval)) |
| @@ -3422,6 +4115,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 3422 | /* Earliest time when we have to do rebalance again */ | 4115 | /* Earliest time when we have to do rebalance again */ |
| 3423 | unsigned long next_balance = jiffies + 60*HZ; | 4116 | unsigned long next_balance = jiffies + 60*HZ; |
| 3424 | int update_next_balance = 0; | 4117 | int update_next_balance = 0; |
| 4118 | cpumask_t tmp; | ||
| 3425 | 4119 | ||
| 3426 | for_each_domain(cpu, sd) { | 4120 | for_each_domain(cpu, sd) { |
| 3427 | if (!(sd->flags & SD_LOAD_BALANCE)) | 4121 | if (!(sd->flags & SD_LOAD_BALANCE)) |
| @@ -3445,7 +4139,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 3445 | } | 4139 | } |
| 3446 | 4140 | ||
| 3447 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 4141 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
| 3448 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 4142 | if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { |
| 3449 | /* | 4143 | /* |
| 3450 | * We've pulled tasks over so either we're no | 4144 | * We've pulled tasks over so either we're no |
| 3451 | * longer idle, or one of our SMT siblings is | 4145 | * longer idle, or one of our SMT siblings is |
| @@ -3561,7 +4255,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) | |||
| 3561 | */ | 4255 | */ |
| 3562 | int ilb = first_cpu(nohz.cpu_mask); | 4256 | int ilb = first_cpu(nohz.cpu_mask); |
| 3563 | 4257 | ||
| 3564 | if (ilb != NR_CPUS) | 4258 | if (ilb < nr_cpu_ids) |
| 3565 | resched_cpu(ilb); | 4259 | resched_cpu(ilb); |
| 3566 | } | 4260 | } |
| 3567 | } | 4261 | } |
| @@ -3765,9 +4459,9 @@ void scheduler_tick(void) | |||
| 3765 | rq->clock_underflows++; | 4459 | rq->clock_underflows++; |
| 3766 | } | 4460 | } |
| 3767 | rq->tick_timestamp = rq->clock; | 4461 | rq->tick_timestamp = rq->clock; |
| 4462 | update_last_tick_seen(rq); | ||
| 3768 | update_cpu_load(rq); | 4463 | update_cpu_load(rq); |
| 3769 | curr->sched_class->task_tick(rq, curr, 0); | 4464 | curr->sched_class->task_tick(rq, curr, 0); |
| 3770 | update_sched_rt_period(rq); | ||
| 3771 | spin_unlock(&rq->lock); | 4465 | spin_unlock(&rq->lock); |
| 3772 | 4466 | ||
| 3773 | #ifdef CONFIG_SMP | 4467 | #ifdef CONFIG_SMP |
| @@ -4367,10 +5061,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 4367 | goto out_unlock; | 5061 | goto out_unlock; |
| 4368 | } | 5062 | } |
| 4369 | on_rq = p->se.on_rq; | 5063 | on_rq = p->se.on_rq; |
| 4370 | if (on_rq) { | 5064 | if (on_rq) |
| 4371 | dequeue_task(rq, p, 0); | 5065 | dequeue_task(rq, p, 0); |
| 4372 | dec_load(rq, p); | ||
| 4373 | } | ||
| 4374 | 5066 | ||
| 4375 | p->static_prio = NICE_TO_PRIO(nice); | 5067 | p->static_prio = NICE_TO_PRIO(nice); |
| 4376 | set_load_weight(p); | 5068 | set_load_weight(p); |
| @@ -4380,7 +5072,6 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 4380 | 5072 | ||
| 4381 | if (on_rq) { | 5073 | if (on_rq) { |
| 4382 | enqueue_task(rq, p, 0); | 5074 | enqueue_task(rq, p, 0); |
| 4383 | inc_load(rq, p); | ||
| 4384 | /* | 5075 | /* |
| 4385 | * If the task increased its priority or is running and | 5076 | * If the task increased its priority or is running and |
| 4386 | * lowered its priority, then reschedule its CPU: | 5077 | * lowered its priority, then reschedule its CPU: |
| @@ -4602,7 +5293,7 @@ recheck: | |||
| 4602 | * Do not allow realtime tasks into groups that have no runtime | 5293 | * Do not allow realtime tasks into groups that have no runtime |
| 4603 | * assigned. | 5294 | * assigned. |
| 4604 | */ | 5295 | */ |
| 4605 | if (rt_policy(policy) && task_group(p)->rt_runtime == 0) | 5296 | if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) |
| 4606 | return -EPERM; | 5297 | return -EPERM; |
| 4607 | #endif | 5298 | #endif |
| 4608 | 5299 | ||
| @@ -4764,9 +5455,10 @@ out_unlock: | |||
| 4764 | return retval; | 5455 | return retval; |
| 4765 | } | 5456 | } |
| 4766 | 5457 | ||
| 4767 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) | 5458 | long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) |
| 4768 | { | 5459 | { |
| 4769 | cpumask_t cpus_allowed; | 5460 | cpumask_t cpus_allowed; |
| 5461 | cpumask_t new_mask = *in_mask; | ||
| 4770 | struct task_struct *p; | 5462 | struct task_struct *p; |
| 4771 | int retval; | 5463 | int retval; |
| 4772 | 5464 | ||
| @@ -4797,13 +5489,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
| 4797 | if (retval) | 5489 | if (retval) |
| 4798 | goto out_unlock; | 5490 | goto out_unlock; |
| 4799 | 5491 | ||
| 4800 | cpus_allowed = cpuset_cpus_allowed(p); | 5492 | cpuset_cpus_allowed(p, &cpus_allowed); |
| 4801 | cpus_and(new_mask, new_mask, cpus_allowed); | 5493 | cpus_and(new_mask, new_mask, cpus_allowed); |
| 4802 | again: | 5494 | again: |
| 4803 | retval = set_cpus_allowed(p, new_mask); | 5495 | retval = set_cpus_allowed_ptr(p, &new_mask); |
| 4804 | 5496 | ||
| 4805 | if (!retval) { | 5497 | if (!retval) { |
| 4806 | cpus_allowed = cpuset_cpus_allowed(p); | 5498 | cpuset_cpus_allowed(p, &cpus_allowed); |
| 4807 | if (!cpus_subset(new_mask, cpus_allowed)) { | 5499 | if (!cpus_subset(new_mask, cpus_allowed)) { |
| 4808 | /* | 5500 | /* |
| 4809 | * We must have raced with a concurrent cpuset | 5501 | * We must have raced with a concurrent cpuset |
| @@ -4847,7 +5539,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | |||
| 4847 | if (retval) | 5539 | if (retval) |
| 4848 | return retval; | 5540 | return retval; |
| 4849 | 5541 | ||
| 4850 | return sched_setaffinity(pid, new_mask); | 5542 | return sched_setaffinity(pid, &new_mask); |
| 4851 | } | 5543 | } |
| 4852 | 5544 | ||
| 4853 | /* | 5545 | /* |
| @@ -5309,7 +6001,6 @@ static inline void sched_init_granularity(void) | |||
| 5309 | sysctl_sched_latency = limit; | 6001 | sysctl_sched_latency = limit; |
| 5310 | 6002 | ||
| 5311 | sysctl_sched_wakeup_granularity *= factor; | 6003 | sysctl_sched_wakeup_granularity *= factor; |
| 5312 | sysctl_sched_batch_wakeup_granularity *= factor; | ||
| 5313 | } | 6004 | } |
| 5314 | 6005 | ||
| 5315 | #ifdef CONFIG_SMP | 6006 | #ifdef CONFIG_SMP |
| @@ -5338,7 +6029,7 @@ static inline void sched_init_granularity(void) | |||
| 5338 | * task must not exit() & deallocate itself prematurely. The | 6029 | * task must not exit() & deallocate itself prematurely. The |
| 5339 | * call is not atomic; no spinlocks may be held. | 6030 | * call is not atomic; no spinlocks may be held. |
| 5340 | */ | 6031 | */ |
| 5341 | int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | 6032 | int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) |
| 5342 | { | 6033 | { |
| 5343 | struct migration_req req; | 6034 | struct migration_req req; |
| 5344 | unsigned long flags; | 6035 | unsigned long flags; |
| @@ -5346,23 +6037,23 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | |||
| 5346 | int ret = 0; | 6037 | int ret = 0; |
| 5347 | 6038 | ||
| 5348 | rq = task_rq_lock(p, &flags); | 6039 | rq = task_rq_lock(p, &flags); |
| 5349 | if (!cpus_intersects(new_mask, cpu_online_map)) { | 6040 | if (!cpus_intersects(*new_mask, cpu_online_map)) { |
| 5350 | ret = -EINVAL; | 6041 | ret = -EINVAL; |
| 5351 | goto out; | 6042 | goto out; |
| 5352 | } | 6043 | } |
| 5353 | 6044 | ||
| 5354 | if (p->sched_class->set_cpus_allowed) | 6045 | if (p->sched_class->set_cpus_allowed) |
| 5355 | p->sched_class->set_cpus_allowed(p, &new_mask); | 6046 | p->sched_class->set_cpus_allowed(p, new_mask); |
| 5356 | else { | 6047 | else { |
| 5357 | p->cpus_allowed = new_mask; | 6048 | p->cpus_allowed = *new_mask; |
| 5358 | p->rt.nr_cpus_allowed = cpus_weight(new_mask); | 6049 | p->rt.nr_cpus_allowed = cpus_weight(*new_mask); |
| 5359 | } | 6050 | } |
| 5360 | 6051 | ||
| 5361 | /* Can the task run on the task's current CPU? If so, we're done */ | 6052 | /* Can the task run on the task's current CPU? If so, we're done */ |
| 5362 | if (cpu_isset(task_cpu(p), new_mask)) | 6053 | if (cpu_isset(task_cpu(p), *new_mask)) |
| 5363 | goto out; | 6054 | goto out; |
| 5364 | 6055 | ||
| 5365 | if (migrate_task(p, any_online_cpu(new_mask), &req)) { | 6056 | if (migrate_task(p, any_online_cpu(*new_mask), &req)) { |
| 5366 | /* Need help from migration thread: drop lock and wait. */ | 6057 | /* Need help from migration thread: drop lock and wait. */ |
| 5367 | task_rq_unlock(rq, &flags); | 6058 | task_rq_unlock(rq, &flags); |
| 5368 | wake_up_process(rq->migration_thread); | 6059 | wake_up_process(rq->migration_thread); |
| @@ -5375,7 +6066,7 @@ out: | |||
| 5375 | 6066 | ||
| 5376 | return ret; | 6067 | return ret; |
| 5377 | } | 6068 | } |
| 5378 | EXPORT_SYMBOL_GPL(set_cpus_allowed); | 6069 | EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); |
| 5379 | 6070 | ||
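This rename is part of the patch-wide move from passing cpumask_t by value (and declaring CPU_MASK_ALL temporaries on the stack) to passing pointers, as also seen in find_idlest_cpu(), load_balance() and the kmalloc'd groupmask in sched_domain_debug(). The motivation is stack footprint: a cpumask_t is NR_CPUS bits, which at large CPU counts is a sizeable object to copy onto a kernel stack of only a few kilobytes. Illustrative numbers only (NR_CPUS = 4096 is an assumption):

#include <stdio.h>
#include <string.h>

#define NR_CPUS 4096	/* assumed large config */

typedef struct {
	unsigned long bits[NR_CPUS / (8 * sizeof(unsigned long))];
} cpumask_t;

static int cpu0_set_by_value(cpumask_t m)	/* copies the whole 512-byte mask */
{
	return m.bits[0] & 1;
}

static int cpu0_set_by_ptr(const cpumask_t *m)	/* copies one pointer */
{
	return m->bits[0] & 1;
}

int main(void)
{
	cpumask_t m;

	memset(&m, 0, sizeof(m));
	m.bits[0] = 1;
	printf("sizeof(cpumask_t) = %zu bytes\n", sizeof(m));
	printf("%d %d\n", cpu0_set_by_value(m), cpu0_set_by_ptr(&m));
	return 0;
}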
| 5380 | /* | 6071 | /* |
| 5381 | * Move (not current) task off this cpu, onto dest cpu. We're doing | 6072 | * Move (not current) task off this cpu, onto dest cpu. We're doing |
| @@ -5513,12 +6204,14 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
| 5513 | dest_cpu = any_online_cpu(mask); | 6204 | dest_cpu = any_online_cpu(mask); |
| 5514 | 6205 | ||
| 5515 | /* On any allowed CPU? */ | 6206 | /* On any allowed CPU? */ |
| 5516 | if (dest_cpu == NR_CPUS) | 6207 | if (dest_cpu >= nr_cpu_ids) |
| 5517 | dest_cpu = any_online_cpu(p->cpus_allowed); | 6208 | dest_cpu = any_online_cpu(p->cpus_allowed); |
| 5518 | 6209 | ||
| 5519 | /* No more Mr. Nice Guy. */ | 6210 | /* No more Mr. Nice Guy. */ |
| 5520 | if (dest_cpu == NR_CPUS) { | 6211 | if (dest_cpu >= nr_cpu_ids) { |
| 5521 | cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); | 6212 | cpumask_t cpus_allowed; |
| 6213 | |||
| 6214 | cpuset_cpus_allowed_locked(p, &cpus_allowed); | ||
| 5522 | /* | 6215 | /* |
| 5523 | * Try to stay on the same cpuset, where the | 6216 | * Try to stay on the same cpuset, where the |
| 5524 | * current cpuset may be a subset of all cpus. | 6217 | * current cpuset may be a subset of all cpus. |
| @@ -5554,7 +6247,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
| 5554 | */ | 6247 | */ |
| 5555 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 6248 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
| 5556 | { | 6249 | { |
| 5557 | struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); | 6250 | struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); |
| 5558 | unsigned long flags; | 6251 | unsigned long flags; |
| 5559 | 6252 | ||
| 5560 | local_irq_save(flags); | 6253 | local_irq_save(flags); |
| @@ -5966,20 +6659,16 @@ void __init migration_init(void) | |||
| 5966 | 6659 | ||
| 5967 | #ifdef CONFIG_SMP | 6660 | #ifdef CONFIG_SMP |
| 5968 | 6661 | ||
| 5969 | /* Number of possible processor ids */ | ||
| 5970 | int nr_cpu_ids __read_mostly = NR_CPUS; | ||
| 5971 | EXPORT_SYMBOL(nr_cpu_ids); | ||
| 5972 | |||
| 5973 | #ifdef CONFIG_SCHED_DEBUG | 6662 | #ifdef CONFIG_SCHED_DEBUG |
| 5974 | 6663 | ||
| 5975 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) | 6664 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
| 6665 | cpumask_t *groupmask) | ||
| 5976 | { | 6666 | { |
| 5977 | struct sched_group *group = sd->groups; | 6667 | struct sched_group *group = sd->groups; |
| 5978 | cpumask_t groupmask; | 6668 | char str[256]; |
| 5979 | char str[NR_CPUS]; | ||
| 5980 | 6669 | ||
| 5981 | cpumask_scnprintf(str, NR_CPUS, sd->span); | 6670 | cpulist_scnprintf(str, sizeof(str), sd->span); |
| 5982 | cpus_clear(groupmask); | 6671 | cpus_clear(*groupmask); |
| 5983 | 6672 | ||
| 5984 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); | 6673 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); |
| 5985 | 6674 | ||
| @@ -6023,25 +6712,25 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) | |||
| 6023 | break; | 6712 | break; |
| 6024 | } | 6713 | } |
| 6025 | 6714 | ||
| 6026 | if (cpus_intersects(groupmask, group->cpumask)) { | 6715 | if (cpus_intersects(*groupmask, group->cpumask)) { |
| 6027 | printk(KERN_CONT "\n"); | 6716 | printk(KERN_CONT "\n"); |
| 6028 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 6717 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
| 6029 | break; | 6718 | break; |
| 6030 | } | 6719 | } |
| 6031 | 6720 | ||
| 6032 | cpus_or(groupmask, groupmask, group->cpumask); | 6721 | cpus_or(*groupmask, *groupmask, group->cpumask); |
| 6033 | 6722 | ||
| 6034 | cpumask_scnprintf(str, NR_CPUS, group->cpumask); | 6723 | cpulist_scnprintf(str, sizeof(str), group->cpumask); |
| 6035 | printk(KERN_CONT " %s", str); | 6724 | printk(KERN_CONT " %s", str); |
| 6036 | 6725 | ||
| 6037 | group = group->next; | 6726 | group = group->next; |
| 6038 | } while (group != sd->groups); | 6727 | } while (group != sd->groups); |
| 6039 | printk(KERN_CONT "\n"); | 6728 | printk(KERN_CONT "\n"); |
| 6040 | 6729 | ||
| 6041 | if (!cpus_equal(sd->span, groupmask)) | 6730 | if (!cpus_equal(sd->span, *groupmask)) |
| 6042 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | 6731 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); |
| 6043 | 6732 | ||
| 6044 | if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) | 6733 | if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) |
| 6045 | printk(KERN_ERR "ERROR: parent span is not a superset " | 6734 | printk(KERN_ERR "ERROR: parent span is not a superset " |
| 6046 | "of domain->span\n"); | 6735 | "of domain->span\n"); |
| 6047 | return 0; | 6736 | return 0; |
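The debug printout above switches from cpumask_scnprintf() into a char[NR_CPUS] stack buffer to cpulist_scnprintf() into a fixed 256-byte one; the list form ("0-3,6") stays compact no matter how large NR_CPUS is. A rough userspace approximation of that list rendering, assuming a toy 32-bit mask; print_cpulist() is an illustration, not the kernel helper:

#include <stdio.h>

/* print a toy 32-bit cpumask as a compact range list, e.g. "0-3,6" */
static void print_cpulist(unsigned int mask)
{
        int first = 1;

        for (int i = 0; i < 32; i++) {
                if (!(mask & (1u << i)))
                        continue;
                int start = i;
                while (i + 1 < 32 && (mask & (1u << (i + 1))))
                        i++;
                if (!first)
                        putchar(',');
                if (start == i)
                        printf("%d", start);
                else
                        printf("%d-%d", start, i);
                first = 0;
        }
        putchar('\n');
}

int main(void)
{
        print_cpulist(0x4F);            /* CPUs 0-3 and 6 -> "0-3,6" */
        return 0;
}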
| @@ -6049,6 +6738,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) | |||
| 6049 | 6738 | ||
| 6050 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 6739 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
| 6051 | { | 6740 | { |
| 6741 | cpumask_t *groupmask; | ||
| 6052 | int level = 0; | 6742 | int level = 0; |
| 6053 | 6743 | ||
| 6054 | if (!sd) { | 6744 | if (!sd) { |
| @@ -6058,14 +6748,21 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 6058 | 6748 | ||
| 6059 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 6749 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
| 6060 | 6750 | ||
| 6751 | groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | ||
| 6752 | if (!groupmask) { | ||
| 6753 | printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); | ||
| 6754 | return; | ||
| 6755 | } | ||
| 6756 | |||
| 6061 | for (;;) { | 6757 | for (;;) { |
| 6062 | if (sched_domain_debug_one(sd, cpu, level)) | 6758 | if (sched_domain_debug_one(sd, cpu, level, groupmask)) |
| 6063 | break; | 6759 | break; |
| 6064 | level++; | 6760 | level++; |
| 6065 | sd = sd->parent; | 6761 | sd = sd->parent; |
| 6066 | if (!sd) | 6762 | if (!sd) |
| 6067 | break; | 6763 | break; |
| 6068 | } | 6764 | } |
| 6765 | kfree(groupmask); | ||
| 6069 | } | 6766 | } |
| 6070 | #else | 6767 | #else |
| 6071 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6768 | # define sched_domain_debug(sd, cpu) do { } while (0) |
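Rather than every call of sched_domain_debug_one() putting a cpumask_t on the stack, sched_domain_debug() now allocates one scratch mask and hands it down for each level of the domain hierarchy. A minimal userspace sketch of that allocate-once, reuse-per-level shape; MASK_BYTES and the function names are made up for the example:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MASK_BYTES 512          /* imagine a bitmap too large for the stack */

/* one debug step; 'scratch' is caller-provided working storage */
static int debug_one_level(int level, unsigned char *scratch)
{
        memset(scratch, 0, MASK_BYTES);         /* like cpus_clear(*groupmask) */
        printf("level %d reuses the same %d-byte scratch buffer\n",
               level, MASK_BYTES);
        return level >= 2;                      /* stop after a few levels */
}

static void debug_all_levels(void)
{
        unsigned char *scratch = malloc(MASK_BYTES);

        if (!scratch) {
                printf("cannot debug (out of memory)\n");
                return;
        }
        for (int level = 0; ; level++)
                if (debug_one_level(level, scratch))
                        break;
        free(scratch);
}

int main(void)
{
        debug_all_levels();
        return 0;
}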
| @@ -6253,30 +6950,33 @@ __setup("isolcpus=", isolated_cpu_setup); | |||
| 6253 | * and ->cpu_power to 0. | 6950 | * and ->cpu_power to 0. |
| 6254 | */ | 6951 | */ |
| 6255 | static void | 6952 | static void |
| 6256 | init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, | 6953 | init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, |
| 6257 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, | 6954 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, |
| 6258 | struct sched_group **sg)) | 6955 | struct sched_group **sg, |
| 6956 | cpumask_t *tmpmask), | ||
| 6957 | cpumask_t *covered, cpumask_t *tmpmask) | ||
| 6259 | { | 6958 | { |
| 6260 | struct sched_group *first = NULL, *last = NULL; | 6959 | struct sched_group *first = NULL, *last = NULL; |
| 6261 | cpumask_t covered = CPU_MASK_NONE; | ||
| 6262 | int i; | 6960 | int i; |
| 6263 | 6961 | ||
| 6264 | for_each_cpu_mask(i, span) { | 6962 | cpus_clear(*covered); |
| 6963 | |||
| 6964 | for_each_cpu_mask(i, *span) { | ||
| 6265 | struct sched_group *sg; | 6965 | struct sched_group *sg; |
| 6266 | int group = group_fn(i, cpu_map, &sg); | 6966 | int group = group_fn(i, cpu_map, &sg, tmpmask); |
| 6267 | int j; | 6967 | int j; |
| 6268 | 6968 | ||
| 6269 | if (cpu_isset(i, covered)) | 6969 | if (cpu_isset(i, *covered)) |
| 6270 | continue; | 6970 | continue; |
| 6271 | 6971 | ||
| 6272 | sg->cpumask = CPU_MASK_NONE; | 6972 | cpus_clear(sg->cpumask); |
| 6273 | sg->__cpu_power = 0; | 6973 | sg->__cpu_power = 0; |
| 6274 | 6974 | ||
| 6275 | for_each_cpu_mask(j, span) { | 6975 | for_each_cpu_mask(j, *span) { |
| 6276 | if (group_fn(j, cpu_map, NULL) != group) | 6976 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) |
| 6277 | continue; | 6977 | continue; |
| 6278 | 6978 | ||
| 6279 | cpu_set(j, covered); | 6979 | cpu_set(j, *covered); |
| 6280 | cpu_set(j, sg->cpumask); | 6980 | cpu_set(j, sg->cpumask); |
| 6281 | } | 6981 | } |
| 6282 | if (!first) | 6982 | if (!first) |
| @@ -6302,7 +7002,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, | |||
| 6302 | * | 7002 | * |
| 6303 | * Should use nodemask_t. | 7003 | * Should use nodemask_t. |
| 6304 | */ | 7004 | */ |
| 6305 | static int find_next_best_node(int node, unsigned long *used_nodes) | 7005 | static int find_next_best_node(int node, nodemask_t *used_nodes) |
| 6306 | { | 7006 | { |
| 6307 | int i, n, val, min_val, best_node = 0; | 7007 | int i, n, val, min_val, best_node = 0; |
| 6308 | 7008 | ||
| @@ -6316,7 +7016,7 @@ static int find_next_best_node(int node, unsigned long *used_nodes) | |||
| 6316 | continue; | 7016 | continue; |
| 6317 | 7017 | ||
| 6318 | /* Skip already used nodes */ | 7018 | /* Skip already used nodes */ |
| 6319 | if (test_bit(n, used_nodes)) | 7019 | if (node_isset(n, *used_nodes)) |
| 6320 | continue; | 7020 | continue; |
| 6321 | 7021 | ||
| 6322 | /* Simple min distance search */ | 7022 | /* Simple min distance search */ |
| @@ -6328,40 +7028,37 @@ static int find_next_best_node(int node, unsigned long *used_nodes) | |||
| 6328 | } | 7028 | } |
| 6329 | } | 7029 | } |
| 6330 | 7030 | ||
| 6331 | set_bit(best_node, used_nodes); | 7031 | node_set(best_node, *used_nodes); |
| 6332 | return best_node; | 7032 | return best_node; |
| 6333 | } | 7033 | } |
| 6334 | 7034 | ||
| 6335 | /** | 7035 | /** |
| 6336 | * sched_domain_node_span - get a cpumask for a node's sched_domain | 7036 | * sched_domain_node_span - get a cpumask for a node's sched_domain |
| 6337 | * @node: node whose cpumask we're constructing | 7037 | * @node: node whose cpumask we're constructing |
| 6338 | * @size: number of nodes to include in this span | 7038 | * @span: resulting cpumask |
| 6339 | * | 7039 | * |
| 6340 | * Given a node, construct a good cpumask for its sched_domain to span. It | 7040 | * Given a node, construct a good cpumask for its sched_domain to span. It |
| 6341 | * should be one that prevents unnecessary balancing, but also spreads tasks | 7041 | * should be one that prevents unnecessary balancing, but also spreads tasks |
| 6342 | * out optimally. | 7042 | * out optimally. |
| 6343 | */ | 7043 | */ |
| 6344 | static cpumask_t sched_domain_node_span(int node) | 7044 | static void sched_domain_node_span(int node, cpumask_t *span) |
| 6345 | { | 7045 | { |
| 6346 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); | 7046 | nodemask_t used_nodes; |
| 6347 | cpumask_t span, nodemask; | 7047 | node_to_cpumask_ptr(nodemask, node); |
| 6348 | int i; | 7048 | int i; |
| 6349 | 7049 | ||
| 6350 | cpus_clear(span); | 7050 | cpus_clear(*span); |
| 6351 | bitmap_zero(used_nodes, MAX_NUMNODES); | 7051 | nodes_clear(used_nodes); |
| 6352 | 7052 | ||
| 6353 | nodemask = node_to_cpumask(node); | 7053 | cpus_or(*span, *span, *nodemask); |
| 6354 | cpus_or(span, span, nodemask); | 7054 | node_set(node, used_nodes); |
| 6355 | set_bit(node, used_nodes); | ||
| 6356 | 7055 | ||
| 6357 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 7056 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
| 6358 | int next_node = find_next_best_node(node, used_nodes); | 7057 | int next_node = find_next_best_node(node, &used_nodes); |
| 6359 | 7058 | ||
| 6360 | nodemask = node_to_cpumask(next_node); | 7059 | node_to_cpumask_ptr_next(nodemask, next_node); |
| 6361 | cpus_or(span, span, nodemask); | 7060 | cpus_or(*span, *span, *nodemask); |
| 6362 | } | 7061 | } |
| 6363 | |||
| 6364 | return span; | ||
| 6365 | } | 7062 | } |
| 6366 | #endif | 7063 | #endif |
| 6367 | 7064 | ||
| @@ -6375,7 +7072,8 @@ static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | |||
| 6375 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); | 7072 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); |
| 6376 | 7073 | ||
| 6377 | static int | 7074 | static int |
| 6378 | cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 7075 | cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, |
| 7076 | cpumask_t *unused) | ||
| 6379 | { | 7077 | { |
| 6380 | if (sg) | 7078 | if (sg) |
| 6381 | *sg = &per_cpu(sched_group_cpus, cpu); | 7079 | *sg = &per_cpu(sched_group_cpus, cpu); |
| @@ -6393,19 +7091,22 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core); | |||
| 6393 | 7091 | ||
| 6394 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 7092 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
| 6395 | static int | 7093 | static int |
| 6396 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 7094 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, |
| 7095 | cpumask_t *mask) | ||
| 6397 | { | 7096 | { |
| 6398 | int group; | 7097 | int group; |
| 6399 | cpumask_t mask = per_cpu(cpu_sibling_map, cpu); | 7098 | |
| 6400 | cpus_and(mask, mask, *cpu_map); | 7099 | *mask = per_cpu(cpu_sibling_map, cpu); |
| 6401 | group = first_cpu(mask); | 7100 | cpus_and(*mask, *mask, *cpu_map); |
| 7101 | group = first_cpu(*mask); | ||
| 6402 | if (sg) | 7102 | if (sg) |
| 6403 | *sg = &per_cpu(sched_group_core, group); | 7103 | *sg = &per_cpu(sched_group_core, group); |
| 6404 | return group; | 7104 | return group; |
| 6405 | } | 7105 | } |
| 6406 | #elif defined(CONFIG_SCHED_MC) | 7106 | #elif defined(CONFIG_SCHED_MC) |
| 6407 | static int | 7107 | static int |
| 6408 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 7108 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, |
| 7109 | cpumask_t *unused) | ||
| 6409 | { | 7110 | { |
| 6410 | if (sg) | 7111 | if (sg) |
| 6411 | *sg = &per_cpu(sched_group_core, cpu); | 7112 | *sg = &per_cpu(sched_group_core, cpu); |
| @@ -6417,17 +7118,18 @@ static DEFINE_PER_CPU(struct sched_domain, phys_domains); | |||
| 6417 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); | 7118 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); |
| 6418 | 7119 | ||
| 6419 | static int | 7120 | static int |
| 6420 | cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 7121 | cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, |
| 7122 | cpumask_t *mask) | ||
| 6421 | { | 7123 | { |
| 6422 | int group; | 7124 | int group; |
| 6423 | #ifdef CONFIG_SCHED_MC | 7125 | #ifdef CONFIG_SCHED_MC |
| 6424 | cpumask_t mask = cpu_coregroup_map(cpu); | 7126 | *mask = cpu_coregroup_map(cpu); |
| 6425 | cpus_and(mask, mask, *cpu_map); | 7127 | cpus_and(*mask, *mask, *cpu_map); |
| 6426 | group = first_cpu(mask); | 7128 | group = first_cpu(*mask); |
| 6427 | #elif defined(CONFIG_SCHED_SMT) | 7129 | #elif defined(CONFIG_SCHED_SMT) |
| 6428 | cpumask_t mask = per_cpu(cpu_sibling_map, cpu); | 7130 | *mask = per_cpu(cpu_sibling_map, cpu); |
| 6429 | cpus_and(mask, mask, *cpu_map); | 7131 | cpus_and(*mask, *mask, *cpu_map); |
| 6430 | group = first_cpu(mask); | 7132 | group = first_cpu(*mask); |
| 6431 | #else | 7133 | #else |
| 6432 | group = cpu; | 7134 | group = cpu; |
| 6433 | #endif | 7135 | #endif |
| @@ -6443,19 +7145,19 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | |||
| 6443 | * gets dynamically allocated. | 7145 | * gets dynamically allocated. |
| 6444 | */ | 7146 | */ |
| 6445 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | 7147 | static DEFINE_PER_CPU(struct sched_domain, node_domains); |
| 6446 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; | 7148 | static struct sched_group ***sched_group_nodes_bycpu; |
| 6447 | 7149 | ||
| 6448 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); | 7150 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); |
| 6449 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); | 7151 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); |
| 6450 | 7152 | ||
| 6451 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, | 7153 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, |
| 6452 | struct sched_group **sg) | 7154 | struct sched_group **sg, cpumask_t *nodemask) |
| 6453 | { | 7155 | { |
| 6454 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); | ||
| 6455 | int group; | 7156 | int group; |
| 6456 | 7157 | ||
| 6457 | cpus_and(nodemask, nodemask, *cpu_map); | 7158 | *nodemask = node_to_cpumask(cpu_to_node(cpu)); |
| 6458 | group = first_cpu(nodemask); | 7159 | cpus_and(*nodemask, *nodemask, *cpu_map); |
| 7160 | group = first_cpu(*nodemask); | ||
| 6459 | 7161 | ||
| 6460 | if (sg) | 7162 | if (sg) |
| 6461 | *sg = &per_cpu(sched_group_allnodes, group); | 7163 | *sg = &per_cpu(sched_group_allnodes, group); |
| @@ -6491,7 +7193,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
| 6491 | 7193 | ||
| 6492 | #ifdef CONFIG_NUMA | 7194 | #ifdef CONFIG_NUMA |
| 6493 | /* Free memory allocated for various sched_group structures */ | 7195 | /* Free memory allocated for various sched_group structures */ |
| 6494 | static void free_sched_groups(const cpumask_t *cpu_map) | 7196 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) |
| 6495 | { | 7197 | { |
| 6496 | int cpu, i; | 7198 | int cpu, i; |
| 6497 | 7199 | ||
| @@ -6503,11 +7205,11 @@ static void free_sched_groups(const cpumask_t *cpu_map) | |||
| 6503 | continue; | 7205 | continue; |
| 6504 | 7206 | ||
| 6505 | for (i = 0; i < MAX_NUMNODES; i++) { | 7207 | for (i = 0; i < MAX_NUMNODES; i++) { |
| 6506 | cpumask_t nodemask = node_to_cpumask(i); | ||
| 6507 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 7208 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; |
| 6508 | 7209 | ||
| 6509 | cpus_and(nodemask, nodemask, *cpu_map); | 7210 | *nodemask = node_to_cpumask(i); |
| 6510 | if (cpus_empty(nodemask)) | 7211 | cpus_and(*nodemask, *nodemask, *cpu_map); |
| 7212 | if (cpus_empty(*nodemask)) | ||
| 6511 | continue; | 7213 | continue; |
| 6512 | 7214 | ||
| 6513 | if (sg == NULL) | 7215 | if (sg == NULL) |
| @@ -6525,7 +7227,7 @@ next_sg: | |||
| 6525 | } | 7227 | } |
| 6526 | } | 7228 | } |
| 6527 | #else | 7229 | #else |
| 6528 | static void free_sched_groups(const cpumask_t *cpu_map) | 7230 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) |
| 6529 | { | 7231 | { |
| 6530 | } | 7232 | } |
| 6531 | #endif | 7233 | #endif |
| @@ -6583,13 +7285,106 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 6583 | } | 7285 | } |
| 6584 | 7286 | ||
| 6585 | /* | 7287 | /* |
| 7288 | * Initializers for schedule domains | ||
| 7289 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | ||
| 7290 | */ | ||
| 7291 | |||
| 7292 | #define SD_INIT(sd, type) sd_init_##type(sd) | ||
| 7293 | #define SD_INIT_FUNC(type) \ | ||
| 7294 | static noinline void sd_init_##type(struct sched_domain *sd) \ | ||
| 7295 | { \ | ||
| 7296 | memset(sd, 0, sizeof(*sd)); \ | ||
| 7297 | *sd = SD_##type##_INIT; \ | ||
| 7298 | sd->level = SD_LV_##type; \ | ||
| 7299 | } | ||
| 7300 | |||
| 7301 | SD_INIT_FUNC(CPU) | ||
| 7302 | #ifdef CONFIG_NUMA | ||
| 7303 | SD_INIT_FUNC(ALLNODES) | ||
| 7304 | SD_INIT_FUNC(NODE) | ||
| 7305 | #endif | ||
| 7306 | #ifdef CONFIG_SCHED_SMT | ||
| 7307 | SD_INIT_FUNC(SIBLING) | ||
| 7308 | #endif | ||
| 7309 | #ifdef CONFIG_SCHED_MC | ||
| 7310 | SD_INIT_FUNC(MC) | ||
| 7311 | #endif | ||
| 7312 | |||
| 7313 | /* | ||
| 7314 | * To minimize stack usage, kmalloc room for cpumasks and share the | ||
| 7315 | * space as the usage in build_sched_domains() dictates. Used only | ||
| 7316 | * if the amount of space is significant. | ||
| 7317 | */ | ||
| 7318 | struct allmasks { | ||
| 7319 | cpumask_t tmpmask; /* make this one first */ | ||
| 7320 | union { | ||
| 7321 | cpumask_t nodemask; | ||
| 7322 | cpumask_t this_sibling_map; | ||
| 7323 | cpumask_t this_core_map; | ||
| 7324 | }; | ||
| 7325 | cpumask_t send_covered; | ||
| 7326 | |||
| 7327 | #ifdef CONFIG_NUMA | ||
| 7328 | cpumask_t domainspan; | ||
| 7329 | cpumask_t covered; | ||
| 7330 | cpumask_t notcovered; | ||
| 7331 | #endif | ||
| 7332 | }; | ||
| 7333 | |||
| 7334 | #if NR_CPUS > 128 | ||
| 7335 | #define SCHED_CPUMASK_ALLOC 1 | ||
| 7336 | #define SCHED_CPUMASK_FREE(v) kfree(v) | ||
| 7337 | #define SCHED_CPUMASK_DECLARE(v) struct allmasks *v | ||
| 7338 | #else | ||
| 7339 | #define SCHED_CPUMASK_ALLOC 0 | ||
| 7340 | #define SCHED_CPUMASK_FREE(v) | ||
| 7341 | #define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v | ||
| 7342 | #endif | ||
| 7343 | |||
| 7344 | #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ | ||
| 7345 | ((unsigned long)(a) + offsetof(struct allmasks, v)) | ||
| 7346 | |||
| 7347 | static int default_relax_domain_level = -1; | ||
| 7348 | |||
| 7349 | static int __init setup_relax_domain_level(char *str) | ||
| 7350 | { | ||
| 7351 | default_relax_domain_level = simple_strtoul(str, NULL, 0); | ||
| 7352 | return 1; | ||
| 7353 | } | ||
| 7354 | __setup("relax_domain_level=", setup_relax_domain_level); | ||
| 7355 | |||
| 7356 | static void set_domain_attribute(struct sched_domain *sd, | ||
| 7357 | struct sched_domain_attr *attr) | ||
| 7358 | { | ||
| 7359 | int request; | ||
| 7360 | |||
| 7361 | if (!attr || attr->relax_domain_level < 0) { | ||
| 7362 | if (default_relax_domain_level < 0) | ||
| 7363 | return; | ||
| 7364 | else | ||
| 7365 | request = default_relax_domain_level; | ||
| 7366 | } else | ||
| 7367 | request = attr->relax_domain_level; | ||
| 7368 | if (request < sd->level) { | ||
| 7369 | /* turn off idle balance on this domain */ | ||
| 7370 | sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); | ||
| 7371 | } else { | ||
| 7372 | /* turn on idle balance on this domain */ | ||
| 7373 | sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); | ||
| 7374 | } | ||
| 7375 | } | ||
| 7376 | |||
| 7377 | /* | ||
| 6586 | * Build sched domains for a given set of cpus and attach the sched domains | 7378 | * Build sched domains for a given set of cpus and attach the sched domains |
| 6587 | * to the individual cpus | 7379 | * to the individual cpus |
| 6588 | */ | 7380 | */ |
| 6589 | static int build_sched_domains(const cpumask_t *cpu_map) | 7381 | static int __build_sched_domains(const cpumask_t *cpu_map, |
| 7382 | struct sched_domain_attr *attr) | ||
| 6590 | { | 7383 | { |
| 6591 | int i; | 7384 | int i; |
| 6592 | struct root_domain *rd; | 7385 | struct root_domain *rd; |
| 7386 | SCHED_CPUMASK_DECLARE(allmasks); | ||
| 7387 | cpumask_t *tmpmask; | ||
| 6593 | #ifdef CONFIG_NUMA | 7388 | #ifdef CONFIG_NUMA |
| 6594 | struct sched_group **sched_group_nodes = NULL; | 7389 | struct sched_group **sched_group_nodes = NULL; |
| 6595 | int sd_allnodes = 0; | 7390 | int sd_allnodes = 0; |
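The SCHED_CPUMASK_VAR() macro introduced above turns a member name into a pointer into the shared allmasks block by adding offsetof(struct allmasks, v) to the block's base address, so the same code works whether the block was kmalloc'ed (NR_CPUS > 128) or declared on the stack. A self-contained userspace model of that addressing trick; struct scratch_masks, mask_t and SCRATCH_VAR are illustrative names, not the kernel's:

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

typedef struct { unsigned long bits[4]; } mask_t;       /* pretend this is big */

struct scratch_masks {
        mask_t tmpmask;                 /* make this one first, like allmasks */
        mask_t nodemask;
        mask_t covered;
};

/* same shape as SCHED_CPUMASK_VAR(v, a): member name -> pointer into block */
#define SCRATCH_VAR(v, a) mask_t *v = (mask_t *) \
        ((unsigned long)(a) + offsetof(struct scratch_masks, v))

int main(void)
{
        struct scratch_masks *all = malloc(sizeof(*all));       /* kmalloc stand-in */

        if (!all)
                return 1;

        SCRATCH_VAR(nodemask, all);
        SCRATCH_VAR(covered, all);

        /* both pointers alias members of the single allocation */
        printf("nodemask at +%zu, covered at +%zu\n",
               (size_t)((char *)nodemask - (char *)all),
               (size_t)((char *)covered - (char *)all));

        free(all);
        return 0;
}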
| @@ -6603,39 +7398,65 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6603 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 7398 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
| 6604 | return -ENOMEM; | 7399 | return -ENOMEM; |
| 6605 | } | 7400 | } |
| 6606 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | ||
| 6607 | #endif | 7401 | #endif |
| 6608 | 7402 | ||
| 6609 | rd = alloc_rootdomain(); | 7403 | rd = alloc_rootdomain(); |
| 6610 | if (!rd) { | 7404 | if (!rd) { |
| 6611 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 7405 | printk(KERN_WARNING "Cannot alloc root domain\n"); |
| 7406 | #ifdef CONFIG_NUMA | ||
| 7407 | kfree(sched_group_nodes); | ||
| 7408 | #endif | ||
| 6612 | return -ENOMEM; | 7409 | return -ENOMEM; |
| 6613 | } | 7410 | } |
| 6614 | 7411 | ||
| 7412 | #if SCHED_CPUMASK_ALLOC | ||
| 7413 | /* get space for all scratch cpumask variables */ | ||
| 7414 | allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); | ||
| 7415 | if (!allmasks) { | ||
| 7416 | printk(KERN_WARNING "Cannot alloc cpumask array\n"); | ||
| 7417 | kfree(rd); | ||
| 7418 | #ifdef CONFIG_NUMA | ||
| 7419 | kfree(sched_group_nodes); | ||
| 7420 | #endif | ||
| 7421 | return -ENOMEM; | ||
| 7422 | } | ||
| 7423 | #endif | ||
| 7424 | tmpmask = (cpumask_t *)allmasks; | ||
| 7425 | |||
| 7426 | |||
| 7427 | #ifdef CONFIG_NUMA | ||
| 7428 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | ||
| 7429 | #endif | ||
| 7430 | |||
| 6615 | /* | 7431 | /* |
| 6616 | * Set up domains for cpus specified by the cpu_map. | 7432 | * Set up domains for cpus specified by the cpu_map. |
| 6617 | */ | 7433 | */ |
| 6618 | for_each_cpu_mask(i, *cpu_map) { | 7434 | for_each_cpu_mask(i, *cpu_map) { |
| 6619 | struct sched_domain *sd = NULL, *p; | 7435 | struct sched_domain *sd = NULL, *p; |
| 6620 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | 7436 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
| 6621 | 7437 | ||
| 6622 | cpus_and(nodemask, nodemask, *cpu_map); | 7438 | *nodemask = node_to_cpumask(cpu_to_node(i)); |
| 7439 | cpus_and(*nodemask, *nodemask, *cpu_map); | ||
| 6623 | 7440 | ||
| 6624 | #ifdef CONFIG_NUMA | 7441 | #ifdef CONFIG_NUMA |
| 6625 | if (cpus_weight(*cpu_map) > | 7442 | if (cpus_weight(*cpu_map) > |
| 6626 | SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | 7443 | SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { |
| 6627 | sd = &per_cpu(allnodes_domains, i); | 7444 | sd = &per_cpu(allnodes_domains, i); |
| 6628 | *sd = SD_ALLNODES_INIT; | 7445 | SD_INIT(sd, ALLNODES); |
| 7446 | set_domain_attribute(sd, attr); | ||
| 6629 | sd->span = *cpu_map; | 7447 | sd->span = *cpu_map; |
| 6630 | cpu_to_allnodes_group(i, cpu_map, &sd->groups); | 7448 | sd->first_cpu = first_cpu(sd->span); |
| 7449 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); | ||
| 6631 | p = sd; | 7450 | p = sd; |
| 6632 | sd_allnodes = 1; | 7451 | sd_allnodes = 1; |
| 6633 | } else | 7452 | } else |
| 6634 | p = NULL; | 7453 | p = NULL; |
| 6635 | 7454 | ||
| 6636 | sd = &per_cpu(node_domains, i); | 7455 | sd = &per_cpu(node_domains, i); |
| 6637 | *sd = SD_NODE_INIT; | 7456 | SD_INIT(sd, NODE); |
| 6638 | sd->span = sched_domain_node_span(cpu_to_node(i)); | 7457 | set_domain_attribute(sd, attr); |
| 7458 | sched_domain_node_span(cpu_to_node(i), &sd->span); | ||
| 7459 | sd->first_cpu = first_cpu(sd->span); | ||
| 6639 | sd->parent = p; | 7460 | sd->parent = p; |
| 6640 | if (p) | 7461 | if (p) |
| 6641 | p->child = sd; | 7462 | p->child = sd; |
| @@ -6644,94 +7465,120 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6644 | 7465 | ||
| 6645 | p = sd; | 7466 | p = sd; |
| 6646 | sd = &per_cpu(phys_domains, i); | 7467 | sd = &per_cpu(phys_domains, i); |
| 6647 | *sd = SD_CPU_INIT; | 7468 | SD_INIT(sd, CPU); |
| 6648 | sd->span = nodemask; | 7469 | set_domain_attribute(sd, attr); |
| 7470 | sd->span = *nodemask; | ||
| 7471 | sd->first_cpu = first_cpu(sd->span); | ||
| 6649 | sd->parent = p; | 7472 | sd->parent = p; |
| 6650 | if (p) | 7473 | if (p) |
| 6651 | p->child = sd; | 7474 | p->child = sd; |
| 6652 | cpu_to_phys_group(i, cpu_map, &sd->groups); | 7475 | cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); |
| 6653 | 7476 | ||
| 6654 | #ifdef CONFIG_SCHED_MC | 7477 | #ifdef CONFIG_SCHED_MC |
| 6655 | p = sd; | 7478 | p = sd; |
| 6656 | sd = &per_cpu(core_domains, i); | 7479 | sd = &per_cpu(core_domains, i); |
| 6657 | *sd = SD_MC_INIT; | 7480 | SD_INIT(sd, MC); |
| 7481 | set_domain_attribute(sd, attr); | ||
| 6658 | sd->span = cpu_coregroup_map(i); | 7482 | sd->span = cpu_coregroup_map(i); |
| 7483 | sd->first_cpu = first_cpu(sd->span); | ||
| 6659 | cpus_and(sd->span, sd->span, *cpu_map); | 7484 | cpus_and(sd->span, sd->span, *cpu_map); |
| 6660 | sd->parent = p; | 7485 | sd->parent = p; |
| 6661 | p->child = sd; | 7486 | p->child = sd; |
| 6662 | cpu_to_core_group(i, cpu_map, &sd->groups); | 7487 | cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); |
| 6663 | #endif | 7488 | #endif |
| 6664 | 7489 | ||
| 6665 | #ifdef CONFIG_SCHED_SMT | 7490 | #ifdef CONFIG_SCHED_SMT |
| 6666 | p = sd; | 7491 | p = sd; |
| 6667 | sd = &per_cpu(cpu_domains, i); | 7492 | sd = &per_cpu(cpu_domains, i); |
| 6668 | *sd = SD_SIBLING_INIT; | 7493 | SD_INIT(sd, SIBLING); |
| 7494 | set_domain_attribute(sd, attr); | ||
| 6669 | sd->span = per_cpu(cpu_sibling_map, i); | 7495 | sd->span = per_cpu(cpu_sibling_map, i); |
| 7496 | sd->first_cpu = first_cpu(sd->span); | ||
| 6670 | cpus_and(sd->span, sd->span, *cpu_map); | 7497 | cpus_and(sd->span, sd->span, *cpu_map); |
| 6671 | sd->parent = p; | 7498 | sd->parent = p; |
| 6672 | p->child = sd; | 7499 | p->child = sd; |
| 6673 | cpu_to_cpu_group(i, cpu_map, &sd->groups); | 7500 | cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); |
| 6674 | #endif | 7501 | #endif |
| 6675 | } | 7502 | } |
| 6676 | 7503 | ||
| 6677 | #ifdef CONFIG_SCHED_SMT | 7504 | #ifdef CONFIG_SCHED_SMT |
| 6678 | /* Set up CPU (sibling) groups */ | 7505 | /* Set up CPU (sibling) groups */ |
| 6679 | for_each_cpu_mask(i, *cpu_map) { | 7506 | for_each_cpu_mask(i, *cpu_map) { |
| 6680 | cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i); | 7507 | SCHED_CPUMASK_VAR(this_sibling_map, allmasks); |
| 6681 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); | 7508 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
| 6682 | if (i != first_cpu(this_sibling_map)) | 7509 | |
| 7510 | *this_sibling_map = per_cpu(cpu_sibling_map, i); | ||
| 7511 | cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); | ||
| 7512 | if (i != first_cpu(*this_sibling_map)) | ||
| 6683 | continue; | 7513 | continue; |
| 6684 | 7514 | ||
| 6685 | init_sched_build_groups(this_sibling_map, cpu_map, | 7515 | init_sched_build_groups(this_sibling_map, cpu_map, |
| 6686 | &cpu_to_cpu_group); | 7516 | &cpu_to_cpu_group, |
| 7517 | send_covered, tmpmask); | ||
| 6687 | } | 7518 | } |
| 6688 | #endif | 7519 | #endif |
| 6689 | 7520 | ||
| 6690 | #ifdef CONFIG_SCHED_MC | 7521 | #ifdef CONFIG_SCHED_MC |
| 6691 | /* Set up multi-core groups */ | 7522 | /* Set up multi-core groups */ |
| 6692 | for_each_cpu_mask(i, *cpu_map) { | 7523 | for_each_cpu_mask(i, *cpu_map) { |
| 6693 | cpumask_t this_core_map = cpu_coregroup_map(i); | 7524 | SCHED_CPUMASK_VAR(this_core_map, allmasks); |
| 6694 | cpus_and(this_core_map, this_core_map, *cpu_map); | 7525 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
| 6695 | if (i != first_cpu(this_core_map)) | 7526 | |
| 7527 | *this_core_map = cpu_coregroup_map(i); | ||
| 7528 | cpus_and(*this_core_map, *this_core_map, *cpu_map); | ||
| 7529 | if (i != first_cpu(*this_core_map)) | ||
| 6696 | continue; | 7530 | continue; |
| 7531 | |||
| 6697 | init_sched_build_groups(this_core_map, cpu_map, | 7532 | init_sched_build_groups(this_core_map, cpu_map, |
| 6698 | &cpu_to_core_group); | 7533 | &cpu_to_core_group, |
| 7534 | send_covered, tmpmask); | ||
| 6699 | } | 7535 | } |
| 6700 | #endif | 7536 | #endif |
| 6701 | 7537 | ||
| 6702 | /* Set up physical groups */ | 7538 | /* Set up physical groups */ |
| 6703 | for (i = 0; i < MAX_NUMNODES; i++) { | 7539 | for (i = 0; i < MAX_NUMNODES; i++) { |
| 6704 | cpumask_t nodemask = node_to_cpumask(i); | 7540 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
| 7541 | SCHED_CPUMASK_VAR(send_covered, allmasks); | ||
| 6705 | 7542 | ||
| 6706 | cpus_and(nodemask, nodemask, *cpu_map); | 7543 | *nodemask = node_to_cpumask(i); |
| 6707 | if (cpus_empty(nodemask)) | 7544 | cpus_and(*nodemask, *nodemask, *cpu_map); |
| 7545 | if (cpus_empty(*nodemask)) | ||
| 6708 | continue; | 7546 | continue; |
| 6709 | 7547 | ||
| 6710 | init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); | 7548 | init_sched_build_groups(nodemask, cpu_map, |
| 7549 | &cpu_to_phys_group, | ||
| 7550 | send_covered, tmpmask); | ||
| 6711 | } | 7551 | } |
| 6712 | 7552 | ||
| 6713 | #ifdef CONFIG_NUMA | 7553 | #ifdef CONFIG_NUMA |
| 6714 | /* Set up node groups */ | 7554 | /* Set up node groups */ |
| 6715 | if (sd_allnodes) | 7555 | if (sd_allnodes) { |
| 6716 | init_sched_build_groups(*cpu_map, cpu_map, | 7556 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
| 6717 | &cpu_to_allnodes_group); | 7557 | |
| 7558 | init_sched_build_groups(cpu_map, cpu_map, | ||
| 7559 | &cpu_to_allnodes_group, | ||
| 7560 | send_covered, tmpmask); | ||
| 7561 | } | ||
| 6718 | 7562 | ||
| 6719 | for (i = 0; i < MAX_NUMNODES; i++) { | 7563 | for (i = 0; i < MAX_NUMNODES; i++) { |
| 6720 | /* Set up node groups */ | 7564 | /* Set up node groups */ |
| 6721 | struct sched_group *sg, *prev; | 7565 | struct sched_group *sg, *prev; |
| 6722 | cpumask_t nodemask = node_to_cpumask(i); | 7566 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
| 6723 | cpumask_t domainspan; | 7567 | SCHED_CPUMASK_VAR(domainspan, allmasks); |
| 6724 | cpumask_t covered = CPU_MASK_NONE; | 7568 | SCHED_CPUMASK_VAR(covered, allmasks); |
| 6725 | int j; | 7569 | int j; |
| 6726 | 7570 | ||
| 6727 | cpus_and(nodemask, nodemask, *cpu_map); | 7571 | *nodemask = node_to_cpumask(i); |
| 6728 | if (cpus_empty(nodemask)) { | 7572 | cpus_clear(*covered); |
| 7573 | |||
| 7574 | cpus_and(*nodemask, *nodemask, *cpu_map); | ||
| 7575 | if (cpus_empty(*nodemask)) { | ||
| 6729 | sched_group_nodes[i] = NULL; | 7576 | sched_group_nodes[i] = NULL; |
| 6730 | continue; | 7577 | continue; |
| 6731 | } | 7578 | } |
| 6732 | 7579 | ||
| 6733 | domainspan = sched_domain_node_span(i); | 7580 | sched_domain_node_span(i, domainspan); |
| 6734 | cpus_and(domainspan, domainspan, *cpu_map); | 7581 | cpus_and(*domainspan, *domainspan, *cpu_map); |
| 6735 | 7582 | ||
| 6736 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); | 7583 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); |
| 6737 | if (!sg) { | 7584 | if (!sg) { |
| @@ -6740,31 +7587,31 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6740 | goto error; | 7587 | goto error; |
| 6741 | } | 7588 | } |
| 6742 | sched_group_nodes[i] = sg; | 7589 | sched_group_nodes[i] = sg; |
| 6743 | for_each_cpu_mask(j, nodemask) { | 7590 | for_each_cpu_mask(j, *nodemask) { |
| 6744 | struct sched_domain *sd; | 7591 | struct sched_domain *sd; |
| 6745 | 7592 | ||
| 6746 | sd = &per_cpu(node_domains, j); | 7593 | sd = &per_cpu(node_domains, j); |
| 6747 | sd->groups = sg; | 7594 | sd->groups = sg; |
| 6748 | } | 7595 | } |
| 6749 | sg->__cpu_power = 0; | 7596 | sg->__cpu_power = 0; |
| 6750 | sg->cpumask = nodemask; | 7597 | sg->cpumask = *nodemask; |
| 6751 | sg->next = sg; | 7598 | sg->next = sg; |
| 6752 | cpus_or(covered, covered, nodemask); | 7599 | cpus_or(*covered, *covered, *nodemask); |
| 6753 | prev = sg; | 7600 | prev = sg; |
| 6754 | 7601 | ||
| 6755 | for (j = 0; j < MAX_NUMNODES; j++) { | 7602 | for (j = 0; j < MAX_NUMNODES; j++) { |
| 6756 | cpumask_t tmp, notcovered; | 7603 | SCHED_CPUMASK_VAR(notcovered, allmasks); |
| 6757 | int n = (i + j) % MAX_NUMNODES; | 7604 | int n = (i + j) % MAX_NUMNODES; |
| 7605 | node_to_cpumask_ptr(pnodemask, n); | ||
| 6758 | 7606 | ||
| 6759 | cpus_complement(notcovered, covered); | 7607 | cpus_complement(*notcovered, *covered); |
| 6760 | cpus_and(tmp, notcovered, *cpu_map); | 7608 | cpus_and(*tmpmask, *notcovered, *cpu_map); |
| 6761 | cpus_and(tmp, tmp, domainspan); | 7609 | cpus_and(*tmpmask, *tmpmask, *domainspan); |
| 6762 | if (cpus_empty(tmp)) | 7610 | if (cpus_empty(*tmpmask)) |
| 6763 | break; | 7611 | break; |
| 6764 | 7612 | ||
| 6765 | nodemask = node_to_cpumask(n); | 7613 | cpus_and(*tmpmask, *tmpmask, *pnodemask); |
| 6766 | cpus_and(tmp, tmp, nodemask); | 7614 | if (cpus_empty(*tmpmask)) |
| 6767 | if (cpus_empty(tmp)) | ||
| 6768 | continue; | 7615 | continue; |
| 6769 | 7616 | ||
| 6770 | sg = kmalloc_node(sizeof(struct sched_group), | 7617 | sg = kmalloc_node(sizeof(struct sched_group), |
| @@ -6775,9 +7622,9 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6775 | goto error; | 7622 | goto error; |
| 6776 | } | 7623 | } |
| 6777 | sg->__cpu_power = 0; | 7624 | sg->__cpu_power = 0; |
| 6778 | sg->cpumask = tmp; | 7625 | sg->cpumask = *tmpmask; |
| 6779 | sg->next = prev->next; | 7626 | sg->next = prev->next; |
| 6780 | cpus_or(covered, covered, tmp); | 7627 | cpus_or(*covered, *covered, *tmpmask); |
| 6781 | prev->next = sg; | 7628 | prev->next = sg; |
| 6782 | prev = sg; | 7629 | prev = sg; |
| 6783 | } | 7630 | } |
| @@ -6813,7 +7660,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6813 | if (sd_allnodes) { | 7660 | if (sd_allnodes) { |
| 6814 | struct sched_group *sg; | 7661 | struct sched_group *sg; |
| 6815 | 7662 | ||
| 6816 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); | 7663 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, |
| 7664 | tmpmask); | ||
| 6817 | init_numa_sched_groups_power(sg); | 7665 | init_numa_sched_groups_power(sg); |
| 6818 | } | 7666 | } |
| 6819 | #endif | 7667 | #endif |
| @@ -6831,17 +7679,26 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6831 | cpu_attach_domain(sd, rd, i); | 7679 | cpu_attach_domain(sd, rd, i); |
| 6832 | } | 7680 | } |
| 6833 | 7681 | ||
| 7682 | SCHED_CPUMASK_FREE((void *)allmasks); | ||
| 6834 | return 0; | 7683 | return 0; |
| 6835 | 7684 | ||
| 6836 | #ifdef CONFIG_NUMA | 7685 | #ifdef CONFIG_NUMA |
| 6837 | error: | 7686 | error: |
| 6838 | free_sched_groups(cpu_map); | 7687 | free_sched_groups(cpu_map, tmpmask); |
| 7688 | SCHED_CPUMASK_FREE((void *)allmasks); | ||
| 6839 | return -ENOMEM; | 7689 | return -ENOMEM; |
| 6840 | #endif | 7690 | #endif |
| 6841 | } | 7691 | } |
| 6842 | 7692 | ||
| 7693 | static int build_sched_domains(const cpumask_t *cpu_map) | ||
| 7694 | { | ||
| 7695 | return __build_sched_domains(cpu_map, NULL); | ||
| 7696 | } | ||
| 7697 | |||
| 6843 | static cpumask_t *doms_cur; /* current sched domains */ | 7698 | static cpumask_t *doms_cur; /* current sched domains */ |
| 6844 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ | 7699 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ |
| 7700 | static struct sched_domain_attr *dattr_cur; /* attributes of custom domains | ||
| 7701 | in 'doms_cur' */ | ||
| 6845 | 7702 | ||
| 6846 | /* | 7703 | /* |
| 6847 | * Special case: If a kmalloc of a doms_cur partition (array of | 7704 | * Special case: If a kmalloc of a doms_cur partition (array of |
| @@ -6869,15 +7726,17 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) | |||
| 6869 | if (!doms_cur) | 7726 | if (!doms_cur) |
| 6870 | doms_cur = &fallback_doms; | 7727 | doms_cur = &fallback_doms; |
| 6871 | cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); | 7728 | cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); |
| 7729 | dattr_cur = NULL; | ||
| 6872 | err = build_sched_domains(doms_cur); | 7730 | err = build_sched_domains(doms_cur); |
| 6873 | register_sched_domain_sysctl(); | 7731 | register_sched_domain_sysctl(); |
| 6874 | 7732 | ||
| 6875 | return err; | 7733 | return err; |
| 6876 | } | 7734 | } |
| 6877 | 7735 | ||
| 6878 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 7736 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map, |
| 7737 | cpumask_t *tmpmask) | ||
| 6879 | { | 7738 | { |
| 6880 | free_sched_groups(cpu_map); | 7739 | free_sched_groups(cpu_map, tmpmask); |
| 6881 | } | 7740 | } |
| 6882 | 7741 | ||
| 6883 | /* | 7742 | /* |
| @@ -6886,6 +7745,7 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | |||
| 6886 | */ | 7745 | */ |
| 6887 | static void detach_destroy_domains(const cpumask_t *cpu_map) | 7746 | static void detach_destroy_domains(const cpumask_t *cpu_map) |
| 6888 | { | 7747 | { |
| 7748 | cpumask_t tmpmask; | ||
| 6889 | int i; | 7749 | int i; |
| 6890 | 7750 | ||
| 6891 | unregister_sched_domain_sysctl(); | 7751 | unregister_sched_domain_sysctl(); |
| @@ -6893,7 +7753,23 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
| 6893 | for_each_cpu_mask(i, *cpu_map) | 7753 | for_each_cpu_mask(i, *cpu_map) |
| 6894 | cpu_attach_domain(NULL, &def_root_domain, i); | 7754 | cpu_attach_domain(NULL, &def_root_domain, i); |
| 6895 | synchronize_sched(); | 7755 | synchronize_sched(); |
| 6896 | arch_destroy_sched_domains(cpu_map); | 7756 | arch_destroy_sched_domains(cpu_map, &tmpmask); |
| 7757 | } | ||
| 7758 | |||
| 7759 | /* handle null as "default" */ | ||
| 7760 | static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | ||
| 7761 | struct sched_domain_attr *new, int idx_new) | ||
| 7762 | { | ||
| 7763 | struct sched_domain_attr tmp; | ||
| 7764 | |||
| 7765 | /* fast path */ | ||
| 7766 | if (!new && !cur) | ||
| 7767 | return 1; | ||
| 7768 | |||
| 7769 | tmp = SD_ATTR_INIT; | ||
| 7770 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | ||
| 7771 | new ? (new + idx_new) : &tmp, | ||
| 7772 | sizeof(struct sched_domain_attr)); | ||
| 6897 | } | 7773 | } |
| 6898 | 7774 | ||
| 6899 | /* | 7775 | /* |
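dattrs_equal() above treats a missing attribute array as "all defaults": if either side is NULL it compares against a freshly initialized SD_ATTR_INIT value, otherwise it memcmp()s the indexed entries. A userspace sketch of that NULL-means-default comparison; struct dom_attr with a single relax_level field is a stand-in for sched_domain_attr:

#include <stdio.h>
#include <string.h>

struct dom_attr {
        int relax_level;
};

/* handle NULL as "default", like dattrs_equal() */
static int attrs_equal(const struct dom_attr *cur, int icur,
                       const struct dom_attr *new, int inew)
{
        struct dom_attr def = { .relax_level = -1 };    /* SD_ATTR_INIT stand-in */

        if (!cur && !new)               /* fast path: both sides are defaults */
                return 1;

        return !memcmp(cur ? cur + icur : &def,
                       new ? new + inew : &def,
                       sizeof(struct dom_attr));
}

int main(void)
{
        struct dom_attr a[2] = { { .relax_level = -1 }, { .relax_level = 2 } };

        printf("%d\n", attrs_equal(a, 0, NULL, 0));     /* 1: entry 0 == default */
        printf("%d\n", attrs_equal(a, 1, NULL, 0));     /* 0: entry 1 differs    */
        return 0;
}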
| @@ -6917,7 +7793,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
| 6917 | * | 7793 | * |
| 6918 | * Call with hotplug lock held | 7794 | * Call with hotplug lock held |
| 6919 | */ | 7795 | */ |
| 6920 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) | 7796 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, |
| 7797 | struct sched_domain_attr *dattr_new) | ||
| 6921 | { | 7798 | { |
| 6922 | int i, j; | 7799 | int i, j; |
| 6923 | 7800 | ||
| @@ -6930,12 +7807,14 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) | |||
| 6930 | ndoms_new = 1; | 7807 | ndoms_new = 1; |
| 6931 | doms_new = &fallback_doms; | 7808 | doms_new = &fallback_doms; |
| 6932 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); | 7809 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); |
| 7810 | dattr_new = NULL; | ||
| 6933 | } | 7811 | } |
| 6934 | 7812 | ||
| 6935 | /* Destroy deleted domains */ | 7813 | /* Destroy deleted domains */ |
| 6936 | for (i = 0; i < ndoms_cur; i++) { | 7814 | for (i = 0; i < ndoms_cur; i++) { |
| 6937 | for (j = 0; j < ndoms_new; j++) { | 7815 | for (j = 0; j < ndoms_new; j++) { |
| 6938 | if (cpus_equal(doms_cur[i], doms_new[j])) | 7816 | if (cpus_equal(doms_cur[i], doms_new[j]) |
| 7817 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | ||
| 6939 | goto match1; | 7818 | goto match1; |
| 6940 | } | 7819 | } |
| 6941 | /* no match - a current sched domain not in new doms_new[] */ | 7820 | /* no match - a current sched domain not in new doms_new[] */ |
| @@ -6947,11 +7826,13 @@ match1: | |||
| 6947 | /* Build new domains */ | 7826 | /* Build new domains */ |
| 6948 | for (i = 0; i < ndoms_new; i++) { | 7827 | for (i = 0; i < ndoms_new; i++) { |
| 6949 | for (j = 0; j < ndoms_cur; j++) { | 7828 | for (j = 0; j < ndoms_cur; j++) { |
| 6950 | if (cpus_equal(doms_new[i], doms_cur[j])) | 7829 | if (cpus_equal(doms_new[i], doms_cur[j]) |
| 7830 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | ||
| 6951 | goto match2; | 7831 | goto match2; |
| 6952 | } | 7832 | } |
| 6953 | /* no match - add a new doms_new */ | 7833 | /* no match - add a new doms_new */ |
| 6954 | build_sched_domains(doms_new + i); | 7834 | __build_sched_domains(doms_new + i, |
| 7835 | dattr_new ? dattr_new + i : NULL); | ||
| 6955 | match2: | 7836 | match2: |
| 6956 | ; | 7837 | ; |
| 6957 | } | 7838 | } |
| @@ -6959,7 +7840,9 @@ match2: | |||
| 6959 | /* Remember the new sched domains */ | 7840 | /* Remember the new sched domains */ |
| 6960 | if (doms_cur != &fallback_doms) | 7841 | if (doms_cur != &fallback_doms) |
| 6961 | kfree(doms_cur); | 7842 | kfree(doms_cur); |
| 7843 | kfree(dattr_cur); /* kfree(NULL) is safe */ | ||
| 6962 | doms_cur = doms_new; | 7844 | doms_cur = doms_new; |
| 7845 | dattr_cur = dattr_new; | ||
| 6963 | ndoms_cur = ndoms_new; | 7846 | ndoms_cur = ndoms_new; |
| 6964 | 7847 | ||
| 6965 | register_sched_domain_sysctl(); | 7848 | register_sched_domain_sysctl(); |
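The match1/match2 passes keep rebuilds to a minimum: the first loop destroys only those current domains with no equal entry (cpumask and, with this patch, attributes) in the new set, the second builds only new domains with no equal entry in the current set, and anything that matches in both directions is left untouched. A small standalone model of that two-pass reconciliation, with plain integers standing in for the cpumask/attribute pairs:

#include <stdio.h>

/* reconcile 'cur' (existing domains) with 'new' (requested domains):
 * tear down what disappeared, build what is new, keep the rest. */
static void reconcile(const int *cur, int ncur, const int *new, int nnew)
{
        for (int i = 0; i < ncur; i++) {
                int keep = 0;
                for (int j = 0; j < nnew; j++)
                        if (cur[i] == new[j])
                                keep = 1;
                if (!keep)
                        printf("destroy %d\n", cur[i]);
        }
        for (int i = 0; i < nnew; i++) {
                int exists = 0;
                for (int j = 0; j < ncur; j++)
                        if (new[i] == cur[j])
                                exists = 1;
                if (!exists)
                        printf("build %d\n", new[i]);
        }
}

int main(void)
{
        int cur[] = { 1, 2, 3 };
        int new[] = { 2, 3, 4 };

        reconcile(cur, 3, new, 3);      /* -> destroy 1, build 4 */
        return 0;
}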
| @@ -7086,6 +7969,11 @@ void __init sched_init_smp(void) | |||
| 7086 | { | 7969 | { |
| 7087 | cpumask_t non_isolated_cpus; | 7970 | cpumask_t non_isolated_cpus; |
| 7088 | 7971 | ||
| 7972 | #if defined(CONFIG_NUMA) | ||
| 7973 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | ||
| 7974 | GFP_KERNEL); | ||
| 7975 | BUG_ON(sched_group_nodes_bycpu == NULL); | ||
| 7976 | #endif | ||
| 7089 | get_online_cpus(); | 7977 | get_online_cpus(); |
| 7090 | arch_init_sched_domains(&cpu_online_map); | 7978 | arch_init_sched_domains(&cpu_online_map); |
| 7091 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); | 7979 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); |
| @@ -7096,13 +7984,18 @@ void __init sched_init_smp(void) | |||
| 7096 | hotcpu_notifier(update_sched_domains, 0); | 7984 | hotcpu_notifier(update_sched_domains, 0); |
| 7097 | 7985 | ||
| 7098 | /* Move init over to a non-isolated CPU */ | 7986 | /* Move init over to a non-isolated CPU */ |
| 7099 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 7987 | if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) |
| 7100 | BUG(); | 7988 | BUG(); |
| 7101 | sched_init_granularity(); | 7989 | sched_init_granularity(); |
| 7102 | } | 7990 | } |
| 7103 | #else | 7991 | #else |
| 7104 | void __init sched_init_smp(void) | 7992 | void __init sched_init_smp(void) |
| 7105 | { | 7993 | { |
| 7994 | #if defined(CONFIG_NUMA) | ||
| 7995 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | ||
| 7996 | GFP_KERNEL); | ||
| 7997 | BUG_ON(sched_group_nodes_bycpu == NULL); | ||
| 7998 | #endif | ||
| 7106 | sched_init_granularity(); | 7999 | sched_init_granularity(); |
| 7107 | } | 8000 | } |
| 7108 | #endif /* CONFIG_SMP */ | 8001 | #endif /* CONFIG_SMP */ |
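sched_group_nodes_bycpu stops being a static NR_CPUS-sized array and is instead allocated with nr_cpu_ids entries at sched_init_smp() time, so kernels built with a huge NR_CPUS only pay for the CPUs the machine actually has. A trivial userspace illustration of the size difference; 4096 and 8 are made-up numbers for the example:

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4096                    /* compile-time maximum (illustrative) */

int main(void)
{
        int nr_cpu_ids = 8;             /* CPUs this boot actually found */

        printf("static NR_CPUS array: %zu bytes, nr_cpu_ids allocation: %zu bytes\n",
               NR_CPUS * sizeof(void *), nr_cpu_ids * sizeof(void *));

        /* the runtime-sized replacement for the old static array */
        void **bycpu = calloc(nr_cpu_ids, sizeof(void *));
        if (!bycpu)
                return 1;
        free(bycpu);
        return 0;
}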
| @@ -7117,6 +8010,7 @@ int in_sched_functions(unsigned long addr) | |||
| 7117 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | 8010 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) |
| 7118 | { | 8011 | { |
| 7119 | cfs_rq->tasks_timeline = RB_ROOT; | 8012 | cfs_rq->tasks_timeline = RB_ROOT; |
| 8013 | INIT_LIST_HEAD(&cfs_rq->tasks); | ||
| 7120 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8014 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7121 | cfs_rq->rq = rq; | 8015 | cfs_rq->rq = rq; |
| 7122 | #endif | 8016 | #endif |
| @@ -7146,6 +8040,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
| 7146 | 8040 | ||
| 7147 | rt_rq->rt_time = 0; | 8041 | rt_rq->rt_time = 0; |
| 7148 | rt_rq->rt_throttled = 0; | 8042 | rt_rq->rt_throttled = 0; |
| 8043 | rt_rq->rt_runtime = 0; | ||
| 8044 | spin_lock_init(&rt_rq->rt_runtime_lock); | ||
| 7149 | 8045 | ||
| 7150 | #ifdef CONFIG_RT_GROUP_SCHED | 8046 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7151 | rt_rq->rt_nr_boosted = 0; | 8047 | rt_rq->rt_nr_boosted = 0; |
| @@ -7154,10 +8050,11 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
| 7154 | } | 8050 | } |
| 7155 | 8051 | ||
| 7156 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8052 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7157 | static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | 8053 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
| 7158 | struct cfs_rq *cfs_rq, struct sched_entity *se, | 8054 | struct sched_entity *se, int cpu, int add, |
| 7159 | int cpu, int add) | 8055 | struct sched_entity *parent) |
| 7160 | { | 8056 | { |
| 8057 | struct rq *rq = cpu_rq(cpu); | ||
| 7161 | tg->cfs_rq[cpu] = cfs_rq; | 8058 | tg->cfs_rq[cpu] = cfs_rq; |
| 7162 | init_cfs_rq(cfs_rq, rq); | 8059 | init_cfs_rq(cfs_rq, rq); |
| 7163 | cfs_rq->tg = tg; | 8060 | cfs_rq->tg = tg; |
| @@ -7165,45 +8062,132 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | |||
| 7165 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 8062 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
| 7166 | 8063 | ||
| 7167 | tg->se[cpu] = se; | 8064 | tg->se[cpu] = se; |
| 7168 | se->cfs_rq = &rq->cfs; | 8065 | /* se could be NULL for init_task_group */ |
| 8066 | if (!se) | ||
| 8067 | return; | ||
| 8068 | |||
| 8069 | if (!parent) | ||
| 8070 | se->cfs_rq = &rq->cfs; | ||
| 8071 | else | ||
| 8072 | se->cfs_rq = parent->my_q; | ||
| 8073 | |||
| 7169 | se->my_q = cfs_rq; | 8074 | se->my_q = cfs_rq; |
| 7170 | se->load.weight = tg->shares; | 8075 | se->load.weight = tg->shares; |
| 7171 | se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); | 8076 | se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); |
| 7172 | se->parent = NULL; | 8077 | se->parent = parent; |
| 7173 | } | 8078 | } |
| 7174 | #endif | 8079 | #endif |
| 7175 | 8080 | ||
| 7176 | #ifdef CONFIG_RT_GROUP_SCHED | 8081 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7177 | static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, | 8082 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, |
| 7178 | struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, | 8083 | struct sched_rt_entity *rt_se, int cpu, int add, |
| 7179 | int cpu, int add) | 8084 | struct sched_rt_entity *parent) |
| 7180 | { | 8085 | { |
| 8086 | struct rq *rq = cpu_rq(cpu); | ||
| 8087 | |||
| 7181 | tg->rt_rq[cpu] = rt_rq; | 8088 | tg->rt_rq[cpu] = rt_rq; |
| 7182 | init_rt_rq(rt_rq, rq); | 8089 | init_rt_rq(rt_rq, rq); |
| 7183 | rt_rq->tg = tg; | 8090 | rt_rq->tg = tg; |
| 7184 | rt_rq->rt_se = rt_se; | 8091 | rt_rq->rt_se = rt_se; |
| 8092 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
| 7185 | if (add) | 8093 | if (add) |
| 7186 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | 8094 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); |
| 7187 | 8095 | ||
| 7188 | tg->rt_se[cpu] = rt_se; | 8096 | tg->rt_se[cpu] = rt_se; |
| 8097 | if (!rt_se) | ||
| 8098 | return; | ||
| 8099 | |||
| 8100 | if (!parent) | ||
| 8101 | rt_se->rt_rq = &rq->rt; | ||
| 8102 | else | ||
| 8103 | rt_se->rt_rq = parent->my_q; | ||
| 8104 | |||
| 7189 | rt_se->rt_rq = &rq->rt; | ||
| 7190 | rt_se->my_q = rt_rq; | 8106 | rt_se->my_q = rt_rq; |
| 7191 | rt_se->parent = NULL; | 8107 | rt_se->parent = parent; |
| 7192 | INIT_LIST_HEAD(&rt_se->run_list); | 8108 | INIT_LIST_HEAD(&rt_se->run_list); |
| 7193 | } | 8109 | } |
| 7194 | #endif | 8110 | #endif |
| 7195 | 8111 | ||
| 7196 | void __init sched_init(void) | 8112 | void __init sched_init(void) |
| 7197 | { | 8113 | { |
| 7198 | int highest_cpu = 0; | ||
| 7199 | int i, j; | 8114 | int i, j; |
| 8115 | unsigned long alloc_size = 0, ptr; | ||
| 8116 | |||
| 8117 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 8118 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | ||
| 8119 | #endif | ||
| 8120 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 8121 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | ||
| 8122 | #endif | ||
| 8123 | #ifdef CONFIG_USER_SCHED | ||
| 8124 | alloc_size *= 2; | ||
| 8125 | #endif | ||
| 8126 | /* | ||
| 8127 | * As sched_init() is called before page_alloc is set up, | ||
| 8128 | * we use alloc_bootmem(). | ||
| 8129 | */ | ||
| 8130 | if (alloc_size) { | ||
| 8131 | ptr = (unsigned long)alloc_bootmem_low(alloc_size); | ||
| 8132 | |||
| 8133 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 8134 | init_task_group.se = (struct sched_entity **)ptr; | ||
| 8135 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 8136 | |||
| 8137 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; | ||
| 8138 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 8139 | |||
| 8140 | #ifdef CONFIG_USER_SCHED | ||
| 8141 | root_task_group.se = (struct sched_entity **)ptr; | ||
| 8142 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 8143 | |||
| 8144 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; | ||
| 8145 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 8146 | #endif | ||
| 8147 | #endif | ||
| 8148 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 8149 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | ||
| 8150 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 8151 | |||
| 8152 | init_task_group.rt_rq = (struct rt_rq **)ptr; | ||
| 8153 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 8154 | |||
| 8155 | #ifdef CONFIG_USER_SCHED | ||
| 8156 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; | ||
| 8157 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 8158 | |||
| 8159 | root_task_group.rt_rq = (struct rt_rq **)ptr; | ||
| 8160 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 8161 | #endif | ||
| 8162 | #endif | ||
| 8163 | } | ||
| 7200 | 8164 | ||
| 7201 | #ifdef CONFIG_SMP | 8165 | #ifdef CONFIG_SMP |
| 8166 | init_aggregate(); | ||
| 7202 | init_defrootdomain(); | 8167 | init_defrootdomain(); |
| 7203 | #endif | 8168 | #endif |
| 7204 | 8169 | ||
| 8170 | init_rt_bandwidth(&def_rt_bandwidth, | ||
| 8171 | global_rt_period(), global_rt_runtime()); | ||
| 8172 | |||
| 8173 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 8174 | init_rt_bandwidth(&init_task_group.rt_bandwidth, | ||
| 8175 | global_rt_period(), global_rt_runtime()); | ||
| 8176 | #ifdef CONFIG_USER_SCHED | ||
| 8177 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | ||
| 8178 | global_rt_period(), RUNTIME_INF); | ||
| 8179 | #endif | ||
| 8180 | #endif | ||
| 8181 | |||
| 7205 | #ifdef CONFIG_GROUP_SCHED | 8182 | #ifdef CONFIG_GROUP_SCHED |
| 7206 | list_add(&init_task_group.list, &task_groups); | 8183 | list_add(&init_task_group.list, &task_groups); |
| 8184 | INIT_LIST_HEAD(&init_task_group.children); | ||
| 8185 | |||
| 8186 | #ifdef CONFIG_USER_SCHED | ||
| 8187 | INIT_LIST_HEAD(&root_task_group.children); | ||
| 8188 | init_task_group.parent = &root_task_group; | ||
| 8189 | list_add(&init_task_group.siblings, &root_task_group.children); | ||
| 8190 | #endif | ||
| 7207 | #endif | 8191 | #endif |
| 7208 | 8192 | ||
| 7209 | for_each_possible_cpu(i) { | 8193 | for_each_possible_cpu(i) { |
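sched_init() above adds up the sizes of all the per-cpu pointer arrays the task groups will need, takes a single alloc_bootmem_low() block, and carves it up by advancing an offset past each array in turn. A userspace model of that carving; malloc() stands in for alloc_bootmem_low(), and the four array names mirror the se/cfs_rq/rt_se/rt_rq slots but are otherwise illustrative:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        int nr_cpu_ids = 4;                     /* runtime CPU count stand-in */
        size_t alloc_size = 0;
        unsigned long ptr;

        /* two pointer arrays for the fair class, two for the rt class */
        alloc_size += 2 * nr_cpu_ids * sizeof(void **);
        alloc_size += 2 * nr_cpu_ids * sizeof(void **);

        ptr = (unsigned long)malloc(alloc_size);        /* alloc_bootmem_low() stand-in */
        if (!ptr)
                return 1;

        void **se     = (void **)ptr; ptr += nr_cpu_ids * sizeof(void **);
        void **cfs_rq = (void **)ptr; ptr += nr_cpu_ids * sizeof(void **);
        void **rt_se  = (void **)ptr; ptr += nr_cpu_ids * sizeof(void **);
        void **rt_rq  = (void **)ptr; ptr += nr_cpu_ids * sizeof(void **);

        /* four arrays, one allocation, laid out back to back */
        printf("se=%p cfs_rq=%p rt_se=%p rt_rq=%p\n",
               (void *)se, (void *)cfs_rq, (void *)rt_se, (void *)rt_rq);

        free(se);                               /* se is the base of the block */
        return 0;
}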
| @@ -7214,26 +8198,68 @@ void __init sched_init(void) | |||
| 7214 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | 8198 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); |
| 7215 | rq->nr_running = 0; | 8199 | rq->nr_running = 0; |
| 7216 | rq->clock = 1; | 8200 | rq->clock = 1; |
| 8201 | update_last_tick_seen(rq); | ||
| 7217 | init_cfs_rq(&rq->cfs, rq); | 8202 | init_cfs_rq(&rq->cfs, rq); |
| 7218 | init_rt_rq(&rq->rt, rq); | 8203 | init_rt_rq(&rq->rt, rq); |
| 7219 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8204 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7220 | init_task_group.shares = init_task_group_load; | 8205 | init_task_group.shares = init_task_group_load; |
| 7221 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 8206 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
| 7222 | init_tg_cfs_entry(rq, &init_task_group, | 8207 | #ifdef CONFIG_CGROUP_SCHED |
| 8208 | /* | ||
| 8209 | * How much cpu bandwidth does init_task_group get? | ||
| 8210 | * | ||
| 8211 | * In case of task-groups formed through the cgroup filesystem, it | ||
| 8212 | * gets 100% of the cpu resources in the system. This overall | ||
| 8213 | * system cpu resource is divided among the tasks of | ||
| 8214 | * init_task_group and its child task-groups in a fair manner, | ||
| 8215 | * based on each entity's (task or task-group's) weight | ||
| 8216 | * (se->load.weight). | ||
| 8217 | * | ||
| 8218 | * In other words, if init_task_group has 10 tasks (each of weight | ||
| 8219 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | ||
| 8220 | * then A0's share of the cpu resource is: | ||
| 8221 | * | ||
| 8222 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | ||
| 8223 | * | ||
| 8224 | * We achieve this by letting init_task_group's tasks sit | ||
| 8225 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | ||
| 8226 | */ | ||
| 8227 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); | ||
| 8228 | #elif defined CONFIG_USER_SCHED | ||
| 8229 | root_task_group.shares = NICE_0_LOAD; | ||
| 8230 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); | ||
| 8231 | /* | ||
| 8232 | * In case of task-groups formed through the user id of tasks, | ||
| 8233 | * init_task_group represents tasks belonging to root user. | ||
| 8234 | * Hence it forms a sibling of all subsequent groups formed. | ||
| 8235 | * In this case, init_task_group gets only a fraction of overall | ||
| 8236 | * system cpu resource, based on the weight assigned to root | ||
| 8237 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | ||
| 8238 | * by letting tasks of init_task_group sit in a separate cfs_rq | ||
| 8239 | * (init_cfs_rq) and having one entity represent this group of | ||
| 8240 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | ||
| 8241 | */ | ||
| 8242 | init_tg_cfs_entry(&init_task_group, | ||
| 7223 | &per_cpu(init_cfs_rq, i), | 8243 | &per_cpu(init_cfs_rq, i), |
| 7224 | &per_cpu(init_sched_entity, i), i, 1); | 8244 | &per_cpu(init_sched_entity, i), i, 1, |
| 8245 | root_task_group.se[i]); | ||
| 7225 | 8246 | ||
| 7226 | #endif | 8247 | #endif |
| 8248 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 8249 | |||
| 8250 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | ||
| 7227 | #ifdef CONFIG_RT_GROUP_SCHED | 8251 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7228 | init_task_group.rt_runtime = | ||
| 7229 | sysctl_sched_rt_runtime * NSEC_PER_USEC; | ||
| 7230 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 8252 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
| 7231 | init_tg_rt_entry(rq, &init_task_group, | 8253 | #ifdef CONFIG_CGROUP_SCHED |
| 8254 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); | ||
| 8255 | #elif defined CONFIG_USER_SCHED | ||
| 8256 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); | ||
| 8257 | init_tg_rt_entry(&init_task_group, | ||
| 7232 | &per_cpu(init_rt_rq, i), | 8258 | &per_cpu(init_rt_rq, i), |
| 7233 | &per_cpu(init_sched_rt_entity, i), i, 1); | 8259 | &per_cpu(init_sched_rt_entity, i), i, 1, |
| 8260 | root_task_group.rt_se[i]); | ||
| 8261 | #endif | ||
| 7234 | #endif | 8262 | #endif |
| 7235 | rq->rt_period_expire = 0; | ||
| 7236 | rq->rt_throttled = 0; | ||
| 7237 | 8263 | ||
| 7238 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 8264 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
| 7239 | rq->cpu_load[j] = 0; | 8265 | rq->cpu_load[j] = 0; |
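With the new parent argument, init_tg_cfs_entry()/init_tg_rt_entry() hook a group's per-cpu entity either straight into the CPU's root runqueue (parent == NULL, as for init_task_group under CONFIG_CGROUP_SCHED) or into the parent entity's own queue, and a NULL se simply means the group's tasks sit directly in rq->cfs. A minimal sketch of that parent-or-root decision with toy types; struct queue, struct entity and the field names are invented for the example:

#include <stdio.h>

struct queue { const char *name; };

struct entity {
        struct queue *attached_to;      /* which queue this entity sits on */
        struct queue *my_q;             /* the queue this entity owns      */
};

/* attach 'se' under 'parent', or to the root queue if there is no parent */
static void init_entry(struct entity *se, struct entity *parent,
                       struct queue *root)
{
        if (!se)                        /* e.g. init_task_group under CGROUP_SCHED */
                return;
        se->attached_to = parent ? parent->my_q : root;
}

int main(void)
{
        struct queue root = { "rq->cfs" }, parent_q = { "parent->my_q" };
        struct entity parent = { .my_q = &parent_q };
        struct entity child = { 0 }, top = { 0 };

        init_entry(&top, NULL, &root);
        init_entry(&child, &parent, &root);
        printf("top on %s, child on %s\n",
               top.attached_to->name, child.attached_to->name);
        return 0;
}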
| @@ -7250,7 +8276,6 @@ void __init sched_init(void) | |||
| 7250 | #endif | 8276 | #endif |
| 7251 | init_rq_hrtick(rq); | 8277 | init_rq_hrtick(rq); |
| 7252 | atomic_set(&rq->nr_iowait, 0); | 8278 | atomic_set(&rq->nr_iowait, 0); |
| 7253 | highest_cpu = i; | ||
| 7254 | } | 8279 | } |
| 7255 | 8280 | ||
| 7256 | set_load_weight(&init_task); | 8281 | set_load_weight(&init_task); |
| @@ -7260,7 +8285,6 @@ void __init sched_init(void) | |||
| 7260 | #endif | 8285 | #endif |
| 7261 | 8286 | ||
| 7262 | #ifdef CONFIG_SMP | 8287 | #ifdef CONFIG_SMP |
| 7263 | nr_cpu_ids = highest_cpu + 1; | ||
| 7264 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); | 8288 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); |
| 7265 | #endif | 8289 | #endif |
| 7266 | 8290 | ||
| @@ -7419,8 +8443,6 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
| 7419 | 8443 | ||
| 7420 | #endif | 8444 | #endif |
| 7421 | 8445 | ||
| 7422 | #ifdef CONFIG_GROUP_SCHED | ||
| 7423 | |||
| 7424 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8446 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7425 | static void free_fair_sched_group(struct task_group *tg) | 8447 | static void free_fair_sched_group(struct task_group *tg) |
| 7426 | { | 8448 | { |
| @@ -7437,17 +8459,18 @@ static void free_fair_sched_group(struct task_group *tg) | |||
| 7437 | kfree(tg->se); | 8459 | kfree(tg->se); |
| 7438 | } | 8460 | } |
| 7439 | 8461 | ||
| 7440 | static int alloc_fair_sched_group(struct task_group *tg) | 8462 | static |
| 8463 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
| 7441 | { | 8464 | { |
| 7442 | struct cfs_rq *cfs_rq; | 8465 | struct cfs_rq *cfs_rq; |
| 7443 | struct sched_entity *se; | 8466 | struct sched_entity *se, *parent_se; |
| 7444 | struct rq *rq; | 8467 | struct rq *rq; |
| 7445 | int i; | 8468 | int i; |
| 7446 | 8469 | ||
| 7447 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); | 8470 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); |
| 7448 | if (!tg->cfs_rq) | 8471 | if (!tg->cfs_rq) |
| 7449 | goto err; | 8472 | goto err; |
| 7450 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); | 8473 | tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); |
| 7451 | if (!tg->se) | 8474 | if (!tg->se) |
| 7452 | goto err; | 8475 | goto err; |
| 7453 | 8476 | ||
| @@ -7466,7 +8489,8 @@ static int alloc_fair_sched_group(struct task_group *tg) | |||
| 7466 | if (!se) | 8489 | if (!se) |
| 7467 | goto err; | 8490 | goto err; |
| 7468 | 8491 | ||
| 7469 | init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); | 8492 | parent_se = parent ? parent->se[i] : NULL; |
| 8493 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); | ||
| 7470 | } | 8494 | } |
| 7471 | 8495 | ||
| 7472 | return 1; | 8496 | return 1; |
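The reworked alloc_fair_sched_group() sizes its per-CPU pointer arrays by nr_cpu_ids instead of NR_CPUS and points each newly allocated entity at the matching entity of the parent group. A rough user-space sketch of that allocation pattern — the struct names and the group_alloc() helper are stand-ins, not kernel types:

#include <stdio.h>
#include <stdlib.h>

struct entity { struct entity *parent; };

struct group {
	struct group  *parent;
	struct entity **se;	/* one entity pointer per CPU */
};

/* allocate per-CPU entities, chaining each to the parent's entity */
static struct group *group_alloc(struct group *parent, int nr_cpu_ids)
{
	struct group *tg = calloc(1, sizeof(*tg));
	int i;

	if (!tg)
		return NULL;
	tg->parent = parent;
	tg->se = calloc(nr_cpu_ids, sizeof(*tg->se));
	if (!tg->se)
		goto err;

	for (i = 0; i < nr_cpu_ids; i++) {
		tg->se[i] = calloc(1, sizeof(*tg->se[i]));
		if (!tg->se[i])
			goto err;
		tg->se[i]->parent = parent ? parent->se[i] : NULL;
	}
	return tg;
err:
	/* a real implementation would also free partial allocations */
	free(tg->se);
	free(tg);
	return NULL;
}

int main(void)
{
	struct group *root = group_alloc(NULL, 4);
	struct group *child = group_alloc(root, 4);

	printf("child cpu0 parent se == root cpu0 se: %d\n",
	       child->se[0]->parent == root->se[0]);
	return 0;
}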
| @@ -7490,7 +8514,8 @@ static inline void free_fair_sched_group(struct task_group *tg) | |||
| 7490 | { | 8514 | { |
| 7491 | } | 8515 | } |
| 7492 | 8516 | ||
| 7493 | static inline int alloc_fair_sched_group(struct task_group *tg) | 8517 | static inline |
| 8518 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
| 7494 | { | 8519 | { |
| 7495 | return 1; | 8520 | return 1; |
| 7496 | } | 8521 | } |
| @@ -7509,6 +8534,8 @@ static void free_rt_sched_group(struct task_group *tg) | |||
| 7509 | { | 8534 | { |
| 7510 | int i; | 8535 | int i; |
| 7511 | 8536 | ||
| 8537 | destroy_rt_bandwidth(&tg->rt_bandwidth); | ||
| 8538 | |||
| 7512 | for_each_possible_cpu(i) { | 8539 | for_each_possible_cpu(i) { |
| 7513 | if (tg->rt_rq) | 8540 | if (tg->rt_rq) |
| 7514 | kfree(tg->rt_rq[i]); | 8541 | kfree(tg->rt_rq[i]); |
| @@ -7520,21 +8547,23 @@ static void free_rt_sched_group(struct task_group *tg) | |||
| 7520 | kfree(tg->rt_se); | 8547 | kfree(tg->rt_se); |
| 7521 | } | 8548 | } |
| 7522 | 8549 | ||
| 7523 | static int alloc_rt_sched_group(struct task_group *tg) | 8550 | static |
| 8551 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
| 7524 | { | 8552 | { |
| 7525 | struct rt_rq *rt_rq; | 8553 | struct rt_rq *rt_rq; |
| 7526 | struct sched_rt_entity *rt_se; | 8554 | struct sched_rt_entity *rt_se, *parent_se; |
| 7527 | struct rq *rq; | 8555 | struct rq *rq; |
| 7528 | int i; | 8556 | int i; |
| 7529 | 8557 | ||
| 7530 | tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); | 8558 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); |
| 7531 | if (!tg->rt_rq) | 8559 | if (!tg->rt_rq) |
| 7532 | goto err; | 8560 | goto err; |
| 7533 | tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); | 8561 | tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); |
| 7534 | if (!tg->rt_se) | 8562 | if (!tg->rt_se) |
| 7535 | goto err; | 8563 | goto err; |
| 7536 | 8564 | ||
| 7537 | tg->rt_runtime = 0; | 8565 | init_rt_bandwidth(&tg->rt_bandwidth, |
| 8566 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | ||
| 7538 | 8567 | ||
| 7539 | for_each_possible_cpu(i) { | 8568 | for_each_possible_cpu(i) { |
| 7540 | rq = cpu_rq(i); | 8569 | rq = cpu_rq(i); |
| @@ -7549,7 +8578,8 @@ static int alloc_rt_sched_group(struct task_group *tg) | |||
| 7549 | if (!rt_se) | 8578 | if (!rt_se) |
| 7550 | goto err; | 8579 | goto err; |
| 7551 | 8580 | ||
| 7552 | init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); | 8581 | parent_se = parent ? parent->rt_se[i] : NULL; |
| 8582 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); | ||
| 7553 | } | 8583 | } |
| 7554 | 8584 | ||
| 7555 | return 1; | 8585 | return 1; |
| @@ -7573,7 +8603,8 @@ static inline void free_rt_sched_group(struct task_group *tg) | |||
| 7573 | { | 8603 | { |
| 7574 | } | 8604 | } |
| 7575 | 8605 | ||
| 7576 | static inline int alloc_rt_sched_group(struct task_group *tg) | 8606 | static inline |
| 8607 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
| 7577 | { | 8608 | { |
| 7578 | return 1; | 8609 | return 1; |
| 7579 | } | 8610 | } |
| @@ -7587,6 +8618,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | |||
| 7587 | } | 8618 | } |
| 7588 | #endif | 8619 | #endif |
| 7589 | 8620 | ||
| 8621 | #ifdef CONFIG_GROUP_SCHED | ||
| 7590 | static void free_sched_group(struct task_group *tg) | 8622 | static void free_sched_group(struct task_group *tg) |
| 7591 | { | 8623 | { |
| 7592 | free_fair_sched_group(tg); | 8624 | free_fair_sched_group(tg); |
| @@ -7595,7 +8627,7 @@ static void free_sched_group(struct task_group *tg) | |||
| 7595 | } | 8627 | } |
| 7596 | 8628 | ||
| 7597 | /* allocate runqueue etc for a new task group */ | 8629 | /* allocate runqueue etc for a new task group */ |
| 7598 | struct task_group *sched_create_group(void) | 8630 | struct task_group *sched_create_group(struct task_group *parent) |
| 7599 | { | 8631 | { |
| 7600 | struct task_group *tg; | 8632 | struct task_group *tg; |
| 7601 | unsigned long flags; | 8633 | unsigned long flags; |
| @@ -7605,10 +8637,10 @@ struct task_group *sched_create_group(void) | |||
| 7605 | if (!tg) | 8637 | if (!tg) |
| 7606 | return ERR_PTR(-ENOMEM); | 8638 | return ERR_PTR(-ENOMEM); |
| 7607 | 8639 | ||
| 7608 | if (!alloc_fair_sched_group(tg)) | 8640 | if (!alloc_fair_sched_group(tg, parent)) |
| 7609 | goto err; | 8641 | goto err; |
| 7610 | 8642 | ||
| 7611 | if (!alloc_rt_sched_group(tg)) | 8643 | if (!alloc_rt_sched_group(tg, parent)) |
| 7612 | goto err; | 8644 | goto err; |
| 7613 | 8645 | ||
| 7614 | spin_lock_irqsave(&task_group_lock, flags); | 8646 | spin_lock_irqsave(&task_group_lock, flags); |
| @@ -7617,6 +8649,12 @@ struct task_group *sched_create_group(void) | |||
| 7617 | register_rt_sched_group(tg, i); | 8649 | register_rt_sched_group(tg, i); |
| 7618 | } | 8650 | } |
| 7619 | list_add_rcu(&tg->list, &task_groups); | 8651 | list_add_rcu(&tg->list, &task_groups); |
| 8652 | |||
| 8653 | WARN_ON(!parent); /* root should already exist */ | ||
| 8654 | |||
| 8655 | tg->parent = parent; | ||
| 8656 | list_add_rcu(&tg->siblings, &parent->children); | ||
| 8657 | INIT_LIST_HEAD(&tg->children); | ||
| 7620 | spin_unlock_irqrestore(&task_group_lock, flags); | 8658 | spin_unlock_irqrestore(&task_group_lock, flags); |
| 7621 | 8659 | ||
| 7622 | return tg; | 8660 | return tg; |
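sched_create_group() now records the parent and links the new group onto the parent's children list (under task_group_lock, with RCU list primitives), which the bandwidth checks further down rely on when they iterate siblings. A hypothetical, non-RCU user-space model of that bookkeeping with a plain singly linked sibling list:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct tg {
	const char *name;
	struct tg  *parent;
	struct tg  *children;	/* head of the child list          */
	struct tg  *sibling;	/* next child of the same parent   */
};

static struct tg *tg_create(const char *name, struct tg *parent)
{
	struct tg *tg = calloc(1, sizeof(*tg));

	assert(tg);
	tg->name = name;
	tg->parent = parent;
	if (parent) {		/* the root group has no parent */
		tg->sibling = parent->children;
		parent->children = tg;
	}
	return tg;
}

int main(void)
{
	struct tg *root = tg_create("root", NULL);
	struct tg *a = tg_create("a", root);
	struct tg *b = tg_create("b", root);
	struct tg *it;

	(void)a;
	(void)b;
	for (it = root->children; it; it = it->sibling)
		printf("child of root: %s\n", it->name);
	return 0;
}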
| @@ -7645,6 +8683,7 @@ void sched_destroy_group(struct task_group *tg) | |||
| 7645 | unregister_rt_sched_group(tg, i); | 8683 | unregister_rt_sched_group(tg, i); |
| 7646 | } | 8684 | } |
| 7647 | list_del_rcu(&tg->list); | 8685 | list_del_rcu(&tg->list); |
| 8686 | list_del_rcu(&tg->siblings); | ||
| 7648 | spin_unlock_irqrestore(&task_group_lock, flags); | 8687 | spin_unlock_irqrestore(&task_group_lock, flags); |
| 7649 | 8688 | ||
| 7650 | /* wait for possible concurrent references to cfs_rqs complete */ | 8689 | /* wait for possible concurrent references to cfs_rqs complete */ |
| @@ -7688,16 +8727,14 @@ void sched_move_task(struct task_struct *tsk) | |||
| 7688 | 8727 | ||
| 7689 | task_rq_unlock(rq, &flags); | 8728 | task_rq_unlock(rq, &flags); |
| 7690 | } | 8729 | } |
| 8730 | #endif | ||
| 7691 | 8731 | ||
| 7692 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8732 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7693 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 8733 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) |
| 7694 | { | 8734 | { |
| 7695 | struct cfs_rq *cfs_rq = se->cfs_rq; | 8735 | struct cfs_rq *cfs_rq = se->cfs_rq; |
| 7696 | struct rq *rq = cfs_rq->rq; | ||
| 7697 | int on_rq; | 8736 | int on_rq; |
| 7698 | 8737 | ||
| 7699 | spin_lock_irq(&rq->lock); | ||
| 7700 | |||
| 7701 | on_rq = se->on_rq; | 8738 | on_rq = se->on_rq; |
| 7702 | if (on_rq) | 8739 | if (on_rq) |
| 7703 | dequeue_entity(cfs_rq, se, 0); | 8740 | dequeue_entity(cfs_rq, se, 0); |
| @@ -7707,8 +8744,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) | |||
| 7707 | 8744 | ||
| 7708 | if (on_rq) | 8745 | if (on_rq) |
| 7709 | enqueue_entity(cfs_rq, se, 0); | 8746 | enqueue_entity(cfs_rq, se, 0); |
| 8747 | } | ||
| 7710 | 8748 | ||
| 7711 | spin_unlock_irq(&rq->lock); | 8749 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
| 8750 | { | ||
| 8751 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
| 8752 | struct rq *rq = cfs_rq->rq; | ||
| 8753 | unsigned long flags; | ||
| 8754 | |||
| 8755 | spin_lock_irqsave(&rq->lock, flags); | ||
| 8756 | __set_se_shares(se, shares); | ||
| 8757 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 7712 | } | 8758 | } |
| 7713 | 8759 | ||
| 7714 | static DEFINE_MUTEX(shares_mutex); | 8760 | static DEFINE_MUTEX(shares_mutex); |
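Splitting set_se_shares() into __set_se_shares() plus a locking wrapper (now spin_lock_irqsave rather than spin_lock_irq) follows the usual kernel convention that a double-underscore helper assumes its caller already holds the lock. A hedged pthread analogue of that pattern, with the scheduler details stubbed out:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long se_shares;

/* caller must hold rq_lock, mirroring the __set_se_shares() convention */
static void __set_shares(unsigned long shares)
{
	/* the scheduler dequeues/requeues the entity around this update */
	se_shares = shares;
}

/* public entry point: takes and releases the lock itself */
static void set_shares(unsigned long shares)
{
	pthread_mutex_lock(&rq_lock);
	__set_shares(shares);
	pthread_mutex_unlock(&rq_lock);
}

int main(void)
{
	set_shares(1024);
	printf("shares = %lu\n", se_shares);
	return 0;
}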
| @@ -7719,12 +8765,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
| 7719 | unsigned long flags; | 8765 | unsigned long flags; |
| 7720 | 8766 | ||
| 7721 | /* | 8767 | /* |
| 8768 | * We can't change the weight of the root cgroup. | ||
| 8769 | */ | ||
| 8770 | if (!tg->se[0]) | ||
| 8771 | return -EINVAL; | ||
| 8772 | |||
| 8773 | /* | ||
| 7722 | * A weight of 0 or 1 can cause arithmetics problems. | 8774 | * A weight of 0 or 1 can cause arithmetics problems. |
| 7723 | * (The default weight is 1024 - so there's no practical | 8775 | * (The default weight is 1024 - so there's no practical |
| 7724 | * limitation from this.) | 8776 | * limitation from this.) |
| 7725 | */ | 8777 | */ |
| 7726 | if (shares < 2) | 8778 | if (shares < MIN_SHARES) |
| 7727 | shares = 2; | 8779 | shares = MIN_SHARES; |
| 7728 | 8780 | ||
| 7729 | mutex_lock(&shares_mutex); | 8781 | mutex_lock(&shares_mutex); |
| 7730 | if (tg->shares == shares) | 8782 | if (tg->shares == shares) |
| @@ -7733,6 +8785,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
| 7733 | spin_lock_irqsave(&task_group_lock, flags); | 8785 | spin_lock_irqsave(&task_group_lock, flags); |
| 7734 | for_each_possible_cpu(i) | 8786 | for_each_possible_cpu(i) |
| 7735 | unregister_fair_sched_group(tg, i); | 8787 | unregister_fair_sched_group(tg, i); |
| 8788 | list_del_rcu(&tg->siblings); | ||
| 7736 | spin_unlock_irqrestore(&task_group_lock, flags); | 8789 | spin_unlock_irqrestore(&task_group_lock, flags); |
| 7737 | 8790 | ||
| 7738 | /* wait for any ongoing reference to this group to finish */ | 8791 | /* wait for any ongoing reference to this group to finish */ |
| @@ -7743,8 +8796,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
| 7743 | * w/o tripping rebalance_share or load_balance_fair. | 8796 | * w/o tripping rebalance_share or load_balance_fair. |
| 7744 | */ | 8797 | */ |
| 7745 | tg->shares = shares; | 8798 | tg->shares = shares; |
| 7746 | for_each_possible_cpu(i) | 8799 | for_each_possible_cpu(i) { |
| 7747 | set_se_shares(tg->se[i], shares); | 8800 | /* |
| 8801 | * force a rebalance | ||
| 8802 | */ | ||
| 8803 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | ||
| 8804 | set_se_shares(tg->se[i], shares/nr_cpu_ids); | ||
| 8805 | } | ||
| 7748 | 8806 | ||
| 7749 | /* | 8807 | /* |
| 7750 | * Enable load balance activity on this group, by inserting it back on | 8808 | * Enable load balance activity on this group, by inserting it back on |
| @@ -7753,6 +8811,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
| 7753 | spin_lock_irqsave(&task_group_lock, flags); | 8811 | spin_lock_irqsave(&task_group_lock, flags); |
| 7754 | for_each_possible_cpu(i) | 8812 | for_each_possible_cpu(i) |
| 7755 | register_fair_sched_group(tg, i); | 8813 | register_fair_sched_group(tg, i); |
| 8814 | list_add_rcu(&tg->siblings, &tg->parent->children); | ||
| 7756 | spin_unlock_irqrestore(&task_group_lock, flags); | 8815 | spin_unlock_irqrestore(&task_group_lock, flags); |
| 7757 | done: | 8816 | done: |
| 7758 | mutex_unlock(&shares_mutex); | 8817 | mutex_unlock(&shares_mutex); |
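sched_group_set_shares() now clamps the request to MIN_SHARES and seeds each per-CPU entity with shares/nr_cpu_ids, leaving load balancing to redistribute from there (the per-CPU cfs_rq share is first zeroed to force a rebalance). A small sketch of that arithmetic; MIN_SHARES is assumed to be 2 here, matching the constant the old code open-coded:

#include <stdio.h>

#define MIN_SHARES 2	/* assumed value; the real one comes from the headers */

static unsigned long per_cpu_share(unsigned long shares, int nr_cpu_ids)
{
	/* a weight of 0 or 1 causes arithmetic problems, so clamp first */
	if (shares < MIN_SHARES)
		shares = MIN_SHARES;

	/* each CPU gets an equal starting slice; balancing adjusts later */
	return shares / nr_cpu_ids;
}

int main(void)
{
	printf("1024 shares on 4 cpus -> %lu per cpu\n",
	       per_cpu_share(1024, 4));
	printf("   1 share  on 4 cpus -> %lu per cpu\n",
	       per_cpu_share(1, 4));
	return 0;
}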
| @@ -7779,26 +8838,58 @@ static unsigned long to_ratio(u64 period, u64 runtime) | |||
| 7779 | return div64_64(runtime << 16, period); | 8838 | return div64_64(runtime << 16, period); |
| 7780 | } | 8839 | } |
| 7781 | 8840 | ||
| 8841 | #ifdef CONFIG_CGROUP_SCHED | ||
| 8842 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | ||
| 8843 | { | ||
| 8844 | struct task_group *tgi, *parent = tg->parent; | ||
| 8845 | unsigned long total = 0; | ||
| 8846 | |||
| 8847 | if (!parent) { | ||
| 8848 | if (global_rt_period() < period) | ||
| 8849 | return 0; | ||
| 8850 | |||
| 8851 | return to_ratio(period, runtime) < | ||
| 8852 | to_ratio(global_rt_period(), global_rt_runtime()); | ||
| 8853 | } | ||
| 8854 | |||
| 8855 | if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) | ||
| 8856 | return 0; | ||
| 8857 | |||
| 8858 | rcu_read_lock(); | ||
| 8859 | list_for_each_entry_rcu(tgi, &parent->children, siblings) { | ||
| 8860 | if (tgi == tg) | ||
| 8861 | continue; | ||
| 8862 | |||
| 8863 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), | ||
| 8864 | tgi->rt_bandwidth.rt_runtime); | ||
| 8865 | } | ||
| 8866 | rcu_read_unlock(); | ||
| 8867 | |||
| 8868 | return total + to_ratio(period, runtime) < | ||
| 8869 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), | ||
| 8870 | parent->rt_bandwidth.rt_runtime); | ||
| 8871 | } | ||
| 8872 | #elif defined CONFIG_USER_SCHED | ||
| 7782 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8873 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
| 7783 | { | 8874 | { |
| 7784 | struct task_group *tgi; | 8875 | struct task_group *tgi; |
| 7785 | unsigned long total = 0; | 8876 | unsigned long total = 0; |
| 7786 | unsigned long global_ratio = | 8877 | unsigned long global_ratio = |
| 7787 | to_ratio(sysctl_sched_rt_period, | 8878 | to_ratio(global_rt_period(), global_rt_runtime()); |
| 7788 | sysctl_sched_rt_runtime < 0 ? | ||
| 7789 | RUNTIME_INF : sysctl_sched_rt_runtime); | ||
| 7790 | 8879 | ||
| 7791 | rcu_read_lock(); | 8880 | rcu_read_lock(); |
| 7792 | list_for_each_entry_rcu(tgi, &task_groups, list) { | 8881 | list_for_each_entry_rcu(tgi, &task_groups, list) { |
| 7793 | if (tgi == tg) | 8882 | if (tgi == tg) |
| 7794 | continue; | 8883 | continue; |
| 7795 | 8884 | ||
| 7796 | total += to_ratio(period, tgi->rt_runtime); | 8885 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), |
| 8886 | tgi->rt_bandwidth.rt_runtime); | ||
| 7797 | } | 8887 | } |
| 7798 | rcu_read_unlock(); | 8888 | rcu_read_unlock(); |
| 7799 | 8889 | ||
| 7800 | return total + to_ratio(period, runtime) < global_ratio; | 8890 | return total + to_ratio(period, runtime) < global_ratio; |
| 7801 | } | 8891 | } |
| 8892 | #endif | ||
| 7802 | 8893 | ||
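The CONFIG_CGROUP_SCHED variant of __rt_schedulable() admits a group only if the bandwidth ratios of its siblings plus its own stay below the parent's ratio, with to_ratio() expressing runtime/period as a <<16 fixed-point fraction. A standalone sketch of that admission test — the struct grp type and the hard-coded groups are illustrative only:

#include <stdio.h>
#include <stdint.h>

/* runtime/period as a <<16 fixed-point ratio, as in to_ratio() */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << 16) / period;
}

struct grp { uint64_t period, runtime; };	/* nanoseconds */

/* can "new" be added under "parent" alongside its existing siblings? */
static int rt_schedulable(const struct grp *parent,
			  const struct grp *siblings, int nr,
			  const struct grp *new)
{
	uint64_t total = 0;
	int i;

	if (parent->period < new->period)
		return 0;
	for (i = 0; i < nr; i++)
		total += to_ratio(siblings[i].period, siblings[i].runtime);

	return total + to_ratio(new->period, new->runtime) <
	       to_ratio(parent->period, parent->runtime);
}

int main(void)
{
	struct grp parent     = { 1000000, 950000 };	/* 95% cap      */
	struct grp siblings[] = { { 1000000, 400000 } };/* 40% in use   */
	struct grp ok   = { 1000000, 300000 };		/* +30% -> fits */
	struct grp full = { 1000000, 600000 };		/* +60% -> no   */

	printf("30%% request admitted: %d\n",
	       rt_schedulable(&parent, siblings, 1, &ok));
	printf("60%% request admitted: %d\n",
	       rt_schedulable(&parent, siblings, 1, &full));
	return 0;
}

The strict less-than keeps some headroom at every level, so a parent can never be fully consumed by its children's reservations.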
| 7803 | /* Must be called with tasklist_lock held */ | 8894 | /* Must be called with tasklist_lock held */ |
| 7804 | static inline int tg_has_rt_tasks(struct task_group *tg) | 8895 | static inline int tg_has_rt_tasks(struct task_group *tg) |
| @@ -7811,19 +8902,14 @@ static inline int tg_has_rt_tasks(struct task_group *tg) | |||
| 7811 | return 0; | 8902 | return 0; |
| 7812 | } | 8903 | } |
| 7813 | 8904 | ||
| 7814 | int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | 8905 | static int tg_set_bandwidth(struct task_group *tg, |
| 8906 | u64 rt_period, u64 rt_runtime) | ||
| 7815 | { | 8907 | { |
| 7816 | u64 rt_runtime, rt_period; | 8908 | int i, err = 0; |
| 7817 | int err = 0; | ||
| 7818 | |||
| 7819 | rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | ||
| 7820 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; | ||
| 7821 | if (rt_runtime_us == -1) | ||
| 7822 | rt_runtime = RUNTIME_INF; | ||
| 7823 | 8909 | ||
| 7824 | mutex_lock(&rt_constraints_mutex); | 8910 | mutex_lock(&rt_constraints_mutex); |
| 7825 | read_lock(&tasklist_lock); | 8911 | read_lock(&tasklist_lock); |
| 7826 | if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { | 8912 | if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { |
| 7827 | err = -EBUSY; | 8913 | err = -EBUSY; |
| 7828 | goto unlock; | 8914 | goto unlock; |
| 7829 | } | 8915 | } |
| @@ -7831,7 +8917,19 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
| 7831 | err = -EINVAL; | 8917 | err = -EINVAL; |
| 7832 | goto unlock; | 8918 | goto unlock; |
| 7833 | } | 8919 | } |
| 7834 | tg->rt_runtime = rt_runtime; | 8920 | |
| 8921 | spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); | ||
| 8922 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); | ||
| 8923 | tg->rt_bandwidth.rt_runtime = rt_runtime; | ||
| 8924 | |||
| 8925 | for_each_possible_cpu(i) { | ||
| 8926 | struct rt_rq *rt_rq = tg->rt_rq[i]; | ||
| 8927 | |||
| 8928 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 8929 | rt_rq->rt_runtime = rt_runtime; | ||
| 8930 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 8931 | } | ||
| 8932 | spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | ||
| 7835 | unlock: | 8933 | unlock: |
| 7836 | read_unlock(&tasklist_lock); | 8934 | read_unlock(&tasklist_lock); |
| 7837 | mutex_unlock(&rt_constraints_mutex); | 8935 | mutex_unlock(&rt_constraints_mutex); |
| @@ -7839,19 +8937,109 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
| 7839 | return err; | 8937 | return err; |
| 7840 | } | 8938 | } |
| 7841 | 8939 | ||
| 8940 | int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | ||
| 8941 | { | ||
| 8942 | u64 rt_runtime, rt_period; | ||
| 8943 | |||
| 8944 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
| 8945 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; | ||
| 8946 | if (rt_runtime_us < 0) | ||
| 8947 | rt_runtime = RUNTIME_INF; | ||
| 8948 | |||
| 8949 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | ||
| 8950 | } | ||
| 8951 | |||
| 7842 | long sched_group_rt_runtime(struct task_group *tg) | 8952 | long sched_group_rt_runtime(struct task_group *tg) |
| 7843 | { | 8953 | { |
| 7844 | u64 rt_runtime_us; | 8954 | u64 rt_runtime_us; |
| 7845 | 8955 | ||
| 7846 | if (tg->rt_runtime == RUNTIME_INF) | 8956 | if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) |
| 7847 | return -1; | 8957 | return -1; |
| 7848 | 8958 | ||
| 7849 | rt_runtime_us = tg->rt_runtime; | 8959 | rt_runtime_us = tg->rt_bandwidth.rt_runtime; |
| 7850 | do_div(rt_runtime_us, NSEC_PER_USEC); | 8960 | do_div(rt_runtime_us, NSEC_PER_USEC); |
| 7851 | return rt_runtime_us; | 8961 | return rt_runtime_us; |
| 7852 | } | 8962 | } |
| 8963 | |||
| 8964 | int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | ||
| 8965 | { | ||
| 8966 | u64 rt_runtime, rt_period; | ||
| 8967 | |||
| 8968 | rt_period = (u64)rt_period_us * NSEC_PER_USEC; | ||
| 8969 | rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
| 8970 | |||
| 8971 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | ||
| 8972 | } | ||
| 8973 | |||
| 8974 | long sched_group_rt_period(struct task_group *tg) | ||
| 8975 | { | ||
| 8976 | u64 rt_period_us; | ||
| 8977 | |||
| 8978 | rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
| 8979 | do_div(rt_period_us, NSEC_PER_USEC); | ||
| 8980 | return rt_period_us; | ||
| 8981 | } | ||
| 8982 | |||
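The rt_runtime_us/rt_period_us accessors convert between the microseconds exposed to userspace and the nanoseconds kept internally, with a negative runtime standing for the RUNTIME_INF sentinel. A minimal sketch of those conversions, assuming RUNTIME_INF is the all-ones u64 value:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC	1000ULL
#define RUNTIME_INF	((uint64_t)~0ULL)	/* "unlimited" sentinel */

/* userspace writes microseconds; -1 means unlimited */
static uint64_t runtime_us_to_ns(long long rt_runtime_us)
{
	if (rt_runtime_us < 0)
		return RUNTIME_INF;
	return (uint64_t)rt_runtime_us * NSEC_PER_USEC;
}

/* reading back: RUNTIME_INF is reported as -1 */
static long long runtime_ns_to_us(uint64_t rt_runtime)
{
	if (rt_runtime == RUNTIME_INF)
		return -1;
	return (long long)(rt_runtime / NSEC_PER_USEC);
}

int main(void)
{
	printf("950000us -> %llu ns\n",
	       (unsigned long long)runtime_us_to_ns(950000));
	printf("-1 round-trips to %lld\n",
	       runtime_ns_to_us(runtime_us_to_ns(-1)));
	return 0;
}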
| 8983 | static int sched_rt_global_constraints(void) | ||
| 8984 | { | ||
| 8985 | int ret = 0; | ||
| 8986 | |||
| 8987 | mutex_lock(&rt_constraints_mutex); | ||
| 8988 | if (!__rt_schedulable(NULL, 1, 0)) | ||
| 8989 | ret = -EINVAL; | ||
| 8990 | mutex_unlock(&rt_constraints_mutex); | ||
| 8991 | |||
| 8992 | return ret; | ||
| 8993 | } | ||
| 8994 | #else | ||
| 8995 | static int sched_rt_global_constraints(void) | ||
| 8996 | { | ||
| 8997 | unsigned long flags; | ||
| 8998 | int i; | ||
| 8999 | |||
| 9000 | spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | ||
| 9001 | for_each_possible_cpu(i) { | ||
| 9002 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; | ||
| 9003 | |||
| 9004 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 9005 | rt_rq->rt_runtime = global_rt_runtime(); | ||
| 9006 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 9007 | } | ||
| 9008 | spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); | ||
| 9009 | |||
| 9010 | return 0; | ||
| 9011 | } | ||
| 7853 | #endif | 9012 | #endif |
| 7854 | #endif /* CONFIG_GROUP_SCHED */ | 9013 | |
| 9014 | int sched_rt_handler(struct ctl_table *table, int write, | ||
| 9015 | struct file *filp, void __user *buffer, size_t *lenp, | ||
| 9016 | loff_t *ppos) | ||
| 9017 | { | ||
| 9018 | int ret; | ||
| 9019 | int old_period, old_runtime; | ||
| 9020 | static DEFINE_MUTEX(mutex); | ||
| 9021 | |||
| 9022 | mutex_lock(&mutex); | ||
| 9023 | old_period = sysctl_sched_rt_period; | ||
| 9024 | old_runtime = sysctl_sched_rt_runtime; | ||
| 9025 | |||
| 9026 | ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); | ||
| 9027 | |||
| 9028 | if (!ret && write) { | ||
| 9029 | ret = sched_rt_global_constraints(); | ||
| 9030 | if (ret) { | ||
| 9031 | sysctl_sched_rt_period = old_period; | ||
| 9032 | sysctl_sched_rt_runtime = old_runtime; | ||
| 9033 | } else { | ||
| 9034 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); | ||
| 9035 | def_rt_bandwidth.rt_period = | ||
| 9036 | ns_to_ktime(global_rt_period()); | ||
| 9037 | } | ||
| 9038 | } | ||
| 9039 | mutex_unlock(&mutex); | ||
| 9040 | |||
| 9041 | return ret; | ||
| 9042 | } | ||
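sched_rt_handler() uses a snapshot-and-rollback pattern: save the current sysctl values, let proc_dointvec() write the new ones, run the global constraint check, and restore the snapshot if it fails. A standalone model of that control flow; validate() is only a stand-in for sched_rt_global_constraints():

#include <stdio.h>

static int sysctl_period = 1000000;	/* us */
static int sysctl_runtime = 950000;	/* us */

/* stand-in for sched_rt_global_constraints() */
static int validate(void)
{
	return (sysctl_runtime <= sysctl_period) ? 0 : -1;
}

static int set_rt_values(int period, int runtime)
{
	int old_period = sysctl_period;
	int old_runtime = sysctl_runtime;
	int ret;

	/* proc_dointvec() would have stored the new values at this point */
	sysctl_period = period;
	sysctl_runtime = runtime;

	ret = validate();
	if (ret) {
		/* reject the update: roll back to the snapshot */
		sysctl_period = old_period;
		sysctl_runtime = old_runtime;
	}
	return ret;
}

int main(void)
{
	printf("good update: %d (runtime=%d)\n",
	       set_rt_values(1000000, 900000), sysctl_runtime);
	printf("bad update:  %d (runtime=%d)\n",
	       set_rt_values(1000000, 2000000), sysctl_runtime);
	return 0;
}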
| 7855 | 9043 | ||
| 7856 | #ifdef CONFIG_CGROUP_SCHED | 9044 | #ifdef CONFIG_CGROUP_SCHED |
| 7857 | 9045 | ||
| @@ -7865,7 +9053,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) | |||
| 7865 | static struct cgroup_subsys_state * | 9053 | static struct cgroup_subsys_state * |
| 7866 | cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | 9054 | cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) |
| 7867 | { | 9055 | { |
| 7868 | struct task_group *tg; | 9056 | struct task_group *tg, *parent; |
| 7869 | 9057 | ||
| 7870 | if (!cgrp->parent) { | 9058 | if (!cgrp->parent) { |
| 7871 | /* This is early initialization for the top cgroup */ | 9059 | /* This is early initialization for the top cgroup */ |
| @@ -7873,11 +9061,8 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
| 7873 | return &init_task_group.css; | 9061 | return &init_task_group.css; |
| 7874 | } | 9062 | } |
| 7875 | 9063 | ||
| 7876 | /* we support only 1-level deep hierarchical scheduler atm */ | 9064 | parent = cgroup_tg(cgrp->parent); |
| 7877 | if (cgrp->parent->parent) | 9065 | tg = sched_create_group(parent); |
| 7878 | return ERR_PTR(-EINVAL); | ||
| 7879 | |||
| 7880 | tg = sched_create_group(); | ||
| 7881 | if (IS_ERR(tg)) | 9066 | if (IS_ERR(tg)) |
| 7882 | return ERR_PTR(-ENOMEM); | 9067 | return ERR_PTR(-ENOMEM); |
| 7883 | 9068 | ||
| @@ -7901,7 +9086,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
| 7901 | { | 9086 | { |
| 7902 | #ifdef CONFIG_RT_GROUP_SCHED | 9087 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7903 | /* Don't accept realtime tasks when there is no way for them to run */ | 9088 | /* Don't accept realtime tasks when there is no way for them to run */ |
| 7904 | if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) | 9089 | if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) |
| 7905 | return -EINVAL; | 9090 | return -EINVAL; |
| 7906 | #else | 9091 | #else |
| 7907 | /* We don't support RT-tasks being in separate groups */ | 9092 | /* We don't support RT-tasks being in separate groups */ |
| @@ -7935,7 +9120,7 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
| 7935 | #endif | 9120 | #endif |
| 7936 | 9121 | ||
| 7937 | #ifdef CONFIG_RT_GROUP_SCHED | 9122 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7938 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, | 9123 | static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, |
| 7939 | struct file *file, | 9124 | struct file *file, |
| 7940 | const char __user *userbuf, | 9125 | const char __user *userbuf, |
| 7941 | size_t nbytes, loff_t *unused_ppos) | 9126 | size_t nbytes, loff_t *unused_ppos) |
| @@ -7979,6 +9164,17 @@ static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, | |||
| 7979 | 9164 | ||
| 7980 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 9165 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); |
| 7981 | } | 9166 | } |
| 9167 | |||
| 9168 | static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, | ||
| 9169 | u64 rt_period_us) | ||
| 9170 | { | ||
| 9171 | return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); | ||
| 9172 | } | ||
| 9173 | |||
| 9174 | static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) | ||
| 9175 | { | ||
| 9176 | return sched_group_rt_period(cgroup_tg(cgrp)); | ||
| 9177 | } | ||
| 7982 | #endif | 9178 | #endif |
| 7983 | 9179 | ||
| 7984 | static struct cftype cpu_files[] = { | 9180 | static struct cftype cpu_files[] = { |
| @@ -7995,6 +9191,11 @@ static struct cftype cpu_files[] = { | |||
| 7995 | .read = cpu_rt_runtime_read, | 9191 | .read = cpu_rt_runtime_read, |
| 7996 | .write = cpu_rt_runtime_write, | 9192 | .write = cpu_rt_runtime_write, |
| 7997 | }, | 9193 | }, |
| 9194 | { | ||
| 9195 | .name = "rt_period_us", | ||
| 9196 | .read_uint = cpu_rt_period_read_uint, | ||
| 9197 | .write_uint = cpu_rt_period_write_uint, | ||
| 9198 | }, | ||
| 7998 | #endif | 9199 | #endif |
| 7999 | }; | 9200 | }; |
| 8000 | 9201 | ||
| @@ -8035,9 +9236,9 @@ struct cpuacct { | |||
| 8035 | struct cgroup_subsys cpuacct_subsys; | 9236 | struct cgroup_subsys cpuacct_subsys; |
| 8036 | 9237 | ||
| 8037 | /* return cpu accounting group corresponding to this container */ | 9238 | /* return cpu accounting group corresponding to this container */ |
| 8038 | static inline struct cpuacct *cgroup_ca(struct cgroup *cont) | 9239 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) |
| 8039 | { | 9240 | { |
| 8040 | return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), | 9241 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), |
| 8041 | struct cpuacct, css); | 9242 | struct cpuacct, css); |
| 8042 | } | 9243 | } |
| 8043 | 9244 | ||
| @@ -8050,7 +9251,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) | |||
| 8050 | 9251 | ||
| 8051 | /* create a new cpu accounting group */ | 9252 | /* create a new cpu accounting group */ |
| 8052 | static struct cgroup_subsys_state *cpuacct_create( | 9253 | static struct cgroup_subsys_state *cpuacct_create( |
| 8053 | struct cgroup_subsys *ss, struct cgroup *cont) | 9254 | struct cgroup_subsys *ss, struct cgroup *cgrp) |
| 8054 | { | 9255 | { |
| 8055 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); | 9256 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); |
| 8056 | 9257 | ||
| @@ -8068,18 +9269,18 @@ static struct cgroup_subsys_state *cpuacct_create( | |||
| 8068 | 9269 | ||
| 8069 | /* destroy an existing cpu accounting group */ | 9270 | /* destroy an existing cpu accounting group */ |
| 8070 | static void | 9271 | static void |
| 8071 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | 9272 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) |
| 8072 | { | 9273 | { |
| 8073 | struct cpuacct *ca = cgroup_ca(cont); | 9274 | struct cpuacct *ca = cgroup_ca(cgrp); |
| 8074 | 9275 | ||
| 8075 | free_percpu(ca->cpuusage); | 9276 | free_percpu(ca->cpuusage); |
| 8076 | kfree(ca); | 9277 | kfree(ca); |
| 8077 | } | 9278 | } |
| 8078 | 9279 | ||
| 8079 | /* return total cpu usage (in nanoseconds) of a group */ | 9280 | /* return total cpu usage (in nanoseconds) of a group */ |
| 8080 | static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) | 9281 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) |
| 8081 | { | 9282 | { |
| 8082 | struct cpuacct *ca = cgroup_ca(cont); | 9283 | struct cpuacct *ca = cgroup_ca(cgrp); |
| 8083 | u64 totalcpuusage = 0; | 9284 | u64 totalcpuusage = 0; |
| 8084 | int i; | 9285 | int i; |
| 8085 | 9286 | ||
| @@ -8098,16 +9299,40 @@ static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) | |||
| 8098 | return totalcpuusage; | 9299 | return totalcpuusage; |
| 8099 | } | 9300 | } |
| 8100 | 9301 | ||
| 9302 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | ||
| 9303 | u64 reset) | ||
| 9304 | { | ||
| 9305 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
| 9306 | int err = 0; | ||
| 9307 | int i; | ||
| 9308 | |||
| 9309 | if (reset) { | ||
| 9310 | err = -EINVAL; | ||
| 9311 | goto out; | ||
| 9312 | } | ||
| 9313 | |||
| 9314 | for_each_possible_cpu(i) { | ||
| 9315 | u64 *cpuusage = percpu_ptr(ca->cpuusage, i); | ||
| 9316 | |||
| 9317 | spin_lock_irq(&cpu_rq(i)->lock); | ||
| 9318 | *cpuusage = 0; | ||
| 9319 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
| 9320 | } | ||
| 9321 | out: | ||
| 9322 | return err; | ||
| 9323 | } | ||
| 9324 | |||
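The new cpuusage_write() is the counterpart of cpuusage_read(): usage is a u64 per CPU, read by summing all slots and reset by writing 0 to the file (any other value is rejected with -EINVAL). A user-space model of that per-CPU accounting:

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

#define NR_CPUS 4

static uint64_t cpuusage[NR_CPUS];	/* per-CPU nanosecond counters */

static uint64_t usage_read(void)
{
	uint64_t total = 0;
	int i;

	for (i = 0; i < NR_CPUS; i++)
		total += cpuusage[i];
	return total;
}

/* only writing 0 is allowed: it resets every per-CPU counter */
static int usage_write(uint64_t val)
{
	int i;

	if (val)
		return -EINVAL;
	for (i = 0; i < NR_CPUS; i++)
		cpuusage[i] = 0;	/* rq->lock protects this in the kernel */
	return 0;
}

int main(void)
{
	cpuusage[0] = 100;
	cpuusage[3] = 50;
	printf("usage = %llu\n", (unsigned long long)usage_read());
	printf("write 5 -> %d\n", usage_write(5));
	printf("write 0 -> %d, usage now %llu\n",
	       usage_write(0), (unsigned long long)usage_read());
	return 0;
}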
| 8101 | static struct cftype files[] = { | 9325 | static struct cftype files[] = { |
| 8102 | { | 9326 | { |
| 8103 | .name = "usage", | 9327 | .name = "usage", |
| 8104 | .read_uint = cpuusage_read, | 9328 | .read_uint = cpuusage_read, |
| 9329 | .write_uint = cpuusage_write, | ||
| 8105 | }, | 9330 | }, |
| 8106 | }; | 9331 | }; |
| 8107 | 9332 | ||
| 8108 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 9333 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) |
| 8109 | { | 9334 | { |
| 8110 | return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); | 9335 | return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); |
| 8111 | } | 9336 | } |
| 8112 | 9337 | ||
| 8113 | /* | 9338 | /* |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index ef358ba07683..f3f4af4b8b0f 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
| @@ -67,14 +67,24 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
| 67 | (long long)(p->nvcsw + p->nivcsw), | 67 | (long long)(p->nvcsw + p->nivcsw), |
| 68 | p->prio); | 68 | p->prio); |
| 69 | #ifdef CONFIG_SCHEDSTATS | 69 | #ifdef CONFIG_SCHEDSTATS |
| 70 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", | 70 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", |
| 71 | SPLIT_NS(p->se.vruntime), | 71 | SPLIT_NS(p->se.vruntime), |
| 72 | SPLIT_NS(p->se.sum_exec_runtime), | 72 | SPLIT_NS(p->se.sum_exec_runtime), |
| 73 | SPLIT_NS(p->se.sum_sleep_runtime)); | 73 | SPLIT_NS(p->se.sum_sleep_runtime)); |
| 74 | #else | 74 | #else |
| 75 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", | 75 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", |
| 76 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 76 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
| 77 | #endif | 77 | #endif |
| 78 | |||
| 79 | #ifdef CONFIG_CGROUP_SCHED | ||
| 80 | { | ||
| 81 | char path[64]; | ||
| 82 | |||
| 83 | cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); | ||
| 84 | SEQ_printf(m, " %s", path); | ||
| 85 | } | ||
| 86 | #endif | ||
| 87 | SEQ_printf(m, "\n"); | ||
| 78 | } | 88 | } |
| 79 | 89 | ||
| 80 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | 90 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) |
| @@ -109,7 +119,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 109 | struct sched_entity *last; | 119 | struct sched_entity *last; |
| 110 | unsigned long flags; | 120 | unsigned long flags; |
| 111 | 121 | ||
| 112 | SEQ_printf(m, "\ncfs_rq\n"); | 122 | #if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) |
| 123 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); | ||
| 124 | #else | ||
| 125 | char path[128] = ""; | ||
| 126 | struct cgroup *cgroup = NULL; | ||
| 127 | struct task_group *tg = cfs_rq->tg; | ||
| 128 | |||
| 129 | if (tg) | ||
| 130 | cgroup = tg->css.cgroup; | ||
| 131 | |||
| 132 | if (cgroup) | ||
| 133 | cgroup_path(cgroup, path, sizeof(path)); | ||
| 134 | |||
| 135 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); | ||
| 136 | #endif | ||
| 113 | 137 | ||
| 114 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", | 138 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", |
| 115 | SPLIT_NS(cfs_rq->exec_clock)); | 139 | SPLIT_NS(cfs_rq->exec_clock)); |
| @@ -143,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 143 | #endif | 167 | #endif |
| 144 | SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", | 168 | SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", |
| 145 | cfs_rq->nr_spread_over); | 169 | cfs_rq->nr_spread_over); |
| 170 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 171 | #ifdef CONFIG_SMP | ||
| 172 | SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); | ||
| 173 | #endif | ||
| 174 | #endif | ||
| 146 | } | 175 | } |
| 147 | 176 | ||
| 148 | static void print_cpu(struct seq_file *m, int cpu) | 177 | static void print_cpu(struct seq_file *m, int cpu) |
| @@ -214,7 +243,6 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
| 214 | PN(sysctl_sched_latency); | 243 | PN(sysctl_sched_latency); |
| 215 | PN(sysctl_sched_min_granularity); | 244 | PN(sysctl_sched_min_granularity); |
| 216 | PN(sysctl_sched_wakeup_granularity); | 245 | PN(sysctl_sched_wakeup_granularity); |
| 217 | PN(sysctl_sched_batch_wakeup_granularity); | ||
| 218 | PN(sysctl_sched_child_runs_first); | 246 | PN(sysctl_sched_child_runs_first); |
| 219 | P(sysctl_sched_features); | 247 | P(sysctl_sched_features); |
| 220 | #undef PN | 248 | #undef PN |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0080968d3e4a..89fa32b4edf2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -62,24 +62,14 @@ const_debug unsigned int sysctl_sched_child_runs_first = 1; | |||
| 62 | unsigned int __read_mostly sysctl_sched_compat_yield; | 62 | unsigned int __read_mostly sysctl_sched_compat_yield; |
| 63 | 63 | ||
| 64 | /* | 64 | /* |
| 65 | * SCHED_BATCH wake-up granularity. | ||
| 66 | * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) | ||
| 67 | * | ||
| 68 | * This option delays the preemption effects of decoupled workloads | ||
| 69 | * and reduces their over-scheduling. Synchronous workloads will still | ||
| 70 | * have immediate wakeup/sleep latencies. | ||
| 71 | */ | ||
| 72 | unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; | ||
| 73 | |||
| 74 | /* | ||
| 75 | * SCHED_OTHER wake-up granularity. | 65 | * SCHED_OTHER wake-up granularity. |
| 76 | * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) | 66 | * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) |
| 77 | * | 67 | * |
| 78 | * This option delays the preemption effects of decoupled workloads | 68 | * This option delays the preemption effects of decoupled workloads |
| 79 | * and reduces their over-scheduling. Synchronous workloads will still | 69 | * and reduces their over-scheduling. Synchronous workloads will still |
| 80 | * have immediate wakeup/sleep latencies. | 70 | * have immediate wakeup/sleep latencies. |
| 81 | */ | 71 | */ |
| 82 | unsigned int sysctl_sched_wakeup_granularity = 5000000UL; | 72 | unsigned int sysctl_sched_wakeup_granularity = 10000000UL; |
| 83 | 73 | ||
| 84 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 74 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
| 85 | 75 | ||
| @@ -87,6 +77,11 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | |||
| 87 | * CFS operations on generic schedulable entities: | 77 | * CFS operations on generic schedulable entities: |
| 88 | */ | 78 | */ |
| 89 | 79 | ||
| 80 | static inline struct task_struct *task_of(struct sched_entity *se) | ||
| 81 | { | ||
| 82 | return container_of(se, struct task_struct, se); | ||
| 83 | } | ||
| 84 | |||
| 90 | #ifdef CONFIG_FAIR_GROUP_SCHED | 85 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 91 | 86 | ||
| 92 | /* cpu runqueue to which this cfs_rq is attached */ | 87 | /* cpu runqueue to which this cfs_rq is attached */ |
| @@ -98,6 +93,54 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
| 98 | /* An entity is a task if it doesn't "own" a runqueue */ | 93 | /* An entity is a task if it doesn't "own" a runqueue */ |
| 99 | #define entity_is_task(se) (!se->my_q) | 94 | #define entity_is_task(se) (!se->my_q) |
| 100 | 95 | ||
| 96 | /* Walk up scheduling entities hierarchy */ | ||
| 97 | #define for_each_sched_entity(se) \ | ||
| 98 | for (; se; se = se->parent) | ||
| 99 | |||
| 100 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) | ||
| 101 | { | ||
| 102 | return p->se.cfs_rq; | ||
| 103 | } | ||
| 104 | |||
| 105 | /* runqueue on which this entity is (to be) queued */ | ||
| 106 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
| 107 | { | ||
| 108 | return se->cfs_rq; | ||
| 109 | } | ||
| 110 | |||
| 111 | /* runqueue "owned" by this group */ | ||
| 112 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
| 113 | { | ||
| 114 | return grp->my_q; | ||
| 115 | } | ||
| 116 | |||
| 117 | /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on | ||
| 118 | * another cpu ('this_cpu') | ||
| 119 | */ | ||
| 120 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
| 121 | { | ||
| 122 | return cfs_rq->tg->cfs_rq[this_cpu]; | ||
| 123 | } | ||
| 124 | |||
| 125 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | ||
| 126 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
| 127 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | ||
| 128 | |||
| 129 | /* Do the two (enqueued) entities belong to the same group ? */ | ||
| 130 | static inline int | ||
| 131 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
| 132 | { | ||
| 133 | if (se->cfs_rq == pse->cfs_rq) | ||
| 134 | return 1; | ||
| 135 | |||
| 136 | return 0; | ||
| 137 | } | ||
| 138 | |||
| 139 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
| 140 | { | ||
| 141 | return se->parent; | ||
| 142 | } | ||
| 143 | |||
| 101 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 144 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
| 102 | 145 | ||
| 103 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | 146 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) |
| @@ -107,13 +150,49 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
| 107 | 150 | ||
| 108 | #define entity_is_task(se) 1 | 151 | #define entity_is_task(se) 1 |
| 109 | 152 | ||
| 110 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 153 | #define for_each_sched_entity(se) \ |
| 154 | for (; se; se = NULL) | ||
| 111 | 155 | ||
| 112 | static inline struct task_struct *task_of(struct sched_entity *se) | 156 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) |
| 113 | { | 157 | { |
| 114 | return container_of(se, struct task_struct, se); | 158 | return &task_rq(p)->cfs; |
| 159 | } | ||
| 160 | |||
| 161 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
| 162 | { | ||
| 163 | struct task_struct *p = task_of(se); | ||
| 164 | struct rq *rq = task_rq(p); | ||
| 165 | |||
| 166 | return &rq->cfs; | ||
| 167 | } | ||
| 168 | |||
| 169 | /* runqueue "owned" by this group */ | ||
| 170 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
| 171 | { | ||
| 172 | return NULL; | ||
| 173 | } | ||
| 174 | |||
| 175 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
| 176 | { | ||
| 177 | return &cpu_rq(this_cpu)->cfs; | ||
| 178 | } | ||
| 179 | |||
| 180 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
| 181 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | ||
| 182 | |||
| 183 | static inline int | ||
| 184 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
| 185 | { | ||
| 186 | return 1; | ||
| 187 | } | ||
| 188 | |||
| 189 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
| 190 | { | ||
| 191 | return NULL; | ||
| 115 | } | 192 | } |
| 116 | 193 | ||
| 194 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 195 | |||
| 117 | 196 | ||
| 118 | /************************************************************** | 197 | /************************************************************** |
| 119 | * Scheduling class tree data structure manipulation methods: | 198 | * Scheduling class tree data structure manipulation methods: |
| @@ -255,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, | |||
| 255 | #endif | 334 | #endif |
| 256 | 335 | ||
| 257 | /* | 336 | /* |
| 337 | * delta *= w / rw | ||
| 338 | */ | ||
| 339 | static inline unsigned long | ||
| 340 | calc_delta_weight(unsigned long delta, struct sched_entity *se) | ||
| 341 | { | ||
| 342 | for_each_sched_entity(se) { | ||
| 343 | delta = calc_delta_mine(delta, | ||
| 344 | se->load.weight, &cfs_rq_of(se)->load); | ||
| 345 | } | ||
| 346 | |||
| 347 | return delta; | ||
| 348 | } | ||
| 349 | |||
| 350 | /* | ||
| 351 | * delta *= rw / w | ||
| 352 | */ | ||
| 353 | static inline unsigned long | ||
| 354 | calc_delta_fair(unsigned long delta, struct sched_entity *se) | ||
| 355 | { | ||
| 356 | for_each_sched_entity(se) { | ||
| 357 | delta = calc_delta_mine(delta, | ||
| 358 | cfs_rq_of(se)->load.weight, &se->load); | ||
| 359 | } | ||
| 360 | |||
| 361 | return delta; | ||
| 362 | } | ||
| 363 | |||
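calc_delta_weight() and calc_delta_fair() walk up the entity hierarchy, scaling a time delta by w/rw (or rw/w) at every level, so a task's slice reflects both its weight inside its group and the group's weight on the parent queue. A simplified sketch of the same arithmetic using straight integer division instead of calc_delta_mine()'s fixed-point inverse weights:

#include <stdio.h>

struct ent {
	unsigned long weight;		/* this entity's load weight  */
	unsigned long queue_weight;	/* total weight of its cfs_rq */
	struct ent *parent;
};

/* delta *= w / rw at every level, as calc_delta_weight() does */
static unsigned long delta_weight(unsigned long delta, struct ent *se)
{
	for (; se; se = se->parent)
		delta = delta * se->weight / se->queue_weight;
	return delta;
}

int main(void)
{
	/* a group holding half the CPU, and a task holding half the group */
	struct ent group = { 1024, 2048, NULL };
	struct ent task  = { 1024, 2048, &group };

	/* a 20000us period shrinks to this task's slice */
	printf("slice = %lu us\n", delta_weight(20000, &task));
	return 0;
}

With both ratios at one half, the 20000us period yields a 5000us slice, which is exactly what sched_slice() relies on this helper for.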
| 364 | /* | ||
| 258 | * The idea is to set a period in which each task runs once. | 365 | * The idea is to set a period in which each task runs once. |
| 259 | * | 366 | * |
| 260 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch | 367 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch |
| @@ -283,29 +390,54 @@ static u64 __sched_period(unsigned long nr_running) | |||
| 283 | */ | 390 | */ |
| 284 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) | 391 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 285 | { | 392 | { |
| 286 | return calc_delta_mine(__sched_period(cfs_rq->nr_running), | 393 | return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); |
| 287 | se->load.weight, &cfs_rq->load); | ||
| 288 | } | 394 | } |
| 289 | 395 | ||
| 290 | /* | 396 | /* |
| 291 | * We calculate the vruntime slice. | 397 | * We calculate the vruntime slice of a to be inserted task |
| 292 | * | 398 | * |
| 293 | * vs = s/w = p/rw | 399 | * vs = s*rw/w = p |
| 294 | */ | 400 | */ |
| 295 | static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) | 401 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 296 | { | 402 | { |
| 297 | u64 vslice = __sched_period(nr_running); | 403 | unsigned long nr_running = cfs_rq->nr_running; |
| 298 | 404 | ||
| 299 | vslice *= NICE_0_LOAD; | 405 | if (!se->on_rq) |
| 300 | do_div(vslice, rq_weight); | 406 | nr_running++; |
| 301 | 407 | ||
| 302 | return vslice; | 408 | return __sched_period(nr_running); |
| 303 | } | 409 | } |
| 304 | 410 | ||
| 305 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) | 411 | /* |
| 412 | * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in | ||
| 413 | * that it favours >=0 over <0. | ||
| 414 | * | ||
| 415 | * -20 | | ||
| 416 | * | | ||
| 417 | * 0 --------+------- | ||
| 418 | * .' | ||
| 419 | * 19 .' | ||
| 420 | * | ||
| 421 | */ | ||
| 422 | static unsigned long | ||
| 423 | calc_delta_asym(unsigned long delta, struct sched_entity *se) | ||
| 306 | { | 424 | { |
| 307 | return __sched_vslice(cfs_rq->load.weight + se->load.weight, | 425 | struct load_weight lw = { |
| 308 | cfs_rq->nr_running + 1); | 426 | .weight = NICE_0_LOAD, |
| 427 | .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) | ||
| 428 | }; | ||
| 429 | |||
| 430 | for_each_sched_entity(se) { | ||
| 431 | struct load_weight *se_lw = &se->load; | ||
| 432 | |||
| 433 | if (se->load.weight < NICE_0_LOAD) | ||
| 434 | se_lw = &lw; | ||
| 435 | |||
| 436 | delta = calc_delta_mine(delta, | ||
| 437 | cfs_rq_of(se)->load.weight, se_lw); | ||
| 438 | } | ||
| 439 | |||
| 440 | return delta; | ||
| 309 | } | 441 | } |
| 310 | 442 | ||
| 311 | /* | 443 | /* |
| @@ -322,11 +454,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
| 322 | 454 | ||
| 323 | curr->sum_exec_runtime += delta_exec; | 455 | curr->sum_exec_runtime += delta_exec; |
| 324 | schedstat_add(cfs_rq, exec_clock, delta_exec); | 456 | schedstat_add(cfs_rq, exec_clock, delta_exec); |
| 325 | delta_exec_weighted = delta_exec; | 457 | delta_exec_weighted = calc_delta_fair(delta_exec, curr); |
| 326 | if (unlikely(curr->load.weight != NICE_0_LOAD)) { | ||
| 327 | delta_exec_weighted = calc_delta_fair(delta_exec_weighted, | ||
| 328 | &curr->load); | ||
| 329 | } | ||
| 330 | curr->vruntime += delta_exec_weighted; | 458 | curr->vruntime += delta_exec_weighted; |
| 331 | } | 459 | } |
| 332 | 460 | ||
| @@ -413,20 +541,43 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 413 | * Scheduling class queueing methods: | 541 | * Scheduling class queueing methods: |
| 414 | */ | 542 | */ |
| 415 | 543 | ||
| 544 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
| 545 | static void | ||
| 546 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
| 547 | { | ||
| 548 | cfs_rq->task_weight += weight; | ||
| 549 | } | ||
| 550 | #else | ||
| 551 | static inline void | ||
| 552 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
| 553 | { | ||
| 554 | } | ||
| 555 | #endif | ||
| 556 | |||
| 416 | static void | 557 | static void |
| 417 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 558 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 418 | { | 559 | { |
| 419 | update_load_add(&cfs_rq->load, se->load.weight); | 560 | update_load_add(&cfs_rq->load, se->load.weight); |
| 561 | if (!parent_entity(se)) | ||
| 562 | inc_cpu_load(rq_of(cfs_rq), se->load.weight); | ||
| 563 | if (entity_is_task(se)) | ||
| 564 | add_cfs_task_weight(cfs_rq, se->load.weight); | ||
| 420 | cfs_rq->nr_running++; | 565 | cfs_rq->nr_running++; |
| 421 | se->on_rq = 1; | 566 | se->on_rq = 1; |
| 567 | list_add(&se->group_node, &cfs_rq->tasks); | ||
| 422 | } | 568 | } |
| 423 | 569 | ||
| 424 | static void | 570 | static void |
| 425 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 571 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 426 | { | 572 | { |
| 427 | update_load_sub(&cfs_rq->load, se->load.weight); | 573 | update_load_sub(&cfs_rq->load, se->load.weight); |
| 574 | if (!parent_entity(se)) | ||
| 575 | dec_cpu_load(rq_of(cfs_rq), se->load.weight); | ||
| 576 | if (entity_is_task(se)) | ||
| 577 | add_cfs_task_weight(cfs_rq, -se->load.weight); | ||
| 428 | cfs_rq->nr_running--; | 578 | cfs_rq->nr_running--; |
| 429 | se->on_rq = 0; | 579 | se->on_rq = 0; |
| 580 | list_del_init(&se->group_node); | ||
| 430 | } | 581 | } |
| 431 | 582 | ||
| 432 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 583 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
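account_entity_enqueue()/dequeue() now maintain several aggregates in step: the cfs_rq load, the per-rq CPU load for top-level entities, a task-only weight sum for group-aware balancing, and membership on the new cfs_rq->tasks list. A small model of that bookkeeping (field and type names are stand-ins):

#include <stdio.h>

struct my_cfs_rq {
	unsigned long load;		/* all queued entities            */
	unsigned long task_weight;	/* tasks only, used for balancing */
	unsigned long nr_running;
};

struct my_entity {
	unsigned long weight;
	int is_task;	/* group entities "own" a runqueue instead */
};

static void account_enqueue(struct my_cfs_rq *q, struct my_entity *se)
{
	q->load += se->weight;
	if (se->is_task)
		q->task_weight += se->weight;
	q->nr_running++;
}

static void account_dequeue(struct my_cfs_rq *q, struct my_entity *se)
{
	q->load -= se->weight;
	if (se->is_task)
		q->task_weight -= se->weight;
	q->nr_running--;
}

int main(void)
{
	struct my_cfs_rq q = { 0, 0, 0 };
	struct my_entity task  = { 1024, 1 };
	struct my_entity group = { 2048, 0 };

	account_enqueue(&q, &task);
	account_enqueue(&q, &group);
	printf("load=%lu task_weight=%lu\n", q.load, q.task_weight);
	account_dequeue(&q, &task);
	printf("load=%lu task_weight=%lu\n", q.load, q.task_weight);
	return 0;
}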
| @@ -510,8 +661,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
| 510 | 661 | ||
| 511 | if (!initial) { | 662 | if (!initial) { |
| 512 | /* sleeps upto a single latency don't count. */ | 663 | /* sleeps upto a single latency don't count. */ |
| 513 | if (sched_feat(NEW_FAIR_SLEEPERS)) | 664 | if (sched_feat(NEW_FAIR_SLEEPERS)) { |
| 514 | vruntime -= sysctl_sched_latency; | 665 | if (sched_feat(NORMALIZED_SLEEPER)) |
| 666 | vruntime -= calc_delta_weight(sysctl_sched_latency, se); | ||
| 667 | else | ||
| 668 | vruntime -= sysctl_sched_latency; | ||
| 669 | } | ||
| 515 | 670 | ||
| 516 | /* ensure we never gain time by being placed backwards. */ | 671 | /* ensure we never gain time by being placed backwards. */ |
| 517 | vruntime = max_vruntime(se->vruntime, vruntime); | 672 | vruntime = max_vruntime(se->vruntime, vruntime); |
| @@ -627,20 +782,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 627 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 782 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
| 628 | } | 783 | } |
| 629 | 784 | ||
| 785 | static int | ||
| 786 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | ||
| 787 | |||
| 630 | static struct sched_entity * | 788 | static struct sched_entity * |
| 631 | pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) | 789 | pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 632 | { | 790 | { |
| 633 | s64 diff, gran; | ||
| 634 | |||
| 635 | if (!cfs_rq->next) | 791 | if (!cfs_rq->next) |
| 636 | return se; | 792 | return se; |
| 637 | 793 | ||
| 638 | diff = cfs_rq->next->vruntime - se->vruntime; | 794 | if (wakeup_preempt_entity(cfs_rq->next, se) != 0) |
| 639 | if (diff < 0) | ||
| 640 | return se; | ||
| 641 | |||
| 642 | gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load); | ||
| 643 | if (diff > gran) | ||
| 644 | return se; | 795 | return se; |
| 645 | 796 | ||
| 646 | return cfs_rq->next; | 797 | return cfs_rq->next; |
| @@ -708,101 +859,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
| 708 | * CFS operations on tasks: | 859 | * CFS operations on tasks: |
| 709 | */ | 860 | */ |
| 710 | 861 | ||
| 711 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 712 | |||
| 713 | /* Walk up scheduling entities hierarchy */ | ||
| 714 | #define for_each_sched_entity(se) \ | ||
| 715 | for (; se; se = se->parent) | ||
| 716 | |||
| 717 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) | ||
| 718 | { | ||
| 719 | return p->se.cfs_rq; | ||
| 720 | } | ||
| 721 | |||
| 722 | /* runqueue on which this entity is (to be) queued */ | ||
| 723 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
| 724 | { | ||
| 725 | return se->cfs_rq; | ||
| 726 | } | ||
| 727 | |||
| 728 | /* runqueue "owned" by this group */ | ||
| 729 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
| 730 | { | ||
| 731 | return grp->my_q; | ||
| 732 | } | ||
| 733 | |||
| 734 | /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on | ||
| 735 | * another cpu ('this_cpu') | ||
| 736 | */ | ||
| 737 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
| 738 | { | ||
| 739 | return cfs_rq->tg->cfs_rq[this_cpu]; | ||
| 740 | } | ||
| 741 | |||
| 742 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | ||
| 743 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
| 744 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | ||
| 745 | |||
| 746 | /* Do the two (enqueued) entities belong to the same group ? */ | ||
| 747 | static inline int | ||
| 748 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
| 749 | { | ||
| 750 | if (se->cfs_rq == pse->cfs_rq) | ||
| 751 | return 1; | ||
| 752 | |||
| 753 | return 0; | ||
| 754 | } | ||
| 755 | |||
| 756 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
| 757 | { | ||
| 758 | return se->parent; | ||
| 759 | } | ||
| 760 | |||
| 761 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 762 | |||
| 763 | #define for_each_sched_entity(se) \ | ||
| 764 | for (; se; se = NULL) | ||
| 765 | |||
| 766 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) | ||
| 767 | { | ||
| 768 | return &task_rq(p)->cfs; | ||
| 769 | } | ||
| 770 | |||
| 771 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
| 772 | { | ||
| 773 | struct task_struct *p = task_of(se); | ||
| 774 | struct rq *rq = task_rq(p); | ||
| 775 | |||
| 776 | return &rq->cfs; | ||
| 777 | } | ||
| 778 | |||
| 779 | /* runqueue "owned" by this group */ | ||
| 780 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
| 781 | { | ||
| 782 | return NULL; | ||
| 783 | } | ||
| 784 | |||
| 785 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
| 786 | { | ||
| 787 | return &cpu_rq(this_cpu)->cfs; | ||
| 788 | } | ||
| 789 | |||
| 790 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
| 791 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | ||
| 792 | |||
| 793 | static inline int | ||
| 794 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
| 795 | { | ||
| 796 | return 1; | ||
| 797 | } | ||
| 798 | |||
| 799 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
| 800 | { | ||
| 801 | return NULL; | ||
| 802 | } | ||
| 803 | |||
| 804 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 805 | |||
| 806 | #ifdef CONFIG_SCHED_HRTICK | 862 | #ifdef CONFIG_SCHED_HRTICK |
| 807 | static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | 863 | static void hrtick_start_fair(struct rq *rq, struct task_struct *p) |
| 808 | { | 864 | { |
| @@ -916,7 +972,7 @@ static void yield_task_fair(struct rq *rq) | |||
| 916 | /* | 972 | /* |
| 917 | * Already in the rightmost position? | 973 | * Already in the rightmost position? |
| 918 | */ | 974 | */ |
| 919 | if (unlikely(rightmost->vruntime < se->vruntime)) | 975 | if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) |
| 920 | return; | 976 | return; |
| 921 | 977 | ||
| 922 | /* | 978 | /* |
| @@ -955,7 +1011,9 @@ static int wake_idle(int cpu, struct task_struct *p) | |||
| 955 | return cpu; | 1011 | return cpu; |
| 956 | 1012 | ||
| 957 | for_each_domain(cpu, sd) { | 1013 | for_each_domain(cpu, sd) { |
| 958 | if (sd->flags & SD_WAKE_IDLE) { | 1014 | if ((sd->flags & SD_WAKE_IDLE) |
| 1015 | || ((sd->flags & SD_WAKE_IDLE_FAR) | ||
| 1016 | && !task_hot(p, task_rq(p)->clock, sd))) { | ||
| 959 | cpus_and(tmp, sd->span, p->cpus_allowed); | 1017 | cpus_and(tmp, sd->span, p->cpus_allowed); |
| 960 | for_each_cpu_mask(i, tmp) { | 1018 | for_each_cpu_mask(i, tmp) { |
| 961 | if (idle_cpu(i)) { | 1019 | if (idle_cpu(i)) { |
| @@ -1099,6 +1157,58 @@ out: | |||
| 1099 | } | 1157 | } |
| 1100 | #endif /* CONFIG_SMP */ | 1158 | #endif /* CONFIG_SMP */ |
| 1101 | 1159 | ||
| 1160 | static unsigned long wakeup_gran(struct sched_entity *se) | ||
| 1161 | { | ||
| 1162 | unsigned long gran = sysctl_sched_wakeup_granularity; | ||
| 1163 | |||
| 1164 | /* | ||
| 1165 | * More easily preempt - nice tasks, while not making it harder for | ||
| 1166 | * + nice tasks. | ||
| 1167 | */ | ||
| 1168 | gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); | ||
| 1169 | |||
| 1170 | return gran; | ||
| 1171 | } | ||
| 1172 | |||
| 1173 | /* | ||
| 1174 | * Should 'se' preempt 'curr'. | ||
| 1175 | * | ||
| 1176 | * |s1 | ||
| 1177 | * |s2 | ||
| 1178 | * |s3 | ||
| 1179 | * g | ||
| 1180 | * |<--->|c | ||
| 1181 | * | ||
| 1182 | * w(c, s1) = -1 | ||
| 1183 | * w(c, s2) = 0 | ||
| 1184 | * w(c, s3) = 1 | ||
| 1185 | * | ||
| 1186 | */ | ||
| 1187 | static int | ||
| 1188 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) | ||
| 1189 | { | ||
| 1190 | s64 gran, vdiff = curr->vruntime - se->vruntime; | ||
| 1191 | |||
| 1192 | if (vdiff < 0) | ||
| 1193 | return -1; | ||
| 1194 | |||
| 1195 | gran = wakeup_gran(curr); | ||
| 1196 | if (vdiff > gran) | ||
| 1197 | return 1; | ||
| 1198 | |||
| 1199 | return 0; | ||
| 1200 | } | ||
| 1201 | |||
| 1202 | /* return depth at which a sched entity is present in the hierarchy */ | ||
| 1203 | static inline int depth_se(struct sched_entity *se) | ||
| 1204 | { | ||
| 1205 | int depth = 0; | ||
| 1206 | |||
| 1207 | for_each_sched_entity(se) | ||
| 1208 | depth++; | ||
| 1209 | |||
| 1210 | return depth; | ||
| 1211 | } | ||
| 1102 | 1212 | ||
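wakeup_preempt_entity() collapses the old ad-hoc comparison into a three-way result: -1 when the current entity still has the smaller vruntime, 1 when the woken entity leads by more than the wakeup granularity, and 0 inside the granularity window. A standalone sketch with a fixed granularity; the real code derives gran per entity through calc_delta_asym():

#include <stdio.h>
#include <stdint.h>

/*
 * -1: curr is still ahead of se, no preemption
 *  0: within the granularity window, leave curr running
 *  1: se leads curr by more than the granularity -> preempt
 */
static int wakeup_preempt(int64_t curr_vruntime, int64_t se_vruntime,
			  int64_t gran)
{
	int64_t vdiff = curr_vruntime - se_vruntime;

	if (vdiff < 0)
		return -1;
	if (vdiff > gran)
		return 1;
	return 0;
}

int main(void)
{
	int64_t gran = 10000000;	/* 10ms, the new default granularity */

	printf("%d\n", wakeup_preempt(1000, 5000, gran));	/* -1 */
	printf("%d\n", wakeup_preempt(5000, 1000, gran));	/*  0 */
	printf("%d\n", wakeup_preempt(20000000, 1000, gran));	/*  1 */
	return 0;
}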
| 1103 | /* | 1213 | /* |
| 1104 | * Preempt the current task with a newly woken task if needed: | 1214 | * Preempt the current task with a newly woken task if needed: |
| @@ -1108,7 +1218,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
| 1108 | struct task_struct *curr = rq->curr; | 1218 | struct task_struct *curr = rq->curr; |
| 1109 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1219 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
| 1110 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1220 | struct sched_entity *se = &curr->se, *pse = &p->se; |
| 1111 | unsigned long gran; | 1221 | int se_depth, pse_depth; |
| 1112 | 1222 | ||
| 1113 | if (unlikely(rt_prio(p->prio))) { | 1223 | if (unlikely(rt_prio(p->prio))) { |
| 1114 | update_rq_clock(rq); | 1224 | update_rq_clock(rq); |
| @@ -1133,20 +1243,33 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
| 1133 | if (!sched_feat(WAKEUP_PREEMPT)) | 1243 | if (!sched_feat(WAKEUP_PREEMPT)) |
| 1134 | return; | 1244 | return; |
| 1135 | 1245 | ||
| 1136 | while (!is_same_group(se, pse)) { | 1246 | /* |
| 1247 | * preemption test can be made between sibling entities who are in the | ||
| 1248 | * same cfs_rq i.e who have a common parent. Walk up the hierarchy of | ||
| 1249 | * both tasks until we find their ancestors who are siblings of common | ||
| 1250 | * parent. | ||
| 1251 | */ | ||
| 1252 | |||
| 1253 | /* First walk up until both entities are at same depth */ | ||
| 1254 | se_depth = depth_se(se); | ||
| 1255 | pse_depth = depth_se(pse); | ||
| 1256 | |||
| 1257 | while (se_depth > pse_depth) { | ||
| 1258 | se_depth--; | ||
| 1137 | se = parent_entity(se); | 1259 | se = parent_entity(se); |
| 1260 | } | ||
| 1261 | |||
| 1262 | while (pse_depth > se_depth) { | ||
| 1263 | pse_depth--; | ||
| 1138 | pse = parent_entity(pse); | 1264 | pse = parent_entity(pse); |
| 1139 | } | 1265 | } |
| 1140 | 1266 | ||
| 1141 | gran = sysctl_sched_wakeup_granularity; | 1267 | while (!is_same_group(se, pse)) { |
| 1142 | /* | 1268 | se = parent_entity(se); |
| 1143 | * More easily preempt - nice tasks, while not making | 1269 | pse = parent_entity(pse); |
| 1144 | * it harder for + nice tasks. | 1270 | } |
| 1145 | */ | ||
| 1146 | if (unlikely(se->load.weight > NICE_0_LOAD)) | ||
| 1147 | gran = calc_delta_fair(gran, &se->load); | ||
| 1148 | 1271 | ||
| 1149 | if (pse->vruntime + gran < se->vruntime) | 1272 | if (wakeup_preempt_entity(se, pse) == 1) |
| 1150 | resched_task(curr); | 1273 | resched_task(curr); |
| 1151 | } | 1274 | } |
| 1152 | 1275 | ||
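The w(c, s1)/w(c, s2)/w(c, s3) legend above maps onto the three return values of wakeup_preempt_entity(): -1 when the waking entity still trails the current one, 0 when it leads but by less than the wakeup granularity, and 1 when it leads by more than the granularity and should preempt. A minimal stand-alone sketch of that decision in plain user-space C, using made-up vruntime and granularity values rather than real scheduler state:

    #include <stdio.h>

    typedef long long s64;

    /* Same three-way convention as wakeup_preempt_entity() above. */
    static int preempt_decision(s64 curr_vruntime, s64 se_vruntime, s64 gran)
    {
        s64 vdiff = curr_vruntime - se_vruntime;

        if (vdiff < 0)
            return -1;      /* current entity still has less vruntime */
        if (vdiff > gran)
            return 1;       /* woken entity is far enough ahead: preempt */
        return 0;           /* within the granularity: keep current running */
    }

    int main(void)
    {
        s64 gran = 10000000;    /* sample granularity: 10 ms in ns */

        printf("%d\n", preempt_decision(5, 10, gran));         /* prints -1 */
        printf("%d\n", preempt_decision(20000000, 5, gran));   /* prints  1 */
        printf("%d\n", preempt_decision(5000005, 5, gran));    /* prints  0 */
        return 0;
    }

Note that check_preempt_wakeup() only reschedules on the == 1 case, so a lead smaller than the granularity never causes a wakeup preemption.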
| @@ -1197,15 +1320,27 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
| 1197 | * the current task: | 1320 | * the current task: |
| 1198 | */ | 1321 | */ |
| 1199 | static struct task_struct * | 1322 | static struct task_struct * |
| 1200 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) | 1323 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) |
| 1201 | { | 1324 | { |
| 1202 | struct task_struct *p; | 1325 | struct task_struct *p = NULL; |
| 1326 | struct sched_entity *se; | ||
| 1327 | |||
| 1328 | if (next == &cfs_rq->tasks) | ||
| 1329 | return NULL; | ||
| 1330 | |||
| 1331 | /* Skip over entities that are not tasks */ | ||
| 1332 | do { | ||
| 1333 | se = list_entry(next, struct sched_entity, group_node); | ||
| 1334 | next = next->next; | ||
| 1335 | } while (next != &cfs_rq->tasks && !entity_is_task(se)); | ||
| 1203 | 1336 | ||
| 1204 | if (!curr) | 1337 | if (next == &cfs_rq->tasks) |
| 1205 | return NULL; | 1338 | return NULL; |
| 1206 | 1339 | ||
| 1207 | p = rb_entry(curr, struct task_struct, se.run_node); | 1340 | cfs_rq->balance_iterator = next; |
| 1208 | cfs_rq->rb_load_balance_curr = rb_next(curr); | 1341 | |
| 1342 | if (entity_is_task(se)) | ||
| 1343 | p = task_of(se); | ||
| 1209 | 1344 | ||
| 1210 | return p; | 1345 | return p; |
| 1211 | } | 1346 | } |
| @@ -1214,85 +1349,100 @@ static struct task_struct *load_balance_start_fair(void *arg) | |||
| 1214 | { | 1349 | { |
| 1215 | struct cfs_rq *cfs_rq = arg; | 1350 | struct cfs_rq *cfs_rq = arg; |
| 1216 | 1351 | ||
| 1217 | return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); | 1352 | return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); |
| 1218 | } | 1353 | } |
| 1219 | 1354 | ||
| 1220 | static struct task_struct *load_balance_next_fair(void *arg) | 1355 | static struct task_struct *load_balance_next_fair(void *arg) |
| 1221 | { | 1356 | { |
| 1222 | struct cfs_rq *cfs_rq = arg; | 1357 | struct cfs_rq *cfs_rq = arg; |
| 1223 | 1358 | ||
| 1224 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); | 1359 | return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); |
| 1225 | } | 1360 | } |
| 1226 | 1361 | ||
| 1227 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1362 | static unsigned long |
| 1228 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | 1363 | __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 1364 | unsigned long max_load_move, struct sched_domain *sd, | ||
| 1365 | enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, | ||
| 1366 | struct cfs_rq *cfs_rq) | ||
| 1229 | { | 1367 | { |
| 1230 | struct sched_entity *curr; | 1368 | struct rq_iterator cfs_rq_iterator; |
| 1231 | struct task_struct *p; | ||
| 1232 | |||
| 1233 | if (!cfs_rq->nr_running || !first_fair(cfs_rq)) | ||
| 1234 | return MAX_PRIO; | ||
| 1235 | |||
| 1236 | curr = cfs_rq->curr; | ||
| 1237 | if (!curr) | ||
| 1238 | curr = __pick_next_entity(cfs_rq); | ||
| 1239 | 1369 | ||
| 1240 | p = task_of(curr); | 1370 | cfs_rq_iterator.start = load_balance_start_fair; |
| 1371 | cfs_rq_iterator.next = load_balance_next_fair; | ||
| 1372 | cfs_rq_iterator.arg = cfs_rq; | ||
| 1241 | 1373 | ||
| 1242 | return p->prio; | 1374 | return balance_tasks(this_rq, this_cpu, busiest, |
| 1375 | max_load_move, sd, idle, all_pinned, | ||
| 1376 | this_best_prio, &cfs_rq_iterator); | ||
| 1243 | } | 1377 | } |
| 1244 | #endif | ||
| 1245 | 1378 | ||
| 1379 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1246 | static unsigned long | 1380 | static unsigned long |
| 1247 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1381 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 1248 | unsigned long max_load_move, | 1382 | unsigned long max_load_move, |
| 1249 | struct sched_domain *sd, enum cpu_idle_type idle, | 1383 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 1250 | int *all_pinned, int *this_best_prio) | 1384 | int *all_pinned, int *this_best_prio) |
| 1251 | { | 1385 | { |
| 1252 | struct cfs_rq *busy_cfs_rq; | ||
| 1253 | long rem_load_move = max_load_move; | 1386 | long rem_load_move = max_load_move; |
| 1254 | struct rq_iterator cfs_rq_iterator; | 1387 | int busiest_cpu = cpu_of(busiest); |
| 1255 | 1388 | struct task_group *tg; | |
| 1256 | cfs_rq_iterator.start = load_balance_start_fair; | ||
| 1257 | cfs_rq_iterator.next = load_balance_next_fair; | ||
| 1258 | 1389 | ||
| 1259 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 1390 | rcu_read_lock(); |
| 1260 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1391 | list_for_each_entry(tg, &task_groups, list) { |
| 1261 | struct cfs_rq *this_cfs_rq; | ||
| 1262 | long imbalance; | 1392 | long imbalance; |
| 1263 | unsigned long maxload; | 1393 | unsigned long this_weight, busiest_weight; |
| 1394 | long rem_load, max_load, moved_load; | ||
| 1395 | |||
| 1396 | /* | ||
| 1397 | * empty group | ||
| 1398 | */ | ||
| 1399 | if (!aggregate(tg, sd)->task_weight) | ||
| 1400 | continue; | ||
| 1401 | |||
| 1402 | rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; | ||
| 1403 | rem_load /= aggregate(tg, sd)->load + 1; | ||
| 1404 | |||
| 1405 | this_weight = tg->cfs_rq[this_cpu]->task_weight; | ||
| 1406 | busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; | ||
| 1407 | |||
| 1408 | imbalance = (busiest_weight - this_weight) / 2; | ||
| 1264 | 1409 | ||
| 1265 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); | 1410 | if (imbalance < 0) |
| 1411 | imbalance = busiest_weight; | ||
| 1266 | 1412 | ||
| 1267 | imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; | 1413 | max_load = max(rem_load, imbalance); |
| 1268 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | 1414 | moved_load = __load_balance_fair(this_rq, this_cpu, busiest, |
| 1269 | if (imbalance <= 0) | 1415 | max_load, sd, idle, all_pinned, this_best_prio, |
| 1416 | tg->cfs_rq[busiest_cpu]); | ||
| 1417 | |||
| 1418 | if (!moved_load) | ||
| 1270 | continue; | 1419 | continue; |
| 1271 | 1420 | ||
| 1272 | /* Don't pull more than imbalance/2 */ | 1421 | move_group_shares(tg, sd, busiest_cpu, this_cpu); |
| 1273 | imbalance /= 2; | ||
| 1274 | maxload = min(rem_load_move, imbalance); | ||
| 1275 | 1422 | ||
| 1276 | *this_best_prio = cfs_rq_best_prio(this_cfs_rq); | 1423 | moved_load *= aggregate(tg, sd)->load; |
| 1277 | #else | 1424 | moved_load /= aggregate(tg, sd)->rq_weight + 1; |
| 1278 | # define maxload rem_load_move | ||
| 1279 | #endif | ||
| 1280 | /* | ||
| 1281 | * pass busy_cfs_rq argument into | ||
| 1282 | * load_balance_[start|next]_fair iterators | ||
| 1283 | */ | ||
| 1284 | cfs_rq_iterator.arg = busy_cfs_rq; | ||
| 1285 | rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, | ||
| 1286 | maxload, sd, idle, all_pinned, | ||
| 1287 | this_best_prio, | ||
| 1288 | &cfs_rq_iterator); | ||
| 1289 | 1425 | ||
| 1290 | if (rem_load_move <= 0) | 1426 | rem_load_move -= moved_load; |
| 1427 | if (rem_load_move < 0) | ||
| 1291 | break; | 1428 | break; |
| 1292 | } | 1429 | } |
| 1430 | rcu_read_unlock(); | ||
| 1293 | 1431 | ||
| 1294 | return max_load_move - rem_load_move; | 1432 | return max_load_move - rem_load_move; |
| 1295 | } | 1433 | } |
| 1434 | #else | ||
| 1435 | static unsigned long | ||
| 1436 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 1437 | unsigned long max_load_move, | ||
| 1438 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 1439 | int *all_pinned, int *this_best_prio) | ||
| 1440 | { | ||
| 1441 | return __load_balance_fair(this_rq, this_cpu, busiest, | ||
| 1442 | max_load_move, sd, idle, all_pinned, | ||
| 1443 | this_best_prio, &busiest->cfs); | ||
| 1444 | } | ||
| 1445 | #endif | ||
| 1296 | 1446 | ||
| 1297 | static int | 1447 | static int |
| 1298 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1448 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
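The arithmetic in load_balance_fair() works in two weight domains: the remaining load to move is first scaled down into the group's own units (rem_load_move * rq_weight / (load + 1)), the iterator then moves tasks in those units, and the amount actually moved is scaled back up (moved_load * load / (rq_weight + 1)) before being subtracted from rem_load_move. A stand-alone sketch of that round trip; the aggregate load and rq_weight numbers below merely stand in for the aggregate(tg, sd) fields and are invented for illustration:

    #include <stdio.h>

    int main(void)
    {
        long rem_load_move = 2048;          /* load still to move, global units */
        unsigned long agg_load = 4096;      /* group's contribution to rq load */
        unsigned long agg_rq_weight = 1024; /* sum of the group's internal weights */

        /* translate the global target into group-internal units */
        long rem_load = rem_load_move * agg_rq_weight / (agg_load + 1);

        /* pretend the task iterator managed to move half of that */
        long moved_load = rem_load / 2;

        /* translate back into global units before accounting for it */
        long moved_global = moved_load * agg_load / (agg_rq_weight + 1);

        printf("group target %ld, moved %ld, counts as %ld globally\n",
               rem_load, moved_load, moved_global);
        return 0;
    }

The +1 in both divisors only guards against an empty group; for realistic weights it barely affects the result.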
| @@ -1461,16 +1611,40 @@ static const struct sched_class fair_sched_class = { | |||
| 1461 | }; | 1611 | }; |
| 1462 | 1612 | ||
| 1463 | #ifdef CONFIG_SCHED_DEBUG | 1613 | #ifdef CONFIG_SCHED_DEBUG |
| 1614 | static void | ||
| 1615 | print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth) | ||
| 1616 | { | ||
| 1617 | struct sched_entity *se; | ||
| 1618 | |||
| 1619 | if (!cfs_rq) | ||
| 1620 | return; | ||
| 1621 | |||
| 1622 | list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) { | ||
| 1623 | int i; | ||
| 1624 | |||
| 1625 | for (i = depth; i; i--) | ||
| 1626 | seq_puts(m, " "); | ||
| 1627 | |||
| 1628 | seq_printf(m, "%lu %s %lu\n", | ||
| 1629 | se->load.weight, | ||
| 1630 | entity_is_task(se) ? "T" : "G", | ||
| 1631 | calc_delta_weight(SCHED_LOAD_SCALE, se) | ||
| 1632 | ); | ||
| 1633 | if (!entity_is_task(se)) | ||
| 1634 | print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1); | ||
| 1635 | } | ||
| 1636 | } | ||
| 1637 | |||
| 1464 | static void print_cfs_stats(struct seq_file *m, int cpu) | 1638 | static void print_cfs_stats(struct seq_file *m, int cpu) |
| 1465 | { | 1639 | { |
| 1466 | struct cfs_rq *cfs_rq; | 1640 | struct cfs_rq *cfs_rq; |
| 1467 | 1641 | ||
| 1468 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1469 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); | ||
| 1470 | #endif | ||
| 1471 | rcu_read_lock(); | 1642 | rcu_read_lock(); |
| 1472 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 1643 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) |
| 1473 | print_cfs_rq(m, cpu, cfs_rq); | 1644 | print_cfs_rq(m, cpu, cfs_rq); |
| 1645 | |||
| 1646 | seq_printf(m, "\nWeight tree:\n"); | ||
| 1647 | print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1); | ||
| 1474 | rcu_read_unlock(); | 1648 | rcu_read_unlock(); |
| 1475 | } | 1649 | } |
| 1476 | #endif | 1650 | #endif |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h new file mode 100644 index 000000000000..1c7283cb9581 --- /dev/null +++ b/kernel/sched_features.h | |||
| @@ -0,0 +1,10 @@ | |||
| 1 | SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) | ||
| 2 | SCHED_FEAT(WAKEUP_PREEMPT, 1) | ||
| 3 | SCHED_FEAT(START_DEBIT, 1) | ||
| 4 | SCHED_FEAT(AFFINE_WAKEUPS, 1) | ||
| 5 | SCHED_FEAT(CACHE_HOT_BUDDY, 1) | ||
| 6 | SCHED_FEAT(SYNC_WAKEUPS, 1) | ||
| 7 | SCHED_FEAT(HRTICK, 1) | ||
| 8 | SCHED_FEAT(DOUBLE_TICK, 0) | ||
| 9 | SCHED_FEAT(NORMALIZED_SLEEPER, 1) | ||
| 10 | SCHED_FEAT(DEADLINE, 1) | ||
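A feature list like this is normally consumed with the X-macro trick: the header is included more than once with different definitions of SCHED_FEAT(), once to assign each feature a bit position and once to build the default mask that sched_feat() (used by check_preempt_wakeup() above) tests. A stand-alone sketch of that pattern; the FEAT/FEATURE_LIST/feat_enabled names are illustrative, and the real consumer in kernel/sched.c may spell things differently:

    #include <stdio.h>

    /* Stands in for the included feature header. */
    #define FEATURE_LIST \
        FEAT(NEW_FAIR_SLEEPERS, 1) \
        FEAT(WAKEUP_PREEMPT, 1) \
        FEAT(DOUBLE_TICK, 0)

    /* Pass 1: give every feature a bit position. */
    enum {
    #define FEAT(name, enabled) __FEAT_##name,
        FEATURE_LIST
    #undef FEAT
    };

    /* Pass 2: fold the per-feature defaults into one bitmask. */
    static unsigned int sysctl_features =
    #define FEAT(name, enabled) (enabled ? 1u << __FEAT_##name : 0) |
        FEATURE_LIST
    #undef FEAT
        0;

    #define feat_enabled(x) ((sysctl_features & (1u << __FEAT_##x)) != 0)

    int main(void)
    {
        printf("WAKEUP_PREEMPT=%d DOUBLE_TICK=%d\n",
               feat_enabled(WAKEUP_PREEMPT), feat_enabled(DOUBLE_TICK));
        return 0;
    }

Keeping the list in its own header means adding a feature is a one-line change, with the bit positions and the default mask staying in sync automatically.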
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 0a6d2e516420..c2730a5a4f05 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -62,7 +62,12 @@ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) | |||
| 62 | if (!rt_rq->tg) | 62 | if (!rt_rq->tg) |
| 63 | return RUNTIME_INF; | 63 | return RUNTIME_INF; |
| 64 | 64 | ||
| 65 | return rt_rq->tg->rt_runtime; | 65 | return rt_rq->rt_runtime; |
| 66 | } | ||
| 67 | |||
| 68 | static inline u64 sched_rt_period(struct rt_rq *rt_rq) | ||
| 69 | { | ||
| 70 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); | ||
| 66 | } | 71 | } |
| 67 | 72 | ||
| 68 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 73 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
| @@ -127,14 +132,39 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) | |||
| 127 | return p->prio != p->normal_prio; | 132 | return p->prio != p->normal_prio; |
| 128 | } | 133 | } |
| 129 | 134 | ||
| 135 | #ifdef CONFIG_SMP | ||
| 136 | static inline cpumask_t sched_rt_period_mask(void) | ||
| 137 | { | ||
| 138 | return cpu_rq(smp_processor_id())->rd->span; | ||
| 139 | } | ||
| 140 | #else | ||
| 141 | static inline cpumask_t sched_rt_period_mask(void) | ||
| 142 | { | ||
| 143 | return cpu_online_map; | ||
| 144 | } | ||
| 145 | #endif | ||
| 146 | |||
| 147 | static inline | ||
| 148 | struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) | ||
| 149 | { | ||
| 150 | return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; | ||
| 151 | } | ||
| 152 | |||
| 153 | static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | ||
| 154 | { | ||
| 155 | return &rt_rq->tg->rt_bandwidth; | ||
| 156 | } | ||
| 157 | |||
| 130 | #else | 158 | #else |
| 131 | 159 | ||
| 132 | static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) | 160 | static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) |
| 133 | { | 161 | { |
| 134 | if (sysctl_sched_rt_runtime == -1) | 162 | return rt_rq->rt_runtime; |
| 135 | return RUNTIME_INF; | 163 | } |
| 136 | 164 | ||
| 137 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | 165 | static inline u64 sched_rt_period(struct rt_rq *rt_rq) |
| 166 | { | ||
| 167 | return ktime_to_ns(def_rt_bandwidth.rt_period); | ||
| 138 | } | 168 | } |
| 139 | 169 | ||
| 140 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 170 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
| @@ -173,6 +203,102 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq) | |||
| 173 | { | 203 | { |
| 174 | return rt_rq->rt_throttled; | 204 | return rt_rq->rt_throttled; |
| 175 | } | 205 | } |
| 206 | |||
| 207 | static inline cpumask_t sched_rt_period_mask(void) | ||
| 208 | { | ||
| 209 | return cpu_online_map; | ||
| 210 | } | ||
| 211 | |||
| 212 | static inline | ||
| 213 | struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) | ||
| 214 | { | ||
| 215 | return &cpu_rq(cpu)->rt; | ||
| 216 | } | ||
| 217 | |||
| 218 | static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | ||
| 219 | { | ||
| 220 | return &def_rt_bandwidth; | ||
| 221 | } | ||
| 222 | |||
| 223 | #endif | ||
| 224 | |||
| 225 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | ||
| 226 | { | ||
| 227 | int i, idle = 1; | ||
| 228 | cpumask_t span; | ||
| 229 | |||
| 230 | if (rt_b->rt_runtime == RUNTIME_INF) | ||
| 231 | return 1; | ||
| 232 | |||
| 233 | span = sched_rt_period_mask(); | ||
| 234 | for_each_cpu_mask(i, span) { | ||
| 235 | int enqueue = 0; | ||
| 236 | struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); | ||
| 237 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
| 238 | |||
| 239 | spin_lock(&rq->lock); | ||
| 240 | if (rt_rq->rt_time) { | ||
| 241 | u64 runtime; | ||
| 242 | |||
| 243 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 244 | runtime = rt_rq->rt_runtime; | ||
| 245 | rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); | ||
| 246 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | ||
| 247 | rt_rq->rt_throttled = 0; | ||
| 248 | enqueue = 1; | ||
| 249 | } | ||
| 250 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | ||
| 251 | idle = 0; | ||
| 252 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 253 | } | ||
| 254 | |||
| 255 | if (enqueue) | ||
| 256 | sched_rt_rq_enqueue(rt_rq); | ||
| 257 | spin_unlock(&rq->lock); | ||
| 258 | } | ||
| 259 | |||
| 260 | return idle; | ||
| 261 | } | ||
| 262 | |||
| 263 | #ifdef CONFIG_SMP | ||
| 264 | static int balance_runtime(struct rt_rq *rt_rq) | ||
| 265 | { | ||
| 266 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | ||
| 267 | struct root_domain *rd = cpu_rq(smp_processor_id())->rd; | ||
| 268 | int i, weight, more = 0; | ||
| 269 | u64 rt_period; | ||
| 270 | |||
| 271 | weight = cpus_weight(rd->span); | ||
| 272 | |||
| 273 | spin_lock(&rt_b->rt_runtime_lock); | ||
| 274 | rt_period = ktime_to_ns(rt_b->rt_period); | ||
| 275 | for_each_cpu_mask(i, rd->span) { | ||
| 276 | struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); | ||
| 277 | s64 diff; | ||
| 278 | |||
| 279 | if (iter == rt_rq) | ||
| 280 | continue; | ||
| 281 | |||
| 282 | spin_lock(&iter->rt_runtime_lock); | ||
| 283 | diff = iter->rt_runtime - iter->rt_time; | ||
| 284 | if (diff > 0) { | ||
| 285 | do_div(diff, weight); | ||
| 286 | if (rt_rq->rt_runtime + diff > rt_period) | ||
| 287 | diff = rt_period - rt_rq->rt_runtime; | ||
| 288 | iter->rt_runtime -= diff; | ||
| 289 | rt_rq->rt_runtime += diff; | ||
| 290 | more = 1; | ||
| 291 | if (rt_rq->rt_runtime == rt_period) { | ||
| 292 | spin_unlock(&iter->rt_runtime_lock); | ||
| 293 | break; | ||
| 294 | } | ||
| 295 | } | ||
| 296 | spin_unlock(&iter->rt_runtime_lock); | ||
| 297 | } | ||
| 298 | spin_unlock(&rt_b->rt_runtime_lock); | ||
| 299 | |||
| 300 | return more; | ||
| 301 | } | ||
| 176 | #endif | 302 | #endif |
| 177 | 303 | ||
| 178 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) | 304 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) |
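With concrete numbers, balance_runtime() does the following: every peer runqueue in the root domain that has spare budget (rt_runtime - rt_time > 0) donates 1/weight of that spare to the starved runqueue, capped so the receiver never exceeds the period. A stand-alone sketch with invented per-CPU budgets (a 1 s period, four CPUs, CPU0 exhausted); it leaves out the locking and the early break once the receiver reaches the full period:

    #include <stdio.h>

    int main(void)
    {
        long long rt_period = 1000000000LL;                /* 1 s in ns */
        long long runtime[4] = { 100000000LL, 300000000LL, /* per-cpu rt_runtime */
                                 300000000LL, 250000000LL };
        long long rt_time[4] = { 100000000LL, 50000000LL,  /* consumed so far */
                                 120000000LL, 80000000LL };
        int weight = 4;                                    /* CPUs in rd->span */
        int target = 0, i;                                 /* CPU0 is starved */

        for (i = 0; i < weight; i++) {
            long long diff;

            if (i == target)
                continue;
            diff = runtime[i] - rt_time[i];    /* the peer's spare budget */
            if (diff <= 0)
                continue;
            diff /= weight;                    /* borrow only a share of it */
            if (runtime[target] + diff > rt_period)
                diff = rt_period - runtime[target];
            runtime[i] -= diff;
            runtime[target] += diff;
        }
        printf("cpu0 runtime now %lld ns\n", runtime[target]);  /* 250000000 */
        return 0;
    }

Borrowing only spare/weight from each peer means a single pass cannot drain any donor completely; repeated invocations gradually shift budget toward the CPUs that actually consume RT time.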
| @@ -197,12 +323,24 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
| 197 | if (rt_rq->rt_throttled) | 323 | if (rt_rq->rt_throttled) |
| 198 | return rt_rq_throttled(rt_rq); | 324 | return rt_rq_throttled(rt_rq); |
| 199 | 325 | ||
| 326 | if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) | ||
| 327 | return 0; | ||
| 328 | |||
| 329 | #ifdef CONFIG_SMP | ||
| 200 | if (rt_rq->rt_time > runtime) { | 330 | if (rt_rq->rt_time > runtime) { |
| 201 | struct rq *rq = rq_of_rt_rq(rt_rq); | 331 | int more; |
| 202 | 332 | ||
| 203 | rq->rt_throttled = 1; | 333 | spin_unlock(&rt_rq->rt_runtime_lock); |
| 204 | rt_rq->rt_throttled = 1; | 334 | more = balance_runtime(rt_rq); |
| 335 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 205 | 336 | ||
| 337 | if (more) | ||
| 338 | runtime = sched_rt_runtime(rt_rq); | ||
| 339 | } | ||
| 340 | #endif | ||
| 341 | |||
| 342 | if (rt_rq->rt_time > runtime) { | ||
| 343 | rt_rq->rt_throttled = 1; | ||
| 206 | if (rt_rq_throttled(rt_rq)) { | 344 | if (rt_rq_throttled(rt_rq)) { |
| 207 | sched_rt_rq_dequeue(rt_rq); | 345 | sched_rt_rq_dequeue(rt_rq); |
| 208 | return 1; | 346 | return 1; |
| @@ -212,29 +350,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
| 212 | return 0; | 350 | return 0; |
| 213 | } | 351 | } |
| 214 | 352 | ||
| 215 | static void update_sched_rt_period(struct rq *rq) | ||
| 216 | { | ||
| 217 | struct rt_rq *rt_rq; | ||
| 218 | u64 period; | ||
| 219 | |||
| 220 | while (rq->clock > rq->rt_period_expire) { | ||
| 221 | period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | ||
| 222 | rq->rt_period_expire += period; | ||
| 223 | |||
| 224 | for_each_leaf_rt_rq(rt_rq, rq) { | ||
| 225 | u64 runtime = sched_rt_runtime(rt_rq); | ||
| 226 | |||
| 227 | rt_rq->rt_time -= min(rt_rq->rt_time, runtime); | ||
| 228 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | ||
| 229 | rt_rq->rt_throttled = 0; | ||
| 230 | sched_rt_rq_enqueue(rt_rq); | ||
| 231 | } | ||
| 232 | } | ||
| 233 | |||
| 234 | rq->rt_throttled = 0; | ||
| 235 | } | ||
| 236 | } | ||
| 237 | |||
| 238 | /* | 353 | /* |
| 239 | * Update the current task's runtime statistics. Skip current tasks that | 354 | * Update the current task's runtime statistics. Skip current tasks that |
| 240 | * are not in our scheduling class. | 355 | * are not in our scheduling class. |
| @@ -259,9 +374,15 @@ static void update_curr_rt(struct rq *rq) | |||
| 259 | curr->se.exec_start = rq->clock; | 374 | curr->se.exec_start = rq->clock; |
| 260 | cpuacct_charge(curr, delta_exec); | 375 | cpuacct_charge(curr, delta_exec); |
| 261 | 376 | ||
| 262 | rt_rq->rt_time += delta_exec; | 377 | for_each_sched_rt_entity(rt_se) { |
| 263 | if (sched_rt_runtime_exceeded(rt_rq)) | 378 | rt_rq = rt_rq_of_se(rt_se); |
| 264 | resched_task(curr); | 379 | |
| 380 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 381 | rt_rq->rt_time += delta_exec; | ||
| 382 | if (sched_rt_runtime_exceeded(rt_rq)) | ||
| 383 | resched_task(curr); | ||
| 384 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 385 | } | ||
| 265 | } | 386 | } |
| 266 | 387 | ||
| 267 | static inline | 388 | static inline |
| @@ -284,6 +405,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
| 284 | #ifdef CONFIG_RT_GROUP_SCHED | 405 | #ifdef CONFIG_RT_GROUP_SCHED |
| 285 | if (rt_se_boosted(rt_se)) | 406 | if (rt_se_boosted(rt_se)) |
| 286 | rt_rq->rt_nr_boosted++; | 407 | rt_rq->rt_nr_boosted++; |
| 408 | |||
| 409 | if (rt_rq->tg) | ||
| 410 | start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); | ||
| 411 | #else | ||
| 412 | start_rt_bandwidth(&def_rt_bandwidth); | ||
| 287 | #endif | 413 | #endif |
| 288 | } | 414 | } |
| 289 | 415 | ||
| @@ -353,27 +479,21 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
| 353 | /* | 479 | /* |
| 354 | * Because the prio of an upper entry depends on the lower | 480 | * Because the prio of an upper entry depends on the lower |
| 355 | * entries, we must remove entries top - down. | 481 | * entries, we must remove entries top - down. |
| 356 | * | ||
| 357 | * XXX: O(1/2 h^2) because we can only walk up, not down the chain. | ||
| 358 | * doesn't matter much for now, as h=2 for GROUP_SCHED. | ||
| 359 | */ | 482 | */ |
| 360 | static void dequeue_rt_stack(struct task_struct *p) | 483 | static void dequeue_rt_stack(struct task_struct *p) |
| 361 | { | 484 | { |
| 362 | struct sched_rt_entity *rt_se, *top_se; | 485 | struct sched_rt_entity *rt_se, *back = NULL; |
| 363 | 486 | ||
| 364 | /* | 487 | rt_se = &p->rt; |
| 365 | * dequeue all, top - down. | 488 | for_each_sched_rt_entity(rt_se) { |
| 366 | */ | 489 | rt_se->back = back; |
| 367 | do { | 490 | back = rt_se; |
| 368 | rt_se = &p->rt; | 491 | } |
| 369 | top_se = NULL; | 492 | |
| 370 | for_each_sched_rt_entity(rt_se) { | 493 | for (rt_se = back; rt_se; rt_se = rt_se->back) { |
| 371 | if (on_rt_rq(rt_se)) | 494 | if (on_rt_rq(rt_se)) |
| 372 | top_se = rt_se; | 495 | dequeue_rt_entity(rt_se); |
| 373 | } | 496 | } |
| 374 | if (top_se) | ||
| 375 | dequeue_rt_entity(top_se); | ||
| 376 | } while (top_se); | ||
| 377 | } | 497 | } |
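The rewritten walk works because a sched_rt_entity only carries a parent pointer: the first loop records a temporary back chain while climbing from the task towards the root, and the second loop replays that chain top-down, which is the order the dequeue needs. A stand-alone sketch of the same pattern with an illustrative node type (the kernel version additionally skips entities that are not queued):

    #include <stdio.h>

    struct node {
        const char *name;
        struct node *parent;
        struct node *back;    /* temporary, only valid during the walk */
    };

    static void dequeue(struct node *n)
    {
        printf("dequeue %s\n", n->name);
    }

    static void dequeue_stack(struct node *leaf)
    {
        struct node *n, *back = NULL;

        /* record the path bottom-up */
        for (n = leaf; n; n = n->parent) {
            n->back = back;
            back = n;
        }
        /* 'back' is now the topmost ancestor; replay top-down */
        for (n = back; n; n = n->back)
            dequeue(n);
    }

    int main(void)
    {
        struct node root  = { "root",  NULL,   NULL };
        struct node group = { "group", &root,  NULL };
        struct node task  = { "task",  &group, NULL };

        dequeue_stack(&task);    /* prints root, group, task */
        return 0;
    }

This replaces the removed O(h^2) loop that re-scanned the hierarchy from the task for every level it dequeued.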
| 378 | 498 | ||
| 379 | /* | 499 | /* |
| @@ -393,6 +513,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | |||
| 393 | */ | 513 | */ |
| 394 | for_each_sched_rt_entity(rt_se) | 514 | for_each_sched_rt_entity(rt_se) |
| 395 | enqueue_rt_entity(rt_se); | 515 | enqueue_rt_entity(rt_se); |
| 516 | |||
| 517 | inc_cpu_load(rq, p->se.load.weight); | ||
| 396 | } | 518 | } |
| 397 | 519 | ||
| 398 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | 520 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) |
| @@ -412,6 +534,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | |||
| 412 | if (rt_rq && rt_rq->rt_nr_running) | 534 | if (rt_rq && rt_rq->rt_nr_running) |
| 413 | enqueue_rt_entity(rt_se); | 535 | enqueue_rt_entity(rt_se); |
| 414 | } | 536 | } |
| 537 | |||
| 538 | dec_cpu_load(rq, p->se.load.weight); | ||
| 415 | } | 539 | } |
| 416 | 540 | ||
| 417 | /* | 541 | /* |
| @@ -1001,7 +1125,8 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 1001 | return 0; | 1125 | return 0; |
| 1002 | } | 1126 | } |
| 1003 | 1127 | ||
| 1004 | static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) | 1128 | static void set_cpus_allowed_rt(struct task_struct *p, |
| 1129 | const cpumask_t *new_mask) | ||
| 1005 | { | 1130 | { |
| 1006 | int weight = cpus_weight(*new_mask); | 1131 | int weight = cpus_weight(*new_mask); |
| 1007 | 1132 | ||
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 5b32433e7ee5..5bae2e0c3ff2 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
| @@ -9,6 +9,11 @@ | |||
| 9 | static int show_schedstat(struct seq_file *seq, void *v) | 9 | static int show_schedstat(struct seq_file *seq, void *v) |
| 10 | { | 10 | { |
| 11 | int cpu; | 11 | int cpu; |
| 12 | int mask_len = NR_CPUS/32 * 9; | ||
| 13 | char *mask_str = kmalloc(mask_len, GFP_KERNEL); | ||
| 14 | |||
| 15 | if (mask_str == NULL) | ||
| 16 | return -ENOMEM; | ||
| 12 | 17 | ||
| 13 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | 18 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); |
| 14 | seq_printf(seq, "timestamp %lu\n", jiffies); | 19 | seq_printf(seq, "timestamp %lu\n", jiffies); |
| @@ -36,9 +41,8 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 36 | preempt_disable(); | 41 | preempt_disable(); |
| 37 | for_each_domain(cpu, sd) { | 42 | for_each_domain(cpu, sd) { |
| 38 | enum cpu_idle_type itype; | 43 | enum cpu_idle_type itype; |
| 39 | char mask_str[NR_CPUS]; | ||
| 40 | 44 | ||
| 41 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); | 45 | cpumask_scnprintf(mask_str, mask_len, sd->span); |
| 42 | seq_printf(seq, "domain%d %s", dcount++, mask_str); | 46 | seq_printf(seq, "domain%d %s", dcount++, mask_str); |
| 43 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | 47 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; |
| 44 | itype++) { | 48 | itype++) { |
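The new buffer size follows from the cpumask text format: cpumask_scnprintf() emits the mask as comma-separated 32-bit hex words, so each word takes at most 9 characters ("xxxxxxxx,"), and NR_CPUS/32 * 9 bytes also leaves room for the terminating NUL since the last word has no trailing comma. A quick stand-alone check with a sample NR_CPUS value:

    #include <stdio.h>

    #define NR_CPUS 128    /* sample value, not the kernel's config */

    int main(void)
    {
        int mask_len = NR_CPUS / 32 * 9;

        /* e.g. "ffffffff,ffffffff,ffffffff,ffffffff" is 35 chars + NUL */
        printf("mask buffer for %d cpus: %d bytes\n", NR_CPUS, mask_len);
        return 0;
    }

Moving the buffer to a single kmalloc() also takes the old NR_CPUS-byte array off the stack of show_schedstat(), which for large NR_CPUS was a sizeable on-stack allocation inside the per-domain loop.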
diff --git a/kernel/signal.c b/kernel/signal.c index cc8303cd093d..64ad0ed15992 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -220,7 +220,7 @@ void flush_signals(struct task_struct *t) | |||
| 220 | unsigned long flags; | 220 | unsigned long flags; |
| 221 | 221 | ||
| 222 | spin_lock_irqsave(&t->sighand->siglock, flags); | 222 | spin_lock_irqsave(&t->sighand->siglock, flags); |
| 223 | clear_tsk_thread_flag(t,TIF_SIGPENDING); | 223 | clear_tsk_thread_flag(t, TIF_SIGPENDING); |
| 224 | flush_sigqueue(&t->pending); | 224 | flush_sigqueue(&t->pending); |
| 225 | flush_sigqueue(&t->signal->shared_pending); | 225 | flush_sigqueue(&t->signal->shared_pending); |
| 226 | spin_unlock_irqrestore(&t->sighand->siglock, flags); | 226 | spin_unlock_irqrestore(&t->sighand->siglock, flags); |
| @@ -424,7 +424,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
| 424 | } | 424 | } |
| 425 | if (signr && | 425 | if (signr && |
| 426 | ((info->si_code & __SI_MASK) == __SI_TIMER) && | 426 | ((info->si_code & __SI_MASK) == __SI_TIMER) && |
| 427 | info->si_sys_private){ | 427 | info->si_sys_private) { |
| 428 | /* | 428 | /* |
| 429 | * Release the siglock to ensure proper locking order | 429 | * Release the siglock to ensure proper locking order |
| 430 | * of timer locks outside of siglocks. Note, we leave | 430 | * of timer locks outside of siglocks. Note, we leave |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 31e9f2a47928..3c44956ee7e2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -356,7 +356,8 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) | |||
| 356 | /* Tasklets */ | 356 | /* Tasklets */ |
| 357 | struct tasklet_head | 357 | struct tasklet_head |
| 358 | { | 358 | { |
| 359 | struct tasklet_struct *list; | 359 | struct tasklet_struct *head; |
| 360 | struct tasklet_struct **tail; | ||
| 360 | }; | 361 | }; |
| 361 | 362 | ||
| 362 | /* Some compilers disobey section attribute on statics when not | 363 | /* Some compilers disobey section attribute on statics when not |
| @@ -369,8 +370,9 @@ void __tasklet_schedule(struct tasklet_struct *t) | |||
| 369 | unsigned long flags; | 370 | unsigned long flags; |
| 370 | 371 | ||
| 371 | local_irq_save(flags); | 372 | local_irq_save(flags); |
| 372 | t->next = __get_cpu_var(tasklet_vec).list; | 373 | t->next = NULL; |
| 373 | __get_cpu_var(tasklet_vec).list = t; | 374 | *__get_cpu_var(tasklet_vec).tail = t; |
| 375 | __get_cpu_var(tasklet_vec).tail = &(t->next); | ||
| 374 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 376 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
| 375 | local_irq_restore(flags); | 377 | local_irq_restore(flags); |
| 376 | } | 378 | } |
| @@ -382,8 +384,9 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) | |||
| 382 | unsigned long flags; | 384 | unsigned long flags; |
| 383 | 385 | ||
| 384 | local_irq_save(flags); | 386 | local_irq_save(flags); |
| 385 | t->next = __get_cpu_var(tasklet_hi_vec).list; | 387 | t->next = NULL; |
| 386 | __get_cpu_var(tasklet_hi_vec).list = t; | 388 | *__get_cpu_var(tasklet_hi_vec).tail = t; |
| 389 | __get_cpu_var(tasklet_hi_vec).tail = &(t->next); | ||
| 387 | raise_softirq_irqoff(HI_SOFTIRQ); | 390 | raise_softirq_irqoff(HI_SOFTIRQ); |
| 388 | local_irq_restore(flags); | 391 | local_irq_restore(flags); |
| 389 | } | 392 | } |
| @@ -395,8 +398,9 @@ static void tasklet_action(struct softirq_action *a) | |||
| 395 | struct tasklet_struct *list; | 398 | struct tasklet_struct *list; |
| 396 | 399 | ||
| 397 | local_irq_disable(); | 400 | local_irq_disable(); |
| 398 | list = __get_cpu_var(tasklet_vec).list; | 401 | list = __get_cpu_var(tasklet_vec).head; |
| 399 | __get_cpu_var(tasklet_vec).list = NULL; | 402 | __get_cpu_var(tasklet_vec).head = NULL; |
| 403 | __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; | ||
| 400 | local_irq_enable(); | 404 | local_irq_enable(); |
| 401 | 405 | ||
| 402 | while (list) { | 406 | while (list) { |
| @@ -416,8 +420,9 @@ static void tasklet_action(struct softirq_action *a) | |||
| 416 | } | 420 | } |
| 417 | 421 | ||
| 418 | local_irq_disable(); | 422 | local_irq_disable(); |
| 419 | t->next = __get_cpu_var(tasklet_vec).list; | 423 | t->next = NULL; |
| 420 | __get_cpu_var(tasklet_vec).list = t; | 424 | *__get_cpu_var(tasklet_vec).tail = t; |
| 425 | __get_cpu_var(tasklet_vec).tail = &(t->next); | ||
| 421 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); | 426 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); |
| 422 | local_irq_enable(); | 427 | local_irq_enable(); |
| 423 | } | 428 | } |
| @@ -428,8 +433,9 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
| 428 | struct tasklet_struct *list; | 433 | struct tasklet_struct *list; |
| 429 | 434 | ||
| 430 | local_irq_disable(); | 435 | local_irq_disable(); |
| 431 | list = __get_cpu_var(tasklet_hi_vec).list; | 436 | list = __get_cpu_var(tasklet_hi_vec).head; |
| 432 | __get_cpu_var(tasklet_hi_vec).list = NULL; | 437 | __get_cpu_var(tasklet_hi_vec).head = NULL; |
| 438 | __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; | ||
| 433 | local_irq_enable(); | 439 | local_irq_enable(); |
| 434 | 440 | ||
| 435 | while (list) { | 441 | while (list) { |
| @@ -449,8 +455,9 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
| 449 | } | 455 | } |
| 450 | 456 | ||
| 451 | local_irq_disable(); | 457 | local_irq_disable(); |
| 452 | t->next = __get_cpu_var(tasklet_hi_vec).list; | 458 | t->next = NULL; |
| 453 | __get_cpu_var(tasklet_hi_vec).list = t; | 459 | *__get_cpu_var(tasklet_hi_vec).tail = t; |
| 460 | __get_cpu_var(tasklet_hi_vec).tail = &(t->next); | ||
| 454 | __raise_softirq_irqoff(HI_SOFTIRQ); | 461 | __raise_softirq_irqoff(HI_SOFTIRQ); |
| 455 | local_irq_enable(); | 462 | local_irq_enable(); |
| 456 | } | 463 | } |
| @@ -487,6 +494,15 @@ EXPORT_SYMBOL(tasklet_kill); | |||
| 487 | 494 | ||
| 488 | void __init softirq_init(void) | 495 | void __init softirq_init(void) |
| 489 | { | 496 | { |
| 497 | int cpu; | ||
| 498 | |||
| 499 | for_each_possible_cpu(cpu) { | ||
| 500 | per_cpu(tasklet_vec, cpu).tail = | ||
| 501 | &per_cpu(tasklet_vec, cpu).head; | ||
| 502 | per_cpu(tasklet_hi_vec, cpu).tail = | ||
| 503 | &per_cpu(tasklet_hi_vec, cpu).head; | ||
| 504 | } | ||
| 505 | |||
| 490 | open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); | 506 | open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); |
| 491 | open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); | 507 | open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); |
| 492 | } | 508 | } |
| @@ -555,9 +571,12 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) | |||
| 555 | return; | 571 | return; |
| 556 | 572 | ||
| 557 | /* CPU is dead, so no lock needed. */ | 573 | /* CPU is dead, so no lock needed. */ |
| 558 | for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) { | 574 | for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { |
| 559 | if (*i == t) { | 575 | if (*i == t) { |
| 560 | *i = t->next; | 576 | *i = t->next; |
| 577 | /* If this was the tail element, move the tail ptr */ | ||
| 578 | if (*i == NULL) | ||
| 579 | per_cpu(tasklet_vec, cpu).tail = i; | ||
| 561 | return; | 580 | return; |
| 562 | } | 581 | } |
| 563 | } | 582 | } |
| @@ -566,20 +585,20 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) | |||
| 566 | 585 | ||
| 567 | static void takeover_tasklets(unsigned int cpu) | 586 | static void takeover_tasklets(unsigned int cpu) |
| 568 | { | 587 | { |
| 569 | struct tasklet_struct **i; | ||
| 570 | |||
| 571 | /* CPU is dead, so no lock needed. */ | 588 | /* CPU is dead, so no lock needed. */ |
| 572 | local_irq_disable(); | 589 | local_irq_disable(); |
| 573 | 590 | ||
| 574 | /* Find end, append list for that CPU. */ | 591 | /* Find end, append list for that CPU. */ |
| 575 | for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next); | 592 | *__get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).head; |
| 576 | *i = per_cpu(tasklet_vec, cpu).list; | 593 | __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; |
| 577 | per_cpu(tasklet_vec, cpu).list = NULL; | 594 | per_cpu(tasklet_vec, cpu).head = NULL; |
| 595 | per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; | ||
| 578 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 596 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
| 579 | 597 | ||
| 580 | for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next); | 598 | *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; |
| 581 | *i = per_cpu(tasklet_hi_vec, cpu).list; | 599 | __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; |
| 582 | per_cpu(tasklet_hi_vec, cpu).list = NULL; | 600 | per_cpu(tasklet_hi_vec, cpu).head = NULL; |
| 601 | per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; | ||
| 583 | raise_softirq_irqoff(HI_SOFTIRQ); | 602 | raise_softirq_irqoff(HI_SOFTIRQ); |
| 584 | 603 | ||
| 585 | local_irq_enable(); | 604 | local_irq_enable(); |
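All of the tasklet hunks above follow from one data-structure change: the per-CPU queue now keeps a head pointer plus a tail pointer that always points at the last element's next field (or at head while the queue is empty). Appending becomes O(1) and preserves FIFO order, where the old single list pointer meant either LIFO insertion or, as in takeover_tasklets(), an O(n) walk to find the end. A minimal stand-alone sketch of the same layout with illustrative names:

    #include <stdio.h>

    struct item {
        struct item *next;
        int id;
    };

    struct queue {
        struct item *head;
        struct item **tail;    /* points at head, or at the last item's next */
    };

    static void queue_init(struct queue *q)
    {
        q->head = NULL;
        q->tail = &q->head;
    }

    static void queue_append(struct queue *q, struct item *it)
    {
        it->next = NULL;
        *q->tail = it;          /* link after the current last element */
        q->tail = &it->next;    /* remember the new end of the queue */
    }

    int main(void)
    {
        struct queue q;
        struct item a = { NULL, 1 }, b = { NULL, 2 }, c = { NULL, 3 };
        struct item *it;

        queue_init(&q);
        queue_append(&q, &a);
        queue_append(&q, &b);
        queue_append(&q, &c);

        for (it = q.head; it; it = it->next)
            printf("%d ", it->id);    /* 1 2 3: FIFO order preserved */
        printf("\n");
        return 0;
    }

The same invariant explains the fix-up added to tasklet_kill_immediate(): after unlinking the last element, the tail pointer has to be moved back to the preceding next field.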
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 6f4e0e13f70c..0101aeef7ed7 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -11,7 +11,6 @@ | |||
| 11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
| 12 | 12 | ||
| 13 | #include <asm/atomic.h> | 13 | #include <asm/atomic.h> |
| 14 | #include <asm/semaphore.h> | ||
| 15 | #include <asm/uaccess.h> | 14 | #include <asm/uaccess.h> |
| 16 | 15 | ||
| 17 | /* Since we effect priority and affinity (both of which are visible | 16 | /* Since we effect priority and affinity (both of which are visible |
| @@ -35,7 +34,7 @@ static int stopmachine(void *cpu) | |||
| 35 | int irqs_disabled = 0; | 34 | int irqs_disabled = 0; |
| 36 | int prepared = 0; | 35 | int prepared = 0; |
| 37 | 36 | ||
| 38 | set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); | 37 | set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu)); |
| 39 | 38 | ||
| 40 | /* Ack: we are alive */ | 39 | /* Ack: we are alive */ |
| 41 | smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ | 40 | smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ |
| @@ -135,8 +134,7 @@ static void restart_machine(void) | |||
| 135 | preempt_enable_no_resched(); | 134 | preempt_enable_no_resched(); |
| 136 | } | 135 | } |
| 137 | 136 | ||
| 138 | struct stop_machine_data | 137 | struct stop_machine_data { |
| 139 | { | ||
| 140 | int (*fn)(void *); | 138 | int (*fn)(void *); |
| 141 | void *data; | 139 | void *data; |
| 142 | struct completion done; | 140 | struct completion done; |
diff --git a/kernel/sys.c b/kernel/sys.c index a626116af5db..6a0cc71ee88d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -67,6 +67,12 @@ | |||
| 67 | #ifndef SET_ENDIAN | 67 | #ifndef SET_ENDIAN |
| 68 | # define SET_ENDIAN(a,b) (-EINVAL) | 68 | # define SET_ENDIAN(a,b) (-EINVAL) |
| 69 | #endif | 69 | #endif |
| 70 | #ifndef GET_TSC_CTL | ||
| 71 | # define GET_TSC_CTL(a) (-EINVAL) | ||
| 72 | #endif | ||
| 73 | #ifndef SET_TSC_CTL | ||
| 74 | # define SET_TSC_CTL(a) (-EINVAL) | ||
| 75 | #endif | ||
| 70 | 76 | ||
| 71 | /* | 77 | /* |
| 72 | * this is where the system-wide overflow UID and GID are defined, for | 78 | * this is where the system-wide overflow UID and GID are defined, for |
| @@ -1737,7 +1743,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
| 1737 | #else | 1743 | #else |
| 1738 | return -EINVAL; | 1744 | return -EINVAL; |
| 1739 | #endif | 1745 | #endif |
| 1740 | 1746 | case PR_GET_TSC: | |
| 1747 | error = GET_TSC_CTL(arg2); | ||
| 1748 | break; | ||
| 1749 | case PR_SET_TSC: | ||
| 1750 | error = SET_TSC_CTL(arg2); | ||
| 1751 | break; | ||
| 1741 | default: | 1752 | default: |
| 1742 | error = -EINVAL; | 1753 | error = -EINVAL; |
| 1743 | break; | 1754 | break; |
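The two new prctl cases forward to the per-architecture GET_TSC_CTL/SET_TSC_CTL hooks and fall back to -EINVAL where an architecture does not define them. A user-space sketch of how the pair is meant to be driven; the PR_TSC_* constants and the fallback option numbers below come from the matching <linux/prctl.h> additions, which are not part of this hunk, so treat them as assumptions:

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_GET_TSC
    #define PR_GET_TSC     25
    #define PR_SET_TSC     26
    #define PR_TSC_ENABLE  1
    #define PR_TSC_SIGSEGV 2
    #endif

    int main(void)
    {
        int mode = 0;

        if (prctl(PR_GET_TSC, &mode, 0, 0, 0) == 0)
            printf("current TSC mode: %d\n", mode);
        else
            perror("PR_GET_TSC");    /* e.g. EINVAL if the arch lacks the hook */

        /* make RDTSC fault for this task, then restore normal behaviour */
        if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0) == 0)
            prctl(PR_SET_TSC, PR_TSC_ENABLE, 0, 0, 0);

        return 0;
    }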
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b2a2d6889bab..fd3364827ccf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -270,17 +270,6 @@ static struct ctl_table kern_table[] = { | |||
| 270 | }, | 270 | }, |
| 271 | { | 271 | { |
| 272 | .ctl_name = CTL_UNNUMBERED, | 272 | .ctl_name = CTL_UNNUMBERED, |
| 273 | .procname = "sched_batch_wakeup_granularity_ns", | ||
| 274 | .data = &sysctl_sched_batch_wakeup_granularity, | ||
| 275 | .maxlen = sizeof(unsigned int), | ||
| 276 | .mode = 0644, | ||
| 277 | .proc_handler = &proc_dointvec_minmax, | ||
| 278 | .strategy = &sysctl_intvec, | ||
| 279 | .extra1 = &min_wakeup_granularity_ns, | ||
| 280 | .extra2 = &max_wakeup_granularity_ns, | ||
| 281 | }, | ||
| 282 | { | ||
| 283 | .ctl_name = CTL_UNNUMBERED, | ||
| 284 | .procname = "sched_child_runs_first", | 273 | .procname = "sched_child_runs_first", |
| 285 | .data = &sysctl_sched_child_runs_first, | 274 | .data = &sysctl_sched_child_runs_first, |
| 286 | .maxlen = sizeof(unsigned int), | 275 | .maxlen = sizeof(unsigned int), |
| @@ -318,7 +307,7 @@ static struct ctl_table kern_table[] = { | |||
| 318 | .data = &sysctl_sched_rt_period, | 307 | .data = &sysctl_sched_rt_period, |
| 319 | .maxlen = sizeof(unsigned int), | 308 | .maxlen = sizeof(unsigned int), |
| 320 | .mode = 0644, | 309 | .mode = 0644, |
| 321 | .proc_handler = &proc_dointvec, | 310 | .proc_handler = &sched_rt_handler, |
| 322 | }, | 311 | }, |
| 323 | { | 312 | { |
| 324 | .ctl_name = CTL_UNNUMBERED, | 313 | .ctl_name = CTL_UNNUMBERED, |
| @@ -326,7 +315,7 @@ static struct ctl_table kern_table[] = { | |||
| 326 | .data = &sysctl_sched_rt_runtime, | 315 | .data = &sysctl_sched_rt_runtime, |
| 327 | .maxlen = sizeof(int), | 316 | .maxlen = sizeof(int), |
| 328 | .mode = 0644, | 317 | .mode = 0644, |
| 329 | .proc_handler = &proc_dointvec, | 318 | .proc_handler = &sched_rt_handler, |
| 330 | }, | 319 | }, |
| 331 | { | 320 | { |
| 332 | .ctl_name = CTL_UNNUMBERED, | 321 | .ctl_name = CTL_UNNUMBERED, |
diff --git a/kernel/time.c b/kernel/time.c index a5ec013b6c80..35d373a98782 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
| @@ -379,6 +379,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) | |||
| 379 | ts->tv_sec = sec; | 379 | ts->tv_sec = sec; |
| 380 | ts->tv_nsec = nsec; | 380 | ts->tv_nsec = nsec; |
| 381 | } | 381 | } |
| 382 | EXPORT_SYMBOL(set_normalized_timespec); | ||
| 382 | 383 | ||
| 383 | /** | 384 | /** |
| 384 | * ns_to_timespec - Convert nanoseconds to timespec | 385 | * ns_to_timespec - Convert nanoseconds to timespec |
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index fdfa0c745bb6..57a1f02e5ec0 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
| @@ -262,7 +262,7 @@ out: | |||
| 262 | void tick_broadcast_on_off(unsigned long reason, int *oncpu) | 262 | void tick_broadcast_on_off(unsigned long reason, int *oncpu) |
| 263 | { | 263 | { |
| 264 | if (!cpu_isset(*oncpu, cpu_online_map)) | 264 | if (!cpu_isset(*oncpu, cpu_online_map)) |
| 265 | printk(KERN_ERR "tick-braodcast: ignoring broadcast for " | 265 | printk(KERN_ERR "tick-broadcast: ignoring broadcast for " |
| 266 | "offline CPU #%d\n", *oncpu); | 266 | "offline CPU #%d\n", *oncpu); |
| 267 | else | 267 | else |
| 268 | smp_call_function_single(*oncpu, tick_do_broadcast_on_off, | 268 | smp_call_function_single(*oncpu, tick_do_broadcast_on_off, |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 69dba0c71727..d358d4e3a958 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -191,7 +191,6 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) | |||
| 191 | void tick_nohz_stop_sched_tick(void) | 191 | void tick_nohz_stop_sched_tick(void) |
| 192 | { | 192 | { |
| 193 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; | 193 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; |
| 194 | unsigned long rt_jiffies; | ||
| 195 | struct tick_sched *ts; | 194 | struct tick_sched *ts; |
| 196 | ktime_t last_update, expires, now; | 195 | ktime_t last_update, expires, now; |
| 197 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 196 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
| @@ -243,10 +242,6 @@ void tick_nohz_stop_sched_tick(void) | |||
| 243 | next_jiffies = get_next_timer_interrupt(last_jiffies); | 242 | next_jiffies = get_next_timer_interrupt(last_jiffies); |
| 244 | delta_jiffies = next_jiffies - last_jiffies; | 243 | delta_jiffies = next_jiffies - last_jiffies; |
| 245 | 244 | ||
| 246 | rt_jiffies = rt_needs_cpu(cpu); | ||
| 247 | if (rt_jiffies && rt_jiffies < delta_jiffies) | ||
| 248 | delta_jiffies = rt_jiffies; | ||
| 249 | |||
| 250 | if (rcu_needs_cpu(cpu)) | 245 | if (rcu_needs_cpu(cpu)) |
| 251 | delta_jiffies = 1; | 246 | delta_jiffies = 1; |
| 252 | /* | 247 | /* |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a3fa587c350c..2d6087c7cf98 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -178,6 +178,7 @@ static void change_clocksource(void) | |||
| 178 | if (clock == new) | 178 | if (clock == new) |
| 179 | return; | 179 | return; |
| 180 | 180 | ||
| 181 | new->cycle_last = 0; | ||
| 181 | now = clocksource_read(new); | 182 | now = clocksource_read(new); |
| 182 | nsec = __get_nsec_offset(); | 183 | nsec = __get_nsec_offset(); |
| 183 | timespec_add_ns(&xtime, nsec); | 184 | timespec_add_ns(&xtime, nsec); |
| @@ -295,6 +296,7 @@ static int timekeeping_resume(struct sys_device *dev) | |||
| 295 | timespec_add_ns(&xtime, timekeeping_suspend_nsecs); | 296 | timespec_add_ns(&xtime, timekeeping_suspend_nsecs); |
| 296 | update_xtime_cache(0); | 297 | update_xtime_cache(0); |
| 297 | /* re-base the last cycle value */ | 298 | /* re-base the last cycle value */ |
| 299 | clock->cycle_last = 0; | ||
| 298 | clock->cycle_last = clocksource_read(clock); | 300 | clock->cycle_last = clocksource_read(clock); |
| 299 | clock->error = 0; | 301 | clock->error = 0; |
| 300 | timekeeping_suspended = 0; | 302 | timekeeping_suspended = 0; |
diff --git a/kernel/user.c b/kernel/user.c index 7132022a040c..debce602bfdd 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
| @@ -101,7 +101,7 @@ static int sched_create_user(struct user_struct *up) | |||
| 101 | { | 101 | { |
| 102 | int rc = 0; | 102 | int rc = 0; |
| 103 | 103 | ||
| 104 | up->tg = sched_create_group(); | 104 | up->tg = sched_create_group(&root_task_group); |
| 105 | if (IS_ERR(up->tg)) | 105 | if (IS_ERR(up->tg)) |
| 106 | rc = -ENOMEM; | 106 | rc = -ENOMEM; |
| 107 | 107 | ||
| @@ -193,6 +193,33 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, | |||
| 193 | 193 | ||
| 194 | static struct kobj_attribute cpu_rt_runtime_attr = | 194 | static struct kobj_attribute cpu_rt_runtime_attr = |
| 195 | __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); | 195 | __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); |
| 196 | |||
| 197 | static ssize_t cpu_rt_period_show(struct kobject *kobj, | ||
| 198 | struct kobj_attribute *attr, | ||
| 199 | char *buf) | ||
| 200 | { | ||
| 201 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
| 202 | |||
| 203 | return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg)); | ||
| 204 | } | ||
| 205 | |||
| 206 | static ssize_t cpu_rt_period_store(struct kobject *kobj, | ||
| 207 | struct kobj_attribute *attr, | ||
| 208 | const char *buf, size_t size) | ||
| 209 | { | ||
| 210 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
| 211 | unsigned long rt_period; | ||
| 212 | int rc; | ||
| 213 | |||
| 214 | sscanf(buf, "%lu", &rt_period); | ||
| 215 | |||
| 216 | rc = sched_group_set_rt_period(up->tg, rt_period); | ||
| 217 | |||
| 218 | return (rc ? rc : size); | ||
| 219 | } | ||
| 220 | |||
| 221 | static struct kobj_attribute cpu_rt_period_attr = | ||
| 222 | __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store); | ||
| 196 | #endif | 223 | #endif |
| 197 | 224 | ||
| 198 | /* default attributes per uid directory */ | 225 | /* default attributes per uid directory */ |
| @@ -202,6 +229,7 @@ static struct attribute *uids_attributes[] = { | |||
| 202 | #endif | 229 | #endif |
| 203 | #ifdef CONFIG_RT_GROUP_SCHED | 230 | #ifdef CONFIG_RT_GROUP_SCHED |
| 204 | &cpu_rt_runtime_attr.attr, | 231 | &cpu_rt_runtime_attr.attr, |
| 232 | &cpu_rt_period_attr.attr, | ||
| 205 | #endif | 233 | #endif |
| 206 | NULL | 234 | NULL |
| 207 | }; | 235 | }; |
