-rw-r--r--Documentation/DocBook/debugobjects.tmpl50
-rw-r--r--Documentation/RCU/checklist.txt6
-rw-r--r--Documentation/RCU/rcu.txt10
-rw-r--r--Documentation/RCU/stallwarn.txt16
-rw-r--r--Documentation/RCU/torture.txt13
-rw-r--r--Documentation/RCU/trace.txt4
-rw-r--r--Documentation/RCU/whatisRCU.txt19
-rw-r--r--Documentation/atomic_ops.txt87
-rw-r--r--Documentation/kernel-parameters.txt12
-rw-r--r--Documentation/lockdep-design.txt63
-rw-r--r--Documentation/trace/events.txt2
-rw-r--r--Documentation/virtual/kvm/api.txt16
-rw-r--r--MAINTAINERS13
-rw-r--r--Makefile2
-rw-r--r--arch/Kconfig4
-rw-r--r--arch/arm/Kconfig4
-rw-r--r--arch/arm/common/pl330.c116
-rw-r--r--arch/arm/configs/imx_v4_v5_defconfig12
-rw-r--r--arch/arm/kernel/process.c6
-rw-r--r--arch/arm/kernel/setup.c1
-rw-r--r--arch/arm/mach-exynos/cpu.c5
-rw-r--r--arch/arm/mach-imx/Kconfig8
-rw-r--r--arch/arm/mach-imx/Makefile4
-rw-r--r--arch/arm/mach-imx/clock-imx35.c20
-rw-r--r--arch/arm/mach-imx/mach-cpuimx35.c8
-rw-r--r--arch/arm/mach-omap2/omap_hwmod_3xxx_data.c4
-rw-r--r--arch/arm/mm/init.c4
-rw-r--r--arch/arm/mm/proc-v7.S6
-rw-r--r--arch/arm/oprofile/common.c2
-rw-r--r--arch/arm/plat-mxc/cpufreq.c2
-rw-r--r--arch/arm/plat-mxc/include/mach/uncompress.h1
-rw-r--r--arch/arm/plat-mxc/pwm.c9
-rw-r--r--arch/arm/plat-orion/gpio.c6
-rw-r--r--arch/arm/plat-samsung/include/plat/cpu-freq-core.h25
-rw-r--r--arch/avr32/kernel/process.c6
-rw-r--r--arch/blackfin/kernel/process.c6
-rw-r--r--arch/cris/arch-v32/kernel/time.c4
-rw-r--r--arch/ia64/Kconfig6
-rw-r--r--arch/ia64/include/asm/cputime.h71
-rw-r--r--arch/ia64/mm/contig.c3
-rw-r--r--arch/ia64/mm/init.c4
-rw-r--r--arch/m68k/platform/68328/timers.c4
-rw-r--r--arch/m68k/platform/coldfire/dma_timer.c5
-rw-r--r--arch/m68k/platform/coldfire/pit.c4
-rw-r--r--arch/m68k/platform/coldfire/sltimers.c4
-rw-r--r--arch/m68k/platform/coldfire/timers.c4
-rw-r--r--arch/microblaze/include/asm/memblock.h14
-rw-r--r--arch/microblaze/kernel/process.c6
-rw-r--r--arch/microblaze/kernel/prom.c3
-rw-r--r--arch/mips/Kconfig6
-rw-r--r--arch/mips/kernel/process.c6
-rw-r--r--arch/mips/kernel/setup.c3
-rw-r--r--arch/mips/sgi-ip27/ip27-memory.c5
-rw-r--r--arch/openrisc/include/asm/memblock.h24
-rw-r--r--arch/openrisc/kernel/idle.c6
-rw-r--r--arch/openrisc/kernel/prom.c3
-rw-r--r--arch/parisc/kernel/time.c6
-rw-r--r--arch/powerpc/Kconfig4
-rw-r--r--arch/powerpc/include/asm/cputime.h72
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h33
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h33
-rw-r--r--arch/powerpc/include/asm/memblock.h8
-rw-r--r--arch/powerpc/kernel/idle.c15
-rw-r--r--arch/powerpc/kernel/machine_kexec.c3
-rw-r--r--arch/powerpc/kernel/prom.c20
-rw-r--r--arch/powerpc/kvm/book3s_hv.c2
-rw-r--r--arch/powerpc/kvm/book3s_pr.c2
-rw-r--r--arch/powerpc/kvm/e500.c1
-rw-r--r--arch/powerpc/mm/init_32.c4
-rw-r--r--arch/powerpc/mm/mem.c2
-rw-r--r--arch/powerpc/mm/numa.c60
-rw-r--r--arch/powerpc/mm/tlb_nohash.c1
-rw-r--r--arch/powerpc/platforms/embedded6xx/wii.c23
-rw-r--r--arch/powerpc/platforms/iseries/setup.c12
-rw-r--r--arch/powerpc/platforms/ps3/mm.c1
-rw-r--r--arch/powerpc/platforms/pseries/lpar.c4
-rw-r--r--arch/s390/Kconfig6
-rw-r--r--arch/s390/appldata/appldata_os.c16
-rw-r--r--arch/s390/include/asm/cputime.h142
-rw-r--r--arch/s390/kernel/process.c6
-rw-r--r--arch/s390/kernel/setup.c4
-rw-r--r--arch/s390/oprofile/hwsampler.c7
-rw-r--r--arch/s390/oprofile/init.c373
-rw-r--r--arch/s390/oprofile/op_counter.h23
-rw-r--r--arch/score/Kconfig6
-rw-r--r--arch/score/kernel/setup.c4
-rw-r--r--arch/sh/Kconfig1
-rw-r--r--arch/sh/include/asm/memblock.h4
-rw-r--r--arch/sh/kernel/idle.c6
-rw-r--r--arch/sh/kernel/machine_kexec.c3
-rw-r--r--arch/sh/kernel/setup.c3
-rw-r--r--arch/sh/mm/Kconfig3
-rw-r--r--arch/sh/mm/init.c3
-rw-r--r--arch/sh/oprofile/common.c4
-rw-r--r--arch/sparc/Kconfig4
-rw-r--r--arch/sparc/include/asm/memblock.h8
-rw-r--r--arch/sparc/kernel/process_64.c6
-rw-r--r--arch/sparc/kernel/setup_32.c2
-rw-r--r--arch/sparc/mm/init_64.c32
-rw-r--r--arch/tile/kernel/process.c6
-rw-r--r--arch/tile/mm/fault.c4
-rw-r--r--arch/um/kernel/process.c6
-rw-r--r--arch/um/kernel/time.c6
-rw-r--r--arch/unicore32/kernel/process.c6
-rw-r--r--arch/unicore32/kernel/setup.c1
-rw-r--r--arch/unicore32/mm/init.c4
-rw-r--r--arch/unicore32/mm/mmu.c1
-rw-r--r--arch/x86/Kconfig18
-rw-r--r--arch/x86/ia32/ia32entry.S43
-rw-r--r--arch/x86/include/asm/alternative-asm.h4
-rw-r--r--arch/x86/include/asm/apic.h6
-rw-r--r--arch/x86/include/asm/apic_flat_64.h7
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/bitops.h76
-rw-r--r--arch/x86/include/asm/cmpxchg.h163
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h46
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h43
-rw-r--r--arch/x86/include/asm/div64.h22
-rw-r--r--arch/x86/include/asm/e820.h2
-rw-r--r--arch/x86/include/asm/hardirq.h1
-rw-r--r--arch/x86/include/asm/i387.h2
-rw-r--r--arch/x86/include/asm/insn.h7
-rw-r--r--arch/x86/include/asm/mach_timer.h2
-rw-r--r--arch/x86/include/asm/mc146818rtc.h4
-rw-r--r--arch/x86/include/asm/memblock.h23
-rw-r--r--arch/x86/include/asm/numachip/numachip_csr.h167
-rw-r--r--arch/x86/include/asm/percpu.h53
-rw-r--r--arch/x86/include/asm/perf_event.h44
-rw-r--r--arch/x86/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/processor-flags.h1
-rw-r--r--arch/x86/include/asm/processor.h2
-rw-r--r--arch/x86/include/asm/spinlock.h15
-rw-r--r--arch/x86/include/asm/thread_info.h9
-rw-r--r--arch/x86/include/asm/topology.h2
-rw-r--r--arch/x86/include/asm/tsc.h2
-rw-r--r--arch/x86/include/asm/uaccess.h2
-rw-r--r--arch/x86/include/asm/x86_init.h3
-rw-r--r--arch/x86/kernel/acpi/boot.c10
-rw-r--r--arch/x86/kernel/amd_nb.c8
-rw-r--r--arch/x86/kernel/aperture_64.c4
-rw-r--r--arch/x86/kernel/apic/Makefile1
-rw-r--r--arch/x86/kernel/apic/apic.c113
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c9
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c294
-rw-r--r--arch/x86/kernel/apic/io_apic.c6
-rw-r--r--arch/x86/kernel/check.c34
-rw-r--r--arch/x86/kernel/cpu/amd.c9
-rw-r--r--arch/x86/kernel/cpu/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/common.c14
-rw-r--r--arch/x86/kernel/cpu/cpu.h5
-rw-r--r--arch/x86/kernel/cpu/intel.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c7
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event.c262
-rw-r--r--arch/x86/kernel/cpu/perf_event.h51
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c90
-rw-r--r--arch/x86/kernel/cpu/powerflags.c3
-rw-r--r--arch/x86/kernel/cpu/proc.c4
-rw-r--r--arch/x86/kernel/e820.c58
-rw-r--r--arch/x86/kernel/entry_32.S4
-rw-r--r--arch/x86/kernel/entry_64.S31
-rw-r--r--arch/x86/kernel/head.c2
-rw-r--r--arch/x86/kernel/head32.c7
-rw-r--r--arch/x86/kernel/head64.c7
-rw-r--r--arch/x86/kernel/hpet.c7
-rw-r--r--arch/x86/kernel/irq.c11
-rw-r--r--arch/x86/kernel/jump_label.c2
-rw-r--r--arch/x86/kernel/mpparse.c12
-rw-r--r--arch/x86/kernel/process.c2
-rw-r--r--arch/x86/kernel/process_32.c6
-rw-r--r--arch/x86/kernel/process_64.c15
-rw-r--r--arch/x86/kernel/ptrace.c3
-rw-r--r--arch/x86/kernel/setup.c21
-rw-r--r--arch/x86/kernel/smpboot.c3
-rw-r--r--arch/x86/kernel/trampoline.c4
-rw-r--r--arch/x86/kernel/traps.c7
-rw-r--r--arch/x86/kernel/tsc.c6
-rw-r--r--arch/x86/kernel/tsc_sync.c4
-rw-r--r--arch/x86/kernel/vsyscall_64.c77
-rw-r--r--arch/x86/kernel/x86_init.c1
-rw-r--r--arch/x86/kvm/i8254.c10
-rw-r--r--arch/x86/kvm/x86.c19
-rw-r--r--arch/x86/lib/inat.c9
-rw-r--r--arch/x86/lib/insn.c4
-rw-r--r--arch/x86/lib/string_32.c8
-rw-r--r--arch/x86/lib/x86-opcode-map.txt606
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/extable.c2
-rw-r--r--arch/x86/mm/fault.c22
-rw-r--r--arch/x86/mm/init.c8
-rw-r--r--arch/x86/mm/init_32.c36
-rw-r--r--arch/x86/mm/init_64.c2
-rw-r--r--arch/x86/mm/memblock.c348
-rw-r--r--arch/x86/mm/memtest.c33
-rw-r--r--arch/x86/mm/numa.c37
-rw-r--r--arch/x86/mm/numa_32.c10
-rw-r--r--arch/x86/mm/numa_64.c2
-rw-r--r--arch/x86/mm/numa_emulation.c36
-rw-r--r--arch/x86/mm/pageattr.c2
-rw-r--r--arch/x86/mm/srat.c7
-rw-r--r--arch/x86/oprofile/Makefile3
-rw-r--r--arch/x86/oprofile/init.c30
-rw-r--r--arch/x86/oprofile/nmi_int.c27
-rw-r--r--arch/x86/oprofile/nmi_timer_int.c50
-rw-r--r--arch/x86/platform/efi/efi.c12
-rw-r--r--arch/x86/tools/Makefile11
-rw-r--r--arch/x86/tools/gen-insn-attr-x86.awk21
-rw-r--r--arch/x86/tools/insn_sanity.c275
-rw-r--r--arch/x86/xen/enlighten.c2
-rw-r--r--arch/x86/xen/mmu.c12
-rw-r--r--arch/x86/xen/setup.c7
-rw-r--r--arch/xtensa/kernel/time.c13
-rw-r--r--block/blk-map.c2
-rw-r--r--block/blk-tag.c13
-rw-r--r--block/cfq-iosched.c12
-rw-r--r--block/ioctl.c26
-rw-r--r--drivers/base/cpu.c7
-rw-r--r--drivers/clocksource/acpi_pm.c2
-rw-r--r--drivers/clocksource/i8253.c6
-rw-r--r--drivers/clocksource/tcb_clksrc.c4
-rw-r--r--drivers/cpufreq/cpufreq_conservative.c50
-rw-r--r--drivers/cpufreq/cpufreq_ondemand.c54
-rw-r--r--drivers/cpufreq/cpufreq_stats.c5
-rw-r--r--drivers/dma/Kconfig4
-rw-r--r--drivers/edac/sb_edac.c2
-rw-r--r--drivers/gpu/drm/i915/i915_gem_execbuffer.c4
-rw-r--r--drivers/gpu/drm/i915/intel_display.c8
-rw-r--r--drivers/gpu/drm/radeon/evergreen.c12
-rw-r--r--drivers/gpu/drm/radeon/radeon_atombios.c6
-rw-r--r--drivers/gpu/drm/vmwgfx/vmwgfx_kms.c6
-rw-r--r--drivers/hwmon/coretemp.c7
-rw-r--r--drivers/input/mouse/sentelic.c8
-rw-r--r--drivers/input/mouse/sentelic.h3
-rw-r--r--drivers/iommu/intel-iommu.c25
-rw-r--r--drivers/iommu/iommu.c2
-rw-r--r--drivers/lguest/x86/core.c2
-rw-r--r--drivers/macintosh/rack-meter.c14
-rw-r--r--drivers/media/video/gspca/gspca.c6
-rw-r--r--drivers/mmc/host/mmci.c14
-rw-r--r--drivers/net/ethernet/freescale/Kconfig4
-rw-r--r--drivers/net/ethernet/marvell/skge.c3
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_cq.c1
-rw-r--r--drivers/net/wireless/ath/ath9k/main.c3
-rw-r--r--drivers/net/wireless/b43/pio.c16
-rw-r--r--drivers/net/wireless/mwifiex/sta_ioctl.c7
-rw-r--r--drivers/of/platform.c2
-rw-r--r--drivers/oprofile/nmi_timer_int.c173
-rw-r--r--drivers/oprofile/oprof.c30
-rw-r--r--drivers/oprofile/oprof.h10
-rw-r--r--drivers/oprofile/timer_int.c30
-rw-r--r--drivers/pci/Kconfig4
-rw-r--r--drivers/pci/ioapic.c15
-rw-r--r--drivers/rtc/interface.c50
-rw-r--r--drivers/watchdog/coh901327_wdt.c6
-rw-r--r--drivers/watchdog/hpwdt.c5
-rw-r--r--drivers/watchdog/iTCO_wdt.c6
-rw-r--r--drivers/watchdog/sp805_wdt.c2
-rw-r--r--fs/ceph/dir.c29
-rw-r--r--fs/cifs/connect.c4
-rw-r--r--fs/compat_ioctl.c38
-rw-r--r--fs/ioctl.c2
-rw-r--r--fs/locks.c11
-rw-r--r--fs/minix/inode.c34
-rw-r--r--fs/proc/array.c8
-rw-r--r--fs/proc/stat.c67
-rw-r--r--fs/proc/uptime.c11
-rw-r--r--fs/xfs/xfs_super.c30
-rw-r--r--fs/xfs/xfs_sync.c36
-rw-r--r--fs/xfs/xfs_sync.h2
-rw-r--r--include/asm-generic/cputime.h64
-rw-r--r--include/linux/bitops.h10
-rw-r--r--include/linux/bootmem.h2
-rw-r--r--include/linux/cpu.h1
-rw-r--r--include/linux/debugobjects.h6
-rw-r--r--include/linux/hardirq.h21
-rw-r--r--include/linux/jump_label.h27
-rw-r--r--include/linux/kernel_stat.h36
-rw-r--r--include/linux/kvm.h1
-rw-r--r--include/linux/latencytop.h3
-rw-r--r--include/linux/lockdep.h4
-rw-r--r--include/linux/memblock.h170
-rw-r--r--include/linux/mm.h34
-rw-r--r--include/linux/mmzone.h8
-rw-r--r--include/linux/perf_event.h8
-rw-r--r--include/linux/poison.h6
-rw-r--r--include/linux/rcupdate.h115
-rw-r--r--include/linux/sched.h31
-rw-r--r--include/linux/security.h2
-rw-r--r--include/linux/srcu.h87
-rw-r--r--include/linux/tick.h11
-rw-r--r--include/linux/wait.h4
-rw-r--r--include/net/ip_vs.h2
-rw-r--r--include/trace/events/rcu.h122
-rw-r--r--include/trace/events/sched.h57
-rw-r--r--init/Kconfig10
-rw-r--r--init/main.c3
-rw-r--r--kernel/Makefile20
-rw-r--r--kernel/acct.c4
-rw-r--r--kernel/cpu.c4
-rw-r--r--kernel/debug/kdb/kdb_support.c2
-rw-r--r--kernel/events/Makefile2
-rw-r--r--kernel/events/callchain.c191
-rw-r--r--kernel/events/core.c298
-rw-r--r--kernel/events/internal.h39
-rw-r--r--kernel/exit.c31
-rw-r--r--kernel/fork.c14
-rw-r--r--kernel/futex.c28
-rw-r--r--kernel/hung_task.c14
-rw-r--r--kernel/itimer.c15
-rw-r--r--kernel/jump_label.c49
-rw-r--r--kernel/lockdep.c83
-rw-r--r--kernel/panic.c17
-rw-r--r--kernel/posix-cpu-timers.c132
-rw-r--r--kernel/printk.c11
-rw-r--r--kernel/ptrace.c13
-rw-r--r--kernel/rcu.h7
-rw-r--r--kernel/rcupdate.c12
-rw-r--r--kernel/rcutiny.c149
-rw-r--r--kernel/rcutiny_plugin.h29
-rw-r--r--kernel/rcutorture.c225
-rw-r--r--kernel/rcutree.c290
-rw-r--r--kernel/rcutree.h26
-rw-r--r--kernel/rcutree_plugin.h289
-rw-r--r--kernel/rcutree_trace.c12
-rw-r--r--kernel/rtmutex-debug.c1
-rw-r--r--kernel/rtmutex.c8
-rw-r--r--kernel/sched/Makefile20
-rw-r--r--kernel/sched/auto_group.c (renamed from kernel/sched_autogroup.c)33
-rw-r--r--kernel/sched/auto_group.h (renamed from kernel/sched_autogroup.h)26
-rw-r--r--kernel/sched/clock.c (renamed from kernel/sched_clock.c)0
-rw-r--r--kernel/sched/core.c (renamed from kernel/sched.c)2187
-rw-r--r--kernel/sched/cpupri.c (renamed from kernel/sched_cpupri.c)4
-rw-r--r--kernel/sched/cpupri.h (renamed from kernel/sched_cpupri.h)0
-rw-r--r--kernel/sched/debug.c (renamed from kernel/sched_debug.c)6
-rw-r--r--kernel/sched/fair.c (renamed from kernel/sched_fair.c)1000
-rw-r--r--kernel/sched/features.h (renamed from kernel/sched_features.h)30
-rw-r--r--kernel/sched/idle_task.c (renamed from kernel/sched_idletask.c)4
-rw-r--r--kernel/sched/rt.c (renamed from kernel/sched_rt.c)218
-rw-r--r--kernel/sched/sched.h1166
-rw-r--r--kernel/sched/stats.c111
-rw-r--r--kernel/sched/stats.h (renamed from kernel/sched_stats.h)109
-rw-r--r--kernel/sched/stop_task.c (renamed from kernel/sched_stoptask.c)4
-rw-r--r--kernel/signal.c8
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/sys.c6
-rw-r--r--kernel/time/clockevents.c1
-rw-r--r--kernel/time/tick-sched.c105
-rw-r--r--kernel/time/timekeeping.c10
-rw-r--r--kernel/timer.c62
-rw-r--r--kernel/trace/trace.c106
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_events_filter.c26
-rw-r--r--kernel/trace/trace_irqsoff.c13
-rw-r--r--kernel/trace/trace_output.c16
-rw-r--r--kernel/trace/trace_sched_wakeup.c13
-rw-r--r--kernel/tsacct.c2
-rw-r--r--kernel/wait.c4
-rw-r--r--lib/debugobjects.c54
-rw-r--r--mm/Kconfig6
-rw-r--r--mm/hugetlb.c2
-rw-r--r--mm/memblock.c961
-rw-r--r--mm/mempolicy.c11
-rw-r--r--mm/nobootmem.c45
-rw-r--r--mm/page_alloc.c508
-rw-r--r--mm/slub.c4
-rw-r--r--net/bluetooth/hci_core.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c10
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c2
-rw-r--r--net/netfilter/nf_conntrack_netlink.c22
-rw-r--r--net/packet/af_packet.c6
-rw-r--r--net/sched/sch_netem.c7
-rw-r--r--net/sched/sch_qfq.c4
-rw-r--r--net/socket.c16
-rw-r--r--security/security.c2
-rw-r--r--sound/soc/codecs/wm8776.c1
-rw-r--r--tools/perf/Documentation/perf-annotate.txt4
-rw-r--r--tools/perf/Documentation/perf-buildid-list.txt2
-rw-r--r--tools/perf/Documentation/perf-evlist.txt2
-rw-r--r--tools/perf/Documentation/perf-kmem.txt2
-rw-r--r--tools/perf/Documentation/perf-lock.txt2
-rw-r--r--tools/perf/Documentation/perf-record.txt2
-rw-r--r--tools/perf/Documentation/perf-report.txt11
-rw-r--r--tools/perf/Documentation/perf-sched.txt2
-rw-r--r--tools/perf/Documentation/perf-script.txt9
-rw-r--r--tools/perf/Documentation/perf-test.txt8
-rw-r--r--tools/perf/Documentation/perf-timechart.txt2
-rw-r--r--tools/perf/Makefile1
-rw-r--r--tools/perf/builtin-annotate.c132
-rw-r--r--tools/perf/builtin-buildid-list.c53
-rw-r--r--tools/perf/builtin-diff.c21
-rw-r--r--tools/perf/builtin-evlist.c2
-rw-r--r--tools/perf/builtin-inject.c118
-rw-r--r--tools/perf/builtin-kmem.c16
-rw-r--r--tools/perf/builtin-kvm.c2
-rw-r--r--tools/perf/builtin-lock.c12
-rw-r--r--tools/perf/builtin-probe.c1
-rw-r--r--tools/perf/builtin-record.c603
-rw-r--r--tools/perf/builtin-report.c236
-rw-r--r--tools/perf/builtin-sched.c200
-rw-r--r--tools/perf/builtin-script.c130
-rw-r--r--tools/perf/builtin-stat.c134
-rw-r--r--tools/perf/builtin-test.c545
-rw-r--r--tools/perf/builtin-timechart.c38
-rw-r--r--tools/perf/builtin-top.c558
-rw-r--r--tools/perf/perf.c33
-rw-r--r--tools/perf/perf.h24
-rw-r--r--tools/perf/util/annotate.c8
-rw-r--r--tools/perf/util/annotate.h5
-rw-r--r--tools/perf/util/build-id.c26
-rw-r--r--tools/perf/util/build-id.h2
-rw-r--r--tools/perf/util/callchain.h3
-rw-r--r--tools/perf/util/cgroup.c15
-rw-r--r--tools/perf/util/config.c5
-rw-r--r--tools/perf/util/debugfs.c35
-rw-r--r--tools/perf/util/debugfs.h31
-rw-r--r--tools/perf/util/event.c360
-rw-r--r--tools/perf/util/event.h68
-rw-r--r--tools/perf/util/evlist.c299
-rw-r--r--tools/perf/util/evlist.h43
-rw-r--r--tools/perf/util/evsel.c154
-rw-r--r--tools/perf/util/evsel.h8
-rw-r--r--tools/perf/util/header.c741
-rw-r--r--tools/perf/util/header.h51
-rw-r--r--tools/perf/util/hist.h3
-rw-r--r--tools/perf/util/include/linux/bitops.h118
-rw-r--r--tools/perf/util/map.c4
-rw-r--r--tools/perf/util/map.h19
-rw-r--r--tools/perf/util/parse-events.c30
-rw-r--r--tools/perf/util/parse-events.h1
-rw-r--r--tools/perf/util/probe-finder.h1
-rw-r--r--tools/perf/util/scripting-engines/trace-event-perl.c75
-rw-r--r--tools/perf/util/scripting-engines/trace-event-python.c4
-rw-r--r--tools/perf/util/session.c342
-rw-r--r--tools/perf/util/session.h72
-rw-r--r--tools/perf/util/setup.py3
-rw-r--r--tools/perf/util/symbol.c11
-rw-r--r--tools/perf/util/symbol.h1
-rw-r--r--tools/perf/util/thread.c6
-rw-r--r--tools/perf/util/thread.h14
-rw-r--r--tools/perf/util/tool.h50
-rw-r--r--tools/perf/util/top.h20
-rw-r--r--tools/perf/util/trace-event-info.c28
-rw-r--r--tools/perf/util/trace-event-scripting.c2
-rw-r--r--tools/perf/util/trace-event.h8
-rw-r--r--tools/perf/util/ui/browsers/annotate.c16
-rw-r--r--tools/perf/util/ui/browsers/hists.c2
-rw-r--r--tools/perf/util/ui/progress.c3
-rw-r--r--tools/perf/util/usage.c5
-rw-r--r--tools/perf/util/util.h11
-rw-r--r--tools/perf/util/values.c1
-rw-r--r--virt/kvm/assigned-dev.c93
455 files changed, 13008 insertions, 8885 deletions
diff --git a/Documentation/DocBook/debugobjects.tmpl b/Documentation/DocBook/debugobjects.tmpl
index 08ff908aa7a2..24979f691e3e 100644
--- a/Documentation/DocBook/debugobjects.tmpl
+++ b/Documentation/DocBook/debugobjects.tmpl
@@ -96,6 +96,7 @@
96 <listitem><para>debug_object_deactivate</para></listitem> 96 <listitem><para>debug_object_deactivate</para></listitem>
97 <listitem><para>debug_object_destroy</para></listitem> 97 <listitem><para>debug_object_destroy</para></listitem>
98 <listitem><para>debug_object_free</para></listitem> 98 <listitem><para>debug_object_free</para></listitem>
99 <listitem><para>debug_object_assert_init</para></listitem>
99 </itemizedlist> 100 </itemizedlist>
100 Each of these functions takes the address of the real object and 101 Each of these functions takes the address of the real object and
101 a pointer to the object type specific debug description 102 a pointer to the object type specific debug description
@@ -273,6 +274,26 @@
273 debug checks. 274 debug checks.
274 </para> 275 </para>
275 </sect1> 276 </sect1>
277
278 <sect1 id="debug_object_assert_init">
279 <title>debug_object_assert_init</title>
280 <para>
281 This function is called to assert that an object has been
282 initialized.
283 </para>
284 <para>
285 When the real object is not tracked by debugobjects, it calls
286 fixup_assert_init of the object type description structure
287 provided by the caller, with the hardcoded object state
 288 ODEBUG_STATE_NOTAVAILABLE. The fixup function can correct the problem
289 by calling debug_object_init and other specific initializing
290 functions.
291 </para>
292 <para>
293 When the real object is already tracked by debugobjects it is
294 ignored.
295 </para>
296 </sect1>
276 </chapter> 297 </chapter>
277 <chapter id="fixupfunctions"> 298 <chapter id="fixupfunctions">
278 <title>Fixup functions</title> 299 <title>Fixup functions</title>
@@ -381,6 +402,35 @@
381 statistics. 402 statistics.
382 </para> 403 </para>
383 </sect1> 404 </sect1>
405 <sect1 id="fixup_assert_init">
406 <title>fixup_assert_init</title>
407 <para>
408 This function is called from the debug code whenever a problem
409 in debug_object_assert_init is detected.
410 </para>
411 <para>
412 Called from debug_object_assert_init() with a hardcoded state
413 ODEBUG_STATE_NOTAVAILABLE when the object is not found in the
414 debug bucket.
415 </para>
416 <para>
417 The function returns 1 when the fixup was successful,
418 otherwise 0. The return value is used to update the
419 statistics.
420 </para>
421 <para>
422 Note, this function should make sure debug_object_init() is
423 called before returning.
424 </para>
425 <para>
426 The handling of statically initialized objects is a special
427 case. The fixup function should check if this is a legitimate
428 case of a statically initialized object or not. In this case only
429 debug_object_init() should be called to make the object known to
430 the tracker. Then the function should return 0 because this is not
431 a real fixup.
432 </para>
433 </sect1>
384 </chapter> 434 </chapter>
385 <chapter id="bugs"> 435 <chapter id="bugs">
386 <title>Known Bugs And Assumptions</title> 436 <title>Known Bugs And Assumptions</title>
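The new debug_object_assert_init()/fixup_assert_init pair plugs into the
existing struct debug_obj_descr alongside the other fixup callbacks. A
minimal sketch of the statically-initialized-object case described above
(struct my_obj, MY_OBJ_STATIC_MAGIC, and the descriptor name are
illustrative assumptions, not taken from this patch):

	#define MY_OBJ_STATIC_MAGIC	0x4d594f42UL

	struct my_obj {
		unsigned long	magic;	/* set to MY_OBJ_STATIC_MAGIC by the
					 * assumed static initializer */
	};

	static struct debug_obj_descr my_obj_debug_descr;

	static int my_obj_fixup_assert_init(void *addr, enum debug_obj_state state)
	{
		struct my_obj *obj = addr;

		switch (state) {
		case ODEBUG_STATE_NOTAVAILABLE:
			if (obj->magic == MY_OBJ_STATIC_MAGIC) {
				/*
				 * Legitimate statically initialized object:
				 * make it known to the tracker, then return
				 * 0 because no real fixup took place.
				 */
				debug_object_init(obj, &my_obj_debug_descr);
				return 0;
			}
			return 0;
		default:
			return 0;
		}
	}

	static struct debug_obj_descr my_obj_debug_descr = {
		.name			= "my_obj",
		.fixup_assert_init	= my_obj_fixup_assert_init,
	};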
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 0c134f8afc6f..bff2d8be1e18 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -328,6 +328,12 @@ over a rather long period of time, but improvements are always welcome!
328 RCU rather than SRCU, because RCU is almost always faster and 328 RCU rather than SRCU, because RCU is almost always faster and
329 easier to use than is SRCU. 329 easier to use than is SRCU.
330 330
331 If you need to enter your read-side critical section in a
332 hardirq or exception handler, and then exit that same read-side
333 critical section in the task that was interrupted, then you need
 334 to use srcu_read_lock_raw() and srcu_read_unlock_raw(), which avoid
 335 the lockdep checking that would otherwise make this practice illegal.
336
331 Also unlike other forms of RCU, explicit initialization 337 Also unlike other forms of RCU, explicit initialization
332 and cleanup is required via init_srcu_struct() and 338 and cleanup is required via init_srcu_struct() and
333 cleanup_srcu_struct(). These are passed a "struct srcu_struct" 339 cleanup_srcu_struct(). These are passed a "struct srcu_struct"
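A minimal sketch of this hardirq-to-task handoff (my_srcu, my_srcu_idx,
and the two functions are illustrative names, and a single in-flight
read-side critical section is assumed). Note that srcu_read_lock_raw()
returns an index that must be handed to the matching
srcu_read_unlock_raw():

	static struct srcu_struct my_srcu;	/* init_srcu_struct() at boot */
	static int my_srcu_idx;

	static irqreturn_t my_irq_handler(int irq, void *dev_id)
	{
		/* Enter the read-side critical section in hardirq context. */
		my_srcu_idx = srcu_read_lock_raw(&my_srcu);
		/* ... start using the SRCU-protected data ... */
		return IRQ_HANDLED;
	}

	/* Runs later in the task that was interrupted. */
	static void my_finish_work(void)
	{
		/* ... finish using the SRCU-protected data ... */
		srcu_read_unlock_raw(&my_srcu, my_srcu_idx);
	}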
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt
index 31852705b586..bf778332a28f 100644
--- a/Documentation/RCU/rcu.txt
+++ b/Documentation/RCU/rcu.txt
@@ -38,11 +38,11 @@ o How can the updater tell when a grace period has completed
38 38
39 Preemptible variants of RCU (CONFIG_TREE_PREEMPT_RCU) get the 39 Preemptible variants of RCU (CONFIG_TREE_PREEMPT_RCU) get the
40 same effect, but require that the readers manipulate CPU-local 40 same effect, but require that the readers manipulate CPU-local
41 counters. These counters allow limited types of blocking 41 counters. These counters allow limited types of blocking within
42 within RCU read-side critical sections. SRCU also uses 42 RCU read-side critical sections. SRCU also uses CPU-local
43 CPU-local counters, and permits general blocking within 43 counters, and permits general blocking within RCU read-side
44 RCU read-side critical sections. These two variants of 44 critical sections. These variants of RCU detect grace periods
45 RCU detect grace periods by sampling these counters. 45 by sampling these counters.
46 46
47o If I am running on a uniprocessor kernel, which can only do one 47o If I am running on a uniprocessor kernel, which can only do one
48 thing at a time, why should I wait for a grace period? 48 thing at a time, why should I wait for a grace period?
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 4e959208f736..083d88cbc089 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -101,6 +101,11 @@ o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
101 CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning 101 CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning
102 messages. 102 messages.
103 103
104o A hardware or software issue shuts off the scheduler-clock
105 interrupt on a CPU that is not in dyntick-idle mode. This
106 problem really has happened, and seems to be most likely to
107 result in RCU CPU stall warnings for CONFIG_NO_HZ=n kernels.
108
104o A bug in the RCU implementation. 109o A bug in the RCU implementation.
105 110
106o A hardware failure. This is quite unlikely, but has occurred 111o A hardware failure. This is quite unlikely, but has occurred
@@ -109,12 +114,11 @@ o A hardware failure. This is quite unlikely, but has occurred
109 This resulted in a series of RCU CPU stall warnings, eventually 114 This resulted in a series of RCU CPU stall warnings, eventually
110 leading the realization that the CPU had failed. 115 leading the realization that the CPU had failed.
111 116
 112The RCU, RCU-sched, and RCU-bh implementations have CPU stall 117The RCU, RCU-sched, and RCU-bh implementations have CPU stall warnings.
113warning. SRCU does not have its own CPU stall warnings, but its 118SRCU does not have its own CPU stall warnings, but its calls to
114calls to synchronize_sched() will result in RCU-sched detecting 119synchronize_sched() will result in RCU-sched detecting RCU-sched-related
115RCU-sched-related CPU stalls. Please note that RCU only detects 120CPU stalls. Please note that RCU only detects CPU stalls when there is
116CPU stalls when there is a grace period in progress. No grace period, 121a grace period in progress. No grace period, no CPU stall warnings.
117no CPU stall warnings.
118 122
119To diagnose the cause of the stall, inspect the stack traces. 123To diagnose the cause of the stall, inspect the stack traces.
120The offending function will usually be near the top of the stack. 124The offending function will usually be near the top of the stack.
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index 783d6c134d3f..d67068d0d2b9 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -61,11 +61,24 @@ nreaders This is the number of RCU reading threads supported.
61 To properly exercise RCU implementations with preemptible 61 To properly exercise RCU implementations with preemptible
62 read-side critical sections. 62 read-side critical sections.
63 63
64onoff_interval
65 The number of seconds between each attempt to execute a
66 randomly selected CPU-hotplug operation. Defaults to
67 zero, which disables CPU hotplugging. In HOTPLUG_CPU=n
68 kernels, rcutorture will silently refuse to do any
69 CPU-hotplug operations regardless of what value is
70 specified for onoff_interval.
71
64shuffle_interval 72shuffle_interval
65 The number of seconds to keep the test threads affinitied 73 The number of seconds to keep the test threads affinitied
66 to a particular subset of the CPUs, defaults to 3 seconds. 74 to a particular subset of the CPUs, defaults to 3 seconds.
67 Used in conjunction with test_no_idle_hz. 75 Used in conjunction with test_no_idle_hz.
68 76
77shutdown_secs The number of seconds to run the test before terminating
78 the test and powering off the system. The default is
79 zero, which disables test termination and system shutdown.
80 This capability is useful for automated testing.
81
69stat_interval The number of seconds between output of torture 82stat_interval The number of seconds between output of torture
70 statistics (via printk()). Regardless of the interval, 83 statistics (via printk()). Regardless of the interval,
71 statistics are printed when the module is unloaded. 84 statistics are printed when the module is unloaded.
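For example, a rcutorture run that attempts a randomly chosen CPU-hotplug
operation every 30 seconds, prints statistics every 15 seconds, and shuts
the system down after half an hour might be started as follows (the
specific values are illustrative only):

	modprobe rcutorture onoff_interval=30 stat_interval=15 shutdown_secs=1800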
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index aaf65f6c6cd7..49587abfc2f7 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -105,14 +105,10 @@ o "dt" is the current value of the dyntick counter that is incremented
105 or one greater than the interrupt-nesting depth otherwise. 105 or one greater than the interrupt-nesting depth otherwise.
106 The number after the second "/" is the NMI nesting depth. 106 The number after the second "/" is the NMI nesting depth.
107 107
108 This field is displayed only for CONFIG_NO_HZ kernels.
109
110o "df" is the number of times that some other CPU has forced a 108o "df" is the number of times that some other CPU has forced a
111 quiescent state on behalf of this CPU due to this CPU being in 109 quiescent state on behalf of this CPU due to this CPU being in
112 dynticks-idle state. 110 dynticks-idle state.
113 111
114 This field is displayed only for CONFIG_NO_HZ kernels.
115
116o "of" is the number of times that some other CPU has forced a 112o "of" is the number of times that some other CPU has forced a
117 quiescent state on behalf of this CPU due to this CPU being 113 quiescent state on behalf of this CPU due to this CPU being
118 offline. In a perfect world, this might never happen, but it 114 offline. In a perfect world, this might never happen, but it
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 6ef692667e2f..6bbe8dcdc3da 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -4,6 +4,7 @@ to start learning about RCU:
41. What is RCU, Fundamentally? http://lwn.net/Articles/262464/ 41. What is RCU, Fundamentally? http://lwn.net/Articles/262464/
52. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/ 52. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/
63. RCU part 3: the RCU API http://lwn.net/Articles/264090/ 63. RCU part 3: the RCU API http://lwn.net/Articles/264090/
74. The RCU API, 2010 Edition http://lwn.net/Articles/418853/
7 8
8 9
9What is RCU? 10What is RCU?
@@ -834,6 +835,8 @@ SRCU: Critical sections Grace period Barrier
834 835
835 srcu_read_lock synchronize_srcu N/A 836 srcu_read_lock synchronize_srcu N/A
836 srcu_read_unlock synchronize_srcu_expedited 837 srcu_read_unlock synchronize_srcu_expedited
838 srcu_read_lock_raw
839 srcu_read_unlock_raw
837 srcu_dereference 840 srcu_dereference
838 841
839SRCU: Initialization/cleanup 842SRCU: Initialization/cleanup
@@ -855,27 +858,33 @@ list can be helpful:
855 858
856a. Will readers need to block? If so, you need SRCU. 859a. Will readers need to block? If so, you need SRCU.
857 860
858b. What about the -rt patchset? If readers would need to block 861b. Is it necessary to start a read-side critical section in a
862 hardirq handler or exception handler, and then to complete
863 this read-side critical section in the task that was
864 interrupted? If so, you need SRCU's srcu_read_lock_raw() and
865 srcu_read_unlock_raw() primitives.
866
867c. What about the -rt patchset? If readers would need to block
 859 in a non-rt kernel, you need SRCU. If readers would block 868 in a non-rt kernel, you need SRCU. If readers would block
860 in a -rt kernel, but not in a non-rt kernel, SRCU is not 869 in a -rt kernel, but not in a non-rt kernel, SRCU is not
861 necessary. 870 necessary.
862 871
863c. Do you need to treat NMI handlers, hardirq handlers, 872d. Do you need to treat NMI handlers, hardirq handlers,
864 and code segments with preemption disabled (whether 873 and code segments with preemption disabled (whether
865 via preempt_disable(), local_irq_save(), local_bh_disable(), 874 via preempt_disable(), local_irq_save(), local_bh_disable(),
866 or some other mechanism) as if they were explicit RCU readers? 875 or some other mechanism) as if they were explicit RCU readers?
867 If so, you need RCU-sched. 876 If so, you need RCU-sched.
868 877
869d. Do you need RCU grace periods to complete even in the face 878e. Do you need RCU grace periods to complete even in the face
870 of softirq monopolization of one or more of the CPUs? For 879 of softirq monopolization of one or more of the CPUs? For
871 example, is your code subject to network-based denial-of-service 880 example, is your code subject to network-based denial-of-service
872 attacks? If so, you need RCU-bh. 881 attacks? If so, you need RCU-bh.
873 882
874e. Is your workload too update-intensive for normal use of 883f. Is your workload too update-intensive for normal use of
875 RCU, but inappropriate for other synchronization mechanisms? 884 RCU, but inappropriate for other synchronization mechanisms?
876 If so, consider SLAB_DESTROY_BY_RCU. But please be careful! 885 If so, consider SLAB_DESTROY_BY_RCU. But please be careful!
877 886
878f. Otherwise, use RCU. 887g. Otherwise, use RCU.
879 888
880Of course, this all assumes that you have determined that RCU is in fact 889Of course, this all assumes that you have determined that RCU is in fact
881the right tool for your job. 890the right tool for your job.
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
index 3bd585b44927..27f2b21a9d5c 100644
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
@@ -84,6 +84,93 @@ compiler optimizes the section accessing atomic_t variables.
84 84
85*** YOU HAVE BEEN WARNED! *** 85*** YOU HAVE BEEN WARNED! ***
86 86
87Properly aligned pointers, longs, ints, and chars (and unsigned
88equivalents) may be atomically loaded from and stored to in the same
89sense as described for atomic_read() and atomic_set(). The ACCESS_ONCE()
90macro should be used to prevent the compiler from using optimizations
91that might otherwise optimize accesses out of existence on the one hand,
92or that might create unsolicited accesses on the other.
93
94For example consider the following code:
95
96 while (a > 0)
97 do_something();
98
99If the compiler can prove that do_something() does not store to the
100variable a, then the compiler is within its rights transforming this to
101the following:
102
103 tmp = a;
 104 if (tmp > 0)
105 for (;;)
106 do_something();
107
108If you don't want the compiler to do this (and you probably don't), then
109you should use something like the following:
110
 111 while (ACCESS_ONCE(a) > 0)
112 do_something();
113
114Alternatively, you could place a barrier() call in the loop.
115
116For another example, consider the following code:
117
118 tmp_a = a;
119 do_something_with(tmp_a);
120 do_something_else_with(tmp_a);
121
122If the compiler can prove that do_something_with() does not store to the
123variable a, then the compiler is within its rights to manufacture an
124additional load as follows:
125
126 tmp_a = a;
127 do_something_with(tmp_a);
128 tmp_a = a;
129 do_something_else_with(tmp_a);
130
131This could fatally confuse your code if it expected the same value
132to be passed to do_something_with() and do_something_else_with().
133
134The compiler would be likely to manufacture this additional load if
135do_something_with() was an inline function that made very heavy use
136of registers: reloading from variable a could save a flush to the
137stack and later reload. To prevent the compiler from attacking your
138code in this manner, write the following:
139
140 tmp_a = ACCESS_ONCE(a);
141 do_something_with(tmp_a);
142 do_something_else_with(tmp_a);
143
144For a final example, consider the following code, assuming that the
145variable a is set at boot time before the second CPU is brought online
146and never changed later, so that memory barriers are not needed:
147
148 if (a)
149 b = 9;
150 else
151 b = 42;
152
153The compiler is within its rights to manufacture an additional store
154by transforming the above code into the following:
155
156 b = 42;
157 if (a)
158 b = 9;
159
160This could come as a fatal surprise to other code running concurrently
161that expected b to never have the value 42 if a was zero. To prevent
162the compiler from doing this, write something like:
163
164 if (a)
165 ACCESS_ONCE(b) = 9;
166 else
167 ACCESS_ONCE(b) = 42;
168
169Don't even -think- about doing this without proper use of memory barriers,
170locks, or atomic operations if variable a can change at runtime!
171
172*** WARNING: ACCESS_ONCE() DOES NOT IMPLY A BARRIER! ***
173
87Now, we move onto the atomic operation interfaces typically implemented with 174Now, we move onto the atomic operation interfaces typically implemented with
88the help of assembly code. 175the help of assembly code.
89 176
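For reference, ACCESS_ONCE() works by forcing the access through a
volatile-qualified lvalue; its definition in <linux/compiler.h> is
essentially:

	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

Because every volatile access is a side effect that the compiler must
preserve, it can neither cache the value across loop iterations nor
manufacture the extra loads and stores shown above.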
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 81c287fad79d..e229769606f2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1885,6 +1885,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1885 arch_perfmon: [X86] Force use of architectural 1885 arch_perfmon: [X86] Force use of architectural
1886 perfmon on Intel CPUs instead of the 1886 perfmon on Intel CPUs instead of the
1887 CPU specific event set. 1887 CPU specific event set.
1888 timer: [X86] Force use of architectural NMI
1889 timer mode (see also oprofile.timer
1890 for generic hr timer mode)
1891 [s390] Force legacy basic mode sampling
1892 (report cpu_type "timer")
1888 1893
1889 oops=panic Always panic on oopses. Default is to just kill the 1894 oops=panic Always panic on oopses. Default is to just kill the
1890 process, but there is a small probability of 1895 process, but there is a small probability of
@@ -2750,11 +2755,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2750 functions are at fixed addresses, they make nice 2755 functions are at fixed addresses, they make nice
2751 targets for exploits that can control RIP. 2756 targets for exploits that can control RIP.
2752 2757
2753 emulate Vsyscalls turn into traps and are emulated 2758 emulate [default] Vsyscalls turn into traps and are
2754 reasonably safely. 2759 emulated reasonably safely.
2755 2760
2756 native [default] Vsyscalls are native syscall 2761 native Vsyscalls are native syscall instructions.
2757 instructions.
2758 This is a little bit faster than trapping 2762 This is a little bit faster than trapping
2759 and makes a few dynamic recompilers work 2763 and makes a few dynamic recompilers work
2760 better than they would in emulation mode. 2764 better than they would in emulation mode.
diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt
index abf768c681e2..5dbc99c04f6e 100644
--- a/Documentation/lockdep-design.txt
+++ b/Documentation/lockdep-design.txt
@@ -221,3 +221,66 @@ when the chain is validated for the first time, is then put into a hash
221table, which hash-table can be checked in a lockfree manner. If the 221table, which hash-table can be checked in a lockfree manner. If the
222locking chain occurs again later on, the hash table tells us that we 222locking chain occurs again later on, the hash table tells us that we
223dont have to validate the chain again. 223dont have to validate the chain again.
224
225Troubleshooting:
226----------------
227
 228The validator tracks a maximum of MAX_LOCKDEP_KEYS lock classes.
229Exceeding this number will trigger the following lockdep warning:
230
231 (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
232
233By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical
234desktop systems have less than 1,000 lock classes, so this warning
235normally results from lock-class leakage or failure to properly
236initialize locks. These two problems are illustrated below:
237
2381. Repeated module loading and unloading while running the validator
239 will result in lock-class leakage. The issue here is that each
240 load of the module will create a new set of lock classes for
241 that module's locks, but module unloading does not remove old
242 classes (see below discussion of reuse of lock classes for why).
243 Therefore, if that module is loaded and unloaded repeatedly,
244 the number of lock classes will eventually reach the maximum.
245
2462. Using structures such as arrays that have large numbers of
247 locks that are not explicitly initialized. For example,
248 a hash table with 8192 buckets where each bucket has its own
249 spinlock_t will consume 8192 lock classes -unless- each spinlock
250 is explicitly initialized at runtime, for example, using the
251 run-time spin_lock_init() as opposed to compile-time initializers
252 such as __SPIN_LOCK_UNLOCKED(). Failure to properly initialize
253 the per-bucket spinlocks would guarantee lock-class overflow.
254 In contrast, a loop that called spin_lock_init() on each lock
255 would place all 8192 locks into a single lock class.
256
257 The moral of this story is that you should always explicitly
258 initialize your locks.
259
260One might argue that the validator should be modified to allow
261lock classes to be reused. However, if you are tempted to make this
262argument, first review the code and think through the changes that would
263be required, keeping in mind that the lock classes to be removed are
264likely to be linked into the lock-dependency graph. This turns out to
265be harder to do than to say.
266
267Of course, if you do run out of lock classes, the next thing to do is
268to find the offending lock classes. First, the following command gives
269you the number of lock classes currently in use along with the maximum:
270
271 grep "lock-classes" /proc/lockdep_stats
272
273This command produces the following output on a modest system:
274
275 lock-classes: 748 [max: 8191]
276
277If the number allocated (748 above) increases continually over time,
278then there is likely a leak. The following command can be used to
279identify the leaking lock classes:
280
281 grep "BD" /proc/lockdep
282
283Run the command and save the output, then compare against the output from
284a later run of this command to identify the leakers. This same output
285can also help you find situations where runtime lock initialization has
286been omitted.
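A minimal sketch of the hash-table case from item 2 above (the bucket
structure and table size are illustrative): because spin_lock_init() keys
the lock class on its call site, the single call in the loop below places
all 8192 locks into one class instead of 8192:

	struct my_hash_bucket {
		struct hlist_head	chain;
		spinlock_t		lock;
	};

	static struct my_hash_bucket my_hash[8192];

	static void __init my_hash_init(void)
	{
		int i;

		for (i = 0; i < 8192; i++) {
			INIT_HLIST_HEAD(&my_hash[i].chain);
			/* One call site, hence one lock class total. */
			spin_lock_init(&my_hash[i].lock);
		}
	}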
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index b510564aac7e..bb24c2a0e870 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -191,8 +191,6 @@ And for string fields they are:
191 191
192Currently, only exact string matches are supported. 192Currently, only exact string matches are supported.
193 193
194Currently, the maximum number of predicates in a filter is 16.
195
1965.2 Setting filters 1945.2 Setting filters
197------------------- 195-------------------
198 196
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 7945b0bd35e2..e2a4b5287361 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1100,6 +1100,15 @@ emulate them efficiently. The fields in each entry are defined as follows:
1100 eax, ebx, ecx, edx: the values returned by the cpuid instruction for 1100 eax, ebx, ecx, edx: the values returned by the cpuid instruction for
1101 this function/index combination 1101 this function/index combination
1102 1102
1103The TSC deadline timer feature (CPUID leaf 1, ecx[24]) is always returned
1104as false, since the feature depends on KVM_CREATE_IRQCHIP for local APIC
1105support. Instead it is reported via
1106
1107 ioctl(KVM_CHECK_EXTENSION, KVM_CAP_TSC_DEADLINE_TIMER)
1108
1109if that returns true and you use KVM_CREATE_IRQCHIP, or if you emulate the
1110feature in userspace, then you can enable the feature for KVM_SET_CPUID2.
1111
11034.47 KVM_PPC_GET_PVINFO 11124.47 KVM_PPC_GET_PVINFO
1104 1113
1105Capability: KVM_CAP_PPC_GET_PVINFO 1114Capability: KVM_CAP_PPC_GET_PVINFO
@@ -1151,6 +1160,13 @@ following flags are specified:
1151/* Depends on KVM_CAP_IOMMU */ 1160/* Depends on KVM_CAP_IOMMU */
1152#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) 1161#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
1153 1162
1163The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
1164isolation of the device. Usages not specifying this flag are deprecated.
1165
1166Only PCI header type 0 devices with PCI BAR resources are supported by
1167device assignment. The user requesting this ioctl must have read/write
1168access to the PCI sysfs resource files associated with the device.
1169
11544.49 KVM_DEASSIGN_PCI_DEVICE 11704.49 KVM_DEASSIGN_PCI_DEVICE
1155 1171
1156Capability: KVM_CAP_DEVICE_DEASSIGNMENT 1172Capability: KVM_CAP_DEVICE_DEASSIGNMENT
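A userspace sketch of the capability probe described above (kvm_fd is
assumed to be an open /dev/kvm file descriptor; error handling omitted):

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int has_tsc_deadline = ioctl(kvm_fd, KVM_CHECK_EXTENSION,
				     KVM_CAP_TSC_DEADLINE_TIMER);
	/*
	 * If this returns > 0 and an in-kernel irqchip is created via
	 * KVM_CREATE_IRQCHIP, the CPUID leaf 1 ecx[24] bit may then be
	 * set through KVM_SET_CPUID2.
	 */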
diff --git a/MAINTAINERS b/MAINTAINERS
index 6afba60c3904..62f1cd357ddf 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1698,11 +1698,9 @@ F: arch/x86/include/asm/tce.h
1698 1698
1699CAN NETWORK LAYER 1699CAN NETWORK LAYER
1700M: Oliver Hartkopp <socketcan@hartkopp.net> 1700M: Oliver Hartkopp <socketcan@hartkopp.net>
1701M: Oliver Hartkopp <oliver.hartkopp@volkswagen.de>
1702M: Urs Thuermann <urs.thuermann@volkswagen.de>
1703L: linux-can@vger.kernel.org 1701L: linux-can@vger.kernel.org
1704L: netdev@vger.kernel.org 1702W: http://gitorious.org/linux-can
1705W: http://developer.berlios.de/projects/socketcan/ 1703T: git git://gitorious.org/linux-can/linux-can-next.git
1706S: Maintained 1704S: Maintained
1707F: net/can/ 1705F: net/can/
1708F: include/linux/can.h 1706F: include/linux/can.h
@@ -1713,9 +1711,10 @@ F: include/linux/can/gw.h
1713 1711
1714CAN NETWORK DRIVERS 1712CAN NETWORK DRIVERS
1715M: Wolfgang Grandegger <wg@grandegger.com> 1713M: Wolfgang Grandegger <wg@grandegger.com>
1714M: Marc Kleine-Budde <mkl@pengutronix.de>
1716L: linux-can@vger.kernel.org 1715L: linux-can@vger.kernel.org
1717L: netdev@vger.kernel.org 1716W: http://gitorious.org/linux-can
1718W: http://developer.berlios.de/projects/socketcan/ 1717T: git git://gitorious.org/linux-can/linux-can-next.git
1719S: Maintained 1718S: Maintained
1720F: drivers/net/can/ 1719F: drivers/net/can/
1721F: include/linux/can/dev.h 1720F: include/linux/can/dev.h
@@ -2700,7 +2699,7 @@ FIREWIRE SUBSYSTEM
2700M: Stefan Richter <stefanr@s5r6.in-berlin.de> 2699M: Stefan Richter <stefanr@s5r6.in-berlin.de>
2701L: linux1394-devel@lists.sourceforge.net 2700L: linux1394-devel@lists.sourceforge.net
2702W: http://ieee1394.wiki.kernel.org/ 2701W: http://ieee1394.wiki.kernel.org/
2703T: git git://git.kernel.org/pub/scm/linux/kernel/git/ieee1394/linux1394-2.6.git 2702T: git git://git.kernel.org/pub/scm/linux/kernel/git/ieee1394/linux1394.git
2704S: Maintained 2703S: Maintained
2705F: drivers/firewire/ 2704F: drivers/firewire/
2706F: include/linux/firewire*.h 2705F: include/linux/firewire*.h
diff --git a/Makefile b/Makefile
index ea51081812f3..adddd11c3b3b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
1VERSION = 3 1VERSION = 3
2PATCHLEVEL = 2 2PATCHLEVEL = 2
3SUBLEVEL = 0 3SUBLEVEL = 0
4EXTRAVERSION = -rc7 4EXTRAVERSION =
5NAME = Saber-toothed Squirrel 5NAME = Saber-toothed Squirrel
6 6
7# *DOCUMENTATION* 7# *DOCUMENTATION*
diff --git a/arch/Kconfig b/arch/Kconfig
index 4b0669cbb3b0..2505740b81d2 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -30,6 +30,10 @@ config OPROFILE_EVENT_MULTIPLEX
30config HAVE_OPROFILE 30config HAVE_OPROFILE
31 bool 31 bool
32 32
33config OPROFILE_NMI_TIMER
34 def_bool y
35 depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI
36
33config KPROBES 37config KPROBES
34 bool "Kprobes" 38 bool "Kprobes"
35 depends on MODULES 39 depends on MODULES
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 776d76b8cb69..b259c7c644e3 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1246,7 +1246,7 @@ config PL310_ERRATA_588369
1246 1246
1247config ARM_ERRATA_720789 1247config ARM_ERRATA_720789
1248 bool "ARM errata: TLBIASIDIS and TLBIMVAIS operations can broadcast a faulty ASID" 1248 bool "ARM errata: TLBIASIDIS and TLBIMVAIS operations can broadcast a faulty ASID"
1249 depends on CPU_V7 && SMP 1249 depends on CPU_V7
1250 help 1250 help
1251 This option enables the workaround for the 720789 Cortex-A9 (prior to 1251 This option enables the workaround for the 720789 Cortex-A9 (prior to
1252 r2p0) erratum. A faulty ASID can be sent to the other CPUs for the 1252 r2p0) erratum. A faulty ASID can be sent to the other CPUs for the
@@ -1282,7 +1282,7 @@ config ARM_ERRATA_743622
1282 1282
1283config ARM_ERRATA_751472 1283config ARM_ERRATA_751472
1284 bool "ARM errata: Interrupted ICIALLUIS may prevent completion of broadcasted operation" 1284 bool "ARM errata: Interrupted ICIALLUIS may prevent completion of broadcasted operation"
1285 depends on CPU_V7 && SMP 1285 depends on CPU_V7
1286 help 1286 help
1287 This option enables the workaround for the 751472 Cortex-A9 (prior 1287 This option enables the workaround for the 751472 Cortex-A9 (prior
1288 to r3p0) erratum. An interrupted ICIALLUIS operation may prevent the 1288 to r3p0) erratum. An interrupted ICIALLUIS operation may prevent the
diff --git a/arch/arm/common/pl330.c b/arch/arm/common/pl330.c
index f407a6b35d3d..8d8df744f7a5 100644
--- a/arch/arm/common/pl330.c
+++ b/arch/arm/common/pl330.c
@@ -221,17 +221,6 @@
221 */ 221 */
222#define MCODE_BUFF_PER_REQ 256 222#define MCODE_BUFF_PER_REQ 256
223 223
224/*
225 * Mark a _pl330_req as free.
226 * We do it by writing DMAEND as the first instruction
227 * because no valid request is going to have DMAEND as
228 * its first instruction to execute.
229 */
230#define MARK_FREE(req) do { \
231 _emit_END(0, (req)->mc_cpu); \
232 (req)->mc_len = 0; \
233 } while (0)
234
235/* If the _pl330_req is available to the client */ 224/* If the _pl330_req is available to the client */
236#define IS_FREE(req) (*((u8 *)((req)->mc_cpu)) == CMD_DMAEND) 225#define IS_FREE(req) (*((u8 *)((req)->mc_cpu)) == CMD_DMAEND)
237 226
@@ -301,8 +290,10 @@ struct pl330_thread {
301 struct pl330_dmac *dmac; 290 struct pl330_dmac *dmac;
302 /* Only two at a time */ 291 /* Only two at a time */
303 struct _pl330_req req[2]; 292 struct _pl330_req req[2];
304 /* Index of the last submitted request */ 293 /* Index of the last enqueued request */
305 unsigned lstenq; 294 unsigned lstenq;
295 /* Index of the last submitted request or -1 if the DMA is stopped */
296 int req_running;
306}; 297};
307 298
308enum pl330_dmac_state { 299enum pl330_dmac_state {
@@ -778,6 +769,22 @@ static inline void _execute_DBGINSN(struct pl330_thread *thrd,
778 writel(0, regs + DBGCMD); 769 writel(0, regs + DBGCMD);
779} 770}
780 771
772/*
773 * Mark a _pl330_req as free.
774 * We do it by writing DMAEND as the first instruction
775 * because no valid request is going to have DMAEND as
776 * its first instruction to execute.
777 */
778static void mark_free(struct pl330_thread *thrd, int idx)
779{
780 struct _pl330_req *req = &thrd->req[idx];
781
782 _emit_END(0, req->mc_cpu);
783 req->mc_len = 0;
784
785 thrd->req_running = -1;
786}
787
781static inline u32 _state(struct pl330_thread *thrd) 788static inline u32 _state(struct pl330_thread *thrd)
782{ 789{
783 void __iomem *regs = thrd->dmac->pinfo->base; 790 void __iomem *regs = thrd->dmac->pinfo->base;
@@ -836,31 +843,6 @@ static inline u32 _state(struct pl330_thread *thrd)
836 } 843 }
837} 844}
838 845
839/* If the request 'req' of thread 'thrd' is currently active */
840static inline bool _req_active(struct pl330_thread *thrd,
841 struct _pl330_req *req)
842{
843 void __iomem *regs = thrd->dmac->pinfo->base;
844 u32 buf = req->mc_bus, pc = readl(regs + CPC(thrd->id));
845
846 if (IS_FREE(req))
847 return false;
848
849 return (pc >= buf && pc <= buf + req->mc_len) ? true : false;
850}
851
852/* Returns 0 if the thread is inactive, ID of active req + 1 otherwise */
853static inline unsigned _thrd_active(struct pl330_thread *thrd)
854{
855 if (_req_active(thrd, &thrd->req[0]))
856 return 1; /* First req active */
857
858 if (_req_active(thrd, &thrd->req[1]))
859 return 2; /* Second req active */
860
861 return 0;
862}
863
864static void _stop(struct pl330_thread *thrd) 846static void _stop(struct pl330_thread *thrd)
865{ 847{
866 void __iomem *regs = thrd->dmac->pinfo->base; 848 void __iomem *regs = thrd->dmac->pinfo->base;
@@ -892,17 +874,22 @@ static bool _trigger(struct pl330_thread *thrd)
892 struct _arg_GO go; 874 struct _arg_GO go;
893 unsigned ns; 875 unsigned ns;
894 u8 insn[6] = {0, 0, 0, 0, 0, 0}; 876 u8 insn[6] = {0, 0, 0, 0, 0, 0};
877 int idx;
895 878
896 /* Return if already ACTIVE */ 879 /* Return if already ACTIVE */
897 if (_state(thrd) != PL330_STATE_STOPPED) 880 if (_state(thrd) != PL330_STATE_STOPPED)
898 return true; 881 return true;
899 882
900 if (!IS_FREE(&thrd->req[1 - thrd->lstenq])) 883 idx = 1 - thrd->lstenq;
901 req = &thrd->req[1 - thrd->lstenq]; 884 if (!IS_FREE(&thrd->req[idx]))
902 else if (!IS_FREE(&thrd->req[thrd->lstenq])) 885 req = &thrd->req[idx];
903 req = &thrd->req[thrd->lstenq]; 886 else {
904 else 887 idx = thrd->lstenq;
905 req = NULL; 888 if (!IS_FREE(&thrd->req[idx]))
889 req = &thrd->req[idx];
890 else
891 req = NULL;
892 }
906 893
907 /* Return if no request */ 894 /* Return if no request */
908 if (!req || !req->r) 895 if (!req || !req->r)
@@ -933,6 +920,8 @@ static bool _trigger(struct pl330_thread *thrd)
933 /* Only manager can execute GO */ 920 /* Only manager can execute GO */
934 _execute_DBGINSN(thrd, insn, true); 921 _execute_DBGINSN(thrd, insn, true);
935 922
923 thrd->req_running = idx;
924
936 return true; 925 return true;
937} 926}
938 927
@@ -1382,8 +1371,8 @@ static void pl330_dotask(unsigned long data)
1382 1371
1383 thrd->req[0].r = NULL; 1372 thrd->req[0].r = NULL;
1384 thrd->req[1].r = NULL; 1373 thrd->req[1].r = NULL;
1385 MARK_FREE(&thrd->req[0]); 1374 mark_free(thrd, 0);
1386 MARK_FREE(&thrd->req[1]); 1375 mark_free(thrd, 1);
1387 1376
1388 /* Clear the reset flag */ 1377 /* Clear the reset flag */
1389 pl330->dmac_tbd.reset_chan &= ~(1 << i); 1378 pl330->dmac_tbd.reset_chan &= ~(1 << i);
@@ -1461,14 +1450,12 @@ int pl330_update(const struct pl330_info *pi)
1461 1450
1462 thrd = &pl330->channels[id]; 1451 thrd = &pl330->channels[id];
1463 1452
1464 active = _thrd_active(thrd); 1453 active = thrd->req_running;
1465 if (!active) /* Aborted */ 1454 if (active == -1) /* Aborted */
1466 continue; 1455 continue;
1467 1456
1468 active -= 1;
1469
1470 rqdone = &thrd->req[active]; 1457 rqdone = &thrd->req[active];
1471 MARK_FREE(rqdone); 1458 mark_free(thrd, active);
1472 1459
1473 /* Get going again ASAP */ 1460 /* Get going again ASAP */
1474 _start(thrd); 1461 _start(thrd);
@@ -1509,7 +1496,7 @@ int pl330_chan_ctrl(void *ch_id, enum pl330_chan_op op)
1509 struct pl330_thread *thrd = ch_id; 1496 struct pl330_thread *thrd = ch_id;
1510 struct pl330_dmac *pl330; 1497 struct pl330_dmac *pl330;
1511 unsigned long flags; 1498 unsigned long flags;
1512 int ret = 0, active; 1499 int ret = 0, active = thrd->req_running;
1513 1500
1514 if (!thrd || thrd->free || thrd->dmac->state == DYING) 1501 if (!thrd || thrd->free || thrd->dmac->state == DYING)
1515 return -EINVAL; 1502 return -EINVAL;
@@ -1525,28 +1512,24 @@ int pl330_chan_ctrl(void *ch_id, enum pl330_chan_op op)
1525 1512
1526 thrd->req[0].r = NULL; 1513 thrd->req[0].r = NULL;
1527 thrd->req[1].r = NULL; 1514 thrd->req[1].r = NULL;
1528 MARK_FREE(&thrd->req[0]); 1515 mark_free(thrd, 0);
1529 MARK_FREE(&thrd->req[1]); 1516 mark_free(thrd, 1);
1530 break; 1517 break;
1531 1518
1532 case PL330_OP_ABORT: 1519 case PL330_OP_ABORT:
1533 active = _thrd_active(thrd);
1534
1535 /* Make sure the channel is stopped */ 1520 /* Make sure the channel is stopped */
1536 _stop(thrd); 1521 _stop(thrd);
1537 1522
1538 /* ABORT is only for the active req */ 1523 /* ABORT is only for the active req */
1539 if (!active) 1524 if (active == -1)
1540 break; 1525 break;
1541 1526
1542 active--;
1543
1544 thrd->req[active].r = NULL; 1527 thrd->req[active].r = NULL;
1545 MARK_FREE(&thrd->req[active]); 1528 mark_free(thrd, active);
1546 1529
1547 /* Start the next */ 1530 /* Start the next */
1548 case PL330_OP_START: 1531 case PL330_OP_START:
1549 if (!_thrd_active(thrd) && !_start(thrd)) 1532 if ((active == -1) && !_start(thrd))
1550 ret = -EIO; 1533 ret = -EIO;
1551 break; 1534 break;
1552 1535
@@ -1587,14 +1570,13 @@ int pl330_chan_status(void *ch_id, struct pl330_chanstatus *pstatus)
1587 else 1570 else
1588 pstatus->faulting = false; 1571 pstatus->faulting = false;
1589 1572
1590 active = _thrd_active(thrd); 1573 active = thrd->req_running;
1591 1574
1592 if (!active) { 1575 if (active == -1) {
1593 /* Indicate that the thread is not running */ 1576 /* Indicate that the thread is not running */
1594 pstatus->top_req = NULL; 1577 pstatus->top_req = NULL;
1595 pstatus->wait_req = NULL; 1578 pstatus->wait_req = NULL;
1596 } else { 1579 } else {
1597 active--;
1598 pstatus->top_req = thrd->req[active].r; 1580 pstatus->top_req = thrd->req[active].r;
1599 pstatus->wait_req = !IS_FREE(&thrd->req[1 - active]) 1581 pstatus->wait_req = !IS_FREE(&thrd->req[1 - active])
1600 ? thrd->req[1 - active].r : NULL; 1582 ? thrd->req[1 - active].r : NULL;
@@ -1659,9 +1641,9 @@ void *pl330_request_channel(const struct pl330_info *pi)
 			thrd->free = false;
 			thrd->lstenq = 1;
 			thrd->req[0].r = NULL;
-			MARK_FREE(&thrd->req[0]);
+			mark_free(thrd, 0);
 			thrd->req[1].r = NULL;
-			MARK_FREE(&thrd->req[1]);
+			mark_free(thrd, 1);
 			break;
 		}
 	}
@@ -1767,14 +1749,14 @@ static inline void _reset_thread(struct pl330_thread *thrd)
 	thrd->req[0].mc_bus = pl330->mcode_bus
 				+ (thrd->id * pi->mcbufsz);
 	thrd->req[0].r = NULL;
-	MARK_FREE(&thrd->req[0]);
+	mark_free(thrd, 0);
 
 	thrd->req[1].mc_cpu = thrd->req[0].mc_cpu
 				+ pi->mcbufsz / 2;
 	thrd->req[1].mc_bus = thrd->req[0].mc_bus
 				+ pi->mcbufsz / 2;
 	thrd->req[1].r = NULL;
-	MARK_FREE(&thrd->req[1]);
+	mark_free(thrd, 1);
 }
 
 static int dmac_alloc_threads(struct pl330_dmac *pl330)
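All the pl330 hunks above implement one idea: the 1-based index returned by _thrd_active(), where 0 meant "nothing running", is replaced by a req_running field holding the running slot directly, with -1 as the idle sentinel, and the MARK_FREE() macro becomes a mark_free() helper that also resets that field. A minimal sketch of the bookkeeping pattern, with the driver's structures abridged to illustrative stand-ins:

	/* Abridged stand-ins; the real pl330 structs carry much more state. */
	struct sketch_req {
		void *r;			/* client request, NULL when free */
	};

	struct sketch_thread {
		struct sketch_req req[2];
		int req_running;		/* index of the running slot, or -1 */
	};

	static void mark_free(struct sketch_thread *thrd, int idx)
	{
		thrd->req[idx].r = NULL;	/* slot no longer owns a request */
		thrd->req_running = -1;		/* nothing is executing any more */
	}

Callers then test active == -1 instead of !active and lose the active-- adjustment, which is exactly the simplification visible in the hunks above.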
diff --git a/arch/arm/configs/imx_v4_v5_defconfig b/arch/arm/configs/imx_v4_v5_defconfig
index 11a4192197c8..cf497ce41dfe 100644
--- a/arch/arm/configs/imx_v4_v5_defconfig
+++ b/arch/arm/configs/imx_v4_v5_defconfig
@@ -18,9 +18,10 @@ CONFIG_ARCH_MXC=y
 CONFIG_ARCH_IMX_V4_V5=y
 CONFIG_ARCH_MX1ADS=y
 CONFIG_MACH_SCB9328=y
+CONFIG_MACH_APF9328=y
 CONFIG_MACH_MX21ADS=y
 CONFIG_MACH_MX25_3DS=y
-CONFIG_MACH_EUKREA_CPUIMX25=y
+CONFIG_MACH_EUKREA_CPUIMX25SD=y
 CONFIG_MACH_MX27ADS=y
 CONFIG_MACH_PCM038=y
 CONFIG_MACH_CPUIMX27=y
@@ -72,17 +73,16 @@ CONFIG_MTD_CFI_GEOMETRY=y
 CONFIG_MTD_CFI_INTELEXT=y
 CONFIG_MTD_PHYSMAP=y
 CONFIG_MTD_NAND=y
+CONFIG_MTD_NAND_MXC=y
 CONFIG_MTD_UBI=y
 CONFIG_MISC_DEVICES=y
 CONFIG_EEPROM_AT24=y
 CONFIG_EEPROM_AT25=y
 CONFIG_NETDEVICES=y
-CONFIG_NET_ETHERNET=y
-CONFIG_SMC91X=y
 CONFIG_DM9000=y
+CONFIG_SMC91X=y
 CONFIG_SMC911X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
+CONFIG_SMSC_PHY=y
 # CONFIG_INPUT_MOUSEDEV is not set
 CONFIG_INPUT_EVDEV=y
 # CONFIG_INPUT_KEYBOARD is not set
@@ -100,6 +100,7 @@ CONFIG_I2C_CHARDEV=y
 CONFIG_I2C_IMX=y
 CONFIG_SPI=y
 CONFIG_SPI_IMX=y
+CONFIG_SPI_SPIDEV=y
 CONFIG_W1=y
 CONFIG_W1_MASTER_MXC=y
 CONFIG_W1_SLAVE_THERM=y
@@ -139,6 +140,7 @@ CONFIG_MMC=y
 CONFIG_MMC_MXC=y
 CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=y
+CONFIG_LEDS_GPIO=y
 CONFIG_LEDS_MC13783=y
 CONFIG_LEDS_TRIGGERS=y
 CONFIG_LEDS_TRIGGER_TIMER=y
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 3d0c6fb74ae4..e8e8fe505df1 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -183,7 +183,8 @@ void cpu_idle(void)
 
 	/* endless idle loop with no priority at all */
 	while (1) {
-		tick_nohz_stop_sched_tick(1);
+		tick_nohz_idle_enter();
+		rcu_idle_enter();
 		leds_event(led_idle_start);
 		while (!need_resched()) {
 #ifdef CONFIG_HOTPLUG_CPU
@@ -213,7 +214,8 @@ void cpu_idle(void)
 			}
 		}
 		leds_event(led_idle_end);
-		tick_nohz_restart_sched_tick();
+		rcu_idle_exit();
+		tick_nohz_idle_exit();
 		preempt_enable_no_resched();
 		schedule();
 		preempt_disable();
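The same idle-loop rework appears below for avr32, blackfin, microblaze, mips, and openrisc: the old tick_nohz_stop_sched_tick(1)/tick_nohz_restart_sched_tick() pair is split into tick_nohz_idle_enter()/tick_nohz_idle_exit(), with RCU told about idle separately. A sketch of the common shape, arch-specific work elided and the usual kernel declarations assumed in scope:

	/* Common shape of the reworked cpu_idle() loops in this series. */
	while (1) {
		tick_nohz_idle_enter();		/* stop the periodic tick */
		rcu_idle_enter();		/* RCU may now ignore this CPU */
		while (!need_resched())
			arch_idle();		/* hypothetical low-power wait */
		rcu_idle_exit();		/* back under RCU's watch */
		tick_nohz_idle_exit();		/* restart the tick */
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}

Note the nesting: rcu_idle_enter() comes last on the way in and rcu_idle_exit() first on the way out, so the tick code never runs while RCU believes the CPU is idle.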
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 8fc2c8fcbdc6..c0b59bff6be6 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -52,6 +52,7 @@
 #include <asm/mach/time.h>
 #include <asm/traps.h>
 #include <asm/unwind.h>
+#include <asm/memblock.h>
 
 #if defined(CONFIG_DEPRECATED_PARAM_STRUCT)
 #include "compat.h"
diff --git a/arch/arm/mach-exynos/cpu.c b/arch/arm/mach-exynos/cpu.c
index 90ec247f3b37..cc8d4bd6d0f7 100644
--- a/arch/arm/mach-exynos/cpu.c
+++ b/arch/arm/mach-exynos/cpu.c
@@ -111,11 +111,6 @@ static struct map_desc exynos4_iodesc[] __initdata = {
 		.length		= SZ_4K,
 		.type		= MT_DEVICE,
 	}, {
-		.virtual	= (unsigned long)S5P_VA_SROMC,
-		.pfn		= __phys_to_pfn(EXYNOS4_PA_SROMC),
-		.length		= SZ_4K,
-		.type		= MT_DEVICE,
-	}, {
 		.virtual	= (unsigned long)S3C_VA_USB_HSPHY,
 		.pfn		= __phys_to_pfn(EXYNOS4_PA_HSPHY),
 		.length		= SZ_4K,
diff --git a/arch/arm/mach-imx/Kconfig b/arch/arm/mach-imx/Kconfig
index c44aa974e79c..0e6f1af260b6 100644
--- a/arch/arm/mach-imx/Kconfig
+++ b/arch/arm/mach-imx/Kconfig
@@ -132,7 +132,7 @@ config MACH_MX25_3DS
 	select IMX_HAVE_PLATFORM_MXC_NAND
 	select IMX_HAVE_PLATFORM_SDHCI_ESDHC_IMX
 
-config MACH_EUKREA_CPUIMX25
+config MACH_EUKREA_CPUIMX25SD
 	bool "Support Eukrea CPUIMX25 Platform"
 	select SOC_IMX25
 	select IMX_HAVE_PLATFORM_FLEXCAN
@@ -148,7 +148,7 @@ config MACH_EUKREA_CPUIMX25
 
 choice
 	prompt "Baseboard"
-	depends on MACH_EUKREA_CPUIMX25
+	depends on MACH_EUKREA_CPUIMX25SD
 	default MACH_EUKREA_MBIMXSD25_BASEBOARD
 
 config MACH_EUKREA_MBIMXSD25_BASEBOARD
@@ -542,7 +542,7 @@ config MACH_MX35_3DS
 	  Include support for MX35PDK platform. This includes specific
 	  configurations for the board and its peripherals.
 
-config MACH_EUKREA_CPUIMX35
+config MACH_EUKREA_CPUIMX35SD
 	bool "Support Eukrea CPUIMX35 Platform"
 	select SOC_IMX35
 	select IMX_HAVE_PLATFORM_FLEXCAN
@@ -560,7 +560,7 @@ config MACH_EUKREA_CPUIMX35
 
 choice
 	prompt "Baseboard"
-	depends on MACH_EUKREA_CPUIMX35
+	depends on MACH_EUKREA_CPUIMX35SD
 	default MACH_EUKREA_MBIMXSD35_BASEBOARD
 
 config MACH_EUKREA_MBIMXSD35_BASEBOARD
diff --git a/arch/arm/mach-imx/Makefile b/arch/arm/mach-imx/Makefile
index aba73214c2a8..d97f409ce98b 100644
--- a/arch/arm/mach-imx/Makefile
+++ b/arch/arm/mach-imx/Makefile
@@ -24,7 +24,7 @@ obj-$(CONFIG_MACH_MX21ADS) += mach-mx21ads.o
 
 # i.MX25 based machines
 obj-$(CONFIG_MACH_MX25_3DS) += mach-mx25_3ds.o
-obj-$(CONFIG_MACH_EUKREA_CPUIMX25) += mach-eukrea_cpuimx25.o
+obj-$(CONFIG_MACH_EUKREA_CPUIMX25SD) += mach-eukrea_cpuimx25.o
 obj-$(CONFIG_MACH_EUKREA_MBIMXSD25_BASEBOARD) += eukrea_mbimxsd25-baseboard.o
 
 # i.MX27 based machines
@@ -57,7 +57,7 @@ obj-$(CONFIG_MACH_BUG) += mach-bug.o
 # i.MX35 based machines
 obj-$(CONFIG_MACH_PCM043) += mach-pcm043.o
 obj-$(CONFIG_MACH_MX35_3DS) += mach-mx35_3ds.o
-obj-$(CONFIG_MACH_EUKREA_CPUIMX35) += mach-cpuimx35.o
+obj-$(CONFIG_MACH_EUKREA_CPUIMX35SD) += mach-cpuimx35.o
 obj-$(CONFIG_MACH_EUKREA_MBIMXSD35_BASEBOARD) += eukrea_mbimxsd35-baseboard.o
 obj-$(CONFIG_MACH_VPR200) += mach-vpr200.o
 
diff --git a/arch/arm/mach-imx/clock-imx35.c b/arch/arm/mach-imx/clock-imx35.c
index 8116f119517d..ac8238caecb9 100644
--- a/arch/arm/mach-imx/clock-imx35.c
+++ b/arch/arm/mach-imx/clock-imx35.c
@@ -507,7 +507,7 @@ static struct clk_lookup lookups[] = {
 
 int __init mx35_clocks_init()
 {
-	unsigned int cgr2 = 3 << 26, cgr3 = 0;
+	unsigned int cgr2 = 3 << 26;
 
 #if defined(CONFIG_DEBUG_LL) && !defined(CONFIG_DEBUG_ICEDCC)
 	cgr2 |= 3 << 16;
@@ -521,6 +521,12 @@
 	__raw_writel((3 << 18), CCM_BASE + CCM_CGR0);
 	__raw_writel((3 << 2) | (3 << 4) | (3 << 6) | (3 << 8) | (3 << 16),
 			CCM_BASE + CCM_CGR1);
+	__raw_writel(cgr2, CCM_BASE + CCM_CGR2);
+	__raw_writel(0, CCM_BASE + CCM_CGR3);
+
+	clk_enable(&iim_clk);
+	imx_print_silicon_rev("i.MX35", mx35_revision());
+	clk_disable(&iim_clk);
 
 	/*
 	 * Check if we came up in internal boot mode. If yes, we need some
@@ -529,17 +535,11 @@
 	 */
 	if (!(__raw_readl(CCM_BASE + CCM_RCSR) & (3 << 10))) {
 		/* Additionally turn on UART1, SCC, and IIM clocks */
-		cgr2 |= 3 << 16 | 3 << 4;
-		cgr3 |= 3 << 2;
+		clk_enable(&iim_clk);
+		clk_enable(&uart1_clk);
+		clk_enable(&scc_clk);
 	}
 
-	__raw_writel(cgr2, CCM_BASE + CCM_CGR2);
-	__raw_writel(cgr3, CCM_BASE + CCM_CGR3);
-
-	clk_enable(&iim_clk);
-	imx_print_silicon_rev("i.MX35", mx35_revision());
-	clk_disable(&iim_clk);
-
 #ifdef CONFIG_MXC_USE_EPIT
 	epit_timer_init(&epit1_clk,
 			MX35_IO_ADDRESS(MX35_EPIT1_BASE_ADDR), MX35_INT_EPIT1);
diff --git a/arch/arm/mach-imx/mach-cpuimx35.c b/arch/arm/mach-imx/mach-cpuimx35.c
index 66af2e8f7e57..362aae780601 100644
--- a/arch/arm/mach-imx/mach-cpuimx35.c
+++ b/arch/arm/mach-imx/mach-cpuimx35.c
@@ -53,12 +53,18 @@ static const struct imxi2c_platform_data
 	.bitrate = 100000,
 };
 
+#define TSC2007_IRQGPIO IMX_GPIO_NR(3, 2)
+static int tsc2007_get_pendown_state(void)
+{
+	return !gpio_get_value(TSC2007_IRQGPIO);
+}
+
 static struct tsc2007_platform_data tsc2007_info = {
 	.model			= 2007,
 	.x_plate_ohms		= 180,
+	.get_pendown_state	= tsc2007_get_pendown_state,
 };
 
-#define TSC2007_IRQGPIO IMX_GPIO_NR(3, 2)
 static struct i2c_board_info eukrea_cpuimx35_i2c_devices[] = {
 	{
 		I2C_BOARD_INFO("pcf8563", 0x51),
diff --git a/arch/arm/mach-omap2/omap_hwmod_3xxx_data.c b/arch/arm/mach-omap2/omap_hwmod_3xxx_data.c
index 7f8915ad5099..eef43e2e163e 100644
--- a/arch/arm/mach-omap2/omap_hwmod_3xxx_data.c
+++ b/arch/arm/mach-omap2/omap_hwmod_3xxx_data.c
@@ -3247,18 +3247,14 @@ static __initdata struct omap_hwmod *omap3xxx_hwmods[] = {
 
 /* 3430ES1-only hwmods */
 static __initdata struct omap_hwmod *omap3430es1_hwmods[] = {
-	&omap3xxx_iva_hwmod,
 	&omap3430es1_dss_core_hwmod,
-	&omap3xxx_mailbox_hwmod,
 	NULL
 };
 
 /* 3430ES2+-only hwmods */
 static __initdata struct omap_hwmod *omap3430es2plus_hwmods[] = {
-	&omap3xxx_iva_hwmod,
 	&omap3xxx_dss_core_hwmod,
 	&omap3xxx_usbhsotg_hwmod,
-	&omap3xxx_mailbox_hwmod,
 	NULL
 };
 
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index fbdd12ea3a58..7c38474e533a 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -32,6 +32,7 @@
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
+#include <asm/memblock.h>
 
 #include "mm.h"
 
@@ -332,7 +333,6 @@ void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
 
 	sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL);
 
-	memblock_init();
 	for (i = 0; i < mi->nr_banks; i++)
 		memblock_add(mi->bank[i].start, mi->bank[i].size);
 
@@ -371,7 +371,7 @@ void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
 	if (mdesc->reserve)
 		mdesc->reserve();
 
-	memblock_analyze();
+	memblock_allow_resize();
 	memblock_dump_all();
 }
 
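The memblock calls removed here recur in the microblaze, openrisc, and powerpc hunks below: memblock_init() goes away because memblock's region arrays are now statically initialized, and memblock_analyze() is reduced to memblock_allow_resize(), which only unlocks dynamic growth of those arrays. A sketch of the resulting bring-up sequence, with made-up addresses:

	/* Early memory registration after the memblock API cleanup (sketch). */
	static void __init sketch_memblock_bringup(void)
	{
		/* no memblock_init(): the region arrays start out valid */
		memblock_add(0x80000000, SZ_256M);	/* hypothetical RAM bank */
		memblock_reserve(0x80008000, SZ_4M);	/* hypothetical kernel image */

		memblock_allow_resize();		/* was memblock_analyze() */
		memblock_dump_all();
	}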
diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S
index 2c559ac38142..e70a73731eaa 100644
--- a/arch/arm/mm/proc-v7.S
+++ b/arch/arm/mm/proc-v7.S
@@ -363,11 +363,13 @@ __v7_setup:
 	orreq	r10, r10, #1 << 6		@ set bit #6
 	mcreq	p15, 0, r10, c15, c0, 1		@ write diagnostic register
 #endif
-#ifdef CONFIG_ARM_ERRATA_751472
-	cmp	r6, #0x30			@ present prior to r3p0
+#if defined(CONFIG_ARM_ERRATA_751472) && defined(CONFIG_SMP)
+	ALT_SMP(cmp r6, #0x30)			@ present prior to r3p0
+	ALT_UP_B(1f)
 	mrclt	p15, 0, r10, c15, c0, 1		@ read diagnostic register
 	orrlt	r10, r10, #1 << 11		@ set bit #11
 	mcrlt	p15, 0, r10, c15, c0, 1		@ write diagnostic register
+1:
 #endif
 
 3:	mov	r10, #0
diff --git a/arch/arm/oprofile/common.c b/arch/arm/oprofile/common.c
index c074e66ad224..4e0a371630b3 100644
--- a/arch/arm/oprofile/common.c
+++ b/arch/arm/oprofile/common.c
@@ -116,7 +116,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
 	return oprofile_perf_init(ops);
 }
 
-void __exit oprofile_arch_exit(void)
+void oprofile_arch_exit(void)
 {
 	oprofile_perf_exit();
 }
diff --git a/arch/arm/plat-mxc/cpufreq.c b/arch/arm/plat-mxc/cpufreq.c
index adbff706ef6f..73db34bf588a 100644
--- a/arch/arm/plat-mxc/cpufreq.c
+++ b/arch/arm/plat-mxc/cpufreq.c
@@ -98,7 +98,7 @@ static int mxc_set_target(struct cpufreq_policy *policy,
 	return ret;
 }
 
-static int __init mxc_cpufreq_init(struct cpufreq_policy *policy)
+static int mxc_cpufreq_init(struct cpufreq_policy *policy)
 {
 	int ret;
 	int i;
diff --git a/arch/arm/plat-mxc/include/mach/uncompress.h b/arch/arm/plat-mxc/include/mach/uncompress.h
index 88fd40452567..477971b00930 100644
--- a/arch/arm/plat-mxc/include/mach/uncompress.h
+++ b/arch/arm/plat-mxc/include/mach/uncompress.h
@@ -98,6 +98,7 @@ static __inline__ void __arch_decomp_setup(unsigned long arch_id)
 	case MACH_TYPE_PCM043:
 	case MACH_TYPE_LILLY1131:
 	case MACH_TYPE_VPR200:
+	case MACH_TYPE_EUKREA_CPUIMX35SD:
 		uart_base = MX3X_UART1_BASE_ADDR;
 		break;
 	case MACH_TYPE_MAGX_ZN5:
diff --git a/arch/arm/plat-mxc/pwm.c b/arch/arm/plat-mxc/pwm.c
index 845de59f07ed..e032717f7d02 100644
--- a/arch/arm/plat-mxc/pwm.c
+++ b/arch/arm/plat-mxc/pwm.c
@@ -77,6 +77,15 @@ int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns)
 		do_div(c, period_ns);
 		duty_cycles = c;
 
+		/*
+		 * according to imx pwm RM, the real period value should be
+		 * PERIOD value in PWMPR plus 2.
+		 */
+		if (period_cycles > 2)
+			period_cycles -= 2;
+		else
+			period_cycles = 0;
+
 		writel(duty_cycles, pwm->mmio_base + MX3_PWMSAR);
 		writel(period_cycles, pwm->mmio_base + MX3_PWMPR);
 
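The inserted block corrects for the counter behaviour named in the quoted comment: the i.MX PWM emits PERIOD + 2 clocks for a value PERIOD written to PWMPR, so the computed count must be biased by two (and clamped at zero) before the register write; a requested 1000-clock period, for instance, is written as 998. The adjustment in isolation:

	/* Bias a computed cycle count for the i.MX PWMPR register (sketch). */
	static unsigned long imx_pwmpr_value(unsigned long period_cycles)
	{
		return period_cycles > 2 ? period_cycles - 2 : 0;
	}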
diff --git a/arch/arm/plat-orion/gpio.c b/arch/arm/plat-orion/gpio.c
index 41ab97ebe4cf..10d160888133 100644
--- a/arch/arm/plat-orion/gpio.c
+++ b/arch/arm/plat-orion/gpio.c
@@ -384,12 +384,16 @@ void __init orion_gpio_init(int gpio_base, int ngpio,
 	struct orion_gpio_chip *ochip;
 	struct irq_chip_generic *gc;
 	struct irq_chip_type *ct;
+	char gc_label[16];
 
 	if (orion_gpio_chip_count == ARRAY_SIZE(orion_gpio_chips))
 		return;
 
+	snprintf(gc_label, sizeof(gc_label), "orion_gpio%d",
+		orion_gpio_chip_count);
+
 	ochip = orion_gpio_chips + orion_gpio_chip_count;
-	ochip->chip.label = "orion_gpio";
+	ochip->chip.label = kstrdup(gc_label, GFP_KERNEL);
 	ochip->chip.request = orion_gpio_request;
 	ochip->chip.direction_input = orion_gpio_direction_input;
 	ochip->chip.get = orion_gpio_get;
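Since gpio_chip stores only the label pointer, every Orion chip previously reported the identical string "orion_gpio"; the hunk builds a per-instance name in a stack buffer and hands the chip a heap copy, because the buffer dies with the function. The idiom as a fragment (chip_index stands in for orion_gpio_chip_count):

	/* Unique, heap-owned label per chip instance (sketch). */
	char buf[16];

	snprintf(buf, sizeof(buf), "orion_gpio%d", chip_index);
	chip->label = kstrdup(buf, GFP_KERNEL);	/* must outlive this frame */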
diff --git a/arch/arm/plat-samsung/include/plat/cpu-freq-core.h b/arch/arm/plat-samsung/include/plat/cpu-freq-core.h
index dac4760c0f0a..95509d8eb140 100644
--- a/arch/arm/plat-samsung/include/plat/cpu-freq-core.h
+++ b/arch/arm/plat-samsung/include/plat/cpu-freq-core.h
@@ -202,14 +202,6 @@ extern int s3c_plltab_register(struct cpufreq_frequency_table *plls,
 extern struct s3c_cpufreq_config *s3c_cpufreq_getconfig(void);
 extern struct s3c_iotimings *s3c_cpufreq_getiotimings(void);
 
-extern void s3c2410_iotiming_debugfs(struct seq_file *seq,
-				     struct s3c_cpufreq_config *cfg,
-				     union s3c_iobank *iob);
-
-extern void s3c2412_iotiming_debugfs(struct seq_file *seq,
-				     struct s3c_cpufreq_config *cfg,
-				     union s3c_iobank *iob);
-
 #ifdef CONFIG_CPU_FREQ_S3C24XX_DEBUGFS
 #define s3c_cpufreq_debugfs_call(x) x
 #else
@@ -226,6 +218,10 @@ extern void s3c2410_cpufreq_setrefresh(struct s3c_cpufreq_config *cfg);
 extern void s3c2410_set_fvco(struct s3c_cpufreq_config *cfg);
 
 #ifdef CONFIG_S3C2410_IOTIMING
+extern void s3c2410_iotiming_debugfs(struct seq_file *seq,
+				     struct s3c_cpufreq_config *cfg,
+				     union s3c_iobank *iob);
+
 extern int s3c2410_iotiming_calc(struct s3c_cpufreq_config *cfg,
 				 struct s3c_iotimings *iot);
 
@@ -235,6 +231,7 @@ extern int s3c2410_iotiming_get(struct s3c_cpufreq_config *cfg,
 extern void s3c2410_iotiming_set(struct s3c_cpufreq_config *cfg,
 				 struct s3c_iotimings *iot);
 #else
+#define s3c2410_iotiming_debugfs NULL
 #define s3c2410_iotiming_calc NULL
 #define s3c2410_iotiming_get NULL
 #define s3c2410_iotiming_set NULL
@@ -242,8 +239,10 @@ extern void s3c2410_iotiming_set(struct s3c_cpufreq_config *cfg,
 
 /* S3C2412 compatible routines */
 
-extern int s3c2412_iotiming_get(struct s3c_cpufreq_config *cfg,
-				struct s3c_iotimings *timings);
+#ifdef CONFIG_S3C2412_IOTIMING
+extern void s3c2412_iotiming_debugfs(struct seq_file *seq,
+				     struct s3c_cpufreq_config *cfg,
+				     union s3c_iobank *iob);
 
 extern int s3c2412_iotiming_get(struct s3c_cpufreq_config *cfg,
 				struct s3c_iotimings *timings);
@@ -253,6 +252,12 @@ extern int s3c2412_iotiming_calc(struct s3c_cpufreq_config *cfg,
 
 extern void s3c2412_iotiming_set(struct s3c_cpufreq_config *cfg,
 				 struct s3c_iotimings *iot);
+#else
+#define s3c2412_iotiming_debugfs NULL
+#define s3c2412_iotiming_calc NULL
+#define s3c2412_iotiming_get NULL
+#define s3c2412_iotiming_set NULL
+#endif /* CONFIG_S3C2412_IOTIMING */
 
 #ifdef CONFIG_CPU_FREQ_S3C24XX_DEBUG
 #define s3c_freq_dbg(x...) printk(KERN_INFO x)
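After this reshuffle the header applies one idiom consistently: each iotiming helper is declared under its CONFIG_* guard and defined to NULL otherwise, so users can store or test the symbol without their own #ifdef blocks. The idiom in miniature, with illustrative names:

	#ifdef CONFIG_FEATURE_X
	extern void feature_x_debugfs(struct seq_file *seq);
	#else
	#define feature_x_debugfs NULL		/* stub when the feature is off */
	#endif

	/* caller side: assign to a hook pointer and test at run time */
	void (*debugfs_hook)(struct seq_file *) = feature_x_debugfs;

	if (debugfs_hook)
		debugfs_hook(seq);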
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index ef5a2a08fcca..ea3395750324 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -34,10 +34,12 @@ void cpu_idle(void)
 {
 	/* endless idle loop with no priority at all */
 	while (1) {
-		tick_nohz_stop_sched_tick(1);
+		tick_nohz_idle_enter();
+		rcu_idle_enter();
 		while (!need_resched())
 			cpu_idle_sleep();
-		tick_nohz_restart_sched_tick();
+		rcu_idle_exit();
+		tick_nohz_idle_exit();
 		preempt_enable_no_resched();
 		schedule();
 		preempt_disable();
diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c
index 6a80a9e9fc4a..8dd0416673cb 100644
--- a/arch/blackfin/kernel/process.c
+++ b/arch/blackfin/kernel/process.c
@@ -88,10 +88,12 @@ void cpu_idle(void)
 #endif
 		if (!idle)
 			idle = default_idle;
-		tick_nohz_stop_sched_tick(1);
+		tick_nohz_idle_enter();
+		rcu_idle_enter();
 		while (!need_resched())
 			idle();
-		tick_nohz_restart_sched_tick();
+		rcu_idle_exit();
+		tick_nohz_idle_exit();
 		preempt_enable_no_resched();
 		schedule();
 		preempt_disable();
diff --git a/arch/cris/arch-v32/kernel/time.c b/arch/cris/arch-v32/kernel/time.c
index bb978ede8985..6773fc83a670 100644
--- a/arch/cris/arch-v32/kernel/time.c
+++ b/arch/cris/arch-v32/kernel/time.c
@@ -47,14 +47,12 @@ static struct clocksource cont_rotime = {
 	.rating	= 300,
 	.read	= read_cont_rotime,
 	.mask	= CLOCKSOURCE_MASK(32),
-	.shift	= 10,
 	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
 static int __init etrax_init_cont_rotime(void)
 {
-	cont_rotime.mult = clocksource_khz2mult(100000, cont_rotime.shift);
-	clocksource_register(&cont_rotime);
+	clocksource_register_khz(&cont_rotime, 100000);
 	return 0;
 }
 arch_initcall(etrax_init_cont_rotime);
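Every clocksource in this series gets the same conversion: the hand-maintained .shift/.mult pair is deleted and clocksource_register_khz() or clocksource_register_hz() derives the factors from the source frequency at registration time, as the m68k and parisc hunks below repeat. The recipe for a hypothetical 100 MHz counter:

	static struct clocksource example_cs = {
		.name	= "example",
		.rating	= 300,
		.read	= example_read,		/* hypothetical cycle-read hook */
		.mask	= CLOCKSOURCE_MASK(32),
		.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	};

	static int __init example_cs_init(void)
	{
		return clocksource_register_khz(&example_cs, 100000);
	}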
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 27489b6dd533..3b7a7c483785 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -23,6 +23,9 @@ config IA64
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_DMA_API_DEBUG
 	select HAVE_GENERIC_HARDIRQS
+	select HAVE_MEMBLOCK
+	select HAVE_MEMBLOCK_NODE_MAP
+	select ARCH_DISCARD_MEMBLOCK
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PENDING_IRQ if SMP
 	select IRQ_PER_CPU
@@ -474,9 +477,6 @@ config NODES_SHIFT
 	  MAX_NUMNODES will be 2^(This value).
 	  If in doubt, use the default.
 
-config ARCH_POPULATES_NODE_MAP
-	def_bool y
-
 # VIRTUAL_MEM_MAP and FLAT_NODE_MEM_MAP are functionally equivalent.
 # VIRTUAL_MEM_MAP has been retained for historical reasons.
 config VIRTUAL_MEM_MAP
diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h
index 6073b187528a..3deac956d325 100644
--- a/arch/ia64/include/asm/cputime.h
+++ b/arch/ia64/include/asm/cputime.h
@@ -26,59 +26,53 @@
 #include <linux/jiffies.h>
 #include <asm/processor.h>
 
-typedef u64 cputime_t;
-typedef u64 cputime64_t;
+typedef u64 __nocast cputime_t;
+typedef u64 __nocast cputime64_t;
 
-#define cputime_zero			((cputime_t)0)
 #define cputime_one_jiffy		jiffies_to_cputime(1)
-#define cputime_max			((~((cputime_t)0) >> 1) - 1)
-#define cputime_add(__a, __b)		((__a) + (__b))
-#define cputime_sub(__a, __b)		((__a) - (__b))
-#define cputime_div(__a, __n)		((__a) / (__n))
-#define cputime_halve(__a)		((__a) >> 1)
-#define cputime_eq(__a, __b)		((__a) == (__b))
-#define cputime_gt(__a, __b)		((__a) > (__b))
-#define cputime_ge(__a, __b)		((__a) >= (__b))
-#define cputime_lt(__a, __b)		((__a) < (__b))
-#define cputime_le(__a, __b)		((__a) <= (__b))
-
-#define cputime64_zero			((cputime64_t)0)
-#define cputime64_add(__a, __b)		((__a) + (__b))
-#define cputime64_sub(__a, __b)		((__a) - (__b))
-#define cputime_to_cputime64(__ct)	(__ct)
 
 /*
  * Convert cputime <-> jiffies (HZ)
  */
-#define cputime_to_jiffies(__ct)	((__ct) / (NSEC_PER_SEC / HZ))
-#define jiffies_to_cputime(__jif)	((__jif) * (NSEC_PER_SEC / HZ))
-#define cputime64_to_jiffies64(__ct)	((__ct) / (NSEC_PER_SEC / HZ))
-#define jiffies64_to_cputime64(__jif)	((__jif) * (NSEC_PER_SEC / HZ))
+#define cputime_to_jiffies(__ct)	\
+	((__force u64)(__ct) / (NSEC_PER_SEC / HZ))
+#define jiffies_to_cputime(__jif)	\
+	(__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ))
+#define cputime64_to_jiffies64(__ct)	\
+	((__force u64)(__ct) / (NSEC_PER_SEC / HZ))
+#define jiffies64_to_cputime64(__jif)	\
+	(__force cputime64_t)((__jif) * (NSEC_PER_SEC / HZ))
 
 /*
  * Convert cputime <-> microseconds
  */
-#define cputime_to_usecs(__ct)		((__ct) / NSEC_PER_USEC)
-#define usecs_to_cputime(__usecs)	((__usecs) * NSEC_PER_USEC)
+#define cputime_to_usecs(__ct)		\
+	((__force u64)(__ct) / NSEC_PER_USEC)
+#define usecs_to_cputime(__usecs)	\
+	(__force cputime_t)((__usecs) * NSEC_PER_USEC)
+#define usecs_to_cputime64(__usecs)	\
+	(__force cputime64_t)((__usecs) * NSEC_PER_USEC)
 
 /*
  * Convert cputime <-> seconds
  */
-#define cputime_to_secs(__ct)		((__ct) / NSEC_PER_SEC)
-#define secs_to_cputime(__secs)		((__secs) * NSEC_PER_SEC)
+#define cputime_to_secs(__ct)		\
+	((__force u64)(__ct) / NSEC_PER_SEC)
+#define secs_to_cputime(__secs)		\
+	(__force cputime_t)((__secs) * NSEC_PER_SEC)
 
 /*
  * Convert cputime <-> timespec (nsec)
  */
 static inline cputime_t timespec_to_cputime(const struct timespec *val)
 {
-	cputime_t ret = val->tv_sec * NSEC_PER_SEC;
-	return (ret + val->tv_nsec);
+	u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_nsec;
+	return (__force cputime_t) ret;
 }
 static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val)
 {
-	val->tv_sec  = ct / NSEC_PER_SEC;
-	val->tv_nsec = ct % NSEC_PER_SEC;
+	val->tv_sec  = (__force u64) ct / NSEC_PER_SEC;
+	val->tv_nsec = (__force u64) ct % NSEC_PER_SEC;
 }
 
 /*
@@ -86,25 +80,28 @@ static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val)
  */
 static inline cputime_t timeval_to_cputime(struct timeval *val)
 {
-	cputime_t ret = val->tv_sec * NSEC_PER_SEC;
-	return (ret + val->tv_usec * NSEC_PER_USEC);
+	u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_usec * NSEC_PER_USEC;
+	return (__force cputime_t) ret;
 }
 static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val)
 {
-	val->tv_sec = ct / NSEC_PER_SEC;
-	val->tv_usec = (ct % NSEC_PER_SEC) / NSEC_PER_USEC;
+	val->tv_sec = (__force u64) ct / NSEC_PER_SEC;
+	val->tv_usec = ((__force u64) ct % NSEC_PER_SEC) / NSEC_PER_USEC;
 }
 
 /*
  * Convert cputime <-> clock (USER_HZ)
  */
-#define cputime_to_clock_t(__ct)	((__ct) / (NSEC_PER_SEC / USER_HZ))
-#define clock_t_to_cputime(__x)		((__x) * (NSEC_PER_SEC / USER_HZ))
+#define cputime_to_clock_t(__ct)	\
+	((__force u64)(__ct) / (NSEC_PER_SEC / USER_HZ))
+#define clock_t_to_cputime(__x)		\
+	(__force cputime_t)((__x) * (NSEC_PER_SEC / USER_HZ))
 
 /*
  * Convert cputime64 to clock.
  */
-#define cputime64_to_clock_t(__ct)	cputime_to_clock_t((cputime_t)__ct)
+#define cputime64_to_clock_t(__ct)	\
+	cputime_to_clock_t((__force cputime_t)__ct)
 
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING */
 #endif /* __IA64_CPUTIME_H */
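The typedef u64 __nocast is what makes deleting the arithmetic-helper macros safe: to the compiler cputime_t remains a u64, but sparse now warns on any implicit conversion between cputime_t and plain integers, so every deliberate crossing must carry __force, as the hunks above show throughout. How the pair behaves under sparse, with made-up names:

	typedef u64 __nocast sketch_cputime_t;

	static inline u64 sketch_to_ns(sketch_cputime_t ct)
	{
		return (__force u64) ct;		/* explicit: sparse-clean */
	}

	static inline sketch_cputime_t sketch_from_ns(u64 ns)
	{
		/* a bare "return ns;" would draw a sparse warning */
		return (__force sketch_cputime_t) ns;
	}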
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index f114a3b14c6a..1516d1dc11fd 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -16,6 +16,7 @@
  */
 #include <linux/bootmem.h>
 #include <linux/efi.h>
+#include <linux/memblock.h>
 #include <linux/mm.h>
 #include <linux/nmi.h>
 #include <linux/swap.h>
@@ -348,7 +349,7 @@ paging_init (void)
 		printk("Virtual mem_map starts at 0x%p\n", mem_map);
 	}
 #else /* !CONFIG_VIRTUAL_MEM_MAP */
-	add_active_range(0, 0, max_low_pfn);
+	memblock_add_node(0, PFN_PHYS(max_low_pfn), 0);
 	free_area_init_nodes(max_zone_pfns);
 #endif /* !CONFIG_VIRTUAL_MEM_MAP */
 	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 00cb0e26c64e..13df239dbed1 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -10,6 +10,7 @@
 #include <linux/bootmem.h>
 #include <linux/efi.h>
 #include <linux/elf.h>
+#include <linux/memblock.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/module.h>
@@ -557,8 +558,7 @@ int __init register_active_ranges(u64 start, u64 len, int nid)
 #endif
 
 	if (start < end)
-		add_active_range(nid, __pa(start) >> PAGE_SHIFT,
-			__pa(end) >> PAGE_SHIFT);
+		memblock_add_node(__pa(start), end - start, nid);
 	return 0;
 }
 
diff --git a/arch/m68k/platform/68328/timers.c b/arch/m68k/platform/68328/timers.c
index 309f725995bf..f2678866067b 100644
--- a/arch/m68k/platform/68328/timers.c
+++ b/arch/m68k/platform/68328/timers.c
@@ -93,7 +93,6 @@ static struct clocksource m68328_clk = {
 	.name	= "timer",
 	.rating	= 250,
 	.read	= m68328_read_clk,
-	.shift	= 20,
 	.mask	= CLOCKSOURCE_MASK(32),
 	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
 };
@@ -115,8 +114,7 @@ void hw_timer_init(void)
 
 	/* Enable timer 1 */
 	TCTL |= TCTL_TEN;
-	m68328_clk.mult = clocksource_hz2mult(TICKS_PER_JIFFY*HZ, m68328_clk.shift);
-	clocksource_register(&m68328_clk);
+	clocksource_register_hz(&m68328_clk, TICKS_PER_JIFFY*HZ);
 }
 
 /***************************************************************************/
diff --git a/arch/m68k/platform/coldfire/dma_timer.c b/arch/m68k/platform/coldfire/dma_timer.c
index a5f562823d7a..235ad57c4707 100644
--- a/arch/m68k/platform/coldfire/dma_timer.c
+++ b/arch/m68k/platform/coldfire/dma_timer.c
@@ -44,7 +44,6 @@ static struct clocksource clocksource_cf_dt = {
 	.rating		= 200,
 	.read		= cf_dt_get_cycles,
 	.mask		= CLOCKSOURCE_MASK(32),
-	.shift		= 20,
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
@@ -60,9 +59,7 @@ static int __init init_cf_dt_clocksource(void)
 	__raw_writeb(0x00, DTER0);
 	__raw_writel(0x00000000, DTRR0);
 	__raw_writew(DMA_DTMR_CLK_DIV_16 | DMA_DTMR_ENABLE, DTMR0);
-	clocksource_cf_dt.mult = clocksource_hz2mult(DMA_FREQ,
-						     clocksource_cf_dt.shift);
-	return clocksource_register(&clocksource_cf_dt);
+	return clocksource_register_hz(&clocksource_cf_dt, DMA_FREQ);
 }
 
 arch_initcall(init_cf_dt_clocksource);
diff --git a/arch/m68k/platform/coldfire/pit.c b/arch/m68k/platform/coldfire/pit.c
index c2b980926bec..02663d25822d 100644
--- a/arch/m68k/platform/coldfire/pit.c
+++ b/arch/m68k/platform/coldfire/pit.c
@@ -144,7 +144,6 @@ static struct clocksource pit_clk = {
 	.name	= "pit",
 	.rating	= 100,
 	.read	= pit_read_clk,
-	.shift	= 20,
 	.mask	= CLOCKSOURCE_MASK(32),
 };
 
@@ -162,8 +161,7 @@ void hw_timer_init(void)
 
 	setup_irq(MCFINT_VECBASE + MCFINT_PIT1, &pit_irq);
 
-	pit_clk.mult = clocksource_hz2mult(FREQ, pit_clk.shift);
-	clocksource_register(&pit_clk);
+	clocksource_register_hz(&pit_clk, FREQ);
 }
 
 /***************************************************************************/
diff --git a/arch/m68k/platform/coldfire/sltimers.c b/arch/m68k/platform/coldfire/sltimers.c
index 6a85daf9a7fd..b7f822b552bb 100644
--- a/arch/m68k/platform/coldfire/sltimers.c
+++ b/arch/m68k/platform/coldfire/sltimers.c
@@ -114,7 +114,6 @@ static struct clocksource mcfslt_clk = {
 	.name	= "slt",
 	.rating	= 250,
 	.read	= mcfslt_read_clk,
-	.shift	= 20,
 	.mask	= CLOCKSOURCE_MASK(32),
 	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
 };
@@ -136,8 +135,7 @@ void hw_timer_init(void)
 
 	setup_irq(MCF_IRQ_TIMER, &mcfslt_timer_irq);
 
-	mcfslt_clk.mult = clocksource_hz2mult(MCF_BUSCLK, mcfslt_clk.shift);
-	clocksource_register(&mcfslt_clk);
+	clocksource_register_hz(&mcfslt_clk, MCF_BUSCLK);
 
 #ifdef CONFIG_HIGHPROFILE
 	mcfslt_profile_init();
diff --git a/arch/m68k/platform/coldfire/timers.c b/arch/m68k/platform/coldfire/timers.c
index 60242f65fea9..0d90da32fcdb 100644
--- a/arch/m68k/platform/coldfire/timers.c
+++ b/arch/m68k/platform/coldfire/timers.c
@@ -88,7 +88,6 @@ static struct clocksource mcftmr_clk = {
 	.name	= "tmr",
 	.rating	= 250,
 	.read	= mcftmr_read_clk,
-	.shift	= 20,
 	.mask	= CLOCKSOURCE_MASK(32),
 	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
 };
@@ -109,8 +108,7 @@ void hw_timer_init(void)
 	__raw_writew(MCFTIMER_TMR_ENORI | MCFTIMER_TMR_CLK16 |
 		MCFTIMER_TMR_RESTART | MCFTIMER_TMR_ENABLE, TA(MCFTIMER_TMR));
 
-	mcftmr_clk.mult = clocksource_hz2mult(FREQ, mcftmr_clk.shift);
-	clocksource_register(&mcftmr_clk);
+	clocksource_register_hz(&mcftmr_clk, FREQ);
 
 	setup_irq(MCF_IRQ_TIMER, &mcftmr_timer_irq);
 
diff --git a/arch/microblaze/include/asm/memblock.h b/arch/microblaze/include/asm/memblock.h
deleted file mode 100644
index 20a8e257c77f..000000000000
--- a/arch/microblaze/include/asm/memblock.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (C) 2008 Michal Simek <monstr@monstr.eu>
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-
-#ifndef _ASM_MICROBLAZE_MEMBLOCK_H
-#define _ASM_MICROBLAZE_MEMBLOCK_H
-
-#endif /* _ASM_MICROBLAZE_MEMBLOCK_H */
-
-
diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c
index 95cc295976a7..7dcb5bfffb75 100644
--- a/arch/microblaze/kernel/process.c
+++ b/arch/microblaze/kernel/process.c
@@ -103,10 +103,12 @@ void cpu_idle(void)
 		if (!idle)
 			idle = default_idle;
 
-		tick_nohz_stop_sched_tick(1);
+		tick_nohz_idle_enter();
+		rcu_idle_enter();
 		while (!need_resched())
 			idle();
-		tick_nohz_restart_sched_tick();
+		rcu_idle_exit();
+		tick_nohz_idle_exit();
 
 		preempt_enable_no_resched();
 		schedule();
diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c
index 977484add216..80d314e81901 100644
--- a/arch/microblaze/kernel/prom.c
+++ b/arch/microblaze/kernel/prom.c
@@ -122,7 +122,6 @@ void __init early_init_devtree(void *params)
 	of_scan_flat_dt(early_init_dt_scan_chosen, cmd_line);
 
 	/* Scan memory nodes and rebuild MEMBLOCKs */
-	memblock_init();
 	of_scan_flat_dt(early_init_dt_scan_root, NULL);
 	of_scan_flat_dt(early_init_dt_scan_memory, NULL);
 
@@ -130,7 +129,7 @@
 	strlcpy(boot_command_line, cmd_line, COMMAND_LINE_SIZE);
 	parse_early_param();
 
-	memblock_analyze();
+	memblock_allow_resize();
 
 	pr_debug("Phys. mem: %lx\n", (unsigned long) memblock_phys_mem_size());
 
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index d46f1da18a3c..9c652eb68aaa 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -25,6 +25,9 @@ config MIPS
 	select GENERIC_IRQ_SHOW
 	select HAVE_ARCH_JUMP_LABEL
 	select IRQ_FORCED_THREADING
+	select HAVE_MEMBLOCK
+	select HAVE_MEMBLOCK_NODE_MAP
+	select ARCH_DISCARD_MEMBLOCK
 
 menu "Machine selection"
 
@@ -2064,9 +2067,6 @@ config ARCH_DISCONTIGMEM_ENABLE
 	  or have huge holes in the physical address space for other reasons.
 	  See <file:Documentation/vm/numa> for more.
 
-config ARCH_POPULATES_NODE_MAP
-	def_bool y
-
 config ARCH_SPARSEMEM_ENABLE
 	bool
 	select SPARSEMEM_STATIC
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index c47f96e453c0..7955409051c4 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -56,7 +56,8 @@ void __noreturn cpu_idle(void)
 
 	/* endless idle loop with no priority at all */
 	while (1) {
-		tick_nohz_stop_sched_tick(1);
+		tick_nohz_idle_enter();
+		rcu_idle_enter();
 		while (!need_resched() && cpu_online(cpu)) {
 #ifdef CONFIG_MIPS_MT_SMTC
 			extern void smtc_idle_loop_hook(void);
@@ -77,7 +78,8 @@
 		     system_state == SYSTEM_BOOTING))
 			play_dead();
 #endif
-		tick_nohz_restart_sched_tick();
+		rcu_idle_exit();
+		tick_nohz_idle_exit();
 		preempt_enable_no_resched();
 		schedule();
 		preempt_disable();
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 84af26ab2212..b1cb8f87d7b4 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -14,6 +14,7 @@
 #include <linux/ioport.h>
 #include <linux/export.h>
 #include <linux/screen_info.h>
+#include <linux/memblock.h>
 #include <linux/bootmem.h>
 #include <linux/initrd.h>
 #include <linux/root_dev.h>
@@ -352,7 +353,7 @@ static void __init bootmem_init(void)
 			continue;
 #endif
 
-		add_active_range(0, start, end);
+		memblock_add_node(PFN_PHYS(start), PFN_PHYS(end - start), 0);
 	}
 
 	/*
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index bc1297109cc5..b105eca3c020 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -12,6 +12,7 @@
  */
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/memblock.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/module.h>
@@ -381,8 +382,8 @@ static void __init szmem(void)
 				continue;
 			}
 			num_physpages += slot_psize;
-			add_active_range(node, slot_getbasepfn(node, slot),
-					 slot_getbasepfn(node, slot) + slot_psize);
+			memblock_add_node(PFN_PHYS(slot_getbasepfn(node, slot)),
+					  PFN_PHYS(slot_psize), node);
 		}
 	}
 }
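add_active_range(nid, start_pfn, end_pfn) spoke in page frame numbers; its replacement memblock_add_node() takes a physical base and a size in bytes, which is why these hunks wrap the PFNs in PFN_PHYS() and convert the end into a length. The pattern in isolation:

	/* PFN-range registration rewritten for memblock_add_node() (sketch). */
	static void __init register_range(int nid, unsigned long start_pfn,
					  unsigned long end_pfn)
	{
		/* was: add_active_range(nid, start_pfn, end_pfn); */
		memblock_add_node(PFN_PHYS(start_pfn),
				  PFN_PHYS(end_pfn - start_pfn), nid);
	}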
diff --git a/arch/openrisc/include/asm/memblock.h b/arch/openrisc/include/asm/memblock.h
deleted file mode 100644
index bbe5a1c788cb..000000000000
--- a/arch/openrisc/include/asm/memblock.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * OpenRISC Linux
- *
- * Linux architectural port borrowing liberally from similar works of
- * others. All original copyrights apply as per the original source
- * declaration.
- *
- * OpenRISC implementation:
- * Copyright (C) 2003 Matjaz Breskvar <phoenix@bsemi.com>
- * Copyright (C) 2010-2011 Jonas Bonn <jonas@southpole.se>
- * et al.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#ifndef __ASM_OPENRISC_MEMBLOCK_H
-#define __ASM_OPENRISC_MEMBLOCK_H
-
-/* empty */
-
-#endif /* __ASM_OPENRISC_MEMBLOCK_H */
diff --git a/arch/openrisc/kernel/idle.c b/arch/openrisc/kernel/idle.c
index d5bc5f813e89..e5fc78877830 100644
--- a/arch/openrisc/kernel/idle.c
+++ b/arch/openrisc/kernel/idle.c
@@ -51,7 +51,8 @@ void cpu_idle(void)
 
 	/* endless idle loop with no priority at all */
 	while (1) {
-		tick_nohz_stop_sched_tick(1);
+		tick_nohz_idle_enter();
+		rcu_idle_enter();
 
 		while (!need_resched()) {
 			check_pgt_cache();
@@ -69,7 +70,8 @@
 			set_thread_flag(TIF_POLLING_NRFLAG);
 		}
 
-		tick_nohz_restart_sched_tick();
+		rcu_idle_exit();
+		tick_nohz_idle_exit();
 		preempt_enable_no_resched();
 		schedule();
 		preempt_disable();
diff --git a/arch/openrisc/kernel/prom.c b/arch/openrisc/kernel/prom.c
index 1bb58ba89afa..3d4478f6c942 100644
--- a/arch/openrisc/kernel/prom.c
+++ b/arch/openrisc/kernel/prom.c
@@ -76,14 +76,13 @@ void __init early_init_devtree(void *params)
 	of_scan_flat_dt(early_init_dt_scan_chosen, cmd_line);
 
 	/* Scan memory nodes and rebuild MEMBLOCKs */
-	memblock_init();
 	of_scan_flat_dt(early_init_dt_scan_root, NULL);
 	of_scan_flat_dt(early_init_dt_scan_memory, NULL);
 
 	/* Save command line for /proc/cmdline and then parse parameters */
 	strlcpy(boot_command_line, cmd_line, COMMAND_LINE_SIZE);
 
-	memblock_analyze();
+	memblock_allow_resize();
 
 	/* We must copy the flattend device tree from init memory to regular
 	 * memory because the device tree references the strings in it
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 45b7389d77aa..7c0774397b89 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -198,8 +198,6 @@ static struct clocksource clocksource_cr16 = {
 	.rating			= 300,
 	.read			= read_cr16,
 	.mask			= CLOCKSOURCE_MASK(BITS_PER_LONG),
-	.mult			= 0, /* to be set */
-	.shift			= 22,
 	.flags			= CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
@@ -270,7 +268,5 @@ void __init time_init(void)
 
 	/* register at clocksource framework */
 	current_cr16_khz = PAGE0->mem_10msec/10;	/* kHz */
-	clocksource_cr16.mult = clocksource_khz2mult(current_cr16_khz,
-						clocksource_cr16.shift);
-	clocksource_register(&clocksource_cr16);
+	clocksource_register_khz(&clocksource_cr16, current_cr16_khz);
 }
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 951e18f5335b..ead0bc68439d 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -117,6 +117,7 @@ config PPC
 	select HAVE_KRETPROBES
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_MEMBLOCK
+	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_DMA_ATTRS
 	select HAVE_DMA_API_DEBUG
 	select USE_GENERIC_SMP_HELPERS if SMP
@@ -421,9 +422,6 @@ config ARCH_SPARSEMEM_DEFAULT
 	def_bool y
 	depends on (SMP && PPC_PSERIES) || PPC_PS3
 
-config ARCH_POPULATES_NODE_MAP
-	def_bool y
-
 config SYS_SUPPORTS_HUGETLBFS
 	bool
 
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 1cf20bdfbeca..6ec1c380a4d6 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -29,25 +29,8 @@ static inline void setup_cputime_one_jiffy(void) { }
29#include <asm/time.h> 29#include <asm/time.h>
30#include <asm/param.h> 30#include <asm/param.h>
31 31
32typedef u64 cputime_t; 32typedef u64 __nocast cputime_t;
33typedef u64 cputime64_t; 33typedef u64 __nocast cputime64_t;
34
35#define cputime_zero ((cputime_t)0)
36#define cputime_max ((~((cputime_t)0) >> 1) - 1)
37#define cputime_add(__a, __b) ((__a) + (__b))
38#define cputime_sub(__a, __b) ((__a) - (__b))
39#define cputime_div(__a, __n) ((__a) / (__n))
40#define cputime_halve(__a) ((__a) >> 1)
41#define cputime_eq(__a, __b) ((__a) == (__b))
42#define cputime_gt(__a, __b) ((__a) > (__b))
43#define cputime_ge(__a, __b) ((__a) >= (__b))
44#define cputime_lt(__a, __b) ((__a) < (__b))
45#define cputime_le(__a, __b) ((__a) <= (__b))
46
47#define cputime64_zero ((cputime64_t)0)
48#define cputime64_add(__a, __b) ((__a) + (__b))
49#define cputime64_sub(__a, __b) ((__a) - (__b))
50#define cputime_to_cputime64(__ct) (__ct)
51 34
52#ifdef __KERNEL__ 35#ifdef __KERNEL__
53 36
@@ -65,7 +48,7 @@ DECLARE_PER_CPU(unsigned long, cputime_scaled_last_delta);
65 48
66static inline unsigned long cputime_to_jiffies(const cputime_t ct) 49static inline unsigned long cputime_to_jiffies(const cputime_t ct)
67{ 50{
68 return mulhdu(ct, __cputime_jiffies_factor); 51 return mulhdu((__force u64) ct, __cputime_jiffies_factor);
69} 52}
70 53
71/* Estimate the scaled cputime by scaling the real cputime based on 54/* Estimate the scaled cputime by scaling the real cputime based on
@@ -74,14 +57,15 @@ static inline cputime_t cputime_to_scaled(const cputime_t ct)
74{ 57{
75 if (cpu_has_feature(CPU_FTR_SPURR) && 58 if (cpu_has_feature(CPU_FTR_SPURR) &&
76 __get_cpu_var(cputime_last_delta)) 59 __get_cpu_var(cputime_last_delta))
77 return ct * __get_cpu_var(cputime_scaled_last_delta) / 60 return (__force u64) ct *
78 __get_cpu_var(cputime_last_delta); 61 __get_cpu_var(cputime_scaled_last_delta) /
62 __get_cpu_var(cputime_last_delta);
79 return ct; 63 return ct;
80} 64}
81 65
82static inline cputime_t jiffies_to_cputime(const unsigned long jif) 66static inline cputime_t jiffies_to_cputime(const unsigned long jif)
83{ 67{
84 cputime_t ct; 68 u64 ct;
85 unsigned long sec; 69 unsigned long sec;
86 70
87 /* have to be a little careful about overflow */ 71 /* have to be a little careful about overflow */
@@ -93,7 +77,7 @@ static inline cputime_t jiffies_to_cputime(const unsigned long jif)
93 } 77 }
94 if (sec) 78 if (sec)
95 ct += (cputime_t) sec * tb_ticks_per_sec; 79 ct += (cputime_t) sec * tb_ticks_per_sec;
96 return ct; 80 return (__force cputime_t) ct;
97} 81}
98 82
99static inline void setup_cputime_one_jiffy(void) 83static inline void setup_cputime_one_jiffy(void)
@@ -103,7 +87,7 @@ static inline void setup_cputime_one_jiffy(void)
103 87
104static inline cputime64_t jiffies64_to_cputime64(const u64 jif) 88static inline cputime64_t jiffies64_to_cputime64(const u64 jif)
105{ 89{
106 cputime_t ct; 90 u64 ct;
107 u64 sec; 91 u64 sec;
108 92
109 /* have to be a little careful about overflow */ 93 /* have to be a little careful about overflow */
@@ -114,13 +98,13 @@ static inline cputime64_t jiffies64_to_cputime64(const u64 jif)
114 do_div(ct, HZ); 98 do_div(ct, HZ);
115 } 99 }
116 if (sec) 100 if (sec)
117 ct += (cputime_t) sec * tb_ticks_per_sec; 101 ct += (u64) sec * tb_ticks_per_sec;
118 return ct; 102 return (__force cputime64_t) ct;
119} 103}
120 104
121static inline u64 cputime64_to_jiffies64(const cputime_t ct) 105static inline u64 cputime64_to_jiffies64(const cputime_t ct)
122{ 106{
123 return mulhdu(ct, __cputime_jiffies_factor); 107 return mulhdu((__force u64) ct, __cputime_jiffies_factor);
124} 108}
125 109
126/* 110/*
@@ -130,12 +114,12 @@ extern u64 __cputime_msec_factor;
130 114
131static inline unsigned long cputime_to_usecs(const cputime_t ct) 115static inline unsigned long cputime_to_usecs(const cputime_t ct)
132{ 116{
133 return mulhdu(ct, __cputime_msec_factor) * USEC_PER_MSEC; 117 return mulhdu((__force u64) ct, __cputime_msec_factor) * USEC_PER_MSEC;
134} 118}
135 119
136static inline cputime_t usecs_to_cputime(const unsigned long us) 120static inline cputime_t usecs_to_cputime(const unsigned long us)
137{ 121{
138 cputime_t ct; 122 u64 ct;
139 unsigned long sec; 123 unsigned long sec;
140 124
141 /* have to be a little careful about overflow */ 125 /* have to be a little careful about overflow */
@@ -147,9 +131,11 @@ static inline cputime_t usecs_to_cputime(const unsigned long us)
147 } 131 }
148 if (sec) 132 if (sec)
149 ct += (cputime_t) sec * tb_ticks_per_sec; 133 ct += (cputime_t) sec * tb_ticks_per_sec;
150 return ct; 134 return (__force cputime_t) ct;
151} 135}
152 136
137#define usecs_to_cputime64(us) usecs_to_cputime(us)
138
153/* 139/*
154 * Convert cputime <-> seconds 140 * Convert cputime <-> seconds
155 */ 141 */
@@ -157,12 +143,12 @@ extern u64 __cputime_sec_factor;
157 143
158static inline unsigned long cputime_to_secs(const cputime_t ct) 144static inline unsigned long cputime_to_secs(const cputime_t ct)
159{ 145{
160 return mulhdu(ct, __cputime_sec_factor); 146 return mulhdu((__force u64) ct, __cputime_sec_factor);
161} 147}
162 148
163static inline cputime_t secs_to_cputime(const unsigned long sec) 149static inline cputime_t secs_to_cputime(const unsigned long sec)
164{ 150{
165 return (cputime_t) sec * tb_ticks_per_sec; 151 return (__force cputime_t)((u64) sec * tb_ticks_per_sec);
166} 152}
167 153
168/* 154/*
@@ -170,7 +156,7 @@ static inline cputime_t secs_to_cputime(const unsigned long sec)
170 */ 156 */
171static inline void cputime_to_timespec(const cputime_t ct, struct timespec *p) 157static inline void cputime_to_timespec(const cputime_t ct, struct timespec *p)
172{ 158{
173 u64 x = ct; 159 u64 x = (__force u64) ct;
174 unsigned int frac; 160 unsigned int frac;
175 161
176 frac = do_div(x, tb_ticks_per_sec); 162 frac = do_div(x, tb_ticks_per_sec);
@@ -182,11 +168,11 @@ static inline void cputime_to_timespec(const cputime_t ct, struct timespec *p)
182 168
183static inline cputime_t timespec_to_cputime(const struct timespec *p) 169static inline cputime_t timespec_to_cputime(const struct timespec *p)
184{ 170{
185 cputime_t ct; 171 u64 ct;
186 172
187 ct = (u64) p->tv_nsec * tb_ticks_per_sec; 173 ct = (u64) p->tv_nsec * tb_ticks_per_sec;
188 do_div(ct, 1000000000); 174 do_div(ct, 1000000000);
189 return ct + (u64) p->tv_sec * tb_ticks_per_sec; 175 return (__force cputime_t)(ct + (u64) p->tv_sec * tb_ticks_per_sec);
190} 176}
191 177
192/* 178/*
@@ -194,7 +180,7 @@ static inline cputime_t timespec_to_cputime(const struct timespec *p)
194 */ 180 */
195static inline void cputime_to_timeval(const cputime_t ct, struct timeval *p) 181static inline void cputime_to_timeval(const cputime_t ct, struct timeval *p)
196{ 182{
197 u64 x = ct; 183 u64 x = (__force u64) ct;
198 unsigned int frac; 184 unsigned int frac;
199 185
200 frac = do_div(x, tb_ticks_per_sec); 186 frac = do_div(x, tb_ticks_per_sec);
@@ -206,11 +192,11 @@ static inline void cputime_to_timeval(const cputime_t ct, struct timeval *p)
206 192
207static inline cputime_t timeval_to_cputime(const struct timeval *p) 193static inline cputime_t timeval_to_cputime(const struct timeval *p)
208{ 194{
209 cputime_t ct; 195 u64 ct;
210 196
211 ct = (u64) p->tv_usec * tb_ticks_per_sec; 197 ct = (u64) p->tv_usec * tb_ticks_per_sec;
212 do_div(ct, 1000000); 198 do_div(ct, 1000000);
213 return ct + (u64) p->tv_sec * tb_ticks_per_sec; 199 return (__force cputime_t)(ct + (u64) p->tv_sec * tb_ticks_per_sec);
214} 200}
215 201
216/* 202/*
@@ -220,12 +206,12 @@ extern u64 __cputime_clockt_factor;
220 206
221static inline unsigned long cputime_to_clock_t(const cputime_t ct) 207static inline unsigned long cputime_to_clock_t(const cputime_t ct)
222{ 208{
223 return mulhdu(ct, __cputime_clockt_factor); 209 return mulhdu((__force u64) ct, __cputime_clockt_factor);
224} 210}
225 211
226static inline cputime_t clock_t_to_cputime(const unsigned long clk) 212static inline cputime_t clock_t_to_cputime(const unsigned long clk)
227{ 213{
228 cputime_t ct; 214 u64 ct;
229 unsigned long sec; 215 unsigned long sec;
230 216
231 /* have to be a little careful about overflow */ 217 /* have to be a little careful about overflow */
@@ -236,8 +222,8 @@ static inline cputime_t clock_t_to_cputime(const unsigned long clk)
236 do_div(ct, USER_HZ); 222 do_div(ct, USER_HZ);
237 } 223 }
238 if (sec) 224 if (sec)
239 ct += (cputime_t) sec * tb_ticks_per_sec; 225 ct += (u64) sec * tb_ticks_per_sec;
240 return ct; 226 return (__force cputime_t) ct;
241} 227}
242 228
243#define cputime64_to_clock_t(ct) cputime_to_clock_t((cputime_t)(ct)) 229#define cputime64_to_clock_t(ct) cputime_to_clock_t((cputime_t)(ct))
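Note: every cputime.h hunk above follows the same pattern -- the arithmetic is done on a plain u64, and cputime_t becomes an opaque sparse type that cannot be mixed with ordinary integers without an explicit cast. A minimal standalone model of that annotation scheme, assuming sparse's usual __CHECKER__ convention (the helper name below is illustrative, not from this patch):

    #ifdef __CHECKER__
    # define __nocast __attribute__((nocast))
    # define __force  __attribute__((force))
    #else
    # define __nocast
    # define __force
    #endif

    typedef unsigned long long __nocast cputime_t;

    /* do the math on a plain u64, cast exactly once at the boundary */
    static inline cputime_t secs_to_cputime_model(unsigned long sec,
                                                  unsigned long long ticks_per_sec)
    {
            return (__force cputime_t)((unsigned long long)sec * ticks_per_sec);
    }

Under sparse (-D__CHECKER__) any implicit conversion between cputime_t and a bare integer is flagged; in a normal build the annotations compile away.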
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index d4df013ad779..69c7377d2071 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -381,39 +381,6 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
381} 381}
382#endif 382#endif
383 383
384static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
385 unsigned long pte_index)
386{
387 unsigned long rb, va_low;
388
389 rb = (v & ~0x7fUL) << 16; /* AVA field */
390 va_low = pte_index >> 3;
391 if (v & HPTE_V_SECONDARY)
392 va_low = ~va_low;
393 /* xor vsid from AVA */
394 if (!(v & HPTE_V_1TB_SEG))
395 va_low ^= v >> 12;
396 else
397 va_low ^= v >> 24;
398 va_low &= 0x7ff;
399 if (v & HPTE_V_LARGE) {
400 rb |= 1; /* L field */
401 if (cpu_has_feature(CPU_FTR_ARCH_206) &&
402 (r & 0xff000)) {
403 /* non-16MB large page, must be 64k */
404 /* (masks depend on page size) */
405 rb |= 0x1000; /* page encoding in LP field */
406 rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
407 rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */
408 }
409 } else {
410 /* 4kB page */
411 rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */
412 }
413 rb |= (v >> 54) & 0x300; /* B field */
414 return rb;
415}
416
417/* Magic register values loaded into r3 and r4 before the 'sc' assembly 384/* Magic register values loaded into r3 and r4 before the 'sc' assembly
418 * instruction for the OSI hypercalls */ 385 * instruction for the OSI hypercalls */
419#define OSI_SC_MAGIC_R3 0x113724FA 386#define OSI_SC_MAGIC_R3 0x113724FA
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index e43fe42b9875..d0ac94f98f9e 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -29,4 +29,37 @@ static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
29 29
30#define SPAPR_TCE_SHIFT 12 30#define SPAPR_TCE_SHIFT 12
31 31
32static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
33 unsigned long pte_index)
34{
35 unsigned long rb, va_low;
36
37 rb = (v & ~0x7fUL) << 16; /* AVA field */
38 va_low = pte_index >> 3;
39 if (v & HPTE_V_SECONDARY)
40 va_low = ~va_low;
41 /* xor vsid from AVA */
42 if (!(v & HPTE_V_1TB_SEG))
43 va_low ^= v >> 12;
44 else
45 va_low ^= v >> 24;
46 va_low &= 0x7ff;
47 if (v & HPTE_V_LARGE) {
48 rb |= 1; /* L field */
49 if (cpu_has_feature(CPU_FTR_ARCH_206) &&
50 (r & 0xff000)) {
51 /* non-16MB large page, must be 64k */
52 /* (masks depend on page size) */
53 rb |= 0x1000; /* page encoding in LP field */
54 rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
55 rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */
56 }
57 } else {
58 /* 4kB page */
59 rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */
60 }
61 rb |= (v >> 54) & 0x300; /* B field */
62 return rb;
63}
64
32#endif /* __ASM_KVM_BOOK3S_64_H__ */ 65#endif /* __ASM_KVM_BOOK3S_64_H__ */
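Note: the move is mechanical -- compute_tlbie_rb() depends on the 64-bit HPT format (HPTE_V_*, the L/LP page-size encodings), so it cannot sit in the generic Book3S header that 32-bit builds also include. A hypothetical call site, modeled on the real-mode hypercall handlers that consume it (variable names here are illustrative):

    /* hpte[0]/hpte[1] are HPTE dwords 0 and 1; pte_index locates the
     * entry in the hash table and supplies the low VA bits that the
     * AVA field alone cannot reconstruct. */
    unsigned long v = hpte[0];
    unsigned long r = hpte[1];
    unsigned long rb = compute_tlbie_rb(v, r, pte_index);
    /* rb now encodes AVA, page size (L/LP) and segment size (B) in the
     * layout the tlbie instruction's RB operand expects. */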
diff --git a/arch/powerpc/include/asm/memblock.h b/arch/powerpc/include/asm/memblock.h
deleted file mode 100644
index 43efc345065e..000000000000
--- a/arch/powerpc/include/asm/memblock.h
+++ /dev/null
@@ -1,8 +0,0 @@
1#ifndef _ASM_POWERPC_MEMBLOCK_H
2#define _ASM_POWERPC_MEMBLOCK_H
3
4#include <asm/udbg.h>
5
6#define MEMBLOCK_DBG(fmt...) udbg_printf(fmt)
7
8#endif /* _ASM_POWERPC_MEMBLOCK_H */
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index 39a2baa6ad58..9c3cd490b1bd 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -46,6 +46,12 @@ static int __init powersave_off(char *arg)
46} 46}
47__setup("powersave=off", powersave_off); 47__setup("powersave=off", powersave_off);
48 48
49#if defined(CONFIG_PPC_PSERIES) && defined(CONFIG_TRACEPOINTS)
50static const bool idle_uses_rcu = 1;
51#else
52static const bool idle_uses_rcu;
53#endif
54
49/* 55/*
50 * The body of the idle task. 56 * The body of the idle task.
51 */ 57 */
@@ -56,7 +62,10 @@ void cpu_idle(void)
56 62
57 set_thread_flag(TIF_POLLING_NRFLAG); 63 set_thread_flag(TIF_POLLING_NRFLAG);
58 while (1) { 64 while (1) {
59 tick_nohz_stop_sched_tick(1); 65 tick_nohz_idle_enter();
66 if (!idle_uses_rcu)
67 rcu_idle_enter();
68
60 while (!need_resched() && !cpu_should_die()) { 69 while (!need_resched() && !cpu_should_die()) {
61 ppc64_runlatch_off(); 70 ppc64_runlatch_off();
62 71
@@ -93,7 +102,9 @@ void cpu_idle(void)
93 102
94 HMT_medium(); 103 HMT_medium();
95 ppc64_runlatch_on(); 104 ppc64_runlatch_on();
96 tick_nohz_restart_sched_tick(); 105 if (!idle_uses_rcu)
106 rcu_idle_exit();
107 tick_nohz_idle_exit();
97 preempt_enable_no_resched(); 108 preempt_enable_no_resched();
98 if (cpu_should_die()) 109 if (cpu_should_die())
99 cpu_die(); 110 cpu_die();
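Note: every idle-loop conversion in this series (powerpc here, iSeries, s390 and sh below) establishes the same nesting -- RCU's idle period sits strictly inside the tick-stopped period. The powerpc version guards the rcu_idle_enter/exit calls with idle_uses_rcu because, when pseries tracepoints are enabled, the H_CEDE wrappers in lpar.c (further down) take over that duty. A schematic of the ordering, with low_power_wait() standing in for the arch-specific wait:

    while (1) {
            tick_nohz_idle_enter();         /* stop the periodic tick first */
            rcu_idle_enter();               /* then declare the CPU quiescent */

            while (!need_resched())
                    low_power_wait();       /* placeholder for the arch idle */

            rcu_idle_exit();                /* leave RCU-idle before ... */
            tick_nohz_idle_exit();          /* ... the tick is restarted */
            schedule();
    }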
diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c
index 9ce1672afb59..a2158a395d96 100644
--- a/arch/powerpc/kernel/machine_kexec.c
+++ b/arch/powerpc/kernel/machine_kexec.c
@@ -107,9 +107,6 @@ void __init reserve_crashkernel(void)
107 unsigned long long crash_size, crash_base; 107 unsigned long long crash_size, crash_base;
108 int ret; 108 int ret;
109 109
110 /* this is necessary because of memblock_phys_mem_size() */
111 memblock_analyze();
112
113 /* use common parsing */ 110 /* use common parsing */
114 ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), 111 ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
115 &crash_size, &crash_base); 112 &crash_size, &crash_base);
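Note: this is the first of many memblock_analyze() deletions in the series (prom.c, init_32.c, tlb_nohash.c, ps3/mm.c and the sh files all follow). memblock now maintains its totals incrementally on every add/remove, so memblock_phys_mem_size() is valid at any time and the old workaround comment becomes moot; the only remaining hand-off is memblock_allow_resize(), which merely permits the region arrays to be reallocated from then on. The resulting boot-time sequence, pieced together from the hunks in this series:

    memblock_add(base, size);               /* totals updated immediately */
    total = memblock_phys_mem_size();       /* valid with no analyze step */
    memblock_enforce_memory_limit(limit);   /* also self-contained now */
    memblock_allow_resize();                /* arrays may grow from here on */
    memblock_dump_all();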
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index fa1235b0503b..abe405dab34d 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -733,8 +733,6 @@ void __init early_init_devtree(void *params)
733 of_scan_flat_dt(early_init_dt_scan_chosen_ppc, cmd_line); 733 of_scan_flat_dt(early_init_dt_scan_chosen_ppc, cmd_line);
734 734
735 /* Scan memory nodes and rebuild MEMBLOCKs */ 735 /* Scan memory nodes and rebuild MEMBLOCKs */
736 memblock_init();
737
738 of_scan_flat_dt(early_init_dt_scan_root, NULL); 736 of_scan_flat_dt(early_init_dt_scan_root, NULL);
739 of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL); 737 of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL);
740 738
@@ -756,20 +754,14 @@ void __init early_init_devtree(void *params)
756 early_reserve_mem(); 754 early_reserve_mem();
757 phyp_dump_reserve_mem(); 755 phyp_dump_reserve_mem();
758 756
759 limit = memory_limit; 757 /*
760 if (! limit) { 758 * Ensure that total memory size is page-aligned, because otherwise
761 phys_addr_t memsize; 759 * mark_bootmem() gets upset.
762 760 */
763 /* Ensure that total memory size is page-aligned, because 761 limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE);
764 * otherwise mark_bootmem() gets upset. */
765 memblock_analyze();
766 memsize = memblock_phys_mem_size();
767 if ((memsize & PAGE_MASK) != memsize)
768 limit = memsize & PAGE_MASK;
769 }
770 memblock_enforce_memory_limit(limit); 762 memblock_enforce_memory_limit(limit);
771 763
772 memblock_analyze(); 764 memblock_allow_resize();
773 memblock_dump_all(); 765 memblock_dump_all();
774 766
775 DBG("Phys. mem: %llx\n", memblock_phys_mem_size()); 767 DBG("Phys. mem: %llx\n", memblock_phys_mem_size());
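Note: the collapsed limit computation leans on the GNU "x ?: y" extension (yield x if it is nonzero, else y) plus the kernel's ALIGN(), which rounds up to the given power-of-two boundary. A standalone model of the one-liner (MODEL_ALIGN named to mark it as an illustration, not the kernel macro):

    #define MODEL_ALIGN(x, a)  (((x) + (a) - 1) & ~((unsigned long long)(a) - 1))

    unsigned long long effective_limit(unsigned long long memory_limit,
                                       unsigned long long phys_mem_size,
                                       unsigned long long page_size)
    {
            /* an explicit memory_limit wins; otherwise fall back to the
             * detected total -- rounded to a page boundary either way */
            return MODEL_ALIGN(memory_limit ?: phys_mem_size, page_size);
    }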
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 0cb137a9b038..336983da9e72 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -538,7 +538,7 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
538 tpaca->kvm_hstate.napping = 0; 538 tpaca->kvm_hstate.napping = 0;
539 vcpu->cpu = vc->pcpu; 539 vcpu->cpu = vc->pcpu;
540 smp_wmb(); 540 smp_wmb();
541#ifdef CONFIG_PPC_ICP_NATIVE 541#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
542 if (vcpu->arch.ptid) { 542 if (vcpu->arch.ptid) {
543 tpaca->cpu_start = 0x80; 543 tpaca->cpu_start = 0x80;
544 wmb(); 544 wmb();
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 3c791e1eb675..e2cfb9e1e20e 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -658,10 +658,12 @@ program_interrupt:
658 ulong cmd = kvmppc_get_gpr(vcpu, 3); 658 ulong cmd = kvmppc_get_gpr(vcpu, 3);
659 int i; 659 int i;
660 660
661#ifdef CONFIG_KVM_BOOK3S_64_PR
661 if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) { 662 if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) {
662 r = RESUME_GUEST; 663 r = RESUME_GUEST;
663 break; 664 break;
664 } 665 }
666#endif
665 667
666 run->papr_hcall.nr = cmd; 668 run->papr_hcall.nr = cmd;
667 for (i = 0; i < 9; ++i) { 669 for (i = 0; i < 9; ++i) {
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 26d20903f2bc..8c0d45a6faf7 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -15,6 +15,7 @@
15#include <linux/kvm_host.h> 15#include <linux/kvm_host.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/err.h> 17#include <linux/err.h>
18#include <linux/export.h>
18 19
19#include <asm/reg.h> 20#include <asm/reg.h>
20#include <asm/cputable.h> 21#include <asm/cputable.h>
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 161cefde5c15..58861fa1220e 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -134,8 +134,7 @@ void __init MMU_init(void)
134 134
135 if (memblock.memory.cnt > 1) { 135 if (memblock.memory.cnt > 1) {
136#ifndef CONFIG_WII 136#ifndef CONFIG_WII
137 memblock.memory.cnt = 1; 137 memblock_enforce_memory_limit(memblock.memory.regions[0].size);
138 memblock_analyze();
139 printk(KERN_WARNING "Only using first contiguous memory region"); 138 printk(KERN_WARNING "Only using first contiguous memory region");
140#else 139#else
141 wii_memory_fixups(); 140 wii_memory_fixups();
@@ -158,7 +157,6 @@ void __init MMU_init(void)
158#ifndef CONFIG_HIGHMEM 157#ifndef CONFIG_HIGHMEM
159 total_memory = total_lowmem; 158 total_memory = total_lowmem;
160 memblock_enforce_memory_limit(total_lowmem); 159 memblock_enforce_memory_limit(total_lowmem);
161 memblock_analyze();
162#endif /* CONFIG_HIGHMEM */ 160#endif /* CONFIG_HIGHMEM */
163 } 161 }
164 162
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 2dd6bdd31fe1..8e2eb6611b0b 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -199,7 +199,7 @@ void __init do_init_bootmem(void)
199 unsigned long start_pfn, end_pfn; 199 unsigned long start_pfn, end_pfn;
200 start_pfn = memblock_region_memory_base_pfn(reg); 200 start_pfn = memblock_region_memory_base_pfn(reg);
201 end_pfn = memblock_region_memory_end_pfn(reg); 201 end_pfn = memblock_region_memory_end_pfn(reg);
202 add_active_range(0, start_pfn, end_pfn); 202 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
203 } 203 }
204 204
205 /* Add all physical memory to the bootmem map, mark each area 205 /* Add all physical memory to the bootmem map, mark each area
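Note: memblock_set_node(base, size, nid) tags the intersection of [base, base + size) with the already-registered memory regions, so the (0, ULLONG_MAX, 0) call above marks all of memory as node 0 in a single shot; each pass of the surviving loop just repeats the same whole-range assignment. The per-node conversions in numa.c below use the same call with a specific range:

    /* non-NUMA case above: tag every registered range as node 0 */
    memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);

    /* NUMA paths below: tag one specific range */
    memblock_set_node(base, size, nid);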
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b22a83a91cb8..e6eea0ac80c8 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -127,45 +127,25 @@ static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
127} 127}
128 128
129/* 129/*
130 * get_active_region_work_fn - A helper function for get_node_active_region 130 * get_node_active_region - Return active region containing pfn
131 * Returns datax set to the start_pfn and end_pfn if they contain
132 * the initial value of datax->start_pfn between them
133 * @start_pfn: start page(inclusive) of region to check
134 * @end_pfn: end page(exclusive) of region to check
135 * @datax: comes in with ->start_pfn set to value to search for and
136 * goes out with active range if it contains it
137 * Returns 1 if search value is in range else 0
138 */
139static int __init get_active_region_work_fn(unsigned long start_pfn,
140 unsigned long end_pfn, void *datax)
141{
142 struct node_active_region *data;
143 data = (struct node_active_region *)datax;
144
145 if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) {
146 data->start_pfn = start_pfn;
147 data->end_pfn = end_pfn;
148 return 1;
149 }
150 return 0;
151
152}
153
154/*
155 * get_node_active_region - Return active region containing start_pfn
156 * Active range returned is empty if none found. 131 * Active range returned is empty if none found.
157 * @start_pfn: The page to return the region for. 132 * @pfn: The page to return the region for
158 * @node_ar: Returned set to the active region containing start_pfn 133 * @node_ar: Returned set to the active region containing @pfn
159 */ 134 */
160static void __init get_node_active_region(unsigned long start_pfn, 135static void __init get_node_active_region(unsigned long pfn,
161 struct node_active_region *node_ar) 136 struct node_active_region *node_ar)
162{ 137{
163 int nid = early_pfn_to_nid(start_pfn); 138 unsigned long start_pfn, end_pfn;
139 int i, nid;
164 140
165 node_ar->nid = nid; 141 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
166 node_ar->start_pfn = start_pfn; 142 if (pfn >= start_pfn && pfn < end_pfn) {
167 node_ar->end_pfn = start_pfn; 143 node_ar->nid = nid;
168 work_with_active_regions(nid, get_active_region_work_fn, node_ar); 144 node_ar->start_pfn = start_pfn;
145 node_ar->end_pfn = end_pfn;
146 break;
147 }
148 }
169} 149}
170 150
171static void map_cpu_to_node(int cpu, int node) 151static void map_cpu_to_node(int cpu, int node)
@@ -710,9 +690,7 @@ static void __init parse_drconf_memory(struct device_node *memory)
710 node_set_online(nid); 690 node_set_online(nid);
711 sz = numa_enforce_memory_limit(base, size); 691 sz = numa_enforce_memory_limit(base, size);
712 if (sz) 692 if (sz)
713 add_active_range(nid, base >> PAGE_SHIFT, 693 memblock_set_node(base, sz, nid);
714 (base >> PAGE_SHIFT)
715 + (sz >> PAGE_SHIFT));
716 } while (--ranges); 694 } while (--ranges);
717 } 695 }
718} 696}
@@ -802,8 +780,7 @@ new_range:
802 continue; 780 continue;
803 } 781 }
804 782
805 add_active_range(nid, start >> PAGE_SHIFT, 783 memblock_set_node(start, size, nid);
806 (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
807 784
808 if (--ranges) 785 if (--ranges)
809 goto new_range; 786 goto new_range;
@@ -839,7 +816,8 @@ static void __init setup_nonnuma(void)
839 end_pfn = memblock_region_memory_end_pfn(reg); 816 end_pfn = memblock_region_memory_end_pfn(reg);
840 817
841 fake_numa_create_new_node(end_pfn, &nid); 818 fake_numa_create_new_node(end_pfn, &nid);
842 add_active_range(nid, start_pfn, end_pfn); 819 memblock_set_node(PFN_PHYS(start_pfn),
820 PFN_PHYS(end_pfn - start_pfn), nid);
843 node_set_online(nid); 821 node_set_online(nid);
844 } 822 }
845} 823}
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 4e13d6f9023e..573ba3b69d1f 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -615,7 +615,6 @@ static void __early_init_mmu(int boot_cpu)
615 615
616 /* limit memory so we dont have linear faults */ 616 /* limit memory so we dont have linear faults */
617 memblock_enforce_memory_limit(linear_map_top); 617 memblock_enforce_memory_limit(linear_map_top);
618 memblock_analyze();
619 618
620 patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e); 619 patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
621 patch_exception(0x1e0, exc_instruction_tlb_miss_bolted_book3e); 620 patch_exception(0x1e0, exc_instruction_tlb_miss_bolted_book3e);
diff --git a/arch/powerpc/platforms/embedded6xx/wii.c b/arch/powerpc/platforms/embedded6xx/wii.c
index 1b5dc1a2e145..6d8dadf19f0b 100644
--- a/arch/powerpc/platforms/embedded6xx/wii.c
+++ b/arch/powerpc/platforms/embedded6xx/wii.c
@@ -79,24 +79,19 @@ void __init wii_memory_fixups(void)
79 BUG_ON(memblock.memory.cnt != 2); 79 BUG_ON(memblock.memory.cnt != 2);
80 BUG_ON(!page_aligned(p[0].base) || !page_aligned(p[1].base)); 80 BUG_ON(!page_aligned(p[0].base) || !page_aligned(p[1].base));
81 81
82 p[0].size = _ALIGN_DOWN(p[0].size, PAGE_SIZE); 82 /* trim unaligned tail */
83 p[1].size = _ALIGN_DOWN(p[1].size, PAGE_SIZE); 83 memblock_remove(ALIGN(p[1].base + p[1].size, PAGE_SIZE),
84 (phys_addr_t)ULLONG_MAX);
84 85
85 wii_hole_start = p[0].base + p[0].size; 86 /* determine hole, add & reserve them */
87 wii_hole_start = ALIGN(p[0].base + p[0].size, PAGE_SIZE);
86 wii_hole_size = p[1].base - wii_hole_start; 88 wii_hole_size = p[1].base - wii_hole_start;
87 89 memblock_add(wii_hole_start, wii_hole_size);
88 pr_info("MEM1: <%08llx %08llx>\n", p[0].base, p[0].size);
89 pr_info("HOLE: <%08lx %08lx>\n", wii_hole_start, wii_hole_size);
90 pr_info("MEM2: <%08llx %08llx>\n", p[1].base, p[1].size);
91
92 p[0].size += wii_hole_size + p[1].size;
93
94 memblock.memory.cnt = 1;
95 memblock_analyze();
96
97 /* reserve the hole */
98 memblock_reserve(wii_hole_start, wii_hole_size); 90 memblock_reserve(wii_hole_start, wii_hole_size);
99 91
92 BUG_ON(memblock.memory.cnt != 1);
93 __memblock_dump_all();
94
100 /* allow ioremapping the address space in the hole */ 95 /* allow ioremapping the address space in the hole */
101 __allow_ioremap_reserved = 1; 96 __allow_ioremap_reserved = 1;
102} 97}
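Note: the rewrite expresses the Wii's split memory map through the memblock API instead of poking its internals. The resulting layout and the three calls that realize it:

    /*
     *  |<--- MEM1 --->|<------ hole ------>|<--- MEM2 --->|
     *  p[0].base      wii_hole_start       p[1].base
     */
    memblock_add(wii_hole_start, wii_hole_size);     /* keep RAM contiguous */
    memblock_reserve(wii_hole_start, wii_hole_size); /* never allocate it */
    __allow_ioremap_reserved = 1;                    /* but let drivers map it */

Adding and then reserving the hole keeps memblock down to the single region the BUG_ON checks for, while still preventing the allocator from handing out addresses that have no RAM behind them.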
diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c
index ea0acbd8966d..8fc62586a973 100644
--- a/arch/powerpc/platforms/iseries/setup.c
+++ b/arch/powerpc/platforms/iseries/setup.c
@@ -563,7 +563,8 @@ static void yield_shared_processor(void)
563static void iseries_shared_idle(void) 563static void iseries_shared_idle(void)
564{ 564{
565 while (1) { 565 while (1) {
566 tick_nohz_stop_sched_tick(1); 566 tick_nohz_idle_enter();
567 rcu_idle_enter();
567 while (!need_resched() && !hvlpevent_is_pending()) { 568 while (!need_resched() && !hvlpevent_is_pending()) {
568 local_irq_disable(); 569 local_irq_disable();
569 ppc64_runlatch_off(); 570 ppc64_runlatch_off();
@@ -577,7 +578,8 @@ static void iseries_shared_idle(void)
577 } 578 }
578 579
579 ppc64_runlatch_on(); 580 ppc64_runlatch_on();
580 tick_nohz_restart_sched_tick(); 581 rcu_idle_exit();
582 tick_nohz_idle_exit();
581 583
582 if (hvlpevent_is_pending()) 584 if (hvlpevent_is_pending())
583 process_iSeries_events(); 585 process_iSeries_events();
@@ -593,7 +595,8 @@ static void iseries_dedicated_idle(void)
593 set_thread_flag(TIF_POLLING_NRFLAG); 595 set_thread_flag(TIF_POLLING_NRFLAG);
594 596
595 while (1) { 597 while (1) {
596 tick_nohz_stop_sched_tick(1); 598 tick_nohz_idle_enter();
599 rcu_idle_enter();
597 if (!need_resched()) { 600 if (!need_resched()) {
598 while (!need_resched()) { 601 while (!need_resched()) {
599 ppc64_runlatch_off(); 602 ppc64_runlatch_off();
@@ -610,7 +613,8 @@ static void iseries_dedicated_idle(void)
610 } 613 }
611 614
612 ppc64_runlatch_on(); 615 ppc64_runlatch_on();
613 tick_nohz_restart_sched_tick(); 616 rcu_idle_exit();
617 tick_nohz_idle_exit();
614 preempt_enable_no_resched(); 618 preempt_enable_no_resched();
615 schedule(); 619 schedule();
616 preempt_disable(); 620 preempt_disable();
diff --git a/arch/powerpc/platforms/ps3/mm.c b/arch/powerpc/platforms/ps3/mm.c
index 72714ad27842..8bd6ba542691 100644
--- a/arch/powerpc/platforms/ps3/mm.c
+++ b/arch/powerpc/platforms/ps3/mm.c
@@ -319,7 +319,6 @@ static int __init ps3_mm_add_memory(void)
319 } 319 }
320 320
321 memblock_add(start_addr, map.r1.size); 321 memblock_add(start_addr, map.r1.size);
322 memblock_analyze();
323 322
324 result = online_pages(start_pfn, nr_pages); 323 result = online_pages(start_pfn, nr_pages);
325 324
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 27a49508b410..52d429be6c76 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -555,6 +555,8 @@ void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
555 555
556 (*depth)++; 556 (*depth)++;
557 trace_hcall_entry(opcode, args); 557 trace_hcall_entry(opcode, args);
558 if (opcode == H_CEDE)
559 rcu_idle_enter();
558 (*depth)--; 560 (*depth)--;
559 561
560out: 562out:
@@ -575,6 +577,8 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
575 goto out; 577 goto out;
576 578
577 (*depth)++; 579 (*depth)++;
580 if (opcode == H_CEDE)
581 rcu_idle_exit();
578 trace_hcall_exit(opcode, retval, retbuf); 582 trace_hcall_exit(opcode, retval, retbuf);
579 (*depth)--; 583 (*depth)--;
580 584
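Note: these two hunks are the counterpart of the idle_uses_rcu flag in idle.c above. Tracepoints use RCU, so the wrapper first emits the entry event and only then marks the CPU RCU-idle for the duration of the cede; the exit side unwinds in mirror order. Schematically:

    trace_hcall_entry(H_CEDE, args);          /* still under RCU watch: legal */
    rcu_idle_enter();                         /* CPU is about to cede to the HV */
    /* ... ceded: the hypervisor runs other work, RCU ignores this CPU ... */
    rcu_idle_exit();                          /* back under RCU watch */
    trace_hcall_exit(H_CEDE, retval, retbuf); /* tracepoint legal again */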
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 373679b3744a..d48ede334434 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -92,6 +92,9 @@ config S390
92 select HAVE_ARCH_JUMP_LABEL if !MARCH_G5 92 select HAVE_ARCH_JUMP_LABEL if !MARCH_G5
93 select HAVE_RCU_TABLE_FREE if SMP 93 select HAVE_RCU_TABLE_FREE if SMP
94 select ARCH_SAVE_PAGE_KEYS if HIBERNATION 94 select ARCH_SAVE_PAGE_KEYS if HIBERNATION
95 select HAVE_MEMBLOCK
96 select HAVE_MEMBLOCK_NODE_MAP
97 select ARCH_DISCARD_MEMBLOCK
95 select ARCH_INLINE_SPIN_TRYLOCK 98 select ARCH_INLINE_SPIN_TRYLOCK
96 select ARCH_INLINE_SPIN_TRYLOCK_BH 99 select ARCH_INLINE_SPIN_TRYLOCK_BH
97 select ARCH_INLINE_SPIN_LOCK 100 select ARCH_INLINE_SPIN_LOCK
@@ -345,9 +348,6 @@ config WARN_DYNAMIC_STACK
345 348
346 Say N if you are unsure. 349 Say N if you are unsure.
347 350
348config ARCH_POPULATES_NODE_MAP
349 def_bool y
350
351comment "Kernel preemption" 351comment "Kernel preemption"
352 352
353source "kernel/Kconfig.preempt" 353source "kernel/Kconfig.preempt"
diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
index 92f1cb745d69..4de031d6b76c 100644
--- a/arch/s390/appldata/appldata_os.c
+++ b/arch/s390/appldata/appldata_os.c
@@ -115,21 +115,21 @@ static void appldata_get_os_data(void *data)
115 j = 0; 115 j = 0;
116 for_each_online_cpu(i) { 116 for_each_online_cpu(i) {
117 os_data->os_cpu[j].per_cpu_user = 117 os_data->os_cpu[j].per_cpu_user =
118 cputime_to_jiffies(kstat_cpu(i).cpustat.user); 118 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_USER]);
119 os_data->os_cpu[j].per_cpu_nice = 119 os_data->os_cpu[j].per_cpu_nice =
120 cputime_to_jiffies(kstat_cpu(i).cpustat.nice); 120 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_NICE]);
121 os_data->os_cpu[j].per_cpu_system = 121 os_data->os_cpu[j].per_cpu_system =
122 cputime_to_jiffies(kstat_cpu(i).cpustat.system); 122 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]);
123 os_data->os_cpu[j].per_cpu_idle = 123 os_data->os_cpu[j].per_cpu_idle =
124 cputime_to_jiffies(kstat_cpu(i).cpustat.idle); 124 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IDLE]);
125 os_data->os_cpu[j].per_cpu_irq = 125 os_data->os_cpu[j].per_cpu_irq =
126 cputime_to_jiffies(kstat_cpu(i).cpustat.irq); 126 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IRQ]);
127 os_data->os_cpu[j].per_cpu_softirq = 127 os_data->os_cpu[j].per_cpu_softirq =
128 cputime_to_jiffies(kstat_cpu(i).cpustat.softirq); 128 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]);
129 os_data->os_cpu[j].per_cpu_iowait = 129 os_data->os_cpu[j].per_cpu_iowait =
130 cputime_to_jiffies(kstat_cpu(i).cpustat.iowait); 130 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IOWAIT]);
131 os_data->os_cpu[j].per_cpu_steal = 131 os_data->os_cpu[j].per_cpu_steal =
132 cputime_to_jiffies(kstat_cpu(i).cpustat.steal); 132 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_STEAL]);
133 os_data->os_cpu[j].cpu_id = i; 133 os_data->os_cpu[j].cpu_id = i;
134 j++; 134 j++;
135 } 135 }
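Note: the accessor churn here reflects a core change that turned the per-cpu cputime fields into an array indexed by an enum, so callers can address the categories uniformly. A sketch of the shape being assumed (field order matching the indices used above):

    typedef unsigned long long u64;        /* kernel type, for the sketch */

    enum cpu_usage_stat {
            CPUTIME_USER, CPUTIME_NICE, CPUTIME_SYSTEM, CPUTIME_SOFTIRQ,
            CPUTIME_IRQ, CPUTIME_IDLE, CPUTIME_IOWAIT, CPUTIME_STEAL,
            CPUTIME_GUEST, CPUTIME_GUEST_NICE, NR_STATS,
    };

    struct kernel_cpustat {
            u64 cpustat[NR_STATS];
    };

    /* old: kstat_cpu(i).cpustat.user
     * new: kcpustat_cpu(i).cpustat[CPUTIME_USER] */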
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index 081434878296..c23c3900c304 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -16,114 +16,100 @@
16 16
17/* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */ 17/* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */
18 18
19typedef unsigned long long cputime_t; 19typedef unsigned long long __nocast cputime_t;
20typedef unsigned long long cputime64_t; 20typedef unsigned long long __nocast cputime64_t;
21 21
22#ifndef __s390x__ 22static inline unsigned long __div(unsigned long long n, unsigned long base)
23
24static inline unsigned int
25__div(unsigned long long n, unsigned int base)
26{ 23{
24#ifndef __s390x__
27 register_pair rp; 25 register_pair rp;
28 26
29 rp.pair = n >> 1; 27 rp.pair = n >> 1;
30 asm ("dr %0,%1" : "+d" (rp) : "d" (base >> 1)); 28 asm ("dr %0,%1" : "+d" (rp) : "d" (base >> 1));
31 return rp.subreg.odd; 29 return rp.subreg.odd;
30#else /* __s390x__ */
31 return n / base;
32#endif /* __s390x__ */
32} 33}
33 34
34#else /* __s390x__ */ 35#define cputime_one_jiffy jiffies_to_cputime(1)
36
37/*
38 * Convert cputime to jiffies and back.
39 */
40static inline unsigned long cputime_to_jiffies(const cputime_t cputime)
41{
42 return __div((__force unsigned long long) cputime, 4096000000ULL / HZ);
43}
35 44
36static inline unsigned int 45static inline cputime_t jiffies_to_cputime(const unsigned int jif)
37__div(unsigned long long n, unsigned int base)
38{ 46{
39 return n / base; 47 return (__force cputime_t)(jif * (4096000000ULL / HZ));
40} 48}
41 49
42#endif /* __s390x__ */ 50static inline u64 cputime64_to_jiffies64(cputime64_t cputime)
51{
52 unsigned long long jif = (__force unsigned long long) cputime;
53 do_div(jif, 4096000000ULL / HZ);
54 return jif;
55}
43 56
44#define cputime_zero (0ULL) 57static inline cputime64_t jiffies64_to_cputime64(const u64 jif)
45#define cputime_one_jiffy jiffies_to_cputime(1) 58{
46#define cputime_max ((~0UL >> 1) - 1) 59 return (__force cputime64_t)(jif * (4096000000ULL / HZ));
47#define cputime_add(__a, __b) ((__a) + (__b))
48#define cputime_sub(__a, __b) ((__a) - (__b))
49#define cputime_div(__a, __n) ({ \
50 unsigned long long __div = (__a); \
51 do_div(__div,__n); \
52 __div; \
53})
54#define cputime_halve(__a) ((__a) >> 1)
55#define cputime_eq(__a, __b) ((__a) == (__b))
56#define cputime_gt(__a, __b) ((__a) > (__b))
57#define cputime_ge(__a, __b) ((__a) >= (__b))
58#define cputime_lt(__a, __b) ((__a) < (__b))
59#define cputime_le(__a, __b) ((__a) <= (__b))
60#define cputime_to_jiffies(__ct) (__div((__ct), 4096000000ULL / HZ))
61#define cputime_to_scaled(__ct) (__ct)
62#define jiffies_to_cputime(__hz) ((cputime_t)(__hz) * (4096000000ULL / HZ))
63
64#define cputime64_zero (0ULL)
65#define cputime64_add(__a, __b) ((__a) + (__b))
66#define cputime_to_cputime64(__ct) (__ct)
67
68static inline u64
69cputime64_to_jiffies64(cputime64_t cputime)
70{
71 do_div(cputime, 4096000000ULL / HZ);
72 return cputime;
73} 60}
74 61
75/* 62/*
76 * Convert cputime to microseconds and back. 63 * Convert cputime to microseconds and back.
77 */ 64 */
78static inline unsigned int 65static inline unsigned int cputime_to_usecs(const cputime_t cputime)
79cputime_to_usecs(const cputime_t cputime)
80{ 66{
81 return cputime_div(cputime, 4096); 67 return (__force unsigned long long) cputime >> 12;
82} 68}
83 69
84static inline cputime_t 70static inline cputime_t usecs_to_cputime(const unsigned int m)
85usecs_to_cputime(const unsigned int m)
86{ 71{
87 return (cputime_t) m * 4096; 72 return (__force cputime_t)(m * 4096ULL);
88} 73}
89 74
75#define usecs_to_cputime64(m) usecs_to_cputime(m)
76
90/* 77/*
91 * Convert cputime to milliseconds and back. 78 * Convert cputime to milliseconds and back.
92 */ 79 */
93static inline unsigned int 80static inline unsigned int cputime_to_secs(const cputime_t cputime)
94cputime_to_secs(const cputime_t cputime)
95{ 81{
96 return __div(cputime, 2048000000) >> 1; 82 return __div((__force unsigned long long) cputime, 2048000000) >> 1;
97} 83}
98 84
99static inline cputime_t 85static inline cputime_t secs_to_cputime(const unsigned int s)
100secs_to_cputime(const unsigned int s)
101{ 86{
102 return (cputime_t) s * 4096000000ULL; 87 return (__force cputime_t)(s * 4096000000ULL);
103} 88}
104 89
105/* 90/*
106 * Convert cputime to timespec and back. 91 * Convert cputime to timespec and back.
107 */ 92 */
108static inline cputime_t 93static inline cputime_t timespec_to_cputime(const struct timespec *value)
109timespec_to_cputime(const struct timespec *value)
110{ 94{
111 return value->tv_nsec * 4096 / 1000 + (u64) value->tv_sec * 4096000000ULL; 95 unsigned long long ret = value->tv_sec * 4096000000ULL;
96 return (__force cputime_t)(ret + value->tv_nsec * 4096 / 1000);
112} 97}
113 98
114static inline void 99static inline void cputime_to_timespec(const cputime_t cputime,
115cputime_to_timespec(const cputime_t cputime, struct timespec *value) 100 struct timespec *value)
116{ 101{
102 unsigned long long __cputime = (__force unsigned long long) cputime;
117#ifndef __s390x__ 103#ifndef __s390x__
118 register_pair rp; 104 register_pair rp;
119 105
120 rp.pair = cputime >> 1; 106 rp.pair = __cputime >> 1;
121 asm ("dr %0,%1" : "+d" (rp) : "d" (2048000000UL)); 107 asm ("dr %0,%1" : "+d" (rp) : "d" (2048000000UL));
122 value->tv_nsec = rp.subreg.even * 1000 / 4096; 108 value->tv_nsec = rp.subreg.even * 1000 / 4096;
123 value->tv_sec = rp.subreg.odd; 109 value->tv_sec = rp.subreg.odd;
124#else 110#else
125 value->tv_nsec = (cputime % 4096000000ULL) * 1000 / 4096; 111 value->tv_nsec = (__cputime % 4096000000ULL) * 1000 / 4096;
126 value->tv_sec = cputime / 4096000000ULL; 112 value->tv_sec = __cputime / 4096000000ULL;
127#endif 113#endif
128} 114}
129 115
@@ -132,50 +118,52 @@ cputime_to_timespec(const cputime_t cputime, struct timespec *value)
132 * Since cputime and timeval have the same resolution (microseconds) 118 * Since cputime and timeval have the same resolution (microseconds)
133 * this is easy. 119 * this is easy.
134 */ 120 */
135static inline cputime_t 121static inline cputime_t timeval_to_cputime(const struct timeval *value)
136timeval_to_cputime(const struct timeval *value)
137{ 122{
138 return value->tv_usec * 4096 + (u64) value->tv_sec * 4096000000ULL; 123 unsigned long long ret = value->tv_sec * 4096000000ULL;
124 return (__force cputime_t)(ret + value->tv_usec * 4096ULL);
139} 125}
140 126
141static inline void 127static inline void cputime_to_timeval(const cputime_t cputime,
142cputime_to_timeval(const cputime_t cputime, struct timeval *value) 128 struct timeval *value)
143{ 129{
130 unsigned long long __cputime = (__force unsigned long long) cputime;
144#ifndef __s390x__ 131#ifndef __s390x__
145 register_pair rp; 132 register_pair rp;
146 133
147 rp.pair = cputime >> 1; 134 rp.pair = __cputime >> 1;
148 asm ("dr %0,%1" : "+d" (rp) : "d" (2048000000UL)); 135 asm ("dr %0,%1" : "+d" (rp) : "d" (2048000000UL));
149 value->tv_usec = rp.subreg.even / 4096; 136 value->tv_usec = rp.subreg.even / 4096;
150 value->tv_sec = rp.subreg.odd; 137 value->tv_sec = rp.subreg.odd;
151#else 138#else
152 value->tv_usec = (cputime % 4096000000ULL) / 4096; 139 value->tv_usec = (__cputime % 4096000000ULL) / 4096;
153 value->tv_sec = cputime / 4096000000ULL; 140 value->tv_sec = __cputime / 4096000000ULL;
154#endif 141#endif
155} 142}
156 143
157/* 144/*
158 * Convert cputime to clock and back. 145 * Convert cputime to clock and back.
159 */ 146 */
160static inline clock_t 147static inline clock_t cputime_to_clock_t(cputime_t cputime)
161cputime_to_clock_t(cputime_t cputime)
162{ 148{
163 return cputime_div(cputime, 4096000000ULL / USER_HZ); 149 unsigned long long clock = (__force unsigned long long) cputime;
150 do_div(clock, 4096000000ULL / USER_HZ);
151 return clock;
164} 152}
165 153
166static inline cputime_t 154static inline cputime_t clock_t_to_cputime(unsigned long x)
167clock_t_to_cputime(unsigned long x)
168{ 155{
169 return (cputime_t) x * (4096000000ULL / USER_HZ); 156 return (__force cputime_t)(x * (4096000000ULL / USER_HZ));
170} 157}
171 158
172/* 159/*
173 * Convert cputime64 to clock. 160 * Convert cputime64 to clock.
174 */ 161 */
175static inline clock_t 162static inline clock_t cputime64_to_clock_t(cputime64_t cputime)
176cputime64_to_clock_t(cputime64_t cputime)
177{ 163{
178 return cputime_div(cputime, 4096000000ULL / USER_HZ); 164 unsigned long long clock = (__force unsigned long long) cputime;
165 do_div(clock, 4096000000ULL / USER_HZ);
166 return clock;
179} 167}
180 168
181struct s390_idle_data { 169struct s390_idle_data {
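Note: every constant in this file falls out of one fact -- the s390 CPU timer counts in units of 2**-12 microseconds, so 4096 units make a microsecond and 4096000000ULL make a second (2048000000 is half of that, used with the pre-shifted 31-bit register-pair division). A standalone sanity check of the arithmetic:

    #include <assert.h>

    int main(void)
    {
            unsigned long long one_sec = 4096000000ULL;  /* cputime units */

            assert((one_sec >> 12) == 1000000);          /* -> microseconds */
            assert(one_sec / 4096 == 1000000);           /* same thing */
            assert(one_sec / 2 == 2048000000ULL);        /* pre-shifted form */
            return 0;
    }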
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 9451b210a1b4..3201ae447990 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -91,10 +91,12 @@ static void default_idle(void)
91void cpu_idle(void) 91void cpu_idle(void)
92{ 92{
93 for (;;) { 93 for (;;) {
94 tick_nohz_stop_sched_tick(1); 94 tick_nohz_idle_enter();
95 rcu_idle_enter();
95 while (!need_resched()) 96 while (!need_resched())
96 default_idle(); 97 default_idle();
97 tick_nohz_restart_sched_tick(); 98 rcu_idle_exit();
99 tick_nohz_idle_exit();
98 preempt_enable_no_resched(); 100 preempt_enable_no_resched();
99 schedule(); 101 schedule();
100 preempt_disable(); 102 preempt_disable();
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index e54c4ff8abaa..f11d1b037c50 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -21,6 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/memblock.h>
24#include <linux/mm.h> 25#include <linux/mm.h>
25#include <linux/stddef.h> 26#include <linux/stddef.h>
26#include <linux/unistd.h> 27#include <linux/unistd.h>
@@ -820,7 +821,8 @@ setup_memory(void)
820 end_chunk = min(end_chunk, end_pfn); 821 end_chunk = min(end_chunk, end_pfn);
821 if (start_chunk >= end_chunk) 822 if (start_chunk >= end_chunk)
822 continue; 823 continue;
823 add_active_range(0, start_chunk, end_chunk); 824 memblock_add_node(PFN_PHYS(start_chunk),
825 PFN_PHYS(end_chunk - start_chunk), 0);
824 pfn = max(start_chunk, start_pfn); 826 pfn = max(start_chunk, start_pfn);
825 for (; pfn < end_chunk; pfn++) 827 for (; pfn < end_chunk; pfn++)
826 page_set_storage_key(PFN_PHYS(pfn), 828 page_set_storage_key(PFN_PHYS(pfn),
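Note: s390 (and score below) had no memblock node map before this series, so the conversion uses memblock_add_node(), which registers the range and records its node in one step -- unlike memblock_set_node(), which only tags ranges that are already registered. Assumed signature, matching the calls in these hunks:

    int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);

    /* as used above: register the chunk's pfn range on node 0 */
    memblock_add_node(PFN_PHYS(start_chunk),
                      PFN_PHYS(end_chunk - start_chunk), 0);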
diff --git a/arch/s390/oprofile/hwsampler.c b/arch/s390/oprofile/hwsampler.c
index f43c0e4282af..9daee91e6c3f 100644
--- a/arch/s390/oprofile/hwsampler.c
+++ b/arch/s390/oprofile/hwsampler.c
@@ -22,6 +22,7 @@
22#include <asm/irq.h> 22#include <asm/irq.h>
23 23
24#include "hwsampler.h" 24#include "hwsampler.h"
25#include "op_counter.h"
25 26
26#define MAX_NUM_SDB 511 27#define MAX_NUM_SDB 511
27#define MIN_NUM_SDB 1 28#define MIN_NUM_SDB 1
@@ -896,6 +897,8 @@ static void add_samples_to_oprofile(unsigned int cpu, unsigned long *sdbt,
896 if (sample_data_ptr->P == 1) { 897 if (sample_data_ptr->P == 1) {
897 /* userspace sample */ 898 /* userspace sample */
898 unsigned int pid = sample_data_ptr->prim_asn; 899 unsigned int pid = sample_data_ptr->prim_asn;
900 if (!counter_config.user)
901 goto skip_sample;
899 rcu_read_lock(); 902 rcu_read_lock();
900 tsk = pid_task(find_vpid(pid), PIDTYPE_PID); 903 tsk = pid_task(find_vpid(pid), PIDTYPE_PID);
901 if (tsk) 904 if (tsk)
@@ -903,6 +906,8 @@ static void add_samples_to_oprofile(unsigned int cpu, unsigned long *sdbt,
903 rcu_read_unlock(); 906 rcu_read_unlock();
904 } else { 907 } else {
905 /* kernelspace sample */ 908 /* kernelspace sample */
909 if (!counter_config.kernel)
910 goto skip_sample;
906 regs = task_pt_regs(current); 911 regs = task_pt_regs(current);
907 } 912 }
908 913
@@ -910,7 +915,7 @@ static void add_samples_to_oprofile(unsigned int cpu, unsigned long *sdbt,
910 oprofile_add_ext_hw_sample(sample_data_ptr->ia, regs, 0, 915 oprofile_add_ext_hw_sample(sample_data_ptr->ia, regs, 0,
911 !sample_data_ptr->P, tsk); 916 !sample_data_ptr->P, tsk);
912 mutex_unlock(&hws_sem); 917 mutex_unlock(&hws_sem);
913 918 skip_sample:
914 sample_data_ptr++; 919 sample_data_ptr++;
915 } 920 }
916} 921}
diff --git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c
index bd58b72454cf..2297be406c61 100644
--- a/arch/s390/oprofile/init.c
+++ b/arch/s390/oprofile/init.c
@@ -2,10 +2,11 @@
2 * arch/s390/oprofile/init.c 2 * arch/s390/oprofile/init.c
3 * 3 *
4 * S390 Version 4 * S390 Version
5 * Copyright (C) 2003 IBM Deutschland Entwicklung GmbH, IBM Corporation 5 * Copyright (C) 2002-2011 IBM Deutschland Entwicklung GmbH, IBM Corporation
6 * Author(s): Thomas Spatzier (tspat@de.ibm.com) 6 * Author(s): Thomas Spatzier (tspat@de.ibm.com)
7 * Author(s): Mahesh Salgaonkar (mahesh@linux.vnet.ibm.com) 7 * Author(s): Mahesh Salgaonkar (mahesh@linux.vnet.ibm.com)
8 * Author(s): Heinz Graalfs (graalfs@linux.vnet.ibm.com) 8 * Author(s): Heinz Graalfs (graalfs@linux.vnet.ibm.com)
9 * Author(s): Andreas Krebbel (krebbel@linux.vnet.ibm.com)
9 * 10 *
10 * @remark Copyright 2002-2011 OProfile authors 11 * @remark Copyright 2002-2011 OProfile authors
11 */ 12 */
@@ -14,6 +15,8 @@
14#include <linux/init.h> 15#include <linux/init.h>
15#include <linux/errno.h> 16#include <linux/errno.h>
16#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/module.h>
19#include <asm/processor.h>
17 20
18#include "../../../drivers/oprofile/oprof.h" 21#include "../../../drivers/oprofile/oprof.h"
19 22
@@ -22,6 +25,7 @@ extern void s390_backtrace(struct pt_regs * const regs, unsigned int depth);
22#ifdef CONFIG_64BIT 25#ifdef CONFIG_64BIT
23 26
24#include "hwsampler.h" 27#include "hwsampler.h"
28#include "op_counter.h"
25 29
26#define DEFAULT_INTERVAL 4127518 30#define DEFAULT_INTERVAL 4127518
27 31
@@ -35,16 +39,41 @@ static unsigned long oprofile_max_interval;
35static unsigned long oprofile_sdbt_blocks = DEFAULT_SDBT_BLOCKS; 39static unsigned long oprofile_sdbt_blocks = DEFAULT_SDBT_BLOCKS;
36static unsigned long oprofile_sdb_blocks = DEFAULT_SDB_BLOCKS; 40static unsigned long oprofile_sdb_blocks = DEFAULT_SDB_BLOCKS;
37 41
38static int hwsampler_file; 42static int hwsampler_enabled;
39static int hwsampler_running; /* start_mutex must be held to change */ 43static int hwsampler_running; /* start_mutex must be held to change */
44static int hwsampler_available;
40 45
41static struct oprofile_operations timer_ops; 46static struct oprofile_operations timer_ops;
42 47
48struct op_counter_config counter_config;
49
50enum __force_cpu_type {
51 reserved = 0, /* do not force */
52 timer,
53};
54static int force_cpu_type;
55
56static int set_cpu_type(const char *str, struct kernel_param *kp)
57{
58 if (!strcmp(str, "timer")) {
59 force_cpu_type = timer;
60 printk(KERN_INFO "oprofile: forcing timer to be returned "
61 "as cpu type\n");
62 } else {
63 force_cpu_type = 0;
64 }
65
66 return 0;
67}
68module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0);
69MODULE_PARM_DESC(cpu_type, "Force legacy basic mode sampling "
70		 "(report cpu_type \"timer\")");
71
43static int oprofile_hwsampler_start(void) 72static int oprofile_hwsampler_start(void)
44{ 73{
45 int retval; 74 int retval;
46 75
47 hwsampler_running = hwsampler_file; 76 hwsampler_running = hwsampler_enabled;
48 77
49 if (!hwsampler_running) 78 if (!hwsampler_running)
50 return timer_ops.start(); 79 return timer_ops.start();
@@ -72,10 +101,16 @@ static void oprofile_hwsampler_stop(void)
72 return; 101 return;
73} 102}
74 103
104/*
105 * File ops used for:
106 * /dev/oprofile/0/enabled
107 * /dev/oprofile/hwsampling/hwsampler (cpu_type = timer)
108 */
109
75static ssize_t hwsampler_read(struct file *file, char __user *buf, 110static ssize_t hwsampler_read(struct file *file, char __user *buf,
76 size_t count, loff_t *offset) 111 size_t count, loff_t *offset)
77{ 112{
78 return oprofilefs_ulong_to_user(hwsampler_file, buf, count, offset); 113 return oprofilefs_ulong_to_user(hwsampler_enabled, buf, count, offset);
79} 114}
80 115
81static ssize_t hwsampler_write(struct file *file, char const __user *buf, 116static ssize_t hwsampler_write(struct file *file, char const __user *buf,
@@ -91,6 +126,9 @@ static ssize_t hwsampler_write(struct file *file, char const __user *buf,
91 if (retval <= 0) 126 if (retval <= 0)
92 return retval; 127 return retval;
93 128
129 if (val != 0 && val != 1)
130 return -EINVAL;
131
94 if (oprofile_started) 132 if (oprofile_started)
95 /* 133 /*
96 * safe to do without locking as we set 134 * safe to do without locking as we set
@@ -99,7 +137,7 @@ static ssize_t hwsampler_write(struct file *file, char const __user *buf,
99 */ 137 */
100 return -EBUSY; 138 return -EBUSY;
101 139
102 hwsampler_file = val; 140 hwsampler_enabled = val;
103 141
104 return count; 142 return count;
105} 143}
@@ -109,38 +147,311 @@ static const struct file_operations hwsampler_fops = {
109 .write = hwsampler_write, 147 .write = hwsampler_write,
110}; 148};
111 149
150/*
151 * File ops used for:
152 * /dev/oprofile/0/count
153 * /dev/oprofile/hwsampling/hw_interval (cpu_type = timer)
154 *
155 * Make sure that the value is within the hardware range.
156 */
157
158static ssize_t hw_interval_read(struct file *file, char __user *buf,
159 size_t count, loff_t *offset)
160{
161 return oprofilefs_ulong_to_user(oprofile_hw_interval, buf,
162 count, offset);
163}
164
165static ssize_t hw_interval_write(struct file *file, char const __user *buf,
166 size_t count, loff_t *offset)
167{
168 unsigned long val;
169 int retval;
170
171 if (*offset)
172 return -EINVAL;
173 retval = oprofilefs_ulong_from_user(&val, buf, count);
174 if (retval)
175 return retval;
176 if (val < oprofile_min_interval)
177 oprofile_hw_interval = oprofile_min_interval;
178 else if (val > oprofile_max_interval)
179 oprofile_hw_interval = oprofile_max_interval;
180 else
181 oprofile_hw_interval = val;
182
183 return count;
184}
185
186static const struct file_operations hw_interval_fops = {
187 .read = hw_interval_read,
188 .write = hw_interval_write,
189};
190
191/*
192 * File ops used for:
193 * /dev/oprofile/0/event
194 * Only a single event with number 0 is supported with this counter.
195 *
196 * /dev/oprofile/0/unit_mask
197 * This is a dummy file needed by the user space tools.
198 * No value other than 0 is accepted or returned.
199 */
200
201static ssize_t hwsampler_zero_read(struct file *file, char __user *buf,
202 size_t count, loff_t *offset)
203{
204 return oprofilefs_ulong_to_user(0, buf, count, offset);
205}
206
207static ssize_t hwsampler_zero_write(struct file *file, char const __user *buf,
208 size_t count, loff_t *offset)
209{
210 unsigned long val;
211 int retval;
212
213 if (*offset)
214 return -EINVAL;
215
216 retval = oprofilefs_ulong_from_user(&val, buf, count);
217 if (retval)
218 return retval;
219 if (val != 0)
220 return -EINVAL;
221 return count;
222}
223
224static const struct file_operations zero_fops = {
225 .read = hwsampler_zero_read,
226 .write = hwsampler_zero_write,
227};
228
229/* /dev/oprofile/0/kernel file ops. */
230
231static ssize_t hwsampler_kernel_read(struct file *file, char __user *buf,
232 size_t count, loff_t *offset)
233{
234 return oprofilefs_ulong_to_user(counter_config.kernel,
235 buf, count, offset);
236}
237
238static ssize_t hwsampler_kernel_write(struct file *file, char const __user *buf,
239 size_t count, loff_t *offset)
240{
241 unsigned long val;
242 int retval;
243
244 if (*offset)
245 return -EINVAL;
246
247 retval = oprofilefs_ulong_from_user(&val, buf, count);
248 if (retval)
249 return retval;
250
251 if (val != 0 && val != 1)
252 return -EINVAL;
253
254 counter_config.kernel = val;
255
256 return count;
257}
258
259static const struct file_operations kernel_fops = {
260 .read = hwsampler_kernel_read,
261 .write = hwsampler_kernel_write,
262};
263
264/* /dev/oprofile/0/user file ops. */
265
266static ssize_t hwsampler_user_read(struct file *file, char __user *buf,
267 size_t count, loff_t *offset)
268{
269 return oprofilefs_ulong_to_user(counter_config.user,
270 buf, count, offset);
271}
272
273static ssize_t hwsampler_user_write(struct file *file, char const __user *buf,
274 size_t count, loff_t *offset)
275{
276 unsigned long val;
277 int retval;
278
279 if (*offset)
280 return -EINVAL;
281
282 retval = oprofilefs_ulong_from_user(&val, buf, count);
283 if (retval)
284 return retval;
285
286 if (val != 0 && val != 1)
287 return -EINVAL;
288
289 counter_config.user = val;
290
291 return count;
292}
293
294static const struct file_operations user_fops = {
295 .read = hwsampler_user_read,
296 .write = hwsampler_user_write,
297};
298
299
300/*
301 * File ops used for: /dev/oprofile/timer/enabled
302 * The value always has to be the inverted value of hwsampler_enabled. So
303 * no separate variable is created. That way we do not need locking.
304 */
305
306static ssize_t timer_enabled_read(struct file *file, char __user *buf,
307 size_t count, loff_t *offset)
308{
309 return oprofilefs_ulong_to_user(!hwsampler_enabled, buf, count, offset);
310}
311
312static ssize_t timer_enabled_write(struct file *file, char const __user *buf,
313 size_t count, loff_t *offset)
314{
315 unsigned long val;
316 int retval;
317
318 if (*offset)
319 return -EINVAL;
320
321 retval = oprofilefs_ulong_from_user(&val, buf, count);
322 if (retval)
323 return retval;
324
325 if (val != 0 && val != 1)
326 return -EINVAL;
327
328 /* Timer cannot be disabled without having hardware sampling. */
329 if (val == 0 && !hwsampler_available)
330 return -EINVAL;
331
332 if (oprofile_started)
333 /*
334 * safe to do without locking as we set
335 * hwsampler_running in start() when start_mutex is
336 * held
337 */
338 return -EBUSY;
339
340 hwsampler_enabled = !val;
341
342 return count;
343}
344
345static const struct file_operations timer_enabled_fops = {
346 .read = timer_enabled_read,
347 .write = timer_enabled_write,
348};
349
350
112static int oprofile_create_hwsampling_files(struct super_block *sb, 351static int oprofile_create_hwsampling_files(struct super_block *sb,
113 struct dentry *root) 352 struct dentry *root)
114{ 353{
115 struct dentry *hw_dir; 354 struct dentry *dir;
355
356 dir = oprofilefs_mkdir(sb, root, "timer");
357 if (!dir)
358 return -EINVAL;
359
360 oprofilefs_create_file(sb, dir, "enabled", &timer_enabled_fops);
361
362 if (!hwsampler_available)
363 return 0;
116 364
117 /* reinitialize default values */ 365 /* reinitialize default values */
118 hwsampler_file = 1; 366 hwsampler_enabled = 1;
367 counter_config.kernel = 1;
368 counter_config.user = 1;
119 369
120 hw_dir = oprofilefs_mkdir(sb, root, "hwsampling"); 370 if (!force_cpu_type) {
121 if (!hw_dir) 371 /*
122 return -EINVAL; 372 * Create the counter file system. A single virtual
373 * counter is created which can be used to
374 * enable/disable hardware sampling dynamically from
375 * user space. The user space will configure a single
376 * counter with a single event. The value of 'event'
377 * and 'unit_mask' are not evaluated by the kernel code
378 * and can only be set to 0.
379 */
380
381 dir = oprofilefs_mkdir(sb, root, "0");
382 if (!dir)
383 return -EINVAL;
123 384
124 oprofilefs_create_file(sb, hw_dir, "hwsampler", &hwsampler_fops); 385 oprofilefs_create_file(sb, dir, "enabled", &hwsampler_fops);
125 oprofilefs_create_ulong(sb, hw_dir, "hw_interval", 386 oprofilefs_create_file(sb, dir, "event", &zero_fops);
126 &oprofile_hw_interval); 387 oprofilefs_create_file(sb, dir, "count", &hw_interval_fops);
127 oprofilefs_create_ro_ulong(sb, hw_dir, "hw_min_interval", 388 oprofilefs_create_file(sb, dir, "unit_mask", &zero_fops);
128 &oprofile_min_interval); 389 oprofilefs_create_file(sb, dir, "kernel", &kernel_fops);
129 oprofilefs_create_ro_ulong(sb, hw_dir, "hw_max_interval", 390 oprofilefs_create_file(sb, dir, "user", &user_fops);
130 &oprofile_max_interval); 391 oprofilefs_create_ulong(sb, dir, "hw_sdbt_blocks",
131 oprofilefs_create_ulong(sb, hw_dir, "hw_sdbt_blocks", 392 &oprofile_sdbt_blocks);
132 &oprofile_sdbt_blocks);
133 393
394 } else {
395 /*
396 * Hardware sampling can be used but the cpu_type is
397 * forced to timer in order to deal with legacy user
398 * space tools. The /dev/oprofile/hwsampling fs is
399 * provided in that case.
400 */
401 dir = oprofilefs_mkdir(sb, root, "hwsampling");
402 if (!dir)
403 return -EINVAL;
404
405 oprofilefs_create_file(sb, dir, "hwsampler",
406 &hwsampler_fops);
407 oprofilefs_create_file(sb, dir, "hw_interval",
408 &hw_interval_fops);
409 oprofilefs_create_ro_ulong(sb, dir, "hw_min_interval",
410 &oprofile_min_interval);
411 oprofilefs_create_ro_ulong(sb, dir, "hw_max_interval",
412 &oprofile_max_interval);
413 oprofilefs_create_ulong(sb, dir, "hw_sdbt_blocks",
414 &oprofile_sdbt_blocks);
415 }
134 return 0; 416 return 0;
135} 417}
136 418
137static int oprofile_hwsampler_init(struct oprofile_operations *ops) 419static int oprofile_hwsampler_init(struct oprofile_operations *ops)
138{ 420{
421 /*
422 * Initialize the timer mode infrastructure as well in order
423 * to be able to switch back dynamically. oprofile_timer_init
424 * is not supposed to fail.
425 */
426 if (oprofile_timer_init(ops))
427 BUG();
428
429 memcpy(&timer_ops, ops, sizeof(timer_ops));
430 ops->create_files = oprofile_create_hwsampling_files;
431
432 /*
433 * If the user space tools do not support newer cpu types,
434 * the force_cpu_type module parameter
435 * can be used to always return "timer" as cpu type.
436 */
437 if (force_cpu_type != timer) {
438 struct cpuid id;
439
440 get_cpu_id(&id);
441
442 switch (id.machine) {
443 case 0x2097: case 0x2098: ops->cpu_type = "s390/z10"; break;
444 case 0x2817: case 0x2818: ops->cpu_type = "s390/z196"; break;
445 default: return -ENODEV;
446 }
447 }
448
139 if (hwsampler_setup()) 449 if (hwsampler_setup())
140 return -ENODEV; 450 return -ENODEV;
141 451
142 /* 452 /*
143 * create hwsampler files only if hwsampler_setup() succeeds. 453 * Query the range for the sampling interval from the
454 * hardware.
144 */ 455 */
145 oprofile_min_interval = hwsampler_query_min_interval(); 456 oprofile_min_interval = hwsampler_query_min_interval();
146 if (oprofile_min_interval == 0) 457 if (oprofile_min_interval == 0)
@@ -155,23 +466,17 @@ static int oprofile_hwsampler_init(struct oprofile_operations *ops)
155 if (oprofile_hw_interval > oprofile_max_interval) 466 if (oprofile_hw_interval > oprofile_max_interval)
156 oprofile_hw_interval = oprofile_max_interval; 467 oprofile_hw_interval = oprofile_max_interval;
157 468
158 if (oprofile_timer_init(ops)) 469 printk(KERN_INFO "oprofile: System z hardware sampling "
159 return -ENODEV; 470 "facility found.\n");
160
161 printk(KERN_INFO "oprofile: using hardware sampling\n");
162
163 memcpy(&timer_ops, ops, sizeof(timer_ops));
164 471
165 ops->start = oprofile_hwsampler_start; 472 ops->start = oprofile_hwsampler_start;
166 ops->stop = oprofile_hwsampler_stop; 473 ops->stop = oprofile_hwsampler_stop;
167 ops->create_files = oprofile_create_hwsampling_files;
168 474
169 return 0; 475 return 0;
170} 476}
171 477
172static void oprofile_hwsampler_exit(void) 478static void oprofile_hwsampler_exit(void)
173{ 479{
174 oprofile_timer_exit();
175 hwsampler_shutdown(); 480 hwsampler_shutdown();
176} 481}
177 482
@@ -182,7 +487,15 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
182 ops->backtrace = s390_backtrace; 487 ops->backtrace = s390_backtrace;
183 488
184#ifdef CONFIG_64BIT 489#ifdef CONFIG_64BIT
185 return oprofile_hwsampler_init(ops); 490
491 /*
492 * -ENODEV is not reported to the caller. The module itself
493 * will use the timer mode sampling as fallback and this is
494 * always available.
495 */
496 hwsampler_available = oprofile_hwsampler_init(ops) == 0;
497
498 return 0;
186#else 499#else
187 return -ENODEV; 500 return -ENODEV;
188#endif 501#endif
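Note: taken together, the init.c rewrite yields the following oprofilefs layout in the default case (hardware sampling available, cpu_type not forced):

    /*
     * /dev/oprofile/timer/enabled  - always the inverse of hwsampler_enabled
     * /dev/oprofile/0/enabled      - hwsampler on/off (one virtual counter)
     * /dev/oprofile/0/event        - dummy; only 0 accepted
     * /dev/oprofile/0/count        - interval, clamped to the hw-reported
     *                                [oprofile_min_interval,
     *                                 oprofile_max_interval] range
     * /dev/oprofile/0/unit_mask    - dummy; only 0 accepted
     * /dev/oprofile/0/kernel       - 0/1: include kernel-space samples
     * /dev/oprofile/0/user         - 0/1: include user-space samples
     *
     * With cpu_type=timer forced, the legacy /dev/oprofile/hwsampling/
     * directory (hwsampler, hw_interval, hw_min_interval,
     * hw_max_interval, hw_sdbt_blocks) is created instead of the "0"
     * counter directory, for older user-space tools.
     */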
diff --git a/arch/s390/oprofile/op_counter.h b/arch/s390/oprofile/op_counter.h
new file mode 100644
index 000000000000..1a8d3ca09014
--- /dev/null
+++ b/arch/s390/oprofile/op_counter.h
@@ -0,0 +1,23 @@
1/**
2 * arch/s390/oprofile/op_counter.h
3 *
4 * Copyright (C) 2011 IBM Deutschland Entwicklung GmbH, IBM Corporation
5 * Author(s): Andreas Krebbel (krebbel@linux.vnet.ibm.com)
6 *
7 * @remark Copyright 2011 OProfile authors
8 */
9
10#ifndef OP_COUNTER_H
11#define OP_COUNTER_H
12
13struct op_counter_config {
14 /* `enabled' maps to the hwsampler_file variable. */
15 /* `count' maps to the oprofile_hw_interval variable. */
16 /* `event' and `unit_mask' are unused. */
17 unsigned long kernel;
18 unsigned long user;
19};
20
21extern struct op_counter_config counter_config;
22
23#endif /* OP_COUNTER_H */
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index df169e84db4e..8b0c9464aa9d 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -4,6 +4,9 @@ config SCORE
4 def_bool y 4 def_bool y
5 select HAVE_GENERIC_HARDIRQS 5 select HAVE_GENERIC_HARDIRQS
6 select GENERIC_IRQ_SHOW 6 select GENERIC_IRQ_SHOW
7 select HAVE_MEMBLOCK
8 select HAVE_MEMBLOCK_NODE_MAP
9 select ARCH_DISCARD_MEMBLOCK
7 10
8choice 11choice
9 prompt "System type" 12 prompt "System type"
@@ -60,9 +63,6 @@ config 32BIT
60config ARCH_FLATMEM_ENABLE 63config ARCH_FLATMEM_ENABLE
61 def_bool y 64 def_bool y
62 65
63config ARCH_POPULATES_NODE_MAP
64 def_bool y
65
66source "mm/Kconfig" 66source "mm/Kconfig"
67 67
68config MEMORY_START 68config MEMORY_START
diff --git a/arch/score/kernel/setup.c b/arch/score/kernel/setup.c
index 6f898c057878..b48459afefdd 100644
--- a/arch/score/kernel/setup.c
+++ b/arch/score/kernel/setup.c
@@ -26,6 +26,7 @@
26#include <linux/bootmem.h> 26#include <linux/bootmem.h>
27#include <linux/initrd.h> 27#include <linux/initrd.h>
28#include <linux/ioport.h> 28#include <linux/ioport.h>
29#include <linux/memblock.h>
29#include <linux/mm.h> 30#include <linux/mm.h>
30#include <linux/seq_file.h> 31#include <linux/seq_file.h>
31#include <linux/screen_info.h> 32#include <linux/screen_info.h>
@@ -54,7 +55,8 @@ static void __init bootmem_init(void)
54 /* Initialize the boot-time allocator with low memory only. */ 55 /* Initialize the boot-time allocator with low memory only. */
55 bootmap_size = init_bootmem_node(NODE_DATA(0), start_pfn, 56 bootmap_size = init_bootmem_node(NODE_DATA(0), start_pfn,
56 min_low_pfn, max_low_pfn); 57 min_low_pfn, max_low_pfn);
57 add_active_range(0, min_low_pfn, max_low_pfn); 58 memblock_add_node(PFN_PHYS(min_low_pfn),
59 PFN_PHYS(max_low_pfn - min_low_pfn), 0);
58 60
59 free_bootmem(PFN_PHYS(start_pfn), 61 free_bootmem(PFN_PHYS(start_pfn),
60 (max_low_pfn - start_pfn) << PAGE_SHIFT); 62 (max_low_pfn - start_pfn) << PAGE_SHIFT);
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 5629e2099130..47a2f1c2cb0d 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -4,6 +4,7 @@ config SUPERH
4 select CLKDEV_LOOKUP 4 select CLKDEV_LOOKUP
5 select HAVE_IDE if HAS_IOPORT 5 select HAVE_IDE if HAS_IOPORT
6 select HAVE_MEMBLOCK 6 select HAVE_MEMBLOCK
7 select HAVE_MEMBLOCK_NODE_MAP
7 select HAVE_OPROFILE 8 select HAVE_OPROFILE
8 select HAVE_GENERIC_DMA_COHERENT 9 select HAVE_GENERIC_DMA_COHERENT
9 select HAVE_ARCH_TRACEHOOK 10 select HAVE_ARCH_TRACEHOOK
diff --git a/arch/sh/include/asm/memblock.h b/arch/sh/include/asm/memblock.h
deleted file mode 100644
index e87063fad2ea..000000000000
--- a/arch/sh/include/asm/memblock.h
+++ /dev/null
@@ -1,4 +0,0 @@
1#ifndef __ASM_SH_MEMBLOCK_H
2#define __ASM_SH_MEMBLOCK_H
3
4#endif /* __ASM_SH_MEMBLOCK_H */
diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
index db4ecd731a00..406508d4ce74 100644
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -89,7 +89,8 @@ void cpu_idle(void)
89 89
90 /* endless idle loop with no priority at all */ 90 /* endless idle loop with no priority at all */
91 while (1) { 91 while (1) {
92 tick_nohz_stop_sched_tick(1); 92 tick_nohz_idle_enter();
93 rcu_idle_enter();
93 94
94 while (!need_resched()) { 95 while (!need_resched()) {
95 check_pgt_cache(); 96 check_pgt_cache();
@@ -111,7 +112,8 @@ void cpu_idle(void)
111 start_critical_timings(); 112 start_critical_timings();
112 } 113 }
113 114
114 tick_nohz_restart_sched_tick(); 115 rcu_idle_exit();
116 tick_nohz_idle_exit();
115 preempt_enable_no_resched(); 117 preempt_enable_no_resched();
116 schedule(); 118 schedule();
117 preempt_disable(); 119 preempt_disable();
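
The idle-loop edits here (and in the sparc, tile, um and unicore32 hunks below) follow one template: tick_nohz_stop_sched_tick(1) used to both stop the tick and imply an RCU-idle CPU; that is now split into explicit enter/exit pairs so RCU's idle window nests inside the tickless window. A sketch of the template (arch_idle_wait() is a hypothetical stand-in for the architecture's low-power wait):

	while (1) {
		tick_nohz_idle_enter();		/* stop the periodic tick */
		rcu_idle_enter();		/* RCU may ignore this CPU now */

		while (!need_resched())
			arch_idle_wait();

		rcu_idle_exit();		/* CPU visible to RCU again */
		tick_nohz_idle_exit();		/* restart the tick */

		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
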
diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c
index c5a33f007f88..9fea49f6e667 100644
--- a/arch/sh/kernel/machine_kexec.c
+++ b/arch/sh/kernel/machine_kexec.c
@@ -157,9 +157,6 @@ void __init reserve_crashkernel(void)
157 unsigned long long crash_size, crash_base; 157 unsigned long long crash_size, crash_base;
158 int ret; 158 int ret;
159 159
160 /* this is necessary because of memblock_phys_mem_size() */
161 memblock_analyze();
162
163 ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), 160 ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
164 &crash_size, &crash_base); 161 &crash_size, &crash_base);
165 if (ret == 0 && crash_size > 0) { 162 if (ret == 0 && crash_size > 0) {
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 1a0e946679a4..7b57bf1dc855 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -230,7 +230,8 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn,
230 pmb_bolt_mapping((unsigned long)__va(start), start, end - start, 230 pmb_bolt_mapping((unsigned long)__va(start), start, end - start,
231 PAGE_KERNEL); 231 PAGE_KERNEL);
232 232
233 add_active_range(nid, start_pfn, end_pfn); 233 memblock_set_node(PFN_PHYS(start_pfn),
234 PFN_PHYS(end_pfn - start_pfn), nid);
234} 235}
235 236
236void __init __weak plat_early_device_setup(void) 237void __init __weak plat_early_device_setup(void)
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index c3e61b366493..cb8f9920f4dd 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -143,9 +143,6 @@ config MAX_ACTIVE_REGIONS
143 CPU_SUBTYPE_SH7785) 143 CPU_SUBTYPE_SH7785)
144 default "1" 144 default "1"
145 145
146config ARCH_POPULATES_NODE_MAP
147 def_bool y
148
149config ARCH_SELECT_MEMORY_MODEL 146config ARCH_SELECT_MEMORY_MODEL
150 def_bool y 147 def_bool y
151 148
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 939ca0f356f6..82cc576fab15 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -324,7 +324,6 @@ void __init paging_init(void)
324 unsigned long vaddr, end; 324 unsigned long vaddr, end;
325 int nid; 325 int nid;
326 326
327 memblock_init();
328 sh_mv.mv_mem_init(); 327 sh_mv.mv_mem_init();
329 328
330 early_reserve_mem(); 329 early_reserve_mem();
@@ -337,7 +336,7 @@ void __init paging_init(void)
337 sh_mv.mv_mem_reserve(); 336 sh_mv.mv_mem_reserve();
338 337
339 memblock_enforce_memory_limit(memory_limit); 338 memblock_enforce_memory_limit(memory_limit);
340 memblock_analyze(); 339 memblock_allow_resize();
341 340
342 memblock_dump_all(); 341 memblock_dump_all();
343 342
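
memblock_analyze() used to recalculate totals and turn on array resizing in one go; with memblock now keeping its totals up to date incrementally, only the second half survives under the more honest name memblock_allow_resize(). The expected boot-time ordering, paraphrasing the sh code above (a sketch, not verbatim):

	/* populate memblock ... */
	memblock_add(base, size);
	memblock_reserve(kernel_start, kernel_size);

	/* ... clamp it (mem= handling) ... */
	memblock_enforce_memory_limit(memory_limit);

	/* ... and only then allow the region arrays to be reallocated */
	memblock_allow_resize();
	memblock_dump_all();
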
diff --git a/arch/sh/oprofile/common.c b/arch/sh/oprofile/common.c
index b4c2d2b946dd..e4dd5d5a1115 100644
--- a/arch/sh/oprofile/common.c
+++ b/arch/sh/oprofile/common.c
@@ -49,7 +49,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
49 return oprofile_perf_init(ops); 49 return oprofile_perf_init(ops);
50} 50}
51 51
52void __exit oprofile_arch_exit(void) 52void oprofile_arch_exit(void)
53{ 53{
54 oprofile_perf_exit(); 54 oprofile_perf_exit();
55 kfree(sh_pmu_op_name); 55 kfree(sh_pmu_op_name);
@@ -60,5 +60,5 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
60 ops->backtrace = sh_backtrace; 60 ops->backtrace = sh_backtrace;
61 return -ENODEV; 61 return -ENODEV;
62} 62}
63void __exit oprofile_arch_exit(void) {} 63void oprofile_arch_exit(void) {}
64#endif /* CONFIG_HW_PERF_EVENTS */ 64#endif /* CONFIG_HW_PERF_EVENTS */
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index f92602e86607..70ae9d81870e 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -43,6 +43,7 @@ config SPARC64
43 select HAVE_KPROBES 43 select HAVE_KPROBES
44 select HAVE_RCU_TABLE_FREE if SMP 44 select HAVE_RCU_TABLE_FREE if SMP
45 select HAVE_MEMBLOCK 45 select HAVE_MEMBLOCK
46 select HAVE_MEMBLOCK_NODE_MAP
46 select HAVE_SYSCALL_WRAPPERS 47 select HAVE_SYSCALL_WRAPPERS
47 select HAVE_DYNAMIC_FTRACE 48 select HAVE_DYNAMIC_FTRACE
48 select HAVE_FTRACE_MCOUNT_RECORD 49 select HAVE_FTRACE_MCOUNT_RECORD
@@ -352,9 +353,6 @@ config NODES_SPAN_OTHER_NODES
352 def_bool y 353 def_bool y
353 depends on NEED_MULTIPLE_NODES 354 depends on NEED_MULTIPLE_NODES
354 355
355config ARCH_POPULATES_NODE_MAP
356 def_bool y if SPARC64
357
358config ARCH_SELECT_MEMORY_MODEL 356config ARCH_SELECT_MEMORY_MODEL
359 def_bool y if SPARC64 357 def_bool y if SPARC64
360 358
diff --git a/arch/sparc/include/asm/memblock.h b/arch/sparc/include/asm/memblock.h
deleted file mode 100644
index c67b047ef85e..000000000000
--- a/arch/sparc/include/asm/memblock.h
+++ /dev/null
@@ -1,8 +0,0 @@
1#ifndef _SPARC64_MEMBLOCK_H
2#define _SPARC64_MEMBLOCK_H
3
4#include <asm/oplib.h>
5
6#define MEMBLOCK_DBG(fmt...) prom_printf(fmt)
7
8#endif /* !(_SPARC64_MEMBLOCK_H) */
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 3739a06a76cb..39d8b05201a2 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -95,12 +95,14 @@ void cpu_idle(void)
95 set_thread_flag(TIF_POLLING_NRFLAG); 95 set_thread_flag(TIF_POLLING_NRFLAG);
96 96
97 while(1) { 97 while(1) {
98 tick_nohz_stop_sched_tick(1); 98 tick_nohz_idle_enter();
99 rcu_idle_enter();
99 100
100 while (!need_resched() && !cpu_is_offline(cpu)) 101 while (!need_resched() && !cpu_is_offline(cpu))
101 sparc64_yield(cpu); 102 sparc64_yield(cpu);
102 103
103 tick_nohz_restart_sched_tick(); 104 rcu_idle_exit();
105 tick_nohz_idle_exit();
104 106
105 preempt_enable_no_resched(); 107 preempt_enable_no_resched();
106 108
diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c
index fe1e3fc31bc5..ffb883ddd0f0 100644
--- a/arch/sparc/kernel/setup_32.c
+++ b/arch/sparc/kernel/setup_32.c
@@ -84,7 +84,7 @@ static void prom_sync_me(void)
84 84
85 prom_printf("PROM SYNC COMMAND...\n"); 85 prom_printf("PROM SYNC COMMAND...\n");
86 show_free_areas(0); 86 show_free_areas(0);
87 if(current->pid != 0) { 87 if (!is_idle_task(current)) {
88 local_irq_enable(); 88 local_irq_enable();
89 sys_sync(); 89 sys_sync();
90 local_irq_disable(); 90 local_irq_disable();
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 8e073d802139..b3f5e7dfea51 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -790,7 +790,7 @@ static int find_node(unsigned long addr)
790 return -1; 790 return -1;
791} 791}
792 792
793u64 memblock_nid_range(u64 start, u64 end, int *nid) 793static u64 memblock_nid_range(u64 start, u64 end, int *nid)
794{ 794{
795 *nid = find_node(start); 795 *nid = find_node(start);
796 start += PAGE_SIZE; 796 start += PAGE_SIZE;
@@ -808,7 +808,7 @@ u64 memblock_nid_range(u64 start, u64 end, int *nid)
808 return start; 808 return start;
809} 809}
810#else 810#else
811u64 memblock_nid_range(u64 start, u64 end, int *nid) 811static u64 memblock_nid_range(u64 start, u64 end, int *nid)
812{ 812{
813 *nid = 0; 813 *nid = 0;
814 return end; 814 return end;
@@ -816,7 +816,7 @@ u64 memblock_nid_range(u64 start, u64 end, int *nid)
816#endif 816#endif
817 817
818/* This must be invoked after performing all of the necessary 818/* This must be invoked after performing all of the necessary
819 * add_active_range() calls for 'nid'. We need to be able to get 819 * memblock_set_node() calls for 'nid'. We need to be able to get
820 * correct data from get_pfn_range_for_nid(). 820 * correct data from get_pfn_range_for_nid().
821 */ 821 */
822static void __init allocate_node_data(int nid) 822static void __init allocate_node_data(int nid)
@@ -987,14 +987,11 @@ static void __init add_node_ranges(void)
987 987
988 this_end = memblock_nid_range(start, end, &nid); 988 this_end = memblock_nid_range(start, end, &nid);
989 989
990 numadbg("Adding active range nid[%d] " 990 numadbg("Setting memblock NUMA node nid[%d] "
991 "start[%lx] end[%lx]\n", 991 "start[%lx] end[%lx]\n",
992 nid, start, this_end); 992 nid, start, this_end);
993 993
994 add_active_range(nid, 994 memblock_set_node(start, this_end - start, nid);
995 start >> PAGE_SHIFT,
996 this_end >> PAGE_SHIFT);
997
998 start = this_end; 995 start = this_end;
999 } 996 }
1000 } 997 }
@@ -1282,7 +1279,6 @@ static void __init bootmem_init_nonnuma(void)
1282{ 1279{
1283 unsigned long top_of_ram = memblock_end_of_DRAM(); 1280 unsigned long top_of_ram = memblock_end_of_DRAM();
1284 unsigned long total_ram = memblock_phys_mem_size(); 1281 unsigned long total_ram = memblock_phys_mem_size();
1285 struct memblock_region *reg;
1286 1282
1287 numadbg("bootmem_init_nonnuma()\n"); 1283 numadbg("bootmem_init_nonnuma()\n");
1288 1284
@@ -1292,20 +1288,8 @@ static void __init bootmem_init_nonnuma(void)
1292 (top_of_ram - total_ram) >> 20); 1288 (top_of_ram - total_ram) >> 20);
1293 1289
1294 init_node_masks_nonnuma(); 1290 init_node_masks_nonnuma();
1295 1291 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
1296 for_each_memblock(memory, reg) {
1297 unsigned long start_pfn, end_pfn;
1298
1299 if (!reg->size)
1300 continue;
1301
1302 start_pfn = memblock_region_memory_base_pfn(reg);
1303 end_pfn = memblock_region_memory_end_pfn(reg);
1304 add_active_range(0, start_pfn, end_pfn);
1305 }
1306
1307 allocate_node_data(0); 1292 allocate_node_data(0);
1308
1309 node_set_online(0); 1293 node_set_online(0);
1310} 1294}
1311 1295
@@ -1769,8 +1753,6 @@ void __init paging_init(void)
1769 sun4v_ktsb_init(); 1753 sun4v_ktsb_init();
1770 } 1754 }
1771 1755
1772 memblock_init();
1773
1774 /* Find available physical memory... 1756 /* Find available physical memory...
1775 * 1757 *
1776 * Read it twice in order to work around a bug in openfirmware. 1758 * Read it twice in order to work around a bug in openfirmware.
@@ -1796,7 +1778,7 @@ void __init paging_init(void)
1796 1778
1797 memblock_enforce_memory_limit(cmdline_memory_size); 1779 memblock_enforce_memory_limit(cmdline_memory_size);
1798 1780
1799 memblock_analyze(); 1781 memblock_allow_resize();
1800 memblock_dump_all(); 1782 memblock_dump_all();
1801 1783
1802 set_bit(0, mmu_context_bmap); 1784 set_bit(0, mmu_context_bmap);
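
In the non-NUMA path above, the per-region loop is gone because memblock_set_node() clamps the requested range to the memory regions that actually exist, so one call covering [0, ULLONG_MAX) tags everything with node 0. The removed loop was, in effect, equivalent to this hedged restatement:

	struct memblock_region *reg;

	for_each_memblock(memory, reg)
		memblock_set_node(reg->base, reg->size, 0);

	/* ... which the single clamped call now does in one step: */
	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
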
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 9c45d8bbdf57..4c1ac6e5347a 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -85,7 +85,8 @@ void cpu_idle(void)
85 85
86 /* endless idle loop with no priority at all */ 86 /* endless idle loop with no priority at all */
87 while (1) { 87 while (1) {
88 tick_nohz_stop_sched_tick(1); 88 tick_nohz_idle_enter();
89 rcu_idle_enter();
89 while (!need_resched()) { 90 while (!need_resched()) {
90 if (cpu_is_offline(cpu)) 91 if (cpu_is_offline(cpu))
91 BUG(); /* no HOTPLUG_CPU */ 92 BUG(); /* no HOTPLUG_CPU */
@@ -105,7 +106,8 @@ void cpu_idle(void)
105 local_irq_enable(); 106 local_irq_enable();
106 current_thread_info()->status |= TS_POLLING; 107 current_thread_info()->status |= TS_POLLING;
107 } 108 }
108 tick_nohz_restart_sched_tick(); 109 rcu_idle_exit();
110 tick_nohz_idle_exit();
109 preempt_enable_no_resched(); 111 preempt_enable_no_resched();
110 schedule(); 112 schedule();
111 preempt_disable(); 113 preempt_disable();
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 25b7b90fd620..c1eaaa1fcc20 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -54,7 +54,7 @@ static noinline void force_sig_info_fault(const char *type, int si_signo,
54 if (unlikely(tsk->pid < 2)) { 54 if (unlikely(tsk->pid < 2)) {
55 panic("Signal %d (code %d) at %#lx sent to %s!", 55 panic("Signal %d (code %d) at %#lx sent to %s!",
56 si_signo, si_code & 0xffff, address, 56 si_signo, si_code & 0xffff, address,
57 tsk->pid ? "init" : "the idle task"); 57 is_idle_task(tsk) ? "the idle task" : "init");
58 } 58 }
59 59
60 info.si_signo = si_signo; 60 info.si_signo = si_signo;
@@ -515,7 +515,7 @@ no_context:
515 515
516 if (unlikely(tsk->pid < 2)) { 516 if (unlikely(tsk->pid < 2)) {
517 panic("Kernel page fault running %s!", 517 panic("Kernel page fault running %s!",
518 tsk->pid ? "init" : "the idle task"); 518 is_idle_task(tsk) ? "the idle task" : "init");
519 } 519 }
520 520
521 /* 521 /*
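
Both tile hunks replace open-coded PID checks with is_idle_task(), which names the intent. Note the string operands swap as well: tsk->pid selects "init" when true, while is_idle_task() selects "the idle task" when true, so the ternary arms trade places and behavior is unchanged. At the time the helper was approximately this (hedged recollection of <linux/sched.h>):

static inline bool is_idle_task(const struct task_struct *p)
{
	return p->pid == 0;
}
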
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index c5338351aecd..69f24905abdc 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -246,10 +246,12 @@ void default_idle(void)
246 if (need_resched()) 246 if (need_resched())
247 schedule(); 247 schedule();
248 248
249 tick_nohz_stop_sched_tick(1); 249 tick_nohz_idle_enter();
250 rcu_idle_enter();
250 nsecs = disable_timer(); 251 nsecs = disable_timer();
251 idle_sleep(nsecs); 252 idle_sleep(nsecs);
252 tick_nohz_restart_sched_tick(); 253 rcu_idle_exit();
254 tick_nohz_idle_exit();
253 } 255 }
254} 256}
255 257
diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c
index a08d9fab81f2..82a6e22f1f35 100644
--- a/arch/um/kernel/time.c
+++ b/arch/um/kernel/time.c
@@ -75,8 +75,6 @@ static struct clocksource itimer_clocksource = {
75 .rating = 300, 75 .rating = 300,
76 .read = itimer_read, 76 .read = itimer_read,
77 .mask = CLOCKSOURCE_MASK(64), 77 .mask = CLOCKSOURCE_MASK(64),
78 .mult = 1000,
79 .shift = 0,
80 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 78 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
81}; 79};
82 80
@@ -94,9 +92,9 @@ static void __init setup_itimer(void)
94 clockevent_delta2ns(60 * HZ, &itimer_clockevent); 92 clockevent_delta2ns(60 * HZ, &itimer_clockevent);
95 itimer_clockevent.min_delta_ns = 93 itimer_clockevent.min_delta_ns =
96 clockevent_delta2ns(1, &itimer_clockevent); 94 clockevent_delta2ns(1, &itimer_clockevent);
97 err = clocksource_register(&itimer_clocksource); 95 err = clocksource_register_hz(&itimer_clocksource, USEC_PER_SEC);
98 if (err) { 96 if (err) {
99 printk(KERN_ERR "clocksource_register returned %d\n", err); 97 printk(KERN_ERR "clocksource_register_hz returned %d\n", err);
100 return; 98 return;
101 } 99 }
102 clockevents_register_device(&itimer_clockevent); 100 clockevents_register_device(&itimer_clockevent);
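
The deleted mult=1000, shift=0 pair encoded "one tick is 1000 ns", i.e. a 1 MHz counter; clocksource_register_hz(cs, USEC_PER_SEC) hands that same frequency to the core, which derives a higher-precision (mult, shift) satisfying ns = (cycles * mult) >> shift. A worked userspace check of the arithmetic (the shift of 20 is only an example of what the core might pick):

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	const uint32_t shift = 20;
	const uint64_t hz = 1000000;	/* USEC_PER_SEC */
	const uint64_t mult = (UINT64_C(1000000000) << shift) / hz;
	const uint64_t cycles = 12345;

	/* Expect exactly cycles * 1000 for a 1 MHz clock. */
	printf("mult=%" PRIu64 " ns=%" PRIu64 "\n",
	       mult, (cycles * mult) >> shift);
	return 0;
}
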
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
index ba401df971ed..52edc2b62873 100644
--- a/arch/unicore32/kernel/process.c
+++ b/arch/unicore32/kernel/process.c
@@ -55,7 +55,8 @@ void cpu_idle(void)
55{ 55{
56 /* endless idle loop with no priority at all */ 56 /* endless idle loop with no priority at all */
57 while (1) { 57 while (1) {
58 tick_nohz_stop_sched_tick(1); 58 tick_nohz_idle_enter();
59 rcu_idle_enter();
59 while (!need_resched()) { 60 while (!need_resched()) {
60 local_irq_disable(); 61 local_irq_disable();
61 stop_critical_timings(); 62 stop_critical_timings();
@@ -63,7 +64,8 @@ void cpu_idle(void)
63 local_irq_enable(); 64 local_irq_enable();
64 start_critical_timings(); 65 start_critical_timings();
65 } 66 }
66 tick_nohz_restart_sched_tick(); 67 rcu_idle_exit();
68 tick_nohz_idle_exit();
67 preempt_enable_no_resched(); 69 preempt_enable_no_resched();
68 schedule(); 70 schedule();
69 preempt_disable(); 71 preempt_disable();
diff --git a/arch/unicore32/kernel/setup.c b/arch/unicore32/kernel/setup.c
index 471b6bca8da4..673d7a89d8ff 100644
--- a/arch/unicore32/kernel/setup.c
+++ b/arch/unicore32/kernel/setup.c
@@ -37,6 +37,7 @@
37#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
38#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
39#include <asm/traps.h> 39#include <asm/traps.h>
40#include <asm/memblock.h>
40 41
41#include "setup.h" 42#include "setup.h"
42 43
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
index 3b379cddbc64..de186bde8975 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -26,6 +26,7 @@
26#include <asm/setup.h> 26#include <asm/setup.h>
27#include <asm/sizes.h> 27#include <asm/sizes.h>
28#include <asm/tlb.h> 28#include <asm/tlb.h>
29#include <asm/memblock.h>
29#include <mach/map.h> 30#include <mach/map.h>
30 31
31#include "mm.h" 32#include "mm.h"
@@ -245,7 +246,6 @@ void __init uc32_memblock_init(struct meminfo *mi)
245 sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), 246 sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]),
246 meminfo_cmp, NULL); 247 meminfo_cmp, NULL);
247 248
248 memblock_init();
249 for (i = 0; i < mi->nr_banks; i++) 249 for (i = 0; i < mi->nr_banks; i++)
250 memblock_add(mi->bank[i].start, mi->bank[i].size); 250 memblock_add(mi->bank[i].start, mi->bank[i].size);
251 251
@@ -264,7 +264,7 @@ void __init uc32_memblock_init(struct meminfo *mi)
264 264
265 uc32_mm_memblock_reserve(); 265 uc32_mm_memblock_reserve();
266 266
267 memblock_analyze(); 267 memblock_allow_resize();
268 memblock_dump_all(); 268 memblock_dump_all();
269} 269}
270 270
diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c
index 3e5c3e5a0b45..43c20b40e444 100644
--- a/arch/unicore32/mm/mmu.c
+++ b/arch/unicore32/mm/mmu.c
@@ -25,6 +25,7 @@
25#include <asm/setup.h> 25#include <asm/setup.h>
26#include <asm/sizes.h> 26#include <asm/sizes.h>
27#include <asm/tlb.h> 27#include <asm/tlb.h>
28#include <asm/memblock.h>
28 29
29#include <mach/map.h> 30#include <mach/map.h>
30 31
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index efb42949cc09..5731eb70e0a0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -26,6 +26,8 @@ config X86
26 select HAVE_IOREMAP_PROT 26 select HAVE_IOREMAP_PROT
27 select HAVE_KPROBES 27 select HAVE_KPROBES
28 select HAVE_MEMBLOCK 28 select HAVE_MEMBLOCK
29 select HAVE_MEMBLOCK_NODE_MAP
30 select ARCH_DISCARD_MEMBLOCK
29 select ARCH_WANT_OPTIONAL_GPIOLIB 31 select ARCH_WANT_OPTIONAL_GPIOLIB
30 select ARCH_WANT_FRAME_POINTERS 32 select ARCH_WANT_FRAME_POINTERS
31 select HAVE_DMA_ATTRS 33 select HAVE_DMA_ATTRS
@@ -204,9 +206,6 @@ config ZONE_DMA32
204 bool 206 bool
205 default X86_64 207 default X86_64
206 208
207config ARCH_POPULATES_NODE_MAP
208 def_bool y
209
210config AUDIT_ARCH 209config AUDIT_ARCH
211 bool 210 bool
212 default X86_64 211 default X86_64
@@ -343,6 +342,7 @@ config X86_EXTENDED_PLATFORM
343 342
344 If you enable this option then you'll be able to select support 343 If you enable this option then you'll be able to select support
345 for the following (non-PC) 64 bit x86 platforms: 344 for the following (non-PC) 64 bit x86 platforms:
345 Numascale NumaChip
346 ScaleMP vSMP 346 ScaleMP vSMP
347 SGI Ultraviolet 347 SGI Ultraviolet
348 348
@@ -351,6 +351,18 @@ config X86_EXTENDED_PLATFORM
351endif 351endif
352# This is an alphabetically sorted list of 64 bit extended platforms 352# This is an alphabetically sorted list of 64 bit extended platforms
353# Please maintain the alphabetic order if and when there are additions 353# Please maintain the alphabetic order if and when there are additions
354config X86_NUMACHIP
355 bool "Numascale NumaChip"
356 depends on X86_64
357 depends on X86_EXTENDED_PLATFORM
358 depends on NUMA
359 depends on SMP
360 depends on X86_X2APIC
361 depends on !EDAC_AMD64
362 ---help---
363 Adds support for Numascale NumaChip large-SMP systems. Needed to
364 enable more than ~168 cores.
365 If you don't have one of these, you should say N here.
354 366
355config X86_VSMP 367config X86_VSMP
356 bool "ScaleMP vSMP" 368 bool "ScaleMP vSMP"
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a6253ec1b284..3e274564f6bf 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -134,7 +134,7 @@ ENTRY(ia32_sysenter_target)
134 CFI_REL_OFFSET rsp,0 134 CFI_REL_OFFSET rsp,0
135 pushfq_cfi 135 pushfq_cfi
136 /*CFI_REL_OFFSET rflags,0*/ 136 /*CFI_REL_OFFSET rflags,0*/
137 movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d 137 movl TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d
138 CFI_REGISTER rip,r10 138 CFI_REGISTER rip,r10
139 pushq_cfi $__USER32_CS 139 pushq_cfi $__USER32_CS
140 /*CFI_REL_OFFSET cs,0*/ 140 /*CFI_REL_OFFSET cs,0*/
@@ -150,9 +150,8 @@ ENTRY(ia32_sysenter_target)
150 .section __ex_table,"a" 150 .section __ex_table,"a"
151 .quad 1b,ia32_badarg 151 .quad 1b,ia32_badarg
152 .previous 152 .previous
153 GET_THREAD_INFO(%r10) 153 orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
154 orl $TS_COMPAT,TI_status(%r10) 154 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
155 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
156 CFI_REMEMBER_STATE 155 CFI_REMEMBER_STATE
157 jnz sysenter_tracesys 156 jnz sysenter_tracesys
158 cmpq $(IA32_NR_syscalls-1),%rax 157 cmpq $(IA32_NR_syscalls-1),%rax
@@ -162,13 +161,12 @@ sysenter_do_call:
162sysenter_dispatch: 161sysenter_dispatch:
163 call *ia32_sys_call_table(,%rax,8) 162 call *ia32_sys_call_table(,%rax,8)
164 movq %rax,RAX-ARGOFFSET(%rsp) 163 movq %rax,RAX-ARGOFFSET(%rsp)
165 GET_THREAD_INFO(%r10)
166 DISABLE_INTERRUPTS(CLBR_NONE) 164 DISABLE_INTERRUPTS(CLBR_NONE)
167 TRACE_IRQS_OFF 165 TRACE_IRQS_OFF
168 testl $_TIF_ALLWORK_MASK,TI_flags(%r10) 166 testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
169 jnz sysexit_audit 167 jnz sysexit_audit
170sysexit_from_sys_call: 168sysexit_from_sys_call:
171 andl $~TS_COMPAT,TI_status(%r10) 169 andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
172 /* clear IF, that popfq doesn't enable interrupts early */ 170 /* clear IF, that popfq doesn't enable interrupts early */
173 andl $~0x200,EFLAGS-R11(%rsp) 171 andl $~0x200,EFLAGS-R11(%rsp)
174 movl RIP-R11(%rsp),%edx /* User %eip */ 172 movl RIP-R11(%rsp),%edx /* User %eip */
@@ -205,7 +203,7 @@ sysexit_from_sys_call:
205 .endm 203 .endm
206 204
207 .macro auditsys_exit exit 205 .macro auditsys_exit exit
208 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) 206 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
209 jnz ia32_ret_from_sys_call 207 jnz ia32_ret_from_sys_call
210 TRACE_IRQS_ON 208 TRACE_IRQS_ON
211 sti 209 sti
@@ -215,12 +213,11 @@ sysexit_from_sys_call:
215 movzbl %al,%edi /* zero-extend that into %edi */ 213 movzbl %al,%edi /* zero-extend that into %edi */
216 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ 214 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
217 call audit_syscall_exit 215 call audit_syscall_exit
218 GET_THREAD_INFO(%r10)
219 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ 216 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
220 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi 217 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
221 cli 218 cli
222 TRACE_IRQS_OFF 219 TRACE_IRQS_OFF
223 testl %edi,TI_flags(%r10) 220 testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
224 jz \exit 221 jz \exit
225 CLEAR_RREGS -ARGOFFSET 222 CLEAR_RREGS -ARGOFFSET
226 jmp int_with_check 223 jmp int_with_check
@@ -238,7 +235,7 @@ sysexit_audit:
238 235
239sysenter_tracesys: 236sysenter_tracesys:
240#ifdef CONFIG_AUDITSYSCALL 237#ifdef CONFIG_AUDITSYSCALL
241 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) 238 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
242 jz sysenter_auditsys 239 jz sysenter_auditsys
243#endif 240#endif
244 SAVE_REST 241 SAVE_REST
@@ -309,9 +306,8 @@ ENTRY(ia32_cstar_target)
309 .section __ex_table,"a" 306 .section __ex_table,"a"
310 .quad 1b,ia32_badarg 307 .quad 1b,ia32_badarg
311 .previous 308 .previous
312 GET_THREAD_INFO(%r10) 309 orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
313 orl $TS_COMPAT,TI_status(%r10) 310 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
314 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
315 CFI_REMEMBER_STATE 311 CFI_REMEMBER_STATE
316 jnz cstar_tracesys 312 jnz cstar_tracesys
317 cmpq $IA32_NR_syscalls-1,%rax 313 cmpq $IA32_NR_syscalls-1,%rax
@@ -321,13 +317,12 @@ cstar_do_call:
321cstar_dispatch: 317cstar_dispatch:
322 call *ia32_sys_call_table(,%rax,8) 318 call *ia32_sys_call_table(,%rax,8)
323 movq %rax,RAX-ARGOFFSET(%rsp) 319 movq %rax,RAX-ARGOFFSET(%rsp)
324 GET_THREAD_INFO(%r10)
325 DISABLE_INTERRUPTS(CLBR_NONE) 320 DISABLE_INTERRUPTS(CLBR_NONE)
326 TRACE_IRQS_OFF 321 TRACE_IRQS_OFF
327 testl $_TIF_ALLWORK_MASK,TI_flags(%r10) 322 testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
328 jnz sysretl_audit 323 jnz sysretl_audit
329sysretl_from_sys_call: 324sysretl_from_sys_call:
330 andl $~TS_COMPAT,TI_status(%r10) 325 andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
331 RESTORE_ARGS 0,-ARG_SKIP,0,0,0 326 RESTORE_ARGS 0,-ARG_SKIP,0,0,0
332 movl RIP-ARGOFFSET(%rsp),%ecx 327 movl RIP-ARGOFFSET(%rsp),%ecx
333 CFI_REGISTER rip,rcx 328 CFI_REGISTER rip,rcx
@@ -355,7 +350,7 @@ sysretl_audit:
355 350
356cstar_tracesys: 351cstar_tracesys:
357#ifdef CONFIG_AUDITSYSCALL 352#ifdef CONFIG_AUDITSYSCALL
358 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) 353 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
359 jz cstar_auditsys 354 jz cstar_auditsys
360#endif 355#endif
361 xchgl %r9d,%ebp 356 xchgl %r9d,%ebp
@@ -420,9 +415,8 @@ ENTRY(ia32_syscall)
420 /* note the registers are not zero extended to the sf. 415 /* note the registers are not zero extended to the sf.
421 this could be a problem. */ 416 this could be a problem. */
422 SAVE_ARGS 0,1,0 417 SAVE_ARGS 0,1,0
423 GET_THREAD_INFO(%r10) 418 orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
424 orl $TS_COMPAT,TI_status(%r10) 419 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
425 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
426 jnz ia32_tracesys 420 jnz ia32_tracesys
427 cmpq $(IA32_NR_syscalls-1),%rax 421 cmpq $(IA32_NR_syscalls-1),%rax
428 ja ia32_badsys 422 ja ia32_badsys
@@ -459,8 +453,8 @@ quiet_ni_syscall:
459 CFI_ENDPROC 453 CFI_ENDPROC
460 454
461 .macro PTREGSCALL label, func, arg 455 .macro PTREGSCALL label, func, arg
462 .globl \label 456 ALIGN
463\label: 457GLOBAL(\label)
464 leaq \func(%rip),%rax 458 leaq \func(%rip),%rax
465 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ 459 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
466 jmp ia32_ptregs_common 460 jmp ia32_ptregs_common
@@ -477,7 +471,8 @@ quiet_ni_syscall:
477 PTREGSCALL stub32_vfork, sys_vfork, %rdi 471 PTREGSCALL stub32_vfork, sys_vfork, %rdi
478 PTREGSCALL stub32_iopl, sys_iopl, %rsi 472 PTREGSCALL stub32_iopl, sys_iopl, %rsi
479 473
480ENTRY(ia32_ptregs_common) 474 ALIGN
475ia32_ptregs_common:
481 popq %r11 476 popq %r11
482 CFI_ENDPROC 477 CFI_ENDPROC
483 CFI_STARTPROC32 simple 478 CFI_STARTPROC32 simple
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 091508b533b4..952bd0100c5c 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -4,10 +4,10 @@
4 4
5#ifdef CONFIG_SMP 5#ifdef CONFIG_SMP
6 .macro LOCK_PREFIX 6 .macro LOCK_PREFIX
71: lock 7672: lock
8 .section .smp_locks,"a" 8 .section .smp_locks,"a"
9 .balign 4 9 .balign 4
10 .long 1b - . 10 .long 672b - .
11 .previous 11 .previous
12 .endm 12 .endm
13#else 13#else
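
The LOCK_PREFIX macro previously planted the local label 1: in front of the lock byte so .smp_locks could record its address via "1b". Any caller that also uses label 1 in the same asm statement, with a backward reference after the macro expansion, would silently bind to the macro's label instead of its own; 672 is presumably just an improbable number. Hazard sketch (hedged, kernel context): a classic retry loop whose "jnz 1b" must reach the reload, not the lock instruction the old macro label sat in front of.

	int v = 0;

	asm volatile("1:\tmovl %0, %%eax\n\t"
		     "leal 1(%%eax), %%ecx\n\t"
		     LOCK_PREFIX "cmpxchgl %%ecx, %0\n\t"
		     "jnz 1b"		/* must bind to *our* 1: */
		     : "+m" (v) : : "eax", "ecx", "cc");
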
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 1a6c09af048f..3ab9bdd87e79 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -176,6 +176,7 @@ static inline u64 native_x2apic_icr_read(void)
176} 176}
177 177
178extern int x2apic_phys; 178extern int x2apic_phys;
179extern int x2apic_preenabled;
179extern void check_x2apic(void); 180extern void check_x2apic(void);
180extern void enable_x2apic(void); 181extern void enable_x2apic(void);
181extern void x2apic_icr_write(u32 low, u32 id); 182extern void x2apic_icr_write(u32 low, u32 id);
@@ -198,6 +199,9 @@ static inline void x2apic_force_phys(void)
198 x2apic_phys = 1; 199 x2apic_phys = 1;
199} 200}
200#else 201#else
202static inline void disable_x2apic(void)
203{
204}
201static inline void check_x2apic(void) 205static inline void check_x2apic(void)
202{ 206{
203} 207}
@@ -212,6 +216,7 @@ static inline void x2apic_force_phys(void)
212{ 216{
213} 217}
214 218
219#define nox2apic 0
215#define x2apic_preenabled 0 220#define x2apic_preenabled 0
216#define x2apic_supported() 0 221#define x2apic_supported() 0
217#endif 222#endif
@@ -410,6 +415,7 @@ extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
410#endif 415#endif
411 416
412#ifdef CONFIG_X86_LOCAL_APIC 417#ifdef CONFIG_X86_LOCAL_APIC
418
413static inline u32 apic_read(u32 reg) 419static inline u32 apic_read(u32 reg)
414{ 420{
415 return apic->read(reg); 421 return apic->read(reg);
diff --git a/arch/x86/include/asm/apic_flat_64.h b/arch/x86/include/asm/apic_flat_64.h
new file mode 100644
index 000000000000..a2d312796440
--- /dev/null
+++ b/arch/x86/include/asm/apic_flat_64.h
@@ -0,0 +1,7 @@
1#ifndef _ASM_X86_APIC_FLAT_64_H
2#define _ASM_X86_APIC_FLAT_64_H
3
4extern void flat_init_apic_ldr(void);
5
6#endif
7
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 3925d8007864..134bba00df09 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -144,6 +144,7 @@
144 144
145#define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) 145#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
146#define APIC_BASE_MSR 0x800 146#define APIC_BASE_MSR 0x800
147#define XAPIC_ENABLE (1UL << 11)
147#define X2APIC_ENABLE (1UL << 10) 148#define X2APIC_ENABLE (1UL << 10)
148 149
149#ifdef CONFIG_X86_32 150#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 1775d6e5920e..b97596e2b68c 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -380,6 +380,8 @@ static inline unsigned long __fls(unsigned long word)
380 return word; 380 return word;
381} 381}
382 382
383#undef ADDR
384
383#ifdef __KERNEL__ 385#ifdef __KERNEL__
384/** 386/**
385 * ffs - find first set bit in word 387 * ffs - find first set bit in word
@@ -395,10 +397,25 @@ static inline unsigned long __fls(unsigned long word)
395static inline int ffs(int x) 397static inline int ffs(int x)
396{ 398{
397 int r; 399 int r;
398#ifdef CONFIG_X86_CMOV 400
401#ifdef CONFIG_X86_64
402 /*
403 * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
404 * dest reg is undefined if x==0, but their CPU architect says it is
405 * written back with the same value as before, except that the
406 * top 32 bits will be cleared.
407 *
408 * We cannot do this on 32 bits because at the very least some
409 * 486 CPUs did not behave this way.
410 */
411 long tmp = -1;
412 asm("bsfl %1,%0"
413 : "=r" (r)
414 : "rm" (x), "0" (tmp));
415#elif defined(CONFIG_X86_CMOV)
399 asm("bsfl %1,%0\n\t" 416 asm("bsfl %1,%0\n\t"
400 "cmovzl %2,%0" 417 "cmovzl %2,%0"
401 : "=r" (r) : "rm" (x), "r" (-1)); 418 : "=&r" (r) : "rm" (x), "r" (-1));
402#else 419#else
403 asm("bsfl %1,%0\n\t" 420 asm("bsfl %1,%0\n\t"
404 "jnz 1f\n\t" 421 "jnz 1f\n\t"
@@ -422,7 +439,22 @@ static inline int ffs(int x)
422static inline int fls(int x) 439static inline int fls(int x)
423{ 440{
424 int r; 441 int r;
425#ifdef CONFIG_X86_CMOV 442
443#ifdef CONFIG_X86_64
444 /*
445 * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
446 * dest reg is undefined if x==0, but their CPU architect says it is
447 * written back with the same value as before, except that the
448 * top 32 bits will be cleared.
449 *
450 * We cannot do this on 32 bits because at the very least some
451 * 486 CPUs did not behave this way.
452 */
453 long tmp = -1;
454 asm("bsrl %1,%0"
455 : "=r" (r)
456 : "rm" (x), "0" (tmp));
457#elif defined(CONFIG_X86_CMOV)
426 asm("bsrl %1,%0\n\t" 458 asm("bsrl %1,%0\n\t"
427 "cmovzl %2,%0" 459 "cmovzl %2,%0"
428 : "=&r" (r) : "rm" (x), "rm" (-1)); 460 : "=&r" (r) : "rm" (x), "rm" (-1));
@@ -434,11 +466,35 @@ static inline int fls(int x)
434#endif 466#endif
435 return r + 1; 467 return r + 1;
436} 468}
437#endif /* __KERNEL__ */
438
439#undef ADDR
440 469
441#ifdef __KERNEL__ 470/**
471 * fls64 - find last set bit in a 64-bit word
472 * @x: the word to search
473 *
474 * This is defined in a similar way to the libc and compiler builtin
475 * ffsll, but returns the position of the most significant set bit.
476 *
477 * fls64(value) returns 0 if value is 0 or the position of the last
478 * set bit if value is nonzero. The last (most significant) bit is
479 * at position 64.
480 */
481#ifdef CONFIG_X86_64
482static __always_inline int fls64(__u64 x)
483{
484 long bitpos = -1;
485 /*
486 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
487 * dest reg is undefined if x==0, but their CPU architect says it is
488 * written back with the same value as before.
489 */
490 asm("bsrq %1,%0"
491 : "+r" (bitpos)
492 : "rm" (x));
493 return bitpos + 1;
494}
495#else
496#include <asm-generic/bitops/fls64.h>
497#endif
442 498
443#include <asm-generic/bitops/find.h> 499#include <asm-generic/bitops/find.h>
444 500
@@ -450,12 +506,6 @@ static inline int fls(int x)
450 506
451#include <asm-generic/bitops/const_hweight.h> 507#include <asm-generic/bitops/const_hweight.h>
452 508
453#endif /* __KERNEL__ */
454
455#include <asm-generic/bitops/fls64.h>
456
457#ifdef __KERNEL__
458
459#include <asm-generic/bitops/le.h> 509#include <asm-generic/bitops/le.h>
460 510
461#include <asm-generic/bitops/ext2-atomic-setbit.h> 511#include <asm-generic/bitops/ext2-atomic-setbit.h>
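
The seeded-destination trick above removes the cmov or branch the 32-bit paths need: loading -1 first means BSF/BSR leave it untouched for x==0 (per the quoted architectural promise), so adding 1 yields 0. The new fls64() semantics can be checked in userspace on an x86-64 CPU (illustration only, not the kernel build):

#include <stdio.h>

static inline int fls64(unsigned long long x)
{
	long bitpos = -1;

	asm("bsrq %1,%0" : "+r" (bitpos) : "rm" (x));
	return (int)bitpos + 1;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       fls64(0),		/* 0  */
	       fls64(1),		/* 1  */
	       fls64(0x8000ULL),	/* 16 */
	       fls64(1ULL << 63));	/* 64 */
	return 0;
}
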
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 5d3acdf5a7a6..0c9fa2745f13 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -14,6 +14,8 @@ extern void __cmpxchg_wrong_size(void)
14 __compiletime_error("Bad argument size for cmpxchg"); 14 __compiletime_error("Bad argument size for cmpxchg");
15extern void __xadd_wrong_size(void) 15extern void __xadd_wrong_size(void)
16 __compiletime_error("Bad argument size for xadd"); 16 __compiletime_error("Bad argument size for xadd");
17extern void __add_wrong_size(void)
18 __compiletime_error("Bad argument size for add");
17 19
18/* 20/*
19 * Constants for operation sizes. On 32-bit, the 64-bit size is set to 21 * Constants for operation sizes. On 32-bit, the 64-bit size is set to
@@ -31,60 +33,47 @@ extern void __xadd_wrong_size(void)
31#define __X86_CASE_Q -1 /* sizeof will never return -1 */ 33#define __X86_CASE_Q -1 /* sizeof will never return -1 */
32#endif 34#endif
33 35
36/*
37 * An exchange-type operation, which takes a value and a pointer, and
38 * returns the old value.
39 */
40#define __xchg_op(ptr, arg, op, lock) \
41 ({ \
42 __typeof__ (*(ptr)) __ret = (arg); \
43 switch (sizeof(*(ptr))) { \
44 case __X86_CASE_B: \
45 asm volatile (lock #op "b %b0, %1\n" \
46 : "+r" (__ret), "+m" (*(ptr)) \
47 : : "memory", "cc"); \
48 break; \
49 case __X86_CASE_W: \
50 asm volatile (lock #op "w %w0, %1\n" \
51 : "+r" (__ret), "+m" (*(ptr)) \
52 : : "memory", "cc"); \
53 break; \
54 case __X86_CASE_L: \
55 asm volatile (lock #op "l %0, %1\n" \
56 : "+r" (__ret), "+m" (*(ptr)) \
57 : : "memory", "cc"); \
58 break; \
59 case __X86_CASE_Q: \
60 asm volatile (lock #op "q %q0, %1\n" \
61 : "+r" (__ret), "+m" (*(ptr)) \
62 : : "memory", "cc"); \
63 break; \
64 default: \
65 __ ## op ## _wrong_size(); \
66 } \
67 __ret; \
68 })
69
34/* 70/*
35 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. 71 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
36 * Since this is generally used to protect other memory information, we 72 * Since this is generally used to protect other memory information, we
37 * use "asm volatile" and "memory" clobbers to prevent gcc from moving 73 * use "asm volatile" and "memory" clobbers to prevent gcc from moving
38 * information around. 74 * information around.
39 */ 75 */
40#define __xchg(x, ptr, size) \ 76#define xchg(ptr, v) __xchg_op((ptr), (v), xchg, "")
41({ \
42 __typeof(*(ptr)) __x = (x); \
43 switch (size) { \
44 case __X86_CASE_B: \
45 { \
46 volatile u8 *__ptr = (volatile u8 *)(ptr); \
47 asm volatile("xchgb %0,%1" \
48 : "=q" (__x), "+m" (*__ptr) \
49 : "0" (__x) \
50 : "memory"); \
51 break; \
52 } \
53 case __X86_CASE_W: \
54 { \
55 volatile u16 *__ptr = (volatile u16 *)(ptr); \
56 asm volatile("xchgw %0,%1" \
57 : "=r" (__x), "+m" (*__ptr) \
58 : "0" (__x) \
59 : "memory"); \
60 break; \
61 } \
62 case __X86_CASE_L: \
63 { \
64 volatile u32 *__ptr = (volatile u32 *)(ptr); \
65 asm volatile("xchgl %0,%1" \
66 : "=r" (__x), "+m" (*__ptr) \
67 : "0" (__x) \
68 : "memory"); \
69 break; \
70 } \
71 case __X86_CASE_Q: \
72 { \
73 volatile u64 *__ptr = (volatile u64 *)(ptr); \
74 asm volatile("xchgq %0,%1" \
75 : "=r" (__x), "+m" (*__ptr) \
76 : "0" (__x) \
77 : "memory"); \
78 break; \
79 } \
80 default: \
81 __xchg_wrong_size(); \
82 } \
83 __x; \
84})
85
86#define xchg(ptr, v) \
87 __xchg((v), (ptr), sizeof(*ptr))
88 77
89/* 78/*
90 * Atomic compare and exchange. Compare OLD with MEM, if identical, 79 * Atomic compare and exchange. Compare OLD with MEM, if identical,
@@ -165,46 +154,80 @@ extern void __xadd_wrong_size(void)
165 __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) 154 __cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
166#endif 155#endif
167 156
168#define __xadd(ptr, inc, lock) \ 157/*
158 * xadd() adds "inc" to "*ptr" and atomically returns the previous
159 * value of "*ptr".
160 *
161 * xadd() is locked when multiple CPUs are online
162 * xadd_sync() is always locked
163 * xadd_local() is never locked
164 */
165#define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock)
166#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX)
167#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ")
168#define xadd_local(ptr, inc) __xadd((ptr), (inc), "")
169
170#define __add(ptr, inc, lock) \
169 ({ \ 171 ({ \
170 __typeof__ (*(ptr)) __ret = (inc); \ 172 __typeof__ (*(ptr)) __ret = (inc); \
171 switch (sizeof(*(ptr))) { \ 173 switch (sizeof(*(ptr))) { \
172 case __X86_CASE_B: \ 174 case __X86_CASE_B: \
173 asm volatile (lock "xaddb %b0, %1\n" \ 175 asm volatile (lock "addb %b1, %0\n" \
174 : "+r" (__ret), "+m" (*(ptr)) \ 176 : "+m" (*(ptr)) : "ri" (inc) \
175 : : "memory", "cc"); \ 177 : "memory", "cc"); \
176 break; \ 178 break; \
177 case __X86_CASE_W: \ 179 case __X86_CASE_W: \
178 asm volatile (lock "xaddw %w0, %1\n" \ 180 asm volatile (lock "addw %w1, %0\n" \
179 : "+r" (__ret), "+m" (*(ptr)) \ 181 : "+m" (*(ptr)) : "ri" (inc) \
180 : : "memory", "cc"); \ 182 : "memory", "cc"); \
181 break; \ 183 break; \
182 case __X86_CASE_L: \ 184 case __X86_CASE_L: \
183 asm volatile (lock "xaddl %0, %1\n" \ 185 asm volatile (lock "addl %1, %0\n" \
184 : "+r" (__ret), "+m" (*(ptr)) \ 186 : "+m" (*(ptr)) : "ri" (inc) \
185 : : "memory", "cc"); \ 187 : "memory", "cc"); \
186 break; \ 188 break; \
187 case __X86_CASE_Q: \ 189 case __X86_CASE_Q: \
188 asm volatile (lock "xaddq %q0, %1\n" \ 190 asm volatile (lock "addq %1, %0\n" \
189 : "+r" (__ret), "+m" (*(ptr)) \ 191 : "+m" (*(ptr)) : "ri" (inc) \
190 : : "memory", "cc"); \ 192 : "memory", "cc"); \
191 break; \ 193 break; \
192 default: \ 194 default: \
193 __xadd_wrong_size(); \ 195 __add_wrong_size(); \
194 } \ 196 } \
195 __ret; \ 197 __ret; \
196 }) 198 })
197 199
198/* 200/*
199 * xadd() adds "inc" to "*ptr" and atomically returns the previous 201 * add_*() adds "inc" to "*ptr"
200 * value of "*ptr".
201 * 202 *
202 * xadd() is locked when multiple CPUs are online 203 * __add() takes a lock prefix
203 * xadd_sync() is always locked 204 * add_smp() is locked when multiple CPUs are online
204 * xadd_local() is never locked 205 * add_sync() is always locked
205 */ 206 */
206#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX) 207#define add_smp(ptr, inc) __add((ptr), (inc), LOCK_PREFIX)
207#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ") 208#define add_sync(ptr, inc) __add((ptr), (inc), "lock; ")
208#define xadd_local(ptr, inc) __xadd((ptr), (inc), "") 209
210#define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2) \
211({ \
212 bool __ret; \
213 __typeof__(*(p1)) __old1 = (o1), __new1 = (n1); \
214 __typeof__(*(p2)) __old2 = (o2), __new2 = (n2); \
215 BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long)); \
216 BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \
217 VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long))); \
218 VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2)); \
219 asm volatile(pfx "cmpxchg%c4b %2; sete %0" \
220 : "=a" (__ret), "+d" (__old2), \
221 "+m" (*(p1)), "+m" (*(p2)) \
222 : "i" (2 * sizeof(long)), "a" (__old1), \
223 "b" (__new1), "c" (__new2)); \
224 __ret; \
225})
226
227#define cmpxchg_double(p1, p2, o1, o2, n1, n2) \
228 __cmpxchg_double(LOCK_PREFIX, p1, p2, o1, o2, n1, n2)
229
230#define cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \
231 __cmpxchg_double(, p1, p2, o1, o2, n1, n2)
209 232
210#endif /* ASM_X86_CMPXCHG_H */ 233#endif /* ASM_X86_CMPXCHG_H */
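
Two consolidations happen in this file. First, the per-size xchg switch and the xadd switch were structural twins; __xchg_op() keeps one copy parameterized by instruction and lock prefix, so xchg becomes __xchg_op(ptr, v, xchg, "") and xadd becomes __xchg_op(ptr, inc, xadd, LOCK_PREFIX). Second, cmpxchg_double() becomes one generic macro over two adjacent longs instead of separate cmpxchg8b/cmpxchg16b wrappers. A hedged usage sketch of the latter (kernel context; the CPU must pass system_has_cmpxchg_double(), and the pair must be naturally aligned as the VM_BUG_ON checks enforce):

	struct {
		unsigned long lo;
		unsigned long hi;
	} __attribute__((aligned(2 * sizeof(unsigned long)))) pair = { 1, 2 };

	/* Swap in {3, 4} only if both words still read {1, 2}. */
	bool ok = cmpxchg_double(&pair.lo, &pair.hi, 1UL, 2UL, 3UL, 4UL);
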
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index fbebb07dd80b..53f4b219336b 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -166,52 +166,6 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
166 166
167#endif 167#endif
168 168
169#define cmpxchg8b(ptr, o1, o2, n1, n2) \
170({ \
171 char __ret; \
172 __typeof__(o2) __dummy; \
173 __typeof__(*(ptr)) __old1 = (o1); \
174 __typeof__(o2) __old2 = (o2); \
175 __typeof__(*(ptr)) __new1 = (n1); \
176 __typeof__(o2) __new2 = (n2); \
177 asm volatile(LOCK_PREFIX "cmpxchg8b %2; setz %1" \
178 : "=d"(__dummy), "=a" (__ret), "+m" (*ptr)\
179 : "a" (__old1), "d"(__old2), \
180 "b" (__new1), "c" (__new2) \
181 : "memory"); \
182 __ret; })
183
184
185#define cmpxchg8b_local(ptr, o1, o2, n1, n2) \
186({ \
187 char __ret; \
188 __typeof__(o2) __dummy; \
189 __typeof__(*(ptr)) __old1 = (o1); \
190 __typeof__(o2) __old2 = (o2); \
191 __typeof__(*(ptr)) __new1 = (n1); \
192 __typeof__(o2) __new2 = (n2); \
193 asm volatile("cmpxchg8b %2; setz %1" \
194 : "=d"(__dummy), "=a"(__ret), "+m" (*ptr)\
195 : "a" (__old), "d"(__old2), \
196 "b" (__new1), "c" (__new2), \
197 : "memory"); \
198 __ret; })
199
200
201#define cmpxchg_double(ptr, o1, o2, n1, n2) \
202({ \
203 BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
204 VM_BUG_ON((unsigned long)(ptr) % 8); \
205 cmpxchg8b((ptr), (o1), (o2), (n1), (n2)); \
206})
207
208#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \
209({ \
210 BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
211 VM_BUG_ON((unsigned long)(ptr) % 8); \
212 cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \
213})
214
215#define system_has_cmpxchg_double() cpu_has_cx8 169#define system_has_cmpxchg_double() cpu_has_cx8
216 170
217#endif /* _ASM_X86_CMPXCHG_32_H */ 171#endif /* _ASM_X86_CMPXCHG_32_H */
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 285da02c38fa..614be87f1a9b 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -20,49 +20,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val)
20 cmpxchg_local((ptr), (o), (n)); \ 20 cmpxchg_local((ptr), (o), (n)); \
21}) 21})
22 22
23#define cmpxchg16b(ptr, o1, o2, n1, n2) \
24({ \
25 char __ret; \
26 __typeof__(o2) __junk; \
27 __typeof__(*(ptr)) __old1 = (o1); \
28 __typeof__(o2) __old2 = (o2); \
29 __typeof__(*(ptr)) __new1 = (n1); \
30 __typeof__(o2) __new2 = (n2); \
31 asm volatile(LOCK_PREFIX "cmpxchg16b %2;setz %1" \
32 : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \
33 : "b"(__new1), "c"(__new2), \
34 "a"(__old1), "d"(__old2)); \
35 __ret; })
36
37
38#define cmpxchg16b_local(ptr, o1, o2, n1, n2) \
39({ \
40 char __ret; \
41 __typeof__(o2) __junk; \
42 __typeof__(*(ptr)) __old1 = (o1); \
43 __typeof__(o2) __old2 = (o2); \
44 __typeof__(*(ptr)) __new1 = (n1); \
45 __typeof__(o2) __new2 = (n2); \
46 asm volatile("cmpxchg16b %2;setz %1" \
47 : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \
48 : "b"(__new1), "c"(__new2), \
49 "a"(__old1), "d"(__old2)); \
50 __ret; })
51
52#define cmpxchg_double(ptr, o1, o2, n1, n2) \
53({ \
54 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
55 VM_BUG_ON((unsigned long)(ptr) % 16); \
56 cmpxchg16b((ptr), (o1), (o2), (n1), (n2)); \
57})
58
59#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \
60({ \
61 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
62 VM_BUG_ON((unsigned long)(ptr) % 16); \
63 cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \
64})
65
66#define system_has_cmpxchg_double() cpu_has_cx16 23#define system_has_cmpxchg_double() cpu_has_cx16
67 24
68#endif /* _ASM_X86_CMPXCHG_64_H */ 25#endif /* _ASM_X86_CMPXCHG_64_H */
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9a2d644c08ef..ced283ac79df 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -4,6 +4,7 @@
4#ifdef CONFIG_X86_32 4#ifdef CONFIG_X86_32
5 5
6#include <linux/types.h> 6#include <linux/types.h>
7#include <linux/log2.h>
7 8
8/* 9/*
9 * do_div() is NOT a C function. It wants to return 10 * do_div() is NOT a C function. It wants to return
@@ -21,15 +22,20 @@
21({ \ 22({ \
22 unsigned long __upper, __low, __high, __mod, __base; \ 23 unsigned long __upper, __low, __high, __mod, __base; \
23 __base = (base); \ 24 __base = (base); \
24 asm("":"=a" (__low), "=d" (__high) : "A" (n)); \ 25 if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \
25 __upper = __high; \ 26 __mod = n & (__base - 1); \
26 if (__high) { \ 27 n >>= ilog2(__base); \
27 __upper = __high % (__base); \ 28 } else { \
28 __high = __high / (__base); \ 29 asm("" : "=a" (__low), "=d" (__high) : "A" (n));\
30 __upper = __high; \
31 if (__high) { \
32 __upper = __high % (__base); \
33 __high = __high / (__base); \
34 } \
35 asm("divl %2" : "=a" (__low), "=d" (__mod) \
36 : "rm" (__base), "0" (__low), "1" (__upper)); \
37 asm("" : "=A" (n) : "a" (__low), "d" (__high)); \
29 } \ 38 } \
30 asm("divl %2":"=a" (__low), "=d" (__mod) \
31 : "rm" (__base), "0" (__low), "1" (__upper)); \
32 asm("":"=A" (n) : "a" (__low), "d" (__high)); \
33 __mod; \ 39 __mod; \
34}) 40})
35 41
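
do_div(n, base) divides the 64-bit n in place and evaluates to the remainder; the added branch lets a compile-time power-of-2 base skip the divl sequence entirely. Reduced by hand for base == 8, the macro now amounts to:

	unsigned long long n = 1003;
	unsigned long rem;

	rem = n & (8 - 1);	/* 1003 % 8 == 3, via mask  */
	n >>= 3;		/* ilog2(8) == 3; n == 125  */
	/* no divide instruction emitted on this path */
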
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 908b96957d88..37782566af24 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -117,7 +117,7 @@ static inline void early_memtest(unsigned long start, unsigned long end)
117 117
118extern unsigned long e820_end_of_ram_pfn(void); 118extern unsigned long e820_end_of_ram_pfn(void);
119extern unsigned long e820_end_of_low_ram_pfn(void); 119extern unsigned long e820_end_of_low_ram_pfn(void);
120extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); 120extern u64 early_reserve_e820(u64 sizet, u64 align);
121 121
122void memblock_x86_fill(void); 122void memblock_x86_fill(void);
123void memblock_find_dma_reserve(void); 123void memblock_find_dma_reserve(void);
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 55e4de613f0e..da0b3ca815b7 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -11,6 +11,7 @@ typedef struct {
11#ifdef CONFIG_X86_LOCAL_APIC 11#ifdef CONFIG_X86_LOCAL_APIC
12 unsigned int apic_timer_irqs; /* arch dependent */ 12 unsigned int apic_timer_irqs; /* arch dependent */
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14 unsigned int icr_read_retry_count;
14#endif 15#endif
15 unsigned int x86_platform_ipis; /* arch dependent */ 16 unsigned int x86_platform_ipis; /* arch dependent */
16 unsigned int apic_perf_irqs; 17 unsigned int apic_perf_irqs;
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index c9e09ea05644..6919e936345b 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -218,7 +218,7 @@ static inline void fpu_fxsave(struct fpu *fpu)
218#ifdef CONFIG_SMP 218#ifdef CONFIG_SMP
219#define safe_address (__per_cpu_offset[0]) 219#define safe_address (__per_cpu_offset[0])
220#else 220#else
221#define safe_address (kstat_cpu(0).cpustat.user) 221#define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER])
222#endif 222#endif
223 223
224/* 224/*
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 88c765e16410..74df3f1eddfd 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -137,6 +137,13 @@ static inline int insn_is_avx(struct insn *insn)
137 return (insn->vex_prefix.value != 0); 137 return (insn->vex_prefix.value != 0);
138} 138}
139 139
140/* Ensure this instruction is decoded completely */
141static inline int insn_complete(struct insn *insn)
142{
143 return insn->opcode.got && insn->modrm.got && insn->sib.got &&
144 insn->displacement.got && insn->immediate.got;
145}
146
140static inline insn_byte_t insn_vex_m_bits(struct insn *insn) 147static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
141{ 148{
142 if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ 149 if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
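
insn_complete() simply verifies that every lazy decode stage (opcode, ModRM, SIB, displacement, immediate) actually ran, which makes a convenient validity test once the full length has been requested. A hedged usage sketch against the decoder API of this era:

	struct insn insn;

	insn_init(&insn, kaddr, x86_64);	/* aim at the code bytes */
	insn_get_length(&insn);			/* drives all decode stages */
	if (!insn_complete(&insn))
		return -EINVAL;			/* truncated or bogus bytes */
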
diff --git a/arch/x86/include/asm/mach_timer.h b/arch/x86/include/asm/mach_timer.h
index 853728519ae9..88d0c3c74c13 100644
--- a/arch/x86/include/asm/mach_timer.h
+++ b/arch/x86/include/asm/mach_timer.h
@@ -15,7 +15,7 @@
15 15
16#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */ 16#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */
17#define CALIBRATE_LATCH \ 17#define CALIBRATE_LATCH \
18 ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000) 18 ((PIT_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
19 19
20static inline void mach_prepare_counter(void) 20static inline void mach_prepare_counter(void)
21{ 21{
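
On PC hardware both CLOCK_TICK_RATE and PIT_TICK_RATE are 1193182 Hz, so this is a rename with identical arithmetic, presumably ahead of CLOCK_TICK_RATE's removal. The latch for the 30 ms calibration window works out as follows (hedged check):

	/* assuming PIT_TICK_RATE == 1193182 */
	unsigned long latch = (1193182UL * 30 + 1000 / 2) / 1000;
	/* (35795460 + 500) / 1000 == 35795 PIT ticks */
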
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
index 01fdf5674e24..0e8e85bb7c51 100644
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -81,8 +81,8 @@ static inline unsigned char current_lock_cmos_reg(void)
81#else 81#else
82#define lock_cmos_prefix(reg) do {} while (0) 82#define lock_cmos_prefix(reg) do {} while (0)
83#define lock_cmos_suffix(reg) do {} while (0) 83#define lock_cmos_suffix(reg) do {} while (0)
84#define lock_cmos(reg) 84#define lock_cmos(reg) do { } while (0)
85#define unlock_cmos() 85#define unlock_cmos() do { } while (0)
86#define do_i_have_lock_cmos() 0 86#define do_i_have_lock_cmos() 0
87#define current_lock_cmos_reg() 0 87#define current_lock_cmos_reg() 0
88#endif 88#endif
diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h
deleted file mode 100644
index 0cd3800f33b9..000000000000
--- a/arch/x86/include/asm/memblock.h
+++ /dev/null
@@ -1,23 +0,0 @@
1#ifndef _X86_MEMBLOCK_H
2#define _X86_MEMBLOCK_H
3
4#define ARCH_DISCARD_MEMBLOCK
5
6u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align);
7
8void memblock_x86_reserve_range(u64 start, u64 end, char *name);
9void memblock_x86_free_range(u64 start, u64 end);
10struct range;
11int __get_free_all_memory_range(struct range **range, int nodeid,
12 unsigned long start_pfn, unsigned long end_pfn);
13int get_free_all_memory_range(struct range **rangep, int nodeid);
14
15void memblock_x86_register_active_regions(int nid, unsigned long start_pfn,
16 unsigned long last_pfn);
17u64 memblock_x86_hole_size(u64 start, u64 end);
18u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align);
19u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit);
20u64 memblock_x86_memory_in_range(u64 addr, u64 limit);
21bool memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align);
22
23#endif
diff --git a/arch/x86/include/asm/numachip/numachip_csr.h b/arch/x86/include/asm/numachip/numachip_csr.h
new file mode 100644
index 000000000000..660f843df928
--- /dev/null
+++ b/arch/x86/include/asm/numachip/numachip_csr.h
@@ -0,0 +1,167 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Numascale NumaConnect-Specific Header file
7 *
8 * Copyright (C) 2011 Numascale AS. All rights reserved.
9 *
10 * Send feedback to <support@numascale.com>
11 *
12 */
13
14#ifndef _ASM_X86_NUMACHIP_NUMACHIP_CSR_H
15#define _ASM_X86_NUMACHIP_NUMACHIP_CSR_H
16
17#include <linux/numa.h>
18#include <linux/percpu.h>
19#include <linux/io.h>
20#include <linux/swab.h>
21#include <asm/types.h>
22#include <asm/processor.h>
23
24#define CSR_NODE_SHIFT 16
25#define CSR_NODE_BITS(p) (((unsigned long)(p)) << CSR_NODE_SHIFT)
26#define CSR_NODE_MASK 0x0fff /* 4K nodes */
27
28/* 32K CSR space, b15 indicates geo/non-geo */
29#define CSR_OFFSET_MASK 0x7fffUL
30
31/* Global CSR space covers all 4K possible nodes with 64K CSR space per node */
32#define NUMACHIP_GCSR_BASE 0x3fff00000000ULL
33#define NUMACHIP_GCSR_LIM 0x3fff0fffffffULL
34#define NUMACHIP_GCSR_SIZE (NUMACHIP_GCSR_LIM - NUMACHIP_GCSR_BASE + 1)
35
36/*
37 * Local CSR space starts in global CSR space with "nodeid" = 0xfff0, however
38 * when using the direct mapping on x86_64, both start and size need to be
39 * aligned to PMD_SIZE, which is 2M
40 */
41#define NUMACHIP_LCSR_BASE 0x3ffffe000000ULL
42#define NUMACHIP_LCSR_LIM 0x3fffffffffffULL
43#define NUMACHIP_LCSR_SIZE (NUMACHIP_LCSR_LIM - NUMACHIP_LCSR_BASE + 1)
44
45static inline void *gcsr_address(int node, unsigned long offset)
46{
47 return __va(NUMACHIP_GCSR_BASE | (1UL << 15) |
48 CSR_NODE_BITS(node & CSR_NODE_MASK) | (offset & CSR_OFFSET_MASK));
49}
50
51static inline void *lcsr_address(unsigned long offset)
52{
53 return __va(NUMACHIP_LCSR_BASE | (1UL << 15) |
54 CSR_NODE_BITS(0xfff0) | (offset & CSR_OFFSET_MASK));
55}
56
57static inline unsigned int read_gcsr(int node, unsigned long offset)
58{
59 return swab32(readl(gcsr_address(node, offset)));
60}
61
62static inline void write_gcsr(int node, unsigned long offset, unsigned int val)
63{
64 writel(swab32(val), gcsr_address(node, offset));
65}
66
67static inline unsigned int read_lcsr(unsigned long offset)
68{
69 return swab32(readl(lcsr_address(offset)));
70}
71
72static inline void write_lcsr(unsigned long offset, unsigned int val)
73{
74 writel(swab32(val), lcsr_address(offset));
75}
76
77/* ========================================================================= */
78/* CSR_G0_STATE_CLEAR */
79/* ========================================================================= */
80
81#define CSR_G0_STATE_CLEAR (0x000 + (0 << 12))
82union numachip_csr_g0_state_clear {
83 unsigned int v;
84 struct numachip_csr_g0_state_clear_s {
85 unsigned int _state:2;
86 unsigned int _rsvd_2_6:5;
87 unsigned int _lost:1;
88 unsigned int _rsvd_8_31:24;
89 } s;
90};
91
92/* ========================================================================= */
93/* CSR_G0_NODE_IDS */
94/* ========================================================================= */
95
96#define CSR_G0_NODE_IDS (0x008 + (0 << 12))
97union numachip_csr_g0_node_ids {
98 unsigned int v;
99 struct numachip_csr_g0_node_ids_s {
100 unsigned int _initialid:16;
101 unsigned int _nodeid:12;
102 unsigned int _rsvd_28_31:4;
103 } s;
104};
105
106/* ========================================================================= */
107/* CSR_G3_EXT_IRQ_GEN */
108/* ========================================================================= */
109
110#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12))
111union numachip_csr_g3_ext_irq_gen {
112 unsigned int v;
113 struct numachip_csr_g3_ext_irq_gen_s {
114 unsigned int _vector:8;
115 unsigned int _msgtype:3;
116 unsigned int _index:5;
117 unsigned int _destination_apic_id:16;
118 } s;
119};
120
121/* ========================================================================= */
122/* CSR_G3_EXT_IRQ_STATUS */
123/* ========================================================================= */
124
125#define CSR_G3_EXT_IRQ_STATUS (0x034 + (3 << 12))
126union numachip_csr_g3_ext_irq_status {
127 unsigned int v;
128 struct numachip_csr_g3_ext_irq_status_s {
129 unsigned int _result:32;
130 } s;
131};
132
133/* ========================================================================= */
134/* CSR_G3_EXT_IRQ_DEST */
135/* ========================================================================= */
136
137#define CSR_G3_EXT_IRQ_DEST (0x038 + (3 << 12))
138union numachip_csr_g3_ext_irq_dest {
139 unsigned int v;
140 struct numachip_csr_g3_ext_irq_dest_s {
141 unsigned int _irq:8;
142 unsigned int _rsvd_8_31:24;
143 } s;
144};
145
146/* ========================================================================= */
147/* CSR_G3_NC_ATT_MAP_SELECT */
148/* ========================================================================= */
149
150#define CSR_G3_NC_ATT_MAP_SELECT (0x7fc + (3 << 12))
151union numachip_csr_g3_nc_att_map_select {
152 unsigned int v;
153 struct numachip_csr_g3_nc_att_map_select_s {
154 unsigned int _upper_address_bits:4;
155 unsigned int _select_ram:4;
156 unsigned int _rsvd_8_31:24;
157 } s;
158};
159
160/* ========================================================================= */
161/* CSR_G3_NC_ATT_MAP_SELECT_0-255 */
162/* ========================================================================= */
163
164#define CSR_G3_NC_ATT_MAP_SELECT_0 (0x800 + (3 << 12))
165
166#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */
167
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 3470c9d0ebba..529bf07e8067 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -451,23 +451,20 @@ do { \
451#endif /* !CONFIG_M386 */ 451#endif /* !CONFIG_M386 */
452 452
453#ifdef CONFIG_X86_CMPXCHG64 453#ifdef CONFIG_X86_CMPXCHG64
454#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) \ 454#define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2) \
455({ \ 455({ \
456 char __ret; \ 456 bool __ret; \
457 typeof(o1) __o1 = o1; \ 457 typeof(pcp1) __o1 = (o1), __n1 = (n1); \
458 typeof(o1) __n1 = n1; \ 458 typeof(pcp2) __o2 = (o2), __n2 = (n2); \
459 typeof(o2) __o2 = o2; \
460 typeof(o2) __n2 = n2; \
461 typeof(o2) __dummy = n2; \
462 asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \ 459 asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \
463 : "=a"(__ret), "=m" (pcp1), "=d"(__dummy) \ 460 : "=a" (__ret), "+m" (pcp1), "+m" (pcp2), "+d" (__o2) \
464 : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \ 461 : "b" (__n1), "c" (__n2), "a" (__o1)); \
465 __ret; \ 462 __ret; \
466}) 463})
467 464
468#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) 465#define __this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double
469#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) 466#define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double
470#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) 467#define irqsafe_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double
471#endif /* CONFIG_X86_CMPXCHG64 */ 468#endif /* CONFIG_X86_CMPXCHG64 */
472 469
473/* 470/*
@@ -508,31 +505,23 @@ do { \
508 * it in software. The address used in the cmpxchg16 instruction must be 505 * it in software. The address used in the cmpxchg16 instruction must be
509 * aligned to a 16 byte boundary. 506 * aligned to a 16 byte boundary.
510 */ 507 */
511#ifdef CONFIG_SMP 508#define percpu_cmpxchg16b_double(pcp1, pcp2, o1, o2, n1, n2) \
512#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3
513#else
514#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2
515#endif
516#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \
517({ \ 509({ \
518 char __ret; \ 510 bool __ret; \
519 typeof(o1) __o1 = o1; \ 511 typeof(pcp1) __o1 = (o1), __n1 = (n1); \
520 typeof(o1) __n1 = n1; \ 512 typeof(pcp2) __o2 = (o2), __n2 = (n2); \
521 typeof(o2) __o2 = o2; \ 513 alternative_io("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu\n\t", \
522 typeof(o2) __n2 = n2; \ 514 "cmpxchg16b " __percpu_arg(1) "\n\tsetz %0\n\t", \
523 typeof(o2) __dummy; \
524 alternative_io(CMPXCHG16B_EMU_CALL, \
525 "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \
526 X86_FEATURE_CX16, \ 515 X86_FEATURE_CX16, \
527 ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \ 516 ASM_OUTPUT2("=a" (__ret), "+m" (pcp1), \
528 "S" (&pcp1), "b"(__n1), "c"(__n2), \ 517 "+m" (pcp2), "+d" (__o2)), \
529 "a"(__o1), "d"(__o2) : "memory"); \ 518 "b" (__n1), "c" (__n2), "a" (__o1) : "rsi"); \
530 __ret; \ 519 __ret; \
531}) 520})
532 521
533#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) 522#define __this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double
534#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) 523#define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double
535#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) 524#define irqsafe_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double
536 525
537#endif 526#endif
538 527
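Both macros implement a double-width compare-and-exchange on a pair of adjacent per-cpu words: the pair is compared against (o1, o2) and replaced by (n1, n2) only if both halves match, with setz capturing the result. The rewrite also passes pcp2 through, so both memory words appear as "+m" operands and the compiler cannot cache either half across the instruction. A plain-C model of the semantics (deliberately non-atomic; the real macros get atomicity from the cmpxchg8b/cmpxchg16b instructions):

#include <stdio.h>
#include <stdbool.h>

/* Non-atomic model of cmpxchg-double on two adjacent words. */
static bool cmpxchg_double_model(unsigned long *p1, unsigned long *p2,
                                 unsigned long o1, unsigned long o2,
                                 unsigned long n1, unsigned long n2)
{
        if (*p1 == o1 && *p2 == o2) {   /* both halves must match... */
                *p1 = n1;               /* ...before either is written */
                *p2 = n2;
                return true;            /* what "setz %0" captures */
        }
        return false;
}

int main(void)
{
        unsigned long a = 1, b = 2;

        printf("%d a=%lu b=%lu\n",
               cmpxchg_double_model(&a, &b, 1, 2, 3, 4), a, b);
        return 0;
}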
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index f61c62f7d5d8..096c975e099f 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -57,6 +57,7 @@
57 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) 57 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
58 58
59#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 59#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
60#define ARCH_PERFMON_EVENTS_COUNT 7
60 61
61/* 62/*
62 * Intel "Architectural Performance Monitoring" CPUID 63 * Intel "Architectural Performance Monitoring" CPUID
@@ -72,6 +73,19 @@ union cpuid10_eax {
72 unsigned int full; 73 unsigned int full;
73}; 74};
74 75
76union cpuid10_ebx {
77 struct {
78 unsigned int no_unhalted_core_cycles:1;
79 unsigned int no_instructions_retired:1;
80 unsigned int no_unhalted_reference_cycles:1;
81 unsigned int no_llc_reference:1;
82 unsigned int no_llc_misses:1;
83 unsigned int no_branch_instruction_retired:1;
84 unsigned int no_branch_misses_retired:1;
85 } split;
86 unsigned int full;
87};
88
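cpuid10_ebx maps EBX of CPUID leaf 0xA, where a set bit means the corresponding architectural event is not available on this part. A user-space sketch of decoding it with GCC's <cpuid.h> helper (x86 only; a trimmed copy of the union for illustration):

#include <stdio.h>
#include <cpuid.h>

union cpuid10_ebx_model {              /* trimmed copy for illustration */
        struct {
                unsigned int no_unhalted_core_cycles:1;
                unsigned int no_instructions_retired:1;
                unsigned int no_unhalted_reference_cycles:1;
        } split;
        unsigned int full;
};

int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        union cpuid10_ebx_model b;

        if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx))
                return 1;               /* leaf 0xA not supported */
        b.full = ebx;
        /* Set bit == event NOT available. */
        printf("ref cycles event available: %d\n",
               !b.split.no_unhalted_reference_cycles);
        return 0;
}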
75union cpuid10_edx { 89union cpuid10_edx {
76 struct { 90 struct {
77 unsigned int num_counters_fixed:5; 91 unsigned int num_counters_fixed:5;
@@ -81,6 +95,15 @@ union cpuid10_edx {
81 unsigned int full; 95 unsigned int full;
82}; 96};
83 97
98struct x86_pmu_capability {
99 int version;
100 int num_counters_gp;
101 int num_counters_fixed;
102 int bit_width_gp;
103 int bit_width_fixed;
104 unsigned int events_mask;
105 int events_mask_len;
106};
84 107
85/* 108/*
86 * Fixed-purpose performance events: 109 * Fixed-purpose performance events:
@@ -89,23 +112,24 @@ union cpuid10_edx {
89/* 112/*
90 * All 3 fixed-mode PMCs are configured via this single MSR: 113 * All 3 fixed-mode PMCs are configured via this single MSR:
91 */ 114 */
92#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d 115#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
93 116
94/* 117/*
95 * The counts are available in three separate MSRs: 118 * The counts are available in three separate MSRs:
96 */ 119 */
97 120
98/* Instr_Retired.Any: */ 121/* Instr_Retired.Any: */
99#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 122#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
100#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) 123#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
101 124
102/* CPU_CLK_Unhalted.Core: */ 125/* CPU_CLK_Unhalted.Core: */
103#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a 126#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
104#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) 127#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
105 128
106/* CPU_CLK_Unhalted.Ref: */ 129/* CPU_CLK_Unhalted.Ref: */
107#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b 130#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
108#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) 131#define X86_PMC_IDX_FIXED_REF_CYCLES (X86_PMC_IDX_FIXED + 2)
132#define X86_PMC_MSK_FIXED_REF_CYCLES (1ULL << X86_PMC_IDX_FIXED_REF_CYCLES)
109 133
110/* 134/*
111 * We model BTS tracing as another fixed-mode PMC. 135 * We model BTS tracing as another fixed-mode PMC.
@@ -202,6 +226,7 @@ struct perf_guest_switch_msr {
202}; 226};
203 227
204extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); 228extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
229extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
205#else 230#else
206static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) 231static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
207{ 232{
@@ -209,6 +234,11 @@ static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
209 return NULL; 234 return NULL;
210} 235}
211 236
237static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
238{
239 memset(cap, 0, sizeof(*cap));
240}
241
212static inline void perf_events_lapic_init(void) { } 242static inline void perf_events_lapic_init(void) { }
213#endif 243#endif
214 244
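The stub keeps callers ifdef-free: when perf support is compiled out, perf_get_x86_pmu_capability() zeroes the struct, so a caller only has to test version. A minimal model of that contract (trimmed struct, illustration only):

#include <stdio.h>
#include <string.h>

struct x86_pmu_capability {
        int version;
        int num_counters_gp;
};

/* Model of the fallback: a zeroed struct means "no architectural PMU". */
static void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
{
        memset(cap, 0, sizeof(*cap));
}

int main(void)
{
        struct x86_pmu_capability cap;

        perf_get_x86_pmu_capability(&cap);
        if (!cap.version)
                printf("no PMU support: caller backs off, no #ifdef needed\n");
        return 0;
}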
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 18601c86fab1..49afb3f41eb6 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -703,7 +703,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
703 pte_update(mm, addr, ptep); 703 pte_update(mm, addr, ptep);
704} 704}
705 705
706#define flush_tlb_fix_spurious_fault(vma, address) 706#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
707 707
708#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) 708#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
709 709
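Redefining the empty macro as do { } while (0) makes its expansion a genuine single statement, so an unbraced "if (cond) flush_tlb_fix_spurious_fault(vma, addr);" compiles without empty-body warnings and the macro can later grow a body without changing callers. A sketch of the idiom with hypothetical macro names:

#define DO_NOTHING_EMPTY                /* hypothetical */
#define DO_NOTHING_SAFE do { } while (0)

void demo(int cond)
{
        if (cond)
                DO_NOTHING_SAFE;        /* one complete statement */
        /*
         * With DO_NOTHING_EMPTY the body would be a bare ';', which
         * -Wempty-body flags, and a later multi-statement version of
         * the macro would silently change unbraced callers.
         */
}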
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 2dddb317bb39..f8ab3eaad128 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -6,6 +6,7 @@
6 * EFLAGS bits 6 * EFLAGS bits
7 */ 7 */
8#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ 8#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
9#define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */
9#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ 10#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
10#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */ 11#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
11#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ 12#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b650435ffb53..aa9088c26931 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -99,7 +99,6 @@ struct cpuinfo_x86 {
99 u16 apicid; 99 u16 apicid;
100 u16 initial_apicid; 100 u16 initial_apicid;
101 u16 x86_clflush_size; 101 u16 x86_clflush_size;
102#ifdef CONFIG_SMP
103 /* number of cores as seen by the OS: */ 102 /* number of cores as seen by the OS: */
104 u16 booted_cores; 103 u16 booted_cores;
105 /* Physical processor id: */ 104 /* Physical processor id: */
@@ -110,7 +109,6 @@ struct cpuinfo_x86 {
110 u8 compute_unit_id; 109 u8 compute_unit_id;
111 /* Index into per_cpu list: */ 110 /* Index into per_cpu list: */
112 u16 cpu_index; 111 u16 cpu_index;
113#endif
114 u32 microcode; 112 u32 microcode;
115} __attribute__((__aligned__(SMP_CACHE_BYTES))); 113} __attribute__((__aligned__(SMP_CACHE_BYTES)));
116 114
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 972c260919a3..a82c2bf504b6 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -79,23 +79,10 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
79 return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; 79 return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
80} 80}
81 81
82#if (NR_CPUS < 256)
83static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) 82static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
84{ 83{
85 asm volatile(UNLOCK_LOCK_PREFIX "incb %0" 84 __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
86 : "+m" (lock->head_tail)
87 :
88 : "memory", "cc");
89} 85}
90#else
91static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
92{
93 asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
94 : "+m" (lock->head_tail)
95 :
96 : "memory", "cc");
97}
98#endif
99 86
100static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) 87static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
101{ 88{
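The two ifdef'd unlock variants (incb for NR_CPUS < 256, incw otherwise) collapse into one width-generic __add() on tickets.head. In the ticket discipline, lock atomically takes a ticket from tail and spins until head reaches it; unlock simply advances head. A non-atomic model (illustration only; the kernel uses an atomic xadd and the __add() above):

#include <stdio.h>

/* Non-atomic model of an x86 ticket lock; the kernel uses atomic ops. */
struct ticket_lock_model {
        unsigned short head;    /* ticket currently being served */
        unsigned short tail;    /* next ticket to hand out */
};

static void lock(struct ticket_lock_model *l)
{
        unsigned short mine = l->tail++;        /* atomic xadd in reality */

        while (l->head != mine)
                ;                               /* spin until our turn */
}

static void unlock(struct ticket_lock_model *l)
{
        l->head++;      /* the single __add() the patch switches to */
}

int main(void)
{
        struct ticket_lock_model l = { 0, 0 };

        lock(&l);
        unlock(&l);
        printf("head=%u tail=%u\n", l.head, l.tail);
        return 0;
}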
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1fe5c127b52..185b719ec61a 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -40,7 +40,8 @@ struct thread_info {
40 */ 40 */
41 __u8 supervisor_stack[0]; 41 __u8 supervisor_stack[0];
42#endif 42#endif
43 int uaccess_err; 43 int sig_on_uaccess_error:1;
44 int uaccess_err:1; /* uaccess failed */
44}; 45};
45 46
46#define INIT_THREAD_INFO(tsk) \ 47#define INIT_THREAD_INFO(tsk) \
@@ -231,6 +232,12 @@ static inline struct thread_info *current_thread_info(void)
231 movq PER_CPU_VAR(kernel_stack),reg ; \ 232 movq PER_CPU_VAR(kernel_stack),reg ; \
232 subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg 233 subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg
233 234
235/*
 236 * Same thing when PER_CPU_VAR(kernel_stack), possibly plus an offset, is
 237 * already held in a register (for use in assembler memory operands).
238 */
239#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg)
240
234#endif 241#endif
235 242
236#endif /* !X86_32 */ 243#endif /* !X86_32 */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index c00692476e9f..800f77c60051 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -130,10 +130,8 @@ extern void setup_node_to_cpumask_map(void);
130 .balance_interval = 1, \ 130 .balance_interval = 1, \
131} 131}
132 132
133#ifdef CONFIG_X86_64
134extern int __node_distance(int, int); 133extern int __node_distance(int, int);
135#define node_distance(a, b) __node_distance(a, b) 134#define node_distance(a, b) __node_distance(a, b)
136#endif
137 135
138#else /* !CONFIG_NUMA */ 136#else /* !CONFIG_NUMA */
139 137
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 83e2efd181e2..15d99153a96d 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,6 +51,8 @@ extern int unsynchronized_tsc(void);
51extern int check_tsc_unstable(void); 51extern int check_tsc_unstable(void);
52extern unsigned long native_calibrate_tsc(void); 52extern unsigned long native_calibrate_tsc(void);
53 53
54extern int tsc_clocksource_reliable;
55
54/* 56/*
55 * Boot-time check whether the TSCs are synchronized across 57 * Boot-time check whether the TSCs are synchronized across
56 * all CPUs/cores: 58 * all CPUs/cores:
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 36361bf6fdd1..8be5f54d9360 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -462,7 +462,7 @@ struct __large_struct { unsigned long buf[100]; };
462 barrier(); 462 barrier();
463 463
464#define uaccess_catch(err) \ 464#define uaccess_catch(err) \
465 (err) |= current_thread_info()->uaccess_err; \ 465 (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \
466 current_thread_info()->uaccess_err = prev_err; \ 466 current_thread_info()->uaccess_err = prev_err; \
467} while (0) 467} while (0)
468 468
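Since uaccess_err is now a one-bit flag rather than an error code, uaccess_catch() can no longer OR the field into err directly; it maps the flag to -EFAULT explicitly. A small model of the new logic (an unsigned bitfield is used here for clarity; the kernel declares it int):

#include <stdio.h>

#define EFAULT 14

struct thread_info_model {
        unsigned int sig_on_uaccess_error:1;
        unsigned int uaccess_err:1;     /* was a full int error code */
};

int main(void)
{
        struct thread_info_model ti = { .uaccess_err = 1 };
        int err = 0;

        /* New uaccess_catch() logic: one-bit flag -> -EFAULT. */
        err |= ti.uaccess_err ? -EFAULT : 0;
        printf("err=%d\n", err);
        return 0;
}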
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 1971e652d24b..1ac860a09849 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -7,6 +7,7 @@
7struct mpc_bus; 7struct mpc_bus;
8struct mpc_cpu; 8struct mpc_cpu;
9struct mpc_table; 9struct mpc_table;
10struct cpuinfo_x86;
10 11
11/** 12/**
12 * struct x86_init_mpparse - platform specific mpparse ops 13 * struct x86_init_mpparse - platform specific mpparse ops
@@ -147,6 +148,7 @@ struct x86_init_ops {
147 */ 148 */
148struct x86_cpuinit_ops { 149struct x86_cpuinit_ops {
149 void (*setup_percpu_clockev)(void); 150 void (*setup_percpu_clockev)(void);
151 void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
150}; 152};
151 153
152/** 154/**
@@ -186,5 +188,6 @@ extern struct x86_msi_ops x86_msi;
186 188
187extern void x86_init_noop(void); 189extern void x86_init_noop(void);
188extern void x86_init_uint_noop(unsigned int unused); 190extern void x86_init_uint_noop(unsigned int unused);
191extern void x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node);
189 192
190#endif 193#endif
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4558f0d0822d..ce664f33ea8e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -219,6 +219,8 @@ static int __init
219acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) 219acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
220{ 220{
221 struct acpi_madt_local_x2apic *processor = NULL; 221 struct acpi_madt_local_x2apic *processor = NULL;
222 int apic_id;
223 u8 enabled;
222 224
223 processor = (struct acpi_madt_local_x2apic *)header; 225 processor = (struct acpi_madt_local_x2apic *)header;
224 226
@@ -227,6 +229,8 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
227 229
228 acpi_table_print_madt_entry(header); 230 acpi_table_print_madt_entry(header);
229 231
232 apic_id = processor->local_apic_id;
233 enabled = processor->lapic_flags & ACPI_MADT_ENABLED;
230#ifdef CONFIG_X86_X2APIC 234#ifdef CONFIG_X86_X2APIC
231 /* 235 /*
232 * We need to register disabled CPU as well to permit 236 * We need to register disabled CPU as well to permit
@@ -235,8 +239,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
235 * to not preallocating memory for all NR_CPUS 239 * to not preallocating memory for all NR_CPUS
236 * when we use CPU hotplug. 240 * when we use CPU hotplug.
237 */ 241 */
238 acpi_register_lapic(processor->local_apic_id, /* APIC ID */ 242 if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled)
239 processor->lapic_flags & ACPI_MADT_ENABLED); 243 printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
244 else
245 acpi_register_lapic(apic_id, enabled);
240#else 246#else
241 printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); 247 printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
242#endif 248#endif
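The rewritten parser refuses to register an x2APIC MADT entry only when the CPU itself lacks x2APIC, the entry is enabled, and its APIC ID does not fit xAPIC's 8-bit addressing. The predicate, restated as a sketch:

#include <stdbool.h>
#include <stdio.h>

/*
 * Sketch: xAPIC can only address IDs below 0xff, so an enabled entry
 * with a wider ID is only usable when the CPU itself speaks x2APIC.
 */
static bool x2apic_entry_usable(unsigned int apic_id, bool cpu_has_x2apic,
                                bool enabled)
{
        return !(!cpu_has_x2apic && apic_id >= 0xff && enabled);
}

int main(void)
{
        printf("%d\n", x2apic_entry_usable(0x100, false, true)); /* 0 */
        printf("%d\n", x2apic_entry_usable(0x100, true, true));  /* 1 */
        return 0;
}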
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 4c39baa8facc..013c1810ce72 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -123,16 +123,14 @@ int amd_get_subcaches(int cpu)
123{ 123{
124 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; 124 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
125 unsigned int mask; 125 unsigned int mask;
126 int cuid = 0; 126 int cuid;
127 127
128 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) 128 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
129 return 0; 129 return 0;
130 130
131 pci_read_config_dword(link, 0x1d4, &mask); 131 pci_read_config_dword(link, 0x1d4, &mask);
132 132
133#ifdef CONFIG_SMP
134 cuid = cpu_data(cpu).compute_unit_id; 133 cuid = cpu_data(cpu).compute_unit_id;
135#endif
136 return (mask >> (4 * cuid)) & 0xf; 134 return (mask >> (4 * cuid)) & 0xf;
137} 135}
138 136
@@ -141,7 +139,7 @@ int amd_set_subcaches(int cpu, int mask)
141 static unsigned int reset, ban; 139 static unsigned int reset, ban;
142 struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); 140 struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
143 unsigned int reg; 141 unsigned int reg;
144 int cuid = 0; 142 int cuid;
145 143
146 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) 144 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
147 return -EINVAL; 145 return -EINVAL;
@@ -159,9 +157,7 @@ int amd_set_subcaches(int cpu, int mask)
159 pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); 157 pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
160 } 158 }
161 159
162#ifdef CONFIG_SMP
163 cuid = cpu_data(cpu).compute_unit_id; 160 cuid = cpu_data(cpu).compute_unit_id;
164#endif
165 mask <<= 4 * cuid; 161 mask <<= 4 * cuid;
166 mask |= (0xf ^ (1 << cuid)) << 26; 162 mask |= (0xf ^ (1 << cuid)) << 26;
167 163
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 3d2661ca6542..6e76c191a835 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -88,13 +88,13 @@ static u32 __init allocate_aperture(void)
88 */ 88 */
89 addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, 89 addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
90 aper_size, aper_size); 90 aper_size, aper_size);
91 if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) { 91 if (!addr || addr + aper_size > GART_MAX_ADDR) {
92 printk(KERN_ERR 92 printk(KERN_ERR
93 "Cannot allocate aperture memory hole (%lx,%uK)\n", 93 "Cannot allocate aperture memory hole (%lx,%uK)\n",
94 addr, aper_size>>10); 94 addr, aper_size>>10);
95 return 0; 95 return 0;
96 } 96 }
97 memblock_x86_reserve_range(addr, addr + aper_size, "aperture64"); 97 memblock_reserve(addr, aper_size);
98 /* 98 /*
99 * Kmemleak should not scan this block as it may not be mapped via the 99 * Kmemleak should not scan this block as it may not be mapped via the
100 * kernel direct mapping. 100 * kernel direct mapping.
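This hunk reflects the memblock API cleanup running through the series: memblock_find_in_range() now signals failure by returning 0 rather than the old MEMBLOCK_ERROR cookie, and reservations take a (base, size) pair via memblock_reserve(). A sketch of the resulting call pattern with a stubbed finder (hypothetical name):

#include <stdio.h>

typedef unsigned long long phys_addr_t;

/* Stub standing in for memblock_find_in_range(): 0 means failure. */
static phys_addr_t find_in_range(phys_addr_t lo, phys_addr_t hi,
                                 phys_addr_t size, phys_addr_t align)
{
        (void)lo; (void)hi; (void)align; (void)size;
        return 0;       /* pretend the allocation failed */
}

int main(void)
{
        phys_addr_t size = 64 << 20;
        phys_addr_t addr = find_in_range(1ULL << 24, 1ULL << 32, size, size);

        if (!addr) {    /* was: addr == MEMBLOCK_ERROR */
                fprintf(stderr, "cannot allocate aperture hole\n");
                return 1;
        }
        /* memblock_reserve(addr, size) would pin the range here. */
        return 0;
}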
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 767fd04f2843..0ae0323b1f9c 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_SMP) += ipi.o
10 10
11ifeq ($(CONFIG_X86_64),y) 11ifeq ($(CONFIG_X86_64),y)
12# APIC probe will depend on the listing order here 12# APIC probe will depend on the listing order here
13obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o
13obj-$(CONFIG_X86_UV) += x2apic_uv_x.o 14obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
14obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o 15obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
15obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o 16obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f98d84caf94c..2eec05b6d1b8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -146,16 +146,26 @@ __setup("apicpmtimer", setup_apicpmtimer);
146int x2apic_mode; 146int x2apic_mode;
147#ifdef CONFIG_X86_X2APIC 147#ifdef CONFIG_X86_X2APIC
148/* x2apic enabled before OS handover */ 148/* x2apic enabled before OS handover */
149static int x2apic_preenabled; 149int x2apic_preenabled;
150static int x2apic_disabled;
151static int nox2apic;
150static __init int setup_nox2apic(char *str) 152static __init int setup_nox2apic(char *str)
151{ 153{
152 if (x2apic_enabled()) { 154 if (x2apic_enabled()) {
153 pr_warning("Bios already enabled x2apic, " 155 int apicid = native_apic_msr_read(APIC_ID);
154 "can't enforce nox2apic"); 156
155 return 0; 157 if (apicid >= 255) {
156 } 158 pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
159 apicid);
160 return 0;
161 }
162
 163 pr_warning("x2apic already enabled; will disable it\n");
164 } else
165 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
166
167 nox2apic = 1;
157 168
158 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
159 return 0; 169 return 0;
160} 170}
161early_param("nox2apic", setup_nox2apic); 171early_param("nox2apic", setup_nox2apic);
@@ -250,6 +260,7 @@ u32 native_safe_apic_wait_icr_idle(void)
250 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; 260 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
251 if (!send_status) 261 if (!send_status)
252 break; 262 break;
263 inc_irq_stat(icr_read_retry_count);
253 udelay(100); 264 udelay(100);
254 } while (timeout++ < 1000); 265 } while (timeout++ < 1000);
255 266
@@ -876,8 +887,8 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
 876 * Besides, if we didn't, timer interrupts would ignore the 887 * Besides, if we didn't, timer interrupts would ignore the
 877 * global interrupt lock, which is the WrongThing (tm) to do. 888 * global interrupt lock, which is the WrongThing (tm) to do.
878 */ 889 */
879 exit_idle();
880 irq_enter(); 890 irq_enter();
891 exit_idle();
881 local_apic_timer_interrupt(); 892 local_apic_timer_interrupt();
882 irq_exit(); 893 irq_exit();
883 894
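This hunk, and several like it below, move exit_idle() after irq_enter(). The apparent motivation: the idle-exit notifier chain uses RCU, and with RCU treating idle as an extended quiescent state, such use is only legal once irq_enter() has done its RCU/tick bookkeeping. A stubbed model of the required ordering:

#include <stdio.h>

static void irq_enter(void) { printf("irq_enter: RCU/tick bookkeeping\n"); }
static void exit_idle(void) { printf("exit_idle: run idle notifiers\n"); }
static void irq_exit(void)  { printf("irq_exit\n"); }

int main(void)
{
        irq_enter();    /* must come first: the notifiers may use RCU */
        exit_idle();
        /* ... actual interrupt handling ... */
        irq_exit();
        return 0;
}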
@@ -1431,6 +1442,45 @@ void __init bsp_end_local_APIC_setup(void)
1431} 1442}
1432 1443
1433#ifdef CONFIG_X86_X2APIC 1444#ifdef CONFIG_X86_X2APIC
1445/*
 1446 * We need to clear the xapic and x2apic enable bits together first, and only then re-enable plain xapic mode
1447 */
1448static inline void __disable_x2apic(u64 msr)
1449{
1450 wrmsrl(MSR_IA32_APICBASE,
1451 msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
1452 wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
1453}
1454
1455static __init void disable_x2apic(void)
1456{
1457 u64 msr;
1458
1459 if (!cpu_has_x2apic)
1460 return;
1461
1462 rdmsrl(MSR_IA32_APICBASE, msr);
1463 if (msr & X2APIC_ENABLE) {
1464 u32 x2apic_id = read_apic_id();
1465
1466 if (x2apic_id >= 255)
1467 panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
1468
1469 pr_info("Disabling x2apic\n");
1470 __disable_x2apic(msr);
1471
1472 if (nox2apic) {
1473 clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC);
1474 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
1475 }
1476
1477 x2apic_disabled = 1;
1478 x2apic_mode = 0;
1479
1480 register_lapic_address(mp_lapic_addr);
1481 }
1482}
1483
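__disable_x2apic() clears the x2APIC and xAPIC enable bits together, then rewrites the MSR with only x2APIC cleared: per Intel's documented mode transitions, the APIC cannot go from x2APIC straight back to xAPIC but must pass through the disabled state. A model of the two-step write (bit positions per the SDM):

#include <stdio.h>
#include <stdint.h>

#define XAPIC_ENABLE  (1ULL << 11)     /* APIC global enable */
#define X2APIC_ENABLE (1ULL << 10)     /* x2APIC mode enable */

static uint64_t apicbase;              /* stands in for MSR_IA32_APICBASE */

static void wrmsrl_model(uint64_t val) { apicbase = val; }

/* Two-step transition: x2apic -> disabled -> xapic. */
static void disable_x2apic_model(uint64_t msr)
{
        wrmsrl_model(msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
        wrmsrl_model(msr & ~X2APIC_ENABLE);
}

int main(void)
{
        disable_x2apic_model(XAPIC_ENABLE | X2APIC_ENABLE);
        printf("xapic=%d x2apic=%d\n",
               !!(apicbase & XAPIC_ENABLE), !!(apicbase & X2APIC_ENABLE));
        return 0;
}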
1434void check_x2apic(void) 1484void check_x2apic(void)
1435{ 1485{
1436 if (x2apic_enabled()) { 1486 if (x2apic_enabled()) {
@@ -1441,15 +1491,20 @@ void check_x2apic(void)
1441 1491
1442void enable_x2apic(void) 1492void enable_x2apic(void)
1443{ 1493{
1444 int msr, msr2; 1494 u64 msr;
1495
1496 rdmsrl(MSR_IA32_APICBASE, msr);
1497 if (x2apic_disabled) {
1498 __disable_x2apic(msr);
1499 return;
1500 }
1445 1501
1446 if (!x2apic_mode) 1502 if (!x2apic_mode)
1447 return; 1503 return;
1448 1504
1449 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1450 if (!(msr & X2APIC_ENABLE)) { 1505 if (!(msr & X2APIC_ENABLE)) {
1451 printk_once(KERN_INFO "Enabling x2apic\n"); 1506 printk_once(KERN_INFO "Enabling x2apic\n");
1452 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2); 1507 wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
1453 } 1508 }
1454} 1509}
1455#endif /* CONFIG_X86_X2APIC */ 1510#endif /* CONFIG_X86_X2APIC */
@@ -1486,25 +1541,34 @@ void __init enable_IR_x2apic(void)
1486 ret = save_ioapic_entries(); 1541 ret = save_ioapic_entries();
1487 if (ret) { 1542 if (ret) {
1488 pr_info("Saving IO-APIC state failed: %d\n", ret); 1543 pr_info("Saving IO-APIC state failed: %d\n", ret);
1489 goto out; 1544 return;
1490 } 1545 }
1491 1546
1492 local_irq_save(flags); 1547 local_irq_save(flags);
1493 legacy_pic->mask_all(); 1548 legacy_pic->mask_all();
1494 mask_ioapic_entries(); 1549 mask_ioapic_entries();
1495 1550
1551 if (x2apic_preenabled && nox2apic)
1552 disable_x2apic();
1553
1496 if (dmar_table_init_ret) 1554 if (dmar_table_init_ret)
1497 ret = -1; 1555 ret = -1;
1498 else 1556 else
1499 ret = enable_IR(); 1557 ret = enable_IR();
1500 1558
1559 if (!x2apic_supported())
1560 goto skip_x2apic;
1561
1501 if (ret < 0) { 1562 if (ret < 0) {
1502 /* IR is required if there is APIC ID > 255 even when running 1563 /* IR is required if there is APIC ID > 255 even when running
1503 * under KVM 1564 * under KVM
1504 */ 1565 */
1505 if (max_physical_apicid > 255 || 1566 if (max_physical_apicid > 255 ||
1506 !hypervisor_x2apic_available()) 1567 !hypervisor_x2apic_available()) {
1507 goto nox2apic; 1568 if (x2apic_preenabled)
1569 disable_x2apic();
1570 goto skip_x2apic;
1571 }
1508 /* 1572 /*
1509 * without IR all CPUs can be addressed by IOAPIC/MSI 1573 * without IR all CPUs can be addressed by IOAPIC/MSI
1510 * only in physical mode 1574 * only in physical mode
@@ -1512,8 +1576,10 @@ void __init enable_IR_x2apic(void)
1512 x2apic_force_phys(); 1576 x2apic_force_phys();
1513 } 1577 }
1514 1578
1515 if (ret == IRQ_REMAP_XAPIC_MODE) 1579 if (ret == IRQ_REMAP_XAPIC_MODE) {
1516 goto nox2apic; 1580 pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
1581 goto skip_x2apic;
1582 }
1517 1583
1518 x2apic_enabled = 1; 1584 x2apic_enabled = 1;
1519 1585
@@ -1523,22 +1589,11 @@ void __init enable_IR_x2apic(void)
1523 pr_info("Enabled x2apic\n"); 1589 pr_info("Enabled x2apic\n");
1524 } 1590 }
1525 1591
1526nox2apic: 1592skip_x2apic:
1527 if (ret < 0) /* IR enabling failed */ 1593 if (ret < 0) /* IR enabling failed */
1528 restore_ioapic_entries(); 1594 restore_ioapic_entries();
1529 legacy_pic->restore_mask(); 1595 legacy_pic->restore_mask();
1530 local_irq_restore(flags); 1596 local_irq_restore(flags);
1531
1532out:
1533 if (x2apic_enabled || !x2apic_supported())
1534 return;
1535
1536 if (x2apic_preenabled)
1537 panic("x2apic: enabled by BIOS but kernel init failed.");
1538 else if (ret == IRQ_REMAP_XAPIC_MODE)
1539 pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
1540 else if (ret < 0)
1541 pr_info("x2apic not enabled, IRQ remapping init failed\n");
1542} 1597}
1543 1598
1544#ifdef CONFIG_X86_64 1599#ifdef CONFIG_X86_64
@@ -1809,8 +1864,8 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1809{ 1864{
1810 u32 v; 1865 u32 v;
1811 1866
1812 exit_idle();
1813 irq_enter(); 1867 irq_enter();
1868 exit_idle();
1814 /* 1869 /*
1815 * Check if this really is a spurious interrupt and ACK it 1870 * Check if this really is a spurious interrupt and ACK it
1816 * if it is a vectored one. Just in case... 1871 * if it is a vectored one. Just in case...
@@ -1846,8 +1901,8 @@ void smp_error_interrupt(struct pt_regs *regs)
1846 "Illegal register address", /* APIC Error Bit 7 */ 1901 "Illegal register address", /* APIC Error Bit 7 */
1847 }; 1902 };
1848 1903
1849 exit_idle();
1850 irq_enter(); 1904 irq_enter();
1905 exit_idle();
1851 /* First tickle the hardware, only then report what went on. -- REW */ 1906 /* First tickle the hardware, only then report what went on. -- REW */
1852 v0 = apic_read(APIC_ESR); 1907 v0 = apic_read(APIC_ESR);
1853 apic_write(APIC_ESR, 0); 1908 apic_write(APIC_ESR, 0);
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index f7a41e4cae47..8c3cdded6f2b 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -62,7 +62,7 @@ static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
62 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel 62 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
63 * document number 292116). So here it goes... 63 * document number 292116). So here it goes...
64 */ 64 */
65static void flat_init_apic_ldr(void) 65void flat_init_apic_ldr(void)
66{ 66{
67 unsigned long val; 67 unsigned long val;
68 unsigned long num, id; 68 unsigned long num, id;
@@ -171,9 +171,14 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
171 return initial_apic_id >> index_msb; 171 return initial_apic_id >> index_msb;
172} 172}
173 173
174static int flat_probe(void)
175{
176 return 1;
177}
178
174static struct apic apic_flat = { 179static struct apic apic_flat = {
175 .name = "flat", 180 .name = "flat",
176 .probe = NULL, 181 .probe = flat_probe,
177 .acpi_madt_oem_check = flat_acpi_madt_oem_check, 182 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
178 .apic_id_registered = flat_apic_id_registered, 183 .apic_id_registered = flat_apic_id_registered,
179 184
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
new file mode 100644
index 000000000000..09d3d8c1cd99
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -0,0 +1,294 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Numascale NumaConnect-Specific APIC Code
7 *
8 * Copyright (C) 2011 Numascale AS. All rights reserved.
9 *
10 * Send feedback to <support@numascale.com>
11 *
12 */
13
14#include <linux/errno.h>
15#include <linux/threads.h>
16#include <linux/cpumask.h>
17#include <linux/string.h>
18#include <linux/kernel.h>
19#include <linux/module.h>
20#include <linux/ctype.h>
21#include <linux/init.h>
22#include <linux/hardirq.h>
23#include <linux/delay.h>
24
25#include <asm/numachip/numachip_csr.h>
26#include <asm/smp.h>
27#include <asm/apic.h>
28#include <asm/ipi.h>
29#include <asm/apic_flat_64.h>
30
31static int numachip_system __read_mostly;
32
33static struct apic apic_numachip __read_mostly;
34
35static unsigned int get_apic_id(unsigned long x)
36{
37 unsigned long value;
38 unsigned int id;
39
40 rdmsrl(MSR_FAM10H_NODE_ID, value);
41 id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U);
42
43 return id;
44}
45
46static unsigned long set_apic_id(unsigned int id)
47{
48 unsigned long x;
49
50 x = ((id & 0xffU) << 24);
51 return x;
52}
53
54static unsigned int read_xapic_id(void)
55{
56 return get_apic_id(apic_read(APIC_ID));
57}
58
59static int numachip_apic_id_registered(void)
60{
61 return physid_isset(read_xapic_id(), phys_cpu_present_map);
62}
63
64static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
65{
66 return initial_apic_id >> index_msb;
67}
68
69static const struct cpumask *numachip_target_cpus(void)
70{
71 return cpu_online_mask;
72}
73
74static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask)
75{
76 cpumask_clear(retmask);
77 cpumask_set_cpu(cpu, retmask);
78}
79
80static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
81{
82 union numachip_csr_g3_ext_irq_gen int_gen;
83
84 int_gen.s._destination_apic_id = phys_apicid;
85 int_gen.s._vector = 0;
86 int_gen.s._msgtype = APIC_DM_INIT >> 8;
87 int_gen.s._index = 0;
88
89 write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
90
91 int_gen.s._msgtype = APIC_DM_STARTUP >> 8;
92 int_gen.s._vector = start_rip >> 12;
93
94 write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
95
96 atomic_set(&init_deasserted, 1);
97 return 0;
98}
99
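numachip_wakeup_secondary() drives the standard INIT/SIPI wakeup through the chip's external-IRQ CSR; the startup IPI's 8-bit vector is the real-mode page number of the entry point, which is why the code sends start_rip >> 12. A sketch of that encoding:

#include <stdio.h>

/* SIPI vector = physical page number of the real-mode trampoline. */
static unsigned int sipi_vector(unsigned long start_rip)
{
        return (start_rip >> 12) & 0xff;
}

int main(void)
{
        /* e.g. a trampoline at 0x96000 yields vector 0x96 */
        printf("vector=%#x\n", sipi_vector(0x96000));
        return 0;
}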
100static void numachip_send_IPI_one(int cpu, int vector)
101{
102 union numachip_csr_g3_ext_irq_gen int_gen;
103 int apicid = per_cpu(x86_cpu_to_apicid, cpu);
104
105 int_gen.s._destination_apic_id = apicid;
106 int_gen.s._vector = vector;
107 int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8;
108 int_gen.s._index = 0;
109
110 write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
111}
112
113static void numachip_send_IPI_mask(const struct cpumask *mask, int vector)
114{
115 unsigned int cpu;
116
117 for_each_cpu(cpu, mask)
118 numachip_send_IPI_one(cpu, vector);
119}
120
121static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask,
122 int vector)
123{
124 unsigned int this_cpu = smp_processor_id();
125 unsigned int cpu;
126
127 for_each_cpu(cpu, mask) {
128 if (cpu != this_cpu)
129 numachip_send_IPI_one(cpu, vector);
130 }
131}
132
133static void numachip_send_IPI_allbutself(int vector)
134{
135 unsigned int this_cpu = smp_processor_id();
136 unsigned int cpu;
137
138 for_each_online_cpu(cpu) {
139 if (cpu != this_cpu)
140 numachip_send_IPI_one(cpu, vector);
141 }
142}
143
144static void numachip_send_IPI_all(int vector)
145{
146 numachip_send_IPI_mask(cpu_online_mask, vector);
147}
148
149static void numachip_send_IPI_self(int vector)
150{
151 __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
152}
153
154static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask)
155{
156 int cpu;
157
158 /*
159 * We're using fixed IRQ delivery, can only return one phys APIC ID.
160 * May as well be the first.
161 */
162 cpu = cpumask_first(cpumask);
163 if (likely((unsigned)cpu < nr_cpu_ids))
164 return per_cpu(x86_cpu_to_apicid, cpu);
165
166 return BAD_APICID;
167}
168
169static unsigned int
170numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
171 const struct cpumask *andmask)
172{
173 int cpu;
174
175 /*
176 * We're using fixed IRQ delivery, can only return one phys APIC ID.
177 * May as well be the first.
178 */
179 for_each_cpu_and(cpu, cpumask, andmask) {
180 if (cpumask_test_cpu(cpu, cpu_online_mask))
181 break;
182 }
183 return per_cpu(x86_cpu_to_apicid, cpu);
184}
185
186static int __init numachip_probe(void)
187{
188 return apic == &apic_numachip;
189}
190
191static void __init map_csrs(void)
192{
193 printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n",
194 NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1);
195 init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
196
197 printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n",
198 NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1);
199 init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
200}
201
202static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
203{
204 c->phys_proc_id = node;
205 per_cpu(cpu_llc_id, smp_processor_id()) = node;
206}
207
208static int __init numachip_system_init(void)
209{
210 unsigned int val;
211
212 if (!numachip_system)
213 return 0;
214
215 x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
216
217 map_csrs();
218
219 val = read_lcsr(CSR_G0_NODE_IDS);
220 printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val);
221
222 return 0;
223}
224early_initcall(numachip_system_init);
225
226static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
227{
228 if (!strncmp(oem_id, "NUMASC", 6)) {
229 numachip_system = 1;
230 return 1;
231 }
232
233 return 0;
234}
235
236static struct apic apic_numachip __refconst = {
237
238 .name = "NumaConnect system",
239 .probe = numachip_probe,
240 .acpi_madt_oem_check = numachip_acpi_madt_oem_check,
241 .apic_id_registered = numachip_apic_id_registered,
242
243 .irq_delivery_mode = dest_Fixed,
244 .irq_dest_mode = 0, /* physical */
245
246 .target_cpus = numachip_target_cpus,
247 .disable_esr = 0,
248 .dest_logical = 0,
249 .check_apicid_used = NULL,
250 .check_apicid_present = NULL,
251
252 .vector_allocation_domain = numachip_vector_allocation_domain,
253 .init_apic_ldr = flat_init_apic_ldr,
254
255 .ioapic_phys_id_map = NULL,
256 .setup_apic_routing = NULL,
257 .multi_timer_check = NULL,
258 .cpu_present_to_apicid = default_cpu_present_to_apicid,
259 .apicid_to_cpu_present = NULL,
260 .setup_portio_remap = NULL,
261 .check_phys_apicid_present = default_check_phys_apicid_present,
262 .enable_apic_mode = NULL,
263 .phys_pkg_id = numachip_phys_pkg_id,
264 .mps_oem_check = NULL,
265
266 .get_apic_id = get_apic_id,
267 .set_apic_id = set_apic_id,
268 .apic_id_mask = 0xffU << 24,
269
270 .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid,
271 .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and,
272
273 .send_IPI_mask = numachip_send_IPI_mask,
274 .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
275 .send_IPI_allbutself = numachip_send_IPI_allbutself,
276 .send_IPI_all = numachip_send_IPI_all,
277 .send_IPI_self = numachip_send_IPI_self,
278
279 .wakeup_secondary_cpu = numachip_wakeup_secondary,
280 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
281 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
282 .wait_for_init_deassert = NULL,
283 .smp_callin_clear_local_apic = NULL,
284 .inquire_remote_apic = NULL, /* REMRD not supported */
285
286 .read = native_apic_mem_read,
287 .write = native_apic_mem_write,
288 .icr_read = native_apic_icr_read,
289 .icr_write = native_apic_icr_write,
290 .wait_icr_idle = native_apic_wait_icr_idle,
291 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
292};
293apic_driver(apic_numachip);
294
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 6d939d7847e2..fb072754bc1d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2421,8 +2421,8 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2421 unsigned vector, me; 2421 unsigned vector, me;
2422 2422
2423 ack_APIC_irq(); 2423 ack_APIC_irq();
2424 exit_idle();
2425 irq_enter(); 2424 irq_enter();
2425 exit_idle();
2426 2426
2427 me = smp_processor_id(); 2427 me = smp_processor_id();
2428 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 2428 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
@@ -2948,6 +2948,10 @@ static inline void __init check_timer(void)
2948 } 2948 }
2949 local_irq_disable(); 2949 local_irq_disable();
2950 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); 2950 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
2951 if (x2apic_preenabled)
2952 apic_printk(APIC_QUIET, KERN_INFO
2953 "Perhaps problem with the pre-enabled x2apic mode\n"
2954 "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
2951 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " 2955 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2952 "report. Then try booting with the 'noapic' option.\n"); 2956 "report. Then try booting with the 'noapic' option.\n");
2953out: 2957out:
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 452932d34730..5da1269e8ddc 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -62,7 +62,8 @@ early_param("memory_corruption_check_size", set_corruption_check_size);
62 62
63void __init setup_bios_corruption_check(void) 63void __init setup_bios_corruption_check(void)
64{ 64{
65 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ 65 phys_addr_t start, end;
66 u64 i;
66 67
67 if (memory_corruption_check == -1) { 68 if (memory_corruption_check == -1) {
68 memory_corruption_check = 69 memory_corruption_check =
@@ -82,28 +83,23 @@ void __init setup_bios_corruption_check(void)
82 83
83 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); 84 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
84 85
85 while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { 86 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
86 u64 size; 87 start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
87 addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE); 88 PAGE_SIZE, corruption_check_size);
89 end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
90 PAGE_SIZE, corruption_check_size);
91 if (start >= end)
92 continue;
88 93
89 if (addr == MEMBLOCK_ERROR) 94 memblock_reserve(start, end - start);
90 break; 95 scan_areas[num_scan_areas].addr = start;
91 96 scan_areas[num_scan_areas].size = end - start;
92 if (addr >= corruption_check_size)
93 break;
94
95 if ((addr + size) > corruption_check_size)
96 size = corruption_check_size - addr;
97
98 memblock_x86_reserve_range(addr, addr + size, "SCAN RAM");
99 scan_areas[num_scan_areas].addr = addr;
100 scan_areas[num_scan_areas].size = size;
101 num_scan_areas++;
102 97
103 /* Assume we've already mapped this early memory */ 98 /* Assume we've already mapped this early memory */
104 memset(__va(addr), 0, size); 99 memset(__va(start), 0, end - start);
105 100
106 addr += size; 101 if (++num_scan_areas >= MAX_SCAN_AREAS)
102 break;
107 } 103 }
108 104
109 if (num_scan_areas) 105 if (num_scan_areas)
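Each free range is rounded to page boundaries and clamped into [PAGE_SIZE, corruption_check_size) before being reserved and scanned; clamp_t(type, val, lo, hi) is min(max(val, lo), hi) evaluated in the given type. A standalone sketch of the clamping step (simplified clamp_t, illustration only):

#include <stdio.h>

#define PAGE_SIZE 4096UL

#define clamp_t(type, val, lo, hi) \
        ((type)(val) < (type)(lo) ? (type)(lo) : \
         (type)(val) > (type)(hi) ? (type)(hi) : (type)(val))

int main(void)
{
        unsigned long limit = 64 * 1024;        /* corruption_check_size */
        unsigned long start = 100;              /* below the first page */
        unsigned long end   = 1 << 20;          /* beyond the limit */

        start = clamp_t(unsigned long,
                        (start + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1),
                        PAGE_SIZE, limit);
        end   = clamp_t(unsigned long, end & ~(PAGE_SIZE - 1),
                        PAGE_SIZE, limit);
        if (start < end)
                printf("scan [%#lx, %#lx)\n", start, end);
        return 0;
}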
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 0bab2b18bb20..f4773f4aae35 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -148,7 +148,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
148 148
149static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) 149static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
150{ 150{
151#ifdef CONFIG_SMP
152 /* calling is from identify_secondary_cpu() ? */ 151 /* calling is from identify_secondary_cpu() ? */
153 if (!c->cpu_index) 152 if (!c->cpu_index)
154 return; 153 return;
@@ -192,7 +191,6 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
192 191
193valid_k7: 192valid_k7:
194 ; 193 ;
195#endif
196} 194}
197 195
198static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) 196static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
@@ -353,6 +351,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
353 if (node == NUMA_NO_NODE) 351 if (node == NUMA_NO_NODE)
354 node = per_cpu(cpu_llc_id, cpu); 352 node = per_cpu(cpu_llc_id, cpu);
355 353
354 /*
355 * If core numbers are inconsistent, it's likely a multi-fabric platform,
 356 * so invoke the platform-specific handler
357 */
358 if (c->phys_proc_id != node)
359 x86_cpuinit.fixup_cpu_id(c, node);
360
356 if (!node_online(node)) { 361 if (!node_online(node)) {
357 /* 362 /*
358 * Two possibilities here: 363 * Two possibilities here:
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e58d978e0758..159103c0b1f4 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -278,7 +278,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
278 } 278 }
279#ifdef CONFIG_X86_32 279#ifdef CONFIG_X86_32
280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ 280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */
281 if (c->x86_model >= 6 && c->x86_model <= 9) { 281 if (c->x86_model >= 6 && c->x86_model <= 13) {
282 rdmsr(MSR_VIA_FCR, lo, hi); 282 rdmsr(MSR_VIA_FCR, lo, hi);
283 lo |= (1<<1 | 1<<7); 283 lo |= (1<<1 | 1<<7);
284 wrmsr(MSR_VIA_FCR, lo, hi); 284 wrmsr(MSR_VIA_FCR, lo, hi);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index aa003b13a831..850f2963a420 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -676,9 +676,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
676 if (this_cpu->c_early_init) 676 if (this_cpu->c_early_init)
677 this_cpu->c_early_init(c); 677 this_cpu->c_early_init(c);
678 678
679#ifdef CONFIG_SMP
680 c->cpu_index = 0; 679 c->cpu_index = 0;
681#endif
682 filter_cpuid_features(c, false); 680 filter_cpuid_features(c, false);
683 681
684 setup_smep(c); 682 setup_smep(c);
@@ -764,10 +762,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
764 c->apicid = c->initial_apicid; 762 c->apicid = c->initial_apicid;
765# endif 763# endif
766#endif 764#endif
767
768#ifdef CONFIG_X86_HT
769 c->phys_proc_id = c->initial_apicid; 765 c->phys_proc_id = c->initial_apicid;
770#endif
771 } 766 }
772 767
773 setup_smep(c); 768 setup_smep(c);
@@ -1141,6 +1136,15 @@ static void dbg_restore_debug_regs(void)
1141#endif /* ! CONFIG_KGDB */ 1136#endif /* ! CONFIG_KGDB */
1142 1137
1143/* 1138/*
 1139 * Prints an error when the NUMA node and the configured core number mismatch
 1140 * and the platform didn't override this callback to fix it up
1141 */
1142void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node)
1143{
1144 pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id);
1145}
1146
1147/*
1144 * cpu_init() initializes state that is per-CPU. Some data is already 1148 * cpu_init() initializes state that is per-CPU. Some data is already
1145 * initialized (naturally) in the bootstrap process, such as the GDT 1149 * initialized (naturally) in the bootstrap process, such as the GDT
1146 * and IDT. We reload them nevertheless, this function acts as a 1150 * and IDT. We reload them nevertheless, this function acts as a
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 1b22dcc51af4..8bacc7826fb3 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,5 +1,4 @@
1#ifndef ARCH_X86_CPU_H 1#ifndef ARCH_X86_CPU_H
2
3#define ARCH_X86_CPU_H 2#define ARCH_X86_CPU_H
4 3
5struct cpu_model_info { 4struct cpu_model_info {
@@ -35,6 +34,4 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
35 34
36extern void get_cpu_cap(struct cpuinfo_x86 *c); 35extern void get_cpu_cap(struct cpuinfo_x86 *c);
37extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); 36extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
38extern void get_cpu_cap(struct cpuinfo_x86 *c); 37#endif /* ARCH_X86_CPU_H */
39
40#endif
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 523131213f08..3e6ff6cbf42a 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -181,7 +181,6 @@ static void __cpuinit trap_init_f00f_bug(void)
181 181
182static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) 182static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
183{ 183{
184#ifdef CONFIG_SMP
185 /* calling is from identify_secondary_cpu() ? */ 184 /* calling is from identify_secondary_cpu() ? */
186 if (!c->cpu_index) 185 if (!c->cpu_index)
187 return; 186 return;
@@ -198,7 +197,6 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
198 WARN_ONCE(1, "WARNING: SMP operation may be unreliable" 197 WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
199 "with B stepping processors.\n"); 198 "with B stepping processors.\n");
200 } 199 }
201#endif
202} 200}
203 201
204static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) 202static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 2af127d4c3d1..e9c9d0aab36a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -119,9 +119,7 @@ void mce_setup(struct mce *m)
119 m->time = get_seconds(); 119 m->time = get_seconds();
120 m->cpuvendor = boot_cpu_data.x86_vendor; 120 m->cpuvendor = boot_cpu_data.x86_vendor;
121 m->cpuid = cpuid_eax(1); 121 m->cpuid = cpuid_eax(1);
122#ifdef CONFIG_SMP
123 m->socketid = cpu_data(m->extcpu).phys_proc_id; 122 m->socketid = cpu_data(m->extcpu).phys_proc_id;
124#endif
125 m->apicid = cpu_data(m->extcpu).initial_apicid; 123 m->apicid = cpu_data(m->extcpu).initial_apicid;
126 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 124 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
127} 125}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f5474218cffe..1d76872b6a45 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -64,11 +64,9 @@ struct threshold_bank {
64}; 64};
65static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); 65static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
66 66
67#ifdef CONFIG_SMP
68static unsigned char shared_bank[NR_BANKS] = { 67static unsigned char shared_bank[NR_BANKS] = {
69 0, 0, 0, 0, 1 68 0, 0, 0, 0, 1
70}; 69};
71#endif
72 70
73static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ 71static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
74 72
@@ -202,10 +200,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
202 200
203 if (!block) 201 if (!block)
204 per_cpu(bank_map, cpu) |= (1 << bank); 202 per_cpu(bank_map, cpu) |= (1 << bank);
205#ifdef CONFIG_SMP
206 if (shared_bank[bank] && c->cpu_core_id) 203 if (shared_bank[bank] && c->cpu_core_id)
207 break; 204 break;
208#endif 205
209 offset = setup_APIC_mce(offset, 206 offset = setup_APIC_mce(offset,
210 (high & MASK_LVTOFF_HI) >> 20); 207 (high & MASK_LVTOFF_HI) >> 20);
211 208
@@ -531,7 +528,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
531 528
532 sprintf(name, "threshold_bank%i", bank); 529 sprintf(name, "threshold_bank%i", bank);
533 530
534#ifdef CONFIG_SMP
535 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 531 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
536 i = cpumask_first(cpu_llc_shared_mask(cpu)); 532 i = cpumask_first(cpu_llc_shared_mask(cpu));
537 533
@@ -558,7 +554,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
558 554
559 goto out; 555 goto out;
560 } 556 }
561#endif
562 557
563 b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); 558 b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
564 if (!b) { 559 if (!b) {
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 787e06c84ea6..ce215616d5b9 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -397,8 +397,8 @@ static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
397 397
398asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) 398asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
399{ 399{
400 exit_idle();
401 irq_enter(); 400 irq_enter();
401 exit_idle();
402 inc_irq_stat(irq_thermal_count); 402 inc_irq_stat(irq_thermal_count);
403 smp_thermal_vector(); 403 smp_thermal_vector();
404 irq_exit(); 404 irq_exit();
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index d746df2909c9..aa578cadb940 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -19,8 +19,8 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19 19
20asmlinkage void smp_threshold_interrupt(void) 20asmlinkage void smp_threshold_interrupt(void)
21{ 21{
22 exit_idle();
23 irq_enter(); 22 irq_enter();
23 exit_idle();
24 inc_irq_stat(irq_threshold_count); 24 inc_irq_stat(irq_threshold_count);
25 mce_threshold_vector(); 25 mce_threshold_vector();
26 irq_exit(); 26 irq_exit();
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2bda212a0010..5adce1040b11 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -484,18 +484,195 @@ static inline int is_x86_event(struct perf_event *event)
484 return event->pmu == &pmu; 484 return event->pmu == &pmu;
485} 485}
486 486
487/*
488 * Event scheduler state:
489 *
490 * Assign events iterating over all events and counters, beginning
491 * with events with least weights first. Keep the current iterator
492 * state in struct sched_state.
493 */
494struct sched_state {
495 int weight;
496 int event; /* event index */
497 int counter; /* counter index */
 498 int unassigned; /* number of events left to assign */
499 unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
500};
501
502/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
503#define SCHED_STATES_MAX 2
504
505struct perf_sched {
506 int max_weight;
507 int max_events;
508 struct event_constraint **constraints;
509 struct sched_state state;
510 int saved_states;
511 struct sched_state saved[SCHED_STATES_MAX];
512};
513
514/*
 515 * Initialize the iterator that runs through all events and counters.
516 */
517static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c,
518 int num, int wmin, int wmax)
519{
520 int idx;
521
522 memset(sched, 0, sizeof(*sched));
523 sched->max_events = num;
524 sched->max_weight = wmax;
525 sched->constraints = c;
526
527 for (idx = 0; idx < num; idx++) {
528 if (c[idx]->weight == wmin)
529 break;
530 }
531
532 sched->state.event = idx; /* start with min weight */
533 sched->state.weight = wmin;
534 sched->state.unassigned = num;
535}
536
537static void perf_sched_save_state(struct perf_sched *sched)
538{
539 if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
540 return;
541
542 sched->saved[sched->saved_states] = sched->state;
543 sched->saved_states++;
544}
545
546static bool perf_sched_restore_state(struct perf_sched *sched)
547{
548 if (!sched->saved_states)
549 return false;
550
551 sched->saved_states--;
552 sched->state = sched->saved[sched->saved_states];
553
554 /* continue with next counter: */
555 clear_bit(sched->state.counter++, sched->state.used);
556
557 return true;
558}
559
560/*
561 * Select a counter for the current event to schedule. Return true on
562 * success.
563 */
564static bool __perf_sched_find_counter(struct perf_sched *sched)
565{
566 struct event_constraint *c;
567 int idx;
568
569 if (!sched->state.unassigned)
570 return false;
571
572 if (sched->state.event >= sched->max_events)
573 return false;
574
575 c = sched->constraints[sched->state.event];
576
577 /* Prefer fixed purpose counters */
578 if (x86_pmu.num_counters_fixed) {
579 idx = X86_PMC_IDX_FIXED;
580 for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) {
581 if (!__test_and_set_bit(idx, sched->state.used))
582 goto done;
583 }
584 }
585 /* Grab the first unused counter starting with idx */
586 idx = sched->state.counter;
587 for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
588 if (!__test_and_set_bit(idx, sched->state.used))
589 goto done;
590 }
591
592 return false;
593
594done:
595 sched->state.counter = idx;
596
597 if (c->overlap)
598 perf_sched_save_state(sched);
599
600 return true;
601}
602
603static bool perf_sched_find_counter(struct perf_sched *sched)
604{
605 while (!__perf_sched_find_counter(sched)) {
606 if (!perf_sched_restore_state(sched))
607 return false;
608 }
609
610 return true;
611}
612
613/*
614 * Go through all unassigned events and find the next one to schedule.
615 * Take events with the least weight first. Return true on success.
616 */
617static bool perf_sched_next_event(struct perf_sched *sched)
618{
619 struct event_constraint *c;
620
621 if (!sched->state.unassigned || !--sched->state.unassigned)
622 return false;
623
624 do {
625 /* next event */
626 sched->state.event++;
627 if (sched->state.event >= sched->max_events) {
628 /* next weight */
629 sched->state.event = 0;
630 sched->state.weight++;
631 if (sched->state.weight > sched->max_weight)
632 return false;
633 }
634 c = sched->constraints[sched->state.event];
635 } while (c->weight != sched->state.weight);
636
637 sched->state.counter = 0; /* start with first counter */
638
639 return true;
640}
641
642/*
643 * Assign a counter for each event.
644 */
645static int perf_assign_events(struct event_constraint **constraints, int n,
646 int wmin, int wmax, int *assign)
647{
648 struct perf_sched sched;
649
650 perf_sched_init(&sched, constraints, n, wmin, wmax);
651
652 do {
653 if (!perf_sched_find_counter(&sched))
654 break; /* failed */
655 if (assign)
656 assign[sched.state.event] = sched.state.counter;
657 } while (perf_sched_next_event(&sched));
658
659 return sched.state.unassigned;
660}
661
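The new scheduler is compact but dense. Below is a minimal userspace sketch of the same search, assuming a toy six-counter PMU; the recursive formulation and every name in it are ours, not the kernel's. Events are visited in order of increasing constraint weight, each claims its first free permitted counter, and only an event whose constraint carries the overlap flag is revisited with a different counter when a later event cannot be placed.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define NCTR 6				/* toy PMU with six counters */

struct ev {
	unsigned mask;			/* counters this event may use */
	int overlap;			/* retry other counters on failure */
};

static int by_weight(const void *a, const void *b)
{
	/* fewer permitted counters == more constrained == schedule first */
	return __builtin_popcount(((const struct ev *)a)->mask) -
	       __builtin_popcount(((const struct ev *)b)->mask);
}

static bool place(struct ev *ev, int n, int i, unsigned used, int *assign)
{
	if (i == n)
		return true;			/* everything placed */
	for (int c = 0; c < NCTR; c++) {
		if (!(ev[i].mask & 1u << c) || (used & 1u << c))
			continue;
		assign[i] = c;
		if (place(ev, n, i + 1, used | 1u << c, assign))
			return true;
		if (!ev[i].overlap)		/* no saved state to retry from */
			return false;
	}
	return false;
}

int main(void)
{
	/* one 0x09-constrained event plus three 0x07-constrained events:
	 * greedy alone parks 0x09 on counter 0 and strands the third 0x07 */
	struct ev ev[] = { { 0x07, 0 }, { 0x09, 1 }, { 0x07, 0 }, { 0x07, 0 } };
	int assign[4];

	qsort(ev, 4, sizeof(ev[0]), by_weight);
	if (place(ev, 4, 0, 0u, assign))
		for (int i = 0; i < 4; i++)
			printf("event %d -> counter %d\n", i, assign[i]);
	return 0;
}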
487int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) 662int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
488{ 663{
489 struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; 664 struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
490 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 665 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
491 int i, j, w, wmax, num = 0; 666 int i, wmin, wmax, num = 0;
492 struct hw_perf_event *hwc; 667 struct hw_perf_event *hwc;
493 668
494 bitmap_zero(used_mask, X86_PMC_IDX_MAX); 669 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
495 670
496 for (i = 0; i < n; i++) { 671 for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
497 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); 672 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
498 constraints[i] = c; 673 constraints[i] = c;
674 wmin = min(wmin, c->weight);
675 wmax = max(wmax, c->weight);
499 } 676 }
500 677
501 /* 678 /*
@@ -521,60 +698,12 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
521 if (assign) 698 if (assign)
522 assign[i] = hwc->idx; 699 assign[i] = hwc->idx;
523 } 700 }
524 if (i == n)
525 goto done;
526
527 /*
528 * begin slow path
529 */
530
531 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
532 701
533 /* 702 /* slow path */
534 * weight = number of possible counters 703 if (i != n)
535 * 704 num = perf_assign_events(constraints, n, wmin, wmax, assign);
536 * 1 = most constrained, only works on one counter
537 * wmax = least constrained, works on any counter
538 *
539 * assign events to counters starting with most
540 * constrained events.
541 */
542 wmax = x86_pmu.num_counters;
543 705
544 /* 706 /*
545 * when fixed event counters are present,
546 * wmax is incremented by 1 to account
547 * for one more choice
548 */
549 if (x86_pmu.num_counters_fixed)
550 wmax++;
551
552 for (w = 1, num = n; num && w <= wmax; w++) {
553 /* for each event */
554 for (i = 0; num && i < n; i++) {
555 c = constraints[i];
556 hwc = &cpuc->event_list[i]->hw;
557
558 if (c->weight != w)
559 continue;
560
561 for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
562 if (!test_bit(j, used_mask))
563 break;
564 }
565
566 if (j == X86_PMC_IDX_MAX)
567 break;
568
569 __set_bit(j, used_mask);
570
571 if (assign)
572 assign[i] = j;
573 num--;
574 }
575 }
576done:
577 /*
578 * scheduling failed or is just a simulation, 707 * scheduling failed or is just a simulation,
579 * free resources if necessary 708 * free resources if necessary
580 */ 709 */
@@ -1119,6 +1248,7 @@ static void __init pmu_check_apic(void)
1119 1248
1120static int __init init_hw_perf_events(void) 1249static int __init init_hw_perf_events(void)
1121{ 1250{
1251 struct x86_pmu_quirk *quirk;
1122 struct event_constraint *c; 1252 struct event_constraint *c;
1123 int err; 1253 int err;
1124 1254
@@ -1147,8 +1277,8 @@ static int __init init_hw_perf_events(void)
1147 1277
1148 pr_cont("%s PMU driver.\n", x86_pmu.name); 1278 pr_cont("%s PMU driver.\n", x86_pmu.name);
1149 1279
1150 if (x86_pmu.quirks) 1280 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
1151 x86_pmu.quirks(); 1281 quirk->func();
1152 1282
1153 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { 1283 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1154 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", 1284 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
@@ -1171,12 +1301,18 @@ static int __init init_hw_perf_events(void)
1171 1301
1172 unconstrained = (struct event_constraint) 1302 unconstrained = (struct event_constraint)
1173 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 1303 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1174 0, x86_pmu.num_counters); 1304 0, x86_pmu.num_counters, 0);
1175 1305
1176 if (x86_pmu.event_constraints) { 1306 if (x86_pmu.event_constraints) {
1307 /*
1308 * event on fixed counter2 (REF_CYCLES) only works on this
1309 * counter, so do not extend mask to generic counters
1310 */
1177 for_each_event_constraint(c, x86_pmu.event_constraints) { 1311 for_each_event_constraint(c, x86_pmu.event_constraints) {
1178 if (c->cmask != X86_RAW_EVENT_MASK) 1312 if (c->cmask != X86_RAW_EVENT_MASK
1313 || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) {
1179 continue; 1314 continue;
1315 }
1180 1316
1181 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; 1317 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1182 c->weight += x86_pmu.num_counters; 1318 c->weight += x86_pmu.num_counters;
@@ -1566,3 +1702,15 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
1566 1702
1567 return misc; 1703 return misc;
1568} 1704}
1705
1706void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
1707{
1708 cap->version = x86_pmu.version;
1709 cap->num_counters_gp = x86_pmu.num_counters;
1710 cap->num_counters_fixed = x86_pmu.num_counters_fixed;
1711 cap->bit_width_gp = x86_pmu.cntval_bits;
1712 cap->bit_width_fixed = x86_pmu.cntval_bits;
1713 cap->events_mask = (unsigned int)x86_pmu.events_maskl;
1714 cap->events_mask_len = x86_pmu.events_mask_len;
1715}
1716EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
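A hypothetical in-kernel consumer, a virtualized PMU for instance, could now size itself from the host capabilities rather than re-reading CPUID leaf 0xA itself; the pr_info() below is purely illustrative:

	struct x86_pmu_capability cap;

	perf_get_x86_pmu_capability(&cap);
	pr_info("host PMU v%d: %d GP counters (%d bits), %d fixed\n",
		cap.version, cap.num_counters_gp, cap.bit_width_gp,
		cap.num_counters_fixed);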
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index b9698d40ac4b..8944062f46e2 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -45,6 +45,7 @@ struct event_constraint {
45 u64 code; 45 u64 code;
46 u64 cmask; 46 u64 cmask;
47 int weight; 47 int weight;
48 int overlap;
48}; 49};
49 50
50struct amd_nb { 51struct amd_nb {
@@ -151,15 +152,40 @@ struct cpu_hw_events {
151 void *kfree_on_online; 152 void *kfree_on_online;
152}; 153};
153 154
154#define __EVENT_CONSTRAINT(c, n, m, w) {\ 155#define __EVENT_CONSTRAINT(c, n, m, w, o) {\
155 { .idxmsk64 = (n) }, \ 156 { .idxmsk64 = (n) }, \
156 .code = (c), \ 157 .code = (c), \
157 .cmask = (m), \ 158 .cmask = (m), \
158 .weight = (w), \ 159 .weight = (w), \
160 .overlap = (o), \
159} 161}
160 162
161#define EVENT_CONSTRAINT(c, n, m) \ 163#define EVENT_CONSTRAINT(c, n, m) \
162 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) 164 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0)
165
166/*
167 * The overlap flag marks event constraints with overlapping counter
168 * masks. This is the case if the counter mask of such an event is not
169 * a subset of any other counter mask of a constraint with an equal or
170 * higher weight, e.g.:
171 *
172 * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
173 * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
174 * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
175 *
176 * The event scheduler may not select the correct counter in the first
 177 * cycle because it would need to know which subsequent events will be
 178 * scheduled. It may then fail to schedule the events. So we set the
179 * overlap flag for such constraints to give the scheduler a hint which
180 * events to select for counter rescheduling.
181 *
182 * Care must be taken as the rescheduling algorithm is O(n!) which
 183 * will increase scheduling cycles for an over-committed system
184 * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros
 185 * and their counter masks must be kept to a minimum.
186 */
187#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \
188 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1)
163 189
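A worked trace of the scenario that comment describes, using one c_overlaps-style event and three c_another1-style events (ours):

	events:  A = 0x09 (weight 2);  B, C, D = 0x07 (weight 3)

	greedy:  A -> counter 0,  B -> 1,  C -> 2,  D -> stuck (0, 1 and 2 used)
	retry A: A -> counter 3,  B -> 0,  C -> 1,  D -> 2   (success)

Without the overlap flag on A there is no saved state to return to, so the whole group would be rejected even though a valid assignment exists; see also the runnable sketch after perf_assign_events() above.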
164/* 190/*
165 * Constraint on the Event code. 191 * Constraint on the Event code.
@@ -235,6 +261,11 @@ union perf_capabilities {
235 u64 capabilities; 261 u64 capabilities;
236}; 262};
237 263
264struct x86_pmu_quirk {
265 struct x86_pmu_quirk *next;
266 void (*func)(void);
267};
268
238/* 269/*
239 * struct x86_pmu - generic x86 pmu 270 * struct x86_pmu - generic x86 pmu
240 */ 271 */
@@ -259,6 +290,11 @@ struct x86_pmu {
259 int num_counters_fixed; 290 int num_counters_fixed;
260 int cntval_bits; 291 int cntval_bits;
261 u64 cntval_mask; 292 u64 cntval_mask;
293 union {
294 unsigned long events_maskl;
295 unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
296 };
297 int events_mask_len;
262 int apic; 298 int apic;
263 u64 max_period; 299 u64 max_period;
264 struct event_constraint * 300 struct event_constraint *
@@ -268,7 +304,7 @@ struct x86_pmu {
268 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 304 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
269 struct perf_event *event); 305 struct perf_event *event);
270 struct event_constraint *event_constraints; 306 struct event_constraint *event_constraints;
271 void (*quirks)(void); 307 struct x86_pmu_quirk *quirks;
272 int perfctr_second_write; 308 int perfctr_second_write;
273 309
274 int (*cpu_prepare)(int cpu); 310 int (*cpu_prepare)(int cpu);
@@ -309,6 +345,15 @@ struct x86_pmu {
309 struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); 345 struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
310}; 346};
311 347
348#define x86_add_quirk(func_) \
349do { \
350 static struct x86_pmu_quirk __quirk __initdata = { \
351 .func = func_, \
352 }; \
353 __quirk.next = x86_pmu.quirks; \
354 x86_pmu.quirks = &__quirk; \
355} while (0)
356
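x86_add_quirk() builds an intrusive, LIFO singly linked list out of static nodes, so quirks run in reverse order of registration (hence the "Install first, so it runs last" note where the arch-events quirk is registered below). A standalone userspace model of the pattern, ours:

#include <stdio.h>

struct quirk { struct quirk *next; void (*func)(void); };
static struct quirk *quirks;

#define add_quirk(fn)                              \
do {                                               \
	static struct quirk __q = { .func = fn };  \
	__q.next = quirks;                         \
	quirks = &__q;                             \
} while (0)

static void a(void) { puts("quirk a"); }
static void b(void) { puts("quirk b"); }

int main(void)
{
	add_quirk(a);
	add_quirk(b);
	for (struct quirk *q = quirks; q; q = q->next)
		q->func();	/* prints "quirk b", then "quirk a" */
	return 0;
}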
312#define ERF_NO_HT_SHARING 1 357#define ERF_NO_HT_SHARING 1
313#define ERF_HAS_RSP_1 2 358#define ERF_HAS_RSP_1 2
314 359
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index aeefd45697a2..0397b23be8e9 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -492,7 +492,7 @@ static __initconst const struct x86_pmu amd_pmu = {
492static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); 492static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
493static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); 493static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
494static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); 494static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
495static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0); 495static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
496static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); 496static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
497static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); 497static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
498 498
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 8d601b18bf9f..3bd37bdf1b8e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -28,6 +28,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
28 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, 28 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
29 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, 29 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
30 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 30 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
31 [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
31}; 32};
32 33
33static struct event_constraint intel_core_event_constraints[] __read_mostly = 34static struct event_constraint intel_core_event_constraints[] __read_mostly =
@@ -45,12 +46,7 @@ static struct event_constraint intel_core2_event_constraints[] __read_mostly =
45{ 46{
46 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 47 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
47 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 48 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
48 /* 49 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
49 * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
50 * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
51 * ratio between these counters.
52 */
53 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
54 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ 50 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
55 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ 51 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
56 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ 52 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
@@ -68,7 +64,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
68{ 64{
69 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 65 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
70 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 66 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
71 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 67 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
72 INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ 68 INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
73 INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ 69 INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
74 INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ 70 INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
@@ -90,7 +86,7 @@ static struct event_constraint intel_westmere_event_constraints[] __read_mostly
90{ 86{
91 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 87 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
92 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 88 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
93 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 89 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
94 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ 90 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
95 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ 91 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
96 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ 92 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
@@ -102,7 +98,7 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
102{ 98{
103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 99 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
104 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 100 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
105 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 101 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
106 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ 102 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
107 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ 103 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
108 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ 104 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
@@ -125,7 +121,7 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
125{ 121{
126 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 122 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
127 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 123 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
128 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 124 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
129 EVENT_CONSTRAINT_END 125 EVENT_CONSTRAINT_END
130}; 126};
131 127
@@ -1169,7 +1165,7 @@ again:
1169 */ 1165 */
1170 c = &unconstrained; 1166 c = &unconstrained;
1171 } else if (intel_try_alt_er(event, orig_idx)) { 1167 } else if (intel_try_alt_er(event, orig_idx)) {
1172 raw_spin_unlock(&era->lock); 1168 raw_spin_unlock_irqrestore(&era->lock, flags);
1173 goto again; 1169 goto again;
1174 } 1170 }
1175 raw_spin_unlock_irqrestore(&era->lock, flags); 1171 raw_spin_unlock_irqrestore(&era->lock, flags);
@@ -1519,7 +1515,7 @@ static __initconst const struct x86_pmu intel_pmu = {
1519 .guest_get_msrs = intel_guest_get_msrs, 1515 .guest_get_msrs = intel_guest_get_msrs,
1520}; 1516};
1521 1517
1522static void intel_clovertown_quirks(void) 1518static __init void intel_clovertown_quirk(void)
1523{ 1519{
1524 /* 1520 /*
1525 * PEBS is unreliable due to: 1521 * PEBS is unreliable due to:
@@ -1545,19 +1541,60 @@ static void intel_clovertown_quirks(void)
1545 x86_pmu.pebs_constraints = NULL; 1541 x86_pmu.pebs_constraints = NULL;
1546} 1542}
1547 1543
1548static void intel_sandybridge_quirks(void) 1544static __init void intel_sandybridge_quirk(void)
1549{ 1545{
1550 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); 1546 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
1551 x86_pmu.pebs = 0; 1547 x86_pmu.pebs = 0;
1552 x86_pmu.pebs_constraints = NULL; 1548 x86_pmu.pebs_constraints = NULL;
1553} 1549}
1554 1550
1551static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
1552 { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
1553 { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
1554 { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
1555 { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
1556 { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
1557 { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
1558 { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
1559};
1560
1561static __init void intel_arch_events_quirk(void)
1562{
1563 int bit;
1564
 1565 /* disable events that are reported as not present by cpuid */
1566 for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
1567 intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
1568 printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n",
1569 intel_arch_events_map[bit].name);
1570 }
1571}
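The quirk leans on the inverted polarity of CPUID leaf 0xA's EBX: a set bit means the corresponding architectural event is not available, with bit positions matching the table's order. A toy decode, with a made-up EBX value:

#include <stdio.h>

int main(void)
{
	static const char *names[] = {
		"cpu cycles", "instructions", "bus cycles",
		"cache references", "cache misses",
		"branch instructions", "branch misses",
	};
	unsigned ebx = 0x40;	/* bit 6 set: branch misses unusable */

	for (int bit = 0; bit < 7; bit++)
		if (ebx & 1u << bit)
			printf("event '%s' unavailable\n", names[bit]);
	return 0;
}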
1572
1573static __init void intel_nehalem_quirk(void)
1574{
1575 union cpuid10_ebx ebx;
1576
1577 ebx.full = x86_pmu.events_maskl;
1578 if (ebx.split.no_branch_misses_retired) {
1579 /*
1580 * Erratum AAJ80 detected, we work it around by using
1581 * the BR_MISP_EXEC.ANY event. This will over-count
1582 * branch-misses, but it's still much better than the
1583 * architectural event which is often completely bogus:
1584 */
1585 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
1586 ebx.split.no_branch_misses_retired = 0;
1587 x86_pmu.events_maskl = ebx.full;
1588 printk(KERN_INFO "CPU erratum AAJ80 worked around\n");
1589 }
1590}
1591
1555__init int intel_pmu_init(void) 1592__init int intel_pmu_init(void)
1556{ 1593{
1557 union cpuid10_edx edx; 1594 union cpuid10_edx edx;
1558 union cpuid10_eax eax; 1595 union cpuid10_eax eax;
1596 union cpuid10_ebx ebx;
1559 unsigned int unused; 1597 unsigned int unused;
1560 unsigned int ebx;
1561 int version; 1598 int version;
1562 1599
1563 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { 1600 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
@@ -1574,8 +1611,8 @@ __init int intel_pmu_init(void)
1574 * Check whether the Architectural PerfMon supports 1611 * Check whether the Architectural PerfMon supports
1575 * Branch Misses Retired hw_event or not. 1612 * Branch Misses Retired hw_event or not.
1576 */ 1613 */
1577 cpuid(10, &eax.full, &ebx, &unused, &edx.full); 1614 cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
1578 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) 1615 if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
1579 return -ENODEV; 1616 return -ENODEV;
1580 1617
1581 version = eax.split.version_id; 1618 version = eax.split.version_id;
@@ -1589,6 +1626,9 @@ __init int intel_pmu_init(void)
1589 x86_pmu.cntval_bits = eax.split.bit_width; 1626 x86_pmu.cntval_bits = eax.split.bit_width;
1590 x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; 1627 x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
1591 1628
1629 x86_pmu.events_maskl = ebx.full;
1630 x86_pmu.events_mask_len = eax.split.mask_length;
1631
1592 /* 1632 /*
1593 * Quirk: v2 perfmon does not report fixed-purpose events, so 1633 * Quirk: v2 perfmon does not report fixed-purpose events, so
1594 * assume at least 3 events: 1634 * assume at least 3 events:
@@ -1608,6 +1648,8 @@ __init int intel_pmu_init(void)
1608 1648
1609 intel_ds_init(); 1649 intel_ds_init();
1610 1650
1651 x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
1652
1611 /* 1653 /*
1612 * Install the hw-cache-events table: 1654 * Install the hw-cache-events table:
1613 */ 1655 */
@@ -1617,7 +1659,7 @@ __init int intel_pmu_init(void)
1617 break; 1659 break;
1618 1660
1619 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ 1661 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
1620 x86_pmu.quirks = intel_clovertown_quirks; 1662 x86_add_quirk(intel_clovertown_quirk);
1621 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ 1663 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
1622 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ 1664 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
1623 case 29: /* six-core 45 nm xeon "Dunnington" */ 1665 case 29: /* six-core 45 nm xeon "Dunnington" */
@@ -1651,17 +1693,8 @@ __init int intel_pmu_init(void)
1651 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ 1693 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1652 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; 1694 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
1653 1695
1654 if (ebx & 0x40) { 1696 x86_add_quirk(intel_nehalem_quirk);
1655 /*
1656 * Erratum AAJ80 detected, we work it around by using
1657 * the BR_MISP_EXEC.ANY event. This will over-count
1658 * branch-misses, but it's still much better than the
1659 * architectural event which is often completely bogus:
1660 */
1661 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
1662 1697
1663 pr_cont("erratum AAJ80 worked around, ");
1664 }
1665 pr_cont("Nehalem events, "); 1698 pr_cont("Nehalem events, ");
1666 break; 1699 break;
1667 1700
@@ -1701,7 +1734,7 @@ __init int intel_pmu_init(void)
1701 break; 1734 break;
1702 1735
1703 case 42: /* SandyBridge */ 1736 case 42: /* SandyBridge */
1704 x86_pmu.quirks = intel_sandybridge_quirks; 1737 x86_add_quirk(intel_sandybridge_quirk);
 1705 case 45: /* SandyBridge, "Romley-EP" */ 1738 case 45: /* SandyBridge, "Romley-EP" */
1706 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, 1739 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1707 sizeof(hw_cache_event_ids)); 1740 sizeof(hw_cache_event_ids));
@@ -1738,5 +1771,6 @@ __init int intel_pmu_init(void)
1738 break; 1771 break;
1739 } 1772 }
1740 } 1773 }
1774
1741 return 0; 1775 return 0;
1742} 1776}
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
index 5abbea297e0c..7b3fe56b1c21 100644
--- a/arch/x86/kernel/cpu/powerflags.c
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -16,5 +16,6 @@ const char *const x86_power_flags[32] = {
16 "100mhzsteps", 16 "100mhzsteps",
17 "hwpstate", 17 "hwpstate",
18 "", /* tsc invariant mapped to constant_tsc */ 18 "", /* tsc invariant mapped to constant_tsc */
19 /* nothing */ 19 "cpb", /* core performance boost */
20 "eff_freq_ro", /* Readonly aperf/mperf */
20}; 21};
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 14b23140e81f..8022c6681485 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -64,12 +64,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
64static int show_cpuinfo(struct seq_file *m, void *v) 64static int show_cpuinfo(struct seq_file *m, void *v)
65{ 65{
66 struct cpuinfo_x86 *c = v; 66 struct cpuinfo_x86 *c = v;
67 unsigned int cpu = 0; 67 unsigned int cpu;
68 int i; 68 int i;
69 69
70#ifdef CONFIG_SMP
71 cpu = c->cpu_index; 70 cpu = c->cpu_index;
72#endif
73 seq_printf(m, "processor\t: %u\n" 71 seq_printf(m, "processor\t: %u\n"
74 "vendor_id\t: %s\n" 72 "vendor_id\t: %s\n"
75 "cpu family\t: %d\n" 73 "cpu family\t: %d\n"
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 303a0e48f076..8071e2f3d6eb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -738,35 +738,17 @@ core_initcall(e820_mark_nvs_memory);
738/* 738/*
 739 * pre-allocate 4k and reserve it in memblock and e820_saved 739 * pre-allocate 4k and reserve it in memblock and e820_saved
740 */ 740 */
741u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) 741u64 __init early_reserve_e820(u64 size, u64 align)
742{ 742{
743 u64 size = 0;
744 u64 addr; 743 u64 addr;
745 u64 start;
746 744
747 for (start = startt; ; start += size) { 745 addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
748 start = memblock_x86_find_in_range_size(start, &size, align); 746 if (addr) {
749 if (start == MEMBLOCK_ERROR) 747 e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
750 return 0; 748 printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
751 if (size >= sizet) 749 update_e820_saved();
752 break;
753 } 750 }
754 751
755#ifdef CONFIG_X86_32
756 if (start >= MAXMEM)
757 return 0;
758 if (start + size > MAXMEM)
759 size = MAXMEM - start;
760#endif
761
762 addr = round_down(start + size - sizet, align);
763 if (addr < start)
764 return 0;
765 memblock_x86_reserve_range(addr, addr + sizet, "new next");
766 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
767 printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
768 update_e820_saved();
769
770 return addr; 752 return addr;
771} 753}
772 754
@@ -1090,7 +1072,7 @@ void __init memblock_x86_fill(void)
 1090 * We are safe to enable resizing, because memblock_x86_fill() 1072 * We are safe to enable resizing, because memblock_x86_fill()
 1091 * is called rather late on x86 1073 * is called rather late on x86
1092 */ 1074 */
1093 memblock_can_resize = 1; 1075 memblock_allow_resize();
1094 1076
1095 for (i = 0; i < e820.nr_map; i++) { 1077 for (i = 0; i < e820.nr_map; i++) {
1096 struct e820entry *ei = &e820.map[i]; 1078 struct e820entry *ei = &e820.map[i];
@@ -1105,22 +1087,36 @@ void __init memblock_x86_fill(void)
1105 memblock_add(ei->addr, ei->size); 1087 memblock_add(ei->addr, ei->size);
1106 } 1088 }
1107 1089
1108 memblock_analyze();
1109 memblock_dump_all(); 1090 memblock_dump_all();
1110} 1091}
1111 1092
1112void __init memblock_find_dma_reserve(void) 1093void __init memblock_find_dma_reserve(void)
1113{ 1094{
1114#ifdef CONFIG_X86_64 1095#ifdef CONFIG_X86_64
1115 u64 free_size_pfn; 1096 u64 nr_pages = 0, nr_free_pages = 0;
1116 u64 mem_size_pfn; 1097 unsigned long start_pfn, end_pfn;
1098 phys_addr_t start, end;
1099 int i;
1100 u64 u;
1101
1117 /* 1102 /*
 1118 * We need to find the used area below MAX_DMA_PFN. Use memblock 1103 * We need to find the used area below MAX_DMA_PFN. Use memblock
 1119 * to get the free size in [0, MAX_DMA_PFN] first, and assume 1104 * to get the free size in [0, MAX_DMA_PFN] first, and assume
 1120 * that boot_mem will not take memory below MAX_DMA_PFN 1105 * that boot_mem will not take memory below MAX_DMA_PFN
1121 */ 1106 */
1122 mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; 1107 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
1123 free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; 1108 start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN);
1124 set_dma_reserve(mem_size_pfn - free_size_pfn); 1109 end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN);
1110 nr_pages += end_pfn - start_pfn;
1111 }
1112
1113 for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) {
1114 start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
1115 end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
1116 if (start_pfn < end_pfn)
1117 nr_free_pages += end_pfn - start_pfn;
1118 }
1119
1120 set_dma_reserve(nr_pages - nr_free_pages);
1125#endif 1121#endif
1126} 1122}
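The rewritten accounting is two clamp-and-accumulate passes: total pages below MAX_DMA_PFN from the memory ranges, minus free pages below it from the free ranges, with the difference reserved. Reduced to its essentials (a sketch; the kernel's memblock iterators are replaced by a plain pfn-range array):

static unsigned long pages_below(const unsigned long (*pfn_ranges)[2],
				 int n, unsigned long limit)
{
	unsigned long total = 0;

	for (int i = 0; i < n; i++) {
		/* clamp [start, end) to the ceiling, keep what survives */
		unsigned long s = pfn_ranges[i][0] < limit ? pfn_ranges[i][0] : limit;
		unsigned long e = pfn_ranges[i][1] < limit ? pfn_ranges[i][1] : limit;
		if (s < e)
			total += e - s;
	}
	return total;
}

/* dma_reserve = pages_below(mem_ranges, ...) - pages_below(free_ranges, ...) */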
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f3f6f5344001..22d0e21b4dd7 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -625,6 +625,8 @@ work_notifysig: # deal with pending signals and
625 movl %esp, %eax 625 movl %esp, %eax
626 jne work_notifysig_v86 # returning to kernel-space or 626 jne work_notifysig_v86 # returning to kernel-space or
627 # vm86-space 627 # vm86-space
628 TRACE_IRQS_ON
629 ENABLE_INTERRUPTS(CLBR_NONE)
628 xorl %edx, %edx 630 xorl %edx, %edx
629 call do_notify_resume 631 call do_notify_resume
630 jmp resume_userspace_sig 632 jmp resume_userspace_sig
@@ -638,6 +640,8 @@ work_notifysig_v86:
638#else 640#else
639 movl %esp, %eax 641 movl %esp, %eax
640#endif 642#endif
643 TRACE_IRQS_ON
644 ENABLE_INTERRUPTS(CLBR_NONE)
641 xorl %edx, %edx 645 xorl %edx, %edx
642 call do_notify_resume 646 call do_notify_resume
643 jmp resume_userspace_sig 647 jmp resume_userspace_sig
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index faf8d5e74b0b..a20e1cb9dc87 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -221,7 +221,7 @@ ENDPROC(native_usergs_sysret64)
221 /*CFI_REL_OFFSET ss,0*/ 221 /*CFI_REL_OFFSET ss,0*/
222 pushq_cfi %rax /* rsp */ 222 pushq_cfi %rax /* rsp */
223 CFI_REL_OFFSET rsp,0 223 CFI_REL_OFFSET rsp,0
224 pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */ 224 pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
225 /*CFI_REL_OFFSET rflags,0*/ 225 /*CFI_REL_OFFSET rflags,0*/
226 pushq_cfi $__KERNEL_CS /* cs */ 226 pushq_cfi $__KERNEL_CS /* cs */
227 /*CFI_REL_OFFSET cs,0*/ 227 /*CFI_REL_OFFSET cs,0*/
@@ -411,7 +411,7 @@ ENTRY(ret_from_fork)
411 RESTORE_REST 411 RESTORE_REST
412 412
413 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? 413 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
414 je int_ret_from_sys_call 414 jz retint_restore_args
415 415
416 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET 416 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
417 jnz int_ret_from_sys_call 417 jnz int_ret_from_sys_call
@@ -465,7 +465,7 @@ ENTRY(system_call)
465 * after the swapgs, so that it can do the swapgs 465 * after the swapgs, so that it can do the swapgs
466 * for the guest and jump here on syscall. 466 * for the guest and jump here on syscall.
467 */ 467 */
468ENTRY(system_call_after_swapgs) 468GLOBAL(system_call_after_swapgs)
469 469
470 movq %rsp,PER_CPU_VAR(old_rsp) 470 movq %rsp,PER_CPU_VAR(old_rsp)
471 movq PER_CPU_VAR(kernel_stack),%rsp 471 movq PER_CPU_VAR(kernel_stack),%rsp
@@ -478,8 +478,7 @@ ENTRY(system_call_after_swapgs)
478 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 478 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
479 movq %rcx,RIP-ARGOFFSET(%rsp) 479 movq %rcx,RIP-ARGOFFSET(%rsp)
480 CFI_REL_OFFSET rip,RIP-ARGOFFSET 480 CFI_REL_OFFSET rip,RIP-ARGOFFSET
481 GET_THREAD_INFO(%rcx) 481 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
482 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
483 jnz tracesys 482 jnz tracesys
484system_call_fastpath: 483system_call_fastpath:
485 cmpq $__NR_syscall_max,%rax 484 cmpq $__NR_syscall_max,%rax
@@ -496,10 +495,9 @@ ret_from_sys_call:
496 /* edi: flagmask */ 495 /* edi: flagmask */
497sysret_check: 496sysret_check:
498 LOCKDEP_SYS_EXIT 497 LOCKDEP_SYS_EXIT
499 GET_THREAD_INFO(%rcx)
500 DISABLE_INTERRUPTS(CLBR_NONE) 498 DISABLE_INTERRUPTS(CLBR_NONE)
501 TRACE_IRQS_OFF 499 TRACE_IRQS_OFF
502 movl TI_flags(%rcx),%edx 500 movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
503 andl %edi,%edx 501 andl %edi,%edx
504 jnz sysret_careful 502 jnz sysret_careful
505 CFI_REMEMBER_STATE 503 CFI_REMEMBER_STATE
@@ -583,7 +581,7 @@ sysret_audit:
583 /* Do syscall tracing */ 581 /* Do syscall tracing */
584tracesys: 582tracesys:
585#ifdef CONFIG_AUDITSYSCALL 583#ifdef CONFIG_AUDITSYSCALL
586 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) 584 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
587 jz auditsys 585 jz auditsys
588#endif 586#endif
589 SAVE_REST 587 SAVE_REST
@@ -612,8 +610,6 @@ tracesys:
612GLOBAL(int_ret_from_sys_call) 610GLOBAL(int_ret_from_sys_call)
613 DISABLE_INTERRUPTS(CLBR_NONE) 611 DISABLE_INTERRUPTS(CLBR_NONE)
614 TRACE_IRQS_OFF 612 TRACE_IRQS_OFF
615 testl $3,CS-ARGOFFSET(%rsp)
616 je retint_restore_args
617 movl $_TIF_ALLWORK_MASK,%edi 613 movl $_TIF_ALLWORK_MASK,%edi
618 /* edi: mask to check */ 614 /* edi: mask to check */
619GLOBAL(int_with_check) 615GLOBAL(int_with_check)
@@ -953,6 +949,7 @@ END(common_interrupt)
953ENTRY(\sym) 949ENTRY(\sym)
954 INTR_FRAME 950 INTR_FRAME
955 pushq_cfi $~(\num) 951 pushq_cfi $~(\num)
952.Lcommon_\sym:
956 interrupt \do_sym 953 interrupt \do_sym
957 jmp ret_from_intr 954 jmp ret_from_intr
958 CFI_ENDPROC 955 CFI_ENDPROC
@@ -976,13 +973,21 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
976 x86_platform_ipi smp_x86_platform_ipi 973 x86_platform_ipi smp_x86_platform_ipi
977 974
978#ifdef CONFIG_SMP 975#ifdef CONFIG_SMP
979.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ 976 ALIGN
977 INTR_FRAME
978.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
980 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 979 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
981.if NUM_INVALIDATE_TLB_VECTORS > \idx 980.if NUM_INVALIDATE_TLB_VECTORS > \idx
982apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ 981ENTRY(invalidate_interrupt\idx)
983 invalidate_interrupt\idx smp_invalidate_interrupt 982 pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
983 jmp .Lcommon_invalidate_interrupt0
984 CFI_ADJUST_CFA_OFFSET -8
985END(invalidate_interrupt\idx)
984.endif 986.endif
985.endr 987.endr
988 CFI_ENDPROC
989apicinterrupt INVALIDATE_TLB_VECTOR_START, \
990 invalidate_interrupt0, smp_invalidate_interrupt
986#endif 991#endif
987 992
988apicinterrupt THRESHOLD_APIC_VECTOR \ 993apicinterrupt THRESHOLD_APIC_VECTOR \
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index af0699ba48cf..48d9d4ea1020 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -52,5 +52,5 @@ void __init reserve_ebda_region(void)
52 lowmem = 0x9f000; 52 lowmem = 0x9f000;
53 53
54 /* reserve all memory between lowmem and the 1MB mark */ 54 /* reserve all memory between lowmem and the 1MB mark */
55 memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved"); 55 memblock_reserve(lowmem, 0x100000 - lowmem);
56} 56}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3bb08509a7a1..51ff18616d50 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -31,9 +31,8 @@ static void __init i386_default_early_setup(void)
31 31
32void __init i386_start_kernel(void) 32void __init i386_start_kernel(void)
33{ 33{
34 memblock_init(); 34 memblock_reserve(__pa_symbol(&_text),
35 35 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
36 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
37 36
38#ifdef CONFIG_BLK_DEV_INITRD 37#ifdef CONFIG_BLK_DEV_INITRD
39 /* Reserve INITRD */ 38 /* Reserve INITRD */
@@ -42,7 +41,7 @@ void __init i386_start_kernel(void)
42 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 41 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
43 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 42 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
44 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 43 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
45 memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); 44 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
46 } 45 }
47#endif 46#endif
48 47
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 5655c2272adb..3a3b779f41d3 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -98,9 +98,8 @@ void __init x86_64_start_reservations(char *real_mode_data)
98{ 98{
99 copy_bootdata(__va(real_mode_data)); 99 copy_bootdata(__va(real_mode_data));
100 100
101 memblock_init(); 101 memblock_reserve(__pa_symbol(&_text),
102 102 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
103 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
104 103
105#ifdef CONFIG_BLK_DEV_INITRD 104#ifdef CONFIG_BLK_DEV_INITRD
106 /* Reserve INITRD */ 105 /* Reserve INITRD */
@@ -109,7 +108,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
109 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; 108 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
110 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; 109 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
111 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 110 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
112 memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); 111 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
113 } 112 }
114#endif 113#endif
115 114
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 1bb0bf4d92cd..07b0a56a754d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -32,8 +32,6 @@
32#define HPET_MIN_CYCLES 128 32#define HPET_MIN_CYCLES 128
33#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) 33#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
34 34
35#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
36
37/* 35/*
38 * HPET address is set in acpi/boot.c, when an ACPI entry exists 36 * HPET address is set in acpi/boot.c, when an ACPI entry exists
39 */ 37 */
@@ -55,6 +53,11 @@ struct hpet_dev {
55 char name[10]; 53 char name[10];
56}; 54};
57 55
56inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev)
57{
58 return container_of(evtdev, struct hpet_dev, evt);
59}
60
58inline unsigned int hpet_readl(unsigned int a) 61inline unsigned int hpet_readl(unsigned int a)
59{ 62{
60 return readl(hpet_virt_address + a); 63 return readl(hpet_virt_address + a);
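Making EVT_TO_HPET_DEV() an inline function instead of a macro lets the compiler type-check its argument. For readers new to the idiom, container_of() walks back from a pointer to a member to the structure enclosing it; a standalone sketch using the classic offsetof() form, with illustrative struct names:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct evt { int irq; };
struct dev { int num; struct evt evt; };

int main(void)
{
	struct dev d = { .num = 2 };
	struct evt *e = &d.evt;

	/* walk back from the member to the containing structure */
	printf("device %d\n", container_of(e, struct dev, evt)->num);
	return 0;
}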
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 429e0c92924e..7943e0c21bde 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -74,6 +74,10 @@ int arch_show_interrupts(struct seq_file *p, int prec)
74 for_each_online_cpu(j) 74 for_each_online_cpu(j)
75 seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); 75 seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
76 seq_printf(p, " IRQ work interrupts\n"); 76 seq_printf(p, " IRQ work interrupts\n");
77 seq_printf(p, "%*s: ", prec, "RTR");
78 for_each_online_cpu(j)
79 seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
80 seq_printf(p, " APIC ICR read retries\n");
77#endif 81#endif
78 if (x86_platform_ipi_callback) { 82 if (x86_platform_ipi_callback) {
79 seq_printf(p, "%*s: ", prec, "PLT"); 83 seq_printf(p, "%*s: ", prec, "PLT");
@@ -136,6 +140,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
136 sum += irq_stats(cpu)->irq_spurious_count; 140 sum += irq_stats(cpu)->irq_spurious_count;
137 sum += irq_stats(cpu)->apic_perf_irqs; 141 sum += irq_stats(cpu)->apic_perf_irqs;
138 sum += irq_stats(cpu)->apic_irq_work_irqs; 142 sum += irq_stats(cpu)->apic_irq_work_irqs;
143 sum += irq_stats(cpu)->icr_read_retry_count;
139#endif 144#endif
140 if (x86_platform_ipi_callback) 145 if (x86_platform_ipi_callback)
141 sum += irq_stats(cpu)->x86_platform_ipis; 146 sum += irq_stats(cpu)->x86_platform_ipis;
@@ -181,8 +186,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
181 unsigned vector = ~regs->orig_ax; 186 unsigned vector = ~regs->orig_ax;
182 unsigned irq; 187 unsigned irq;
183 188
184 exit_idle();
185 irq_enter(); 189 irq_enter();
190 exit_idle();
186 191
187 irq = __this_cpu_read(vector_irq[vector]); 192 irq = __this_cpu_read(vector_irq[vector]);
188 193
@@ -209,10 +214,10 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
209 214
210 ack_APIC_irq(); 215 ack_APIC_irq();
211 216
212 exit_idle();
213
214 irq_enter(); 217 irq_enter();
215 218
219 exit_idle();
220
216 inc_irq_stat(x86_platform_ipis); 221 inc_irq_stat(x86_platform_ipis);
217 222
218 if (x86_platform_ipi_callback) 223 if (x86_platform_ipi_callback)
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index ea9d5f2f13ef..2889b3d43882 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -50,7 +50,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
50 put_online_cpus(); 50 put_online_cpus();
51} 51}
52 52
53void arch_jump_label_transform_static(struct jump_entry *entry, 53__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
54 enum jump_label_type type) 54 enum jump_label_type type)
55{ 55{
56 __jump_label_transform(entry, type, text_poke_early); 56 __jump_label_transform(entry, type, text_poke_early);
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 0741b062a304..ca470e4c92dc 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -564,9 +564,7 @@ void __init default_get_smp_config(unsigned int early)
564 564
565static void __init smp_reserve_memory(struct mpf_intel *mpf) 565static void __init smp_reserve_memory(struct mpf_intel *mpf)
566{ 566{
567 unsigned long size = get_mpc_size(mpf->physptr); 567 memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr));
568
569 memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc");
570} 568}
571 569
572static int __init smp_scan_config(unsigned long base, unsigned long length) 570static int __init smp_scan_config(unsigned long base, unsigned long length)
@@ -595,7 +593,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
595 mpf, (u64)virt_to_phys(mpf)); 593 mpf, (u64)virt_to_phys(mpf));
596 594
597 mem = virt_to_phys(mpf); 595 mem = virt_to_phys(mpf);
598 memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf"); 596 memblock_reserve(mem, sizeof(*mpf));
599 if (mpf->physptr) 597 if (mpf->physptr)
600 smp_reserve_memory(mpf); 598 smp_reserve_memory(mpf);
601 599
@@ -836,10 +834,8 @@ early_param("alloc_mptable", parse_alloc_mptable_opt);
836 834
837void __init early_reserve_e820_mpc_new(void) 835void __init early_reserve_e820_mpc_new(void)
838{ 836{
839 if (enable_update_mptable && alloc_mptable) { 837 if (enable_update_mptable && alloc_mptable)
840 u64 startt = 0; 838 mpc_new_phys = early_reserve_e820(mpc_new_length, 4);
841 mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
842 }
843} 839}
844 840
845static int __init update_mp_table(void) 841static int __init update_mp_table(void)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ee5d4fbd53b4..15763af7bfe3 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -293,7 +293,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
293 regs.orig_ax = -1; 293 regs.orig_ax = -1;
294 regs.ip = (unsigned long) kernel_thread_helper; 294 regs.ip = (unsigned long) kernel_thread_helper;
295 regs.cs = __KERNEL_CS | get_kernel_rpl(); 295 regs.cs = __KERNEL_CS | get_kernel_rpl();
296 regs.flags = X86_EFLAGS_IF | 0x2; 296 regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
297 297
298 /* Ok, create the new process.. */ 298 /* Ok, create the new process.. */
299 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); 299 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 795b79f984c2..485204f58cda 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -99,7 +99,8 @@ void cpu_idle(void)
99 99
100 /* endless idle loop with no priority at all */ 100 /* endless idle loop with no priority at all */
101 while (1) { 101 while (1) {
102 tick_nohz_stop_sched_tick(1); 102 tick_nohz_idle_enter();
103 rcu_idle_enter();
103 while (!need_resched()) { 104 while (!need_resched()) {
104 105
105 check_pgt_cache(); 106 check_pgt_cache();
@@ -116,7 +117,8 @@ void cpu_idle(void)
116 pm_idle(); 117 pm_idle();
117 start_critical_timings(); 118 start_critical_timings();
118 } 119 }
119 tick_nohz_restart_sched_tick(); 120 rcu_idle_exit();
121 tick_nohz_idle_exit();
120 preempt_enable_no_resched(); 122 preempt_enable_no_resched();
121 schedule(); 123 schedule();
122 preempt_disable(); 124 preempt_disable();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3bd7e6eebf31..9b9fe4a85c87 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -122,7 +122,7 @@ void cpu_idle(void)
122 122
123 /* endless idle loop with no priority at all */ 123 /* endless idle loop with no priority at all */
124 while (1) { 124 while (1) {
125 tick_nohz_stop_sched_tick(1); 125 tick_nohz_idle_enter();
126 while (!need_resched()) { 126 while (!need_resched()) {
127 127
128 rmb(); 128 rmb();
@@ -139,8 +139,14 @@ void cpu_idle(void)
139 enter_idle(); 139 enter_idle();
140 /* Don't trace irqs off for idle */ 140 /* Don't trace irqs off for idle */
141 stop_critical_timings(); 141 stop_critical_timings();
142
143 /* enter_idle() needs rcu for notifiers */
144 rcu_idle_enter();
145
142 if (cpuidle_idle_call()) 146 if (cpuidle_idle_call())
143 pm_idle(); 147 pm_idle();
148
149 rcu_idle_exit();
144 start_critical_timings(); 150 start_critical_timings();
145 151
146 /* In many cases the interrupt that ended idle 152 /* In many cases the interrupt that ended idle
@@ -149,7 +155,7 @@ void cpu_idle(void)
149 __exit_idle(); 155 __exit_idle();
150 } 156 }
151 157
152 tick_nohz_restart_sched_tick(); 158 tick_nohz_idle_exit();
153 preempt_enable_no_resched(); 159 preempt_enable_no_resched();
154 schedule(); 160 schedule();
155 preempt_disable(); 161 preempt_disable();
@@ -293,13 +299,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
293 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); 299 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
294 300
295 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { 301 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
296 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 302 p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
303 IO_BITMAP_BYTES, GFP_KERNEL);
297 if (!p->thread.io_bitmap_ptr) { 304 if (!p->thread.io_bitmap_ptr) {
298 p->thread.io_bitmap_max = 0; 305 p->thread.io_bitmap_max = 0;
299 return -ENOMEM; 306 return -ENOMEM;
300 } 307 }
301 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
302 IO_BITMAP_BYTES);
303 set_tsk_thread_flag(p, TIF_IO_BITMAP); 308 set_tsk_thread_flag(p, TIF_IO_BITMAP);
304 } 309 }
305 310
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 82528799c5de..89a04c7b5bb6 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -749,7 +749,8 @@ put:
749/* 749/*
750 * Handle PTRACE_POKEUSR calls for the debug register area. 750 * Handle PTRACE_POKEUSR calls for the debug register area.
751 */ 751 */
752int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) 752static int ptrace_set_debugreg(struct task_struct *tsk, int n,
753 unsigned long val)
753{ 754{
754 struct thread_struct *thread = &(tsk->thread); 755 struct thread_struct *thread = &(tsk->thread);
755 int rc = 0; 756 int rc = 0;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cf0ef986cb6d..d05444ac2aea 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -306,7 +306,8 @@ static void __init cleanup_highmap(void)
306static void __init reserve_brk(void) 306static void __init reserve_brk(void)
307{ 307{
308 if (_brk_end > _brk_start) 308 if (_brk_end > _brk_start)
309 memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK"); 309 memblock_reserve(__pa(_brk_start),
310 __pa(_brk_end) - __pa(_brk_start));
310 311
311 /* Mark brk area as locked down and no longer taking any 312 /* Mark brk area as locked down and no longer taking any
312 new allocations */ 313 new allocations */
@@ -331,13 +332,13 @@ static void __init relocate_initrd(void)
331 ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, 332 ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
332 PAGE_SIZE); 333 PAGE_SIZE);
333 334
334 if (ramdisk_here == MEMBLOCK_ERROR) 335 if (!ramdisk_here)
335 panic("Cannot find place for new RAMDISK of size %lld\n", 336 panic("Cannot find place for new RAMDISK of size %lld\n",
336 ramdisk_size); 337 ramdisk_size);
337 338
338 /* Note: this includes all the lowmem currently occupied by 339 /* Note: this includes all the lowmem currently occupied by
 339 the initrd; we rely on that fact to keep the data intact. */ 340 the initrd; we rely on that fact to keep the data intact. */
340 memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); 341 memblock_reserve(ramdisk_here, area_size);
341 initrd_start = ramdisk_here + PAGE_OFFSET; 342 initrd_start = ramdisk_here + PAGE_OFFSET;
342 initrd_end = initrd_start + ramdisk_size; 343 initrd_end = initrd_start + ramdisk_size;
343 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", 344 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
@@ -393,7 +394,7 @@ static void __init reserve_initrd(void)
393 initrd_start = 0; 394 initrd_start = 0;
394 395
395 if (ramdisk_size >= (end_of_lowmem>>1)) { 396 if (ramdisk_size >= (end_of_lowmem>>1)) {
396 memblock_x86_free_range(ramdisk_image, ramdisk_end); 397 memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
397 printk(KERN_ERR "initrd too large to handle, " 398 printk(KERN_ERR "initrd too large to handle, "
398 "disabling initrd\n"); 399 "disabling initrd\n");
399 return; 400 return;
@@ -416,7 +417,7 @@ static void __init reserve_initrd(void)
416 417
417 relocate_initrd(); 418 relocate_initrd();
418 419
419 memblock_x86_free_range(ramdisk_image, ramdisk_end); 420 memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
420} 421}
421#else 422#else
422static void __init reserve_initrd(void) 423static void __init reserve_initrd(void)
@@ -490,15 +491,13 @@ static void __init memblock_x86_reserve_range_setup_data(void)
490{ 491{
491 struct setup_data *data; 492 struct setup_data *data;
492 u64 pa_data; 493 u64 pa_data;
493 char buf[32];
494 494
495 if (boot_params.hdr.version < 0x0209) 495 if (boot_params.hdr.version < 0x0209)
496 return; 496 return;
497 pa_data = boot_params.hdr.setup_data; 497 pa_data = boot_params.hdr.setup_data;
498 while (pa_data) { 498 while (pa_data) {
499 data = early_memremap(pa_data, sizeof(*data)); 499 data = early_memremap(pa_data, sizeof(*data));
500 sprintf(buf, "setup data %x", data->type); 500 memblock_reserve(pa_data, sizeof(*data) + data->len);
501 memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf);
502 pa_data = data->next; 501 pa_data = data->next;
503 early_iounmap(data, sizeof(*data)); 502 early_iounmap(data, sizeof(*data));
504 } 503 }
@@ -554,7 +553,7 @@ static void __init reserve_crashkernel(void)
554 crash_base = memblock_find_in_range(alignment, 553 crash_base = memblock_find_in_range(alignment,
555 CRASH_KERNEL_ADDR_MAX, crash_size, alignment); 554 CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
556 555
557 if (crash_base == MEMBLOCK_ERROR) { 556 if (!crash_base) {
558 pr_info("crashkernel reservation failed - No suitable area found.\n"); 557 pr_info("crashkernel reservation failed - No suitable area found.\n");
559 return; 558 return;
560 } 559 }
@@ -568,7 +567,7 @@ static void __init reserve_crashkernel(void)
568 return; 567 return;
569 } 568 }
570 } 569 }
571 memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL"); 570 memblock_reserve(crash_base, crash_size);
572 571
573 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " 572 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
574 "for crashkernel (System RAM: %ldMB)\n", 573 "for crashkernel (System RAM: %ldMB)\n",
@@ -626,7 +625,7 @@ static __init void reserve_ibft_region(void)
626 addr = find_ibft_region(&size); 625 addr = find_ibft_region(&size);
627 626
628 if (size) 627 if (size)
629 memblock_x86_reserve_range(addr, addr + size, "* ibft"); 628 memblock_reserve(addr, size);
630} 629}
631 630
632static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; 631static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9f548cb4a958..e38e21754eea 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -840,7 +840,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
840 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); 840 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
841 841
842 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || 842 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
843 !physid_isset(apicid, phys_cpu_present_map)) { 843 !physid_isset(apicid, phys_cpu_present_map) ||
844 (!x2apic_mode && apicid >= 255)) {
844 printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); 845 printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
845 return -EINVAL; 846 return -EINVAL;
846 } 847 }
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index a91ae7709b49..a73b61055ad6 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -14,11 +14,11 @@ void __init setup_trampolines(void)
14 14
15 /* Has to be in very low memory so we can execute real-mode AP code. */ 15 /* Has to be in very low memory so we can execute real-mode AP code. */
16 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); 16 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
17 if (mem == MEMBLOCK_ERROR) 17 if (!mem)
18 panic("Cannot allocate trampoline\n"); 18 panic("Cannot allocate trampoline\n");
19 19
20 x86_trampoline_base = __va(mem); 20 x86_trampoline_base = __va(mem);
21 memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE"); 21 memblock_reserve(mem, size);
22 22
23 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", 23 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
24 x86_trampoline_base, (unsigned long long)mem, size); 24 x86_trampoline_base, (unsigned long long)mem, size);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a8e3eb83466c..fa1191fb679d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -306,15 +306,10 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
306 == NOTIFY_STOP) 306 == NOTIFY_STOP)
307 return; 307 return;
308#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ 308#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
309#ifdef CONFIG_KPROBES 309
310 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) 310 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
311 == NOTIFY_STOP) 311 == NOTIFY_STOP)
312 return; 312 return;
313#else
314 if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
315 == NOTIFY_STOP)
316 return;
317#endif
318 313
319 preempt_conditional_sti(regs); 314 preempt_conditional_sti(regs);
320 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); 315 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index db483369f10b..2c9cf0fd78f5 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -35,7 +35,7 @@ static int __read_mostly tsc_unstable;
35 erroneous rdtsc usage on !cpu_has_tsc processors */ 35 erroneous rdtsc usage on !cpu_has_tsc processors */
36static int __read_mostly tsc_disabled = -1; 36static int __read_mostly tsc_disabled = -1;
37 37
38static int tsc_clocksource_reliable; 38int tsc_clocksource_reliable;
39/* 39/*
40 * Scheduler clock - returns current time in nanosec units. 40 * Scheduler clock - returns current time in nanosec units.
41 */ 41 */
@@ -178,11 +178,11 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
178} 178}
179 179
180#define CAL_MS 10 180#define CAL_MS 10
181#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS)) 181#define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS))
182#define CAL_PIT_LOOPS 1000 182#define CAL_PIT_LOOPS 1000
183 183
184#define CAL2_MS 50 184#define CAL2_MS 50
185#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS)) 185#define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS))
186#define CAL2_PIT_LOOPS 5000 186#define CAL2_PIT_LOOPS 5000
187 187
188 188
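
The CAL_LATCH edit is a rename rather than a behavioral change: on x86, CLOCK_TICK_RATE was defined as PIT_TICK_RATE, the 1193182 Hz input clock of the i8253/i8254 timer, so the computed latch values are identical. A worked example of the macro arithmetic, runnable in userspace:

    #include <stdio.h>

    #define PIT_TICK_RATE 1193182u   /* i8253 input clock, Hz */

    #define CAL_MS      10
    #define CAL_LATCH   (PIT_TICK_RATE / (1000 / CAL_MS))
    #define CAL2_MS     50
    #define CAL2_LATCH  (PIT_TICK_RATE / (1000 / CAL2_MS))

    int main(void)
    {
        /* 1193182 / 100 = 11931 counts -> ~10 ms calibration gate */
        printf("CAL_LATCH  = %u (%.2f ms)\n", CAL_LATCH,
               CAL_LATCH * 1000.0 / PIT_TICK_RATE);
        /* 1193182 / 20  = 59659 counts -> ~50 ms calibration gate */
        printf("CAL2_LATCH = %u (%.2f ms)\n", CAL2_LATCH,
               CAL2_LATCH * 1000.0 / PIT_TICK_RATE);
        return 0;
    }
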
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 0aa5fed8b9e6..9eba29b46cb7 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -113,7 +113,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
113 if (unsynchronized_tsc()) 113 if (unsynchronized_tsc())
114 return; 114 return;
115 115
116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (tsc_clocksource_reliable) {
117 if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) 117 if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
118 pr_info( 118 pr_info(
119 "Skipped synchronization checks as TSC is reliable.\n"); 119 "Skipped synchronization checks as TSC is reliable.\n");
@@ -172,7 +172,7 @@ void __cpuinit check_tsc_sync_target(void)
172{ 172{
173 int cpus = 2; 173 int cpus = 2;
174 174
175 if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) 175 if (unsynchronized_tsc() || tsc_clocksource_reliable)
176 return; 176 return;
177 177
178 /* 178 /*
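
Switching tsc_sync.c from boot_cpu_has(X86_FEATURE_TSC_RELIABLE) to the now-exported tsc_clocksource_reliable flag means the sync-check skip also honors software overrides such as the tsc=reliable boot parameter, not only the CPUID-derived feature bit. A sketch of the idea, with mocked-up names standing in for the kernel's setup path:

    #include <stdio.h>
    #include <string.h>

    static int tsc_clocksource_reliable;   /* the flag tsc_sync.c now tests */
    static int cpu_has_tsc_reliable;       /* would come from CPU detection */

    static void tsc_setup(const char *cmdline_opt)
    {
        if (cpu_has_tsc_reliable)
            tsc_clocksource_reliable = 1;
        if (cmdline_opt && !strcmp(cmdline_opt, "reliable"))
            tsc_clocksource_reliable = 1;  /* tsc=reliable override */
    }

    int main(void)
    {
        tsc_setup("reliable");
        if (tsc_clocksource_reliable)
            puts("Skipped synchronization checks as TSC is reliable.");
        return 0;
    }
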
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index e4d4a22e8b94..b07ba9393564 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -57,7 +57,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
57 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), 57 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
58}; 58};
59 59
60static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE; 60static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
61 61
62static int __init vsyscall_setup(char *str) 62static int __init vsyscall_setup(char *str)
63{ 63{
@@ -140,11 +140,40 @@ static int addr_to_vsyscall_nr(unsigned long addr)
140 return nr; 140 return nr;
141} 141}
142 142
143static bool write_ok_or_segv(unsigned long ptr, size_t size)
144{
145 /*
146 * XXX: if access_ok, get_user, and put_user handled
147 * sig_on_uaccess_error, this could go away.
148 */
149
150 if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
151 siginfo_t info;
152 struct thread_struct *thread = &current->thread;
153
154 thread->error_code = 6; /* user fault, no page, write */
155 thread->cr2 = ptr;
156 thread->trap_no = 14;
157
158 memset(&info, 0, sizeof(info));
159 info.si_signo = SIGSEGV;
160 info.si_errno = 0;
161 info.si_code = SEGV_MAPERR;
162 info.si_addr = (void __user *)ptr;
163
164 force_sig_info(SIGSEGV, &info, current);
165 return false;
166 } else {
167 return true;
168 }
169}
170
143bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) 171bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
144{ 172{
145 struct task_struct *tsk; 173 struct task_struct *tsk;
146 unsigned long caller; 174 unsigned long caller;
147 int vsyscall_nr; 175 int vsyscall_nr;
176 int prev_sig_on_uaccess_error;
148 long ret; 177 long ret;
149 178
150 /* 179 /*
@@ -180,35 +209,65 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
180 if (seccomp_mode(&tsk->seccomp)) 209 if (seccomp_mode(&tsk->seccomp))
181 do_exit(SIGKILL); 210 do_exit(SIGKILL);
182 211
212 /*
213 * With a real vsyscall, page faults cause SIGSEGV. We want to
214 * preserve that behavior to make writing exploits harder.
215 */
216 prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
217 current_thread_info()->sig_on_uaccess_error = 1;
218
219 /*
220 * 0 is a valid user pointer (in the access_ok sense) on 32-bit and
221 * 64-bit, so we don't need to special-case it here. For all the
222 * vsyscalls, 0 means "don't write anything" not "write it at
223 * address 0".
224 */
225 ret = -EFAULT;
183 switch (vsyscall_nr) { 226 switch (vsyscall_nr) {
184 case 0: 227 case 0:
228 if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
229 !write_ok_or_segv(regs->si, sizeof(struct timezone)))
230 break;
231
185 ret = sys_gettimeofday( 232 ret = sys_gettimeofday(
186 (struct timeval __user *)regs->di, 233 (struct timeval __user *)regs->di,
187 (struct timezone __user *)regs->si); 234 (struct timezone __user *)regs->si);
188 break; 235 break;
189 236
190 case 1: 237 case 1:
238 if (!write_ok_or_segv(regs->di, sizeof(time_t)))
239 break;
240
191 ret = sys_time((time_t __user *)regs->di); 241 ret = sys_time((time_t __user *)regs->di);
192 break; 242 break;
193 243
194 case 2: 244 case 2:
245 if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
246 !write_ok_or_segv(regs->si, sizeof(unsigned)))
247 break;
248
195 ret = sys_getcpu((unsigned __user *)regs->di, 249 ret = sys_getcpu((unsigned __user *)regs->di,
196 (unsigned __user *)regs->si, 250 (unsigned __user *)regs->si,
197 0); 251 0);
198 break; 252 break;
199 } 253 }
200 254
255 current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
256
201 if (ret == -EFAULT) { 257 if (ret == -EFAULT) {
202 /* 258 /* Bad news -- userspace fed a bad pointer to a vsyscall. */
203 * Bad news -- userspace fed a bad pointer to a vsyscall.
204 *
205 * With a real vsyscall, that would have caused SIGSEGV.
206 * To make writing reliable exploits using the emulated
207 * vsyscalls harder, generate SIGSEGV here as well.
208 */
209 warn_bad_vsyscall(KERN_INFO, regs, 259 warn_bad_vsyscall(KERN_INFO, regs,
210 "vsyscall fault (exploit attempt?)"); 260 "vsyscall fault (exploit attempt?)");
211 goto sigsegv; 261
262 /*
263 * If we failed to generate a signal for any reason,
264 * generate one here. (This should be impossible.)
265 */
266 if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
267 !sigismember(&tsk->pending.signal, SIGSEGV)))
268 goto sigsegv;
269
270 return true; /* Don't emulate the ret. */
212 } 271 }
213 272
214 regs->ax = ret; 273 regs->ax = ret;
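
The vsyscall rework above validates every output pointer before dispatching, starts ret at -EFAULT, and makes a failed check raise the same SIGSEGV that a fault through a real vsyscall page would have produced. A hedged userspace sketch of that validate-then-dispatch shape; check_write() is a toy stand-in for write_ok_or_segv(), and the NULL test replaces the kernel's access_ok():

    #include <errno.h>
    #include <signal.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    static bool check_write(void *ptr, size_t size)
    {
        (void)size;
        if (ptr == NULL) {          /* toy validity test */
            raise(SIGSEGV);         /* kernel: force_sig_info(SIGSEGV, ...) */
            return false;
        }
        return true;
    }

    static long emulated_time(long *out)
    {
        long ret = -EFAULT;         /* assume the worst, as the patch does */

        if (!check_write(out, sizeof(*out)))
            return ret;             /* signal already queued */
        *out = 42;                  /* the real syscall work would go here */
        return 0;
    }

    int main(void)
    {
        long t;

        signal(SIGSEGV, SIG_IGN);   /* keep the demo alive after raise() */
        printf("good pointer -> %ld\n", emulated_time(&t));
        printf("bad pointer  -> %ld\n", emulated_time(NULL));
        return 0;
    }
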
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c1d6cd549397..91f83e21b989 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -92,6 +92,7 @@ struct x86_init_ops x86_init __initdata = {
92 92
93struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 93struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
94 .setup_percpu_clockev = setup_secondary_APIC_clock, 94 .setup_percpu_clockev = setup_secondary_APIC_clock,
95 .fixup_cpu_id = x86_default_fixup_cpu_id,
95}; 96};
96 97
97static void default_nmi_init(void) { }; 98static void default_nmi_init(void) { };
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 76e3f1cd0369..405f2620392f 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -338,11 +338,15 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
338 return HRTIMER_NORESTART; 338 return HRTIMER_NORESTART;
339} 339}
340 340
341static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) 341static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
342{ 342{
343 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
343 struct kvm_timer *pt = &ps->pit_timer; 344 struct kvm_timer *pt = &ps->pit_timer;
344 s64 interval; 345 s64 interval;
345 346
347 if (!irqchip_in_kernel(kvm))
348 return;
349
346 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 350 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
347 351
348 pr_debug("create pit timer, interval is %llu nsec\n", interval); 352 pr_debug("create pit timer, interval is %llu nsec\n", interval);
@@ -394,13 +398,13 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
394 /* FIXME: enhance mode 4 precision */ 398 /* FIXME: enhance mode 4 precision */
395 case 4: 399 case 4:
396 if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { 400 if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
397 create_pit_timer(ps, val, 0); 401 create_pit_timer(kvm, val, 0);
398 } 402 }
399 break; 403 break;
400 case 2: 404 case 2:
401 case 3: 405 case 3:
402 if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ 406 if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){
403 create_pit_timer(ps, val, 1); 407 create_pit_timer(kvm, val, 1);
404 } 408 }
405 break; 409 break;
406 default: 410 default:
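
Passing struct kvm * into create_pit_timer() and checking irqchip_in_kernel() inside the callee closes a NULL dereference: with a userspace irqchip, kvm->arch.vpit is NULL, so deriving the pit_state in the caller was unsafe. A toy model of the guard-in-callee pattern; the types below are invented for the demo:

    #include <stdio.h>

    struct pit_state { int period; };
    struct vm { struct pit_state *vpit; int irqchip_in_kernel; };

    static void create_pit_timer(struct vm *vm, int val)
    {
        struct pit_state *ps;

        if (!vm->irqchip_in_kernel)   /* guard was missing: vpit may be NULL */
            return;
        ps = vm->vpit;                /* safe to derive only after the check */
        ps->period = val;
        printf("timer armed, period=%d\n", ps->period);
    }

    int main(void)
    {
        struct vm userspace_irqchip = { .vpit = NULL, .irqchip_in_kernel = 0 };

        create_pit_timer(&userspace_irqchip, 100);   /* now a safe no-op */
        return 0;
    }
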
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c38efd7b792e..4c938da2ba00 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -602,7 +602,6 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
602{ 602{
603 struct kvm_cpuid_entry2 *best; 603 struct kvm_cpuid_entry2 *best;
604 struct kvm_lapic *apic = vcpu->arch.apic; 604 struct kvm_lapic *apic = vcpu->arch.apic;
605 u32 timer_mode_mask;
606 605
607 best = kvm_find_cpuid_entry(vcpu, 1, 0); 606 best = kvm_find_cpuid_entry(vcpu, 1, 0);
608 if (!best) 607 if (!best)
@@ -615,15 +614,12 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
615 best->ecx |= bit(X86_FEATURE_OSXSAVE); 614 best->ecx |= bit(X86_FEATURE_OSXSAVE);
616 } 615 }
617 616
618 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && 617 if (apic) {
619 best->function == 0x1) { 618 if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
620 best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER); 619 apic->lapic_timer.timer_mode_mask = 3 << 17;
621 timer_mode_mask = 3 << 17; 620 else
622 } else 621 apic->lapic_timer.timer_mode_mask = 1 << 17;
623 timer_mode_mask = 1 << 17; 622 }
624
625 if (apic)
626 apic->lapic_timer.timer_mode_mask = timer_mode_mask;
627} 623}
628 624
629int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 625int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -2135,6 +2131,9 @@ int kvm_dev_ioctl_check_extension(long ext)
2135 case KVM_CAP_TSC_CONTROL: 2131 case KVM_CAP_TSC_CONTROL:
2136 r = kvm_has_tsc_control; 2132 r = kvm_has_tsc_control;
2137 break; 2133 break;
2134 case KVM_CAP_TSC_DEADLINE_TIMER:
2135 r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
2136 break;
2138 default: 2137 default:
2139 r = 0; 2138 r = 0;
2140 break; 2139 break;
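
The update_cpuid() rework sizes the LVT timer mode mask per guest: if guest CPUID.1:ECX advertises the TSC-deadline timer (bit 24), the local APIC timer has two mode bits (mask 3 << 17); otherwise only the periodic-mode bit (1 << 17). A small self-contained illustration of that selection:

    #include <stdio.h>

    #define TSC_DEADLINE_BIT (1u << 24)   /* CPUID.1:ECX bit 24 */

    static unsigned timer_mode_mask(unsigned guest_cpuid_ecx)
    {
        return (guest_cpuid_ecx & TSC_DEADLINE_BIT) ? (3u << 17)
                                                    : (1u << 17);
    }

    int main(void)
    {
        printf("with deadline timer:    %#x\n",
               timer_mode_mask(TSC_DEADLINE_BIT));
        printf("without deadline timer: %#x\n", timer_mode_mask(0));
        return 0;
    }
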
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
index 46fc4ee09fc4..88ad5fbda6e1 100644
--- a/arch/x86/lib/inat.c
+++ b/arch/x86/lib/inat.c
@@ -82,9 +82,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
82 const insn_attr_t *table; 82 const insn_attr_t *table;
83 if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX) 83 if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
84 return 0; 84 return 0;
85 table = inat_avx_tables[vex_m][vex_p]; 85 /* At first, this checks the master table */
86 table = inat_avx_tables[vex_m][0];
86 if (!table) 87 if (!table)
87 return 0; 88 return 0;
89 if (!inat_is_group(table[opcode]) && vex_p) {
90 /* If this is not a group, get attribute directly */
91 table = inat_avx_tables[vex_m][vex_p];
92 if (!table)
93 return 0;
94 }
88 return table[opcode]; 95 return table[opcode];
89} 96}
90 97
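
The inat.c fix makes AVX group decoding work by always consulting the master table (last-prefix index 0) first, since that is where group attributes live, and only chasing the prefix-specific table for non-group opcodes. A toy model of the two-stage lookup; the tables and the GROUP flag below are invented for the demo, not the kernel's inat encodings:

    #include <stdio.h>

    #define GROUP 0x100                        /* pretend inat group flag */

    static const unsigned master[256] = { [0x71] = GROUP };
    static const unsigned pfx66[256]  = { [0x58] = 0x58 };
    static const unsigned *tables[4]  = { master, pfx66, NULL, NULL };

    static unsigned avx_attr(unsigned char opcode, int vex_p)
    {
        const unsigned *table = tables[0];     /* master table first */

        if (!(table[opcode] & GROUP) && vex_p) {
            table = tables[vex_p];             /* prefix-specific table */
            if (!table)
                return 0;
        }
        return table[opcode];
    }

    int main(void)
    {
        printf("0x71 (group)     -> %#x\n", avx_attr(0x71, 1)); /* master */
        printf("0x58 (non-group) -> %#x\n", avx_attr(0x58, 1)); /* pfx66 */
        return 0;
    }
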
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 374562ed6704..5a1f9f3e3fbb 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -202,7 +202,7 @@ void insn_get_opcode(struct insn *insn)
202 m = insn_vex_m_bits(insn); 202 m = insn_vex_m_bits(insn);
203 p = insn_vex_p_bits(insn); 203 p = insn_vex_p_bits(insn);
204 insn->attr = inat_get_avx_attribute(op, m, p); 204 insn->attr = inat_get_avx_attribute(op, m, p);
205 if (!inat_accept_vex(insn->attr)) 205 if (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))
206 insn->attr = 0; /* This instruction is bad */ 206 insn->attr = 0; /* This instruction is bad */
207 goto end; /* VEX has only 1 byte for opcode */ 207 goto end; /* VEX has only 1 byte for opcode */
208 } 208 }
@@ -249,6 +249,8 @@ void insn_get_modrm(struct insn *insn)
249 pfx = insn_last_prefix(insn); 249 pfx = insn_last_prefix(insn);
250 insn->attr = inat_get_group_attribute(mod, pfx, 250 insn->attr = inat_get_group_attribute(mod, pfx,
251 insn->attr); 251 insn->attr);
252 if (insn_is_avx(insn) && !inat_accept_vex(insn->attr))
253 insn->attr = 0; /* This is bad */
252 } 254 }
253 } 255 }
254 256
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index 82004d2bf05e..bd59090825db 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -164,15 +164,13 @@ EXPORT_SYMBOL(strchr);
164size_t strlen(const char *s) 164size_t strlen(const char *s)
165{ 165{
166 int d0; 166 int d0;
167 int res; 167 size_t res;
168 asm volatile("repne\n\t" 168 asm volatile("repne\n\t"
169 "scasb\n\t" 169 "scasb"
170 "notl %0\n\t"
171 "decl %0"
172 : "=c" (res), "=&D" (d0) 170 : "=c" (res), "=&D" (d0)
173 : "1" (s), "a" (0), "0" (0xffffffffu) 171 : "1" (s), "a" (0), "0" (0xffffffffu)
174 : "memory"); 172 : "memory");
175 return res; 173 return ~res - 1;
176} 174}
177EXPORT_SYMBOL(strlen); 175EXPORT_SYMBOL(strlen);
178#endif 176#endif
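
The strlen() change moves the post-processing out of the asm: repne scasb starts with ECX = 0xffffffff and decrements it once per byte examined, terminator included, leaving ecx = 0xffffffff - (len + 1), hence len = ~ecx - 1. The old code computed that with notl/decl inside the asm; the new code returns ~res - 1 in C, where the compiler can schedule it (and res becomes size_t to match the prototype). A pure-C simulation of the identity:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Model of the repne scasb count register. */
    static uint32_t scasb_count(const char *s)
    {
        uint32_t ecx = 0xffffffffu;

        do
            ecx--;               /* one decrement per byte, NUL included */
        while (*s++);
        return ecx;
    }

    int main(void)
    {
        const char *s = "trampoline";
        uint32_t res = scasb_count(s);

        printf("res = %#x, ~res - 1 = %u, strlen = %zu\n",
               res, ~res - 1u, strlen(s));
        assert(~res - 1u == strlen(s));
        return 0;
    }
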
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index a793da5e560e..5b83c51c12e0 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -1,5 +1,11 @@
1# x86 Opcode Maps 1# x86 Opcode Maps
2# 2#
3# This is (mostly) based on following documentations.
4# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2
5# (#325383-040US, October 2011)
6# - Intel(R) Advanced Vector Extensions Programming Reference
7# (#319433-011,JUNE 2011).
8#
3#<Opcode maps> 9#<Opcode maps>
4# Table: table-name 10# Table: table-name
5# Referrer: escaped-name 11# Referrer: escaped-name
@@ -15,10 +21,13 @@
15# EndTable 21# EndTable
16# 22#
17# AVX Superscripts 23# AVX Superscripts
18# (VEX): this opcode can accept VEX prefix. 24# (v): this opcode requires VEX prefix.
19# (oVEX): this opcode requires VEX prefix. 25# (v1): this opcode only supports 128bit VEX.
20# (o128): this opcode only supports 128bit VEX. 26#
21# (o256): this opcode only supports 256bit VEX. 27# Last Prefix Superscripts
28# - (66): the last prefix is 0x66
29# - (F3): the last prefix is 0xF3
30# - (F2): the last prefix is 0xF2
22# 31#
23 32
24Table: one byte opcode 33Table: one byte opcode
@@ -199,8 +208,8 @@ a0: MOV AL,Ob
199a1: MOV rAX,Ov 208a1: MOV rAX,Ov
200a2: MOV Ob,AL 209a2: MOV Ob,AL
201a3: MOV Ov,rAX 210a3: MOV Ov,rAX
202a4: MOVS/B Xb,Yb 211a4: MOVS/B Yb,Xb
203a5: MOVS/W/D/Q Xv,Yv 212a5: MOVS/W/D/Q Yv,Xv
204a6: CMPS/B Xb,Yb 213a6: CMPS/B Xb,Yb
205a7: CMPS/W/D Xv,Yv 214a7: CMPS/W/D Xv,Yv
206a8: TEST AL,Ib 215a8: TEST AL,Ib
@@ -233,8 +242,8 @@ c0: Grp2 Eb,Ib (1A)
233c1: Grp2 Ev,Ib (1A) 242c1: Grp2 Ev,Ib (1A)
234c2: RETN Iw (f64) 243c2: RETN Iw (f64)
235c3: RETN 244c3: RETN
236c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix) 245c4: LES Gz,Mp (i64) | VEX+2byte (Prefix)
237c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix) 246c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix)
238c6: Grp11 Eb,Ib (1A) 247c6: Grp11 Eb,Ib (1A)
239c7: Grp11 Ev,Iz (1A) 248c7: Grp11 Ev,Iz (1A)
240c8: ENTER Iw,Ib 249c8: ENTER Iw,Ib
@@ -320,14 +329,19 @@ AVXcode: 1
320# 3DNow! uses the last imm byte as opcode extension. 329# 3DNow! uses the last imm byte as opcode extension.
3210f: 3DNow! Pq,Qq,Ib 3300f: 3DNow! Pq,Qq,Ib
322# 0x0f 0x10-0x1f 331# 0x0f 0x10-0x1f
32310: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128) 332# NOTE: According to Intel SDM opcode map, vmovups and vmovupd has no operands
32411: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128) 333# but it actually has operands. And also, vmovss and vmovsd only accept 128bit.
32512: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX) 334# MOVSS/MOVSD has too many forms(3) on SDM. This map just shows a typical form.
32613: mpvlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128) 335# Many AVX instructions lack v1 superscript, according to Intel AVX-Prgramming
32714: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX) 336# Reference A.1
32815: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX) 33710: vmovups Vps,Wps | vmovupd Vpd,Wpd (66) | vmovss Vx,Hx,Wss (F3),(v1) | vmovsd Vx,Hx,Wsd (F2),(v1)
32916: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlsps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX) 33811: vmovups Wps,Vps | vmovupd Wpd,Vpd (66) | vmovss Wss,Hx,Vss (F3),(v1) | vmovsd Wsd,Hx,Vsd (F2),(v1)
33017: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128) 33912: vmovlps Vq,Hq,Mq (v1) | vmovhlps Vq,Hq,Uq (v1) | vmovlpd Vq,Hq,Mq (66),(v1) | vmovsldup Vx,Wx (F3) | vmovddup Vx,Wx (F2)
34013: vmovlps Mq,Vq (v1) | vmovlpd Mq,Vq (66),(v1)
34114: vunpcklps Vx,Hx,Wx | vunpcklpd Vx,Hx,Wx (66)
34215: vunpckhps Vx,Hx,Wx | vunpckhpd Vx,Hx,Wx (66)
34316: vmovhps Vdq,Hq,Mq (v1) | vmovlhps Vdq,Hq,Uq (v1) | vmovhpd Vdq,Hq,Mq (66),(v1) | vmovshdup Vx,Wx (F3)
34417: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1)
33118: Grp16 (1A) 34518: Grp16 (1A)
33219: 34619:
3331a: 3471a:
@@ -345,14 +359,14 @@ AVXcode: 1
34525: 35925:
34626: 36026:
34727: 36127:
34828: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX) 36228: vmovaps Vps,Wps | vmovapd Vpd,Wpd (66)
34929: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX) 36329: vmovaps Wps,Vps | vmovapd Wpd,Vpd (66)
3502a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128) 3642a: cvtpi2ps Vps,Qpi | cvtpi2pd Vpd,Qpi (66) | vcvtsi2ss Vss,Hss,Ey (F3),(v1) | vcvtsi2sd Vsd,Hsd,Ey (F2),(v1)
3512b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX) 3652b: vmovntps Mps,Vps | vmovntpd Mpd,Vpd (66)
3522c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128) 3662c: cvttps2pi Ppi,Wps | cvttpd2pi Ppi,Wpd (66) | vcvttss2si Gy,Wss (F3),(v1) | vcvttsd2si Gy,Wsd (F2),(v1)
3532d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128) 3672d: cvtps2pi Ppi,Wps | cvtpd2pi Qpi,Wpd (66) | vcvtss2si Gy,Wss (F3),(v1) | vcvtsd2si Gy,Wsd (F2),(v1)
3542e: ucomiss Vss,Wss (VEX),(o128) | ucomisd Vsd,Wsd (66),(VEX),(o128) 3682e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1)
3552f: comiss Vss,Wss (VEX),(o128) | comisd Vsd,Wsd (66),(VEX),(o128) 3692f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1)
356# 0x0f 0x30-0x3f 370# 0x0f 0x30-0x3f
35730: WRMSR 37130: WRMSR
35831: RDTSC 37231: RDTSC
@@ -388,65 +402,66 @@ AVXcode: 1
3884e: CMOVLE/NG Gv,Ev 4024e: CMOVLE/NG Gv,Ev
3894f: CMOVNLE/G Gv,Ev 4034f: CMOVNLE/G Gv,Ev
390# 0x0f 0x50-0x5f 404# 0x0f 0x50-0x5f
39150: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX) 40550: vmovmskps Gy,Ups | vmovmskpd Gy,Upd (66)
39251: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128) 40651: vsqrtps Vps,Wps | vsqrtpd Vpd,Wpd (66) | vsqrtss Vss,Hss,Wss (F3),(v1) | vsqrtsd Vsd,Hsd,Wsd (F2),(v1)
39352: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128) 40752: vrsqrtps Vps,Wps | vrsqrtss Vss,Hss,Wss (F3),(v1)
39453: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128) 40853: vrcpps Vps,Wps | vrcpss Vss,Hss,Wss (F3),(v1)
39554: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX) 40954: vandps Vps,Hps,Wps | vandpd Vpd,Hpd,Wpd (66)
39655: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX) 41055: vandnps Vps,Hps,Wps | vandnpd Vpd,Hpd,Wpd (66)
39756: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX) 41156: vorps Vps,Hps,Wps | vorpd Vpd,Hpd,Wpd (66)
39857: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX) 41257: vxorps Vps,Hps,Wps | vxorpd Vpd,Hpd,Wpd (66)
39958: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128) 41358: vaddps Vps,Hps,Wps | vaddpd Vpd,Hpd,Wpd (66) | vaddss Vss,Hss,Wss (F3),(v1) | vaddsd Vsd,Hsd,Wsd (F2),(v1)
40059: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128) 41459: vmulps Vps,Hps,Wps | vmulpd Vpd,Hpd,Wpd (66) | vmulss Vss,Hss,Wss (F3),(v1) | vmulsd Vsd,Hsd,Wsd (F2),(v1)
4015a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128) 4155a: vcvtps2pd Vpd,Wps | vcvtpd2ps Vps,Wpd (66) | vcvtss2sd Vsd,Hx,Wss (F3),(v1) | vcvtsd2ss Vss,Hx,Wsd (F2),(v1)
4025b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX) 4165b: vcvtdq2ps Vps,Wdq | vcvtps2dq Vdq,Wps (66) | vcvttps2dq Vdq,Wps (F3)
4035c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128) 4175c: vsubps Vps,Hps,Wps | vsubpd Vpd,Hpd,Wpd (66) | vsubss Vss,Hss,Wss (F3),(v1) | vsubsd Vsd,Hsd,Wsd (F2),(v1)
4045d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128) 4185d: vminps Vps,Hps,Wps | vminpd Vpd,Hpd,Wpd (66) | vminss Vss,Hss,Wss (F3),(v1) | vminsd Vsd,Hsd,Wsd (F2),(v1)
4055e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128) 4195e: vdivps Vps,Hps,Wps | vdivpd Vpd,Hpd,Wpd (66) | vdivss Vss,Hss,Wss (F3),(v1) | vdivsd Vsd,Hsd,Wsd (F2),(v1)
4065f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128) 4205f: vmaxps Vps,Hps,Wps | vmaxpd Vpd,Hpd,Wpd (66) | vmaxss Vss,Hss,Wss (F3),(v1) | vmaxsd Vsd,Hsd,Wsd (F2),(v1)
407# 0x0f 0x60-0x6f 421# 0x0f 0x60-0x6f
40860: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128) 42260: punpcklbw Pq,Qd | vpunpcklbw Vx,Hx,Wx (66),(v1)
40961: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128) 42361: punpcklwd Pq,Qd | vpunpcklwd Vx,Hx,Wx (66),(v1)
41062: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128) 42462: punpckldq Pq,Qd | vpunpckldq Vx,Hx,Wx (66),(v1)
41163: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128) 42563: packsswb Pq,Qq | vpacksswb Vx,Hx,Wx (66),(v1)
41264: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128) 42664: pcmpgtb Pq,Qq | vpcmpgtb Vx,Hx,Wx (66),(v1)
41365: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128) 42765: pcmpgtw Pq,Qq | vpcmpgtw Vx,Hx,Wx (66),(v1)
41466: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128) 42866: pcmpgtd Pq,Qq | vpcmpgtd Vx,Hx,Wx (66),(v1)
41567: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128) 42967: packuswb Pq,Qq | vpackuswb Vx,Hx,Wx (66),(v1)
41668: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128) 43068: punpckhbw Pq,Qd | vpunpckhbw Vx,Hx,Wx (66),(v1)
41769: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128) 43169: punpckhwd Pq,Qd | vpunpckhwd Vx,Hx,Wx (66),(v1)
4186a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128) 4326a: punpckhdq Pq,Qd | vpunpckhdq Vx,Hx,Wx (66),(v1)
4196b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128) 4336b: packssdw Pq,Qd | vpackssdw Vx,Hx,Wx (66),(v1)
4206c: punpcklqdq Vdq,Wdq (66),(VEX),(o128) 4346c: vpunpcklqdq Vx,Hx,Wx (66),(v1)
4216d: punpckhqdq Vdq,Wdq (66),(VEX),(o128) 4356d: vpunpckhqdq Vx,Hx,Wx (66),(v1)
4226e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128) 4366e: movd/q Pd,Ey | vmovd/q Vy,Ey (66),(v1)
4236f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX) 4376f: movq Pq,Qq | vmovdqa Vx,Wx (66) | vmovdqu Vx,Wx (F3)
424# 0x0f 0x70-0x7f 438# 0x0f 0x70-0x7f
42570: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw VdqWdq,Ib (F2),(VEX),(o128) 43970: pshufw Pq,Qq,Ib | vpshufd Vx,Wx,Ib (66),(v1) | vpshufhw Vx,Wx,Ib (F3),(v1) | vpshuflw Vx,Wx,Ib (F2),(v1)
42671: Grp12 (1A) 44071: Grp12 (1A)
42772: Grp13 (1A) 44172: Grp13 (1A)
42873: Grp14 (1A) 44273: Grp14 (1A)
42974: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128) 44374: pcmpeqb Pq,Qq | vpcmpeqb Vx,Hx,Wx (66),(v1)
43075: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128) 44475: pcmpeqw Pq,Qq | vpcmpeqw Vx,Hx,Wx (66),(v1)
43176: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128) 44576: pcmpeqd Pq,Qq | vpcmpeqd Vx,Hx,Wx (66),(v1)
43277: emms/vzeroupper/vzeroall (VEX) 446# Note: Remove (v), because vzeroall and vzeroupper becomes emms without VEX.
43378: VMREAD Ed/q,Gd/q 44777: emms | vzeroupper | vzeroall
43479: VMWRITE Gd/q,Ed/q 44878: VMREAD Ey,Gy
44979: VMWRITE Gy,Ey
4357a: 4507a:
4367b: 4517b:
4377c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX) 4527c: vhaddpd Vpd,Hpd,Wpd (66) | vhaddps Vps,Hps,Wps (F2)
4387d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX) 4537d: vhsubpd Vpd,Hpd,Wpd (66) | vhsubps Vps,Hps,Wps (F2)
4397e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128) 4547e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1)
4407f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX) 4557f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3)
441# 0x0f 0x80-0x8f 456# 0x0f 0x80-0x8f
44280: JO Jz (f64) 45780: JO Jz (f64)
44381: JNO Jz (f64) 45881: JNO Jz (f64)
44482: JB/JNAE/JC Jz (f64) 45982: JB/JC/JNAE Jz (f64)
44583: JNB/JAE/JNC Jz (f64) 46083: JAE/JNB/JNC Jz (f64)
44684: JZ/JE Jz (f64) 46184: JE/JZ Jz (f64)
44785: JNZ/JNE Jz (f64) 46285: JNE/JNZ Jz (f64)
44886: JBE/JNA Jz (f64) 46386: JBE/JNA Jz (f64)
44987: JNBE/JA Jz (f64) 46487: JA/JNBE Jz (f64)
45088: JS Jz (f64) 46588: JS Jz (f64)
45189: JNS Jz (f64) 46689: JNS Jz (f64)
4528a: JP/JPE Jz (f64) 4678a: JP/JPE Jz (f64)
@@ -502,18 +517,18 @@ b8: JMPE | POPCNT Gv,Ev (F3)
502b9: Grp10 (1A) 517b9: Grp10 (1A)
503ba: Grp8 Ev,Ib (1A) 518ba: Grp8 Ev,Ib (1A)
504bb: BTC Ev,Gv 519bb: BTC Ev,Gv
505bc: BSF Gv,Ev 520bc: BSF Gv,Ev | TZCNT Gv,Ev (F3)
506bd: BSR Gv,Ev 521bd: BSR Gv,Ev | LZCNT Gv,Ev (F3)
507be: MOVSX Gv,Eb 522be: MOVSX Gv,Eb
508bf: MOVSX Gv,Ew 523bf: MOVSX Gv,Ew
509# 0x0f 0xc0-0xcf 524# 0x0f 0xc0-0xcf
510c0: XADD Eb,Gb 525c0: XADD Eb,Gb
511c1: XADD Ev,Gv 526c1: XADD Ev,Gv
512c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX) 527c2: vcmpps Vps,Hps,Wps,Ib | vcmppd Vpd,Hpd,Wpd,Ib (66) | vcmpss Vss,Hss,Wss,Ib (F3),(v1) | vcmpsd Vsd,Hsd,Wsd,Ib (F2),(v1)
513c3: movnti Md/q,Gd/q 528c3: movnti My,Gy
514c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128) 529c4: pinsrw Pq,Ry/Mw,Ib | vpinsrw Vdq,Hdq,Ry/Mw,Ib (66),(v1)
515c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128) 530c5: pextrw Gd,Nq,Ib | vpextrw Gd,Udq,Ib (66),(v1)
516c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX) 531c6: vshufps Vps,Hps,Wps,Ib | vshufpd Vpd,Hpd,Wpd,Ib (66)
517c7: Grp9 (1A) 532c7: Grp9 (1A)
518c8: BSWAP RAX/EAX/R8/R8D 533c8: BSWAP RAX/EAX/R8/R8D
519c9: BSWAP RCX/ECX/R9/R9D 534c9: BSWAP RCX/ECX/R9/R9D
@@ -524,55 +539,55 @@ cd: BSWAP RBP/EBP/R13/R13D
524ce: BSWAP RSI/ESI/R14/R14D 539ce: BSWAP RSI/ESI/R14/R14D
525cf: BSWAP RDI/EDI/R15/R15D 540cf: BSWAP RDI/EDI/R15/R15D
526# 0x0f 0xd0-0xdf 541# 0x0f 0xd0-0xdf
527d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX) 542d0: vaddsubpd Vpd,Hpd,Wpd (66) | vaddsubps Vps,Hps,Wps (F2)
528d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128) 543d1: psrlw Pq,Qq | vpsrlw Vx,Hx,Wx (66),(v1)
529d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128) 544d2: psrld Pq,Qq | vpsrld Vx,Hx,Wx (66),(v1)
530d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128) 545d3: psrlq Pq,Qq | vpsrlq Vx,Hx,Wx (66),(v1)
531d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128) 546d4: paddq Pq,Qq | vpaddq Vx,Hx,Wx (66),(v1)
532d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128) 547d5: pmullw Pq,Qq | vpmullw Vx,Hx,Wx (66),(v1)
533d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2) 548d6: vmovq Wq,Vq (66),(v1) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
534d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128) 549d7: pmovmskb Gd,Nq | vpmovmskb Gd,Ux (66),(v1)
535d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128) 550d8: psubusb Pq,Qq | vpsubusb Vx,Hx,Wx (66),(v1)
536d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128) 551d9: psubusw Pq,Qq | vpsubusw Vx,Hx,Wx (66),(v1)
537da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128) 552da: pminub Pq,Qq | vpminub Vx,Hx,Wx (66),(v1)
538db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128) 553db: pand Pq,Qq | vpand Vx,Hx,Wx (66),(v1)
539dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128) 554dc: paddusb Pq,Qq | vpaddusb Vx,Hx,Wx (66),(v1)
540dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128) 555dd: paddusw Pq,Qq | vpaddusw Vx,Hx,Wx (66),(v1)
541de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128) 556de: pmaxub Pq,Qq | vpmaxub Vx,Hx,Wx (66),(v1)
542df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128) 557df: pandn Pq,Qq | vpandn Vx,Hx,Wx (66),(v1)
543# 0x0f 0xe0-0xef 558# 0x0f 0xe0-0xef
544e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128) 559e0: pavgb Pq,Qq | vpavgb Vx,Hx,Wx (66),(v1)
545e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128) 560e1: psraw Pq,Qq | vpsraw Vx,Hx,Wx (66),(v1)
546e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128) 561e2: psrad Pq,Qq | vpsrad Vx,Hx,Wx (66),(v1)
547e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128) 562e3: pavgw Pq,Qq | vpavgw Vx,Hx,Wx (66),(v1)
548e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128) 563e4: pmulhuw Pq,Qq | vpmulhuw Vx,Hx,Wx (66),(v1)
549e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128) 564e5: pmulhw Pq,Qq | vpmulhw Vx,Hx,Wx (66),(v1)
550e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX) 565e6: vcvttpd2dq Vx,Wpd (66) | vcvtdq2pd Vx,Wdq (F3) | vcvtpd2dq Vx,Wpd (F2)
551e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX) 566e7: movntq Mq,Pq | vmovntdq Mx,Vx (66)
552e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128) 567e8: psubsb Pq,Qq | vpsubsb Vx,Hx,Wx (66),(v1)
553e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128) 568e9: psubsw Pq,Qq | vpsubsw Vx,Hx,Wx (66),(v1)
554ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128) 569ea: pminsw Pq,Qq | vpminsw Vx,Hx,Wx (66),(v1)
555eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128) 570eb: por Pq,Qq | vpor Vx,Hx,Wx (66),(v1)
556ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128) 571ec: paddsb Pq,Qq | vpaddsb Vx,Hx,Wx (66),(v1)
557ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128) 572ed: paddsw Pq,Qq | vpaddsw Vx,Hx,Wx (66),(v1)
558ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128) 573ee: pmaxsw Pq,Qq | vpmaxsw Vx,Hx,Wx (66),(v1)
559ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128) 574ef: pxor Pq,Qq | vpxor Vx,Hx,Wx (66),(v1)
560# 0x0f 0xf0-0xff 575# 0x0f 0xf0-0xff
561f0: lddqu Vdq,Mdq (F2),(VEX) 576f0: vlddqu Vx,Mx (F2)
562f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128) 577f1: psllw Pq,Qq | vpsllw Vx,Hx,Wx (66),(v1)
563f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128) 578f2: pslld Pq,Qq | vpslld Vx,Hx,Wx (66),(v1)
564f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128) 579f3: psllq Pq,Qq | vpsllq Vx,Hx,Wx (66),(v1)
565f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128) 580f4: pmuludq Pq,Qq | vpmuludq Vx,Hx,Wx (66),(v1)
566f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128) 581f5: pmaddwd Pq,Qq | vpmaddwd Vx,Hx,Wx (66),(v1)
567f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128) 582f6: psadbw Pq,Qq | vpsadbw Vx,Hx,Wx (66),(v1)
568f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128) 583f7: maskmovq Pq,Nq | vmaskmovdqu Vx,Ux (66),(v1)
569f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128) 584f8: psubb Pq,Qq | vpsubb Vx,Hx,Wx (66),(v1)
570f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128) 585f9: psubw Pq,Qq | vpsubw Vx,Hx,Wx (66),(v1)
571fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128) 586fa: psubd Pq,Qq | vpsubd Vx,Hx,Wx (66),(v1)
572fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128) 587fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1)
573fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128) 588fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1)
574fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128) 589fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1)
575fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128) 590fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1)
576ff: 591ff:
577EndTable 592EndTable
578 593
@@ -580,155 +595,193 @@ Table: 3-byte opcode 1 (0x0f 0x38)
580Referrer: 3-byte escape 1 595Referrer: 3-byte escape 1
581AVXcode: 2 596AVXcode: 2
582# 0x0f 0x38 0x00-0x0f 597# 0x0f 0x38 0x00-0x0f
58300: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128) 59800: pshufb Pq,Qq | vpshufb Vx,Hx,Wx (66),(v1)
58401: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128) 59901: phaddw Pq,Qq | vphaddw Vx,Hx,Wx (66),(v1)
58502: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128) 60002: phaddd Pq,Qq | vphaddd Vx,Hx,Wx (66),(v1)
58603: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128) 60103: phaddsw Pq,Qq | vphaddsw Vx,Hx,Wx (66),(v1)
58704: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128) 60204: pmaddubsw Pq,Qq | vpmaddubsw Vx,Hx,Wx (66),(v1)
58805: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128) 60305: phsubw Pq,Qq | vphsubw Vx,Hx,Wx (66),(v1)
58906: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128) 60406: phsubd Pq,Qq | vphsubd Vx,Hx,Wx (66),(v1)
59007: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128) 60507: phsubsw Pq,Qq | vphsubsw Vx,Hx,Wx (66),(v1)
59108: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128) 60608: psignb Pq,Qq | vpsignb Vx,Hx,Wx (66),(v1)
59209: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128) 60709: psignw Pq,Qq | vpsignw Vx,Hx,Wx (66),(v1)
5930a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128) 6080a: psignd Pq,Qq | vpsignd Vx,Hx,Wx (66),(v1)
5940b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128) 6090b: pmulhrsw Pq,Qq | vpmulhrsw Vx,Hx,Wx (66),(v1)
5950c: Vpermilps /r (66),(oVEX) 6100c: vpermilps Vx,Hx,Wx (66),(v)
5960d: Vpermilpd /r (66),(oVEX) 6110d: vpermilpd Vx,Hx,Wx (66),(v)
5970e: vtestps /r (66),(oVEX) 6120e: vtestps Vx,Wx (66),(v)
5980f: vtestpd /r (66),(oVEX) 6130f: vtestpd Vx,Wx (66),(v)
599# 0x0f 0x38 0x10-0x1f 614# 0x0f 0x38 0x10-0x1f
60010: pblendvb Vdq,Wdq (66) 61510: pblendvb Vdq,Wdq (66)
60111: 61611:
60212: 61712:
60313: 61813: vcvtph2ps Vx,Wx,Ib (66),(v)
60414: blendvps Vdq,Wdq (66) 61914: blendvps Vdq,Wdq (66)
60515: blendvpd Vdq,Wdq (66) 62015: blendvpd Vdq,Wdq (66)
60616: 62116: vpermps Vqq,Hqq,Wqq (66),(v)
60717: ptest Vdq,Wdq (66),(VEX) 62217: vptest Vx,Wx (66)
60818: vbroadcastss /r (66),(oVEX) 62318: vbroadcastss Vx,Wd (66),(v)
60919: vbroadcastsd /r (66),(oVEX),(o256) 62419: vbroadcastsd Vqq,Wq (66),(v)
6101a: vbroadcastf128 /r (66),(oVEX),(o256) 6251a: vbroadcastf128 Vqq,Mdq (66),(v)
6111b: 6261b:
6121c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128) 6271c: pabsb Pq,Qq | vpabsb Vx,Wx (66),(v1)
6131d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128) 6281d: pabsw Pq,Qq | vpabsw Vx,Wx (66),(v1)
6141e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128) 6291e: pabsd Pq,Qq | vpabsd Vx,Wx (66),(v1)
6151f: 6301f:
616# 0x0f 0x38 0x20-0x2f 631# 0x0f 0x38 0x20-0x2f
61720: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128) 63220: vpmovsxbw Vx,Ux/Mq (66),(v1)
61821: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128) 63321: vpmovsxbd Vx,Ux/Md (66),(v1)
61922: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128) 63422: vpmovsxbq Vx,Ux/Mw (66),(v1)
62023: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128) 63523: vpmovsxwd Vx,Ux/Mq (66),(v1)
62124: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128) 63624: vpmovsxwq Vx,Ux/Md (66),(v1)
62225: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128) 63725: vpmovsxdq Vx,Ux/Mq (66),(v1)
62326: 63826:
62427: 63927:
62528: pmuldq Vdq,Wdq (66),(VEX),(o128) 64028: vpmuldq Vx,Hx,Wx (66),(v1)
62629: pcmpeqq Vdq,Wdq (66),(VEX),(o128) 64129: vpcmpeqq Vx,Hx,Wx (66),(v1)
6272a: movntdqa Vdq,Mdq (66),(VEX),(o128) 6422a: vmovntdqa Vx,Mx (66),(v1)
6282b: packusdw Vdq,Wdq (66),(VEX),(o128) 6432b: vpackusdw Vx,Hx,Wx (66),(v1)
6292c: vmaskmovps(ld) /r (66),(oVEX) 6442c: vmaskmovps Vx,Hx,Mx (66),(v)
6302d: vmaskmovpd(ld) /r (66),(oVEX) 6452d: vmaskmovpd Vx,Hx,Mx (66),(v)
6312e: vmaskmovps(st) /r (66),(oVEX) 6462e: vmaskmovps Mx,Hx,Vx (66),(v)
6322f: vmaskmovpd(st) /r (66),(oVEX) 6472f: vmaskmovpd Mx,Hx,Vx (66),(v)
633# 0x0f 0x38 0x30-0x3f 648# 0x0f 0x38 0x30-0x3f
63430: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128) 64930: vpmovzxbw Vx,Ux/Mq (66),(v1)
63531: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128) 65031: vpmovzxbd Vx,Ux/Md (66),(v1)
63632: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128) 65132: vpmovzxbq Vx,Ux/Mw (66),(v1)
63733: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128) 65233: vpmovzxwd Vx,Ux/Mq (66),(v1)
63834: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128) 65334: vpmovzxwq Vx,Ux/Md (66),(v1)
63935: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128) 65435: vpmovzxdq Vx,Ux/Mq (66),(v1)
64036: 65536: vpermd Vqq,Hqq,Wqq (66),(v)
64137: pcmpgtq Vdq,Wdq (66),(VEX),(o128) 65637: vpcmpgtq Vx,Hx,Wx (66),(v1)
64238: pminsb Vdq,Wdq (66),(VEX),(o128) 65738: vpminsb Vx,Hx,Wx (66),(v1)
64339: pminsd Vdq,Wdq (66),(VEX),(o128) 65839: vpminsd Vx,Hx,Wx (66),(v1)
6443a: pminuw Vdq,Wdq (66),(VEX),(o128) 6593a: vpminuw Vx,Hx,Wx (66),(v1)
6453b: pminud Vdq,Wdq (66),(VEX),(o128) 6603b: vpminud Vx,Hx,Wx (66),(v1)
6463c: pmaxsb Vdq,Wdq (66),(VEX),(o128) 6613c: vpmaxsb Vx,Hx,Wx (66),(v1)
6473d: pmaxsd Vdq,Wdq (66),(VEX),(o128) 6623d: vpmaxsd Vx,Hx,Wx (66),(v1)
6483e: pmaxuw Vdq,Wdq (66),(VEX),(o128) 6633e: vpmaxuw Vx,Hx,Wx (66),(v1)
6493f: pmaxud Vdq,Wdq (66),(VEX),(o128) 6643f: vpmaxud Vx,Hx,Wx (66),(v1)
650# 0x0f 0x38 0x40-0x8f 665# 0x0f 0x38 0x40-0x8f
65140: pmulld Vdq,Wdq (66),(VEX),(o128) 66640: vpmulld Vx,Hx,Wx (66),(v1)
65241: phminposuw Vdq,Wdq (66),(VEX),(o128) 66741: vphminposuw Vdq,Wdq (66),(v1)
65380: INVEPT Gd/q,Mdq (66) 66842:
65481: INVPID Gd/q,Mdq (66) 66943:
67044:
67145: vpsrlvd/q Vx,Hx,Wx (66),(v)
67246: vpsravd Vx,Hx,Wx (66),(v)
67347: vpsllvd/q Vx,Hx,Wx (66),(v)
674# Skip 0x48-0x57
67558: vpbroadcastd Vx,Wx (66),(v)
67659: vpbroadcastq Vx,Wx (66),(v)
6775a: vbroadcasti128 Vqq,Mdq (66),(v)
678# Skip 0x5b-0x77
67978: vpbroadcastb Vx,Wx (66),(v)
68079: vpbroadcastw Vx,Wx (66),(v)
681# Skip 0x7a-0x7f
68280: INVEPT Gy,Mdq (66)
68381: INVPID Gy,Mdq (66)
68482: INVPCID Gy,Mdq (66)
6858c: vpmaskmovd/q Vx,Hx,Mx (66),(v)
6868e: vpmaskmovd/q Mx,Vx,Hx (66),(v)
655# 0x0f 0x38 0x90-0xbf (FMA) 687# 0x0f 0x38 0x90-0xbf (FMA)
65696: vfmaddsub132pd/ps /r (66),(VEX) 68890: vgatherdd/q Vx,Hx,Wx (66),(v)
65797: vfmsubadd132pd/ps /r (66),(VEX) 68991: vgatherqd/q Vx,Hx,Wx (66),(v)
65898: vfmadd132pd/ps /r (66),(VEX) 69092: vgatherdps/d Vx,Hx,Wx (66),(v)
65999: vfmadd132sd/ss /r (66),(VEX),(o128) 69193: vgatherqps/d Vx,Hx,Wx (66),(v)
6609a: vfmsub132pd/ps /r (66),(VEX) 69294:
6619b: vfmsub132sd/ss /r (66),(VEX),(o128) 69395:
6629c: vfnmadd132pd/ps /r (66),(VEX) 69496: vfmaddsub132ps/d Vx,Hx,Wx (66),(v)
6639d: vfnmadd132sd/ss /r (66),(VEX),(o128) 69597: vfmsubadd132ps/d Vx,Hx,Wx (66),(v)
6649e: vfnmsub132pd/ps /r (66),(VEX) 69698: vfmadd132ps/d Vx,Hx,Wx (66),(v)
6659f: vfnmsub132sd/ss /r (66),(VEX),(o128) 69799: vfmadd132ss/d Vx,Hx,Wx (66),(v),(v1)
666a6: vfmaddsub213pd/ps /r (66),(VEX) 6989a: vfmsub132ps/d Vx,Hx,Wx (66),(v)
667a7: vfmsubadd213pd/ps /r (66),(VEX) 6999b: vfmsub132ss/d Vx,Hx,Wx (66),(v),(v1)
668a8: vfmadd213pd/ps /r (66),(VEX) 7009c: vfnmadd132ps/d Vx,Hx,Wx (66),(v)
669a9: vfmadd213sd/ss /r (66),(VEX),(o128) 7019d: vfnmadd132ss/d Vx,Hx,Wx (66),(v),(v1)
670aa: vfmsub213pd/ps /r (66),(VEX) 7029e: vfnmsub132ps/d Vx,Hx,Wx (66),(v)
671ab: vfmsub213sd/ss /r (66),(VEX),(o128) 7039f: vfnmsub132ss/d Vx,Hx,Wx (66),(v),(v1)
672ac: vfnmadd213pd/ps /r (66),(VEX) 704a6: vfmaddsub213ps/d Vx,Hx,Wx (66),(v)
673ad: vfnmadd213sd/ss /r (66),(VEX),(o128) 705a7: vfmsubadd213ps/d Vx,Hx,Wx (66),(v)
674ae: vfnmsub213pd/ps /r (66),(VEX) 706a8: vfmadd213ps/d Vx,Hx,Wx (66),(v)
675af: vfnmsub213sd/ss /r (66),(VEX),(o128) 707a9: vfmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
676b6: vfmaddsub231pd/ps /r (66),(VEX) 708aa: vfmsub213ps/d Vx,Hx,Wx (66),(v)
677b7: vfmsubadd231pd/ps /r (66),(VEX) 709ab: vfmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
678b8: vfmadd231pd/ps /r (66),(VEX) 710ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v)
679b9: vfmadd231sd/ss /r (66),(VEX),(o128) 711ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
680ba: vfmsub231pd/ps /r (66),(VEX) 712ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v)
681bb: vfmsub231sd/ss /r (66),(VEX),(o128) 713af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
682bc: vfnmadd231pd/ps /r (66),(VEX) 714b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v)
683bd: vfnmadd231sd/ss /r (66),(VEX),(o128) 715b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v)
684be: vfnmsub231pd/ps /r (66),(VEX) 716b8: vfmadd231ps/d Vx,Hx,Wx (66),(v)
685bf: vfnmsub231sd/ss /r (66),(VEX),(o128) 717b9: vfmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
718ba: vfmsub231ps/d Vx,Hx,Wx (66),(v)
719bb: vfmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
720bc: vfnmadd231ps/d Vx,Hx,Wx (66),(v)
721bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
722be: vfnmsub231ps/d Vx,Hx,Wx (66),(v)
723bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
686# 0x0f 0x38 0xc0-0xff 724# 0x0f 0x38 0xc0-0xff
687db: aesimc Vdq,Wdq (66),(VEX),(o128) 725db: VAESIMC Vdq,Wdq (66),(v1)
688dc: aesenc Vdq,Wdq (66),(VEX),(o128) 726dc: VAESENC Vdq,Hdq,Wdq (66),(v1)
689dd: aesenclast Vdq,Wdq (66),(VEX),(o128) 727dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1)
690de: aesdec Vdq,Wdq (66),(VEX),(o128) 728de: VAESDEC Vdq,Hdq,Wdq (66),(v1)
691df: aesdeclast Vdq,Wdq (66),(VEX),(o128) 729df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1)
692f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2) 730f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2)
693f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2) 731f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2)
732f3: ANDN Gy,By,Ey (v)
733f4: Grp17 (1A)
734f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
735f6: MULX By,Gy,rDX,Ey (F2),(v)
736f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
694EndTable 737EndTable
695 738
696Table: 3-byte opcode 2 (0x0f 0x3a) 739Table: 3-byte opcode 2 (0x0f 0x3a)
697Referrer: 3-byte escape 2 740Referrer: 3-byte escape 2
698AVXcode: 3 741AVXcode: 3
699# 0x0f 0x3a 0x00-0xff 742# 0x0f 0x3a 0x00-0xff
70004: vpermilps /r,Ib (66),(oVEX) 74300: vpermq Vqq,Wqq,Ib (66),(v)
70105: vpermilpd /r,Ib (66),(oVEX) 74401: vpermpd Vqq,Wqq,Ib (66),(v)
70206: vperm2f128 /r,Ib (66),(oVEX),(o256) 74502: vpblendd Vx,Hx,Wx,Ib (66),(v)
70308: roundps Vdq,Wdq,Ib (66),(VEX) 74603:
70409: roundpd Vdq,Wdq,Ib (66),(VEX) 74704: vpermilps Vx,Wx,Ib (66),(v)
7050a: roundss Vss,Wss,Ib (66),(VEX),(o128) 74805: vpermilpd Vx,Wx,Ib (66),(v)
7060b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128) 74906: vperm2f128 Vqq,Hqq,Wqq,Ib (66),(v)
7070c: blendps Vdq,Wdq,Ib (66),(VEX) 75007:
7080d: blendpd Vdq,Wdq,Ib (66),(VEX) 75108: vroundps Vx,Wx,Ib (66)
7090e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128) 75209: vroundpd Vx,Wx,Ib (66)
7100f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128) 7530a: vroundss Vss,Wss,Ib (66),(v1)
71114: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128) 7540b: vroundsd Vsd,Wsd,Ib (66),(v1)
71215: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128) 7550c: vblendps Vx,Hx,Wx,Ib (66)
71316: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128) 7560d: vblendpd Vx,Hx,Wx,Ib (66)
71417: extractps Ed,Vdq,Ib (66),(VEX),(o128) 7570e: vpblendw Vx,Hx,Wx,Ib (66),(v1)
71518: vinsertf128 /r,Ib (66),(oVEX),(o256) 7580f: palignr Pq,Qq,Ib | vpalignr Vx,Hx,Wx,Ib (66),(v1)
71619: vextractf128 /r,Ib (66),(oVEX),(o256) 75914: vpextrb Rd/Mb,Vdq,Ib (66),(v1)
71720: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128) 76015: vpextrw Rd/Mw,Vdq,Ib (66),(v1)
71821: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128) 76116: vpextrd/q Ey,Vdq,Ib (66),(v1)
71922: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128) 76217: vextractps Ed,Vdq,Ib (66),(v1)
72040: dpps Vdq,Wdq,Ib (66),(VEX) 76318: vinsertf128 Vqq,Hqq,Wqq,Ib (66),(v)
72141: dppd Vdq,Wdq,Ib (66),(VEX),(o128) 76419: vextractf128 Wdq,Vqq,Ib (66),(v)
72242: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128) 7651d: vcvtps2ph Wx,Vx,Ib (66),(v)
72344: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128) 76620: vpinsrb Vdq,Hdq,Ry/Mb,Ib (66),(v1)
7244a: vblendvps /r,Ib (66),(oVEX) 76721: vinsertps Vdq,Hdq,Udq/Md,Ib (66),(v1)
7254b: vblendvpd /r,Ib (66),(oVEX) 76822: vpinsrd/q Vdq,Hdq,Ey,Ib (66),(v1)
7264c: vpblendvb /r,Ib (66),(oVEX),(o128) 76938: vinserti128 Vqq,Hqq,Wqq,Ib (66),(v)
72760: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128) 77039: vextracti128 Wdq,Vqq,Ib (66),(v)
72861: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128) 77140: vdpps Vx,Hx,Wx,Ib (66)
72962: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128) 77241: vdppd Vdq,Hdq,Wdq,Ib (66),(v1)
73063: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128) 77342: vmpsadbw Vx,Hx,Wx,Ib (66),(v1)
731df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128) 77444: vpclmulqdq Vdq,Hdq,Wdq,Ib (66),(v1)
77546: vperm2i128 Vqq,Hqq,Wqq,Ib (66),(v)
7764a: vblendvps Vx,Hx,Wx,Lx (66),(v)
7774b: vblendvpd Vx,Hx,Wx,Lx (66),(v)
7784c: vpblendvb Vx,Hx,Wx,Lx (66),(v1)
77960: vpcmpestrm Vdq,Wdq,Ib (66),(v1)
78061: vpcmpestri Vdq,Wdq,Ib (66),(v1)
78162: vpcmpistrm Vdq,Wdq,Ib (66),(v1)
78263: vpcmpistri Vdq,Wdq,Ib (66),(v1)
783df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1)
784f0: RORX Gy,Ey,Ib (F2),(v)
732EndTable 785EndTable
733 786
734GrpTable: Grp1 787GrpTable: Grp1
@@ -790,7 +843,7 @@ GrpTable: Grp5
7902: CALLN Ev (f64) 8432: CALLN Ev (f64)
7913: CALLF Ep 8443: CALLF Ep
7924: JMPN Ev (f64) 8454: JMPN Ev (f64)
7935: JMPF Ep 8465: JMPF Mp
7946: PUSH Ev (d64) 8476: PUSH Ev (d64)
7957: 8487:
796EndTable 849EndTable
@@ -807,7 +860,7 @@ EndTable
807GrpTable: Grp7 860GrpTable: Grp7
8080: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) 8610: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
8091: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) 8621: SIDT Ms | MONITOR (000),(11B) | MWAIT (001)
8102: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) 8632: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B)
8113: LIDT Ms 8643: LIDT Ms
8124: SMSW Mw/Rv 8654: SMSW Mw/Rv
8135: 8665:
@@ -824,44 +877,45 @@ EndTable
824 877
825GrpTable: Grp9 878GrpTable: Grp9
8261: CMPXCHG8B/16B Mq/Mdq 8791: CMPXCHG8B/16B Mq/Mdq
8276: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) 8806: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B)
8287: VMPTRST Mq 8817: VMPTRST Mq | VMPTRST Mq (F3)
829EndTable 882EndTable
830 883
831GrpTable: Grp10 884GrpTable: Grp10
832EndTable 885EndTable
833 886
834GrpTable: Grp11 887GrpTable: Grp11
888# Note: the operands are given by group opcode
8350: MOV 8890: MOV
836EndTable 890EndTable
837 891
838GrpTable: Grp12 892GrpTable: Grp12
8392: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128) 8932: psrlw Nq,Ib (11B) | vpsrlw Hx,Ux,Ib (66),(11B),(v1)
8404: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128) 8944: psraw Nq,Ib (11B) | vpsraw Hx,Ux,Ib (66),(11B),(v1)
8416: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128) 8956: psllw Nq,Ib (11B) | vpsllw Hx,Ux,Ib (66),(11B),(v1)
842EndTable 896EndTable
843 897
844GrpTable: Grp13 898GrpTable: Grp13
8452: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128) 8992: psrld Nq,Ib (11B) | vpsrld Hx,Ux,Ib (66),(11B),(v1)
8464: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128) 9004: psrad Nq,Ib (11B) | vpsrad Hx,Ux,Ib (66),(11B),(v1)
8476: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128) 9016: pslld Nq,Ib (11B) | vpslld Hx,Ux,Ib (66),(11B),(v1)
848EndTable 902EndTable
849 903
850GrpTable: Grp14 904GrpTable: Grp14
8512: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128) 9052: psrlq Nq,Ib (11B) | vpsrlq Hx,Ux,Ib (66),(11B),(v1)
8523: psrldq Udq,Ib (66),(11B),(VEX),(o128) 9063: vpsrldq Hx,Ux,Ib (66),(11B),(v1)
8536: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128) 9076: psllq Nq,Ib (11B) | vpsllq Hx,Ux,Ib (66),(11B),(v1)
8547: pslldq Udq,Ib (66),(11B),(VEX),(o128) 9087: vpslldq Hx,Ux,Ib (66),(11B),(v1)
855EndTable 909EndTable
856 910
857GrpTable: Grp15 911GrpTable: Grp15
8580: fxsave 9120: fxsave | RDFSBASE Ry (F3),(11B)
8591: fxstor 9131: fxstor | RDGSBASE Ry (F3),(11B)
8602: ldmxcsr (VEX) 9142: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
8613: stmxcsr (VEX) 9153: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
8624: XSAVE 9164: XSAVE
8635: XRSTOR | lfence (11B) 9175: XRSTOR | lfence (11B)
8646: mfence (11B) 9186: XSAVEOPT | mfence (11B)
8657: clflush | sfence (11B) 9197: clflush | sfence (11B)
866EndTable 920EndTable
867 921
@@ -872,6 +926,12 @@ GrpTable: Grp16
8723: prefetch T2 9263: prefetch T2
873EndTable 927EndTable
874 928
929GrpTable: Grp17
9301: BLSR By,Ey (v)
9312: BLSMSK By,Ey (v)
9323: BLSI By,Ey (v)
933EndTable
934
875# AMD's Prefetch Group 935# AMD's Prefetch Group
876GrpTable: GrpP 936GrpTable: GrpP
8770: PREFETCH 9370: PREFETCH
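
As a reading aid for the new map syntax, take the packed-add row from the 0x0f table above:

    58: vaddps Vps,Hps,Wps | vaddpd Vpd,Hpd,Wpd (66) | vaddss Vss,Hss,Wss (F3),(v1) | vaddsd Vsd,Hsd,Wsd (F2),(v1)

The alternatives separated by | are selected by the last legacy prefix: none gives vaddps, (66) vaddpd, (F3) vaddss, (F2) vaddsd. V and W are the usual ModRM register and register-or-memory operands, H is the extra source register taken from VEX.vvvv (present only in VEX-encoded forms), and (v1) marks the scalar variants as accepting only 128-bit VEX, per the legend introduced at the top of the file.
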
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 3d11327c9ab4..23d8e5fecf76 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -27,6 +27,4 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
27obj-$(CONFIG_ACPI_NUMA) += srat.o 27obj-$(CONFIG_ACPI_NUMA) += srat.o
28obj-$(CONFIG_NUMA_EMU) += numa_emulation.o 28obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
29 29
30obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
31
32obj-$(CONFIG_MEMTEST) += memtest.o 30obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index d0474ad2a6e5..1fb85dbe390a 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -25,7 +25,7 @@ int fixup_exception(struct pt_regs *regs)
25 if (fixup) { 25 if (fixup) {
26 /* If fixup is less than 16, it means uaccess error */ 26 /* If fixup is less than 16, it means uaccess error */
27 if (fixup->fixup < 16) { 27 if (fixup->fixup < 16) {
28 current_thread_info()->uaccess_err = -EFAULT; 28 current_thread_info()->uaccess_err = 1;
29 regs->ip += fixup->fixup; 29 regs->ip += fixup->fixup;
30 return 1; 30 return 1;
31 } 31 }
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 5db0490deb07..9d74824a708d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -626,7 +626,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
626 626
627static noinline void 627static noinline void
628no_context(struct pt_regs *regs, unsigned long error_code, 628no_context(struct pt_regs *regs, unsigned long error_code,
629 unsigned long address) 629 unsigned long address, int signal, int si_code)
630{ 630{
631 struct task_struct *tsk = current; 631 struct task_struct *tsk = current;
632 unsigned long *stackend; 632 unsigned long *stackend;
@@ -634,8 +634,17 @@ no_context(struct pt_regs *regs, unsigned long error_code,
634 int sig; 634 int sig;
635 635
636 /* Are we prepared to handle this kernel fault? */ 636 /* Are we prepared to handle this kernel fault? */
637 if (fixup_exception(regs)) 637 if (fixup_exception(regs)) {
638 if (current_thread_info()->sig_on_uaccess_error && signal) {
639 tsk->thread.trap_no = 14;
640 tsk->thread.error_code = error_code | PF_USER;
641 tsk->thread.cr2 = address;
642
643 /* XXX: hwpoison faults will set the wrong code. */
644 force_sig_info_fault(signal, si_code, address, tsk, 0);
645 }
638 return; 646 return;
647 }
639 648
640 /* 649 /*
641 * 32-bit: 650 * 32-bit:
@@ -755,7 +764,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
755 if (is_f00f_bug(regs, address)) 764 if (is_f00f_bug(regs, address))
756 return; 765 return;
757 766
758 no_context(regs, error_code, address); 767 no_context(regs, error_code, address, SIGSEGV, si_code);
759} 768}
760 769
761static noinline void 770static noinline void
@@ -819,7 +828,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
819 828
820 /* Kernel mode? Handle exceptions or die: */ 829 /* Kernel mode? Handle exceptions or die: */
821 if (!(error_code & PF_USER)) { 830 if (!(error_code & PF_USER)) {
822 no_context(regs, error_code, address); 831 no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
823 return; 832 return;
824 } 833 }
825 834
@@ -854,7 +863,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
854 if (!(fault & VM_FAULT_RETRY)) 863 if (!(fault & VM_FAULT_RETRY))
855 up_read(&current->mm->mmap_sem); 864 up_read(&current->mm->mmap_sem);
856 if (!(error_code & PF_USER)) 865 if (!(error_code & PF_USER))
857 no_context(regs, error_code, address); 866 no_context(regs, error_code, address, 0, 0);
858 return 1; 867 return 1;
859 } 868 }
860 if (!(fault & VM_FAULT_ERROR)) 869 if (!(fault & VM_FAULT_ERROR))
@@ -864,7 +873,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
864 /* Kernel mode? Handle exceptions or die: */ 873 /* Kernel mode? Handle exceptions or die: */
865 if (!(error_code & PF_USER)) { 874 if (!(error_code & PF_USER)) {
866 up_read(&current->mm->mmap_sem); 875 up_read(&current->mm->mmap_sem);
867 no_context(regs, error_code, address); 876 no_context(regs, error_code, address,
877 SIGSEGV, SEGV_MAPERR);
868 return 1; 878 return 1;
869 } 879 }
870 880
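
The fault.c change threads the would-be signal through no_context(): fixable kernel faults normally resume silently after fixup_exception(), but when the current thread has opted in via sig_on_uaccess_error (as the vsyscall emulation above does), the fixup path now also queues the signal and si_code the fault would have produced, with 0 meaning "no signal, retry path". A mock of that control flow:

    #include <signal.h>
    #include <stdio.h>

    static int sig_on_uaccess_error;       /* per-thread flag in the kernel */

    static int fixup_exception(void) { return 1; }  /* pretend fixup exists */

    static void no_context(int signal, int si_code)
    {
        if (fixup_exception()) {
            if (sig_on_uaccess_error && signal) {
                /* kernel: force_sig_info_fault(signal, si_code, ...) */
                printf("queue signal %d (si_code %d)\n", signal, si_code);
            }
            return;                        /* fixed up; resume the kernel */
        }
        /* otherwise: oops/die */
    }

    int main(void)
    {
        no_context(SIGSEGV, SEGV_MAPERR);  /* flag clear: silent fixup */
        sig_on_uaccess_error = 1;
        no_context(SIGSEGV, SEGV_MAPERR);  /* flag set: signal delivered */
        return 0;
    }
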
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 87488b93a65c..a298914058f9 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -67,7 +67,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
67 good_end = max_pfn_mapped << PAGE_SHIFT; 67 good_end = max_pfn_mapped << PAGE_SHIFT;
68 68
69 base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); 69 base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
70 if (base == MEMBLOCK_ERROR) 70 if (!base)
71 panic("Cannot find space for the kernel page tables"); 71 panic("Cannot find space for the kernel page tables");
72 72
73 pgt_buf_start = base >> PAGE_SHIFT; 73 pgt_buf_start = base >> PAGE_SHIFT;
@@ -80,7 +80,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
80 80
81void __init native_pagetable_reserve(u64 start, u64 end) 81void __init native_pagetable_reserve(u64 start, u64 end)
82{ 82{
83 memblock_x86_reserve_range(start, end, "PGTABLE"); 83 memblock_reserve(start, end - start);
84} 84}
85 85
86struct map_range { 86struct map_range {
@@ -279,8 +279,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
279 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) 279 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
280 * so that they can be reused for other purposes. 280 * so that they can be reused for other purposes.
281 * 281 *
282 * On native it just means calling memblock_x86_reserve_range, on Xen it 282 * On native it just means calling memblock_reserve, on Xen it also
283 * also means marking RW the pagetable pages that we allocated before 283 * means marking RW the pagetable pages that we allocated before
284 * but that haven't been used. 284 * but that haven't been used.
285 * 285 *
286 * In fact on xen we mark RO the whole range pgt_buf_start - 286 * In fact on xen we mark RO the whole range pgt_buf_start -
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 29f7c6d98179..0c1da394a634 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -427,23 +427,17 @@ static void __init add_one_highpage_init(struct page *page)
427void __init add_highpages_with_active_regions(int nid, 427void __init add_highpages_with_active_regions(int nid,
428 unsigned long start_pfn, unsigned long end_pfn) 428 unsigned long start_pfn, unsigned long end_pfn)
429{ 429{
430 struct range *range; 430 phys_addr_t start, end;
431 int nr_range; 431 u64 i;
432 int i; 432
433 433 for_each_free_mem_range(i, nid, &start, &end, NULL) {
434 nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn); 434 unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
435 435 start_pfn, end_pfn);
436 for (i = 0; i < nr_range; i++) { 436 unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
437 struct page *page; 437 start_pfn, end_pfn);
438 int node_pfn; 438 for ( ; pfn < e_pfn; pfn++)
439 439 if (pfn_valid(pfn))
440 for (node_pfn = range[i].start; node_pfn < range[i].end; 440 add_one_highpage_init(pfn_to_page(pfn));
441 node_pfn++) {
442 if (!pfn_valid(node_pfn))
443 continue;
444 page = pfn_to_page(node_pfn);
445 add_one_highpage_init(page);
446 }
447 } 441 }
448} 442}
449#else 443#else
@@ -650,18 +644,18 @@ void __init initmem_init(void)
650 highstart_pfn = highend_pfn = max_pfn; 644 highstart_pfn = highend_pfn = max_pfn;
651 if (max_pfn > max_low_pfn) 645 if (max_pfn > max_low_pfn)
652 highstart_pfn = max_low_pfn; 646 highstart_pfn = max_low_pfn;
653 memblock_x86_register_active_regions(0, 0, highend_pfn);
654 sparse_memory_present_with_active_regions(0);
655 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 647 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
656 pages_to_mb(highend_pfn - highstart_pfn)); 648 pages_to_mb(highend_pfn - highstart_pfn));
657 num_physpages = highend_pfn; 649 num_physpages = highend_pfn;
658 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 650 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
659#else 651#else
660 memblock_x86_register_active_regions(0, 0, max_low_pfn);
661 sparse_memory_present_with_active_regions(0);
662 num_physpages = max_low_pfn; 652 num_physpages = max_low_pfn;
663 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; 653 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
664#endif 654#endif
655
656 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
657 sparse_memory_present_with_active_regions(0);
658
665#ifdef CONFIG_FLATMEM 659#ifdef CONFIG_FLATMEM
666 max_mapnr = num_physpages; 660 max_mapnr = num_physpages;
667#endif 661#endif
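
The rewritten highmem walk clips each free memblock range to the requested PFN window: PFN_UP rounds the range base up and PFN_DOWN rounds its end down, so only whole pages fully inside the free range are registered, and the clamp keeps the walk within [start_pfn, end_pfn). A runnable sketch of the arithmetic; clamp_ul stands in for the kernel's clamp_t and the sample addresses are invented:

    #include <stdio.h>

    #define PAGE_SHIFT  12
    #define PFN_UP(x)   (((x) + (1ul << PAGE_SHIFT) - 1) >> PAGE_SHIFT)
    #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

    static unsigned long clamp_ul(unsigned long v, unsigned long lo,
                                  unsigned long hi)
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    int main(void)
    {
        unsigned long start = 0x100800, end = 0x105000;   /* free range, bytes */
        unsigned long start_pfn = 0x102, end_pfn = 0x104; /* highmem window */

        unsigned long pfn   = clamp_ul(PFN_UP(start), start_pfn, end_pfn);
        unsigned long e_pfn = clamp_ul(PFN_DOWN(end), start_pfn, end_pfn);

        for (; pfn < e_pfn; pfn++)                 /* pages 0x102 and 0x103 */
            printf("add_one_highpage_init(pfn %#lx)\n", pfn);
        return 0;
    }
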
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bbaaa005bf0e..a8a56ce3a962 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -608,7 +608,7 @@ kernel_physical_mapping_init(unsigned long start,
608#ifndef CONFIG_NUMA 608#ifndef CONFIG_NUMA
609void __init initmem_init(void) 609void __init initmem_init(void)
610{ 610{
611 memblock_x86_register_active_regions(0, 0, max_pfn); 611 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
612} 612}
613#endif 613#endif
614 614
diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c
deleted file mode 100644
index 992da5ec5a64..000000000000
--- a/arch/x86/mm/memblock.c
+++ /dev/null
@@ -1,348 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/types.h>
3#include <linux/init.h>
4#include <linux/bitops.h>
5#include <linux/memblock.h>
6#include <linux/bootmem.h>
7#include <linux/mm.h>
8#include <linux/range.h>
9
10/* Check for already reserved areas */
11bool __init memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align)
12{
13 struct memblock_region *r;
14 u64 addr = *addrp, last;
15 u64 size = *sizep;
16 bool changed = false;
17
18again:
19 last = addr + size;
20 for_each_memblock(reserved, r) {
21 if (last > r->base && addr < r->base) {
22 size = r->base - addr;
23 changed = true;
24 goto again;
25 }
26 if (last > (r->base + r->size) && addr < (r->base + r->size)) {
27 addr = round_up(r->base + r->size, align);
28 size = last - addr;
29 changed = true;
30 goto again;
31 }
32 if (last <= (r->base + r->size) && addr >= r->base) {
33 *sizep = 0;
34 return false;
35 }
36 }
37 if (changed) {
38 *addrp = addr;
39 *sizep = size;
40 }
41 return changed;
42}
43
44/*
45 * Find next free range after start, and size is returned in *sizep
46 */
47u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align)
48{
49 struct memblock_region *r;
50
51 for_each_memblock(memory, r) {
52 u64 ei_start = r->base;
53 u64 ei_last = ei_start + r->size;
54 u64 addr;
55
56 addr = round_up(ei_start, align);
57 if (addr < start)
58 addr = round_up(start, align);
59 if (addr >= ei_last)
60 continue;
61 *sizep = ei_last - addr;
62 while (memblock_x86_check_reserved_size(&addr, sizep, align))
63 ;
64
65 if (*sizep)
66 return addr;
67 }
68
69 return MEMBLOCK_ERROR;
70}
71
72static __init struct range *find_range_array(int count)
73{
74 u64 end, size, mem;
75 struct range *range;
76
77 size = sizeof(struct range) * count;
78 end = memblock.current_limit;
79
80 mem = memblock_find_in_range(0, end, size, sizeof(struct range));
81 if (mem == MEMBLOCK_ERROR)
82 panic("can not find more space for range array");
83
84 /*
85 * This range is temporary, so don't reserve it, it will not be
86 * overlapped because we will not allocate a new buffer before
87 * we discard this one
88 */
89 range = __va(mem);
90 memset(range, 0, size);
91
92 return range;
93}
94
95static void __init memblock_x86_subtract_reserved(struct range *range, int az)
96{
97 u64 final_start, final_end;
98 struct memblock_region *r;
99
100 /* Take out the region array itself first */
101 memblock_free_reserved_regions();
102
103 memblock_dbg("Subtract (%ld early reservations)\n", memblock.reserved.cnt);
104
105 for_each_memblock(reserved, r) {
106 memblock_dbg(" [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1);
107 final_start = PFN_DOWN(r->base);
108 final_end = PFN_UP(r->base + r->size);
109 if (final_start >= final_end)
110 continue;
111 subtract_range(range, az, final_start, final_end);
112 }
113
114 /* Put region array back ? */
115 memblock_reserve_reserved_regions();
116}
117
118struct count_data {
119 int nr;
120};
121
122static int __init count_work_fn(unsigned long start_pfn,
123 unsigned long end_pfn, void *datax)
124{
125 struct count_data *data = datax;
126
127 data->nr++;
128
129 return 0;
130}
131
132static int __init count_early_node_map(int nodeid)
133{
134 struct count_data data;
135
136 data.nr = 0;
137 work_with_active_regions(nodeid, count_work_fn, &data);
138
139 return data.nr;
140}
141
142int __init __get_free_all_memory_range(struct range **rangep, int nodeid,
143 unsigned long start_pfn, unsigned long end_pfn)
144{
145 int count;
146 struct range *range;
147 int nr_range;
148
149 count = (memblock.reserved.cnt + count_early_node_map(nodeid)) * 2;
150
151 range = find_range_array(count);
152 nr_range = 0;
153
154 /*
155 * Use early_node_map[] and memblock.reserved.region to get range array
156 * at first
157 */
158 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
159 subtract_range(range, count, 0, start_pfn);
160 subtract_range(range, count, end_pfn, -1ULL);
161
162 memblock_x86_subtract_reserved(range, count);
163 nr_range = clean_sort_range(range, count);
164
165 *rangep = range;
166 return nr_range;
167}
168
169int __init get_free_all_memory_range(struct range **rangep, int nodeid)
170{
171 unsigned long end_pfn = -1UL;
172
173#ifdef CONFIG_X86_32
174 end_pfn = max_low_pfn;
175#endif
176 return __get_free_all_memory_range(rangep, nodeid, 0, end_pfn);
177}
178
179static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free)
180{
181 int i, count;
182 struct range *range;
183 int nr_range;
184 u64 final_start, final_end;
185 u64 free_size;
186 struct memblock_region *r;
187
188 count = (memblock.reserved.cnt + memblock.memory.cnt) * 2;
189
190 range = find_range_array(count);
191 nr_range = 0;
192
193 addr = PFN_UP(addr);
194 limit = PFN_DOWN(limit);
195
196 for_each_memblock(memory, r) {
197 final_start = PFN_UP(r->base);
198 final_end = PFN_DOWN(r->base + r->size);
199 if (final_start >= final_end)
200 continue;
201 if (final_start >= limit || final_end <= addr)
202 continue;
203
204 nr_range = add_range(range, count, nr_range, final_start, final_end);
205 }
206 subtract_range(range, count, 0, addr);
207 subtract_range(range, count, limit, -1ULL);
208
209 /* Subtract memblock.reserved.region in range ? */
210 if (!get_free)
211 goto sort_and_count_them;
212 for_each_memblock(reserved, r) {
213 final_start = PFN_DOWN(r->base);
214 final_end = PFN_UP(r->base + r->size);
215 if (final_start >= final_end)
216 continue;
217 if (final_start >= limit || final_end <= addr)
218 continue;
219
220 subtract_range(range, count, final_start, final_end);
221 }
222
223sort_and_count_them:
224 nr_range = clean_sort_range(range, count);
225
226 free_size = 0;
227 for (i = 0; i < nr_range; i++)
228 free_size += range[i].end - range[i].start;
229
230 return free_size << PAGE_SHIFT;
231}
232
233u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit)
234{
235 return __memblock_x86_memory_in_range(addr, limit, true);
236}
237
238u64 __init memblock_x86_memory_in_range(u64 addr, u64 limit)
239{
240 return __memblock_x86_memory_in_range(addr, limit, false);
241}
242
243void __init memblock_x86_reserve_range(u64 start, u64 end, char *name)
244{
245 if (start == end)
246 return;
247
248 if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx)\n", start, end))
249 return;
250
251 memblock_dbg(" memblock_x86_reserve_range: [%#010llx-%#010llx] %16s\n", start, end - 1, name);
252
253 memblock_reserve(start, end - start);
254}
255
256void __init memblock_x86_free_range(u64 start, u64 end)
257{
258 if (start == end)
259 return;
260
261 if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx)\n", start, end))
262 return;
263
264 memblock_dbg(" memblock_x86_free_range: [%#010llx-%#010llx]\n", start, end - 1);
265
266 memblock_free(start, end - start);
267}
268
269/*
270 * Need to call this function after memblock_x86_register_active_regions,
271 * so early_node_map[] is filled already.
272 */
273u64 __init memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align)
274{
275 u64 addr;
276 addr = find_memory_core_early(nid, size, align, start, end);
277 if (addr != MEMBLOCK_ERROR)
278 return addr;
279
280 /* Fallback, should already have start end within node range */
281 return memblock_find_in_range(start, end, size, align);
282}
283
284/*
285 * Finds an active region in the address range from start_pfn to last_pfn and
286 * returns its range in ei_startpfn and ei_endpfn for the memblock entry.
287 */
288static int __init memblock_x86_find_active_region(const struct memblock_region *ei,
289 unsigned long start_pfn,
290 unsigned long last_pfn,
291 unsigned long *ei_startpfn,
292 unsigned long *ei_endpfn)
293{
294 u64 align = PAGE_SIZE;
295
296 *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT;
297 *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT;
298
299 /* Skip map entries smaller than a page */
300 if (*ei_startpfn >= *ei_endpfn)
301 return 0;
302
303 /* Skip if map is outside the node */
304 if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn)
305 return 0;
306
307 /* Check for overlaps */
308 if (*ei_startpfn < start_pfn)
309 *ei_startpfn = start_pfn;
310 if (*ei_endpfn > last_pfn)
311 *ei_endpfn = last_pfn;
312
313 return 1;
314}
315
316/* Walk the memblock.memory map and register active regions within a node */
317void __init memblock_x86_register_active_regions(int nid, unsigned long start_pfn,
318 unsigned long last_pfn)
319{
320 unsigned long ei_startpfn;
321 unsigned long ei_endpfn;
322 struct memblock_region *r;
323
324 for_each_memblock(memory, r)
325 if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
326 &ei_startpfn, &ei_endpfn))
327 add_active_range(nid, ei_startpfn, ei_endpfn);
328}
329
330/*
331 * Find the hole size (in bytes) in the memory range.
332 * @start: starting address of the memory range to scan
333 * @end: ending address of the memory range to scan
334 */
335u64 __init memblock_x86_hole_size(u64 start, u64 end)
336{
337 unsigned long start_pfn = start >> PAGE_SHIFT;
338 unsigned long last_pfn = end >> PAGE_SHIFT;
339 unsigned long ei_startpfn, ei_endpfn, ram = 0;
340 struct memblock_region *r;
341
342 for_each_memblock(memory, r)
343 if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
344 &ei_startpfn, &ei_endpfn))
345 ram += ei_endpfn - ei_startpfn;
346
347 return end - start - ((u64)ram << PAGE_SHIFT);
348}
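The iterator helpers deleted above are superseded by the generic memblock API; the memtest.c hunk below shows the canonical conversion. As a rough sketch, not part of this commit, a caller that used to loop with memblock_x86_find_in_range_size() would now walk free ranges like this (process_range() is a hypothetical stand-in for the per-range work):

	u64 i;
	phys_addr_t this_start, this_end;

	/* walk free (memory && !reserved) ranges, clipped to [start, end) */
	for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) {
		this_start = clamp_t(phys_addr_t, this_start, start, end);
		this_end = clamp_t(phys_addr_t, this_end, start, end);
		if (this_start < this_end)
			process_range(this_start, this_end);
	}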
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 92faf3a1c53e..c80b9fb95734 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -34,7 +34,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
34 (unsigned long long) pattern, 34 (unsigned long long) pattern,
35 (unsigned long long) start_bad, 35 (unsigned long long) start_bad,
36 (unsigned long long) end_bad); 36 (unsigned long long) end_bad);
37 memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM"); 37 memblock_reserve(start_bad, end_bad - start_bad);
38} 38}
39 39
40static void __init memtest(u64 pattern, u64 start_phys, u64 size) 40static void __init memtest(u64 pattern, u64 start_phys, u64 size)
@@ -70,24 +70,19 @@ static void __init memtest(u64 pattern, u64 start_phys, u64 size)
70 70
71static void __init do_one_pass(u64 pattern, u64 start, u64 end) 71static void __init do_one_pass(u64 pattern, u64 start, u64 end)
72{ 72{
73 u64 size = 0; 73 u64 i;
74 74 phys_addr_t this_start, this_end;
75 while (start < end) { 75
76 start = memblock_x86_find_in_range_size(start, &size, 1); 76 for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) {
77 77 this_start = clamp_t(phys_addr_t, this_start, start, end);
78 /* done ? */ 78 this_end = clamp_t(phys_addr_t, this_end, start, end);
79 if (start >= end) 79 if (this_start < this_end) {
80 break; 80 printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
81 if (start + size > end) 81 (unsigned long long)this_start,
82 size = end - start; 82 (unsigned long long)this_end,
83 83 (unsigned long long)cpu_to_be64(pattern));
84 printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", 84 memtest(pattern, this_start, this_end - this_start);
85 (unsigned long long) start, 85 }
86 (unsigned long long) start + size,
87 (unsigned long long) cpu_to_be64(pattern));
88 memtest(pattern, start, size);
89
90 start += size;
91 } 86 }
92} 87}
93 88
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index fbeaaf416610..496f494593bf 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -192,8 +192,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
192/* Initialize NODE_DATA for a node on the local memory */ 192/* Initialize NODE_DATA for a node on the local memory */
193static void __init setup_node_data(int nid, u64 start, u64 end) 193static void __init setup_node_data(int nid, u64 start, u64 end)
194{ 194{
195 const u64 nd_low = PFN_PHYS(MAX_DMA_PFN);
196 const u64 nd_high = PFN_PHYS(max_pfn_mapped);
197 const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); 195 const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
198 bool remapped = false; 196 bool remapped = false;
199 u64 nd_pa; 197 u64 nd_pa;
@@ -224,17 +222,12 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
224 nd_pa = __pa(nd); 222 nd_pa = __pa(nd);
225 remapped = true; 223 remapped = true;
226 } else { 224 } else {
227 nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, 225 nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
228 nd_size, SMP_CACHE_BYTES); 226 if (!nd_pa) {
229 if (nd_pa == MEMBLOCK_ERROR)
230 nd_pa = memblock_find_in_range(nd_low, nd_high,
231 nd_size, SMP_CACHE_BYTES);
232 if (nd_pa == MEMBLOCK_ERROR) {
233 pr_err("Cannot find %zu bytes in node %d\n", 227 pr_err("Cannot find %zu bytes in node %d\n",
234 nd_size, nid); 228 nd_size, nid);
235 return; 229 return;
236 } 230 }
237 memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA");
238 nd = __va(nd_pa); 231 nd = __va(nd_pa);
239 } 232 }
240 233
@@ -371,8 +364,7 @@ void __init numa_reset_distance(void)
371 364
372 /* numa_distance could be 1LU marking allocation failure, test cnt */ 365 /* numa_distance could be 1LU marking allocation failure, test cnt */
373 if (numa_distance_cnt) 366 if (numa_distance_cnt)
374 memblock_x86_free_range(__pa(numa_distance), 367 memblock_free(__pa(numa_distance), size);
375 __pa(numa_distance) + size);
376 numa_distance_cnt = 0; 368 numa_distance_cnt = 0;
377 numa_distance = NULL; /* enable table creation */ 369 numa_distance = NULL; /* enable table creation */
378} 370}
@@ -395,13 +387,13 @@ static int __init numa_alloc_distance(void)
395 387
396 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), 388 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
397 size, PAGE_SIZE); 389 size, PAGE_SIZE);
398 if (phys == MEMBLOCK_ERROR) { 390 if (!phys) {
399 pr_warning("NUMA: Warning: can't allocate distance table!\n"); 391 pr_warning("NUMA: Warning: can't allocate distance table!\n");
400 /* don't retry until explicitly reset */ 392 /* don't retry until explicitly reset */
401 numa_distance = (void *)1LU; 393 numa_distance = (void *)1LU;
402 return -ENOMEM; 394 return -ENOMEM;
403 } 395 }
404 memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); 396 memblock_reserve(phys, size);
405 397
406 numa_distance = __va(phys); 398 numa_distance = __va(phys);
407 numa_distance_cnt = cnt; 399 numa_distance_cnt = cnt;
@@ -482,8 +474,8 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
482 numaram = 0; 474 numaram = 0;
483 } 475 }
484 476
485 e820ram = max_pfn - (memblock_x86_hole_size(0, 477 e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
486 PFN_PHYS(max_pfn)) >> PAGE_SHIFT); 478
487 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ 479 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
488 if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { 480 if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
489 printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n", 481 printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
@@ -505,13 +497,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
505 if (WARN_ON(nodes_empty(node_possible_map))) 497 if (WARN_ON(nodes_empty(node_possible_map)))
506 return -EINVAL; 498 return -EINVAL;
507 499
508 for (i = 0; i < mi->nr_blks; i++) 500 for (i = 0; i < mi->nr_blks; i++) {
509 memblock_x86_register_active_regions(mi->blk[i].nid, 501 struct numa_memblk *mb = &mi->blk[i];
510 mi->blk[i].start >> PAGE_SHIFT, 502 memblock_set_node(mb->start, mb->end - mb->start, mb->nid);
511 mi->blk[i].end >> PAGE_SHIFT); 503 }
512
513 /* for out of order entries */
514 sort_node_map();
515 504
516 /* 505 /*
517 * If sections array is gonna be used for pfn -> nid mapping, check 506 * If sections array is gonna be used for pfn -> nid mapping, check
@@ -545,6 +534,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
545 setup_node_data(nid, start, end); 534 setup_node_data(nid, start, end);
546 } 535 }
547 536
537 /* Dump memblock with node info and return. */
538 memblock_dump_all();
548 return 0; 539 return 0;
549} 540}
550 541
@@ -582,7 +573,7 @@ static int __init numa_init(int (*init_func)(void))
582 nodes_clear(node_possible_map); 573 nodes_clear(node_possible_map);
583 nodes_clear(node_online_map); 574 nodes_clear(node_online_map);
584 memset(&numa_meminfo, 0, sizeof(numa_meminfo)); 575 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
585 remove_all_active_ranges(); 576 WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
586 numa_reset_distance(); 577 numa_reset_distance();
587 578
588 ret = init_func(); 579 ret = init_func();
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 3adebe7e536a..534255a36b6b 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -199,23 +199,23 @@ void __init init_alloc_remap(int nid, u64 start, u64 end)
199 199
200 /* allocate node memory and the lowmem remap area */ 200 /* allocate node memory and the lowmem remap area */
201 node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES); 201 node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
202 if (node_pa == MEMBLOCK_ERROR) { 202 if (!node_pa) {
203 pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", 203 pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
204 size, nid); 204 size, nid);
205 return; 205 return;
206 } 206 }
207 memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); 207 memblock_reserve(node_pa, size);
208 208
209 remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, 209 remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
210 max_low_pfn << PAGE_SHIFT, 210 max_low_pfn << PAGE_SHIFT,
211 size, LARGE_PAGE_BYTES); 211 size, LARGE_PAGE_BYTES);
212 if (remap_pa == MEMBLOCK_ERROR) { 212 if (!remap_pa) {
213 pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", 213 pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
214 size, nid); 214 size, nid);
215 memblock_x86_free_range(node_pa, node_pa + size); 215 memblock_free(node_pa, size);
216 return; 216 return;
217 } 217 }
218 memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG"); 218 memblock_reserve(remap_pa, size);
219 remap_va = phys_to_virt(remap_pa); 219 remap_va = phys_to_virt(remap_pa);
220 220
221 /* perform actual remap */ 221 /* perform actual remap */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index dd27f401f0a0..92e27119ee1a 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -19,7 +19,7 @@ unsigned long __init numa_free_all_bootmem(void)
19 for_each_online_node(i) 19 for_each_online_node(i)
20 pages += free_all_bootmem_node(NODE_DATA(i)); 20 pages += free_all_bootmem_node(NODE_DATA(i));
21 21
22 pages += free_all_memory_core_early(MAX_NUMNODES); 22 pages += free_low_memory_core_early(MAX_NUMNODES);
23 23
24 return pages; 24 return pages;
25} 25}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index d0ed086b6247..46db56845f18 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -28,6 +28,16 @@ static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
28 return -ENOENT; 28 return -ENOENT;
29} 29}
30 30
31static u64 mem_hole_size(u64 start, u64 end)
32{
33 unsigned long start_pfn = PFN_UP(start);
34 unsigned long end_pfn = PFN_DOWN(end);
35
36 if (start_pfn < end_pfn)
37 return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
38 return 0;
39}
40
31/* 41/*
32 * Sets up nid to range from @start to @end. The return value is -errno if 42 * Sets up nid to range from @start to @end. The return value is -errno if
33 * something went wrong, 0 otherwise. 43 * something went wrong, 0 otherwise.
@@ -89,7 +99,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
89 * Calculate target node size. x86_32 freaks on __udivdi3() so do 99 * Calculate target node size. x86_32 freaks on __udivdi3() so do
90 * the division in ulong number of pages and convert back. 100 * the division in ulong number of pages and convert back.
91 */ 101 */
92 size = max_addr - addr - memblock_x86_hole_size(addr, max_addr); 102 size = max_addr - addr - mem_hole_size(addr, max_addr);
93 size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); 103 size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
94 104
95 /* 105 /*
@@ -135,8 +145,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
135 * Continue to add memory to this fake node if its 145 * Continue to add memory to this fake node if its
136 * non-reserved memory is less than the per-node size. 146 * non-reserved memory is less than the per-node size.
137 */ 147 */
138 while (end - start - 148 while (end - start - mem_hole_size(start, end) < size) {
139 memblock_x86_hole_size(start, end) < size) {
140 end += FAKE_NODE_MIN_SIZE; 149 end += FAKE_NODE_MIN_SIZE;
141 if (end > limit) { 150 if (end > limit) {
142 end = limit; 151 end = limit;
@@ -150,7 +159,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
150 * this one must extend to the boundary. 159 * this one must extend to the boundary.
151 */ 160 */
152 if (end < dma32_end && dma32_end - end - 161 if (end < dma32_end && dma32_end - end -
153 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) 162 mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
154 end = dma32_end; 163 end = dma32_end;
155 164
156 /* 165 /*
@@ -158,8 +167,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
158 * next node, this one must extend to the end of the 167 * next node, this one must extend to the end of the
159 * physical node. 168 * physical node.
160 */ 169 */
161 if (limit - end - 170 if (limit - end - mem_hole_size(end, limit) < size)
162 memblock_x86_hole_size(end, limit) < size)
163 end = limit; 171 end = limit;
164 172
165 ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, 173 ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
@@ -180,7 +188,7 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
180{ 188{
181 u64 end = start + size; 189 u64 end = start + size;
182 190
183 while (end - start - memblock_x86_hole_size(start, end) < size) { 191 while (end - start - mem_hole_size(start, end) < size) {
184 end += FAKE_NODE_MIN_SIZE; 192 end += FAKE_NODE_MIN_SIZE;
185 if (end > max_addr) { 193 if (end > max_addr) {
186 end = max_addr; 194 end = max_addr;
@@ -211,8 +219,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
211 * creates a uniform distribution of node sizes across the entire 219 * creates a uniform distribution of node sizes across the entire
212 * machine (but not necessarily over physical nodes). 220 * machine (but not necessarily over physical nodes).
213 */ 221 */
214 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / 222 min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES;
215 MAX_NUMNODES;
216 min_size = max(min_size, FAKE_NODE_MIN_SIZE); 223 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
217 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) 224 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
218 min_size = (min_size + FAKE_NODE_MIN_SIZE) & 225 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
@@ -252,7 +259,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
252 * this one must extend to the boundary. 259 * this one must extend to the boundary.
253 */ 260 */
254 if (end < dma32_end && dma32_end - end - 261 if (end < dma32_end && dma32_end - end -
255 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) 262 mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
256 end = dma32_end; 263 end = dma32_end;
257 264
258 /* 265 /*
@@ -260,8 +267,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
260 * next node, this one must extend to the end of the 267 * next node, this one must extend to the end of the
261 * physical node. 268 * physical node.
262 */ 269 */
263 if (limit - end - 270 if (limit - end - mem_hole_size(end, limit) < size)
264 memblock_x86_hole_size(end, limit) < size)
265 end = limit; 271 end = limit;
266 272
267 ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, 273 ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
@@ -351,11 +357,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
351 357
352 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), 358 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
353 phys_size, PAGE_SIZE); 359 phys_size, PAGE_SIZE);
354 if (phys == MEMBLOCK_ERROR) { 360 if (!phys) {
355 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); 361 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
356 goto no_emu; 362 goto no_emu;
357 } 363 }
358 memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST"); 364 memblock_reserve(phys, phys_size);
359 phys_dist = __va(phys); 365 phys_dist = __va(phys);
360 366
361 for (i = 0; i < numa_dist_cnt; i++) 367 for (i = 0; i < numa_dist_cnt; i++)
@@ -424,7 +430,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
424 430
425 /* free the copied physical distance table */ 431 /* free the copied physical distance table */
426 if (phys_dist) 432 if (phys_dist)
427 memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size); 433 memblock_free(__pa(phys_dist), phys_size);
428 return; 434 return;
429 435
430no_emu: 436no_emu:
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index f9e526742fa1..eda2acbb6e81 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -998,7 +998,7 @@ out_err:
998} 998}
999EXPORT_SYMBOL(set_memory_uc); 999EXPORT_SYMBOL(set_memory_uc);
1000 1000
1001int _set_memory_array(unsigned long *addr, int addrinarray, 1001static int _set_memory_array(unsigned long *addr, int addrinarray,
1002 unsigned long new_type) 1002 unsigned long new_type)
1003{ 1003{
1004 int i, j; 1004 int i, j;
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 81dbfdeb080d..fd61b3fb7341 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -69,6 +69,12 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
69 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) 69 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
70 return; 70 return;
71 pxm = pa->proximity_domain; 71 pxm = pa->proximity_domain;
72 apic_id = pa->apic_id;
73 if (!cpu_has_x2apic && (apic_id >= 0xff)) {
74 printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n",
75 pxm, apic_id);
76 return;
77 }
72 node = setup_node(pxm); 78 node = setup_node(pxm);
73 if (node < 0) { 79 if (node < 0) {
74 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); 80 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
@@ -76,7 +82,6 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
76 return; 82 return;
77 } 83 }
78 84
79 apic_id = pa->apic_id;
80 if (apic_id >= MAX_LOCAL_APIC) { 85 if (apic_id >= MAX_LOCAL_APIC) {
81 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); 86 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
82 return; 87 return;
diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile
index 446902b2a6b6..1599f568f0e2 100644
--- a/arch/x86/oprofile/Makefile
+++ b/arch/x86/oprofile/Makefile
@@ -4,9 +4,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
4 oprof.o cpu_buffer.o buffer_sync.o \ 4 oprof.o cpu_buffer.o buffer_sync.o \
5 event_buffer.o oprofile_files.o \ 5 event_buffer.o oprofile_files.o \
6 oprofilefs.o oprofile_stats.o \ 6 oprofilefs.o oprofile_stats.o \
7 timer_int.o ) 7 timer_int.o nmi_timer_int.o )
8 8
9oprofile-y := $(DRIVER_OBJS) init.o backtrace.o 9oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
10oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \ 10oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \
11 op_model_ppro.o op_model_p4.o 11 op_model_ppro.o op_model_p4.o
12oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
diff --git a/arch/x86/oprofile/init.c b/arch/x86/oprofile/init.c
index f148cf652678..9e138d00ad36 100644
--- a/arch/x86/oprofile/init.c
+++ b/arch/x86/oprofile/init.c
@@ -16,37 +16,23 @@
16 * with the NMI mode driver. 16 * with the NMI mode driver.
17 */ 17 */
18 18
19#ifdef CONFIG_X86_LOCAL_APIC
19extern int op_nmi_init(struct oprofile_operations *ops); 20extern int op_nmi_init(struct oprofile_operations *ops);
20extern int op_nmi_timer_init(struct oprofile_operations *ops);
21extern void op_nmi_exit(void); 21extern void op_nmi_exit(void);
22extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); 22#else
23static int op_nmi_init(struct oprofile_operations *ops) { return -ENODEV; }
24static void op_nmi_exit(void) { }
25#endif
23 26
24static int nmi_timer; 27extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth);
25 28
26int __init oprofile_arch_init(struct oprofile_operations *ops) 29int __init oprofile_arch_init(struct oprofile_operations *ops)
27{ 30{
28 int ret;
29
30 ret = -ENODEV;
31
32#ifdef CONFIG_X86_LOCAL_APIC
33 ret = op_nmi_init(ops);
34#endif
35 nmi_timer = (ret != 0);
36#ifdef CONFIG_X86_IO_APIC
37 if (nmi_timer)
38 ret = op_nmi_timer_init(ops);
39#endif
40 ops->backtrace = x86_backtrace; 31 ops->backtrace = x86_backtrace;
41 32 return op_nmi_init(ops);
42 return ret;
43} 33}
44 34
45
46void oprofile_arch_exit(void) 35void oprofile_arch_exit(void)
47{ 36{
48#ifdef CONFIG_X86_LOCAL_APIC 37 op_nmi_exit();
49 if (!nmi_timer)
50 op_nmi_exit();
51#endif
52} 38}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 75f9528e0372..26b8a8514ee5 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -595,24 +595,36 @@ static int __init p4_init(char **cpu_type)
595 return 0; 595 return 0;
596} 596}
597 597
598static int force_arch_perfmon; 598enum __force_cpu_type {
599static int force_cpu_type(const char *str, struct kernel_param *kp) 599 reserved = 0, /* do not force */
600 timer,
601 arch_perfmon,
602};
603
604static int force_cpu_type;
605
606static int set_cpu_type(const char *str, struct kernel_param *kp)
600{ 607{
601 if (!strcmp(str, "arch_perfmon")) { 608 if (!strcmp(str, "timer")) {
602 force_arch_perfmon = 1; 609 force_cpu_type = timer;
610 printk(KERN_INFO "oprofile: forcing NMI timer mode\n");
611 } else if (!strcmp(str, "arch_perfmon")) {
612 force_cpu_type = arch_perfmon;
603 printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); 613 printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
614 } else {
615 force_cpu_type = 0;
604 } 616 }
605 617
606 return 0; 618 return 0;
607} 619}
608module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0); 620module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0);
609 621
610static int __init ppro_init(char **cpu_type) 622static int __init ppro_init(char **cpu_type)
611{ 623{
612 __u8 cpu_model = boot_cpu_data.x86_model; 624 __u8 cpu_model = boot_cpu_data.x86_model;
613 struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ 625 struct op_x86_model_spec *spec = &op_ppro_spec; /* default */
614 626
615 if (force_arch_perfmon && cpu_has_arch_perfmon) 627 if (force_cpu_type == arch_perfmon && cpu_has_arch_perfmon)
616 return 0; 628 return 0;
617 629
618 /* 630 /*
@@ -679,6 +691,9 @@ int __init op_nmi_init(struct oprofile_operations *ops)
679 if (!cpu_has_apic) 691 if (!cpu_has_apic)
680 return -ENODEV; 692 return -ENODEV;
681 693
694 if (force_cpu_type == timer)
695 return -ENODEV;
696
682 switch (vendor) { 697 switch (vendor) {
683 case X86_VENDOR_AMD: 698 case X86_VENDOR_AMD:
684 /* Needs to be at least an Athlon (or hammer in 32bit mode) */ 699 /* Needs to be at least an Athlon (or hammer in 32bit mode) */
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
deleted file mode 100644
index 7f8052cd6620..000000000000
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ /dev/null
@@ -1,50 +0,0 @@
1/**
2 * @file nmi_timer_int.c
3 *
4 * @remark Copyright 2003 OProfile authors
5 * @remark Read the file COPYING
6 *
7 * @author Zwane Mwaikambo <zwane@linuxpower.ca>
8 */
9
10#include <linux/init.h>
11#include <linux/smp.h>
12#include <linux/errno.h>
13#include <linux/oprofile.h>
14#include <linux/rcupdate.h>
15#include <linux/kdebug.h>
16
17#include <asm/nmi.h>
18#include <asm/apic.h>
19#include <asm/ptrace.h>
20
21static int profile_timer_exceptions_notify(unsigned int val, struct pt_regs *regs)
22{
23 oprofile_add_sample(regs, 0);
24 return NMI_HANDLED;
25}
26
27static int timer_start(void)
28{
29 if (register_nmi_handler(NMI_LOCAL, profile_timer_exceptions_notify,
30 0, "oprofile-timer"))
31 return 1;
32 return 0;
33}
34
35
36static void timer_stop(void)
37{
38 unregister_nmi_handler(NMI_LOCAL, "oprofile-timer");
39 synchronize_sched(); /* Allow already-started NMIs to complete. */
40}
41
42
43int __init op_nmi_timer_init(struct oprofile_operations *ops)
44{
45 ops->start = timer_start;
46 ops->stop = timer_stop;
47 ops->cpu_type = "timer";
48 printk(KERN_INFO "oprofile: using NMI timer interrupt.\n");
49 return 0;
50}
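With the x86-only NMI timer driver removed, the Makefile hunk above links the generic drivers/oprofile/nmi_timer_int.o into every configuration, and the fallback decision moves out of oprofile_arch_init() into the oprofile core. Going by the new set_cpu_type() handler in nmi_int.c, timer mode can still be forced by hand; plausible invocations, assuming the usual module-parameter syntax, are:

	# oprofile built in: force NMI timer mode on the kernel command line
	oprofile.cpu_type=timer

	# oprofile built as a module: same effect at load time
	modprobe oprofile cpu_type=timer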
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 37718f0f053d..4cf9bd0a1653 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -238,7 +238,8 @@ static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
238 238
239 spin_lock_irqsave(&rtc_lock, flags); 239 spin_lock_irqsave(&rtc_lock, flags);
240 efi_call_phys_prelog(); 240 efi_call_phys_prelog();
241 status = efi_call_phys2(efi_phys.get_time, tm, tc); 241 status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm),
242 virt_to_phys(tc));
242 efi_call_phys_epilog(); 243 efi_call_phys_epilog();
243 spin_unlock_irqrestore(&rtc_lock, flags); 244 spin_unlock_irqrestore(&rtc_lock, flags);
244 return status; 245 return status;
@@ -352,8 +353,7 @@ void __init efi_memblock_x86_reserve_range(void)
352 boot_params.efi_info.efi_memdesc_size; 353 boot_params.efi_info.efi_memdesc_size;
353 memmap.desc_version = boot_params.efi_info.efi_memdesc_version; 354 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
354 memmap.desc_size = boot_params.efi_info.efi_memdesc_size; 355 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
355 memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size, 356 memblock_reserve(pmap, memmap.nr_map * memmap.desc_size);
356 "EFI memmap");
357} 357}
358 358
359#if EFI_DEBUG 359#if EFI_DEBUG
@@ -397,16 +397,14 @@ void __init efi_reserve_boot_services(void)
397 if ((start+size >= virt_to_phys(_text) 397 if ((start+size >= virt_to_phys(_text)
398 && start <= virt_to_phys(_end)) || 398 && start <= virt_to_phys(_end)) ||
399 !e820_all_mapped(start, start+size, E820_RAM) || 399 !e820_all_mapped(start, start+size, E820_RAM) ||
400 memblock_x86_check_reserved_size(&start, &size, 400 memblock_is_region_reserved(start, size)) {
401 1<<EFI_PAGE_SHIFT)) {
402 /* Could not reserve, skip it */ 401 /* Could not reserve, skip it */
403 md->num_pages = 0; 402 md->num_pages = 0;
404 memblock_dbg(PFX "Could not reserve boot range " 403 memblock_dbg(PFX "Could not reserve boot range "
405 "[0x%010llx-0x%010llx]\n", 404 "[0x%010llx-0x%010llx]\n",
406 start, start+size-1); 405 start, start+size-1);
407 } else 406 } else
408 memblock_x86_reserve_range(start, start+size, 407 memblock_reserve(start, size);
409 "EFI Boot");
410 } 408 }
411} 409}
412 410
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
index f82082677337..d511aa97533a 100644
--- a/arch/x86/tools/Makefile
+++ b/arch/x86/tools/Makefile
@@ -18,14 +18,21 @@ chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk
18quiet_cmd_posttest = TEST $@ 18quiet_cmd_posttest = TEST $@
19 cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose) 19 cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose)
20 20
21posttest: $(obj)/test_get_len vmlinux 21quiet_cmd_sanitytest = TEST $@
22 cmd_sanitytest = $(obj)/insn_sanity $(posttest_64bit) -m 1000000
23
24posttest: $(obj)/test_get_len vmlinux $(obj)/insn_sanity
22 $(call cmd,posttest) 25 $(call cmd,posttest)
26 $(call cmd,sanitytest)
23 27
24hostprogs-y := test_get_len 28hostprogs-y += test_get_len insn_sanity
25 29
26# -I needed for generated C source and C source which is in the kernel tree. 30# -I needed for generated C source and C source which is in the kernel tree.
27HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ 31HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
28 32
33HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
34
29# Dependencies are also needed. 35# Dependencies are also needed.
30$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c 36$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
31 37
38$(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
index eaf11f52fc0b..5f6a5b6c3a15 100644
--- a/arch/x86/tools/gen-insn-attr-x86.awk
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -47,7 +47,7 @@ BEGIN {
47 sep_expr = "^\\|$" 47 sep_expr = "^\\|$"
48 group_expr = "^Grp[0-9A-Za-z]+" 48 group_expr = "^Grp[0-9A-Za-z]+"
49 49
50 imm_expr = "^[IJAO][a-z]" 50 imm_expr = "^[IJAOL][a-z]"
51 imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" 51 imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
52 imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" 52 imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
53 imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)" 53 imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)"
@@ -59,6 +59,7 @@ BEGIN {
59 imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)" 59 imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)"
60 imm_flag["Ob"] = "INAT_MOFFSET" 60 imm_flag["Ob"] = "INAT_MOFFSET"
61 imm_flag["Ov"] = "INAT_MOFFSET" 61 imm_flag["Ov"] = "INAT_MOFFSET"
62 imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
62 63
63 modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" 64 modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])"
64 force64_expr = "\\([df]64\\)" 65 force64_expr = "\\([df]64\\)"
@@ -70,8 +71,12 @@ BEGIN {
70 lprefix3_expr = "\\(F2\\)" 71 lprefix3_expr = "\\(F2\\)"
71 max_lprefix = 4 72 max_lprefix = 4
72 73
73 vexok_expr = "\\(VEX\\)" 74 # All opcodes starting with lower-case 'v' or with (v1) superscript
74 vexonly_expr = "\\(oVEX\\)" 75 # accept a VEX prefix
76 vexok_opcode_expr = "^v.*"
77 vexok_expr = "\\(v1\\)"
78 # All opcodes with (v) superscript support *only* the VEX prefix
79 vexonly_expr = "\\(v\\)"
75 80
76 prefix_expr = "\\(Prefix\\)" 81 prefix_expr = "\\(Prefix\\)"
77 prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" 82 prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
@@ -85,8 +90,8 @@ BEGIN {
85 prefix_num["SEG=GS"] = "INAT_PFX_GS" 90 prefix_num["SEG=GS"] = "INAT_PFX_GS"
86 prefix_num["SEG=SS"] = "INAT_PFX_SS" 91 prefix_num["SEG=SS"] = "INAT_PFX_SS"
87 prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ" 92 prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ"
88 prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2" 93 prefix_num["VEX+1byte"] = "INAT_PFX_VEX2"
89 prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3" 94 prefix_num["VEX+2byte"] = "INAT_PFX_VEX3"
90 95
91 clear_vars() 96 clear_vars()
92} 97}
@@ -310,12 +315,10 @@ function convert_operands(count,opnd, i,j,imm,mod)
310 if (match(opcode, fpu_expr)) 315 if (match(opcode, fpu_expr))
311 flags = add_flags(flags, "INAT_MODRM") 316 flags = add_flags(flags, "INAT_MODRM")
312 317
313 # check VEX only code 318 # check VEX codes
314 if (match(ext, vexonly_expr)) 319 if (match(ext, vexonly_expr))
315 flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") 320 flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY")
316 321 else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr))
317 # check VEX only code
318 if (match(ext, vexok_expr))
319 flags = add_flags(flags, "INAT_VEXOK") 322 flags = add_flags(flags, "INAT_VEXOK")
320 323
321 # check prefixes 324 # check prefixes
diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c
new file mode 100644
index 000000000000..cc2f8c131286
--- /dev/null
+++ b/arch/x86/tools/insn_sanity.c
@@ -0,0 +1,275 @@
1/*
2 * x86 decoder sanity test - based on test_get_insn.c
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 * Copyright (C) Hitachi, Ltd., 2011
20 */
21
22#include <stdlib.h>
23#include <stdio.h>
24#include <string.h>
25#include <assert.h>
26#include <unistd.h>
27#include <sys/types.h>
28#include <sys/stat.h>
29#include <fcntl.h>
30
31#define unlikely(cond) (cond)
32#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0]))
33
34#include <asm/insn.h>
35#include <inat.c>
36#include <insn.c>
37
38/*
39 * Test of instruction analysis against tampering.
40 * Feed random binary to instruction decoder and ensure not to
41 * access out-of-instruction-buffer.
42 */
43
44#define DEFAULT_MAX_ITER 10000
45#define INSN_NOP 0x90
46
47static const char *prog; /* Program name */
48static int verbose; /* Verbosity */
49static int x86_64; /* x86-64 bit mode flag */
50static unsigned int seed; /* Random seed */
51static unsigned long iter_start; /* Start of iteration number */
52static unsigned long iter_end = DEFAULT_MAX_ITER; /* End of iteration number */
53static FILE *input_file; /* Input file; random mode when NULL */
54
55static void usage(const char *err)
56{
57 if (err)
58 fprintf(stderr, "Error: %s\n\n", err);
59 fprintf(stderr, "Usage: %s [-y|-n|-v] [-s seed[,no]] [-m max] [-i input]\n", prog);
60 fprintf(stderr, "\t-y 64bit mode\n");
61 fprintf(stderr, "\t-n 32bit mode\n");
62 fprintf(stderr, "\t-v Verbosity (-vv dumps any decoded result)\n");
63 fprintf(stderr, "\t-s Give a random seed (and iteration number)\n");
64 fprintf(stderr, "\t-m Give a maximum iteration number\n");
65 fprintf(stderr, "\t-i Give an input file with decoded binary\n");
66 exit(1);
67}
68
69static void dump_field(FILE *fp, const char *name, const char *indent,
70 struct insn_field *field)
71{
72 fprintf(fp, "%s.%s = {\n", indent, name);
73 fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n",
74 indent, field->value, field->bytes[0], field->bytes[1],
75 field->bytes[2], field->bytes[3]);
76 fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent,
77 field->got, field->nbytes);
78}
79
80static void dump_insn(FILE *fp, struct insn *insn)
81{
82 fprintf(fp, "Instruction = {\n");
83 dump_field(fp, "prefixes", "\t", &insn->prefixes);
84 dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix);
85 dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix);
86 dump_field(fp, "opcode", "\t", &insn->opcode);
87 dump_field(fp, "modrm", "\t", &insn->modrm);
88 dump_field(fp, "sib", "\t", &insn->sib);
89 dump_field(fp, "displacement", "\t", &insn->displacement);
90 dump_field(fp, "immediate1", "\t", &insn->immediate1);
91 dump_field(fp, "immediate2", "\t", &insn->immediate2);
92 fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n",
93 insn->attr, insn->opnd_bytes, insn->addr_bytes);
94 fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n",
95 insn->length, insn->x86_64, insn->kaddr);
96}
97
98static void dump_stream(FILE *fp, const char *msg, unsigned long nr_iter,
99 unsigned char *insn_buf, struct insn *insn)
100{
101 int i;
102
103 fprintf(fp, "%s:\n", msg);
104
105 dump_insn(fp, insn);
106
107 fprintf(fp, "You can reproduce this with the command(s) below:\n");
108
109 /* Input a decoded instruction sequence directly */
110 fprintf(fp, " $ echo ");
111 for (i = 0; i < MAX_INSN_SIZE; i++)
112 fprintf(fp, " %02x", insn_buf[i]);
113 fprintf(fp, " | %s -i -\n", prog);
114
115 if (!input_file) {
116 fprintf(fp, "Or \n");
117 /* Give a seed and iteration number */
118 fprintf(fp, " $ %s -s 0x%x,%lu\n", prog, seed, nr_iter);
119 }
120}
121
122static void init_random_seed(void)
123{
124 int fd;
125
126 fd = open("/dev/urandom", O_RDONLY);
127 if (fd < 0)
128 goto fail;
129
130 if (read(fd, &seed, sizeof(seed)) != sizeof(seed))
131 goto fail;
132
133 close(fd);
134 return;
135fail:
136 usage("Failed to open /dev/urandom");
137}
138
139/* Read given instruction sequence from the input file */
140static int read_next_insn(unsigned char *insn_buf)
141{
142 char buf[256] = "", *tmp;
143 int i;
144
145 tmp = fgets(buf, ARRAY_SIZE(buf), input_file);
146 if (tmp == NULL || feof(input_file))
147 return 0;
148
149 for (i = 0; i < MAX_INSN_SIZE; i++) {
150 insn_buf[i] = (unsigned char)strtoul(tmp, &tmp, 16);
151 if (*tmp != ' ')
152 break;
153 }
154
155 return i;
156}
157
158static int generate_insn(unsigned char *insn_buf)
159{
160 int i;
161
162 if (input_file)
163 return read_next_insn(insn_buf);
164
165 /* Fills buffer with random binary up to MAX_INSN_SIZE */
166 for (i = 0; i < MAX_INSN_SIZE - 1; i += 2)
167 *(unsigned short *)(&insn_buf[i]) = random() & 0xffff;
168
169 while (i < MAX_INSN_SIZE)
170 insn_buf[i++] = random() & 0xff;
171
172 return i;
173}
174
175static void parse_args(int argc, char **argv)
176{
177 int c;
178 char *tmp = NULL;
179 int set_seed = 0;
180
181 prog = argv[0];
182 while ((c = getopt(argc, argv, "ynvs:m:i:")) != -1) {
183 switch (c) {
184 case 'y':
185 x86_64 = 1;
186 break;
187 case 'n':
188 x86_64 = 0;
189 break;
190 case 'v':
191 verbose++;
192 break;
193 case 'i':
194 if (strcmp("-", optarg) == 0)
195 input_file = stdin;
196 else
197 input_file = fopen(optarg, "r");
198 if (!input_file)
199 usage("Failed to open input file");
200 break;
201 case 's':
202 seed = (unsigned int)strtoul(optarg, &tmp, 0);
203 if (*tmp == ',') {
204 optarg = tmp + 1;
205 iter_start = strtoul(optarg, &tmp, 0);
206 }
207 if (*tmp != '\0' || tmp == optarg)
208 usage("Failed to parse seed");
209 set_seed = 1;
210 break;
211 case 'm':
212 iter_end = strtoul(optarg, &tmp, 0);
213 if (*tmp != '\0' || tmp == optarg)
214 usage("Failed to parse max_iter");
215 break;
216 default:
217 usage(NULL);
218 }
219 }
220
221 /* Check errors */
222 if (iter_end < iter_start)
223 usage("Max iteration number must be bigger than iter-num");
224
225 if (set_seed && input_file)
226 usage("Don't use input file (-i) with random seed (-s)");
227
228 /* Initialize random seed */
229 if (!input_file) {
230 if (!set_seed) /* No seed is given */
231 init_random_seed();
232 srand(seed);
233 }
234}
235
236int main(int argc, char **argv)
237{
238 struct insn insn;
239 int insns = 0;
240 int errors = 0;
241 unsigned long i;
242 unsigned char insn_buf[MAX_INSN_SIZE * 2];
243
244 parse_args(argc, argv);
245
246 /* Prepare stop bytes with NOPs */
247 memset(insn_buf + MAX_INSN_SIZE, INSN_NOP, MAX_INSN_SIZE);
248
249 for (i = 0; i < iter_end; i++) {
250 if (generate_insn(insn_buf) <= 0)
251 break;
252
253 if (i < iter_start) /* Skip to given iteration number */
254 continue;
255
256 /* Decode an instruction */
257 insn_init(&insn, insn_buf, x86_64);
258 insn_get_length(&insn);
259
260 if (insn.next_byte <= insn.kaddr ||
261 insn.kaddr + MAX_INSN_SIZE < insn.next_byte) {
262 /* Access out-of-range memory */
263 dump_stream(stderr, "Error: Found an access violation", i, insn_buf, &insn);
264 errors++;
265 } else if (verbose && !insn_complete(&insn))
266 dump_stream(stdout, "Info: Found an undecodable input", i, insn_buf, &insn);
267 else if (verbose >= 2)
268 dump_insn(stdout, &insn);
269 insns++;
270 }
271
272 fprintf(stdout, "%s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", (errors) ? "Failure" : "Success", insns, (input_file) ? "given" : "random", errors, seed);
273
274 return errors ? 1 : 0;
275}
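Read together with the usage() text above, the new tool is driven roughly as follows (the seed, iteration count, and byte values are illustrative only; the -m bound is the one the Makefile's sanitytest rule passes):

	# decode 1,000,000 random instruction streams in 64-bit mode
	./insn_sanity -y -m 1000000

	# replay a reported failure from its seed and iteration number
	./insn_sanity -y -s 0x498a880f,5674

	# decode one explicit byte sequence from stdin
	echo 0f 01 c1 90 90 90 90 90 90 90 90 90 90 90 90 | ./insn_sanity -i -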
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 1f928659c338..12eb07bfb267 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1215,8 +1215,6 @@ asmlinkage void __init xen_start_kernel(void)
1215 local_irq_disable(); 1215 local_irq_disable();
1216 early_boot_irqs_disabled = true; 1216 early_boot_irqs_disabled = true;
1217 1217
1218 memblock_init();
1219
1220 xen_raw_console_write("mapping kernel into physical memory\n"); 1218 xen_raw_console_write("mapping kernel into physical memory\n");
1221 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); 1219 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1222 xen_ident_map_ISA(); 1220 xen_ident_map_ISA();
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 87f6673b1207..f4bf8aa574f4 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1774,10 +1774,8 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
1774 __xen_write_cr3(true, __pa(pgd)); 1774 __xen_write_cr3(true, __pa(pgd));
1775 xen_mc_issue(PARAVIRT_LAZY_CPU); 1775 xen_mc_issue(PARAVIRT_LAZY_CPU);
1776 1776
1777 memblock_x86_reserve_range(__pa(xen_start_info->pt_base), 1777 memblock_reserve(__pa(xen_start_info->pt_base),
1778 __pa(xen_start_info->pt_base + 1778 xen_start_info->nr_pt_frames * PAGE_SIZE);
1779 xen_start_info->nr_pt_frames * PAGE_SIZE),
1780 "XEN PAGETABLES");
1781 1779
1782 return pgd; 1780 return pgd;
1783} 1781}
@@ -1853,10 +1851,8 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
1853 PFN_DOWN(__pa(initial_page_table))); 1851 PFN_DOWN(__pa(initial_page_table)));
1854 xen_write_cr3(__pa(initial_page_table)); 1852 xen_write_cr3(__pa(initial_page_table));
1855 1853
1856 memblock_x86_reserve_range(__pa(xen_start_info->pt_base), 1854 memblock_reserve(__pa(xen_start_info->pt_base),
1857 __pa(xen_start_info->pt_base + 1855 xen_start_info->nr_pt_frames * PAGE_SIZE);
1858 xen_start_info->nr_pt_frames * PAGE_SIZE),
1859 "XEN PAGETABLES");
1860 1856
1861 return initial_page_table; 1857 return initial_page_table;
1862} 1858}
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index b2c7179fa263..e03c63692176 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -75,7 +75,7 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
75 if (i == XEN_EXTRA_MEM_MAX_REGIONS) 75 if (i == XEN_EXTRA_MEM_MAX_REGIONS)
76 printk(KERN_WARNING "Warning: not enough extra memory regions\n"); 76 printk(KERN_WARNING "Warning: not enough extra memory regions\n");
77 77
78 memblock_x86_reserve_range(start, start + size, "XEN EXTRA"); 78 memblock_reserve(start, size);
79 79
80 xen_max_p2m_pfn = PFN_DOWN(start + size); 80 xen_max_p2m_pfn = PFN_DOWN(start + size);
81 81
@@ -311,9 +311,8 @@ char * __init xen_memory_setup(void)
311 * - xen_start_info 311 * - xen_start_info
312 * See comment above "struct start_info" in <xen/interface/xen.h> 312 * See comment above "struct start_info" in <xen/interface/xen.h>
313 */ 313 */
314 memblock_x86_reserve_range(__pa(xen_start_info->mfn_list), 314 memblock_reserve(__pa(xen_start_info->mfn_list),
315 __pa(xen_start_info->pt_base), 315 xen_start_info->pt_base - xen_start_info->mfn_list);
316 "XEN START INFO");
317 316
318 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 317 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
319 318
diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c
index f3e5eb43f71c..ac62f9cf1e10 100644
--- a/arch/xtensa/kernel/time.c
+++ b/arch/xtensa/kernel/time.c
@@ -41,14 +41,6 @@ static struct clocksource ccount_clocksource = {
41 .rating = 200, 41 .rating = 200,
42 .read = ccount_read, 42 .read = ccount_read,
43 .mask = CLOCKSOURCE_MASK(32), 43 .mask = CLOCKSOURCE_MASK(32),
44 /*
45 * With a shift of 22 the lower limit of the cpu clock is
46 * 1MHz, where NSEC_PER_CCOUNT is 1000 or a bit less than
47 * 2^10: Since we have 32 bits and the multiplicator can
48 * already take up as much as 10 bits, this leaves us with
49 * remaining upper 22 bits.
50 */
51 .shift = 22,
52}; 44};
53 45
54static irqreturn_t timer_interrupt(int irq, void *dev_id); 46static irqreturn_t timer_interrupt(int irq, void *dev_id);
@@ -66,10 +58,7 @@ void __init time_init(void)
66 printk("%d.%02d MHz\n", (int)ccount_per_jiffy/(1000000/HZ), 58 printk("%d.%02d MHz\n", (int)ccount_per_jiffy/(1000000/HZ),
67 (int)(ccount_per_jiffy/(10000/HZ))%100); 59 (int)(ccount_per_jiffy/(10000/HZ))%100);
68#endif 60#endif
69 ccount_clocksource.mult = 61 clocksource_register_hz(&ccount_clocksource, CCOUNT_PER_JIFFY * HZ);
70 clocksource_hz2mult(CCOUNT_PER_JIFFY * HZ,
71 ccount_clocksource.shift);
72 clocksource_register(&ccount_clocksource);
73 62
74 /* Initialize the linux timer interrupt. */ 63 /* Initialize the linux timer interrupt. */
75 64
diff --git a/block/blk-map.c b/block/blk-map.c
index 164cd0059706..623e1cd4cffe 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -311,7 +311,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
311 if (IS_ERR(bio)) 311 if (IS_ERR(bio))
312 return PTR_ERR(bio); 312 return PTR_ERR(bio);
313 313
314 if (rq_data_dir(rq) == WRITE) 314 if (!reading)
315 bio->bi_rw |= REQ_WRITE; 315 bio->bi_rw |= REQ_WRITE;
316 316
317 if (do_copy) 317 if (do_copy)
diff --git a/block/blk-tag.c b/block/blk-tag.c
index e74d6d13838f..4af6f5cc1167 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -282,18 +282,9 @@ EXPORT_SYMBOL(blk_queue_resize_tags);
282void blk_queue_end_tag(struct request_queue *q, struct request *rq) 282void blk_queue_end_tag(struct request_queue *q, struct request *rq)
283{ 283{
284 struct blk_queue_tag *bqt = q->queue_tags; 284 struct blk_queue_tag *bqt = q->queue_tags;
285 int tag = rq->tag; 285 unsigned tag = rq->tag; /* negative tags invalid */
286 286
287 BUG_ON(tag == -1); 287 BUG_ON(tag >= bqt->real_max_depth);
288
289 if (unlikely(tag >= bqt->max_depth)) {
290 /*
291 * This can happen after tag depth has been reduced.
292 * But tag shouldn't be larger than real_max_depth.
293 */
294 WARN_ON(tag >= bqt->real_max_depth);
295 return;
296 }
297 288
298 list_del_init(&rq->queuelist); 289 list_del_init(&rq->queuelist);
299 rq->cmd_flags &= ~REQ_QUEUED; 290 rq->cmd_flags &= ~REQ_QUEUED;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4c12869fcf77..3548705b04e4 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1655,6 +1655,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
1655 struct request *next) 1655 struct request *next)
1656{ 1656{
1657 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1657 struct cfq_queue *cfqq = RQ_CFQQ(rq);
1658 struct cfq_data *cfqd = q->elevator->elevator_data;
1659
1658 /* 1660 /*
1659 * reposition in fifo if next is older than rq 1661 * reposition in fifo if next is older than rq
1660 */ 1662 */
@@ -1669,6 +1671,16 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
1669 cfq_remove_request(next); 1671 cfq_remove_request(next);
1670 cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, 1672 cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
1671 rq_data_dir(next), rq_is_sync(next)); 1673 rq_data_dir(next), rq_is_sync(next));
1674
1675 cfqq = RQ_CFQQ(next);
1676 /*
1676 * all requests of this queue were merged into other queues, so delete it
1677 * from the service tree. If it's the active_queue,
1678 * cfq_dispatch_requests() will choose whether to expire it or idle on it
1680 */
1681 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
1682 cfqq != cfqd->active_queue)
1683 cfq_del_cfqq_rr(cfqd, cfqq);
1672} 1684}
1673 1685
1674static int cfq_allow_merge(struct request_queue *q, struct request *rq, 1686static int cfq_allow_merge(struct request_queue *q, struct request *rq,
diff --git a/block/ioctl.c b/block/ioctl.c
index ca939fc1030f..d510c2a4eff8 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -180,6 +180,26 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
 EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl);
 
 /*
+ * Is it an unrecognized ioctl? The correct returns are either
+ * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a
+ * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl
+ * code before returning.
+ *
+ * Confused drivers sometimes return EINVAL, which is wrong. It
+ * means "I understood the ioctl command, but the parameters to
+ * it were wrong".
+ *
+ * We should aim to just fix the broken drivers, the EINVAL case
+ * should go away.
+ */
+static inline int is_unrecognized_ioctl(int ret)
+{
+	return ret == -EINVAL ||
+		ret == -ENOTTY ||
+		ret == -ENOIOCTLCMD;
+}
+
+/*
  * always keep this in sync with compat_blkdev_ioctl()
  */
 int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
@@ -196,8 +216,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		return -EACCES;
 
 	ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
-	/* -EINVAL to handle old uncorrected drivers */
-	if (ret != -EINVAL && ret != -ENOTTY)
+	if (!is_unrecognized_ioctl(ret))
 		return ret;
 
 	fsync_bdev(bdev);
@@ -206,8 +225,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 
 	case BLKROSET:
 		ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
-		/* -EINVAL to handle old uncorrected drivers */
-		if (ret != -EINVAL && ret != -ENOTTY)
+		if (!is_unrecognized_ioctl(ret))
 			return ret;
 		if (!capable(CAP_SYS_ADMIN))
 			return -EACCES;
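
The helper folds the three "driver didn't recognize this command" codes into one predicate, so the generic block ioctls run only as a fallback. A hedged user-space sketch of that dispatch shape; driver_ioctl() and generic_ioctl() are stand-ins, and ENOIOCTLCMD's value is the kernel-internal one, reproduced here only so the sketch compiles:

    #include <errno.h>

    #define ENOIOCTLCMD 515	/* kernel-internal; shown only for the sketch */

    int driver_ioctl(unsigned int cmd);	/* hypothetical driver hook */
    int generic_ioctl(unsigned int cmd);	/* hypothetical generic fallback */

    static inline int is_unrecognized_ioctl(int ret)
    {
    	return ret == -EINVAL || ret == -ENOTTY || ret == -ENOIOCTLCMD;
    }

    int dispatch_ioctl(unsigned int cmd)
    {
    	int ret = driver_ioctl(cmd);

    	if (!is_unrecognized_ioctl(ret))
    		return ret;		/* driver handled it (or really failed) */

    	return generic_ioctl(cmd);	/* only unrecognized commands fall through */
    }
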
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 251acea3d359..3991502b21e5 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -247,6 +247,13 @@ struct sys_device *get_cpu_sysdev(unsigned cpu)
 }
 EXPORT_SYMBOL_GPL(get_cpu_sysdev);
 
+bool cpu_is_hotpluggable(unsigned cpu)
+{
+	struct sys_device *dev = get_cpu_sysdev(cpu);
+	return dev && container_of(dev, struct cpu, sysdev)->hotpluggable;
+}
+EXPORT_SYMBOL_GPL(cpu_is_hotpluggable);
+
 int __init cpu_dev_init(void)
 {
 	int err;
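
cpu_is_hotpluggable() gives callers a one-line test for the per-CPU hotpluggable flag without reaching into sysdev internals. A minimal sketch of a hypothetical caller (not from the patch):

    #include <linux/cpu.h>

    static unsigned int count_removable_cpus(void)
    {
    	unsigned int cpu, n = 0;

    	for_each_online_cpu(cpu)
    		if (cpu_is_hotpluggable(cpu))
    			n++;	/* candidate for cpu_down() */

    	return n;
    }
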
diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c
index effe7974aa9a..6b5cf02c35c8 100644
--- a/drivers/clocksource/acpi_pm.c
+++ b/drivers/clocksource/acpi_pm.c
@@ -143,7 +143,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE,
 #ifndef CONFIG_X86_64
 #include <asm/mach_timer.h>
 #define PMTMR_EXPECTED_RATE \
-  ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10))
+  ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (PIT_TICK_RATE>>10))
 /*
  * Some boards have the PMTMR running way too fast. We check
  * the PMTMR rate against PIT channel 2 to catch these cases.
diff --git a/drivers/clocksource/i8253.c b/drivers/clocksource/i8253.c
index 27c49e60b7d6..e7cab2da910f 100644
--- a/drivers/clocksource/i8253.c
+++ b/drivers/clocksource/i8253.c
@@ -53,7 +53,7 @@ static cycle_t i8253_read(struct clocksource *cs)
 	count |= inb_p(PIT_CH0) << 8;
 
 	/* VIA686a test code... reset the latch if count > max + 1 */
-	if (count > LATCH) {
+	if (count > PIT_LATCH) {
 		outb_p(0x34, PIT_MODE);
 		outb_p(PIT_LATCH & 0xff, PIT_CH0);
 		outb_p(PIT_LATCH >> 8, PIT_CH0);
@@ -114,8 +114,8 @@ static void init_pit_timer(enum clock_event_mode mode,
 	case CLOCK_EVT_MODE_PERIODIC:
 		/* binary, mode 2, LSB/MSB, ch 0 */
 		outb_p(0x34, PIT_MODE);
-		outb_p(LATCH & 0xff , PIT_CH0);	/* LSB */
-		outb_p(LATCH >> 8 , PIT_CH0);		/* MSB */
+		outb_p(PIT_LATCH & 0xff , PIT_CH0);	/* LSB */
+		outb_p(PIT_LATCH >> 8 , PIT_CH0);	/* MSB */
 		break;
 
 	case CLOCK_EVT_MODE_SHUTDOWN:
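
PIT_LATCH replaces the arch-specific LATCH; both encode the channel-0 divisor for one timer tick. A sketch of the relationship, assuming the usual 1.193182 MHz PIT input clock (this mirrors how include/linux/i8253.h of this era defines it, modulo rounding details):

    #define PIT_TICK_RATE	1193182UL			/* PIT input clock, Hz */
    #define PIT_LATCH	((PIT_TICK_RATE + HZ / 2) / HZ)	/* counts per tick, rounded */
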
diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
index 79c47e88d5d1..55d0f95f82f9 100644
--- a/drivers/clocksource/tcb_clksrc.c
+++ b/drivers/clocksource/tcb_clksrc.c
@@ -59,7 +59,6 @@ static struct clocksource clksrc = {
 	.rating         = 200,
 	.read           = tc_get_cycles,
 	.mask           = CLOCKSOURCE_MASK(32),
-	.shift          = 18,
 	.flags          = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
@@ -256,7 +255,6 @@ static int __init tcb_clksrc_init(void)
 			best_divisor_idx = i;
 	}
 
-	clksrc.mult = clocksource_hz2mult(divided_rate, clksrc.shift);
 
 	printk(bootinfo, clksrc.name, CONFIG_ATMEL_TCB_CLKSRC_BLOCK,
 			divided_rate / 1000000,
@@ -292,7 +290,7 @@ static int __init tcb_clksrc_init(void)
 	__raw_writel(ATMEL_TC_SYNC, tcaddr + ATMEL_TC_BCR);
 
 	/* and away we go! */
-	clocksource_register(&clksrc);
+	clocksource_register_hz(&clksrc, divided_rate);
 
 	/* channel 2: periodic and oneshot timer support */
 	setup_clkevents(tc, clk32k_divisor_idx);
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index c97b468ee9f7..235a340e81f2 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -95,27 +95,26 @@ static struct dbs_tuners {
 	.freq_step = 5,
 };
 
-static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
-							cputime64_t *wall)
+static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
 {
-	cputime64_t idle_time;
-	cputime64_t cur_wall_time;
-	cputime64_t busy_time;
+	u64 idle_time;
+	u64 cur_wall_time;
+	u64 busy_time;
 
 	cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
-	busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
-			kstat_cpu(cpu).cpustat.system);
 
-	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
-	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
-	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);
-	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice);
+	busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
 
-	idle_time = cputime64_sub(cur_wall_time, busy_time);
+	idle_time = cur_wall_time - busy_time;
 	if (wall)
-		*wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);
+		*wall = jiffies_to_usecs(cur_wall_time);
 
-	return (cputime64_t)jiffies_to_usecs(idle_time);
+	return jiffies_to_usecs(idle_time);
 }
 
 static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
@@ -272,7 +271,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
 		dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
 						&dbs_info->prev_cpu_wall);
 		if (dbs_tuners_ins.ignore_nice)
-			dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
+			dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 	}
 	return count;
 }
@@ -353,20 +352,20 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 
 		cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
 
-		wall_time = (unsigned int) cputime64_sub(cur_wall_time,
-				j_dbs_info->prev_cpu_wall);
+		wall_time = (unsigned int)
+			(cur_wall_time - j_dbs_info->prev_cpu_wall);
 		j_dbs_info->prev_cpu_wall = cur_wall_time;
 
-		idle_time = (unsigned int) cputime64_sub(cur_idle_time,
-				j_dbs_info->prev_cpu_idle);
+		idle_time = (unsigned int)
+			(cur_idle_time - j_dbs_info->prev_cpu_idle);
 		j_dbs_info->prev_cpu_idle = cur_idle_time;
 
 		if (dbs_tuners_ins.ignore_nice) {
-			cputime64_t cur_nice;
+			u64 cur_nice;
 			unsigned long cur_nice_jiffies;
 
-			cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
-					j_dbs_info->prev_cpu_nice);
+			cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] -
+					j_dbs_info->prev_cpu_nice;
 			/*
 			 * Assumption: nice time between sampling periods will
 			 * be less than 2^32 jiffies for 32 bit sys
@@ -374,7 +373,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 			cur_nice_jiffies = (unsigned long)
 					cputime64_to_jiffies64(cur_nice);
 
-			j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
+			j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 			idle_time += jiffies_to_usecs(cur_nice_jiffies);
 		}
 
@@ -501,10 +500,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 
 			j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
 						&j_dbs_info->prev_cpu_wall);
-			if (dbs_tuners_ins.ignore_nice) {
+			if (dbs_tuners_ins.ignore_nice)
 				j_dbs_info->prev_cpu_nice =
-						kstat_cpu(j).cpustat.nice;
-			}
+					kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 		}
 		this_dbs_info->down_skip = 0;
 		this_dbs_info->requested_freq = policy->cur;
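
Both governors now read per-CPU accounting through the CPUTIME_* indices of kcpustat and use plain u64 arithmetic instead of the cputime64_add/sub wrappers. A sketch of the busy-time sum as a stand-alone helper (illustrative, assuming the 3.2-era kernel_stat API; not part of the patch):

    #include <linux/kernel_stat.h>

    static u64 demo_busy_time(unsigned int cpu)
    {
    	u64 *cs = kcpustat_cpu(cpu).cpustat;

    	/* everything that is not idle/iowait counts as busy here */
    	return cs[CPUTIME_USER] + cs[CPUTIME_SYSTEM] + cs[CPUTIME_IRQ] +
    	       cs[CPUTIME_SOFTIRQ] + cs[CPUTIME_STEAL] + cs[CPUTIME_NICE];
    }
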
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index fa8af4ebb1d6..3d679eee70a1 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -119,27 +119,26 @@ static struct dbs_tuners {
 	.powersave_bias = 0,
 };
 
-static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
-							cputime64_t *wall)
+static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
 {
-	cputime64_t idle_time;
-	cputime64_t cur_wall_time;
-	cputime64_t busy_time;
+	u64 idle_time;
+	u64 cur_wall_time;
+	u64 busy_time;
 
 	cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
-	busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
-			kstat_cpu(cpu).cpustat.system);
 
-	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
-	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
-	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);
-	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice);
+	busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
 
-	idle_time = cputime64_sub(cur_wall_time, busy_time);
+	idle_time = cur_wall_time - busy_time;
 	if (wall)
-		*wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);
+		*wall = jiffies_to_usecs(cur_wall_time);
 
-	return (cputime64_t)jiffies_to_usecs(idle_time);
+	return jiffies_to_usecs(idle_time);
 }
 
 static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
@@ -345,7 +344,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
 		dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
 						&dbs_info->prev_cpu_wall);
 		if (dbs_tuners_ins.ignore_nice)
-			dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
+			dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 
 	}
 	return count;
@@ -442,24 +441,24 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 		cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
 		cur_iowait_time = get_cpu_iowait_time(j, &cur_wall_time);
 
-		wall_time = (unsigned int) cputime64_sub(cur_wall_time,
-				j_dbs_info->prev_cpu_wall);
+		wall_time = (unsigned int)
+			(cur_wall_time - j_dbs_info->prev_cpu_wall);
 		j_dbs_info->prev_cpu_wall = cur_wall_time;
 
-		idle_time = (unsigned int) cputime64_sub(cur_idle_time,
-				j_dbs_info->prev_cpu_idle);
+		idle_time = (unsigned int)
+			(cur_idle_time - j_dbs_info->prev_cpu_idle);
 		j_dbs_info->prev_cpu_idle = cur_idle_time;
 
-		iowait_time = (unsigned int) cputime64_sub(cur_iowait_time,
-				j_dbs_info->prev_cpu_iowait);
+		iowait_time = (unsigned int)
+			(cur_iowait_time - j_dbs_info->prev_cpu_iowait);
 		j_dbs_info->prev_cpu_iowait = cur_iowait_time;
 
 		if (dbs_tuners_ins.ignore_nice) {
-			cputime64_t cur_nice;
+			u64 cur_nice;
 			unsigned long cur_nice_jiffies;
 
-			cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
-					j_dbs_info->prev_cpu_nice);
+			cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] -
+					j_dbs_info->prev_cpu_nice;
 			/*
 			 * Assumption: nice time between sampling periods will
 			 * be less than 2^32 jiffies for 32 bit sys
@@ -467,7 +466,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 			cur_nice_jiffies = (unsigned long)
 					cputime64_to_jiffies64(cur_nice);
 
-			j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
+			j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 			idle_time += jiffies_to_usecs(cur_nice_jiffies);
 		}
 
@@ -646,10 +645,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 
 			j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
 						&j_dbs_info->prev_cpu_wall);
-			if (dbs_tuners_ins.ignore_nice) {
+			if (dbs_tuners_ins.ignore_nice)
 				j_dbs_info->prev_cpu_nice =
-						kstat_cpu(j).cpustat.nice;
-			}
+					kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 		}
 		this_dbs_info->cpu = cpu;
 		this_dbs_info->rate_mult = 1;
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index c5072a91e848..2a508edd768b 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -61,9 +61,8 @@ static int cpufreq_stats_update(unsigned int cpu)
 	spin_lock(&cpufreq_stats_lock);
 	stat = per_cpu(cpufreq_stats_table, cpu);
 	if (stat->time_in_state)
-		stat->time_in_state[stat->last_index] =
-			cputime64_add(stat->time_in_state[stat->last_index],
-				      cputime_sub(cur_time, stat->last_time));
+		stat->time_in_state[stat->last_index] +=
+			cur_time - stat->last_time;
 	stat->last_time = cur_time;
 	spin_unlock(&cpufreq_stats_lock);
 	return 0;
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index ab8f469f5cf8..5a99bb3f255a 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -124,7 +124,7 @@ config MV_XOR
 
 config MX3_IPU
 	bool "MX3x Image Processing Unit support"
-	depends on ARCH_MX3
+	depends on SOC_IMX31 || SOC_IMX35
 	select DMA_ENGINE
 	default y
 	help
@@ -216,7 +216,7 @@ config PCH_DMA
 
 config IMX_SDMA
 	tristate "i.MX SDMA support"
-	depends on ARCH_MX25 || ARCH_MX3 || ARCH_MX5
+	depends on ARCH_MX25 || SOC_IMX31 || SOC_IMX35 || ARCH_MX5
 	select DMA_ENGINE
 	help
 	  Support the i.MX SDMA engine. This engine is integrated into
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index 7a402bfbee7d..88df48956c1b 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -1609,11 +1609,9 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val,
 			mce->cpuvendor, mce->cpuid, mce->time,
 			mce->socketid, mce->apicid);
 
-#ifdef CONFIG_SMP
 	/* Only handle if it is the right mc controller */
 	if (cpu_data(mce->cpu).phys_proc_id != pvt->sbridge_dev->mc)
 		return NOTIFY_DONE;
-#endif
 
 	smp_rmb();
 	if ((pvt->mce_out + 1) % MCE_LOG_LEN == pvt->mce_in) {
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index c681dc149d2a..b9da8900ae4e 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -756,9 +756,9 @@ intel_enable_semaphores(struct drm_device *dev)
 	if (i915_semaphores >= 0)
 		return i915_semaphores;
 
-	/* Enable semaphores on SNB when IO remapping is off */
+	/* Disable semaphores on SNB */
 	if (INTEL_INFO(dev)->gen == 6)
-		return !intel_iommu_enabled;
+		return 0;
 
 	return 1;
 }
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index d809b038ca88..daa5743ccbd6 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -7922,13 +7922,11 @@ static bool intel_enable_rc6(struct drm_device *dev)
 		return 0;
 
 	/*
-	 * Enable rc6 on Sandybridge if DMA remapping is disabled
+	 * Disable rc6 on Sandybridge
 	 */
 	if (INTEL_INFO(dev)->gen == 6) {
-		DRM_DEBUG_DRIVER("Sandybridge: intel_iommu_enabled %s -- RC6 %sabled\n",
-				 intel_iommu_enabled ? "true" : "false",
-				 !intel_iommu_enabled ? "en" : "dis");
-		return !intel_iommu_enabled;
+		DRM_DEBUG_DRIVER("Sandybridge: RC6 disabled\n");
+		return 0;
 	}
 	DRM_DEBUG_DRIVER("RC6 enabled\n");
 	return 1;
diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c
index 5e00d1670aa9..92c9628c572d 100644
--- a/drivers/gpu/drm/radeon/evergreen.c
+++ b/drivers/gpu/drm/radeon/evergreen.c
@@ -3276,6 +3276,18 @@ int evergreen_init(struct radeon_device *rdev)
 			rdev->accel_working = false;
 		}
 	}
+
+	/* Don't start up if the MC ucode is missing on BTC parts.
+	 * The default clocks and voltages before the MC ucode
+	 * is loaded are not sufficient for advanced operations.
+	 */
+	if (ASIC_IS_DCE5(rdev)) {
+		if (!rdev->mc_fw && !(rdev->flags & RADEON_IS_IGP)) {
+			DRM_ERROR("radeon: MC ucode required for NI+.\n");
+			return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/radeon/radeon_atombios.c b/drivers/gpu/drm/radeon/radeon_atombios.c
index d24baf30efcb..5082d17d14dc 100644
--- a/drivers/gpu/drm/radeon/radeon_atombios.c
+++ b/drivers/gpu/drm/radeon/radeon_atombios.c
@@ -2560,7 +2560,11 @@ void radeon_atombios_get_power_modes(struct radeon_device *rdev)
 
 	rdev->pm.current_power_state_index = rdev->pm.default_power_state_index;
 	rdev->pm.current_clock_mode_index = 0;
-	rdev->pm.current_vddc = rdev->pm.power_state[rdev->pm.default_power_state_index].clock_info[0].voltage.voltage;
+	if (rdev->pm.default_power_state_index >= 0)
+		rdev->pm.current_vddc =
+			rdev->pm.power_state[rdev->pm.default_power_state_index].clock_info[0].voltage.voltage;
+	else
+		rdev->pm.current_vddc = 0;
 }
 
 void radeon_atom_set_clock_gating(struct radeon_device *rdev, int enable)
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
index 8aa1dbb45c67..f94b33ae2215 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
@@ -1093,7 +1093,6 @@ static struct drm_framebuffer *vmw_kms_fb_create(struct drm_device *dev,
 	struct vmw_surface *surface = NULL;
 	struct vmw_dma_buffer *bo = NULL;
 	struct ttm_base_object *user_obj;
-	u64 required_size;
 	int ret;
 
 	/**
@@ -1102,8 +1101,9 @@ static struct drm_framebuffer *vmw_kms_fb_create(struct drm_device *dev,
 	 * requested framebuffer.
 	 */
 
-	required_size = mode_cmd->pitch * mode_cmd->height;
-	if (unlikely(required_size > (u64) dev_priv->vram_size)) {
+	if (!vmw_kms_validate_mode_vram(dev_priv,
+					mode_cmd->pitch,
+					mode_cmd->height)) {
 		DRM_ERROR("VRAM size is too small for requested mode.\n");
 		return ERR_PTR(-ENOMEM);
 	}
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index 104b3767516c..1fdef885341c 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -57,16 +57,15 @@ MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius");
 #define TOTAL_ATTRS		(MAX_CORE_ATTRS + 1)
 #define MAX_CORE_DATA		(NUM_REAL_CORES + BASE_SYSFS_ATTR_NO)
 
-#ifdef CONFIG_SMP
 #define TO_PHYS_ID(cpu)		cpu_data(cpu).phys_proc_id
 #define TO_CORE_ID(cpu)		cpu_data(cpu).cpu_core_id
+#define TO_ATTR_NO(cpu)		(TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO)
+
+#ifdef CONFIG_SMP
 #define for_each_sibling(i, cpu)	for_each_cpu(i, cpu_sibling_mask(cpu))
 #else
-#define TO_PHYS_ID(cpu)		(cpu)
-#define TO_CORE_ID(cpu)		(cpu)
 #define for_each_sibling(i, cpu)	for (i = 0; false; )
 #endif
-#define TO_ATTR_NO(cpu)		(TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO)
 
 /*
  * Per-Core Temperature Data
diff --git a/drivers/input/mouse/sentelic.c b/drivers/input/mouse/sentelic.c
index c5b12d2e955a..86d6f39178b0 100644
--- a/drivers/input/mouse/sentelic.c
+++ b/drivers/input/mouse/sentelic.c
@@ -2,7 +2,7 @@
  * Finger Sensing Pad PS/2 mouse driver.
  *
  * Copyright (C) 2005-2007 Asia Vital Components Co., Ltd.
- * Copyright (C) 2005-2010 Tai-hwa Liang, Sentelic Corporation.
+ * Copyright (C) 2005-2011 Tai-hwa Liang, Sentelic Corporation.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -162,7 +162,7 @@ static int fsp_reg_write(struct psmouse *psmouse, int reg_addr, int reg_val)
 	ps2_sendbyte(ps2dev, v, FSP_CMD_TIMEOUT2);
 
 	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
-		return -1;
+		goto out;
 
 	if ((v = fsp_test_invert_cmd(reg_val)) != reg_val) {
 		/* inversion is required */
@@ -261,7 +261,7 @@ static int fsp_page_reg_write(struct psmouse *psmouse, int reg_val)
 	ps2_sendbyte(ps2dev, 0x88, FSP_CMD_TIMEOUT2);
 
 	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
-		return -1;
+		goto out;
 
 	if ((v = fsp_test_invert_cmd(reg_val)) != reg_val) {
 		ps2_sendbyte(ps2dev, 0x47, FSP_CMD_TIMEOUT2);
@@ -309,7 +309,7 @@ static int fsp_get_buttons(struct psmouse *psmouse, int *btn)
 	};
 	int val;
 
-	if (fsp_reg_read(psmouse, FSP_REG_TMOD_STATUS1, &val) == -1)
+	if (fsp_reg_read(psmouse, FSP_REG_TMOD_STATUS, &val) == -1)
 		return -EIO;
 
 	*btn = buttons[(val & 0x30) >> 4];
diff --git a/drivers/input/mouse/sentelic.h b/drivers/input/mouse/sentelic.h
index ed1395ac7b8b..2e4af24f8c15 100644
--- a/drivers/input/mouse/sentelic.h
+++ b/drivers/input/mouse/sentelic.h
@@ -2,7 +2,7 @@
  * Finger Sensing Pad PS/2 mouse driver.
  *
  * Copyright (C) 2005-2007 Asia Vital Components Co., Ltd.
- * Copyright (C) 2005-2009 Tai-hwa Liang, Sentelic Corporation.
+ * Copyright (C) 2005-2011 Tai-hwa Liang, Sentelic Corporation.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -33,6 +33,7 @@
 /* Finger-sensing Pad control registers */
 #define	FSP_REG_SYSCTL1		0x10
 #define	FSP_BIT_EN_REG_CLK	BIT(5)
+#define	FSP_REG_TMOD_STATUS	0x20
 #define	FSP_REG_OPC_QDOWN	0x31
 #define	FSP_BIT_EN_OPC_TAG	BIT(7)
 #define	FSP_REG_OPTZ_XLO	0x34
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index bdc447fd4766..31053a951c34 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -41,6 +41,7 @@
 #include <linux/tboot.h>
 #include <linux/dmi.h>
 #include <linux/pci-ats.h>
+#include <linux/memblock.h>
 #include <asm/cacheflush.h>
 #include <asm/iommu.h>
 
@@ -2188,18 +2189,6 @@ static inline void iommu_prepare_isa(void)
 
 static int md_domain_init(struct dmar_domain *domain, int guest_width);
 
-static int __init si_domain_work_fn(unsigned long start_pfn,
-				    unsigned long end_pfn, void *datax)
-{
-	int *ret = datax;
-
-	*ret = iommu_domain_identity_map(si_domain,
-					 (uint64_t)start_pfn << PAGE_SHIFT,
-					 (uint64_t)end_pfn << PAGE_SHIFT);
-	return *ret;
-
-}
-
 static int __init si_domain_init(int hw)
 {
 	struct dmar_drhd_unit *drhd;
@@ -2231,9 +2220,15 @@ static int __init si_domain_init(int hw)
 		return 0;
 
 	for_each_online_node(nid) {
-		work_with_active_regions(nid, si_domain_work_fn, &ret);
-		if (ret)
-			return ret;
+		unsigned long start_pfn, end_pfn;
+		int i;
+
+		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+			ret = iommu_domain_identity_map(si_domain,
+					PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
+			if (ret)
+				return ret;
+		}
 	}
 
 	return 0;
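
With work_with_active_regions() gone, the identity map is built by walking memblock's per-node pfn ranges directly. A sketch of the iterator in isolation, assuming the 3.2-era memblock API (the helper itself is illustrative):

    #include <linux/memblock.h>

    static unsigned long demo_node_ram_pages(int nid)
    {
    	unsigned long start_pfn, end_pfn, pages = 0;
    	int i;

    	/* each iteration yields one contiguous RAM range [start_pfn, end_pfn) */
    	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
    		pages += end_pfn - start_pfn;

    	return pages;
    }
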
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 2fb2963df553..5b5fa5cdaa31 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -90,7 +90,7 @@ struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
 	if (bus == NULL || bus->iommu_ops == NULL)
 		return NULL;
 
-	domain = kmalloc(sizeof(*domain), GFP_KERNEL);
+	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
 	if (!domain)
 		return NULL;
 
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 65af42f2d593..39809035320a 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -697,7 +697,7 @@ void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
697 * interrupts are enabled. We always leave interrupts enabled while 697 * interrupts are enabled. We always leave interrupts enabled while
698 * running the Guest. 698 * running the Guest.
699 */ 699 */
700 regs->eflags = X86_EFLAGS_IF | 0x2; 700 regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
701 701
702 /* 702 /*
703 * The "Extended Instruction Pointer" register says where the Guest is 703 * The "Extended Instruction Pointer" register says where the Guest is
diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c
index 2637c139777b..6dc26b61219b 100644
--- a/drivers/macintosh/rack-meter.c
+++ b/drivers/macintosh/rack-meter.c
@@ -81,13 +81,13 @@ static int rackmeter_ignore_nice;
  */
 static inline cputime64_t get_cpu_idle_time(unsigned int cpu)
 {
-	cputime64_t retval;
+	u64 retval;
 
-	retval = cputime64_add(kstat_cpu(cpu).cpustat.idle,
-			kstat_cpu(cpu).cpustat.iowait);
+	retval = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE] +
+		 kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
 
 	if (rackmeter_ignore_nice)
-		retval = cputime64_add(retval, kstat_cpu(cpu).cpustat.nice);
+		retval += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
 
 	return retval;
 }
@@ -220,13 +220,11 @@ static void rackmeter_do_timer(struct work_struct *work)
 	int i, offset, load, cumm, pause;
 
 	cur_jiffies = jiffies64_to_cputime64(get_jiffies_64());
-	total_ticks = (unsigned int)cputime64_sub(cur_jiffies,
-						  rcpu->prev_wall);
+	total_ticks = (unsigned int) (cur_jiffies - rcpu->prev_wall);
 	rcpu->prev_wall = cur_jiffies;
 
 	total_idle_ticks = get_cpu_idle_time(cpu);
-	idle_ticks = (unsigned int) cputime64_sub(total_idle_ticks,
-				rcpu->prev_idle);
+	idle_ticks = (unsigned int) (total_idle_ticks - rcpu->prev_idle);
 	rcpu->prev_idle = total_idle_ticks;
 
 	/* We do a very dumb calculation to update the LEDs for now,
diff --git a/drivers/media/video/gspca/gspca.c b/drivers/media/video/gspca/gspca.c
index 881e04c7ffe6..2ca10dfec91f 100644
--- a/drivers/media/video/gspca/gspca.c
+++ b/drivers/media/video/gspca/gspca.c
@@ -838,13 +838,13 @@ static int gspca_init_transfer(struct gspca_dev *gspca_dev)
 	gspca_dev->usb_err = 0;
 
 	/* do the specific subdriver stuff before endpoint selection */
-	gspca_dev->alt = 0;
+	intf = usb_ifnum_to_if(gspca_dev->dev, gspca_dev->iface);
+	gspca_dev->alt = gspca_dev->cam.bulk ? intf->num_altsetting : 0;
 	if (gspca_dev->sd_desc->isoc_init) {
 		ret = gspca_dev->sd_desc->isoc_init(gspca_dev);
 		if (ret < 0)
 			goto unlock;
 	}
-	intf = usb_ifnum_to_if(gspca_dev->dev, gspca_dev->iface);
 	xfer = gspca_dev->cam.bulk ? USB_ENDPOINT_XFER_BULK
 			: USB_ENDPOINT_XFER_ISOC;
 
@@ -957,7 +957,7 @@ retry:
 				ret = -EIO;
 				goto out;
 			}
-			alt = ep_tb[--alt_idx].alt;
+			gspca_dev->alt = ep_tb[--alt_idx].alt;
 		}
 	}
 out:
diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index 50b5f9926f64..0726e59fd418 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -675,7 +675,8 @@ mmci_data_irq(struct mmci_host *host, struct mmc_data *data,
 		      unsigned int status)
 {
 	/* First check for errors */
-	if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_TXUNDERRUN|MCI_RXOVERRUN)) {
+	if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_STARTBITERR|
+		      MCI_TXUNDERRUN|MCI_RXOVERRUN)) {
 		u32 remain, success;
 
 		/* Terminate the DMA transfer */
@@ -754,8 +755,12 @@ mmci_cmd_irq(struct mmci_host *host, struct mmc_command *cmd,
 	}
 
 	if (!cmd->data || cmd->error) {
-		if (host->data)
+		if (host->data) {
+			/* Terminate the DMA transfer */
+			if (dma_inprogress(host))
+				mmci_dma_data_error(host);
 			mmci_stop_data(host);
+		}
 		mmci_request_end(host, cmd->mrq);
 	} else if (!(cmd->data->flags & MMC_DATA_READ)) {
 		mmci_start_data(host, cmd->data);
@@ -955,8 +960,9 @@ static irqreturn_t mmci_irq(int irq, void *dev_id)
 		dev_dbg(mmc_dev(host->mmc), "irq0 (data+cmd) %08x\n", status);
 
 		data = host->data;
-		if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_TXUNDERRUN|
-			      MCI_RXOVERRUN|MCI_DATAEND|MCI_DATABLOCKEND) && data)
+		if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_STARTBITERR|
+			      MCI_TXUNDERRUN|MCI_RXOVERRUN|MCI_DATAEND|
+			      MCI_DATABLOCKEND) && data)
 			mmci_data_irq(host, data, status);
 
 		cmd = host->cmd;
diff --git a/drivers/net/ethernet/freescale/Kconfig b/drivers/net/ethernet/freescale/Kconfig
index 5272f9d4dda9..9de37642f09f 100644
--- a/drivers/net/ethernet/freescale/Kconfig
+++ b/drivers/net/ethernet/freescale/Kconfig
@@ -23,8 +23,8 @@ if NET_VENDOR_FREESCALE
 config FEC
 	bool "FEC ethernet controller (of ColdFire and some i.MX CPUs)"
 	depends on (M523x || M527x || M5272 || M528x || M520x || M532x || \
-		   ARCH_MXC || ARCH_MXS)
-	default ARCH_MXC || ARCH_MXS if ARM
+		   ARCH_MXC || SOC_IMX28)
+	default ARCH_MXC || SOC_IMX28 if ARM
 	select PHYLIB
 	---help---
 	  Say Y here if you want to use the built-in 10/100 Fast ethernet
diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c
index c7b60839ac99..dea0cb4400e2 100644
--- a/drivers/net/ethernet/marvell/skge.c
+++ b/drivers/net/ethernet/marvell/skge.c
@@ -2606,6 +2606,9 @@ static int skge_up(struct net_device *dev)
 	spin_unlock_irq(&hw->hw_lock);
 
 	napi_enable(&skge->napi);
+
+	skge_set_multicast(dev);
+
 	return 0;
 
  free_tx_ring:
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
index 227997d775e8..5829e0b47e7e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
@@ -147,6 +147,7 @@ void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
 	mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
 	if (priv->mdev->dev->caps.comp_pool && cq->vector)
 		mlx4_release_eq(priv->mdev->dev, cq->vector);
+	cq->vector = 0;
 	cq->buf_size = 0;
 	cq->buf = NULL;
 }
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index d2348a5a7809..a9c5ae75277e 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -1843,6 +1843,9 @@ static void ath9k_sta_notify(struct ieee80211_hw *hw,
 	struct ath_softc *sc = hw->priv;
 	struct ath_node *an = (struct ath_node *) sta->drv_priv;
 
+	if (!(sc->sc_flags & SC_OP_TXAGGR))
+		return;
+
 	switch (cmd) {
 	case STA_NOTIFY_SLEEP:
 		an->sleeping = true;
diff --git a/drivers/net/wireless/b43/pio.c b/drivers/net/wireless/b43/pio.c
index fcff923b3c18..279a53eae4c5 100644
--- a/drivers/net/wireless/b43/pio.c
+++ b/drivers/net/wireless/b43/pio.c
@@ -617,9 +617,19 @@ static bool pio_rx_frame(struct b43_pio_rxqueue *q)
 	const char *err_msg = NULL;
 	struct b43_rxhdr_fw4 *rxhdr =
 		(struct b43_rxhdr_fw4 *)wl->pio_scratchspace;
+	size_t rxhdr_size = sizeof(*rxhdr);
 
 	BUILD_BUG_ON(sizeof(wl->pio_scratchspace) < sizeof(*rxhdr));
-	memset(rxhdr, 0, sizeof(*rxhdr));
+	switch (dev->fw.hdr_format) {
+	case B43_FW_HDR_410:
+	case B43_FW_HDR_351:
+		rxhdr_size -= sizeof(rxhdr->format_598) -
+			sizeof(rxhdr->format_351);
+		break;
+	case B43_FW_HDR_598:
+		break;
+	}
+	memset(rxhdr, 0, rxhdr_size);
 
 	/* Check if we have data and wait for it to get ready. */
 	if (q->rev >= 8) {
@@ -657,11 +667,11 @@ data_ready:
 
 	/* Get the preamble (RX header) */
 	if (q->rev >= 8) {
-		b43_block_read(dev, rxhdr, sizeof(*rxhdr),
+		b43_block_read(dev, rxhdr, rxhdr_size,
 			       q->mmio_base + B43_PIO8_RXDATA,
 			       sizeof(u32));
 	} else {
-		b43_block_read(dev, rxhdr, sizeof(*rxhdr),
+		b43_block_read(dev, rxhdr, rxhdr_size,
 			       q->mmio_base + B43_PIO_RXDATA,
 			       sizeof(u16));
 	}
diff --git a/drivers/net/wireless/mwifiex/sta_ioctl.c b/drivers/net/wireless/mwifiex/sta_ioctl.c
index ea4a29b7e331..1679c2593b7b 100644
--- a/drivers/net/wireless/mwifiex/sta_ioctl.c
+++ b/drivers/net/wireless/mwifiex/sta_ioctl.c
@@ -55,9 +55,14 @@ int mwifiex_wait_queue_complete(struct mwifiex_adapter *adapter)
55{ 55{
56 bool cancel_flag = false; 56 bool cancel_flag = false;
57 int status = adapter->cmd_wait_q.status; 57 int status = adapter->cmd_wait_q.status;
58 struct cmd_ctrl_node *cmd_queued = adapter->cmd_queued; 58 struct cmd_ctrl_node *cmd_queued;
59 59
60 if (!adapter->cmd_queued)
61 return 0;
62
63 cmd_queued = adapter->cmd_queued;
60 adapter->cmd_queued = NULL; 64 adapter->cmd_queued = NULL;
65
61 dev_dbg(adapter->dev, "cmd pending\n"); 66 dev_dbg(adapter->dev, "cmd pending\n");
62 atomic_inc(&adapter->cmd_pending); 67 atomic_inc(&adapter->cmd_pending);
63 68
diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index cbd5d701c7e0..63b3ec48c203 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -314,7 +314,7 @@ static const struct of_dev_auxdata *of_dev_lookup(const struct of_dev_auxdata *l
 	if (!lookup)
 		return NULL;
 
-	for(; lookup->name != NULL; lookup++) {
+	for(; lookup->compatible != NULL; lookup++) {
 		if (!of_device_is_compatible(np, lookup->compatible))
 			continue;
 		if (of_address_to_resource(np, 0, &res))
diff --git a/drivers/oprofile/nmi_timer_int.c b/drivers/oprofile/nmi_timer_int.c
new file mode 100644
index 000000000000..76f1c9357f39
--- /dev/null
+++ b/drivers/oprofile/nmi_timer_int.c
@@ -0,0 +1,173 @@
+/**
+ * @file nmi_timer_int.c
+ *
+ * @remark Copyright 2011 Advanced Micro Devices, Inc.
+ *
+ * @author Robert Richter <robert.richter@amd.com>
+ */
+
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/oprofile.h>
+#include <linux/perf_event.h>
+
+#ifdef CONFIG_OPROFILE_NMI_TIMER
+
+static DEFINE_PER_CPU(struct perf_event *, nmi_timer_events);
+static int ctr_running;
+
+static struct perf_event_attr nmi_timer_attr = {
+	.type		= PERF_TYPE_HARDWARE,
+	.config		= PERF_COUNT_HW_CPU_CYCLES,
+	.size		= sizeof(struct perf_event_attr),
+	.pinned		= 1,
+	.disabled	= 1,
+};
+
+static void nmi_timer_callback(struct perf_event *event,
+			       struct perf_sample_data *data,
+			       struct pt_regs *regs)
+{
+	event->hw.interrupts = 0;	/* don't throttle interrupts */
+	oprofile_add_sample(regs, 0);
+}
+
+static int nmi_timer_start_cpu(int cpu)
+{
+	struct perf_event *event = per_cpu(nmi_timer_events, cpu);
+
+	if (!event) {
+		event = perf_event_create_kernel_counter(&nmi_timer_attr, cpu, NULL,
+							 nmi_timer_callback, NULL);
+		if (IS_ERR(event))
+			return PTR_ERR(event);
+		per_cpu(nmi_timer_events, cpu) = event;
+	}
+
+	if (event && ctr_running)
+		perf_event_enable(event);
+
+	return 0;
+}
+
+static void nmi_timer_stop_cpu(int cpu)
+{
+	struct perf_event *event = per_cpu(nmi_timer_events, cpu);
+
+	if (event && ctr_running)
+		perf_event_disable(event);
+}
+
+static int nmi_timer_cpu_notifier(struct notifier_block *b, unsigned long action,
+				  void *data)
+{
+	int cpu = (unsigned long)data;
+	switch (action) {
+	case CPU_DOWN_FAILED:
+	case CPU_ONLINE:
+		nmi_timer_start_cpu(cpu);
+		break;
+	case CPU_DOWN_PREPARE:
+		nmi_timer_stop_cpu(cpu);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nmi_timer_cpu_nb = {
+	.notifier_call = nmi_timer_cpu_notifier
+};
+
+static int nmi_timer_start(void)
+{
+	int cpu;
+
+	get_online_cpus();
+	ctr_running = 1;
+	for_each_online_cpu(cpu)
+		nmi_timer_start_cpu(cpu);
+	put_online_cpus();
+
+	return 0;
+}
+
+static void nmi_timer_stop(void)
+{
+	int cpu;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		nmi_timer_stop_cpu(cpu);
+	ctr_running = 0;
+	put_online_cpus();
+}
+
+static void nmi_timer_shutdown(void)
+{
+	struct perf_event *event;
+	int cpu;
+
+	get_online_cpus();
+	unregister_cpu_notifier(&nmi_timer_cpu_nb);
+	for_each_possible_cpu(cpu) {
+		event = per_cpu(nmi_timer_events, cpu);
+		if (!event)
+			continue;
+		perf_event_disable(event);
+		per_cpu(nmi_timer_events, cpu) = NULL;
+		perf_event_release_kernel(event);
+	}
+
+	put_online_cpus();
+}
+
+static int nmi_timer_setup(void)
+{
+	int cpu, err;
+	u64 period;
+
+	/* clock cycles per tick: */
+	period = (u64)cpu_khz * 1000;
+	do_div(period, HZ);
+	nmi_timer_attr.sample_period = period;
+
+	get_online_cpus();
+	err = register_cpu_notifier(&nmi_timer_cpu_nb);
+	if (err)
+		goto out;
+	/* can't attach events to offline cpus: */
+	for_each_online_cpu(cpu) {
+		err = nmi_timer_start_cpu(cpu);
+		if (err)
+			break;
+	}
+	if (err)
+		nmi_timer_shutdown();
+out:
+	put_online_cpus();
+	return err;
+}
+
+int __init op_nmi_timer_init(struct oprofile_operations *ops)
+{
+	int err = 0;
+
+	err = nmi_timer_setup();
+	if (err)
+		return err;
+	nmi_timer_shutdown();		/* only check, don't alloc */
+
+	ops->create_files	= NULL;
+	ops->setup		= nmi_timer_setup;
+	ops->shutdown		= nmi_timer_shutdown;
+	ops->start		= nmi_timer_start;
+	ops->stop		= nmi_timer_stop;
+	ops->cpu_type		= "timer";
+
+	printk(KERN_INFO "oprofile: using NMI timer interrupt.\n");
+
+	return 0;
+}
+
+#endif
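
The sample period in nmi_timer_setup() is simply "CPU cycles per timer tick": cpu_khz * 1000 cycles per second divided by HZ. A quick user-space check of the arithmetic (the CPU speed and HZ values below are assumptions for the example):

    #include <stdio.h>

    int main(void)
    {
    	unsigned long long cpu_khz = 2400000ULL;	/* assume a 2.4 GHz CPU */
    	unsigned long long hz = 250ULL;			/* assume CONFIG_HZ=250 */
    	unsigned long long period = cpu_khz * 1000ULL / hz;

    	/* 2.4e9 cycles/s divided by 250 ticks/s = 9,600,000 cycles per tick */
    	printf("sample_period = %llu cycles\n", period);
    	return 0;
    }
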
diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c
index f8c752e408a6..ed2c3ec07024 100644
--- a/drivers/oprofile/oprof.c
+++ b/drivers/oprofile/oprof.c
@@ -246,37 +246,31 @@ static int __init oprofile_init(void)
 	int err;
 
 	/* always init architecture to setup backtrace support */
+	timer_mode = 0;
 	err = oprofile_arch_init(&oprofile_ops);
+	if (!err) {
+		if (!timer && !oprofilefs_register())
+			return 0;
+		oprofile_arch_exit();
+	}
 
-	timer_mode = err || timer;	/* fall back to timer mode on errors */
-	if (timer_mode) {
-		if (!err)
-			oprofile_arch_exit();
+	/* setup timer mode: */
+	timer_mode = 1;
+	/* no nmi timer mode if oprofile.timer is set */
+	if (timer || op_nmi_timer_init(&oprofile_ops)) {
 		err = oprofile_timer_init(&oprofile_ops);
 		if (err)
 			return err;
 	}
 
-	err = oprofilefs_register();
-	if (!err)
-		return 0;
-
-	/* failed */
-	if (timer_mode)
-		oprofile_timer_exit();
-	else
-		oprofile_arch_exit();
-
-	return err;
+	return oprofilefs_register();
 }
 
 
 static void __exit oprofile_exit(void)
 {
 	oprofilefs_unregister();
-	if (timer_mode)
-		oprofile_timer_exit();
-	else
+	if (!timer_mode)
 		oprofile_arch_exit();
 }
 
diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h
index 177b73de5e5f..d32ef816337c 100644
--- a/drivers/oprofile/oprof.h
+++ b/drivers/oprofile/oprof.h
@@ -35,7 +35,15 @@ struct dentry;
 
 void oprofile_create_files(struct super_block *sb, struct dentry *root);
 int oprofile_timer_init(struct oprofile_operations *ops);
-void oprofile_timer_exit(void);
+#ifdef CONFIG_OPROFILE_NMI_TIMER
+int op_nmi_timer_init(struct oprofile_operations *ops);
+#else
+static inline int op_nmi_timer_init(struct oprofile_operations *ops)
+{
+	return -ENODEV;
+}
+#endif
+
 
 int oprofile_set_ulong(unsigned long *addr, unsigned long val);
 int oprofile_set_timeout(unsigned long time);
diff --git a/drivers/oprofile/timer_int.c b/drivers/oprofile/timer_int.c
index 878fba126582..93404f72dfa8 100644
--- a/drivers/oprofile/timer_int.c
+++ b/drivers/oprofile/timer_int.c
@@ -97,24 +97,24 @@ static struct notifier_block __refdata oprofile_cpu_notifier = {
 	.notifier_call = oprofile_cpu_notify,
 };
 
-int oprofile_timer_init(struct oprofile_operations *ops)
+static int oprofile_hrtimer_setup(void)
 {
-	int rc;
-
-	rc = register_hotcpu_notifier(&oprofile_cpu_notifier);
-	if (rc)
-		return rc;
-	ops->create_files = NULL;
-	ops->setup = NULL;
-	ops->shutdown = NULL;
-	ops->start = oprofile_hrtimer_start;
-	ops->stop = oprofile_hrtimer_stop;
-	ops->cpu_type = "timer";
-	printk(KERN_INFO "oprofile: using timer interrupt.\n");
-	return 0;
+	return register_hotcpu_notifier(&oprofile_cpu_notifier);
 }
 
-void oprofile_timer_exit(void)
+static void oprofile_hrtimer_shutdown(void)
 {
 	unregister_hotcpu_notifier(&oprofile_cpu_notifier);
 }
+
+int oprofile_timer_init(struct oprofile_operations *ops)
+{
+	ops->create_files = NULL;
+	ops->setup = oprofile_hrtimer_setup;
+	ops->shutdown = oprofile_hrtimer_shutdown;
+	ops->start = oprofile_hrtimer_start;
+	ops->stop = oprofile_hrtimer_stop;
+	ops->cpu_type = "timer";
+	printk(KERN_INFO "oprofile: using timer interrupt.\n");
+	return 0;
+}
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index f02b5235056d..37856f7c7781 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -98,11 +98,11 @@ config PCI_PASID
 	  If unsure, say N.
 
 config PCI_IOAPIC
-	bool
+	tristate "PCI IO-APIC hotplug support" if X86
 	depends on PCI
 	depends on ACPI
 	depends on HOTPLUG
-	default y
+	default !X86
 
 config PCI_LABEL
 	def_bool y if (DMI || ACPI)
diff --git a/drivers/pci/ioapic.c b/drivers/pci/ioapic.c
index 5775638ac017..205af8dc83c2 100644
--- a/drivers/pci/ioapic.c
+++ b/drivers/pci/ioapic.c
@@ -17,7 +17,7 @@
  */
 
 #include <linux/pci.h>
-#include <linux/export.h>
+#include <linux/module.h>
 #include <linux/acpi.h>
 #include <linux/slab.h>
 #include <acpi/acpi_bus.h>
@@ -27,7 +27,7 @@ struct ioapic {
 	u32 gsi_base;
 };
 
-static int ioapic_probe(struct pci_dev *dev, const struct pci_device_id *ent)
+static int __devinit ioapic_probe(struct pci_dev *dev, const struct pci_device_id *ent)
 {
 	acpi_handle handle;
 	acpi_status status;
@@ -88,7 +88,7 @@ exit_free:
 	return -ENODEV;
 }
 
-static void ioapic_remove(struct pci_dev *dev)
+static void __devexit ioapic_remove(struct pci_dev *dev)
 {
 	struct ioapic *ioapic = pci_get_drvdata(dev);
 
@@ -99,13 +99,12 @@ static void ioapic_remove(struct pci_dev *dev)
 }
 
 
-static struct pci_device_id ioapic_devices[] = {
-	{ PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
-	  PCI_CLASS_SYSTEM_PIC_IOAPIC << 8, 0xffff00, },
-	{ PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
-	  PCI_CLASS_SYSTEM_PIC_IOXAPIC << 8, 0xffff00, },
+static DEFINE_PCI_DEVICE_TABLE(ioapic_devices) = {
+	{ PCI_DEVICE_CLASS(PCI_CLASS_SYSTEM_PIC_IOAPIC, ~0) },
+	{ PCI_DEVICE_CLASS(PCI_CLASS_SYSTEM_PIC_IOXAPIC, ~0) },
 	{ }
 };
+MODULE_DEVICE_TABLE(pci, ioapic_devices);
 
 static struct pci_driver ioapic_driver = {
 	.name = "ioapic",
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index 3bcc7cfcaba7..8e286259a007 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -73,8 +73,6 @@ int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm)
73 err = -EINVAL; 73 err = -EINVAL;
74 74
75 mutex_unlock(&rtc->ops_lock); 75 mutex_unlock(&rtc->ops_lock);
76 /* A timer might have just expired */
77 schedule_work(&rtc->irqwork);
78 return err; 76 return err;
79} 77}
80EXPORT_SYMBOL_GPL(rtc_set_time); 78EXPORT_SYMBOL_GPL(rtc_set_time);
@@ -114,8 +112,6 @@ int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs)
114 err = -EINVAL; 112 err = -EINVAL;
115 113
116 mutex_unlock(&rtc->ops_lock); 114 mutex_unlock(&rtc->ops_lock);
117 /* A timer might have just expired */
118 schedule_work(&rtc->irqwork);
119 115
120 return err; 116 return err;
121} 117}
@@ -323,20 +319,6 @@ int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
323} 319}
324EXPORT_SYMBOL_GPL(rtc_read_alarm); 320EXPORT_SYMBOL_GPL(rtc_read_alarm);
325 321
326static int ___rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
327{
328 int err;
329
330 if (!rtc->ops)
331 err = -ENODEV;
332 else if (!rtc->ops->set_alarm)
333 err = -EINVAL;
334 else
335 err = rtc->ops->set_alarm(rtc->dev.parent, alarm);
336
337 return err;
338}
339
340static int __rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) 322static int __rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
341{ 323{
342 struct rtc_time tm; 324 struct rtc_time tm;
@@ -360,7 +342,14 @@ static int __rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
360 * over right here, before we set the alarm. 342 * over right here, before we set the alarm.
361 */ 343 */
362 344
363 return ___rtc_set_alarm(rtc, alarm); 345 if (!rtc->ops)
346 err = -ENODEV;
347 else if (!rtc->ops->set_alarm)
348 err = -EINVAL;
349 else
350 err = rtc->ops->set_alarm(rtc->dev.parent, alarm);
351
352 return err;
364} 353}
365 354
366int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) 355int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
@@ -407,8 +396,6 @@ int rtc_initialize_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
407 timerqueue_add(&rtc->timerqueue, &rtc->aie_timer.node); 396 timerqueue_add(&rtc->timerqueue, &rtc->aie_timer.node);
408 } 397 }
409 mutex_unlock(&rtc->ops_lock); 398 mutex_unlock(&rtc->ops_lock);
410 /* maybe that was in the past.*/
411 schedule_work(&rtc->irqwork);
412 return err; 399 return err;
413} 400}
414EXPORT_SYMBOL_GPL(rtc_initialize_alarm); 401EXPORT_SYMBOL_GPL(rtc_initialize_alarm);
@@ -776,20 +763,6 @@ static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer)
776 return 0; 763 return 0;
777} 764}
778 765
779static void rtc_alarm_disable(struct rtc_device *rtc)
780{
781 struct rtc_wkalrm alarm;
782 struct rtc_time tm;
783
784 __rtc_read_time(rtc, &tm);
785
786 alarm.time = rtc_ktime_to_tm(ktime_add(rtc_tm_to_ktime(tm),
787 ktime_set(300, 0)));
788 alarm.enabled = 0;
789
790 ___rtc_set_alarm(rtc, &alarm);
791}
792
793/** 766/**
794 * rtc_timer_remove - Removes a rtc_timer from the rtc_device timerqueue 767 * rtc_timer_remove - Removes a rtc_timer from the rtc_device timerqueue
795 * @rtc: rtc device 768 * @rtc: rtc device
@@ -811,10 +784,8 @@ static void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer)
811 struct rtc_wkalrm alarm; 784 struct rtc_wkalrm alarm;
812 int err; 785 int err;
813 next = timerqueue_getnext(&rtc->timerqueue); 786 next = timerqueue_getnext(&rtc->timerqueue);
814 if (!next) { 787 if (!next)
815 rtc_alarm_disable(rtc);
816 return; 788 return;
817 }
818 alarm.time = rtc_ktime_to_tm(next->expires); 789 alarm.time = rtc_ktime_to_tm(next->expires);
819 alarm.enabled = 1; 790 alarm.enabled = 1;
820 err = __rtc_set_alarm(rtc, &alarm); 791 err = __rtc_set_alarm(rtc, &alarm);
@@ -876,8 +847,7 @@ again:
876 err = __rtc_set_alarm(rtc, &alarm); 847 err = __rtc_set_alarm(rtc, &alarm);
877 if (err == -ETIME) 848 if (err == -ETIME)
878 goto again; 849 goto again;
879 } else 850 }
880 rtc_alarm_disable(rtc);
881 851
882 mutex_unlock(&rtc->ops_lock); 852 mutex_unlock(&rtc->ops_lock);
883} 853}
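With the schedule_work() kicks and the rtc_alarm_disable() fallback reverted, reprogramming after a timer removal reduces to re-arming from the new queue head and doing nothing when the queue is empty. A condensed sketch using this file's own helpers:

	static void reprogram_sketch(struct rtc_device *rtc)
	{
		struct timerqueue_node *next = timerqueue_getnext(&rtc->timerqueue);
		struct rtc_wkalrm alarm;

		if (!next)	/* queue empty: leave the hardware alarm alone */
			return;
		alarm.time = rtc_ktime_to_tm(next->expires);
		alarm.enabled = 1;
		__rtc_set_alarm(rtc, &alarm);
	}
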
diff --git a/drivers/watchdog/coh901327_wdt.c b/drivers/watchdog/coh901327_wdt.c
index 03f449a430d2..5b89f7d6cd0f 100644
--- a/drivers/watchdog/coh901327_wdt.c
+++ b/drivers/watchdog/coh901327_wdt.c
@@ -76,8 +76,6 @@ static int irq;
76static void __iomem *virtbase; 76static void __iomem *virtbase;
77static unsigned long coh901327_users; 77static unsigned long coh901327_users;
78static unsigned long boot_status; 78static unsigned long boot_status;
79static u16 wdogenablestore;
80static u16 irqmaskstore;
81static struct device *parent; 79static struct device *parent;
82 80
83/* 81/*
@@ -461,6 +459,10 @@ out:
461} 459}
462 460
463#ifdef CONFIG_PM 461#ifdef CONFIG_PM
462
463static u16 wdogenablestore;
464static u16 irqmaskstore;
465
464static int coh901327_suspend(struct platform_device *pdev, pm_message_t state) 466static int coh901327_suspend(struct platform_device *pdev, pm_message_t state)
465{ 467{
466 irqmaskstore = readw(virtbase + U300_WDOG_IMR) & 0x0001U; 468 irqmaskstore = readw(virtbase + U300_WDOG_IMR) & 0x0001U;
diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c
index 3774c9b8dac9..8464ea1c36a1 100644
--- a/drivers/watchdog/hpwdt.c
+++ b/drivers/watchdog/hpwdt.c
@@ -231,6 +231,7 @@ static int __devinit cru_detect(unsigned long map_entry,
231 231
232 cmn_regs.u1.reax = CRU_BIOS_SIGNATURE_VALUE; 232 cmn_regs.u1.reax = CRU_BIOS_SIGNATURE_VALUE;
233 233
234 set_memory_x((unsigned long)bios32_entrypoint, (2 * PAGE_SIZE));
234 asminline_call(&cmn_regs, bios32_entrypoint); 235 asminline_call(&cmn_regs, bios32_entrypoint);
235 236
236 if (cmn_regs.u1.ral != 0) { 237 if (cmn_regs.u1.ral != 0) {
@@ -248,8 +249,10 @@ static int __devinit cru_detect(unsigned long map_entry,
248 if ((physical_bios_base + physical_bios_offset)) { 249 if ((physical_bios_base + physical_bios_offset)) {
249 cru_rom_addr = 250 cru_rom_addr =
250 ioremap(cru_physical_address, cru_length); 251 ioremap(cru_physical_address, cru_length);
251 if (cru_rom_addr) 252 if (cru_rom_addr) {
253 set_memory_x((unsigned long)cru_rom_addr, cru_length);
252 retval = 0; 254 retval = 0;
255 }
253 } 256 }
254 257
255 printk(KERN_DEBUG "hpwdt: CRU Base Address: 0x%lx\n", 258 printk(KERN_DEBUG "hpwdt: CRU Base Address: 0x%lx\n",
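The added set_memory_x() calls clear the NX bit on the BIOS32 entry point and the remapped CRU ROM before branching into firmware code that lives there. Worth noting: set_memory_x() takes (addr, numpages), not a byte length, so the intended pattern looks like this sketch:

	unsigned long addr = (unsigned long)bios32_entrypoint & PAGE_MASK;

	set_memory_x(addr, 2);	/* two pages: the entry may straddle a boundary */
	asminline_call(&cmn_regs, bios32_entrypoint);
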
diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c
index ba6ad662635a..99796c5d913d 100644
--- a/drivers/watchdog/iTCO_wdt.c
+++ b/drivers/watchdog/iTCO_wdt.c
@@ -384,10 +384,10 @@ MODULE_PARM_DESC(nowayout,
384 "Watchdog cannot be stopped once started (default=" 384 "Watchdog cannot be stopped once started (default="
385 __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); 385 __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
386 386
387static int turn_SMI_watchdog_clear_off = 0; 387static int turn_SMI_watchdog_clear_off = 1;
388module_param(turn_SMI_watchdog_clear_off, int, 0); 388module_param(turn_SMI_watchdog_clear_off, int, 0);
389MODULE_PARM_DESC(turn_SMI_watchdog_clear_off, 389MODULE_PARM_DESC(turn_SMI_watchdog_clear_off,
390 "Turn off SMI clearing watchdog (default=0)"); 390 "Turn off SMI clearing watchdog (depends on TCO-version)(default=1)");
391 391
392/* 392/*
393 * Some TCO specific functions 393 * Some TCO specific functions
@@ -813,7 +813,7 @@ static int __devinit iTCO_wdt_init(struct pci_dev *pdev,
813 ret = -EIO; 813 ret = -EIO;
814 goto out_unmap; 814 goto out_unmap;
815 } 815 }
816 if (turn_SMI_watchdog_clear_off) { 816 if (turn_SMI_watchdog_clear_off >= iTCO_wdt_private.iTCO_version) {
817 /* Bit 13: TCO_EN -> 0 = Disables TCO logic generating an SMI# */ 817 /* Bit 13: TCO_EN -> 0 = Disables TCO logic generating an SMI# */
818 val32 = inl(SMI_EN); 818 val32 = inl(SMI_EN);
819 val32 &= 0xffffdfff; /* Turn off SMI clearing watchdog */ 819 val32 &= 0xffffdfff; /* Turn off SMI clearing watchdog */
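This repurposes turn_SMI_watchdog_clear_off from a boolean into a TCO-version threshold. A sketch of the resulting gate, using the driver's own identifiers:

	/*
	 * turn_SMI_watchdog_clear_off = 0 -> never touch TCO_EN
	 * turn_SMI_watchdog_clear_off = 1 -> clear TCO_EN on TCO v1 only (default)
	 * turn_SMI_watchdog_clear_off = 2 -> clear TCO_EN on v1 and v2
	 */
	if (turn_SMI_watchdog_clear_off >= iTCO_wdt_private.iTCO_version) {
		u32 val32 = inl(SMI_EN);

		val32 &= 0xffffdfff;	/* bit 13: TCO_EN -> 0, no SMI# */
		outl(val32, SMI_EN);
	}
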
diff --git a/drivers/watchdog/sp805_wdt.c b/drivers/watchdog/sp805_wdt.c
index cc2cfbe33b30..bfaf9bb1ee0d 100644
--- a/drivers/watchdog/sp805_wdt.c
+++ b/drivers/watchdog/sp805_wdt.c
@@ -351,7 +351,7 @@ static int __devexit sp805_wdt_remove(struct amba_device *adev)
351 return 0; 351 return 0;
352} 352}
353 353
354static struct amba_id sp805_wdt_ids[] __initdata = { 354static struct amba_id sp805_wdt_ids[] = {
355 { 355 {
356 .id = 0x00141805, 356 .id = 0x00141805,
357 .mask = 0x00ffffff, 357 .mask = 0x00ffffff,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 3eeb97661262..98954003a8d3 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1094,42 +1094,19 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1094/* 1094/*
1095 * Set/clear/test dir complete flag on the dir's dentry. 1095 * Set/clear/test dir complete flag on the dir's dentry.
1096 */ 1096 */
1097static struct dentry * __d_find_any_alias(struct inode *inode)
1098{
1099 struct dentry *alias;
1100
1101 if (list_empty(&inode->i_dentry))
1102 return NULL;
1103 alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias);
1104 return alias;
1105}
1106
1107void ceph_dir_set_complete(struct inode *inode) 1097void ceph_dir_set_complete(struct inode *inode)
1108{ 1098{
1109 struct dentry *dentry = __d_find_any_alias(inode); 1099 /* not yet implemented */
1110
1111 if (dentry && ceph_dentry(dentry)) {
1112 dout(" marking %p (%p) complete\n", inode, dentry);
1113 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1114 }
1115} 1100}
1116 1101
1117void ceph_dir_clear_complete(struct inode *inode) 1102void ceph_dir_clear_complete(struct inode *inode)
1118{ 1103{
1119 struct dentry *dentry = __d_find_any_alias(inode); 1104 /* not yet implemented */
1120
1121 if (dentry && ceph_dentry(dentry)) {
1122 dout(" marking %p (%p) NOT complete\n", inode, dentry);
1123 clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1124 }
1125} 1105}
1126 1106
1127bool ceph_dir_test_complete(struct inode *inode) 1107bool ceph_dir_test_complete(struct inode *inode)
1128{ 1108{
1129 struct dentry *dentry = __d_find_any_alias(inode); 1109 /* not yet implemented */
1130
1131 if (dentry && ceph_dentry(dentry))
1132 return test_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1133 return false; 1110 return false;
1134} 1111}
1135 1112
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 8cd4b52d4217..f3670cf72587 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -282,7 +282,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
282 byte_count = be32_to_cpu(pTargetSMB->smb_buf_length); 282 byte_count = be32_to_cpu(pTargetSMB->smb_buf_length);
283 byte_count += total_in_buf2; 283 byte_count += total_in_buf2;
284 /* don't allow buffer to overflow */ 284 /* don't allow buffer to overflow */
285 if (byte_count > CIFSMaxBufSize) 285 if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)
286 return -ENOBUFS; 286 return -ENOBUFS;
287 pTargetSMB->smb_buf_length = cpu_to_be32(byte_count); 287 pTargetSMB->smb_buf_length = cpu_to_be32(byte_count);
288 288
@@ -2122,7 +2122,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
2122 warned_on_ntlm = true; 2122 warned_on_ntlm = true;
2123 cERROR(1, "default security mechanism requested. The default " 2123 cERROR(1, "default security mechanism requested. The default "
2124 "security mechanism will be upgraded from ntlm to " 2124 "security mechanism will be upgraded from ntlm to "
2125 "ntlmv2 in kernel release 3.2"); 2125 "ntlmv2 in kernel release 3.3");
2126 } 2126 }
2127 ses->overrideSecFlg = volume_info->secFlg; 2127 ses->overrideSecFlg = volume_info->secFlg;
2128 2128
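The old coalesce_t2() bound compared against CIFSMaxBufSize alone and so rejected responses that still fit the allocation: the buffer backing a large response holds CIFSMaxBufSize + MAX_CIFS_HDR_SIZE bytes, and smb_buf_length excludes the 4-byte RFC1002 length field. A sketch of the corrected check as a predicate:

	static inline bool t2_fits(unsigned int byte_count)
	{
		return byte_count <= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4;
	}
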
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 51352de88ef1..a10e428b32b4 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1506,35 +1506,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
1506 return -ENOIOCTLCMD; 1506 return -ENOIOCTLCMD;
1507} 1507}
1508 1508
1509static void compat_ioctl_error(struct file *filp, unsigned int fd,
1510 unsigned int cmd, unsigned long arg)
1511{
1512 char buf[10];
1513 char *fn = "?";
1514 char *path;
1515
1516 /* find the name of the device. */
1517 path = (char *)__get_free_page(GFP_KERNEL);
1518 if (path) {
1519 fn = d_path(&filp->f_path, path, PAGE_SIZE);
1520 if (IS_ERR(fn))
1521 fn = "?";
1522 }
1523
1524 sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK);
1525 if (!isprint(buf[1]))
1526 sprintf(buf, "%02x", buf[1]);
1527 compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
1528 "cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n",
1529 current->comm, current->pid,
1530 (int)fd, (unsigned int)cmd, buf,
1531 (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK,
1532 (unsigned int)arg, fn);
1533
1534 if (path)
1535 free_page((unsigned long)path);
1536}
1537
1538static int compat_ioctl_check_table(unsigned int xcmd) 1509static int compat_ioctl_check_table(unsigned int xcmd)
1539{ 1510{
1540 int i; 1511 int i;
@@ -1621,13 +1592,8 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
1621 goto found_handler; 1592 goto found_handler;
1622 1593
1623 error = do_ioctl_trans(fd, cmd, arg, filp); 1594 error = do_ioctl_trans(fd, cmd, arg, filp);
1624 if (error == -ENOIOCTLCMD) { 1595 if (error == -ENOIOCTLCMD)
1625 static int count; 1596 error = -ENOTTY;
1626
1627 if (++count <= 50)
1628 compat_ioctl_error(filp, fd, cmd, arg);
1629 error = -EINVAL;
1630 }
1631 1597
1632 goto out_fput; 1598 goto out_fput;
1633 1599
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 1d9b9fcb2db4..066836e81848 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -42,7 +42,7 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd,
42 42
43 error = filp->f_op->unlocked_ioctl(filp, cmd, arg); 43 error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
44 if (error == -ENOIOCTLCMD) 44 if (error == -ENOIOCTLCMD)
45 error = -EINVAL; 45 error = -ENOTTY;
46 out: 46 out:
47 return error; 47 return error;
48} 48}
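Both the compat and native paths now translate -ENOIOCTLCMD, a kernel-internal "no such ioctl" marker, into -ENOTTY ("inappropriate ioctl for device") at the syscall boundary; -EINVAL wrongly suggested bad arguments. Sketch of the convention (dispatch_sketch is a hypothetical stand-in for the real dispatch):

	static long ioctl_boundary_sketch(struct file *filp, unsigned int cmd,
					  unsigned long arg)
	{
		long ret = dispatch_sketch(filp, cmd, arg);	/* hypothetical */

		if (ret == -ENOIOCTLCMD)	/* must never leak to userspace */
			ret = -ENOTTY;
		return ret;
	}
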
diff --git a/fs/locks.c b/fs/locks.c
index 3b0d05dcd7c1..637694bf3a03 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1205,6 +1205,8 @@ int __break_lease(struct inode *inode, unsigned int mode)
1205 int want_write = (mode & O_ACCMODE) != O_RDONLY; 1205 int want_write = (mode & O_ACCMODE) != O_RDONLY;
1206 1206
1207 new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); 1207 new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
1208 if (IS_ERR(new_fl))
1209 return PTR_ERR(new_fl);
1208 1210
1209 lock_flocks(); 1211 lock_flocks();
1210 1212
@@ -1221,12 +1223,6 @@ int __break_lease(struct inode *inode, unsigned int mode)
1221 if (fl->fl_owner == current->files) 1223 if (fl->fl_owner == current->files)
1222 i_have_this_lease = 1; 1224 i_have_this_lease = 1;
1223 1225
1224 if (IS_ERR(new_fl) && !i_have_this_lease
1225 && ((mode & O_NONBLOCK) == 0)) {
1226 error = PTR_ERR(new_fl);
1227 goto out;
1228 }
1229
1230 break_time = 0; 1226 break_time = 0;
1231 if (lease_break_time > 0) { 1227 if (lease_break_time > 0) {
1232 break_time = jiffies + lease_break_time * HZ; 1228 break_time = jiffies + lease_break_time * HZ;
@@ -1284,8 +1280,7 @@ restart:
1284 1280
1285out: 1281out:
1286 unlock_flocks(); 1282 unlock_flocks();
1287 if (!IS_ERR(new_fl)) 1283 locks_free_lock(new_fl);
1288 locks_free_lock(new_fl);
1289 return error; 1284 return error;
1290} 1285}
1291 1286
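Checking lease_alloc() up front, before lock_flocks(), means every later path may assume new_fl is a real lock, so the exit path can free it unconditionally. The simplified shape, sketched with this function's identifiers:

	new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
	if (IS_ERR(new_fl))
		return PTR_ERR(new_fl);	/* nothing locked yet, nothing to undo */

	lock_flocks();
	/* ... walk i_flock, mark leases breaking, maybe block ... */
	unlock_flocks();
	locks_free_lock(new_fl);	/* always a valid lock here */
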
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 1d9e33966db0..4d46a6a59070 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -263,23 +263,6 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
263 goto out_no_root; 263 goto out_no_root;
264 } 264 }
265 265
266 ret = -ENOMEM;
267 s->s_root = d_alloc_root(root_inode);
268 if (!s->s_root)
269 goto out_iput;
270
271 if (!(s->s_flags & MS_RDONLY)) {
272 if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */
273 ms->s_state &= ~MINIX_VALID_FS;
274 mark_buffer_dirty(bh);
275 }
276 if (!(sbi->s_mount_state & MINIX_VALID_FS))
277 printk("MINIX-fs: mounting unchecked file system, "
278 "running fsck is recommended\n");
279 else if (sbi->s_mount_state & MINIX_ERROR_FS)
280 printk("MINIX-fs: mounting file system with errors, "
281 "running fsck is recommended\n");
282
283 /* Apparently minix can create filesystems that allocate more blocks for 266 /* Apparently minix can create filesystems that allocate more blocks for
284 * the bitmaps than needed. We simply ignore that, but verify it didn't 267 * the bitmaps than needed. We simply ignore that, but verify it didn't
285 * create one with not enough blocks and bail out if so. 268 * create one with not enough blocks and bail out if so.
@@ -300,6 +283,23 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
300 goto out_iput; 283 goto out_iput;
301 } 284 }
302 285
286 ret = -ENOMEM;
287 s->s_root = d_alloc_root(root_inode);
288 if (!s->s_root)
289 goto out_iput;
290
291 if (!(s->s_flags & MS_RDONLY)) {
292 if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */
293 ms->s_state &= ~MINIX_VALID_FS;
294 mark_buffer_dirty(bh);
295 }
296 if (!(sbi->s_mount_state & MINIX_VALID_FS))
297 printk("MINIX-fs: mounting unchecked file system, "
298 "running fsck is recommended\n");
299 else if (sbi->s_mount_state & MINIX_ERROR_FS)
300 printk("MINIX-fs: mounting file system with errors, "
301 "running fsck is recommended\n");
302
303 return 0; 303 return 0;
304 304
305out_iput: 305out_iput:
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3a1dafd228d1..8c344f037bd0 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -394,8 +394,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
394 394
395 sigemptyset(&sigign); 395 sigemptyset(&sigign);
396 sigemptyset(&sigcatch); 396 sigemptyset(&sigcatch);
397 cutime = cstime = utime = stime = cputime_zero; 397 cutime = cstime = utime = stime = 0;
398 cgtime = gtime = cputime_zero; 398 cgtime = gtime = 0;
399 399
400 if (lock_task_sighand(task, &flags)) { 400 if (lock_task_sighand(task, &flags)) {
401 struct signal_struct *sig = task->signal; 401 struct signal_struct *sig = task->signal;
@@ -423,14 +423,14 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
423 do { 423 do {
424 min_flt += t->min_flt; 424 min_flt += t->min_flt;
425 maj_flt += t->maj_flt; 425 maj_flt += t->maj_flt;
426 gtime = cputime_add(gtime, t->gtime); 426 gtime += t->gtime;
427 t = next_thread(t); 427 t = next_thread(t);
428 } while (t != task); 428 } while (t != task);
429 429
430 min_flt += sig->min_flt; 430 min_flt += sig->min_flt;
431 maj_flt += sig->maj_flt; 431 maj_flt += sig->maj_flt;
432 thread_group_times(task, &utime, &stime); 432 thread_group_times(task, &utime, &stime);
433 gtime = cputime_add(gtime, sig->gtime); 433 gtime += sig->gtime;
434 } 434 }
435 435
436 sid = task_session_nr_ns(task, ns); 436 sid = task_session_nr_ns(task, ns);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 2a30d67dd6b8..d76ca6ae2b1b 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -22,31 +22,29 @@
22#define arch_idle_time(cpu) 0 22#define arch_idle_time(cpu) 0
23#endif 23#endif
24 24
25static cputime64_t get_idle_time(int cpu) 25static u64 get_idle_time(int cpu)
26{ 26{
27 u64 idle_time = get_cpu_idle_time_us(cpu, NULL); 27 u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL);
28 cputime64_t idle;
29 28
30 if (idle_time == -1ULL) { 29 if (idle_time == -1ULL) {
31 /* !NO_HZ so we can rely on cpustat.idle */ 30 /* !NO_HZ so we can rely on cpustat.idle */
32 idle = kstat_cpu(cpu).cpustat.idle; 31 idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
33 idle = cputime64_add(idle, arch_idle_time(cpu)); 32 idle += arch_idle_time(cpu);
34 } else 33 } else
35 idle = nsecs_to_jiffies64(1000 * idle_time); 34 idle = usecs_to_cputime64(idle_time);
36 35
37 return idle; 36 return idle;
38} 37}
39 38
40static cputime64_t get_iowait_time(int cpu) 39static u64 get_iowait_time(int cpu)
41{ 40{
42 u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL); 41 u64 iowait, iowait_time = get_cpu_iowait_time_us(cpu, NULL);
43 cputime64_t iowait;
44 42
45 if (iowait_time == -1ULL) 43 if (iowait_time == -1ULL)
46 /* !NO_HZ so we can rely on cpustat.iowait */ 44 /* !NO_HZ so we can rely on cpustat.iowait */
47 iowait = kstat_cpu(cpu).cpustat.iowait; 45 iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
48 else 46 else
49 iowait = nsecs_to_jiffies64(1000 * iowait_time); 47 iowait = usecs_to_cputime64(iowait_time);
50 48
51 return iowait; 49 return iowait;
52} 50}
@@ -55,33 +53,30 @@ static int show_stat(struct seq_file *p, void *v)
55{ 53{
56 int i, j; 54 int i, j;
57 unsigned long jif; 55 unsigned long jif;
58 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; 56 u64 user, nice, system, idle, iowait, irq, softirq, steal;
59 cputime64_t guest, guest_nice; 57 u64 guest, guest_nice;
60 u64 sum = 0; 58 u64 sum = 0;
61 u64 sum_softirq = 0; 59 u64 sum_softirq = 0;
62 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; 60 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
63 struct timespec boottime; 61 struct timespec boottime;
64 62
65 user = nice = system = idle = iowait = 63 user = nice = system = idle = iowait =
66 irq = softirq = steal = cputime64_zero; 64 irq = softirq = steal = 0;
67 guest = guest_nice = cputime64_zero; 65 guest = guest_nice = 0;
68 getboottime(&boottime); 66 getboottime(&boottime);
69 jif = boottime.tv_sec; 67 jif = boottime.tv_sec;
70 68
71 for_each_possible_cpu(i) { 69 for_each_possible_cpu(i) {
72 user = cputime64_add(user, kstat_cpu(i).cpustat.user); 70 user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
73 nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); 71 nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
74 system = cputime64_add(system, kstat_cpu(i).cpustat.system); 72 system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
75 idle = cputime64_add(idle, get_idle_time(i)); 73 idle += get_idle_time(i);
76 iowait = cputime64_add(iowait, get_iowait_time(i)); 74 iowait += get_iowait_time(i);
77 irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); 75 irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
78 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); 76 softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
79 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); 77 steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
80 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); 78 guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
81 guest_nice = cputime64_add(guest_nice, 79 guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
82 kstat_cpu(i).cpustat.guest_nice);
83 sum += kstat_cpu_irqs_sum(i);
84 sum += arch_irq_stat_cpu(i);
85 80
86 for (j = 0; j < NR_SOFTIRQS; j++) { 81 for (j = 0; j < NR_SOFTIRQS; j++) {
87 unsigned int softirq_stat = kstat_softirqs_cpu(j, i); 82 unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
@@ -106,16 +101,16 @@ static int show_stat(struct seq_file *p, void *v)
106 (unsigned long long)cputime64_to_clock_t(guest_nice)); 101 (unsigned long long)cputime64_to_clock_t(guest_nice));
107 for_each_online_cpu(i) { 102 for_each_online_cpu(i) {
108 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ 103 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
109 user = kstat_cpu(i).cpustat.user; 104 user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
110 nice = kstat_cpu(i).cpustat.nice; 105 nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
111 system = kstat_cpu(i).cpustat.system; 106 system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
112 idle = get_idle_time(i); 107 idle = get_idle_time(i);
113 iowait = get_iowait_time(i); 108 iowait = get_iowait_time(i);
114 irq = kstat_cpu(i).cpustat.irq; 109 irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
115 softirq = kstat_cpu(i).cpustat.softirq; 110 softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
116 steal = kstat_cpu(i).cpustat.steal; 111 steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
117 guest = kstat_cpu(i).cpustat.guest; 112 guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
118 guest_nice = kstat_cpu(i).cpustat.guest_nice; 113 guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
119 seq_printf(p, 114 seq_printf(p,
120 "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " 115 "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
121 "%llu\n", 116 "%llu\n",
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 766b1d456050..9610ac772d7e 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -11,15 +11,20 @@ static int uptime_proc_show(struct seq_file *m, void *v)
11{ 11{
12 struct timespec uptime; 12 struct timespec uptime;
13 struct timespec idle; 13 struct timespec idle;
14 u64 idletime;
15 u64 nsec;
16 u32 rem;
14 int i; 17 int i;
15 cputime_t idletime = cputime_zero;
16 18
19 idletime = 0;
17 for_each_possible_cpu(i) 20 for_each_possible_cpu(i)
18 idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle); 21 idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
19 22
20 do_posix_clock_monotonic_gettime(&uptime); 23 do_posix_clock_monotonic_gettime(&uptime);
21 monotonic_to_bootbased(&uptime); 24 monotonic_to_bootbased(&uptime);
22 cputime_to_timespec(idletime, &idle); 25 nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
26 idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
27 idle.tv_nsec = rem;
23 seq_printf(m, "%lu.%02lu %lu.%02lu\n", 28 seq_printf(m, "%lu.%02lu %lu.%02lu\n",
24 (unsigned long) uptime.tv_sec, 29 (unsigned long) uptime.tv_sec,
25 (uptime.tv_nsec / (NSEC_PER_SEC / 100)), 30 (uptime.tv_nsec / (NSEC_PER_SEC / 100)),
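The conversion now goes cputime64 -> jiffies64 -> nanoseconds, then splits with div_u64_rem(). A worked sketch, assuming HZ == 100 so TICK_NSEC == 10000000: 12345 idle jiffies give nsec = 123450000000, hence tv_sec = 123 and tv_nsec = 450000000, which the seq_printf() above renders as "123.45".

	static struct timespec idle_to_timespec_sketch(u64 idletime)
	{
		struct timespec ts;
		u64 nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
		u32 rem;

		ts.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
		ts.tv_nsec = rem;
		return ts;
	}
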
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 3eca58f51ae9..8a899496fd5f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -869,27 +869,6 @@ xfs_fs_dirty_inode(
869} 869}
870 870
871STATIC int 871STATIC int
872xfs_log_inode(
873 struct xfs_inode *ip)
874{
875 struct xfs_mount *mp = ip->i_mount;
876 struct xfs_trans *tp;
877 int error;
878
879 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
880 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
881 if (error) {
882 xfs_trans_cancel(tp, 0);
883 return error;
884 }
885
886 xfs_ilock(ip, XFS_ILOCK_EXCL);
887 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
888 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
889 return xfs_trans_commit(tp, 0);
890}
891
892STATIC int
893xfs_fs_write_inode( 872xfs_fs_write_inode(
894 struct inode *inode, 873 struct inode *inode,
895 struct writeback_control *wbc) 874 struct writeback_control *wbc)
@@ -902,10 +881,8 @@ xfs_fs_write_inode(
902 881
903 if (XFS_FORCED_SHUTDOWN(mp)) 882 if (XFS_FORCED_SHUTDOWN(mp))
904 return -XFS_ERROR(EIO); 883 return -XFS_ERROR(EIO);
905 if (!ip->i_update_core)
906 return 0;
907 884
908 if (wbc->sync_mode == WB_SYNC_ALL) { 885 if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) {
909 /* 886 /*
910 * Make sure the inode has made it into the log. Instead 887 * Make sure the inode has made it into the log. Instead
911 * of forcing it all the way to stable storage using a 888 * of forcing it all the way to stable storage using a
@@ -913,11 +890,14 @@ xfs_fs_write_inode(
913 * ->sync_fs call do that for us, which reduces the number 890 * ->sync_fs call do that for us, which reduces the number
914 * of synchronous log forces dramatically. 891 * of synchronous log forces dramatically.
915 */ 892 */
916 error = xfs_log_inode(ip); 893 error = xfs_log_dirty_inode(ip, NULL, 0);
917 if (error) 894 if (error)
918 goto out; 895 goto out;
919 return 0; 896 return 0;
920 } else { 897 } else {
898 if (!ip->i_update_core)
899 return 0;
900
921 /* 901 /*
922 * We make this non-blocking if the inode is contended, return 902 * We make this non-blocking if the inode is contended, return
923 * EAGAIN to indicate to the caller that they did not succeed. 903 * EAGAIN to indicate to the caller that they did not succeed.
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index be5c51d8f757..f0994aedcd15 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -336,6 +336,32 @@ xfs_sync_fsdata(
336 return error; 336 return error;
337} 337}
338 338
339int
340xfs_log_dirty_inode(
341 struct xfs_inode *ip,
342 struct xfs_perag *pag,
343 int flags)
344{
345 struct xfs_mount *mp = ip->i_mount;
346 struct xfs_trans *tp;
347 int error;
348
349 if (!ip->i_update_core)
350 return 0;
351
352 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
353 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
354 if (error) {
355 xfs_trans_cancel(tp, 0);
356 return error;
357 }
358
359 xfs_ilock(ip, XFS_ILOCK_EXCL);
360 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
361 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
362 return xfs_trans_commit(tp, 0);
363}
364
339/* 365/*
340 * When remounting a filesystem read-only or freezing the filesystem, we have 366 * When remounting a filesystem read-only or freezing the filesystem, we have
341 * two phases to execute. This first phase is syncing the data before we 367 * two phases to execute. This first phase is syncing the data before we
@@ -359,6 +385,16 @@ xfs_quiesce_data(
359{ 385{
360 int error, error2 = 0; 386 int error, error2 = 0;
361 387
388 /*
389 * Log all pending size and timestamp updates. The vfs writeback
390 * code is supposed to do this, but due to its overaggressive
391 * livelock detection it will skip inodes where appending writes
392 * were written out in the first non-blocking sync phase if their
393 * completion took long enough that it happened after taking the
394 * timestamp for the cut-off in the blocking phase.
395 */
396 xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0);
397
362 xfs_qm_sync(mp, SYNC_TRYLOCK); 398 xfs_qm_sync(mp, SYNC_TRYLOCK);
363 xfs_qm_sync(mp, SYNC_WAIT); 399 xfs_qm_sync(mp, SYNC_WAIT);
364 400
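xfs_log_dirty_inode() is shaped as an xfs_inode_ag_iterator() callback, which is why it carries the otherwise unused pag and flags arguments; the early i_update_core test keeps the walk cheap for clean inodes. A sketch of the assumed callback contract:

	/* The iterator passes each in-core inode; returning 0 continues. */
	static int log_if_dirty(struct xfs_inode *ip, struct xfs_perag *pag,
				int flags)
	{
		if (!ip->i_update_core)	/* clean: nothing to log */
			return 0;
		return xfs_log_dirty_inode(ip, pag, flags);
	}
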
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 941202e7ac6e..fa965479d788 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -34,6 +34,8 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
34 34
35void xfs_flush_inodes(struct xfs_inode *ip); 35void xfs_flush_inodes(struct xfs_inode *ip);
36 36
37int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags);
38
37int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); 39int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
38int xfs_reclaim_inodes_count(struct xfs_mount *mp); 40int xfs_reclaim_inodes_count(struct xfs_mount *mp);
39void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); 41void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h
index 62ce6823c0f2..9a62937c56ca 100644
--- a/include/asm-generic/cputime.h
+++ b/include/asm-generic/cputime.h
@@ -4,70 +4,66 @@
4#include <linux/time.h> 4#include <linux/time.h>
5#include <linux/jiffies.h> 5#include <linux/jiffies.h>
6 6
7typedef unsigned long cputime_t; 7typedef unsigned long __nocast cputime_t;
8 8
9#define cputime_zero (0UL)
10#define cputime_one_jiffy jiffies_to_cputime(1) 9#define cputime_one_jiffy jiffies_to_cputime(1)
11#define cputime_max ((~0UL >> 1) - 1) 10#define cputime_to_jiffies(__ct) (__force unsigned long)(__ct)
12#define cputime_add(__a, __b) ((__a) + (__b))
13#define cputime_sub(__a, __b) ((__a) - (__b))
14#define cputime_div(__a, __n) ((__a) / (__n))
15#define cputime_halve(__a) ((__a) >> 1)
16#define cputime_eq(__a, __b) ((__a) == (__b))
17#define cputime_gt(__a, __b) ((__a) > (__b))
18#define cputime_ge(__a, __b) ((__a) >= (__b))
19#define cputime_lt(__a, __b) ((__a) < (__b))
20#define cputime_le(__a, __b) ((__a) <= (__b))
21#define cputime_to_jiffies(__ct) (__ct)
22#define cputime_to_scaled(__ct) (__ct) 11#define cputime_to_scaled(__ct) (__ct)
23#define jiffies_to_cputime(__hz) (__hz) 12#define jiffies_to_cputime(__hz) (__force cputime_t)(__hz)
24 13
25typedef u64 cputime64_t; 14typedef u64 __nocast cputime64_t;
26 15
27#define cputime64_zero (0ULL) 16#define cputime64_to_jiffies64(__ct) (__force u64)(__ct)
28#define cputime64_add(__a, __b) ((__a) + (__b)) 17#define jiffies64_to_cputime64(__jif) (__force cputime64_t)(__jif)
29#define cputime64_sub(__a, __b) ((__a) - (__b))
30#define cputime64_to_jiffies64(__ct) (__ct)
31#define jiffies64_to_cputime64(__jif) (__jif)
32#define cputime_to_cputime64(__ct) ((u64) __ct)
33#define cputime64_gt(__a, __b) ((__a) > (__b))
34 18
35#define nsecs_to_cputime64(__ct) nsecs_to_jiffies64(__ct) 19#define nsecs_to_cputime64(__ct) \
20 jiffies64_to_cputime64(nsecs_to_jiffies64(__ct))
36 21
37 22
38/* 23/*
39 * Convert cputime to microseconds and back. 24 * Convert cputime to microseconds and back.
40 */ 25 */
41#define cputime_to_usecs(__ct) jiffies_to_usecs(__ct) 26#define cputime_to_usecs(__ct) \
42#define usecs_to_cputime(__msecs) usecs_to_jiffies(__msecs) 27 jiffies_to_usecs(cputime_to_jiffies(__ct))
28#define usecs_to_cputime(__usec) \
29 jiffies_to_cputime(usecs_to_jiffies(__usec))
30#define usecs_to_cputime64(__usec) \
31 jiffies64_to_cputime64(nsecs_to_jiffies64((__usec) * 1000))
43 32
44/* 33/*
45 * Convert cputime to seconds and back. 34 * Convert cputime to seconds and back.
46 */ 35 */
47#define cputime_to_secs(jif) ((jif) / HZ) 36#define cputime_to_secs(jif) (cputime_to_jiffies(jif) / HZ)
48#define secs_to_cputime(sec) ((sec) * HZ) 37#define secs_to_cputime(sec) jiffies_to_cputime((sec) * HZ)
49 38
50/* 39/*
51 * Convert cputime to timespec and back. 40 * Convert cputime to timespec and back.
52 */ 41 */
53#define timespec_to_cputime(__val) timespec_to_jiffies(__val) 42#define timespec_to_cputime(__val) \
54#define cputime_to_timespec(__ct,__val) jiffies_to_timespec(__ct,__val) 43 jiffies_to_cputime(timespec_to_jiffies(__val))
44#define cputime_to_timespec(__ct,__val) \
45 jiffies_to_timespec(cputime_to_jiffies(__ct),__val)
55 46
56/* 47/*
57 * Convert cputime to timeval and back. 48 * Convert cputime to timeval and back.
58 */ 49 */
59#define timeval_to_cputime(__val) timeval_to_jiffies(__val) 50#define timeval_to_cputime(__val) \
60#define cputime_to_timeval(__ct,__val) jiffies_to_timeval(__ct,__val) 51 jiffies_to_cputime(timeval_to_jiffies(__val))
52#define cputime_to_timeval(__ct,__val) \
53 jiffies_to_timeval(cputime_to_jiffies(__ct),__val)
61 54
62/* 55/*
63 * Convert cputime to clock and back. 56 * Convert cputime to clock and back.
64 */ 57 */
65#define cputime_to_clock_t(__ct) jiffies_to_clock_t(__ct) 58#define cputime_to_clock_t(__ct) \
66#define clock_t_to_cputime(__x) clock_t_to_jiffies(__x) 59 jiffies_to_clock_t(cputime_to_jiffies(__ct))
60#define clock_t_to_cputime(__x) \
61 jiffies_to_cputime(clock_t_to_jiffies(__x))
67 62
68/* 63/*
69 * Convert cputime64 to clock. 64 * Convert cputime64 to clock.
70 */ 65 */
71#define cputime64_to_clock_t(__ct) jiffies_64_to_clock_t(__ct) 66#define cputime64_to_clock_t(__ct) \
67 jiffies_64_to_clock_t(cputime64_to_jiffies64(__ct))
72 68
73#endif 69#endif
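The __nocast annotation makes sparse (make C=1) warn whenever cputime values mix with plain integers without going through these macros, each of which launders the conversion with __force; plain builds compile identically. A two-line illustration:

	cputime_t bad   = 5;				/* sparse: implicit cast to nocast type */
	cputime_t good  = jiffies_to_cputime(5);	/* fine: the macro applies __force */
	unsigned long j = cputime_to_jiffies(good);	/* and back to a plain ulong */
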
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index a3ef66a2a083..3c1063acb2ab 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -22,8 +22,14 @@ extern unsigned long __sw_hweight64(__u64 w);
22#include <asm/bitops.h> 22#include <asm/bitops.h>
23 23
24#define for_each_set_bit(bit, addr, size) \ 24#define for_each_set_bit(bit, addr, size) \
25 for ((bit) = find_first_bit((addr), (size)); \ 25 for ((bit) = find_first_bit((addr), (size)); \
26 (bit) < (size); \ 26 (bit) < (size); \
27 (bit) = find_next_bit((addr), (size), (bit) + 1))
28
29/* same as for_each_set_bit() but use bit as value to start with */
30#define for_each_set_bit_cont(bit, addr, size) \
31 for ((bit) = find_next_bit((addr), (size), (bit)); \
32 (bit) < (size); \
27 (bit) = find_next_bit((addr), (size), (bit) + 1)) 33 (bit) = find_next_bit((addr), (size), (bit) + 1))
28 34
29static __inline__ int get_bitmask_order(unsigned int count) 35static __inline__ int get_bitmask_order(unsigned int count)
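A usage sketch of the new iterator: unlike for_each_set_bit(), which always restarts at bit 0, the _cont variant resumes from the caller's current value of bit, so a walk can be restarted after, say, dropping a lock:

	static void scan_sketch(void)
	{
		unsigned long mask = 0xf1UL;	/* bits 0 and 4-7 set */
		int bit = 0;

		for_each_set_bit_cont(bit, &mask, BITS_PER_LONG) {
			/* visits 0, 4, 5, 6, 7; had bit started at 4,
			 * the walk would resume there instead of at 0 */
		}
	}
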
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index ab344a521105..66d3e954eb6c 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -44,7 +44,7 @@ extern unsigned long init_bootmem_node(pg_data_t *pgdat,
44 unsigned long endpfn); 44 unsigned long endpfn);
45extern unsigned long init_bootmem(unsigned long addr, unsigned long memend); 45extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);
46 46
47unsigned long free_all_memory_core_early(int nodeid); 47extern unsigned long free_low_memory_core_early(int nodeid);
48extern unsigned long free_all_bootmem_node(pg_data_t *pgdat); 48extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
49extern unsigned long free_all_bootmem(void); 49extern unsigned long free_all_bootmem(void);
50 50
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 6cb60fd2ea84..305c263021e7 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -27,6 +27,7 @@ struct cpu {
27 27
28extern int register_cpu(struct cpu *cpu, int num); 28extern int register_cpu(struct cpu *cpu, int num);
29extern struct sys_device *get_cpu_sysdev(unsigned cpu); 29extern struct sys_device *get_cpu_sysdev(unsigned cpu);
30extern bool cpu_is_hotpluggable(unsigned cpu);
30 31
31extern int cpu_add_sysdev_attr(struct sysdev_attribute *attr); 32extern int cpu_add_sysdev_attr(struct sysdev_attribute *attr);
32extern void cpu_remove_sysdev_attr(struct sysdev_attribute *attr); 33extern void cpu_remove_sysdev_attr(struct sysdev_attribute *attr);
diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h
index 65970b811e22..0e5f5785d9f2 100644
--- a/include/linux/debugobjects.h
+++ b/include/linux/debugobjects.h
@@ -46,6 +46,8 @@ struct debug_obj {
46 * fails 46 * fails
47 * @fixup_free: fixup function, which is called when the free check 47 * @fixup_free: fixup function, which is called when the free check
48 * fails 48 * fails
49 * @fixup_assert_init: fixup function, which is called when the assert_init
50 * check fails
49 */ 51 */
50struct debug_obj_descr { 52struct debug_obj_descr {
51 const char *name; 53 const char *name;
@@ -54,6 +56,7 @@ struct debug_obj_descr {
54 int (*fixup_activate) (void *addr, enum debug_obj_state state); 56 int (*fixup_activate) (void *addr, enum debug_obj_state state);
55 int (*fixup_destroy) (void *addr, enum debug_obj_state state); 57 int (*fixup_destroy) (void *addr, enum debug_obj_state state);
56 int (*fixup_free) (void *addr, enum debug_obj_state state); 58 int (*fixup_free) (void *addr, enum debug_obj_state state);
59 int (*fixup_assert_init)(void *addr, enum debug_obj_state state);
57}; 60};
58 61
59#ifdef CONFIG_DEBUG_OBJECTS 62#ifdef CONFIG_DEBUG_OBJECTS
@@ -64,6 +67,7 @@ extern void debug_object_activate (void *addr, struct debug_obj_descr *descr);
64extern void debug_object_deactivate(void *addr, struct debug_obj_descr *descr); 67extern void debug_object_deactivate(void *addr, struct debug_obj_descr *descr);
65extern void debug_object_destroy (void *addr, struct debug_obj_descr *descr); 68extern void debug_object_destroy (void *addr, struct debug_obj_descr *descr);
66extern void debug_object_free (void *addr, struct debug_obj_descr *descr); 69extern void debug_object_free (void *addr, struct debug_obj_descr *descr);
70extern void debug_object_assert_init(void *addr, struct debug_obj_descr *descr);
67 71
68/* 72/*
69 * Active state: 73 * Active state:
@@ -89,6 +93,8 @@ static inline void
89debug_object_destroy (void *addr, struct debug_obj_descr *descr) { } 93debug_object_destroy (void *addr, struct debug_obj_descr *descr) { }
90static inline void 94static inline void
91debug_object_free (void *addr, struct debug_obj_descr *descr) { } 95debug_object_free (void *addr, struct debug_obj_descr *descr) { }
96static inline void
97debug_object_assert_init(void *addr, struct debug_obj_descr *descr) { }
92 98
93static inline void debug_objects_early_init(void) { } 99static inline void debug_objects_early_init(void) { }
94static inline void debug_objects_mem_init(void) { } 100static inline void debug_objects_mem_init(void) { }
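debug_object_assert_init() lets callers assert that an object was initialized before use (think del_timer() on a timer that may never have been set up); fixup_assert_init fires when that check fails, typically to register a statically initialized object on the fly. A sketch of a descriptor wiring the new hook:

	static struct debug_obj_descr demo_descr;

	static int demo_fixup_assert_init(void *addr, enum debug_obj_state state)
	{
		if (state == ODEBUG_STATE_NOTAVAILABLE) {
			debug_object_init(addr, &demo_descr);	/* adopt static object */
			return 1;	/* fixed up */
		}
		return 0;
	}

	static struct debug_obj_descr demo_descr = {
		.name			= "demo",
		.fixup_assert_init	= demo_fixup_assert_init,
	};
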
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index f743883f769e..bb7f30971858 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -139,20 +139,7 @@ static inline void account_system_vtime(struct task_struct *tsk)
139extern void account_system_vtime(struct task_struct *tsk); 139extern void account_system_vtime(struct task_struct *tsk);
140#endif 140#endif
141 141
142#if defined(CONFIG_NO_HZ)
143#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) 142#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
144extern void rcu_enter_nohz(void);
145extern void rcu_exit_nohz(void);
146
147static inline void rcu_irq_enter(void)
148{
149 rcu_exit_nohz();
150}
151
152static inline void rcu_irq_exit(void)
153{
154 rcu_enter_nohz();
155}
156 143
157static inline void rcu_nmi_enter(void) 144static inline void rcu_nmi_enter(void)
158{ 145{
@@ -163,17 +150,9 @@ static inline void rcu_nmi_exit(void)
163} 150}
164 151
165#else 152#else
166extern void rcu_irq_enter(void);
167extern void rcu_irq_exit(void);
168extern void rcu_nmi_enter(void); 153extern void rcu_nmi_enter(void);
169extern void rcu_nmi_exit(void); 154extern void rcu_nmi_exit(void);
170#endif 155#endif
171#else
172# define rcu_irq_enter() do { } while (0)
173# define rcu_irq_exit() do { } while (0)
174# define rcu_nmi_enter() do { } while (0)
175# define rcu_nmi_exit() do { } while (0)
176#endif /* #if defined(CONFIG_NO_HZ) */
177 156
178/* 157/*
179 * It is safe to do non-atomic ops on ->hardirq_context, 158 * It is safe to do non-atomic ops on ->hardirq_context,
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 388b0d425b50..5ce8b140428f 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -3,6 +3,7 @@
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <linux/compiler.h> 5#include <linux/compiler.h>
6#include <linux/workqueue.h>
6 7
7#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) 8#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL)
8 9
@@ -14,6 +15,12 @@ struct jump_label_key {
14#endif 15#endif
15}; 16};
16 17
18struct jump_label_key_deferred {
19 struct jump_label_key key;
20 unsigned long timeout;
21 struct delayed_work work;
22};
23
17# include <asm/jump_label.h> 24# include <asm/jump_label.h>
18# define HAVE_JUMP_LABEL 25# define HAVE_JUMP_LABEL
19#endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */ 26#endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */
@@ -51,8 +58,11 @@ extern void arch_jump_label_transform_static(struct jump_entry *entry,
51extern int jump_label_text_reserved(void *start, void *end); 58extern int jump_label_text_reserved(void *start, void *end);
52extern void jump_label_inc(struct jump_label_key *key); 59extern void jump_label_inc(struct jump_label_key *key);
53extern void jump_label_dec(struct jump_label_key *key); 60extern void jump_label_dec(struct jump_label_key *key);
61extern void jump_label_dec_deferred(struct jump_label_key_deferred *key);
54extern bool jump_label_enabled(struct jump_label_key *key); 62extern bool jump_label_enabled(struct jump_label_key *key);
55extern void jump_label_apply_nops(struct module *mod); 63extern void jump_label_apply_nops(struct module *mod);
64extern void jump_label_rate_limit(struct jump_label_key_deferred *key,
65 unsigned long rl);
56 66
57#else /* !HAVE_JUMP_LABEL */ 67#else /* !HAVE_JUMP_LABEL */
58 68
@@ -68,6 +78,10 @@ static __always_inline void jump_label_init(void)
68{ 78{
69} 79}
70 80
81struct jump_label_key_deferred {
82 struct jump_label_key key;
83};
84
71static __always_inline bool static_branch(struct jump_label_key *key) 85static __always_inline bool static_branch(struct jump_label_key *key)
72{ 86{
73 if (unlikely(atomic_read(&key->enabled))) 87 if (unlikely(atomic_read(&key->enabled)))
@@ -85,6 +99,11 @@ static inline void jump_label_dec(struct jump_label_key *key)
85 atomic_dec(&key->enabled); 99 atomic_dec(&key->enabled);
86} 100}
87 101
102static inline void jump_label_dec_deferred(struct jump_label_key_deferred *key)
103{
104 jump_label_dec(&key->key);
105}
106
88static inline int jump_label_text_reserved(void *start, void *end) 107static inline int jump_label_text_reserved(void *start, void *end)
89{ 108{
90 return 0; 109 return 0;
@@ -102,6 +121,14 @@ static inline int jump_label_apply_nops(struct module *mod)
102{ 121{
103 return 0; 122 return 0;
104} 123}
124
125static inline void jump_label_rate_limit(struct jump_label_key_deferred *key,
126 unsigned long rl)
127{
128}
105#endif /* HAVE_JUMP_LABEL */ 129#endif /* HAVE_JUMP_LABEL */
106 130
131#define jump_label_key_enabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(1), })
132#define jump_label_key_disabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(0), })
133
107#endif /* _LINUX_JUMP_LABEL_H */ 134#endif /* _LINUX_JUMP_LABEL_H */
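The deferred variant exists so that hot inc/dec churn on a key (perf events are the motivating user) does not patch code on every transition: jump_label_rate_limit() sets a minimum interval in jiffies, and jump_label_dec_deferred() pushes the disabling patch out through delayed work. A usage sketch:

	static struct jump_label_key_deferred sketch_key;

	static void sketch_setup(void)
	{
		/* batch disables: wait at least HZ jiffies before patching */
		jump_label_rate_limit(&sketch_key, HZ);
	}

	static void sketch_put(void)
	{
		jump_label_dec_deferred(&sketch_key);	/* may defer the real dec */
	}
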
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 0cce2db580c3..2fbd9053c2df 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -6,6 +6,7 @@
6#include <linux/percpu.h> 6#include <linux/percpu.h>
7#include <linux/cpumask.h> 7#include <linux/cpumask.h>
8#include <linux/interrupt.h> 8#include <linux/interrupt.h>
9#include <linux/sched.h>
9#include <asm/irq.h> 10#include <asm/irq.h>
10#include <asm/cputime.h> 11#include <asm/cputime.h>
11 12
@@ -15,21 +16,25 @@
15 * used by rstatd/perfmeter 16 * used by rstatd/perfmeter
16 */ 17 */
17 18
18struct cpu_usage_stat { 19enum cpu_usage_stat {
19 cputime64_t user; 20 CPUTIME_USER,
20 cputime64_t nice; 21 CPUTIME_NICE,
21 cputime64_t system; 22 CPUTIME_SYSTEM,
22 cputime64_t softirq; 23 CPUTIME_SOFTIRQ,
23 cputime64_t irq; 24 CPUTIME_IRQ,
24 cputime64_t idle; 25 CPUTIME_IDLE,
25 cputime64_t iowait; 26 CPUTIME_IOWAIT,
26 cputime64_t steal; 27 CPUTIME_STEAL,
27 cputime64_t guest; 28 CPUTIME_GUEST,
28 cputime64_t guest_nice; 29 CPUTIME_GUEST_NICE,
30 NR_STATS,
31};
32
33struct kernel_cpustat {
34 u64 cpustat[NR_STATS];
29}; 35};
30 36
31struct kernel_stat { 37struct kernel_stat {
32 struct cpu_usage_stat cpustat;
33#ifndef CONFIG_GENERIC_HARDIRQS 38#ifndef CONFIG_GENERIC_HARDIRQS
34 unsigned int irqs[NR_IRQS]; 39 unsigned int irqs[NR_IRQS];
35#endif 40#endif
@@ -38,10 +43,13 @@ struct kernel_stat {
38}; 43};
39 44
40DECLARE_PER_CPU(struct kernel_stat, kstat); 45DECLARE_PER_CPU(struct kernel_stat, kstat);
46DECLARE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
41 47
42#define kstat_cpu(cpu) per_cpu(kstat, cpu)
43/* Must have preemption disabled for this to be meaningful. */ 48/* Must have preemption disabled for this to be meaningful. */
44#define kstat_this_cpu __get_cpu_var(kstat) 49#define kstat_this_cpu (&__get_cpu_var(kstat))
50#define kcpustat_this_cpu (&__get_cpu_var(kernel_cpustat))
51#define kstat_cpu(cpu) per_cpu(kstat, cpu)
52#define kcpustat_cpu(cpu) per_cpu(kernel_cpustat, cpu)
45 53
46extern unsigned long long nr_context_switches(void); 54extern unsigned long long nr_context_switches(void);
47 55
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index c3892fc1d538..68e67e50d028 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -557,6 +557,7 @@ struct kvm_ppc_pvinfo {
557#define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ 557#define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */
558#define KVM_CAP_PPC_PAPR 68 558#define KVM_CAP_PPC_PAPR 68
559#define KVM_CAP_S390_GMAP 71 559#define KVM_CAP_S390_GMAP 71
560#define KVM_CAP_TSC_DEADLINE_TIMER 72
560 561
561#ifdef KVM_CAP_IRQ_ROUTING 562#ifdef KVM_CAP_IRQ_ROUTING
562 563
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
index b0e99898527c..e23121f9d82a 100644
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -10,6 +10,8 @@
10#define _INCLUDE_GUARD_LATENCYTOP_H_ 10#define _INCLUDE_GUARD_LATENCYTOP_H_
11 11
12#include <linux/compiler.h> 12#include <linux/compiler.h>
13struct task_struct;
14
13#ifdef CONFIG_LATENCYTOP 15#ifdef CONFIG_LATENCYTOP
14 16
15#define LT_SAVECOUNT 32 17#define LT_SAVECOUNT 32
@@ -23,7 +25,6 @@ struct latency_record {
23}; 25};
24 26
25 27
26struct task_struct;
27 28
28extern int latencytop_enabled; 29extern int latencytop_enabled;
29void __account_scheduler_latency(struct task_struct *task, int usecs, int inter); 30void __account_scheduler_latency(struct task_struct *task, int usecs, int inter);
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index b6a56e37284c..d36619ead3ba 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -343,6 +343,8 @@ extern void lockdep_trace_alloc(gfp_t mask);
343 343
344#define lockdep_assert_held(l) WARN_ON(debug_locks && !lockdep_is_held(l)) 344#define lockdep_assert_held(l) WARN_ON(debug_locks && !lockdep_is_held(l))
345 345
346#define lockdep_recursing(tsk) ((tsk)->lockdep_recursion)
347
346#else /* !LOCKDEP */ 348#else /* !LOCKDEP */
347 349
348static inline void lockdep_off(void) 350static inline void lockdep_off(void)
@@ -392,6 +394,8 @@ struct lock_class_key { };
392 394
393#define lockdep_assert_held(l) do { } while (0) 395#define lockdep_assert_held(l) do { } while (0)
394 396
397#define lockdep_recursing(tsk) (0)
398
395#endif /* !LOCKDEP */ 399#endif /* !LOCKDEP */
396 400
397#ifdef CONFIG_LOCK_STAT 401#ifdef CONFIG_LOCK_STAT
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index e6b843e16e81..a6bb10235148 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -2,8 +2,6 @@
2#define _LINUX_MEMBLOCK_H 2#define _LINUX_MEMBLOCK_H
3#ifdef __KERNEL__ 3#ifdef __KERNEL__
4 4
5#define MEMBLOCK_ERROR 0
6
7#ifdef CONFIG_HAVE_MEMBLOCK 5#ifdef CONFIG_HAVE_MEMBLOCK
8/* 6/*
9 * Logical memory blocks. 7 * Logical memory blocks.
@@ -19,81 +17,161 @@
19#include <linux/init.h> 17#include <linux/init.h>
20#include <linux/mm.h> 18#include <linux/mm.h>
21 19
22#include <asm/memblock.h>
23
24#define INIT_MEMBLOCK_REGIONS 128 20#define INIT_MEMBLOCK_REGIONS 128
25 21
26struct memblock_region { 22struct memblock_region {
27 phys_addr_t base; 23 phys_addr_t base;
28 phys_addr_t size; 24 phys_addr_t size;
25#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
26 int nid;
27#endif
29}; 28};
30 29
31struct memblock_type { 30struct memblock_type {
32 unsigned long cnt; /* number of regions */ 31 unsigned long cnt; /* number of regions */
33 unsigned long max; /* size of the allocated array */ 32 unsigned long max; /* size of the allocated array */
33 phys_addr_t total_size; /* size of all regions */
34 struct memblock_region *regions; 34 struct memblock_region *regions;
35}; 35};
36 36
37struct memblock { 37struct memblock {
38 phys_addr_t current_limit; 38 phys_addr_t current_limit;
39 phys_addr_t memory_size; /* Updated by memblock_analyze() */
40 struct memblock_type memory; 39 struct memblock_type memory;
41 struct memblock_type reserved; 40 struct memblock_type reserved;
42}; 41};
43 42
44extern struct memblock memblock; 43extern struct memblock memblock;
45extern int memblock_debug; 44extern int memblock_debug;
46extern int memblock_can_resize;
47 45
48#define memblock_dbg(fmt, ...) \ 46#define memblock_dbg(fmt, ...) \
49 if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) 47 if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
50 48
51u64 memblock_find_in_range(u64 start, u64 end, u64 size, u64 align); 49phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end,
50 phys_addr_t size, phys_addr_t align, int nid);
51phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
52 phys_addr_t size, phys_addr_t align);
52int memblock_free_reserved_regions(void); 53int memblock_free_reserved_regions(void);
53int memblock_reserve_reserved_regions(void); 54int memblock_reserve_reserved_regions(void);
54 55
55extern void memblock_init(void); 56void memblock_allow_resize(void);
56extern void memblock_analyze(void); 57int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
57extern long memblock_add(phys_addr_t base, phys_addr_t size); 58int memblock_add(phys_addr_t base, phys_addr_t size);
58extern long memblock_remove(phys_addr_t base, phys_addr_t size); 59int memblock_remove(phys_addr_t base, phys_addr_t size);
59extern long memblock_free(phys_addr_t base, phys_addr_t size); 60int memblock_free(phys_addr_t base, phys_addr_t size);
60extern long memblock_reserve(phys_addr_t base, phys_addr_t size); 61int memblock_reserve(phys_addr_t base, phys_addr_t size);
62
63#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
64void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
65 unsigned long *out_end_pfn, int *out_nid);
66
67/**
68 * for_each_mem_pfn_range - early memory pfn range iterator
69 * @i: an integer used as loop variable
70 * @nid: node selector, %MAX_NUMNODES for all nodes
71 * @p_start: ptr to ulong for start pfn of the range, can be %NULL
72 * @p_end: ptr to ulong for end pfn of the range, can be %NULL
73 * @p_nid: ptr to int for nid of the range, can be %NULL
74 *
75 * Walks over configured memory ranges. Available after early_node_map is
76 * populated.
77 */
78#define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \
79 for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
80 i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
81#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
82
83void __next_free_mem_range(u64 *idx, int nid, phys_addr_t *out_start,
84 phys_addr_t *out_end, int *out_nid);
85
86/**
87 * for_each_free_mem_range - iterate through free memblock areas
88 * @i: u64 used as loop variable
89 * @nid: node selector, %MAX_NUMNODES for all nodes
90 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
91 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
92 * @p_nid: ptr to int for nid of the range, can be %NULL
93 *
94 * Walks over free (memory && !reserved) areas of memblock. Available as
95 * soon as memblock is initialized.
96 */
97#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid) \
98 for (i = 0, \
99 __next_free_mem_range(&i, nid, p_start, p_end, p_nid); \
100 i != (u64)ULLONG_MAX; \
101 __next_free_mem_range(&i, nid, p_start, p_end, p_nid))
102
103void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start,
104 phys_addr_t *out_end, int *out_nid);
61 105
62/* The numa aware allocator is only available if 106/**
63 * CONFIG_ARCH_POPULATES_NODE_MAP is set 107 * for_each_free_mem_range_reverse - rev-iterate through free memblock areas
108 * @i: u64 used as loop variable
109 * @nid: node selector, %MAX_NUMNODES for all nodes
110 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
111 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
112 * @p_nid: ptr to int for nid of the range, can be %NULL
113 *
114 * Walks over free (memory && !reserved) areas of memblock in reverse
115 * order. Available as soon as memblock is initialized.
64 */ 116 */
65extern phys_addr_t memblock_alloc_nid(phys_addr_t size, phys_addr_t align, 117#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \
66 int nid); 118 for (i = (u64)ULLONG_MAX, \
67extern phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, 119 __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid); \
68 int nid); 120 i != (u64)ULLONG_MAX; \
121 __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid))
69 122
70extern phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align); 123#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
124int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid);
125
126static inline void memblock_set_region_node(struct memblock_region *r, int nid)
127{
128 r->nid = nid;
129}
130
131static inline int memblock_get_region_node(const struct memblock_region *r)
132{
133 return r->nid;
134}
135#else
136static inline void memblock_set_region_node(struct memblock_region *r, int nid)
137{
138}
139
140static inline int memblock_get_region_node(const struct memblock_region *r)
141{
142 return 0;
143}
144#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
145
146phys_addr_t memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid);
147phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);
148
149phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align);
71 150
72/* Flags for memblock_alloc_base() and __memblock_alloc_base() */ 151/* Flags for memblock_alloc_base() and __memblock_alloc_base() */
73#define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) 152#define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0)
74#define MEMBLOCK_ALLOC_ACCESSIBLE 0 153#define MEMBLOCK_ALLOC_ACCESSIBLE 0
75 154
76extern phys_addr_t memblock_alloc_base(phys_addr_t size, 155phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
77 phys_addr_t align, 156 phys_addr_t max_addr);
78 phys_addr_t max_addr); 157phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
79extern phys_addr_t __memblock_alloc_base(phys_addr_t size, 158 phys_addr_t max_addr);
80 phys_addr_t align, 159phys_addr_t memblock_phys_mem_size(void);
81 phys_addr_t max_addr); 160phys_addr_t memblock_start_of_DRAM(void);
82extern phys_addr_t memblock_phys_mem_size(void); 161phys_addr_t memblock_end_of_DRAM(void);
83extern phys_addr_t memblock_start_of_DRAM(void); 162void memblock_enforce_memory_limit(phys_addr_t memory_limit);
84extern phys_addr_t memblock_end_of_DRAM(void); 163int memblock_is_memory(phys_addr_t addr);
85extern void memblock_enforce_memory_limit(phys_addr_t memory_limit); 164int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
86extern int memblock_is_memory(phys_addr_t addr); 165int memblock_is_reserved(phys_addr_t addr);
87extern int memblock_is_region_memory(phys_addr_t base, phys_addr_t size); 166int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
88extern int memblock_is_reserved(phys_addr_t addr); 167
89extern int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size); 168extern void __memblock_dump_all(void);
90 169
91extern void memblock_dump_all(void); 170static inline void memblock_dump_all(void)
92 171{
93/* Provided by the architecture */ 172 if (memblock_debug)
94extern phys_addr_t memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid); 173 __memblock_dump_all();
95extern int memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, 174}
96 phys_addr_t addr2, phys_addr_t size2);
97 175
98/** 176/**
99 * memblock_set_current_limit - Set the current allocation limit to allow 177 * memblock_set_current_limit - Set the current allocation limit to allow
@@ -101,7 +179,7 @@ extern int memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
101 * accessible during boot 179 * accessible during boot
102 * @limit: New limit value (physical address) 180 * @limit: New limit value (physical address)
103 */ 181 */
104extern void memblock_set_current_limit(phys_addr_t limit); 182void memblock_set_current_limit(phys_addr_t limit);
105 183
106 184
107/* 185/*
@@ -154,9 +232,9 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
154 region++) 232 region++)
155 233
156 234
157#ifdef ARCH_DISCARD_MEMBLOCK 235#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
158#define __init_memblock __init 236#define __init_memblock __meminit
159#define __initdata_memblock __initdata 237#define __initdata_memblock __meminitdata
160#else 238#else
161#define __init_memblock 239#define __init_memblock
162#define __initdata_memblock 240#define __initdata_memblock
@@ -165,7 +243,7 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
165#else 243#else
166static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align) 244static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
167{ 245{
168 return MEMBLOCK_ERROR; 246 return 0;
169} 247}
170 248
171#endif /* CONFIG_HAVE_MEMBLOCK */ 249#endif /* CONFIG_HAVE_MEMBLOCK */
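
Taken together, the de-externed prototypes above form the boot-time allocator interface. Below is a minimal sketch of how an architecture might call it; the setup helper and the sizes are purely illustrative, and memblock_dump_all() is now a no-op unless memblock debugging was enabled (e.g. via the memblock=debug boot parameter):

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/memblock.h>

static void __init example_arch_setup(void)	/* hypothetical */
{
	/* Ask for 64 KiB, 4 KiB aligned, preferring node 0 but
	 * falling back to any node if node 0 is exhausted. */
	phys_addr_t pa = memblock_alloc_try_nid(64 * 1024, 4096, 0);

	if (!pa)
		panic("cannot allocate boot memory");

	/* Prints the memory/reserved region lists only when
	 * memblock_debug is set; otherwise does nothing. */
	memblock_dump_all();
}
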
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4baadd18f4ad..5d9b4c9813bd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1253,41 +1253,34 @@ static inline void pgtable_page_dtor(struct page *page)
1253extern void free_area_init(unsigned long * zones_size); 1253extern void free_area_init(unsigned long * zones_size);
1254extern void free_area_init_node(int nid, unsigned long * zones_size, 1254extern void free_area_init_node(int nid, unsigned long * zones_size,
1255 unsigned long zone_start_pfn, unsigned long *zholes_size); 1255 unsigned long zone_start_pfn, unsigned long *zholes_size);
1256#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 1256#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
1257/* 1257/*
1258 * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its 1258 * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its
1259 * zones, allocate the backing mem_map and account for memory holes in a more 1259 * zones, allocate the backing mem_map and account for memory holes in a more
1260 * architecture independent manner. This is a substitute for creating the 1260 * architecture independent manner. This is a substitute for creating the
1261 * zone_sizes[] and zholes_size[] arrays and passing them to 1261 * zone_sizes[] and zholes_size[] arrays and passing them to
1262 * free_area_init_node() 1262 * free_area_init_node()
1263 * 1263 *
1264 * An architecture is expected to register ranges of page frames backed by 1264 * An architecture is expected to register ranges of page frames backed by
1265 * physical memory with add_active_range() before calling 1265 * physical memory with memblock_add[_node]() before calling
1266 * free_area_init_nodes() passing in the PFN each zone ends at. At a basic 1266 * free_area_init_nodes() passing in the PFN each zone ends at. At a basic
1267 * usage, an architecture is expected to do something like 1267 * usage, an architecture is expected to do something like
1268 * 1268 *
1269 * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, 1269 * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
1270 * max_highmem_pfn}; 1270 * max_highmem_pfn};
1271 * for_each_valid_physical_page_range() 1271 * for_each_valid_physical_page_range()
1272 * add_active_range(node_id, start_pfn, end_pfn) 1272 * memblock_add_node(base, size, nid)
1273 * free_area_init_nodes(max_zone_pfns); 1273 * free_area_init_nodes(max_zone_pfns);
1274 * 1274 *
1275 * If the architecture guarantees that there are no holes in the ranges 1275 * free_bootmem_with_active_regions() calls free_bootmem_node() for each
1276 * registered with add_active_range(), free_bootmem_active_regions() 1276 * registered physical page range. Similarly
1277 * will call free_bootmem_node() for each registered physical page range. 1277 * sparse_memory_present_with_active_regions() calls memory_present() for
1278 * Similarly sparse_memory_present_with_active_regions() calls 1278 * each range when SPARSEMEM is enabled.
1279 * memory_present() for each range when SPARSEMEM is enabled.
1280 * 1279 *
1281 * See mm/page_alloc.c for more information on each function exposed by 1280 * See mm/page_alloc.c for more information on each function exposed by
1282 * CONFIG_ARCH_POPULATES_NODE_MAP 1281 * CONFIG_HAVE_MEMBLOCK_NODE_MAP.
1283 */ 1282 */
1284extern void free_area_init_nodes(unsigned long *max_zone_pfn); 1283extern void free_area_init_nodes(unsigned long *max_zone_pfn);
1285extern void add_active_range(unsigned int nid, unsigned long start_pfn,
1286 unsigned long end_pfn);
1287extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
1288 unsigned long end_pfn);
1289extern void remove_all_active_ranges(void);
1290void sort_node_map(void);
1291unsigned long node_map_pfn_alignment(void); 1284unsigned long node_map_pfn_alignment(void);
1292unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, 1285unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
1293 unsigned long end_pfn); 1286 unsigned long end_pfn);
@@ -1300,14 +1293,11 @@ extern void free_bootmem_with_active_regions(int nid,
1300 unsigned long max_low_pfn); 1293 unsigned long max_low_pfn);
1301int add_from_early_node_map(struct range *range, int az, 1294int add_from_early_node_map(struct range *range, int az,
1302 int nr_range, int nid); 1295 int nr_range, int nid);
1303u64 __init find_memory_core_early(int nid, u64 size, u64 align,
1304 u64 goal, u64 limit);
1305typedef int (*work_fn_t)(unsigned long, unsigned long, void *);
1306extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
1307extern void sparse_memory_present_with_active_regions(int nid); 1296extern void sparse_memory_present_with_active_regions(int nid);
1308#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
1309 1297
1310#if !defined(CONFIG_ARCH_POPULATES_NODE_MAP) && \ 1298#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
1299
1300#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
1311 !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) 1301 !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID)
1312static inline int __early_pfn_to_nid(unsigned long pfn) 1302static inline int __early_pfn_to_nid(unsigned long pfn)
1313{ 1303{
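
The rewritten comment block above describes the expected initialisation sequence. Spelled out as code, assuming a single node and purely illustrative base, size, and PFN values, the flow looks roughly like this:

#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/pfn.h>

static void __init example_zone_setup(void)	/* hypothetical */
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

	/* Register each firmware-reported range with its node id;
	 * real code would loop over the firmware memory map. */
	memblock_add_node(0x80000000, 256 * 1024 * 1024, 0);

	/* Then let the core size the zones and account for holes. */
	max_zone_pfns[ZONE_NORMAL] = PFN_DOWN(memblock_end_of_DRAM());
	free_area_init_nodes(max_zone_pfns);
}
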
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 188cb2ffe8db..3ac040f19369 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -598,13 +598,13 @@ struct zonelist {
598#endif 598#endif
599}; 599};
600 600
601#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 601#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
602struct node_active_region { 602struct node_active_region {
603 unsigned long start_pfn; 603 unsigned long start_pfn;
604 unsigned long end_pfn; 604 unsigned long end_pfn;
605 int nid; 605 int nid;
606}; 606};
607#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 607#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
608 608
609#ifndef CONFIG_DISCONTIGMEM 609#ifndef CONFIG_DISCONTIGMEM
610/* The array of struct pages - for discontigmem use pgdat->lmem_map */ 610/* The array of struct pages - for discontigmem use pgdat->lmem_map */
@@ -720,7 +720,7 @@ extern int movable_zone;
720 720
721static inline int zone_movable_is_highmem(void) 721static inline int zone_movable_is_highmem(void)
722{ 722{
723#if defined(CONFIG_HIGHMEM) && defined(CONFIG_ARCH_POPULATES_NODE_MAP) 723#if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
724 return movable_zone == ZONE_HIGHMEM; 724 return movable_zone == ZONE_HIGHMEM;
725#else 725#else
726 return 0; 726 return 0;
@@ -938,7 +938,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
938#endif 938#endif
939 939
940#if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \ 940#if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \
941 !defined(CONFIG_ARCH_POPULATES_NODE_MAP) 941 !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
942static inline unsigned long early_pfn_to_nid(unsigned long pfn) 942static inline unsigned long early_pfn_to_nid(unsigned long pfn)
943{ 943{
944 return 0; 944 return 0;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b1f89122bf6a..08855613ceb3 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -54,6 +54,7 @@ enum perf_hw_id {
54 PERF_COUNT_HW_BUS_CYCLES = 6, 54 PERF_COUNT_HW_BUS_CYCLES = 6,
55 PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7, 55 PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7,
56 PERF_COUNT_HW_STALLED_CYCLES_BACKEND = 8, 56 PERF_COUNT_HW_STALLED_CYCLES_BACKEND = 8,
57 PERF_COUNT_HW_REF_CPU_CYCLES = 9,
57 58
58 PERF_COUNT_HW_MAX, /* non-ABI */ 59 PERF_COUNT_HW_MAX, /* non-ABI */
59}; 60};
@@ -890,6 +891,7 @@ struct perf_event_context {
890 int nr_active; 891 int nr_active;
891 int is_active; 892 int is_active;
892 int nr_stat; 893 int nr_stat;
894 int nr_freq;
893 int rotate_disable; 895 int rotate_disable;
894 atomic_t refcount; 896 atomic_t refcount;
895 struct task_struct *task; 897 struct task_struct *task;
@@ -1063,12 +1065,12 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
1063 } 1065 }
1064} 1066}
1065 1067
1066extern struct jump_label_key perf_sched_events; 1068extern struct jump_label_key_deferred perf_sched_events;
1067 1069
1068static inline void perf_event_task_sched_in(struct task_struct *prev, 1070static inline void perf_event_task_sched_in(struct task_struct *prev,
1069 struct task_struct *task) 1071 struct task_struct *task)
1070{ 1072{
1071 if (static_branch(&perf_sched_events)) 1073 if (static_branch(&perf_sched_events.key))
1072 __perf_event_task_sched_in(prev, task); 1074 __perf_event_task_sched_in(prev, task);
1073} 1075}
1074 1076
@@ -1077,7 +1079,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
1077{ 1079{
1078 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); 1080 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0);
1079 1081
1080 if (static_branch(&perf_sched_events)) 1082 if (static_branch(&perf_sched_events.key))
1081 __perf_event_task_sched_out(prev, next); 1083 __perf_event_task_sched_out(prev, next);
1082} 1084}
1083 1085
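
The hunk above swaps the plain jump-label key for a deferred one, so call sites must now reference the embedded .key member. A minimal sketch of the pattern, with a hypothetical key:

#include <linux/jump_label.h>
#include <linux/kernel.h>

static struct jump_label_key_deferred example_key;	/* hypothetical */

static void example_hot_path(void)
{
	/* Compiles to a nop until someone increments the key; note
	 * the explicit .key now that a wrapper struct is involved. */
	if (static_branch(&example_key.key))
		pr_info("instrumentation enabled\n");
}
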
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 79159de0e341..2110a81c5e2a 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -40,12 +40,6 @@
40#define RED_INACTIVE 0x09F911029D74E35BULL /* when obj is inactive */ 40#define RED_INACTIVE 0x09F911029D74E35BULL /* when obj is inactive */
41#define RED_ACTIVE 0xD84156C5635688C0ULL /* when obj is active */ 41#define RED_ACTIVE 0xD84156C5635688C0ULL /* when obj is active */
42 42
43#ifdef CONFIG_PHYS_ADDR_T_64BIT
44#define MEMBLOCK_INACTIVE 0x3a84fb0144c9e71bULL
45#else
46#define MEMBLOCK_INACTIVE 0x44c9e71bUL
47#endif
48
49#define SLUB_RED_INACTIVE 0xbb 43#define SLUB_RED_INACTIVE 0xbb
50#define SLUB_RED_ACTIVE 0xcc 44#define SLUB_RED_ACTIVE 0xcc
51 45
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 2cf4226ade7e..81c04f4348ec 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -51,6 +51,8 @@ extern int rcutorture_runnable; /* for sysctl */
51#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) 51#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
52extern void rcutorture_record_test_transition(void); 52extern void rcutorture_record_test_transition(void);
53extern void rcutorture_record_progress(unsigned long vernum); 53extern void rcutorture_record_progress(unsigned long vernum);
54extern void do_trace_rcu_torture_read(char *rcutorturename,
55 struct rcu_head *rhp);
54#else 56#else
55static inline void rcutorture_record_test_transition(void) 57static inline void rcutorture_record_test_transition(void)
56{ 58{
@@ -58,6 +60,12 @@ static inline void rcutorture_record_test_transition(void)
58static inline void rcutorture_record_progress(unsigned long vernum) 60static inline void rcutorture_record_progress(unsigned long vernum)
59{ 61{
60} 62}
63#ifdef CONFIG_RCU_TRACE
64extern void do_trace_rcu_torture_read(char *rcutorturename,
65 struct rcu_head *rhp);
66#else
67#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
68#endif
61#endif 69#endif
62 70
63#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b)) 71#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
@@ -177,23 +185,10 @@ extern void rcu_sched_qs(int cpu);
177extern void rcu_bh_qs(int cpu); 185extern void rcu_bh_qs(int cpu);
178extern void rcu_check_callbacks(int cpu, int user); 186extern void rcu_check_callbacks(int cpu, int user);
179struct notifier_block; 187struct notifier_block;
180 188extern void rcu_idle_enter(void);
181#ifdef CONFIG_NO_HZ 189extern void rcu_idle_exit(void);
182 190extern void rcu_irq_enter(void);
183extern void rcu_enter_nohz(void); 191extern void rcu_irq_exit(void);
184extern void rcu_exit_nohz(void);
185
186#else /* #ifdef CONFIG_NO_HZ */
187
188static inline void rcu_enter_nohz(void)
189{
190}
191
192static inline void rcu_exit_nohz(void)
193{
194}
195
196#endif /* #else #ifdef CONFIG_NO_HZ */
197 192
198/* 193/*
199 * Infrastructure to implement the synchronize_() primitives in 194 * Infrastructure to implement the synchronize_() primitives in
@@ -233,22 +228,30 @@ static inline void destroy_rcu_head_on_stack(struct rcu_head *head)
233 228
234#ifdef CONFIG_DEBUG_LOCK_ALLOC 229#ifdef CONFIG_DEBUG_LOCK_ALLOC
235 230
236extern struct lockdep_map rcu_lock_map; 231#ifdef CONFIG_PROVE_RCU
237# define rcu_read_acquire() \ 232extern int rcu_is_cpu_idle(void);
238 lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) 233#else /* !CONFIG_PROVE_RCU */
239# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_) 234static inline int rcu_is_cpu_idle(void)
235{
236 return 0;
237}
238#endif /* else !CONFIG_PROVE_RCU */
240 239
241extern struct lockdep_map rcu_bh_lock_map; 240static inline void rcu_lock_acquire(struct lockdep_map *map)
242# define rcu_read_acquire_bh() \ 241{
243 lock_acquire(&rcu_bh_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) 242 WARN_ON_ONCE(rcu_is_cpu_idle());
244# define rcu_read_release_bh() lock_release(&rcu_bh_lock_map, 1, _THIS_IP_) 243 lock_acquire(map, 0, 0, 2, 1, NULL, _THIS_IP_);
244}
245 245
246extern struct lockdep_map rcu_sched_lock_map; 246static inline void rcu_lock_release(struct lockdep_map *map)
247# define rcu_read_acquire_sched() \ 247{
248 lock_acquire(&rcu_sched_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) 248 WARN_ON_ONCE(rcu_is_cpu_idle());
249# define rcu_read_release_sched() \ 249 lock_release(map, 1, _THIS_IP_);
250 lock_release(&rcu_sched_lock_map, 1, _THIS_IP_) 250}
251 251
252extern struct lockdep_map rcu_lock_map;
253extern struct lockdep_map rcu_bh_lock_map;
254extern struct lockdep_map rcu_sched_lock_map;
252extern int debug_lockdep_rcu_enabled(void); 255extern int debug_lockdep_rcu_enabled(void);
253 256
254/** 257/**
@@ -262,11 +265,18 @@ extern int debug_lockdep_rcu_enabled(void);
262 * 265 *
263 * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot 266 * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
264 * and while lockdep is disabled. 267 * and while lockdep is disabled.
268 *
269 * Note that rcu_read_lock() and the matching rcu_read_unlock() must
270 * occur in the same context, for example, it is illegal to invoke
271 * rcu_read_unlock() in process context if the matching rcu_read_lock()
272 * was invoked from within an irq handler.
265 */ 273 */
266static inline int rcu_read_lock_held(void) 274static inline int rcu_read_lock_held(void)
267{ 275{
268 if (!debug_lockdep_rcu_enabled()) 276 if (!debug_lockdep_rcu_enabled())
269 return 1; 277 return 1;
278 if (rcu_is_cpu_idle())
279 return 0;
270 return lock_is_held(&rcu_lock_map); 280 return lock_is_held(&rcu_lock_map);
271} 281}
272 282
@@ -290,6 +300,19 @@ extern int rcu_read_lock_bh_held(void);
290 * 300 *
291 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot 301 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
292 * and while lockdep is disabled. 302 * and while lockdep is disabled.
303 *
304 * Note that if the CPU is in the idle loop from an RCU point of
305 * view (i.e., that we are in the section between rcu_idle_enter() and
306 * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU
307 * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs
308 * that are in such a section, considering these as in extended quiescent
309 * state, so such a CPU is effectively never in an RCU read-side critical
310 * section regardless of what RCU primitives it invokes. This state of
311 * affairs is required --- we need to keep an RCU-free window in idle
312 * where the CPU may possibly enter into low power mode. This way we can
313 * notice an extended quiescent state to other CPUs that started a grace
314 * period. Otherwise we would delay any grace period as long as we run in
315 * the idle task.
293 */ 316 */
294#ifdef CONFIG_PREEMPT_COUNT 317#ifdef CONFIG_PREEMPT_COUNT
295static inline int rcu_read_lock_sched_held(void) 318static inline int rcu_read_lock_sched_held(void)
@@ -298,6 +321,8 @@ static inline int rcu_read_lock_sched_held(void)
298 321
299 if (!debug_lockdep_rcu_enabled()) 322 if (!debug_lockdep_rcu_enabled())
300 return 1; 323 return 1;
324 if (rcu_is_cpu_idle())
325 return 0;
301 if (debug_locks) 326 if (debug_locks)
302 lockdep_opinion = lock_is_held(&rcu_sched_lock_map); 327 lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
303 return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); 328 return lockdep_opinion || preempt_count() != 0 || irqs_disabled();
@@ -311,12 +336,8 @@ static inline int rcu_read_lock_sched_held(void)
311 336
312#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 337#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
313 338
314# define rcu_read_acquire() do { } while (0) 339# define rcu_lock_acquire(a) do { } while (0)
315# define rcu_read_release() do { } while (0) 340# define rcu_lock_release(a) do { } while (0)
316# define rcu_read_acquire_bh() do { } while (0)
317# define rcu_read_release_bh() do { } while (0)
318# define rcu_read_acquire_sched() do { } while (0)
319# define rcu_read_release_sched() do { } while (0)
320 341
321static inline int rcu_read_lock_held(void) 342static inline int rcu_read_lock_held(void)
322{ 343{
@@ -637,7 +658,7 @@ static inline void rcu_read_lock(void)
637{ 658{
638 __rcu_read_lock(); 659 __rcu_read_lock();
639 __acquire(RCU); 660 __acquire(RCU);
640 rcu_read_acquire(); 661 rcu_lock_acquire(&rcu_lock_map);
641} 662}
642 663
643/* 664/*
@@ -657,7 +678,7 @@ static inline void rcu_read_lock(void)
657 */ 678 */
658static inline void rcu_read_unlock(void) 679static inline void rcu_read_unlock(void)
659{ 680{
660 rcu_read_release(); 681 rcu_lock_release(&rcu_lock_map);
661 __release(RCU); 682 __release(RCU);
662 __rcu_read_unlock(); 683 __rcu_read_unlock();
663} 684}
@@ -673,12 +694,17 @@ static inline void rcu_read_unlock(void)
673 * critical sections in interrupt context can use just rcu_read_lock(), 694 * critical sections in interrupt context can use just rcu_read_lock(),
674 * though this should at least be commented to avoid confusing people 695 * though this should at least be commented to avoid confusing people
675 * reading the code. 696 * reading the code.
697 *
698 * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh()
699 * must occur in the same context, for example, it is illegal to invoke
700 * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh()
701 * was invoked from some other task.
676 */ 702 */
677static inline void rcu_read_lock_bh(void) 703static inline void rcu_read_lock_bh(void)
678{ 704{
679 local_bh_disable(); 705 local_bh_disable();
680 __acquire(RCU_BH); 706 __acquire(RCU_BH);
681 rcu_read_acquire_bh(); 707 rcu_lock_acquire(&rcu_bh_lock_map);
682} 708}
683 709
684/* 710/*
@@ -688,7 +714,7 @@ static inline void rcu_read_lock_bh(void)
688 */ 714 */
689static inline void rcu_read_unlock_bh(void) 715static inline void rcu_read_unlock_bh(void)
690{ 716{
691 rcu_read_release_bh(); 717 rcu_lock_release(&rcu_bh_lock_map);
692 __release(RCU_BH); 718 __release(RCU_BH);
693 local_bh_enable(); 719 local_bh_enable();
694} 720}
@@ -700,12 +726,17 @@ static inline void rcu_read_unlock_bh(void)
700 * are being done using call_rcu_sched() or synchronize_rcu_sched(). 726 * are being done using call_rcu_sched() or synchronize_rcu_sched().
701 * Read-side critical sections can also be introduced by anything that 727 * Read-side critical sections can also be introduced by anything that
702 * disables preemption, including local_irq_disable() and friends. 728 * disables preemption, including local_irq_disable() and friends.
729 *
730 * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched()
731 * must occur in the same context, for example, it is illegal to invoke
732 * rcu_read_unlock_sched() from process context if the matching
733 * rcu_read_lock_sched() was invoked from an NMI handler.
703 */ 734 */
704static inline void rcu_read_lock_sched(void) 735static inline void rcu_read_lock_sched(void)
705{ 736{
706 preempt_disable(); 737 preempt_disable();
707 __acquire(RCU_SCHED); 738 __acquire(RCU_SCHED);
708 rcu_read_acquire_sched(); 739 rcu_lock_acquire(&rcu_sched_lock_map);
709} 740}
710 741
711/* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ 742/* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */
@@ -722,7 +753,7 @@ static inline notrace void rcu_read_lock_sched_notrace(void)
722 */ 753 */
723static inline void rcu_read_unlock_sched(void) 754static inline void rcu_read_unlock_sched(void)
724{ 755{
725 rcu_read_release_sched(); 756 rcu_lock_release(&rcu_sched_lock_map);
726 __release(RCU_SCHED); 757 __release(RCU_SCHED);
727 preempt_enable(); 758 preempt_enable();
728} 759}
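
As a worked example of the same-context rule added to the comments above, a reader keeps rcu_read_lock() and the matching rcu_read_unlock() in one function; the list and item type here are hypothetical:

#include <linux/rcupdate.h>
#include <linux/rculist.h>

struct example_item {			/* hypothetical */
	int val;
	struct list_head node;
};
static LIST_HEAD(example_list);

static int example_reader(void)
{
	struct example_item *it;
	int sum = 0;

	rcu_read_lock();	/* matching unlock below, same context */
	list_for_each_entry_rcu(it, &example_list, node)
		sum += it->val;
	rcu_read_unlock();

	return sum;
}
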
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c4f3e9b9bc5..cf0eb342bcba 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -273,9 +273,11 @@ extern int runqueue_is_locked(int cpu);
273 273
274#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 274#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
275extern void select_nohz_load_balancer(int stop_tick); 275extern void select_nohz_load_balancer(int stop_tick);
276extern void set_cpu_sd_state_idle(void);
276extern int get_nohz_timer_target(void); 277extern int get_nohz_timer_target(void);
277#else 278#else
278static inline void select_nohz_load_balancer(int stop_tick) { } 279static inline void select_nohz_load_balancer(int stop_tick) { }
280static inline void set_cpu_sd_state_idle(void) { }
279#endif 281#endif
280 282
281/* 283/*
@@ -483,8 +485,8 @@ struct task_cputime {
483 485
484#define INIT_CPUTIME \ 486#define INIT_CPUTIME \
485 (struct task_cputime) { \ 487 (struct task_cputime) { \
486 .utime = cputime_zero, \ 488 .utime = 0, \
487 .stime = cputime_zero, \ 489 .stime = 0, \
488 .sum_exec_runtime = 0, \ 490 .sum_exec_runtime = 0, \
489 } 491 }
490 492
@@ -901,6 +903,10 @@ struct sched_group_power {
901 * single CPU. 903 * single CPU.
902 */ 904 */
903 unsigned int power, power_orig; 905 unsigned int power, power_orig;
906 /*
907 * Number of busy cpus in this group.
908 */
909 atomic_t nr_busy_cpus;
904}; 910};
905 911
906struct sched_group { 912struct sched_group {
@@ -925,6 +931,15 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
925 return to_cpumask(sg->cpumask); 931 return to_cpumask(sg->cpumask);
926} 932}
927 933
934/**
935 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
936 * @group: The group whose first cpu is to be returned.
937 */
938static inline unsigned int group_first_cpu(struct sched_group *group)
939{
940 return cpumask_first(sched_group_cpus(group));
941}
942
928struct sched_domain_attr { 943struct sched_domain_attr {
929 int relax_domain_level; 944 int relax_domain_level;
930}; 945};
@@ -1315,8 +1330,8 @@ struct task_struct {
1315 * older sibling, respectively. (p->father can be replaced with 1330 * older sibling, respectively. (p->father can be replaced with
1316 * p->real_parent->pid) 1331 * p->real_parent->pid)
1317 */ 1332 */
1318 struct task_struct *real_parent; /* real parent process */ 1333 struct task_struct __rcu *real_parent; /* real parent process */
1319 struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */ 1334 struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
1320 /* 1335 /*
1321 * children/sibling forms the list of my natural children 1336 * children/sibling forms the list of my natural children
1322 */ 1337 */
@@ -2070,6 +2085,14 @@ extern int sched_setscheduler(struct task_struct *, int,
2070extern int sched_setscheduler_nocheck(struct task_struct *, int, 2085extern int sched_setscheduler_nocheck(struct task_struct *, int,
2071 const struct sched_param *); 2086 const struct sched_param *);
2072extern struct task_struct *idle_task(int cpu); 2087extern struct task_struct *idle_task(int cpu);
2088/**
2089 * is_idle_task - is the specified task an idle task?
2090 * @p: the task in question.
2091 */
2092static inline bool is_idle_task(struct task_struct *p)
2093{
2094 return p->pid == 0;
2095}
2073extern struct task_struct *curr_task(int cpu); 2096extern struct task_struct *curr_task(int cpu);
2074extern void set_curr_task(int cpu, struct task_struct *p); 2097extern void set_curr_task(int cpu, struct task_struct *p);
2075 2098
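
A hedged sketch of the new is_idle_task() helper in use; the reporting function is hypothetical, and the kdb hunk further down makes the same substitution in real code:

#include <linux/kernel.h>
#include <linux/sched.h>

static void example_report(struct task_struct *p)	/* hypothetical */
{
	/* Preferred over the old open-coded "p->pid == 0" test. */
	if (is_idle_task(p))
		pr_info("idle task is running on cpu %d\n", task_cpu(p));
}
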
diff --git a/include/linux/security.h b/include/linux/security.h
index 19d8e04e1688..e8c619d39291 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2056,7 +2056,7 @@ static inline int security_old_inode_init_security(struct inode *inode,
2056 char **name, void **value, 2056 char **name, void **value,
2057 size_t *len) 2057 size_t *len)
2058{ 2058{
2059 return 0; 2059 return -EOPNOTSUPP;
2060} 2060}
2061 2061
2062static inline int security_inode_create(struct inode *dir, 2062static inline int security_inode_create(struct inode *dir,
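
With the !CONFIG_SECURITY stub now returning -EOPNOTSUPP instead of 0, filesystem callers can tell "no label to write" apart from success. A hedged sketch of caller handling; the function and its error policy are illustrative:

#include <linux/security.h>

static int example_init_security(struct inode *inode, struct inode *dir,
				 const struct qstr *qstr)	/* hypothetical */
{
	char *name;
	void *value;
	size_t len;
	int err;

	err = security_old_inode_init_security(inode, dir, qstr,
					       &name, &value, &len);
	if (err == -EOPNOTSUPP)	/* stub, or no label: nothing to do */
		return 0;
	if (err)
		return err;
	/* ...persist (name, value, len) as an xattr, then free them... */
	return 0;
}
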
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 58971e891f48..e1b005918bbb 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -28,6 +28,7 @@
28#define _LINUX_SRCU_H 28#define _LINUX_SRCU_H
29 29
30#include <linux/mutex.h> 30#include <linux/mutex.h>
31#include <linux/rcupdate.h>
31 32
32struct srcu_struct_array { 33struct srcu_struct_array {
33 int c[2]; 34 int c[2];
@@ -60,18 +61,10 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name,
60 __init_srcu_struct((sp), #sp, &__srcu_key); \ 61 __init_srcu_struct((sp), #sp, &__srcu_key); \
61}) 62})
62 63
63# define srcu_read_acquire(sp) \
64 lock_acquire(&(sp)->dep_map, 0, 0, 2, 1, NULL, _THIS_IP_)
65# define srcu_read_release(sp) \
66 lock_release(&(sp)->dep_map, 1, _THIS_IP_)
67
68#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 64#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
69 65
70int init_srcu_struct(struct srcu_struct *sp); 66int init_srcu_struct(struct srcu_struct *sp);
71 67
72# define srcu_read_acquire(sp) do { } while (0)
73# define srcu_read_release(sp) do { } while (0)
74
75#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 68#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
76 69
77void cleanup_srcu_struct(struct srcu_struct *sp); 70void cleanup_srcu_struct(struct srcu_struct *sp);
@@ -90,12 +83,32 @@ long srcu_batches_completed(struct srcu_struct *sp);
90 * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, 83 * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC,
91 * this assumes we are in an SRCU read-side critical section unless it can 84 * this assumes we are in an SRCU read-side critical section unless it can
92 * prove otherwise. 85 * prove otherwise.
86 *
87 * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
88 * and while lockdep is disabled.
89 *
90 * Note that if the CPU is in the idle loop from an RCU point of view
91 * (i.e., that we are in the section between rcu_idle_enter() and
92 * rcu_idle_exit()) then srcu_read_lock_held() returns false even if
93 * the CPU did an srcu_read_lock(). The reason for this is that RCU
94 * ignores CPUs that are in such a section, considering these as in
95 * extended quiescent state, so such a CPU is effectively never in an
96 * RCU read-side critical section regardless of what RCU primitives it
97 * invokes. This state of affairs is required --- we need to keep an
98 * RCU-free window in idle where the CPU may possibly enter into low
99 * power mode. This way we can notice an extended quiescent state to
100 * other CPUs that started a grace period. Otherwise we would delay any
101 * grace period as long as we run in the idle task.
93 */ 102 */
94static inline int srcu_read_lock_held(struct srcu_struct *sp) 103static inline int srcu_read_lock_held(struct srcu_struct *sp)
95{ 104{
96 if (debug_locks) 105 if (rcu_is_cpu_idle())
97 return lock_is_held(&sp->dep_map); 106 return 0;
98 return 1; 107
108 if (!debug_lockdep_rcu_enabled())
109 return 1;
110
111 return lock_is_held(&sp->dep_map);
99} 112}
100 113
101#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 114#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
@@ -145,12 +158,17 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp)
145 * one way to indirectly wait on an SRCU grace period is to acquire 158 * one way to indirectly wait on an SRCU grace period is to acquire
146 * a mutex that is held elsewhere while calling synchronize_srcu() or 159 * a mutex that is held elsewhere while calling synchronize_srcu() or
147 * synchronize_srcu_expedited(). 160 * synchronize_srcu_expedited().
161 *
162 * Note that srcu_read_lock() and the matching srcu_read_unlock() must
163 * occur in the same context, for example, it is illegal to invoke
164 * srcu_read_unlock() in an irq handler if the matching srcu_read_lock()
165 * was invoked in process context.
148 */ 166 */
149static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) 167static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp)
150{ 168{
151 int retval = __srcu_read_lock(sp); 169 int retval = __srcu_read_lock(sp);
152 170
153 srcu_read_acquire(sp); 171 rcu_lock_acquire(&(sp)->dep_map);
154 return retval; 172 return retval;
155} 173}
156 174
@@ -164,8 +182,51 @@ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp)
164static inline void srcu_read_unlock(struct srcu_struct *sp, int idx) 182static inline void srcu_read_unlock(struct srcu_struct *sp, int idx)
165 __releases(sp) 183 __releases(sp)
166{ 184{
167 srcu_read_release(sp); 185 rcu_lock_release(&(sp)->dep_map);
186 __srcu_read_unlock(sp, idx);
187}
188
189/**
190 * srcu_read_lock_raw - register a new reader for an SRCU-protected structure.
191 * @sp: srcu_struct in which to register the new reader.
192 *
193 * Enter an SRCU read-side critical section. Similar to srcu_read_lock(),
194 * but avoids the RCU-lockdep checking. This means that it is legal to
195 * use srcu_read_lock_raw() in one context, for example, in an exception
196 * handler, and then have the matching srcu_read_unlock_raw() in another
197 * context, for example in the task that took the exception.
198 *
199 * However, the entire SRCU read-side critical section must reside within a
200 * single task. For example, beware of using srcu_read_lock_raw() in
201 * a device interrupt handler and srcu_read_unlock_raw() in the interrupted
202 * task: This will not work if interrupts are threaded.
203 */
204static inline int srcu_read_lock_raw(struct srcu_struct *sp)
205{
206 unsigned long flags;
207 int ret;
208
209 local_irq_save(flags);
210 ret = __srcu_read_lock(sp);
211 local_irq_restore(flags);
212 return ret;
213}
214
215/**
216 * srcu_read_unlock_raw - unregister reader from an SRCU-protected structure.
217 * @sp: srcu_struct in which to unregister the old reader.
218 * @idx: return value from corresponding srcu_read_lock_raw().
219 *
220 * Exit an SRCU read-side critical section without lockdep-RCU checking.
221 * See srcu_read_lock_raw() for more details.
222 */
223static inline void srcu_read_unlock_raw(struct srcu_struct *sp, int idx)
224{
225 unsigned long flags;
226
227 local_irq_save(flags);
168 __srcu_read_unlock(sp, idx); 228 __srcu_read_unlock(sp, idx);
229 local_irq_restore(flags);
169} 230}
170 231
171#endif 232#endif
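
The _raw variants above may have lock and unlock split across contexts as long as both run in the same task. A minimal sketch with a hypothetical srcu_struct:

#include <linux/srcu.h>

static struct srcu_struct example_srcu;	/* hypothetical; init elsewhere */
static int example_idx;

/* Entered from, say, an exception handler... */
static void example_enter(void)
{
	example_idx = srcu_read_lock_raw(&example_srcu);
}

/* ...and exited later from the interrupted task. Legal for the _raw
 * API, provided both calls happen in the same task. */
static void example_exit(void)
{
	srcu_read_unlock_raw(&example_srcu, example_idx);
}
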
diff --git a/include/linux/tick.h b/include/linux/tick.h
index b232ccc0ee29..ab8be90b5cc9 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -7,6 +7,7 @@
7#define _LINUX_TICK_H 7#define _LINUX_TICK_H
8 8
9#include <linux/clockchips.h> 9#include <linux/clockchips.h>
10#include <linux/irqflags.h>
10 11
11#ifdef CONFIG_GENERIC_CLOCKEVENTS 12#ifdef CONFIG_GENERIC_CLOCKEVENTS
12 13
@@ -121,14 +122,16 @@ static inline int tick_oneshot_mode_active(void) { return 0; }
121#endif /* !CONFIG_GENERIC_CLOCKEVENTS */ 122#endif /* !CONFIG_GENERIC_CLOCKEVENTS */
122 123
123# ifdef CONFIG_NO_HZ 124# ifdef CONFIG_NO_HZ
124extern void tick_nohz_stop_sched_tick(int inidle); 125extern void tick_nohz_idle_enter(void);
125extern void tick_nohz_restart_sched_tick(void); 126extern void tick_nohz_idle_exit(void);
127extern void tick_nohz_irq_exit(void);
126extern ktime_t tick_nohz_get_sleep_length(void); 128extern ktime_t tick_nohz_get_sleep_length(void);
127extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); 129extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
128extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); 130extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
129# else 131# else
130static inline void tick_nohz_stop_sched_tick(int inidle) { } 132static inline void tick_nohz_idle_enter(void) { }
131static inline void tick_nohz_restart_sched_tick(void) { } 133static inline void tick_nohz_idle_exit(void) { }
134
132static inline ktime_t tick_nohz_get_sleep_length(void) 135static inline ktime_t tick_nohz_get_sleep_length(void)
133{ 136{
134 ktime_t len = { .tv64 = NSEC_PER_SEC/HZ }; 137 ktime_t len = { .tv64 = NSEC_PER_SEC/HZ };
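
The renamed hooks pair with rcu_idle_enter()/rcu_idle_exit() from the rcupdate.h hunk above: the tick is stopped first, RCU is told about idle last, and both are undone in reverse order. A hedged sketch of the resulting idle-loop shape; the body is illustrative, not any particular architecture's code:

#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/tick.h>

static void example_cpu_idle(void)	/* hypothetical */
{
	while (1) {
		tick_nohz_idle_enter();
		rcu_idle_enter();
		while (!need_resched())
			cpu_relax();	/* arch would use a real idle wait */
		rcu_idle_exit();
		tick_nohz_idle_exit();
		schedule();
	}
}
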
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 3efc9f3f43a0..a9ce45e8501c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -77,13 +77,13 @@ struct task_struct;
77#define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ 77#define __WAIT_BIT_KEY_INITIALIZER(word, bit) \
78 { .flags = word, .bit_nr = bit, } 78 { .flags = word, .bit_nr = bit, }
79 79
80extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *); 80extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *);
81 81
82#define init_waitqueue_head(q) \ 82#define init_waitqueue_head(q) \
83 do { \ 83 do { \
84 static struct lock_class_key __key; \ 84 static struct lock_class_key __key; \
85 \ 85 \
86 __init_waitqueue_head((q), &__key); \ 86 __init_waitqueue_head((q), #q, &__key); \
87 } while (0) 87 } while (0)
88 88
89#ifdef CONFIG_LOCKDEP 89#ifdef CONFIG_LOCKDEP
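
The extra argument lets lockdep name waitqueues after their variables; call sites are unchanged. A small sketch with a hypothetical queue:

#include <linux/wait.h>

static wait_queue_head_t example_wq;	/* hypothetical */

static void example_setup(void)
{
	/* Expands to __init_waitqueue_head(&example_wq, "&example_wq", &__key),
	 * so lockdep reports now carry the queue's variable name. */
	init_waitqueue_head(&example_wq);
}
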
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 873d5be7926c..e5a7b9aaf552 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1207,7 +1207,7 @@ extern void ip_vs_control_cleanup(void);
1207extern struct ip_vs_dest * 1207extern struct ip_vs_dest *
1208ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr, 1208ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr,
1209 __be16 dport, const union nf_inet_addr *vaddr, __be16 vport, 1209 __be16 dport, const union nf_inet_addr *vaddr, __be16 vport,
1210 __u16 protocol, __u32 fwmark); 1210 __u16 protocol, __u32 fwmark, __u32 flags);
1211extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp); 1211extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp);
1212 1212
1213 1213
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 669fbd62ec25..d2d88bed891b 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -241,24 +241,73 @@ TRACE_EVENT(rcu_fqs,
241 241
242/* 242/*
243 * Tracepoint for dyntick-idle entry/exit events. These take a string 243 * Tracepoint for dyntick-idle entry/exit events. These take a string
244 * as argument: "Start" for entering dyntick-idle mode and "End" for 244 * as argument: "Start" for entering dyntick-idle mode, "End" for
245 * leaving it. 245 * leaving it, "--=" for events moving towards idle, and "++=" for events
246 * moving away from idle. "Error on entry: not idle task" and "Error on
247 * exit: not idle task" indicate that a non-idle task is erroneously
248 * toying with the idle loop.
249 *
250 * These events also take a pair of numbers, which indicate the nesting
251 * depth before and after the event of interest. Note that task-related
252 * events use the upper bits of each number, while interrupt-related
253 * events use the lower bits.
246 */ 254 */
247TRACE_EVENT(rcu_dyntick, 255TRACE_EVENT(rcu_dyntick,
248 256
249 TP_PROTO(char *polarity), 257 TP_PROTO(char *polarity, long long oldnesting, long long newnesting),
250 258
251 TP_ARGS(polarity), 259 TP_ARGS(polarity, oldnesting, newnesting),
252 260
253 TP_STRUCT__entry( 261 TP_STRUCT__entry(
254 __field(char *, polarity) 262 __field(char *, polarity)
263 __field(long long, oldnesting)
264 __field(long long, newnesting)
255 ), 265 ),
256 266
257 TP_fast_assign( 267 TP_fast_assign(
258 __entry->polarity = polarity; 268 __entry->polarity = polarity;
269 __entry->oldnesting = oldnesting;
270 __entry->newnesting = newnesting;
271 ),
272
273 TP_printk("%s %llx %llx", __entry->polarity,
274 __entry->oldnesting, __entry->newnesting)
275);
276
277/*
278 * Tracepoint for RCU preparation for idle, the goal being to get RCU
279 * processing done so that the current CPU can shut off its scheduling
280 * clock and enter dyntick-idle mode. One way to accomplish this is
281 * to drain all RCU callbacks from this CPU, and the other is to have
282 * done everything RCU requires for the current grace period. In this
283 * latter case, the CPU will be awakened at the end of the current grace
284 * period in order to process the remainder of its callbacks.
285 *
286 * These tracepoints take a string as argument:
287 *
288 * "No callbacks": Nothing to do, no callbacks on this CPU.
289 * "In holdoff": Nothing to do, holding off after unsuccessful attempt.
290 * "Begin holdoff": Attempt failed, don't retry until next jiffy.
291 * "Dyntick with callbacks": Entering dyntick-idle despite callbacks.
292 * "More callbacks": Still more callbacks, try again to clear them out.
293 * "Callbacks drained": All callbacks processed, off to dyntick idle!
294 * "Timer": Timer fired to cause CPU to continue processing callbacks.
295 */
296TRACE_EVENT(rcu_prep_idle,
297
298 TP_PROTO(char *reason),
299
300 TP_ARGS(reason),
301
302 TP_STRUCT__entry(
303 __field(char *, reason)
304 ),
305
306 TP_fast_assign(
307 __entry->reason = reason;
259 ), 308 ),
260 309
261 TP_printk("%s", __entry->polarity) 310 TP_printk("%s", __entry->reason)
262); 311);
263 312
264/* 313/*
@@ -412,27 +461,71 @@ TRACE_EVENT(rcu_invoke_kfree_callback,
412 461
413/* 462/*
414 * Tracepoint for exiting rcu_do_batch after RCU callbacks have been 463 * Tracepoint for exiting rcu_do_batch after RCU callbacks have been
415 * invoked. The first argument is the name of the RCU flavor and 464 * invoked. The first argument is the name of the RCU flavor,
416 * the second argument is number of callbacks actually invoked. 465 * the second argument is number of callbacks actually invoked,
466 * the third argument (cb) is whether or not any of the callbacks that
467 * were ready to invoke at the beginning of this batch are still
468 * queued, the fourth argument (nr) is the return value of need_resched(),
469 * the fifth argument (iit) is 1 if the current task is the idle task,
470 * and the sixth argument (risk) is the return value from
471 * rcu_is_callbacks_kthread().
417 */ 472 */
418TRACE_EVENT(rcu_batch_end, 473TRACE_EVENT(rcu_batch_end,
419 474
420 TP_PROTO(char *rcuname, int callbacks_invoked), 475 TP_PROTO(char *rcuname, int callbacks_invoked,
476 bool cb, bool nr, bool iit, bool risk),
421 477
422 TP_ARGS(rcuname, callbacks_invoked), 478 TP_ARGS(rcuname, callbacks_invoked, cb, nr, iit, risk),
423 479
424 TP_STRUCT__entry( 480 TP_STRUCT__entry(
425 __field(char *, rcuname) 481 __field(char *, rcuname)
426 __field(int, callbacks_invoked) 482 __field(int, callbacks_invoked)
483 __field(bool, cb)
484 __field(bool, nr)
485 __field(bool, iit)
486 __field(bool, risk)
427 ), 487 ),
428 488
429 TP_fast_assign( 489 TP_fast_assign(
430 __entry->rcuname = rcuname; 490 __entry->rcuname = rcuname;
431 __entry->callbacks_invoked = callbacks_invoked; 491 __entry->callbacks_invoked = callbacks_invoked;
492 __entry->cb = cb;
493 __entry->nr = nr;
494 __entry->iit = iit;
495 __entry->risk = risk;
496 ),
497
498 TP_printk("%s CBs-invoked=%d idle=%c%c%c%c",
499 __entry->rcuname, __entry->callbacks_invoked,
500 __entry->cb ? 'C' : '.',
501 __entry->nr ? 'S' : '.',
502 __entry->iit ? 'I' : '.',
503 __entry->risk ? 'R' : '.')
504);
505
506/*
507 * Tracepoint for rcutorture readers. The first argument is the name
508 * of the RCU flavor from rcutorture's viewpoint and the second argument
509 * is the callback address.
510 */
511TRACE_EVENT(rcu_torture_read,
512
513 TP_PROTO(char *rcutorturename, struct rcu_head *rhp),
514
515 TP_ARGS(rcutorturename, rhp),
516
517 TP_STRUCT__entry(
518 __field(char *, rcutorturename)
519 __field(struct rcu_head *, rhp)
520 ),
521
522 TP_fast_assign(
523 __entry->rcutorturename = rcutorturename;
524 __entry->rhp = rhp;
432 ), 525 ),
433 526
434 TP_printk("%s CBs-invoked=%d", 527 TP_printk("%s torture read %p",
435 __entry->rcuname, __entry->callbacks_invoked) 528 __entry->rcutorturename, __entry->rhp)
436); 529);
437 530
438#else /* #ifdef CONFIG_RCU_TRACE */ 531#else /* #ifdef CONFIG_RCU_TRACE */
@@ -443,13 +536,16 @@ TRACE_EVENT(rcu_batch_end,
443#define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) 536#define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0)
444#define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) 537#define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0)
445#define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) 538#define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0)
446#define trace_rcu_dyntick(polarity) do { } while (0) 539#define trace_rcu_dyntick(polarity, oldnesting, newnesting) do { } while (0)
540#define trace_rcu_prep_idle(reason) do { } while (0)
447#define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) 541#define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0)
448#define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) 542#define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0)
449#define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) 543#define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0)
450#define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0) 544#define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0)
451#define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0) 545#define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0)
452#define trace_rcu_batch_end(rcuname, callbacks_invoked) do { } while (0) 546#define trace_rcu_batch_end(rcuname, callbacks_invoked, cb, nr, iit, risk) \
547 do { } while (0)
548#define trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
453 549
454#endif /* #else #ifdef CONFIG_RCU_TRACE */ 550#endif /* #else #ifdef CONFIG_RCU_TRACE */
455 551
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 959ff18b63b6..6ba596b07a72 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -331,6 +331,13 @@ DEFINE_EVENT(sched_stat_template, sched_stat_iowait,
331 TP_ARGS(tsk, delay)); 331 TP_ARGS(tsk, delay));
332 332
333/* 333/*
334 * Tracepoint for accounting blocked time (time the task spends in uninterruptible sleep).
335 */
336DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
337 TP_PROTO(struct task_struct *tsk, u64 delay),
338 TP_ARGS(tsk, delay));
339
340/*
334 * Tracepoint for accounting runtime (time the task is executing 341 * Tracepoint for accounting runtime (time the task is executing
335 * on a CPU). 342 * on a CPU).
336 */ 343 */
@@ -363,6 +370,56 @@ TRACE_EVENT(sched_stat_runtime,
363 (unsigned long long)__entry->vruntime) 370 (unsigned long long)__entry->vruntime)
364); 371);
365 372
373#ifdef CREATE_TRACE_POINTS
374static inline u64 trace_get_sleeptime(struct task_struct *tsk)
375{
376#ifdef CONFIG_SCHEDSTATS
377 u64 block, sleep;
378
379 block = tsk->se.statistics.block_start;
380 sleep = tsk->se.statistics.sleep_start;
381 tsk->se.statistics.block_start = 0;
382 tsk->se.statistics.sleep_start = 0;
383
384 return block ? block : sleep ? sleep : 0;
385#else
386 return 0;
387#endif
388}
389#endif
390
391/*
392 * Tracepoint for accounting sleeptime (time the task is sleeping
393 * or waiting for I/O).
394 */
395TRACE_EVENT(sched_stat_sleeptime,
396
397 TP_PROTO(struct task_struct *tsk, u64 now),
398
399 TP_ARGS(tsk, now),
400
401 TP_STRUCT__entry(
402 __array( char, comm, TASK_COMM_LEN )
403 __field( pid_t, pid )
404 __field( u64, sleeptime )
405 ),
406
407 TP_fast_assign(
408 memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
409 __entry->pid = tsk->pid;
410 __entry->sleeptime = trace_get_sleeptime(tsk);
411 __entry->sleeptime = __entry->sleeptime ?
412 now - __entry->sleeptime : 0;
413 )
414 TP_perf_assign(
415 __perf_count(__entry->sleeptime);
416 ),
417
418 TP_printk("comm=%s pid=%d sleeptime=%Lu [ns]",
419 __entry->comm, __entry->pid,
420 (unsigned long long)__entry->sleeptime)
421);
422
366/* 423/*
367 * Tracepoint for showing priority inheritance modifying a task's 424 * Tracepoint for showing priority inheritance modifying a task's
368 * priority. 425 * priority.
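
The sleeptime reported above is simply the delta from the recorded block/sleep start, or zero when the task never blocked; a tiny sketch of that arithmetic, mirroring the TP_fast_assign() logic:

#include <linux/types.h>

static u64 example_sleeptime(u64 start, u64 now)	/* hypothetical */
{
	return start ? now - start : 0;
}
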
diff --git a/init/Kconfig b/init/Kconfig
index 43298f9810fb..82b6a4c675b2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -469,14 +469,14 @@ config RCU_FANOUT_EXACT
469 469
470config RCU_FAST_NO_HZ 470config RCU_FAST_NO_HZ
471 bool "Accelerate last non-dyntick-idle CPU's grace periods" 471 bool "Accelerate last non-dyntick-idle CPU's grace periods"
472 depends on TREE_RCU && NO_HZ && SMP 472 depends on NO_HZ && SMP
473 default n 473 default n
474 help 474 help
475 This option causes RCU to attempt to accelerate grace periods 475 This option causes RCU to attempt to accelerate grace periods
476 in order to allow the final CPU to enter dynticks-idle state 476 in order to allow CPUs to enter dynticks-idle state more
477 more quickly. On the other hand, this option increases the 477 quickly. On the other hand, this option increases the overhead
478 overhead of the dynticks-idle checking, particularly on systems 478 of the dynticks-idle checking, particularly on systems with
479 with large numbers of CPUs. 479 large numbers of CPUs.
480 480
481 Say Y if energy efficiency is critically important, particularly 481 Say Y if energy efficiency is critically important, particularly
482 if you have relatively few CPUs. 482 if you have relatively few CPUs.
diff --git a/init/main.c b/init/main.c
index 217ed23e9487..2c76efb513c2 100644
--- a/init/main.c
+++ b/init/main.c
@@ -469,13 +469,12 @@ asmlinkage void __init start_kernel(void)
469 char * command_line; 469 char * command_line;
470 extern const struct kernel_param __start___param[], __stop___param[]; 470 extern const struct kernel_param __start___param[], __stop___param[];
471 471
472 smp_setup_processor_id();
473
474 /* 472 /*
475 * Need to run as early as possible, to initialize the 473 * Need to run as early as possible, to initialize the
476 * lockdep hash: 474 * lockdep hash:
477 */ 475 */
478 lockdep_init(); 476 lockdep_init();
477 smp_setup_processor_id();
479 debug_objects_early_init(); 478 debug_objects_early_init();
480 479
481 /* 480 /*
diff --git a/kernel/Makefile b/kernel/Makefile
index e898c5b9d02c..f70396e5a24b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,16 +2,15 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ 5obj-y = fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o cred.o \
13 async.o range.o 13 async.o range.o groups.o
14obj-y += groups.o
15 14
16ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
17# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
@@ -20,10 +19,11 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg
20CFLAGS_REMOVE_mutex-debug.o = -pg 19CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 20CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 21CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_irq_work.o = -pg 22CFLAGS_REMOVE_irq_work.o = -pg
25endif 23endif
26 24
25obj-y += sched/
26
27obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
28obj-$(CONFIG_PROFILING) += profile.o 28obj-$(CONFIG_PROFILING) += profile.o
29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
@@ -99,7 +99,6 @@ obj-$(CONFIG_TRACING) += trace/
99obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
100obj-$(CONFIG_RING_BUFFER) += trace/ 100obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_TRACEPOINTS) += trace/ 101obj-$(CONFIG_TRACEPOINTS) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o
103obj-$(CONFIG_IRQ_WORK) += irq_work.o 102obj-$(CONFIG_IRQ_WORK) += irq_work.o
104obj-$(CONFIG_CPU_PM) += cpu_pm.o 103obj-$(CONFIG_CPU_PM) += cpu_pm.o
105 104
@@ -110,15 +109,6 @@ obj-$(CONFIG_PADATA) += padata.o
110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
111obj-$(CONFIG_JUMP_LABEL) += jump_label.o 110obj-$(CONFIG_JUMP_LABEL) += jump_label.o
112 111
113ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
114# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
115# needed for x86 only. Why this used to be enabled for all architectures is beyond
116# me. I suspect most platforms don't need this, but until we know that for sure
117# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
118# to get a correct value for the wait-channel (WCHAN in ps). --davidm
119CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
120endif
121
122$(obj)/configs.o: $(obj)/config_data.h 112$(obj)/configs.o: $(obj)/config_data.h
123 113
124# config_data.h contains the same information as ikconfig.h but gzipped. 114# config_data.h contains the same information as ikconfig.h but gzipped.
diff --git a/kernel/acct.c b/kernel/acct.c
index fa7eb3de2ddc..203dfead2e06 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -613,8 +613,8 @@ void acct_collect(long exitcode, int group_dead)
613 pacct->ac_flag |= ACORE; 613 pacct->ac_flag |= ACORE;
614 if (current->flags & PF_SIGNALED) 614 if (current->flags & PF_SIGNALED)
615 pacct->ac_flag |= AXSIG; 615 pacct->ac_flag |= AXSIG;
616 pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); 616 pacct->ac_utime += current->utime;
617 pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); 617 pacct->ac_stime += current->stime;
618 pacct->ac_minflt += current->min_flt; 618 pacct->ac_minflt += current->min_flt;
619 pacct->ac_majflt += current->maj_flt; 619 pacct->ac_majflt += current->maj_flt;
620 spin_unlock_irq(&current->sighand->siglock); 620 spin_unlock_irq(&current->sighand->siglock);
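
This hunk belongs to the series-wide cputime rework: cputime_t now behaves as an ordinary integer type, so cputime_add() and cputime_eq() drop out in favour of plain operators. A hedged sketch of the new style with a hypothetical accumulator:

#include <linux/sched.h>

static void example_account(struct task_struct *t, cputime_t *total)
{
	*total += t->utime + t->stime;	/* was: nested cputime_add() */
}
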
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 563f13609470..5ca38d5d238a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -178,8 +178,7 @@ static inline void check_for_tasks(int cpu)
178 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
179 for_each_process(p) { 179 for_each_process(p) {
180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
181 (!cputime_eq(p->utime, cputime_zero) || 181 (p->utime || p->stime))
182 !cputime_eq(p->stime, cputime_zero)))
183 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 182 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
184 "(state = %ld, flags = %x)\n", 183 "(state = %ld, flags = %x)\n",
185 p->comm, task_pid_nr(p), cpu, 184 p->comm, task_pid_nr(p), cpu,
@@ -380,6 +379,7 @@ out:
380 cpu_maps_update_done(); 379 cpu_maps_update_done();
381 return err; 380 return err;
382} 381}
382EXPORT_SYMBOL_GPL(cpu_up);
383 383
384#ifdef CONFIG_PM_SLEEP_SMP 384#ifdef CONFIG_PM_SLEEP_SMP
385static cpumask_var_t frozen_cpus; 385static cpumask_var_t frozen_cpus;
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 5532dd37aa86..7d6fb40d2188 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p)
636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' : 636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
637 (p->exit_state & EXIT_DEAD) ? 'E' : 637 (p->exit_state & EXIT_DEAD) ? 'E' :
638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; 638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
639 if (p->pid == 0) { 639 if (is_idle_task(p)) {
640 /* Idle task. Is it really idle, apart from the kdb 640 /* Idle task. Is it really idle, apart from the kdb
641 * interrupt? */ 641 * interrupt? */
642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { 642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 89e5e8aa4c36..22d901f9caf4 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = -pg
3endif 3endif
4 4
5obj-y := core.o ring_buffer.o 5obj-y := core.o ring_buffer.o callchain.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
new file mode 100644
index 000000000000..057e24b665cf
--- /dev/null
+++ b/kernel/events/callchain.c
@@ -0,0 +1,191 @@
+/*
+ * Performance events callchain code, extracted from core.c:
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+struct callchain_cpus_entries {
+	struct rcu_head			rcu_head;
+	struct perf_callchain_entry	*cpu_entries[0];
+};
+
+static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
+static atomic_t nr_callchain_events;
+static DEFINE_MUTEX(callchain_mutex);
+static struct callchain_cpus_entries *callchain_cpus_entries;
+
+
+__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+				  struct pt_regs *regs)
+{
+}
+
+__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+				struct pt_regs *regs)
+{
+}
+
+static void release_callchain_buffers_rcu(struct rcu_head *head)
+{
+	struct callchain_cpus_entries *entries;
+	int cpu;
+
+	entries = container_of(head, struct callchain_cpus_entries, rcu_head);
+
+	for_each_possible_cpu(cpu)
+		kfree(entries->cpu_entries[cpu]);
+
+	kfree(entries);
+}
+
+static void release_callchain_buffers(void)
+{
+	struct callchain_cpus_entries *entries;
+
+	entries = callchain_cpus_entries;
+	rcu_assign_pointer(callchain_cpus_entries, NULL);
+	call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
+}
+
+static int alloc_callchain_buffers(void)
+{
+	int cpu;
+	int size;
+	struct callchain_cpus_entries *entries;
+
+	/*
+	 * We can't use the percpu allocation API for data that can be
+	 * accessed from NMI. Use a temporary manual per cpu allocation
+	 * until that gets sorted out.
+	 */
+	size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
+
+	entries = kzalloc(size, GFP_KERNEL);
+	if (!entries)
+		return -ENOMEM;
+
+	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+
+	for_each_possible_cpu(cpu) {
+		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
+							 cpu_to_node(cpu));
+		if (!entries->cpu_entries[cpu])
+			goto fail;
+	}
+
+	rcu_assign_pointer(callchain_cpus_entries, entries);
+
+	return 0;
+
+fail:
+	for_each_possible_cpu(cpu)
+		kfree(entries->cpu_entries[cpu]);
+	kfree(entries);
+
+	return -ENOMEM;
+}
+
+int get_callchain_buffers(void)
+{
+	int err = 0;
+	int count;
+
+	mutex_lock(&callchain_mutex);
+
+	count = atomic_inc_return(&nr_callchain_events);
+	if (WARN_ON_ONCE(count < 1)) {
+		err = -EINVAL;
+		goto exit;
+	}
+
+	if (count > 1) {
+		/* If the allocation failed, give up */
+		if (!callchain_cpus_entries)
+			err = -ENOMEM;
+		goto exit;
+	}
+
+	err = alloc_callchain_buffers();
+	if (err)
+		release_callchain_buffers();
+exit:
+	mutex_unlock(&callchain_mutex);
+
+	return err;
+}
+
+void put_callchain_buffers(void)
+{
+	if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
+		release_callchain_buffers();
+		mutex_unlock(&callchain_mutex);
+	}
+}
+
+static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+{
+	int cpu;
+	struct callchain_cpus_entries *entries;
+
+	*rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
+	if (*rctx == -1)
+		return NULL;
+
+	entries = rcu_dereference(callchain_cpus_entries);
+	if (!entries)
+		return NULL;
+
+	cpu = smp_processor_id();
+
+	return &entries->cpu_entries[cpu][*rctx];
+}
+
+static void
+put_callchain_entry(int rctx)
+{
+	put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
+}
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+	int rctx;
+	struct perf_callchain_entry *entry;
+
+
+	entry = get_callchain_entry(&rctx);
+	if (rctx == -1)
+		return NULL;
+
+	if (!entry)
+		goto exit_put;
+
+	entry->nr = 0;
+
+	if (!user_mode(regs)) {
+		perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+		perf_callchain_kernel(entry, regs);
+		if (current->mm)
+			regs = task_pt_regs(current);
+		else
+			regs = NULL;
+	}
+
+	if (regs) {
+		perf_callchain_store(entry, PERF_CONTEXT_USER);
+		perf_callchain_user(entry, regs);
+	}
+
+exit_put:
+	put_callchain_entry(rctx);
+
+	return entry;
+}
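The alloc/release pair in this new file is the classic RCU publish/retire pattern: alloc_callchain_buffers() fills the per-CPU array completely before publishing it with rcu_assign_pointer(), and release_callchain_buffers() unpublishes the pointer first, handing the actual kfree() work to call_rcu(), so an NMI that has already done rcu_dereference() keeps a valid buffer until its grace period ends. A minimal userspace model of that lifecycle (all names here are illustrative, and the deferred free is collapsed into a direct call at the point where call_rcu() would fire):

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

struct entries {
	int *cpu_entries[NR_CPUS];
};

static struct entries *live;	/* plays the role of callchain_cpus_entries */

static struct entries *alloc_buffers(void)
{
	struct entries *e = calloc(1, sizeof(*e));

	for (int cpu = 0; e && cpu < NR_CPUS; cpu++)
		e->cpu_entries[cpu] = calloc(16, sizeof(int));
	return e;	/* only published by the caller once fully built */
}

static void free_buffers(struct entries *e)	/* models the RCU callback */
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		free(e->cpu_entries[cpu]);
	free(e);
}

static void release_buffers(void)
{
	struct entries *old = live;

	live = NULL;		/* unpublish first (rcu_assign_pointer) */
	/* in the kernel, free_buffers() would run from call_rcu() */
	free_buffers(old);
}

int main(void)
{
	live = alloc_buffers();
	printf("published %p\n", (void *)live);
	release_buffers();
	return 0;
}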
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 58690af323e4..890eb02c2f21 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -128,7 +128,7 @@ enum event_type_t {
  * perf_sched_events : >0 events exist
  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
  */
-struct jump_label_key perf_sched_events __read_mostly;
+struct jump_label_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -1130,6 +1130,8 @@ event_sched_out(struct perf_event *event,
 	if (!is_software_event(event))
 		cpuctx->active_oncpu--;
 	ctx->nr_active--;
+	if (event->attr.freq && event->attr.sample_freq)
+		ctx->nr_freq--;
 	if (event->attr.exclusive || !cpuctx->active_oncpu)
 		cpuctx->exclusive = 0;
 }
@@ -1325,6 +1327,7 @@ retry:
 	}
 	raw_spin_unlock_irq(&ctx->lock);
 }
+EXPORT_SYMBOL_GPL(perf_event_disable);
 
 static void perf_set_shadow_time(struct perf_event *event,
 				 struct perf_event_context *ctx,
@@ -1406,6 +1409,8 @@ event_sched_in(struct perf_event *event,
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
 	ctx->nr_active++;
+	if (event->attr.freq && event->attr.sample_freq)
+		ctx->nr_freq++;
 
 	if (event->attr.exclusive)
 		cpuctx->exclusive = 1;
@@ -1662,8 +1667,7 @@ retry:
  * Note: this works for group members as well as group leaders
  * since the non-leader members' sibling_lists will be empty.
  */
-static void __perf_event_mark_enabled(struct perf_event *event,
-					struct perf_event_context *ctx)
+static void __perf_event_mark_enabled(struct perf_event *event)
 {
 	struct perf_event *sub;
 	u64 tstamp = perf_event_time(event);
@@ -1701,7 +1705,7 @@ static int __perf_event_enable(void *info)
 	 */
 	perf_cgroup_set_timestamp(current, ctx);
 
-	__perf_event_mark_enabled(event, ctx);
+	__perf_event_mark_enabled(event);
 
 	if (!event_filter_match(event)) {
 		if (is_cgroup_event(event))
@@ -1782,7 +1786,7 @@ void perf_event_enable(struct perf_event *event)
 
 retry:
 	if (!ctx->is_active) {
-		__perf_event_mark_enabled(event, ctx);
+		__perf_event_mark_enabled(event);
 		goto out;
 	}
 
@@ -1809,6 +1813,7 @@ retry:
 out:
 	raw_spin_unlock_irq(&ctx->lock);
 }
+EXPORT_SYMBOL_GPL(perf_event_enable);
 
 int perf_event_refresh(struct perf_event *event, int refresh)
 {
@@ -2327,6 +2332,9 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
 	u64 interrupts, now;
 	s64 delta;
 
+	if (!ctx->nr_freq)
+		return;
+
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
 			continue;
@@ -2382,12 +2390,14 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
 	u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
 	struct perf_event_context *ctx = NULL;
-	int rotate = 0, remove = 1;
+	int rotate = 0, remove = 1, freq = 0;
 
 	if (cpuctx->ctx.nr_events) {
 		remove = 0;
 		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
 			rotate = 1;
+		if (cpuctx->ctx.nr_freq)
+			freq = 1;
 	}
 
 	ctx = cpuctx->task_ctx;
@@ -2395,33 +2405,40 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 		remove = 0;
 		if (ctx->nr_events != ctx->nr_active)
 			rotate = 1;
+		if (ctx->nr_freq)
+			freq = 1;
 	}
 
+	if (!rotate && !freq)
+		goto done;
+
 	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 	perf_pmu_disable(cpuctx->ctx.pmu);
-	perf_ctx_adjust_freq(&cpuctx->ctx, interval);
-	if (ctx)
-		perf_ctx_adjust_freq(ctx, interval);
 
-	if (!rotate)
-		goto done;
+	if (freq) {
+		perf_ctx_adjust_freq(&cpuctx->ctx, interval);
+		if (ctx)
+			perf_ctx_adjust_freq(ctx, interval);
+	}
 
-	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
-	if (ctx)
-		ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+	if (rotate) {
+		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+		if (ctx)
+			ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
 
-	rotate_ctx(&cpuctx->ctx);
-	if (ctx)
-		rotate_ctx(ctx);
+		rotate_ctx(&cpuctx->ctx);
+		if (ctx)
+			rotate_ctx(ctx);
 
-	perf_event_sched_in(cpuctx, ctx, current);
+		perf_event_sched_in(cpuctx, ctx, current);
+	}
+
+	perf_pmu_enable(cpuctx->ctx.pmu);
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 
 done:
 	if (remove)
 		list_del_init(&cpuctx->rotation_list);
-
-	perf_pmu_enable(cpuctx->ctx.pmu);
-	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
 void perf_event_task_tick(void)
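The reworked perf_rotate_context() is gated by two cheap tests: the nr_freq counters maintained in event_sched_in()/event_sched_out() earlier in this patch, and the active-versus-total event comparison. When neither rotation nor frequency adjustment is pending, the tick now skips the lock/pmu-disable section entirely. A simplified sketch of that control flow (names and structure are illustrative, not the kernel's):

#include <stdio.h>

struct ctx {
	int nr_events;
	int nr_active;
	int nr_freq;	/* events using attr.freq sampling */
};

static void tick(struct ctx *c)
{
	int rotate = c->nr_events && (c->nr_events != c->nr_active);
	int freq = c->nr_freq != 0;

	if (!rotate && !freq) {
		printf("tick: nothing to do, lock not taken\n");
		return;
	}

	/* lock(); pmu_disable(); */
	if (freq)
		printf("tick: adjust sampling periods\n");
	if (rotate)
		printf("tick: rotate flexible events\n");
	/* pmu_enable(); unlock(); */
}

int main(void)
{
	struct ctx idle = { 0, 0, 0 };
	struct ctx busy = { 4, 2, 1 };

	tick(&idle);	/* common case: returns immediately */
	tick(&busy);	/* does both kinds of work under one lock */
	return 0;
}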
@@ -2448,7 +2465,7 @@ static int event_enable_on_exec(struct perf_event *event,
 	if (event->state >= PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	__perf_event_mark_enabled(event, ctx);
+	__perf_event_mark_enabled(event);
 
 	return 1;
 }
@@ -2480,13 +2497,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 	raw_spin_lock(&ctx->lock);
 	task_ctx_sched_out(ctx);
 
-	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
-		ret = event_enable_on_exec(event, ctx);
-		if (ret)
-			enabled = 1;
-	}
-
-	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+	list_for_each_entry(event, &ctx->event_list, event_entry) {
 		ret = event_enable_on_exec(event, ctx);
 		if (ret)
 			enabled = 1;
@@ -2574,215 +2585,6 @@ static u64 perf_event_read(struct perf_event *event)
 }
 
 /*
- * Callchain support
- */
-
-struct callchain_cpus_entries {
-	struct rcu_head			rcu_head;
-	struct perf_callchain_entry	*cpu_entries[0];
-};
-
-static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
-static atomic_t nr_callchain_events;
-static DEFINE_MUTEX(callchain_mutex);
-struct callchain_cpus_entries *callchain_cpus_entries;
-
-
-__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
-				  struct pt_regs *regs)
-{
-}
-
-__weak void perf_callchain_user(struct perf_callchain_entry *entry,
-				struct pt_regs *regs)
-{
-}
-
-static void release_callchain_buffers_rcu(struct rcu_head *head)
-{
-	struct callchain_cpus_entries *entries;
-	int cpu;
-
-	entries = container_of(head, struct callchain_cpus_entries, rcu_head);
-
-	for_each_possible_cpu(cpu)
-		kfree(entries->cpu_entries[cpu]);
-
-	kfree(entries);
-}
-
-static void release_callchain_buffers(void)
-{
-	struct callchain_cpus_entries *entries;
-
-	entries = callchain_cpus_entries;
-	rcu_assign_pointer(callchain_cpus_entries, NULL);
-	call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
-}
-
-static int alloc_callchain_buffers(void)
-{
-	int cpu;
-	int size;
-	struct callchain_cpus_entries *entries;
-
-	/*
-	 * We can't use the percpu allocation API for data that can be
-	 * accessed from NMI. Use a temporary manual per cpu allocation
-	 * until that gets sorted out.
-	 */
-	size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
-
-	entries = kzalloc(size, GFP_KERNEL);
-	if (!entries)
-		return -ENOMEM;
-
-	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
-
-	for_each_possible_cpu(cpu) {
-		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
-							 cpu_to_node(cpu));
-		if (!entries->cpu_entries[cpu])
-			goto fail;
-	}
-
-	rcu_assign_pointer(callchain_cpus_entries, entries);
-
-	return 0;
-
-fail:
-	for_each_possible_cpu(cpu)
-		kfree(entries->cpu_entries[cpu]);
-	kfree(entries);
-
-	return -ENOMEM;
-}
-
-static int get_callchain_buffers(void)
-{
-	int err = 0;
-	int count;
-
-	mutex_lock(&callchain_mutex);
-
-	count = atomic_inc_return(&nr_callchain_events);
-	if (WARN_ON_ONCE(count < 1)) {
-		err = -EINVAL;
-		goto exit;
-	}
-
-	if (count > 1) {
-		/* If the allocation failed, give up */
-		if (!callchain_cpus_entries)
-			err = -ENOMEM;
-		goto exit;
-	}
-
-	err = alloc_callchain_buffers();
-	if (err)
-		release_callchain_buffers();
-exit:
-	mutex_unlock(&callchain_mutex);
-
-	return err;
-}
-
-static void put_callchain_buffers(void)
-{
-	if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
-		release_callchain_buffers();
-		mutex_unlock(&callchain_mutex);
-	}
-}
-
-static int get_recursion_context(int *recursion)
-{
-	int rctx;
-
-	if (in_nmi())
-		rctx = 3;
-	else if (in_irq())
-		rctx = 2;
-	else if (in_softirq())
-		rctx = 1;
-	else
-		rctx = 0;
-
-	if (recursion[rctx])
-		return -1;
-
-	recursion[rctx]++;
-	barrier();
-
-	return rctx;
-}
-
-static inline void put_recursion_context(int *recursion, int rctx)
-{
-	barrier();
-	recursion[rctx]--;
-}
-
-static struct perf_callchain_entry *get_callchain_entry(int *rctx)
-{
-	int cpu;
-	struct callchain_cpus_entries *entries;
-
-	*rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
-	if (*rctx == -1)
-		return NULL;
-
-	entries = rcu_dereference(callchain_cpus_entries);
-	if (!entries)
-		return NULL;
-
-	cpu = smp_processor_id();
-
-	return &entries->cpu_entries[cpu][*rctx];
-}
-
-static void
-put_callchain_entry(int rctx)
-{
-	put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
-}
-
-static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-	int rctx;
-	struct perf_callchain_entry *entry;
-
-
-	entry = get_callchain_entry(&rctx);
-	if (rctx == -1)
-		return NULL;
-
-	if (!entry)
-		goto exit_put;
-
-	entry->nr = 0;
-
-	if (!user_mode(regs)) {
-		perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
-		perf_callchain_kernel(entry, regs);
-		if (current->mm)
-			regs = task_pt_regs(current);
-		else
-			regs = NULL;
-	}
-
-	if (regs) {
-		perf_callchain_store(entry, PERF_CONTEXT_USER);
-		perf_callchain_user(entry, regs);
-	}
-
-exit_put:
-	put_callchain_entry(rctx);
-
-	return entry;
-}
-
-/*
  * Initialize the perf_event context in a task_struct:
  */
 static void __perf_event_init_context(struct perf_event_context *ctx)
@@ -2946,7 +2748,7 @@ static void free_event(struct perf_event *event)
 
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_dec(&perf_sched_events);
+			jump_label_dec_deferred(&perf_sched_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_dec(&nr_mmap_events);
 		if (event->attr.comm)
@@ -2957,7 +2759,7 @@ static void free_event(struct perf_event *event)
 		put_callchain_buffers();
 	if (is_cgroup_event(event)) {
 		atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
-		jump_label_dec(&perf_sched_events);
+		jump_label_dec_deferred(&perf_sched_events);
 	}
 }
 
@@ -4820,7 +4622,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
 	struct hw_perf_event *hwc = &event->hw;
 	int throttle = 0;
 
-	data->period = event->hw.last_period;
 	if (!overflow)
 		overflow = perf_swevent_set_period(event);
 
@@ -4854,6 +4655,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
 	if (!is_sampling_event(event))
 		return;
 
+	if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
+		data->period = nr;
+		return perf_swevent_overflow(event, 1, data, regs);
+	} else
+		data->period = event->hw.last_period;
+
 	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
 		return perf_swevent_overflow(event, 1, data, regs);
 
@@ -5366,7 +5173,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 	regs = get_irq_regs();
 
 	if (regs && !perf_exclude_event(event, regs)) {
-		if (!(event->attr.exclude_idle && current->pid == 0))
+		if (!(event->attr.exclude_idle && is_idle_task(current)))
 			if (perf_event_overflow(event, &data, regs))
 				ret = HRTIMER_NORESTART;
 	}
@@ -5981,7 +5788,7 @@ done:
 
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_inc(&perf_sched_events);
+			jump_label_inc(&perf_sched_events.key);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_inc(&nr_mmap_events);
 		if (event->attr.comm)
@@ -6219,7 +6026,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		 * - that may need work on context switch
 		 */
 		atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
-		jump_label_inc(&perf_sched_events);
+		jump_label_inc(&perf_sched_events.key);
 	}
 
 	/*
@@ -7065,6 +6872,9 @@ void __init perf_event_init(void)
 
 	ret = init_hw_breakpoint();
 	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
+
+	/* do not patch jump label more than once per second */
+	jump_label_rate_limit(&perf_sched_events, HZ);
 }
 
 static int __init perf_event_sysfs_init(void)
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 64568a699375..b0b107f90afc 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -1,6 +1,10 @@
 #ifndef _KERNEL_EVENTS_INTERNAL_H
 #define _KERNEL_EVENTS_INTERNAL_H
 
+#include <linux/hardirq.h>
+
+/* Buffer handling */
+
 #define RING_BUFFER_WRITABLE		0x01
 
 struct ring_buffer {
@@ -67,7 +71,7 @@ static inline int page_order(struct ring_buffer *rb)
 }
 #endif
 
-static unsigned long perf_data_size(struct ring_buffer *rb)
+static inline unsigned long perf_data_size(struct ring_buffer *rb)
 {
 	return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
 }
@@ -96,4 +100,37 @@ __output_copy(struct perf_output_handle *handle,
 	} while (len);
 }
 
+/* Callchain handling */
+extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+extern int get_callchain_buffers(void);
+extern void put_callchain_buffers(void);
+
+static inline int get_recursion_context(int *recursion)
+{
+	int rctx;
+
+	if (in_nmi())
+		rctx = 3;
+	else if (in_irq())
+		rctx = 2;
+	else if (in_softirq())
+		rctx = 1;
+	else
+		rctx = 0;
+
+	if (recursion[rctx])
+		return -1;
+
+	recursion[rctx]++;
+	barrier();
+
+	return rctx;
+}
+
+static inline void put_recursion_context(int *recursion, int rctx)
+{
+	barrier();
+	recursion[rctx]--;
+}
+
 #endif /* _KERNEL_EVENTS_INTERNAL_H */
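get_recursion_context()/put_recursion_context() give each execution context that can interrupt the others (task, softirq, hardirq, NMI) its own buffer slot, and refuse reentry within the same context rather than blocking. A runnable userspace model of the same idea, with the in_nmi()/in_irq()/in_softirq() tests replaced by an explicit argument so it compiles as plain C:

#include <assert.h>
#include <stdio.h>

#define NR_CONTEXTS 4	/* task, softirq, hardirq, NMI */

static int recursion[NR_CONTEXTS];

static int get_recursion_context(int ctx)
{
	if (recursion[ctx])
		return -1;	/* already inside this context: refuse */
	recursion[ctx]++;
	return ctx;
}

static void put_recursion_context(int rctx)
{
	recursion[rctx]--;
}

int main(void)
{
	int a = get_recursion_context(0);	/* task context: granted */
	int b = get_recursion_context(0);	/* nested in task: refused */
	int c = get_recursion_context(3);	/* "NMI" interrupting: granted */

	assert(a == 0 && b == -1 && c == 3);
	put_recursion_context(c);
	put_recursion_context(a);
	printf("one buffer slot per context level\n");
	return 0;
}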
diff --git a/kernel/exit.c b/kernel/exit.c
index d0b7d988f873..d579a459309d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -121,9 +121,9 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
-		sig->utime = cputime_add(sig->utime, tsk->utime);
-		sig->stime = cputime_add(sig->stime, tsk->stime);
-		sig->gtime = cputime_add(sig->gtime, tsk->gtime);
+		sig->utime += tsk->utime;
+		sig->stime += tsk->stime;
+		sig->gtime += tsk->gtime;
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
 		sig->nvcsw += tsk->nvcsw;
@@ -1255,19 +1255,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		spin_lock_irq(&p->real_parent->sighand->siglock);
 		psig = p->real_parent->signal;
 		sig = p->signal;
-		psig->cutime =
-			cputime_add(psig->cutime,
-			cputime_add(tgutime,
-				    sig->cutime));
-		psig->cstime =
-			cputime_add(psig->cstime,
-			cputime_add(tgstime,
-				    sig->cstime));
-		psig->cgtime =
-			cputime_add(psig->cgtime,
-			cputime_add(p->gtime,
-			cputime_add(sig->gtime,
-				    sig->cgtime)));
+		psig->cutime += tgutime + sig->cutime;
+		psig->cstime += tgstime + sig->cstime;
+		psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
 		psig->cmin_flt +=
 			p->min_flt + sig->min_flt + sig->cmin_flt;
 		psig->cmaj_flt +=
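These hunks belong to a series that turns cputime_t into an ordinary integer type, so the cputime_add()/cputime_sub()/cputime_eq() accessor macros collapse into plain operators, as the nested cputime_add() chains above show most dramatically. A self-contained illustration of the before/after (the typedef and macro bodies here are userspace stand-ins, not the kernel's headers):

#include <stdio.h>

typedef unsigned long long cputime_t;

/* the old accessor style, reduced to wrapper macros */
#define cputime_add(a, b)	((a) + (b))
#define cputime_sub(a, b)	((a) - (b))
#define cputime_lt(a, b)	((a) < (b))

int main(void)
{
	cputime_t utime = 120, stime = 30;

	/* before: every operation goes through a macro */
	cputime_t old_style = cputime_add(utime, stime);
	/* after: the same computation as plain arithmetic */
	cputime_t new_style = utime + stime;

	printf("%llu == %llu\n", old_style, new_style);
	return 0;
}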
@@ -1540,8 +1530,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	}
 
 	/* dead body doesn't have much to contribute */
-	if (p->exit_state == EXIT_DEAD)
+	if (unlikely(p->exit_state == EXIT_DEAD)) {
+		/*
+		 * But do not ignore this task until the tracer does
+		 * wait_task_zombie()->do_notify_parent().
+		 */
+		if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
+			wo->notask_error = 0;
 		return 0;
+	}
 
 	/* slay zombie? */
 	if (p->exit_state == EXIT_ZOMBIE) {
diff --git a/kernel/fork.c b/kernel/fork.c
index da4a6a10d088..b058c5820ecd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1023,8 +1023,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
  */
 static void posix_cpu_timers_init(struct task_struct *tsk)
 {
-	tsk->cputime_expires.prof_exp = cputime_zero;
-	tsk->cputime_expires.virt_exp = cputime_zero;
+	tsk->cputime_expires.prof_exp = 0;
+	tsk->cputime_expires.virt_exp = 0;
 	tsk->cputime_expires.sched_exp = 0;
 	INIT_LIST_HEAD(&tsk->cpu_timers[0]);
 	INIT_LIST_HEAD(&tsk->cpu_timers[1]);
@@ -1132,14 +1132,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	init_sigpending(&p->pending);
 
-	p->utime = cputime_zero;
-	p->stime = cputime_zero;
-	p->gtime = cputime_zero;
-	p->utimescaled = cputime_zero;
-	p->stimescaled = cputime_zero;
+	p->utime = p->stime = p->gtime = 0;
+	p->utimescaled = p->stimescaled = 0;
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
-	p->prev_utime = cputime_zero;
-	p->prev_stime = cputime_zero;
+	p->prev_utime = p->prev_stime = 0;
 #endif
 #if defined(SPLIT_RSS_COUNTING)
 	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
diff --git a/kernel/futex.c b/kernel/futex.c
index ea87f4d2f455..1614be20173d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -314,17 +314,29 @@ again:
 #endif
 
 	lock_page(page_head);
+
+	/*
+	 * If page_head->mapping is NULL, then it cannot be a PageAnon
+	 * page; but it might be the ZERO_PAGE or in the gate area or
+	 * in a special mapping (all cases which we are happy to fail);
+	 * or it may have been a good file page when get_user_pages_fast
+	 * found it, but truncated or holepunched or subjected to
+	 * invalidate_complete_page2 before we got the page lock (also
+	 * cases which we are happy to fail). And we hold a reference,
+	 * so refcount care in invalidate_complete_page's remove_mapping
+	 * prevents drop_caches from setting mapping to NULL beneath us.
+	 *
+	 * The case we do have to guard against is when memory pressure made
+	 * shmem_writepage move it from filecache to swapcache beneath us:
+	 * an unlikely race, but we do need to retry for page_head->mapping.
+	 */
 	if (!page_head->mapping) {
+		int shmem_swizzled = PageSwapCache(page_head);
 		unlock_page(page_head);
 		put_page(page_head);
-		/*
-		 * ZERO_PAGE pages don't have a mapping. Avoid a busy loop
-		 * trying to find one. RW mapping would have COW'd (and thus
-		 * have a mapping) so this page is RO and won't ever change.
-		 */
-		if ((page_head == ZERO_PAGE(address)))
-			return -EFAULT;
-		goto again;
+		if (shmem_swizzled)
+			goto again;
+		return -EFAULT;
 	}
 
 	/*
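The shmem_swizzled check implements a retry-or-fail policy: a NULL ->mapping is normally a hard failure, but if the page turns up in the swap cache the race with shmem_writepage is assumed and the lookup is retried from the top. A toy userspace model of that decision (all types and names here are stand-ins for illustration only):

#include <stdio.h>

struct fake_page {
	void *mapping;
	int swapcache;	/* PageSwapCache() stand-in */
};

static int resolve(struct fake_page *p)
{
	int tries = 0;

again:
	tries++;
	if (!p->mapping) {
		int swizzled = p->swapcache;

		if (swizzled) {
			/* transient race: pretend the page stabilized */
			p->mapping = &p->mapping;
			goto again;
		}
		return -1;	/* genuinely unmapped, like -EFAULT */
	}
	printf("resolved after %d tries\n", tries);
	return 0;
}

int main(void)
{
	struct fake_page racy = { .mapping = NULL, .swapcache = 1 };
	struct fake_page bad = { .mapping = NULL, .swapcache = 0 };

	resolve(&racy);		/* retries once, then succeeds */
	return resolve(&bad) ? 1 : 0;	/* fails immediately */
}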
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 8b1748d0172c..2e48ec0c2e91 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -74,11 +74,17 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 
 	/*
 	 * Ensure the task is not frozen.
-	 * Also, when a freshly created task is scheduled once, changes
-	 * its state to TASK_UNINTERRUPTIBLE without having ever been
-	 * switched out once, it mustn't be checked.
+	 * Also, skip vfork and any other user process that freezer should skip.
 	 */
-	if (unlikely(t->flags & PF_FROZEN || !switch_count))
+	if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
+		return;
+
+	/*
+	 * When a freshly created task is scheduled once, changes its state to
+	 * TASK_UNINTERRUPTIBLE without having ever been switched out once, it
+	 * mustn't be checked.
+	 */
+	if (unlikely(!switch_count))
 		return;
 
 	if (switch_count != t->last_switch_count) {
diff --git a/kernel/itimer.c b/kernel/itimer.c
index d802883153da..22000c3db0dd 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
 
 	cval = it->expires;
 	cinterval = it->incr;
-	if (!cputime_eq(cval, cputime_zero)) {
+	if (cval) {
 		struct task_cputime cputime;
 		cputime_t t;
 
 		thread_group_cputimer(tsk, &cputime);
 		if (clock_id == CPUCLOCK_PROF)
-			t = cputime_add(cputime.utime, cputime.stime);
+			t = cputime.utime + cputime.stime;
 		else
 			/* CPUCLOCK_VIRT */
 			t = cputime.utime;
 
-		if (cputime_le(cval, t))
+		if (cval < t)
 			/* about to fire */
 			cval = cputime_one_jiffy;
 		else
-			cval = cputime_sub(cval, t);
+			cval = cval - t;
 	}
 
 	spin_unlock_irq(&tsk->sighand->siglock);
@@ -161,10 +161,9 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
 
 	cval = it->expires;
 	cinterval = it->incr;
-	if (!cputime_eq(cval, cputime_zero) ||
-	    !cputime_eq(nval, cputime_zero)) {
-		if (cputime_gt(nval, cputime_zero))
-			nval = cputime_add(nval, cputime_one_jiffy);
+	if (cval || nval) {
+		if (nval > 0)
+			nval += cputime_one_jiffy;
 		set_process_cpu_timer(tsk, clock_id, &nval, &cval);
 	}
 	it->expires = nval;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 66ff7109f697..30c3c7708132 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -72,15 +72,46 @@ void jump_label_inc(struct jump_label_key *key)
 	jump_label_unlock();
 }
 
-void jump_label_dec(struct jump_label_key *key)
+static void __jump_label_dec(struct jump_label_key *key,
+		unsigned long rate_limit, struct delayed_work *work)
 {
 	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
 		return;
 
-	jump_label_update(key, JUMP_LABEL_DISABLE);
+	if (rate_limit) {
+		atomic_inc(&key->enabled);
+		schedule_delayed_work(work, rate_limit);
+	} else
+		jump_label_update(key, JUMP_LABEL_DISABLE);
+
 	jump_label_unlock();
 }
 
+static void jump_label_update_timeout(struct work_struct *work)
+{
+	struct jump_label_key_deferred *key =
+		container_of(work, struct jump_label_key_deferred, work.work);
+	__jump_label_dec(&key->key, 0, NULL);
+}
+
+void jump_label_dec(struct jump_label_key *key)
+{
+	__jump_label_dec(key, 0, NULL);
+}
+
+void jump_label_dec_deferred(struct jump_label_key_deferred *key)
+{
+	__jump_label_dec(&key->key, key->timeout, &key->work);
+}
+
+
+void jump_label_rate_limit(struct jump_label_key_deferred *key,
+		unsigned long rl)
+{
+	key->timeout = rl;
+	INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
+}
+
 static int addr_conflict(struct jump_entry *entry, void *start, void *end)
 {
 	if (entry->code <= (unsigned long)end &&
@@ -111,7 +142,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
  * running code can override this to make the non-live update case
  * cheaper.
  */
-void __weak arch_jump_label_transform_static(struct jump_entry *entry,
+void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
 					     enum jump_label_type type)
 {
 	arch_jump_label_transform(entry, type);
@@ -217,8 +248,13 @@ void jump_label_apply_nops(struct module *mod)
 	if (iter_start == iter_stop)
 		return;
 
-	for (iter = iter_start; iter < iter_stop; iter++)
-		arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
+	for (iter = iter_start; iter < iter_stop; iter++) {
+		struct jump_label_key *iterk;
+
+		iterk = (struct jump_label_key *)(unsigned long)iter->key;
+		arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
+				JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
+	}
 }
 
 static int jump_label_add_module(struct module *mod)
@@ -258,8 +294,7 @@ static int jump_label_add_module(struct module *mod)
 		key->next = jlm;
 
 		if (jump_label_enabled(key))
-			__jump_label_update(key, iter, iter_stop,
-					    JUMP_LABEL_ENABLE);
+			__jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
 	}
 
 	return 0;
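__jump_label_dec() with a rate limit avoids patching code on every 0→1→0 transition: when the count would hit zero, the key is re-elevated and the real disable is pushed to delayed work, so rapid enable/disable cycles (as perf's per-second limit above uses) patch at most once per period. A compact userspace model of that behaviour (names are illustrative; schedule_delayed_work() is reduced to a flag):

#include <stdio.h>

struct deferred_key {
	int enabled;		/* reference count */
	int timeout;		/* rate limit; 0 = patch immediately */
	int work_pending;	/* stands in for schedule_delayed_work() */
};

static void patch_code(int enable)
{
	printf("expensive patch: %s\n", enable ? "enable" : "disable");
}

static void key_inc(struct deferred_key *k)
{
	if (k->enabled++ == 0)
		patch_code(1);
}

static void key_dec(struct deferred_key *k, int rate_limit)
{
	if (--k->enabled > 0)
		return;
	if (rate_limit) {
		k->enabled++;		/* keep the key alive ... */
		k->work_pending = 1;	/* ... and re-check later */
	} else {
		patch_code(0);
	}
}

int main(void)
{
	struct deferred_key k = { 0, 1, 0 };

	/* a rapid enable/disable cycle only queues deferred work */
	key_inc(&k);
	key_dec(&k, k.timeout);

	/* the delayed work later runs with rate_limit == 0 and really disables */
	if (k.work_pending)
		key_dec(&k, 0);
	return 0;
}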
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b2e08c932d91..8889f7dd7c46 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -431,6 +431,7 @@ unsigned int max_lockdep_depth;
  * about it later on, in lockdep_info().
  */
 static int lockdep_init_error;
+static const char *lock_init_error;
 static unsigned long lockdep_init_trace_data[20];
 static struct stack_trace lockdep_init_trace = {
 	.max_entries = ARRAY_SIZE(lockdep_init_trace_data),
@@ -499,36 +500,32 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
 	usage[i] = '\0';
 }
 
-static int __print_lock_name(struct lock_class *class)
+static void __print_lock_name(struct lock_class *class)
 {
 	char str[KSYM_NAME_LEN];
 	const char *name;
 
 	name = class->name;
-	if (!name)
-		name = __get_key_name(class->key, str);
-
-	return printk("%s", name);
-}
-
-static void print_lock_name(struct lock_class *class)
-{
-	char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
-	const char *name;
-
-	get_usage_chars(class, usage);
-
-	name = class->name;
 	if (!name) {
 		name = __get_key_name(class->key, str);
-		printk(" (%s", name);
+		printk("%s", name);
 	} else {
-		printk(" (%s", name);
+		printk("%s", name);
 		if (class->name_version > 1)
 			printk("#%d", class->name_version);
 		if (class->subclass)
 			printk("/%d", class->subclass);
 	}
+}
+
+static void print_lock_name(struct lock_class *class)
+{
+	char usage[LOCK_USAGE_CHARS];
+
+	get_usage_chars(class, usage);
+
+	printk(" (");
+	__print_lock_name(class);
 	printk("){%s}", usage);
 }
 
@@ -568,11 +565,12 @@ static void lockdep_print_held_locks(struct task_struct *curr)
 	}
 }
 
-static void print_kernel_version(void)
+static void print_kernel_ident(void)
 {
-	printk("%s %.*s\n", init_utsname()->release,
+	printk("%s %.*s %s\n", init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
-		init_utsname()->version);
+		init_utsname()->version,
+		print_tainted());
 }
 
 static int very_verbose(struct lock_class *class)
@@ -656,6 +654,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	if (unlikely(!lockdep_initialized)) {
 		lockdep_init();
 		lockdep_init_error = 1;
+		lock_init_error = lock->name;
 		save_stack_trace(&lockdep_init_trace);
 	}
 #endif
@@ -723,7 +722,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 
 	class = look_up_lock_class(lock, subclass);
 	if (likely(class))
-		return class;
+		goto out_set_class_cache;
 
 	/*
 	 * Debug-check: all keys must be persistent!
@@ -808,6 +807,7 @@ out_unlock_set:
 	graph_unlock();
 	raw_local_irq_restore(flags);
 
+out_set_class_cache:
 	if (!subclass || force)
 		lock->class_cache[0] = class;
 	else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
@@ -1149,7 +1149,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
 	printk("\n");
 	printk("======================================================\n");
 	printk("[ INFO: possible circular locking dependency detected ]\n");
-	print_kernel_version();
+	print_kernel_ident();
 	printk("-------------------------------------------------------\n");
 	printk("%s/%d is trying to acquire lock:\n",
 		curr->comm, task_pid_nr(curr));
@@ -1488,7 +1488,7 @@ print_bad_irq_dependency(struct task_struct *curr,
 	printk("======================================================\n");
 	printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
 		irqclass, irqclass);
-	print_kernel_version();
+	print_kernel_ident();
 	printk("------------------------------------------------------\n");
 	printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
 		curr->comm, task_pid_nr(curr),
@@ -1717,7 +1717,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
 	printk("\n");
 	printk("=============================================\n");
 	printk("[ INFO: possible recursive locking detected ]\n");
-	print_kernel_version();
+	print_kernel_ident();
 	printk("---------------------------------------------\n");
 	printk("%s/%d is trying to acquire lock:\n",
 		curr->comm, task_pid_nr(curr));
@@ -2224,7 +2224,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
 	printk("\n");
 	printk("=================================\n");
 	printk("[ INFO: inconsistent lock state ]\n");
-	print_kernel_version();
+	print_kernel_ident();
 	printk("---------------------------------\n");
 
 	printk("inconsistent {%s} -> {%s} usage.\n",
@@ -2289,7 +2289,7 @@ print_irq_inversion_bug(struct task_struct *curr,
 	printk("\n");
 	printk("=========================================================\n");
 	printk("[ INFO: possible irq lock inversion dependency detected ]\n");
-	print_kernel_version();
+	print_kernel_ident();
 	printk("---------------------------------------------------------\n");
 	printk("%s/%d just changed the state of lock:\n",
 		curr->comm, task_pid_nr(curr));
@@ -3175,6 +3175,7 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
 	printk("\n");
 	printk("=====================================\n");
 	printk("[ BUG: bad unlock balance detected! ]\n");
+	print_kernel_ident();
 	printk("-------------------------------------\n");
 	printk("%s/%d is trying to release lock (",
 		curr->comm, task_pid_nr(curr));
@@ -3619,6 +3620,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
 	printk("\n");
 	printk("=================================\n");
 	printk("[ BUG: bad contention detected! ]\n");
+	print_kernel_ident();
 	printk("---------------------------------\n");
 	printk("%s/%d is trying to contend lock (",
 		curr->comm, task_pid_nr(curr));
@@ -3974,7 +3976,8 @@ void __init lockdep_info(void)
 
 #ifdef CONFIG_DEBUG_LOCKDEP
 	if (lockdep_init_error) {
-		printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n");
+		printk("WARNING: lockdep init error! lock-%s was acquired "
+			"before lockdep_init\n", lock_init_error);
 		printk("Call stack leading to lockdep invocation was:\n");
 		print_stack_trace(&lockdep_init_trace, 0);
 	}
@@ -3993,6 +3996,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
 	printk("\n");
 	printk("=========================\n");
 	printk("[ BUG: held lock freed! ]\n");
+	print_kernel_ident();
 	printk("-------------------------\n");
 	printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
 		curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
@@ -4050,6 +4054,7 @@ static void print_held_locks_bug(struct task_struct *curr)
 	printk("\n");
 	printk("=====================================\n");
 	printk("[ BUG: lock held at task exit time! ]\n");
+	print_kernel_ident();
 	printk("-------------------------------------\n");
 	printk("%s/%d is exiting with locks still held!\n",
 		curr->comm, task_pid_nr(curr));
@@ -4147,6 +4152,7 @@ void lockdep_sys_exit(void)
 	printk("\n");
 	printk("================================================\n");
 	printk("[ BUG: lock held when returning to user space! ]\n");
+	print_kernel_ident();
 	printk("------------------------------------------------\n");
 	printk("%s/%d is leaving the kernel with locks still held!\n",
 		curr->comm, curr->pid);
@@ -4166,10 +4172,33 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
 	printk("\n");
 	printk("===============================\n");
 	printk("[ INFO: suspicious RCU usage. ]\n");
+	print_kernel_ident();
 	printk("-------------------------------\n");
 	printk("%s:%d %s!\n", file, line, s);
 	printk("\nother info that might help us debug this:\n\n");
 	printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
+
+	/*
+	 * If a CPU is in the RCU-free window in idle (i.e. in the section
+	 * between rcu_idle_enter() and rcu_idle_exit()), then RCU
+	 * considers that CPU to be in an "extended quiescent state",
+	 * which means that RCU will be completely ignoring that CPU.
+	 * Therefore, rcu_read_lock() and friends have absolutely no
+	 * effect on a CPU running in that state. In other words, even if
+	 * such an RCU-idle CPU has called rcu_read_lock(), RCU might well
+	 * delete data structures out from under it. RCU really has no
+	 * choice here: we need to keep an RCU-free window in idle where
+	 * the CPU may possibly enter into low power mode. This way we can
+	 * notice an extended quiescent state to other CPUs that started a grace
+	 * period. Otherwise we would delay any grace period as long as we run
+	 * in the idle task.
+	 *
+	 * So complain bitterly if someone does call rcu_read_lock(),
+	 * rcu_read_lock_bh() and so on from extended quiescent states.
+	 */
+	if (rcu_is_cpu_idle())
+		printk("RCU used illegally from extended quiescent state!\n");
+
 	lockdep_print_held_locks(curr);
 	printk("\nstack backtrace:\n");
 	dump_stack();
diff --git a/kernel/panic.c b/kernel/panic.c
index b26593604214..3458469eb7c3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -237,11 +237,20 @@ void add_taint(unsigned flag)
 	 * Can't trust the integrity of the kernel anymore.
 	 * We don't call directly debug_locks_off() because the issue
 	 * is not necessarily serious enough to set oops_in_progress to 1
-	 * Also we want to keep up lockdep for staging development and
-	 * post-warning case.
+	 * Also we want to keep up lockdep for staging/out-of-tree
+	 * development and post-warning case.
 	 */
-	if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off())
-		printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
+	switch (flag) {
+	case TAINT_CRAP:
+	case TAINT_OOT_MODULE:
+	case TAINT_WARN:
+	case TAINT_FIRMWARE_WORKAROUND:
+		break;
+
+	default:
+		if (__debug_locks_off())
+			printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
+	}
 
 	set_bit(flag, &tainted_mask);
 }
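The switch above whitelists the taint flags considered benign enough to keep lockdep running; any other taint still disables lock debugging. A small model of the policy (the enum values below are illustrative stand-ins, not the kernel's definitions):

#include <stdio.h>

enum taint { TAINT_CRAP, TAINT_OOT_MODULE, TAINT_WARN,
	     TAINT_FIRMWARE_WORKAROUND, TAINT_DIE };

static int debug_locks = 1;

static void add_taint(enum taint flag)
{
	switch (flag) {
	case TAINT_CRAP:
	case TAINT_OOT_MODULE:
	case TAINT_WARN:
	case TAINT_FIRMWARE_WORKAROUND:
		break;		/* benign: keep lock debugging alive */
	default:
		if (debug_locks) {
			debug_locks = 0;
			printf("Disabling lock debugging due to kernel taint\n");
		}
	}
}

int main(void)
{
	add_taint(TAINT_OOT_MODULE);	/* lockdep stays on */
	add_taint(TAINT_DIE);		/* lockdep goes off */
	printf("debug_locks = %d\n", debug_locks);
	return 0;
}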
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e7cb76dc18f5..125cb67daa21 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -78,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock,
 	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
 		return now.sched < then.sched;
 	} else {
-		return cputime_lt(now.cpu, then.cpu);
+		return now.cpu < then.cpu;
 	}
 }
 static inline void cpu_time_add(const clockid_t which_clock,
@@ -88,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock,
 	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
 		acc->sched += val.sched;
 	} else {
-		acc->cpu = cputime_add(acc->cpu, val.cpu);
+		acc->cpu += val.cpu;
 	}
 }
 static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
@@ -98,25 +98,12 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
 	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
 		a.sched -= b.sched;
 	} else {
-		a.cpu = cputime_sub(a.cpu, b.cpu);
+		a.cpu -= b.cpu;
 	}
 	return a;
 }
 
 /*
- * Divide and limit the result to res >= 1
- *
- * This is necessary to prevent signal delivery starvation, when the result of
- * the division would be rounded down to 0.
- */
-static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
-{
-	cputime_t res = cputime_div(time, div);
-
-	return max_t(cputime_t, res, 1);
-}
-
-/*
  * Update expiry time from increment, and increase overrun count,
  * given the current clock sample.
  */
@@ -148,28 +135,26 @@ static void bump_cpu_timer(struct k_itimer *timer,
 	} else {
 		cputime_t delta, incr;
 
-		if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu))
+		if (now.cpu < timer->it.cpu.expires.cpu)
 			return;
 		incr = timer->it.cpu.incr.cpu;
-		delta = cputime_sub(cputime_add(now.cpu, incr),
-				    timer->it.cpu.expires.cpu);
+		delta = now.cpu + incr - timer->it.cpu.expires.cpu;
 		/* Don't use (incr*2 < delta), incr*2 might overflow. */
-		for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
-			incr = cputime_add(incr, incr);
-		for (; i >= 0; incr = cputime_halve(incr), i--) {
-			if (cputime_lt(delta, incr))
+		for (i = 0; incr < delta - incr; i++)
+			incr += incr;
+		for (; i >= 0; incr = incr >> 1, i--) {
+			if (delta < incr)
 				continue;
-			timer->it.cpu.expires.cpu =
-				cputime_add(timer->it.cpu.expires.cpu, incr);
+			timer->it.cpu.expires.cpu += incr;
 			timer->it_overrun += 1 << i;
-			delta = cputime_sub(delta, incr);
+			delta -= incr;
 		}
 	}
 }
 
 static inline cputime_t prof_ticks(struct task_struct *p)
 {
-	return cputime_add(p->utime, p->stime);
+	return p->utime + p->stime;
 }
 static inline cputime_t virt_ticks(struct task_struct *p)
 {
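bump_cpu_timer() advances an expired periodic timer past the current time without a 64-bit division: the increment is doubled until it covers more than half the distance, then walked back down while the overrun is accumulated in powers of two, which is exactly the overflow-safe loop kept (now in operator form) in the hunk above. A runnable demonstration of the same arithmetic with made-up values:

#include <stdio.h>

int main(void)
{
	unsigned long long expires = 100, incr = 7, now = 1000;
	unsigned long long delta = now + incr - expires;
	int i, overrun = 0;

	/* double incr until it covers more than half of delta */
	for (i = 0; incr < delta - incr; i++)
		incr += incr;
	/* walk back down, consuming delta in large-to-small chunks */
	for (; i >= 0; incr >>= 1, i--) {
		if (delta < incr)
			continue;
		expires += incr;
		overrun += 1 << i;
		delta -= incr;
	}
	/* prints expires=1003 overrun=129, i.e. 129 whole periods of 7 */
	printf("expires=%llu overrun=%d\n", expires, overrun);
	return 0;
}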
@@ -248,8 +233,8 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 
 	t = tsk;
 	do {
-		times->utime = cputime_add(times->utime, t->utime);
-		times->stime = cputime_add(times->stime, t->stime);
+		times->utime += t->utime;
+		times->stime += t->stime;
 		times->sum_exec_runtime += task_sched_runtime(t);
 	} while_each_thread(tsk, t);
 out:
@@ -258,10 +243,10 @@ out:
 
 static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
 {
-	if (cputime_gt(b->utime, a->utime))
+	if (b->utime > a->utime)
 		a->utime = b->utime;
 
-	if (cputime_gt(b->stime, a->stime))
+	if (b->stime > a->stime)
 		a->stime = b->stime;
 
 	if (b->sum_exec_runtime > a->sum_exec_runtime)
@@ -306,7 +291,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 		return -EINVAL;
 	case CPUCLOCK_PROF:
 		thread_group_cputime(p, &cputime);
-		cpu->cpu = cputime_add(cputime.utime, cputime.stime);
+		cpu->cpu = cputime.utime + cputime.stime;
 		break;
 	case CPUCLOCK_VIRT:
 		thread_group_cputime(p, &cputime);
@@ -470,26 +455,24 @@ static void cleanup_timers(struct list_head *head,
 		  unsigned long long sum_exec_runtime)
 {
 	struct cpu_timer_list *timer, *next;
-	cputime_t ptime = cputime_add(utime, stime);
+	cputime_t ptime = utime + stime;
 
 	list_for_each_entry_safe(timer, next, head, entry) {
 		list_del_init(&timer->entry);
-		if (cputime_lt(timer->expires.cpu, ptime)) {
-			timer->expires.cpu = cputime_zero;
+		if (timer->expires.cpu < ptime) {
+			timer->expires.cpu = 0;
 		} else {
-			timer->expires.cpu = cputime_sub(timer->expires.cpu,
-							 ptime);
+			timer->expires.cpu -= ptime;
 		}
 	}
 
 	++head;
 	list_for_each_entry_safe(timer, next, head, entry) {
 		list_del_init(&timer->entry);
-		if (cputime_lt(timer->expires.cpu, utime)) {
-			timer->expires.cpu = cputime_zero;
+		if (timer->expires.cpu < utime) {
+			timer->expires.cpu = 0;
 		} else {
-			timer->expires.cpu = cputime_sub(timer->expires.cpu,
-							 utime);
+			timer->expires.cpu -= utime;
 		}
 	}
 
@@ -520,8 +503,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
 	struct signal_struct *const sig = tsk->signal;
 
 	cleanup_timers(tsk->signal->cpu_timers,
-		       cputime_add(tsk->utime, sig->utime),
-		       cputime_add(tsk->stime, sig->stime),
+		       tsk->utime + sig->utime, tsk->stime + sig->stime,
 		       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
 }
 
@@ -540,8 +522,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
 
 static inline int expires_gt(cputime_t expires, cputime_t new_exp)
 {
-	return cputime_eq(expires, cputime_zero) ||
-	       cputime_gt(expires, new_exp);
+	return expires == 0 || expires > new_exp;
 }
 
 /*
@@ -651,7 +632,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
 	default:
 		return -EINVAL;
 	case CPUCLOCK_PROF:
-		cpu->cpu = cputime_add(cputime.utime, cputime.stime);
+		cpu->cpu = cputime.utime + cputime.stime;
 		break;
 	case CPUCLOCK_VIRT:
 		cpu->cpu = cputime.utime;
@@ -918,12 +899,12 @@ static void check_thread_timers(struct task_struct *tsk,
 	unsigned long soft;
 
 	maxfire = 20;
-	tsk->cputime_expires.prof_exp = cputime_zero;
+	tsk->cputime_expires.prof_exp = 0;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
-		if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
+		if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
 			tsk->cputime_expires.prof_exp = t->expires.cpu;
 			break;
 		}
@@ -933,12 +914,12 @@ static void check_thread_timers(struct task_struct *tsk,
 
 	++timers;
 	maxfire = 20;
-	tsk->cputime_expires.virt_exp = cputime_zero;
+	tsk->cputime_expires.virt_exp = 0;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
-		if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
+		if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
 			tsk->cputime_expires.virt_exp = t->expires.cpu;
 			break;
 		}
@@ -1009,20 +990,19 @@ static u32 onecputick;
 static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
 			     cputime_t *expires, cputime_t cur_time, int signo)
 {
-	if (cputime_eq(it->expires, cputime_zero))
+	if (!it->expires)
 		return;
 
-	if (cputime_ge(cur_time, it->expires)) {
-		if (!cputime_eq(it->incr, cputime_zero)) {
-			it->expires = cputime_add(it->expires, it->incr);
+	if (cur_time >= it->expires) {
+		if (it->incr) {
+			it->expires += it->incr;
 			it->error += it->incr_error;
 			if (it->error >= onecputick) {
-				it->expires = cputime_sub(it->expires,
-							  cputime_one_jiffy);
+				it->expires -= cputime_one_jiffy;
 				it->error -= onecputick;
 			}
 		} else {
-			it->expires = cputime_zero;
+			it->expires = 0;
 		}
 
 		trace_itimer_expire(signo == SIGPROF ?
@@ -1031,9 +1011,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
 		__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
 	}
 
-	if (!cputime_eq(it->expires, cputime_zero) &&
-	    (cputime_eq(*expires, cputime_zero) ||
-	     cputime_lt(it->expires, *expires))) {
+	if (it->expires && (!*expires || it->expires < *expires)) {
 		*expires = it->expires;
 	}
 }
@@ -1048,9 +1026,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
  */
 static inline int task_cputime_zero(const struct task_cputime *cputime)
{
-	if (cputime_eq(cputime->utime, cputime_zero) &&
-	    cputime_eq(cputime->stime, cputime_zero) &&
-	    cputime->sum_exec_runtime == 0)
+	if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
 		return 1;
 	return 0;
 }
@@ -1076,15 +1052,15 @@ static void check_process_timers(struct task_struct *tsk,
 	 */
 	thread_group_cputimer(tsk, &cputime);
1078 utime = cputime.utime; 1054 utime = cputime.utime;
1079 ptime = cputime_add(utime, cputime.stime); 1055 ptime = utime + cputime.stime;
1080 sum_sched_runtime = cputime.sum_exec_runtime; 1056 sum_sched_runtime = cputime.sum_exec_runtime;
1081 maxfire = 20; 1057 maxfire = 20;
1082 prof_expires = cputime_zero; 1058 prof_expires = 0;
1083 while (!list_empty(timers)) { 1059 while (!list_empty(timers)) {
1084 struct cpu_timer_list *tl = list_first_entry(timers, 1060 struct cpu_timer_list *tl = list_first_entry(timers,
1085 struct cpu_timer_list, 1061 struct cpu_timer_list,
1086 entry); 1062 entry);
1087 if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { 1063 if (!--maxfire || ptime < tl->expires.cpu) {
1088 prof_expires = tl->expires.cpu; 1064 prof_expires = tl->expires.cpu;
1089 break; 1065 break;
1090 } 1066 }
@@ -1094,12 +1070,12 @@ static void check_process_timers(struct task_struct *tsk,
1094 1070
1095 ++timers; 1071 ++timers;
1096 maxfire = 20; 1072 maxfire = 20;
1097 virt_expires = cputime_zero; 1073 virt_expires = 0;
1098 while (!list_empty(timers)) { 1074 while (!list_empty(timers)) {
1099 struct cpu_timer_list *tl = list_first_entry(timers, 1075 struct cpu_timer_list *tl = list_first_entry(timers,
1100 struct cpu_timer_list, 1076 struct cpu_timer_list,
1101 entry); 1077 entry);
1102 if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { 1078 if (!--maxfire || utime < tl->expires.cpu) {
1103 virt_expires = tl->expires.cpu; 1079 virt_expires = tl->expires.cpu;
1104 break; 1080 break;
1105 } 1081 }
@@ -1154,8 +1130,7 @@ static void check_process_timers(struct task_struct *tsk,
1154 } 1130 }
1155 } 1131 }
1156 x = secs_to_cputime(soft); 1132 x = secs_to_cputime(soft);
1157 if (cputime_eq(prof_expires, cputime_zero) || 1133 if (!prof_expires || x < prof_expires) {
1158 cputime_lt(x, prof_expires)) {
1159 prof_expires = x; 1134 prof_expires = x;
1160 } 1135 }
1161 } 1136 }
@@ -1249,12 +1224,9 @@ out:
1249static inline int task_cputime_expired(const struct task_cputime *sample, 1224static inline int task_cputime_expired(const struct task_cputime *sample,
1250 const struct task_cputime *expires) 1225 const struct task_cputime *expires)
1251{ 1226{
1252 if (!cputime_eq(expires->utime, cputime_zero) && 1227 if (expires->utime && sample->utime >= expires->utime)
1253 cputime_ge(sample->utime, expires->utime))
1254 return 1; 1228 return 1;
1255 if (!cputime_eq(expires->stime, cputime_zero) && 1229 if (expires->stime && sample->utime + sample->stime >= expires->stime)
1256 cputime_ge(cputime_add(sample->utime, sample->stime),
1257 expires->stime))
1258 return 1; 1230 return 1;
1259 if (expires->sum_exec_runtime != 0 && 1231 if (expires->sum_exec_runtime != 0 &&
1260 sample->sum_exec_runtime >= expires->sum_exec_runtime) 1232 sample->sum_exec_runtime >= expires->sum_exec_runtime)
@@ -1389,18 +1361,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1389 * it to be relative, *newval argument is relative and we update 1361 * it to be relative, *newval argument is relative and we update
1390 * it to be absolute. 1362 * it to be absolute.
1391 */ 1363 */
1392 if (!cputime_eq(*oldval, cputime_zero)) { 1364 if (*oldval) {
1393 if (cputime_le(*oldval, now.cpu)) { 1365 if (*oldval <= now.cpu) {
1394 /* Just about to fire. */ 1366 /* Just about to fire. */
1395 *oldval = cputime_one_jiffy; 1367 *oldval = cputime_one_jiffy;
1396 } else { 1368 } else {
1397 *oldval = cputime_sub(*oldval, now.cpu); 1369 *oldval -= now.cpu;
1398 } 1370 }
1399 } 1371 }
1400 1372
1401 if (cputime_eq(*newval, cputime_zero)) 1373 if (!*newval)
1402 return; 1374 return;
1403 *newval = cputime_add(*newval, now.cpu); 1375 *newval += now.cpu;
1404 } 1376 }
1405 1377
1406 /* 1378 /*
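
The posix-cpu-timers hunks above all follow one pattern: the old
cputime_add()/cputime_sub()/cputime_eq()/cputime_lt() accessors become plain C
arithmetic and comparisons, which is safe once cputime_t is an ordinary scalar
type on every architecture. A minimal userspace sketch of the equivalence
(this cputime_t and the helpers are illustrative stand-ins, not the kernel's
definitions):

#include <assert.h>

typedef unsigned long long cputime_t;

/* Old-style accessors, as removed by the patch. */
static inline cputime_t cputime_add(cputime_t a, cputime_t b) { return a + b; }
static inline int cputime_lt(cputime_t a, cputime_t b) { return a < b; }

int main(void)
{
	cputime_t utime = 30, stime = 12, expires = 40;

	/* The old spelling and the new plain arithmetic agree. */
	assert(cputime_add(utime, stime) == utime + stime);
	assert(cputime_lt(expires, utime + stime) == (expires < utime + stime));

	/* The clamp-to-zero idiom from cleanup_timers(): an expiry that
	 * predates the accumulated CPU time collapses to zero. */
	expires = (expires < utime + stime) ? 0 : expires - (utime + stime);
	assert(expires == 0);
	return 0;
}
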
diff --git a/kernel/printk.c b/kernel/printk.c
index 7982a0a841ea..989e4a52da76 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -199,7 +199,7 @@ void __init setup_log_buf(int early)
199 unsigned long mem; 199 unsigned long mem;
200 200
201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); 201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
202 if (mem == MEMBLOCK_ERROR) 202 if (!mem)
203 return; 203 return;
204 new_log_buf = __va(mem); 204 new_log_buf = __va(mem);
205 } else { 205 } else {
@@ -688,6 +688,7 @@ static void zap_locks(void)
688 688
689 oops_timestamp = jiffies; 689 oops_timestamp = jiffies;
690 690
691 debug_locks_off();
691 /* If a crash is occurring, make sure we can't deadlock */ 692 /* If a crash is occurring, make sure we can't deadlock */
692 raw_spin_lock_init(&logbuf_lock); 693 raw_spin_lock_init(&logbuf_lock);
693 /* And make sure that we print immediately */ 694 /* And make sure that we print immediately */
@@ -840,9 +841,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
840 boot_delay_msec(); 841 boot_delay_msec();
841 printk_delay(); 842 printk_delay();
842 843
843 preempt_disable();
844 /* This stops the holder of console_sem just where we want him */ 844 /* This stops the holder of console_sem just where we want him */
845 raw_local_irq_save(flags); 845 local_irq_save(flags);
846 this_cpu = smp_processor_id(); 846 this_cpu = smp_processor_id();
847 847
848 /* 848 /*
@@ -856,7 +856,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
856 * recursion and return - but flag the recursion so that 856 * recursion and return - but flag the recursion so that
857 * it can be printed at the next appropriate moment: 857 * it can be printed at the next appropriate moment:
858 */ 858 */
859 if (!oops_in_progress) { 859 if (!oops_in_progress && !lockdep_recursing(current)) {
860 recursion_bug = 1; 860 recursion_bug = 1;
861 goto out_restore_irqs; 861 goto out_restore_irqs;
862 } 862 }
@@ -962,9 +962,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
962 962
963 lockdep_on(); 963 lockdep_on();
964out_restore_irqs: 964out_restore_irqs:
965 raw_local_irq_restore(flags); 965 local_irq_restore(flags);
966 966
967 preempt_enable();
968 return printed_len; 967 return printed_len;
969} 968}
970EXPORT_SYMBOL(printk); 969EXPORT_SYMBOL(printk);
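
The vprintk() hunks drop the explicit preempt_disable()/preempt_enable() pair,
switch from the raw to the ordinary irq-save helpers, and let lockdep-initiated
prints through the recursion check. A rough userspace model of the per-CPU
recursion guard being adjusted (all names are illustrative stand-ins; the real
function also takes logbuf_lock and handles oopses):

#include <stdio.h>

static int printk_cpu = -1;	/* CPU currently inside vprintk, -1 if none */
static int recursion_bug;

static int vprintk_sketch(int this_cpu, int oops_in_progress, const char *msg)
{
	if (printk_cpu == this_cpu && !oops_in_progress) {
		/* Re-entered on the same CPU with no oops in flight:
		 * flag the recursion and bail out. */
		recursion_bug = 1;
		return 0;
	}
	printk_cpu = this_cpu;
	fputs(msg, stdout);
	printk_cpu = -1;
	return 1;
}

int main(void)
{
	return !vprintk_sketch(0, 0, "hello\n");
}
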
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 24d04477b257..78ab24a7b0e4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -96,9 +96,20 @@ void __ptrace_unlink(struct task_struct *child)
96 */ 96 */
97 if (!(child->flags & PF_EXITING) && 97 if (!(child->flags & PF_EXITING) &&
98 (child->signal->flags & SIGNAL_STOP_STOPPED || 98 (child->signal->flags & SIGNAL_STOP_STOPPED ||
99 child->signal->group_stop_count)) 99 child->signal->group_stop_count)) {
100 child->jobctl |= JOBCTL_STOP_PENDING; 100 child->jobctl |= JOBCTL_STOP_PENDING;
101 101
102 /*
103 * This is only possible if this thread was cloned by the
104 * traced task while the group stop was in effect; set the
105 * stop signal now so that future reports use it.
106 * FIXME: we should change ptrace_init_task() to handle this
107 * case.
108 */
109 if (!(child->jobctl & JOBCTL_STOP_SIGMASK))
110 child->jobctl |= SIGSTOP;
111 }
112
102 /* 113 /*
103 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick 114 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
104 * @child in the butt. Note that @resume should be used iff @child 115 * @child in the butt. Note that @resume should be used iff @child
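
The new branch relies on the ->jobctl layout in which the low bits
(JOBCTL_STOP_SIGMASK) carry the number of the pending stop signal and the
flag bits live above them, so ORing in SIGSTOP supplies a default stop signal
for the cloned thread. A sketch with illustrative bit values (the kernel's
actual mask definitions may differ):

#include <assert.h>
#include <signal.h>

#define JOBCTL_STOP_SIGMASK	0xffffUL	/* signr of the pending stop */
#define JOBCTL_STOP_PENDING	(1UL << 16)	/* group stop is pending */

int main(void)
{
	unsigned long jobctl = 0;

	jobctl |= JOBCTL_STOP_PENDING;
	if (!(jobctl & JOBCTL_STOP_SIGMASK))
		jobctl |= SIGSTOP;		/* default stop signal */

	assert(jobctl & JOBCTL_STOP_PENDING);
	assert((jobctl & JOBCTL_STOP_SIGMASK) == SIGSTOP);
	return 0;
}
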
diff --git a/kernel/rcu.h b/kernel/rcu.h
index f600868d550d..aa88baab5f78 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -30,6 +30,13 @@
30#endif /* #else #ifdef CONFIG_RCU_TRACE */ 30#endif /* #else #ifdef CONFIG_RCU_TRACE */
31 31
32/* 32/*
33 * Process-level increment to ->dynticks_nesting field. This allows for
34 * architectures that use half-interrupts and half-exceptions from
35 * process context.
36 */
37#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1)
38
39/*
33 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally 40 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
34 * by call_rcu() and rcu callback execution, and are therefore not part of the 41 * by call_rcu() and rcu callback execution, and are therefore not part of the
35 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. 42 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
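
DYNTICK_TASK_NESTING parks the nesting counter far above zero whenever a CPU
runs in process context, so interrupt-level decrements (including the
half-interrupt exits the comment alludes to) can never drive it down to zero,
the value RCU interprets as idle. A compilable sketch of the invariant:

#include <assert.h>
#include <limits.h>

#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1)

int main(void)
{
	long long nesting = DYNTICK_TASK_NESTING;	/* process context */

	nesting++;			/* irq entry */
	nesting--;			/* irq exit */
	assert(nesting == DYNTICK_TASK_NESTING);	/* still non-idle */

	nesting = 0;			/* rcu_idle_enter() crowbars to 0 */
	assert(nesting == 0);		/* RCU now sees the CPU as idle */
	return 0;
}
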
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c5b98e565aee..2bc4e135ff23 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -93,6 +93,8 @@ int rcu_read_lock_bh_held(void)
93{ 93{
94 if (!debug_lockdep_rcu_enabled()) 94 if (!debug_lockdep_rcu_enabled())
95 return 1; 95 return 1;
96 if (rcu_is_cpu_idle())
97 return 0;
96 return in_softirq() || irqs_disabled(); 98 return in_softirq() || irqs_disabled();
97} 99}
98EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 100EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
@@ -316,3 +318,13 @@ struct debug_obj_descr rcuhead_debug_descr = {
316}; 318};
317EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 319EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
318#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 320#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
321
322#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
323void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
324{
325 trace_rcu_torture_read(rcutorturename, rhp);
326}
327EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
328#else
329#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
330#endif
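
The rcu_read_lock_bh_held() change encodes a rule the rest of this series
depends on: RCU ignores idle CPUs entirely, so disabled BH on an idle CPU
confers no read-side protection. A stubbed-out userspace rendering of the new
check order (every predicate here is a stand-in for the kernel's):

#include <assert.h>

static int cpu_idle;	/* stand-in for the per-CPU dynticks state */

static int debug_lockdep_rcu_enabled(void) { return 1; }
static int rcu_is_cpu_idle(void)           { return cpu_idle; }
static int in_softirq(void)                { return 1; }
static int irqs_disabled(void)             { return 0; }

static int rcu_read_lock_bh_held(void)
{
	if (!debug_lockdep_rcu_enabled())
		return 1;
	if (rcu_is_cpu_idle())	/* new: idle CPUs are never protected */
		return 0;
	return in_softirq() || irqs_disabled();
}

int main(void)
{
	assert(rcu_read_lock_bh_held() == 1);	/* in softirq, not idle */
	cpu_idle = 1;
	assert(rcu_read_lock_bh_held() == 0);	/* idle trumps BH-off */
	return 0;
}
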
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 636af6d9c6e5..977296dca0a4 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -53,31 +53,137 @@ static void __call_rcu(struct rcu_head *head,
53 53
54#include "rcutiny_plugin.h" 54#include "rcutiny_plugin.h"
55 55
56#ifdef CONFIG_NO_HZ 56static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
57 57
58static long rcu_dynticks_nesting = 1; 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long oldval)
60{
61 if (rcu_dynticks_nesting) {
62 RCU_TRACE(trace_rcu_dyntick("--=",
63 oldval, rcu_dynticks_nesting));
64 return;
65 }
66 RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting));
67 if (!is_idle_task(current)) {
68 struct task_struct *idle = idle_task(smp_processor_id());
69
70 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
71 oldval, rcu_dynticks_nesting));
72 ftrace_dump(DUMP_ALL);
73 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
74 current->pid, current->comm,
75 idle->pid, idle->comm); /* must be idle task! */
76 }
77 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
78}
59 79
60/* 80/*
61 * Enter dynticks-idle mode, which is an extended quiescent state 81 * Enter idle, which is an extended quiescent state if we have fully
62 * if we have fully entered that mode (i.e., if the new value of 82 * entered that mode (i.e., if the new value of dynticks_nesting is zero).
63 * dynticks_nesting is zero).
64 */ 83 */
65void rcu_enter_nohz(void) 84void rcu_idle_enter(void)
66{ 85{
67 if (--rcu_dynticks_nesting == 0) 86 unsigned long flags;
68 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ 87 long long oldval;
88
89 local_irq_save(flags);
90 oldval = rcu_dynticks_nesting;
91 rcu_dynticks_nesting = 0;
92 rcu_idle_enter_common(oldval);
93 local_irq_restore(flags);
69} 94}
70 95
71/* 96/*
72 * Exit dynticks-idle mode, so that we are no longer in an extended 97 * Exit an interrupt handler towards idle.
73 * quiescent state.
74 */ 98 */
75void rcu_exit_nohz(void) 99void rcu_irq_exit(void)
100{
101 unsigned long flags;
102 long long oldval;
103
104 local_irq_save(flags);
105 oldval = rcu_dynticks_nesting;
106 rcu_dynticks_nesting--;
107 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
108 rcu_idle_enter_common(oldval);
109 local_irq_restore(flags);
110}
111
112/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
113static void rcu_idle_exit_common(long long oldval)
76{ 114{
115 if (oldval) {
116 RCU_TRACE(trace_rcu_dyntick("++=",
117 oldval, rcu_dynticks_nesting));
118 return;
119 }
120 RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting));
121 if (!is_idle_task(current)) {
122 struct task_struct *idle = idle_task(smp_processor_id());
123
124 RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task",
125 oldval, rcu_dynticks_nesting));
126 ftrace_dump(DUMP_ALL);
127 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
128 current->pid, current->comm,
129 idle->pid, idle->comm); /* must be idle task! */
130 }
131}
132
133/*
134 * Exit idle, so that we are no longer in an extended quiescent state.
135 */
136void rcu_idle_exit(void)
137{
138 unsigned long flags;
139 long long oldval;
140
141 local_irq_save(flags);
142 oldval = rcu_dynticks_nesting;
143 WARN_ON_ONCE(oldval != 0);
144 rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
145 rcu_idle_exit_common(oldval);
146 local_irq_restore(flags);
147}
148
149/*
150 * Enter an interrupt handler, moving away from idle.
151 */
152void rcu_irq_enter(void)
153{
154 unsigned long flags;
155 long long oldval;
156
157 local_irq_save(flags);
158 oldval = rcu_dynticks_nesting;
77 rcu_dynticks_nesting++; 159 rcu_dynticks_nesting++;
160 WARN_ON_ONCE(rcu_dynticks_nesting == 0);
161 rcu_idle_exit_common(oldval);
162 local_irq_restore(flags);
163}
164
165#ifdef CONFIG_PROVE_RCU
166
167/*
168 * Test whether RCU thinks that the current CPU is idle.
169 */
170int rcu_is_cpu_idle(void)
171{
172 return !rcu_dynticks_nesting;
78} 173}
174EXPORT_SYMBOL(rcu_is_cpu_idle);
175
176#endif /* #ifdef CONFIG_PROVE_RCU */
79 177
80#endif /* #ifdef CONFIG_NO_HZ */ 178/*
179 * Test whether the current CPU was interrupted from idle. Nested
180 * interrupts don't count; we must be running at the first interrupt
181 * level.
182 */
183int rcu_is_cpu_rrupt_from_idle(void)
184{
185 return rcu_dynticks_nesting <= 0;
186}
81 187
82/* 188/*
83 * Helper function for rcu_sched_qs() and rcu_bh_qs(). 189 * Helper function for rcu_sched_qs() and rcu_bh_qs().
@@ -126,14 +232,13 @@ void rcu_bh_qs(int cpu)
126 232
127/* 233/*
128 * Check to see if the scheduling-clock interrupt came from an extended 234 * Check to see if the scheduling-clock interrupt came from an extended
129 * quiescent state, and, if so, tell RCU about it. 235 * quiescent state, and, if so, tell RCU about it. This function must
236 * be called from hardirq context. It is normally called from the
237 * scheduling-clock interrupt.
130 */ 238 */
131void rcu_check_callbacks(int cpu, int user) 239void rcu_check_callbacks(int cpu, int user)
132{ 240{
133 if (user || 241 if (user || rcu_is_cpu_rrupt_from_idle())
134 (idle_cpu(cpu) &&
135 !in_softirq() &&
136 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
137 rcu_sched_qs(cpu); 242 rcu_sched_qs(cpu);
138 else if (!in_softirq()) 243 else if (!in_softirq())
139 rcu_bh_qs(cpu); 244 rcu_bh_qs(cpu);
@@ -154,7 +259,11 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
154 /* If no RCU callbacks ready to invoke, just return. */ 259 /* If no RCU callbacks ready to invoke, just return. */
155 if (&rcp->rcucblist == rcp->donetail) { 260 if (&rcp->rcucblist == rcp->donetail) {
156 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); 261 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
157 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0)); 262 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
263 ACCESS_ONCE(rcp->rcucblist),
264 need_resched(),
265 is_idle_task(current),
266 rcu_is_callbacks_kthread()));
158 return; 267 return;
159 } 268 }
160 269
@@ -183,7 +292,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
183 RCU_TRACE(cb_count++); 292 RCU_TRACE(cb_count++);
184 } 293 }
185 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 294 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count)); 295 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
296 is_idle_task(current),
297 rcu_is_callbacks_kthread()));
187} 298}
188 299
189static void rcu_process_callbacks(struct softirq_action *unused) 300static void rcu_process_callbacks(struct softirq_action *unused)
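
Taken together, the rcutiny.c hunks replace the CONFIG_NO_HZ-only enter/exit
pair with an unconditional nesting discipline: rcu_idle_enter() crowbars the
counter to zero, rcu_idle_exit() restores DYNTICK_TASK_NESTING, and the irq
hooks increment and decrement in between. A compilable userspace model of
that state machine (tracing and quiescent-state reporting omitted):

#include <assert.h>
#include <limits.h>

#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1)

static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING;

static void rcu_idle_enter(void) { rcu_dynticks_nesting = 0; }
static void rcu_idle_exit(void)  { rcu_dynticks_nesting = DYNTICK_TASK_NESTING; }
static void rcu_irq_enter(void)  { rcu_dynticks_nesting++; }
static void rcu_irq_exit(void)   { rcu_dynticks_nesting--; }
static int rcu_is_cpu_idle(void) { return !rcu_dynticks_nesting; }

int main(void)
{
	rcu_idle_enter();		/* idle loop entered */
	assert(rcu_is_cpu_idle());
	rcu_irq_enter();		/* timer tick arrives while idle */
	assert(!rcu_is_cpu_idle());
	rcu_irq_exit();			/* back to idle: a quiescent state */
	assert(rcu_is_cpu_idle());
	rcu_idle_exit();		/* idle loop exits to process context */
	assert(!rcu_is_cpu_idle());
	return 0;
}
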
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 2b0484a5dc28..9cb1ae4aabdd 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -312,8 +312,8 @@ static int rcu_boost(void)
312 rt_mutex_lock(&mtx); 312 rt_mutex_lock(&mtx);
313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
314 314
315 return rcu_preempt_ctrlblk.boost_tasks != NULL || 315 return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
316 rcu_preempt_ctrlblk.exp_tasks != NULL; 316 ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
317} 317}
318 318
319/* 319/*
@@ -885,6 +885,19 @@ static void invoke_rcu_callbacks(void)
885 wake_up(&rcu_kthread_wq); 885 wake_up(&rcu_kthread_wq);
886} 886}
887 887
888#ifdef CONFIG_RCU_TRACE
889
890/*
891 * Is the current CPU running the RCU-callbacks kthread?
892 * Caller must have preemption disabled.
893 */
894static bool rcu_is_callbacks_kthread(void)
895{
896 return rcu_kthread_task == current;
897}
898
899#endif /* #ifdef CONFIG_RCU_TRACE */
900
888/* 901/*
889 * This kthread invokes RCU callbacks whose grace periods have 902 * This kthread invokes RCU callbacks whose grace periods have
890 * elapsed. It is awakened as needed, and takes the place of the 903 * elapsed. It is awakened as needed, and takes the place of the
@@ -938,6 +951,18 @@ void invoke_rcu_callbacks(void)
938 raise_softirq(RCU_SOFTIRQ); 951 raise_softirq(RCU_SOFTIRQ);
939} 952}
940 953
954#ifdef CONFIG_RCU_TRACE
955
956/*
957 * There is no callback kthread, so this thread is never it.
958 */
959static bool rcu_is_callbacks_kthread(void)
960{
961 return false;
962}
963
964#endif /* #ifdef CONFIG_RCU_TRACE */
965
941void rcu_init(void) 966void rcu_init(void)
942{ 967{
943 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 968 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
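
The rcu_boost() change wraps both pointer reads in ACCESS_ONCE() so the
compiler emits exactly one load per field after the rt_mutex dance, instead
of refetching values that other CPUs may be rewriting. A compilable sketch of
the macro and its use (GCC typeof, as the kernel's definition uses; the
struct is a stand-in):

#include <stddef.h>

#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

struct ctrlblk { void *boost_tasks; void *exp_tasks; };
static struct ctrlblk rcu_preempt_ctrlblk;

static int rcu_boost_sketch(void)
{
	/* Each field is read exactly once; the compiler may not fuse
	 * the accesses or reload them later. */
	return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
	       ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
}

int main(void)
{
	return rcu_boost_sketch();
}
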
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 764825c2685c..88f17b8a3b1d 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,9 +61,11 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
67static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 69static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
68static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 70static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
69static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 71static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
@@ -91,6 +93,10 @@ module_param(fqs_holdoff, int, 0444);
91MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 93MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
92module_param(fqs_stutter, int, 0444); 94module_param(fqs_stutter, int, 0444);
93MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 95MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
96module_param(onoff_interval, int, 0444);
97MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
98module_param(shutdown_secs, int, 0444);
99MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
94module_param(test_boost, int, 0444); 100module_param(test_boost, int, 0444);
95MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 101MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
96module_param(test_boost_interval, int, 0444); 102module_param(test_boost_interval, int, 0444);
@@ -119,6 +125,10 @@ static struct task_struct *shuffler_task;
119static struct task_struct *stutter_task; 125static struct task_struct *stutter_task;
120static struct task_struct *fqs_task; 126static struct task_struct *fqs_task;
121static struct task_struct *boost_tasks[NR_CPUS]; 127static struct task_struct *boost_tasks[NR_CPUS];
128static struct task_struct *shutdown_task;
129#ifdef CONFIG_HOTPLUG_CPU
130static struct task_struct *onoff_task;
131#endif /* #ifdef CONFIG_HOTPLUG_CPU */
122 132
123#define RCU_TORTURE_PIPE_LEN 10 133#define RCU_TORTURE_PIPE_LEN 10
124 134
@@ -149,6 +159,10 @@ static long n_rcu_torture_boost_rterror;
149static long n_rcu_torture_boost_failure; 159static long n_rcu_torture_boost_failure;
150static long n_rcu_torture_boosts; 160static long n_rcu_torture_boosts;
151static long n_rcu_torture_timers; 161static long n_rcu_torture_timers;
162static long n_offline_attempts;
163static long n_offline_successes;
164static long n_online_attempts;
165static long n_online_successes;
152static struct list_head rcu_torture_removed; 166static struct list_head rcu_torture_removed;
153static cpumask_var_t shuffle_tmp_mask; 167static cpumask_var_t shuffle_tmp_mask;
154 168
@@ -160,6 +174,8 @@ static int stutter_pause_test;
160#define RCUTORTURE_RUNNABLE_INIT 0 174#define RCUTORTURE_RUNNABLE_INIT 0
161#endif 175#endif
162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 176int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
177module_param(rcutorture_runnable, int, 0444);
178MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
163 179
164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) 180#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
165#define rcu_can_boost() 1 181#define rcu_can_boost() 1
@@ -167,6 +183,7 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
167#define rcu_can_boost() 0 183#define rcu_can_boost() 0
168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 184#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169 185
186static unsigned long shutdown_time; /* jiffies to system shutdown. */
170static unsigned long boost_starttime; /* jiffies of next boost test start. */ 187static unsigned long boost_starttime; /* jiffies of next boost test start. */
171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 188DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
172 /* and boost task create/destroy. */ 189 /* and boost task create/destroy. */
@@ -182,6 +199,9 @@ static int fullstop = FULLSTOP_RMMOD;
182 */ 199 */
183static DEFINE_MUTEX(fullstop_mutex); 200static DEFINE_MUTEX(fullstop_mutex);
184 201
202/* Forward reference. */
203static void rcu_torture_cleanup(void);
204
185/* 205/*
186 * Detect and respond to a system shutdown. 206 * Detect and respond to a system shutdown.
187 */ 207 */
@@ -612,6 +632,30 @@ static struct rcu_torture_ops srcu_ops = {
612 .name = "srcu" 632 .name = "srcu"
613}; 633};
614 634
635static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
636{
637 return srcu_read_lock_raw(&srcu_ctl);
638}
639
640static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
641{
642 srcu_read_unlock_raw(&srcu_ctl, idx);
643}
644
645static struct rcu_torture_ops srcu_raw_ops = {
646 .init = srcu_torture_init,
647 .cleanup = srcu_torture_cleanup,
648 .readlock = srcu_torture_read_lock_raw,
649 .read_delay = srcu_read_delay,
650 .readunlock = srcu_torture_read_unlock_raw,
651 .completed = srcu_torture_completed,
652 .deferred_free = rcu_sync_torture_deferred_free,
653 .sync = srcu_torture_synchronize,
654 .cb_barrier = NULL,
655 .stats = srcu_torture_stats,
656 .name = "srcu_raw"
657};
658
615static void srcu_torture_synchronize_expedited(void) 659static void srcu_torture_synchronize_expedited(void)
616{ 660{
617 synchronize_srcu_expedited(&srcu_ctl); 661 synchronize_srcu_expedited(&srcu_ctl);
@@ -913,6 +957,18 @@ rcu_torture_fakewriter(void *arg)
913 return 0; 957 return 0;
914} 958}
915 959
960void rcutorture_trace_dump(void)
961{
962 static atomic_t beenhere = ATOMIC_INIT(0);
963
964 if (atomic_read(&beenhere))
965 return;
966 if (atomic_xchg(&beenhere, 1) != 0)
967 return;
968 do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
969 ftrace_dump(DUMP_ALL);
970}
971
916/* 972/*
917 * RCU torture reader from timer handler. Dereferences rcu_torture_current, 973 * RCU torture reader from timer handler. Dereferences rcu_torture_current,
918 * incrementing the corresponding element of the pipeline array. The 974 * incrementing the corresponding element of the pipeline array. The
@@ -934,6 +990,7 @@ static void rcu_torture_timer(unsigned long unused)
934 rcu_read_lock_bh_held() || 990 rcu_read_lock_bh_held() ||
935 rcu_read_lock_sched_held() || 991 rcu_read_lock_sched_held() ||
936 srcu_read_lock_held(&srcu_ctl)); 992 srcu_read_lock_held(&srcu_ctl));
993 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
937 if (p == NULL) { 994 if (p == NULL) {
938 /* Leave because rcu_torture_writer is not yet underway */ 995 /* Leave because rcu_torture_writer is not yet underway */
939 cur_ops->readunlock(idx); 996 cur_ops->readunlock(idx);
@@ -951,6 +1008,8 @@ static void rcu_torture_timer(unsigned long unused)
951 /* Should not happen, but... */ 1008 /* Should not happen, but... */
952 pipe_count = RCU_TORTURE_PIPE_LEN; 1009 pipe_count = RCU_TORTURE_PIPE_LEN;
953 } 1010 }
1011 if (pipe_count > 1)
1012 rcutorture_trace_dump();
954 __this_cpu_inc(rcu_torture_count[pipe_count]); 1013 __this_cpu_inc(rcu_torture_count[pipe_count]);
955 completed = cur_ops->completed() - completed; 1014 completed = cur_ops->completed() - completed;
956 if (completed > RCU_TORTURE_PIPE_LEN) { 1015 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -994,6 +1053,7 @@ rcu_torture_reader(void *arg)
994 rcu_read_lock_bh_held() || 1053 rcu_read_lock_bh_held() ||
995 rcu_read_lock_sched_held() || 1054 rcu_read_lock_sched_held() ||
996 srcu_read_lock_held(&srcu_ctl)); 1055 srcu_read_lock_held(&srcu_ctl));
1056 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
997 if (p == NULL) { 1057 if (p == NULL) {
998 /* Wait for rcu_torture_writer to get underway */ 1058 /* Wait for rcu_torture_writer to get underway */
999 cur_ops->readunlock(idx); 1059 cur_ops->readunlock(idx);
@@ -1009,6 +1069,8 @@ rcu_torture_reader(void *arg)
1009 /* Should not happen, but... */ 1069 /* Should not happen, but... */
1010 pipe_count = RCU_TORTURE_PIPE_LEN; 1070 pipe_count = RCU_TORTURE_PIPE_LEN;
1011 } 1071 }
1072 if (pipe_count > 1)
1073 rcutorture_trace_dump();
1012 __this_cpu_inc(rcu_torture_count[pipe_count]); 1074 __this_cpu_inc(rcu_torture_count[pipe_count]);
1013 completed = cur_ops->completed() - completed; 1075 completed = cur_ops->completed() - completed;
1014 if (completed > RCU_TORTURE_PIPE_LEN) { 1076 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -1056,7 +1118,8 @@ rcu_torture_printk(char *page)
1056 cnt += sprintf(&page[cnt], 1118 cnt += sprintf(&page[cnt],
1057 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1119 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1058 "rtmbe: %d rtbke: %ld rtbre: %ld " 1120 "rtmbe: %d rtbke: %ld rtbre: %ld "
1059 "rtbf: %ld rtb: %ld nt: %ld", 1121 "rtbf: %ld rtb: %ld nt: %ld "
1122 "onoff: %ld/%ld:%ld/%ld",
1060 rcu_torture_current, 1123 rcu_torture_current,
1061 rcu_torture_current_version, 1124 rcu_torture_current_version,
1062 list_empty(&rcu_torture_freelist), 1125 list_empty(&rcu_torture_freelist),
@@ -1068,7 +1131,11 @@ rcu_torture_printk(char *page)
1068 n_rcu_torture_boost_rterror, 1131 n_rcu_torture_boost_rterror,
1069 n_rcu_torture_boost_failure, 1132 n_rcu_torture_boost_failure,
1070 n_rcu_torture_boosts, 1133 n_rcu_torture_boosts,
1071 n_rcu_torture_timers); 1134 n_rcu_torture_timers,
1135 n_online_successes,
1136 n_online_attempts,
1137 n_offline_successes,
1138 n_offline_attempts);
1072 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1139 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1073 n_rcu_torture_boost_ktrerror != 0 || 1140 n_rcu_torture_boost_ktrerror != 0 ||
1074 n_rcu_torture_boost_rterror != 0 || 1141 n_rcu_torture_boost_rterror != 0 ||
@@ -1232,12 +1299,14 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1232 "shuffle_interval=%d stutter=%d irqreader=%d " 1299 "shuffle_interval=%d stutter=%d irqreader=%d "
1233 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1300 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1234 "test_boost=%d/%d test_boost_interval=%d " 1301 "test_boost=%d/%d test_boost_interval=%d "
1235 "test_boost_duration=%d\n", 1302 "test_boost_duration=%d shutdown_secs=%d "
1303 "onoff_interval=%d\n",
1236 torture_type, tag, nrealreaders, nfakewriters, 1304 torture_type, tag, nrealreaders, nfakewriters,
1237 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1305 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1238 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1306 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1239 test_boost, cur_ops->can_boost, 1307 test_boost, cur_ops->can_boost,
1240 test_boost_interval, test_boost_duration); 1308 test_boost_interval, test_boost_duration, shutdown_secs,
1309 onoff_interval);
1241} 1310}
1242 1311
1243static struct notifier_block rcutorture_shutdown_nb = { 1312static struct notifier_block rcutorture_shutdown_nb = {
@@ -1287,6 +1356,131 @@ static int rcutorture_booster_init(int cpu)
1287 return 0; 1356 return 0;
1288} 1357}
1289 1358
1359/*
1360 * Cause the rcutorture test to shut down the system after the test has
1361 * run for the time specified by the shutdown_secs module parameter.
1362 */
1363static int
1364rcu_torture_shutdown(void *arg)
1365{
1366 long delta;
1367 unsigned long jiffies_snap;
1368
1369 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
1370 jiffies_snap = ACCESS_ONCE(jiffies);
1371 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
1372 !kthread_should_stop()) {
1373 delta = shutdown_time - jiffies_snap;
1374 if (verbose)
1375 printk(KERN_ALERT "%s" TORTURE_FLAG
1376 "rcu_torture_shutdown task: %lu "
1377 "jiffies remaining\n",
1378 torture_type, delta);
1379 schedule_timeout_interruptible(delta);
1380 jiffies_snap = ACCESS_ONCE(jiffies);
1381 }
1382 if (kthread_should_stop()) {
1383 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
1384 return 0;
1385 }
1386
1387 /* OK, shut down the system. */
1388
1389 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
1390 shutdown_task = NULL; /* Avoid self-kill deadlock. */
1391 rcu_torture_cleanup(); /* Get the success/failure message. */
1392 kernel_power_off(); /* Shut down the system. */
1393 return 0;
1394}
1395
1396#ifdef CONFIG_HOTPLUG_CPU
1397
1398/*
1399 * Execute random CPU-hotplug operations at the interval specified
1400 * by the onoff_interval.
1401 */
1402static int
1403rcu_torture_onoff(void *arg)
1404{
1405 int cpu;
1406 int maxcpu = -1;
1407 DEFINE_RCU_RANDOM(rand);
1408
1409 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
1410 for_each_online_cpu(cpu)
1411 maxcpu = cpu;
1412 WARN_ON(maxcpu < 0);
1413 while (!kthread_should_stop()) {
1414 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1415 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
1416 if (verbose)
1417 printk(KERN_ALERT "%s" TORTURE_FLAG
1418 "rcu_torture_onoff task: offlining %d\n",
1419 torture_type, cpu);
1420 n_offline_attempts++;
1421 if (cpu_down(cpu) == 0) {
1422 if (verbose)
1423 printk(KERN_ALERT "%s" TORTURE_FLAG
1424 "rcu_torture_onoff task: "
1425 "offlined %d\n",
1426 torture_type, cpu);
1427 n_offline_successes++;
1428 }
1429 } else if (cpu_is_hotpluggable(cpu)) {
1430 if (verbose)
1431 printk(KERN_ALERT "%s" TORTURE_FLAG
1432 "rcu_torture_onoff task: onlining %d\n",
1433 torture_type, cpu);
1434 n_online_attempts++;
1435 if (cpu_up(cpu) == 0) {
1436 if (verbose)
1437 printk(KERN_ALERT "%s" TORTURE_FLAG
1438 "rcu_torture_onoff task: "
1439 "onlined %d\n",
1440 torture_type, cpu);
1441 n_online_successes++;
1442 }
1443 }
1444 schedule_timeout_interruptible(onoff_interval * HZ);
1445 }
1446 VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
1447 return 0;
1448}
1449
1450static int
1451rcu_torture_onoff_init(void)
1452{
1453 if (onoff_interval <= 0)
1454 return 0;
1455 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1456	if (IS_ERR(onoff_task)) {
1457		int ret = PTR_ERR(onoff_task);
1458		onoff_task = NULL; return ret;
1459	}
1460 return 0;
1461}
1462
1463static void rcu_torture_onoff_cleanup(void)
1464{
1465 if (onoff_task == NULL)
1466 return;
1467 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1468 kthread_stop(onoff_task);
1469}
1470
1471#else /* #ifdef CONFIG_HOTPLUG_CPU */
1472
1473static int
1474rcu_torture_onoff_init(void)
1475{
1476	return 0;
1477}
1478static void rcu_torture_onoff_cleanup(void)
1479{
1480}
1481
1482#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1483
1290static int rcutorture_cpu_notify(struct notifier_block *self, 1484static int rcutorture_cpu_notify(struct notifier_block *self,
1291 unsigned long action, void *hcpu) 1485 unsigned long action, void *hcpu)
1292{ 1486{
@@ -1391,6 +1585,11 @@ rcu_torture_cleanup(void)
1391 for_each_possible_cpu(i) 1585 for_each_possible_cpu(i)
1392 rcutorture_booster_cleanup(i); 1586 rcutorture_booster_cleanup(i);
1393 } 1587 }
1588 if (shutdown_task != NULL) {
1589 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1590 kthread_stop(shutdown_task);
1591 }
1592 rcu_torture_onoff_cleanup();
1394 1593
1395 /* Wait for all RCU callbacks to fire. */ 1594 /* Wait for all RCU callbacks to fire. */
1396 1595
@@ -1416,7 +1615,7 @@ rcu_torture_init(void)
1416 static struct rcu_torture_ops *torture_ops[] = 1615 static struct rcu_torture_ops *torture_ops[] =
1417 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1616 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1418 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1617 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1419 &srcu_ops, &srcu_expedited_ops, 1618 &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops,
1420 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1619 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1421 1620
1422 mutex_lock(&fullstop_mutex); 1621 mutex_lock(&fullstop_mutex);
@@ -1607,6 +1806,18 @@ rcu_torture_init(void)
1607 } 1806 }
1608 } 1807 }
1609 } 1808 }
1809 if (shutdown_secs > 0) {
1810 shutdown_time = jiffies + shutdown_secs * HZ;
1811 shutdown_task = kthread_run(rcu_torture_shutdown, NULL,
1812 "rcu_torture_shutdown");
1813 if (IS_ERR(shutdown_task)) {
1814 firsterr = PTR_ERR(shutdown_task);
1815 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
1816 shutdown_task = NULL;
1817 goto unwind;
1818 }
1819 }
1820 rcu_torture_onoff_init();
1610 register_reboot_notifier(&rcutorture_shutdown_nb); 1821 register_reboot_notifier(&rcutorture_shutdown_nb);
1611 rcutorture_record_test_transition(); 1822 rcutorture_record_test_transition();
1612 mutex_unlock(&fullstop_mutex); 1823 mutex_unlock(&fullstop_mutex);
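
The core of the new rcu_torture_onoff() loop: pick a random CPU, try to
toggle its state, and maintain the four counters that the stats line prints
as "onoff: on-succ/on-att:off-succ/off-att". A userspace sketch with
cpu_up()/cpu_down() stubbed out (no hotplug or kthread machinery):

#include <stdio.h>
#include <stdlib.h>

#define NCPUS 4
static int online[NCPUS] = { 1, 1, 1, 1 };
static long n_offline_attempts, n_offline_successes;
static long n_online_attempts, n_online_successes;

static int cpu_down(int cpu) { online[cpu] = 0; return 0; }	/* stub */
static int cpu_up(int cpu)   { online[cpu] = 1; return 0; }	/* stub */

int main(void)
{
	for (int i = 0; i < 100; i++) {
		int cpu = rand() % NCPUS;

		if (online[cpu]) {
			n_offline_attempts++;
			if (cpu_down(cpu) == 0)
				n_offline_successes++;
		} else {
			n_online_attempts++;
			if (cpu_up(cpu) == 0)
				n_online_successes++;
		}
	}
	printf("onoff: %ld/%ld:%ld/%ld\n",
	       n_online_successes, n_online_attempts,
	       n_offline_successes, n_offline_attempts);
	return 0;
}
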
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 6b76d812740c..6c4a6722abfd 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -69,7 +69,7 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
69 NUM_RCU_LVL_3, \ 69 NUM_RCU_LVL_3, \
70 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ 70 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
71 }, \ 71 }, \
72 .signaled = RCU_GP_IDLE, \ 72 .fqs_state = RCU_GP_IDLE, \
73 .gpnum = -300, \ 73 .gpnum = -300, \
74 .completed = -300, \ 74 .completed = -300, \
75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
@@ -195,12 +195,10 @@ void rcu_note_context_switch(int cpu)
195} 195}
196EXPORT_SYMBOL_GPL(rcu_note_context_switch); 196EXPORT_SYMBOL_GPL(rcu_note_context_switch);
197 197
198#ifdef CONFIG_NO_HZ
199DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 198DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
200 .dynticks_nesting = 1, 199 .dynticks_nesting = DYNTICK_TASK_NESTING,
201 .dynticks = ATOMIC_INIT(1), 200 .dynticks = ATOMIC_INIT(1),
202}; 201};
203#endif /* #ifdef CONFIG_NO_HZ */
204 202
205static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 203static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
206static int qhimark = 10000; /* If this many pending, ignore blimit. */ 204static int qhimark = 10000; /* If this many pending, ignore blimit. */
@@ -328,11 +326,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
328 return 1; 326 return 1;
329 } 327 }
330 328
331 /* If preemptible RCU, no point in sending reschedule IPI. */ 329 /*
332 if (rdp->preemptible) 330 * The CPU is online, so send it a reschedule IPI. This forces
333 return 0; 331 * it through the scheduler, and (inefficiently) also handles cases
334 332 * where idle loops fail to inform RCU about the CPU being idle.
335 /* The CPU is online, so send it a reschedule IPI. */ 333 */
336 if (rdp->cpu != smp_processor_id()) 334 if (rdp->cpu != smp_processor_id())
337 smp_send_reschedule(rdp->cpu); 335 smp_send_reschedule(rdp->cpu);
338 else 336 else
@@ -343,59 +341,181 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
343 341
344#endif /* #ifdef CONFIG_SMP */ 342#endif /* #ifdef CONFIG_SMP */
345 343
346#ifdef CONFIG_NO_HZ 344/*
345 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
346 *
347 * If the new value of the ->dynticks_nesting counter now is zero,
348 * we really have entered idle, and must do the appropriate accounting.
349 * The caller must have disabled interrupts.
350 */
351static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
352{
353 trace_rcu_dyntick("Start", oldval, 0);
354 if (!is_idle_task(current)) {
355 struct task_struct *idle = idle_task(smp_processor_id());
356
357 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
358 ftrace_dump(DUMP_ALL);
359 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
360 current->pid, current->comm,
361 idle->pid, idle->comm); /* must be idle task! */
362 }
363 rcu_prepare_for_idle(smp_processor_id());
364 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
365 smp_mb__before_atomic_inc(); /* See above. */
366 atomic_inc(&rdtp->dynticks);
367 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
368 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
369}
347 370
348/** 371/**
349 * rcu_enter_nohz - inform RCU that current CPU is entering nohz 372 * rcu_idle_enter - inform RCU that current CPU is entering idle
350 * 373 *
351 * Enter nohz mode, in other words, -leave- the mode in which RCU 374 * Enter idle mode, in other words, -leave- the mode in which RCU
352 * read-side critical sections can occur. (Though RCU read-side 375 * read-side critical sections can occur. (Though RCU read-side
353 * critical sections can occur in irq handlers in nohz mode, a possibility 376 * critical sections can occur in irq handlers in idle, a possibility
354 * handled by rcu_irq_enter() and rcu_irq_exit()). 377 * handled by irq_enter() and irq_exit().)
378 *
379 * We crowbar the ->dynticks_nesting field to zero to allow for
380 * the possibility of usermode upcalls having messed up our count
381 * of interrupt nesting level during the prior busy period.
355 */ 382 */
356void rcu_enter_nohz(void) 383void rcu_idle_enter(void)
357{ 384{
358 unsigned long flags; 385 unsigned long flags;
386 long long oldval;
359 struct rcu_dynticks *rdtp; 387 struct rcu_dynticks *rdtp;
360 388
361 local_irq_save(flags); 389 local_irq_save(flags);
362 rdtp = &__get_cpu_var(rcu_dynticks); 390 rdtp = &__get_cpu_var(rcu_dynticks);
363 if (--rdtp->dynticks_nesting) { 391 oldval = rdtp->dynticks_nesting;
364 local_irq_restore(flags); 392 rdtp->dynticks_nesting = 0;
365 return; 393 rcu_idle_enter_common(rdtp, oldval);
366 }
367 trace_rcu_dyntick("Start");
368 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
369 smp_mb__before_atomic_inc(); /* See above. */
370 atomic_inc(&rdtp->dynticks);
371 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
372 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
373 local_irq_restore(flags); 394 local_irq_restore(flags);
374} 395}
375 396
376/* 397/**
377 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz 398 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
399 *
400 * Exit from an interrupt handler, which might possibly result in entering
401 * idle mode, in other words, leaving the mode in which read-side critical
402 * sections can occur.
378 * 403 *
379 * Exit nohz mode, in other words, -enter- the mode in which RCU 404 * This code assumes that the idle loop never does anything that might
380 * read-side critical sections normally occur. 405 * result in unbalanced calls to irq_enter() and irq_exit(). If your
406 * architecture violates this assumption, RCU will give you what you
407 * deserve, good and hard. But very infrequently and irreproducibly.
408 *
409 * Use things like work queues to work around this limitation.
410 *
411 * You have been warned.
381 */ 412 */
382void rcu_exit_nohz(void) 413void rcu_irq_exit(void)
383{ 414{
384 unsigned long flags; 415 unsigned long flags;
416 long long oldval;
385 struct rcu_dynticks *rdtp; 417 struct rcu_dynticks *rdtp;
386 418
387 local_irq_save(flags); 419 local_irq_save(flags);
388 rdtp = &__get_cpu_var(rcu_dynticks); 420 rdtp = &__get_cpu_var(rcu_dynticks);
389 if (rdtp->dynticks_nesting++) { 421 oldval = rdtp->dynticks_nesting;
390 local_irq_restore(flags); 422 rdtp->dynticks_nesting--;
391 return; 423 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
392 } 424 if (rdtp->dynticks_nesting)
425 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
426 else
427 rcu_idle_enter_common(rdtp, oldval);
428 local_irq_restore(flags);
429}
430
431/*
432 * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle
433 *
434 * If the new value of the ->dynticks_nesting counter was previously zero,
435 * we really have exited idle, and must do the appropriate accounting.
436 * The caller must have disabled interrupts.
437 */
438static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
439{
393 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ 440 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
394 atomic_inc(&rdtp->dynticks); 441 atomic_inc(&rdtp->dynticks);
395 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 442 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
396 smp_mb__after_atomic_inc(); /* See above. */ 443 smp_mb__after_atomic_inc(); /* See above. */
397 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 444 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
398 trace_rcu_dyntick("End"); 445 rcu_cleanup_after_idle(smp_processor_id());
446 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
447 if (!is_idle_task(current)) {
448 struct task_struct *idle = idle_task(smp_processor_id());
449
450 trace_rcu_dyntick("Error on exit: not idle task",
451 oldval, rdtp->dynticks_nesting);
452 ftrace_dump(DUMP_ALL);
453 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
454 current->pid, current->comm,
455 idle->pid, idle->comm); /* must be idle task! */
456 }
457}
458
459/**
460 * rcu_idle_exit - inform RCU that current CPU is leaving idle
461 *
462 * Exit idle mode, in other words, -enter- the mode in which RCU
463 * read-side critical sections can occur.
464 *
465 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to
466 * allow for the possibility of usermode upcalls messing up our count
467 * of interrupt nesting level during the busy period that is just
468 * now starting.
469 */
470void rcu_idle_exit(void)
471{
472 unsigned long flags;
473 struct rcu_dynticks *rdtp;
474 long long oldval;
475
476 local_irq_save(flags);
477 rdtp = &__get_cpu_var(rcu_dynticks);
478 oldval = rdtp->dynticks_nesting;
479 WARN_ON_ONCE(oldval != 0);
480 rdtp->dynticks_nesting = DYNTICK_TASK_NESTING;
481 rcu_idle_exit_common(rdtp, oldval);
482 local_irq_restore(flags);
483}
484
485/**
486 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
487 *
488 * Enter an interrupt handler, which might possibly result in exiting
489 * idle mode, in other words, entering the mode in which read-side critical
490 * sections can occur.
491 *
492 * Note that the Linux kernel is fully capable of entering an interrupt
493 * handler that it never exits, for example when doing upcalls to
494 * user mode! This code assumes that the idle loop never does upcalls to
495 * user mode. If your architecture does do upcalls from the idle loop (or
496 * does anything else that results in unbalanced calls to the irq_enter()
497 * and irq_exit() functions), RCU will give you what you deserve, good
498 * and hard. But very infrequently and irreproducibly.
499 *
500 * Use things like work queues to work around this limitation.
501 *
502 * You have been warned.
503 */
504void rcu_irq_enter(void)
505{
506 unsigned long flags;
507 struct rcu_dynticks *rdtp;
508 long long oldval;
509
510 local_irq_save(flags);
511 rdtp = &__get_cpu_var(rcu_dynticks);
512 oldval = rdtp->dynticks_nesting;
513 rdtp->dynticks_nesting++;
514 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
515 if (oldval)
516 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
517 else
518 rcu_idle_exit_common(rdtp, oldval);
399 local_irq_restore(flags); 519 local_irq_restore(flags);
400} 520}
401 521
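
The WARN_ON_ONCE() checks in the common enter/exit helpers above document the
->dynticks protocol: the counter is incremented on every idle transition, so
an even value means the CPU is idle and an odd value means it is not. A C11
sketch of that parity scheme (the kernel pairs the increment with its own
smp_mb__* barriers rather than seq_cst atomics):

#include <assert.h>
#include <stdatomic.h>

static atomic_int dynticks = 1;		/* odd: CPU starts out non-idle */

static void idle_transition(void)
{
	/* One increment per enter-or-exit flips the parity. */
	atomic_fetch_add_explicit(&dynticks, 1, memory_order_seq_cst);
}

static int cpu_is_idle(void)
{
	return (atomic_load(&dynticks) & 0x1) == 0;
}

int main(void)
{
	assert(!cpu_is_idle());
	idle_transition();		/* enter idle: counter goes even */
	assert(cpu_is_idle());
	idle_transition();		/* exit idle: counter goes odd */
	assert(!cpu_is_idle());
	return 0;
}

Remote CPUs can sample this counter without interrupting the target, which is
what the force-quiescent-state path further down relies on.
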
@@ -442,27 +562,37 @@ void rcu_nmi_exit(void)
442 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 562 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
443} 563}
444 564
565#ifdef CONFIG_PROVE_RCU
566
445/** 567/**
446 * rcu_irq_enter - inform RCU of entry to hard irq context 568 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
447 * 569 *
448 * If the CPU was idle with dynamic ticks active, this updates the 570 * If the current CPU is in its idle loop and is neither in an interrupt
449 * rdtp->dynticks to let the RCU handling know that the CPU is active. 571 * nor in an NMI handler, return true.
450 */ 572 */
451void rcu_irq_enter(void) 573int rcu_is_cpu_idle(void)
452{ 574{
453 rcu_exit_nohz(); 575 int ret;
576
577 preempt_disable();
578 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
579 preempt_enable();
580 return ret;
454} 581}
582EXPORT_SYMBOL(rcu_is_cpu_idle);
583
584#endif /* #ifdef CONFIG_PROVE_RCU */
455 585
456/** 586/**
457 * rcu_irq_exit - inform RCU of exit from hard irq context 587 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
458 * 588 *
459 * If the CPU was idle with dynamic ticks active, update the rdp->dynticks 589 * If the current CPU is idle or running at a first-level (not nested)
460 * to put let the RCU handling be aware that the CPU is going back to idle 590 * interrupt from idle, return true. The caller must have at least
461 * with no ticks. 591 * disabled preemption.
462 */ 592 */
463void rcu_irq_exit(void) 593int rcu_is_cpu_rrupt_from_idle(void)
464{ 594{
465 rcu_enter_nohz(); 595 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
466} 596}
467 597
468#ifdef CONFIG_SMP 598#ifdef CONFIG_SMP
@@ -475,7 +605,7 @@ void rcu_irq_exit(void)
475static int dyntick_save_progress_counter(struct rcu_data *rdp) 605static int dyntick_save_progress_counter(struct rcu_data *rdp)
476{ 606{
477 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 607 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
478 return 0; 608 return (rdp->dynticks_snap & 0x1) == 0;
479} 609}
480 610
481/* 611/*
@@ -512,26 +642,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
512 642
513#endif /* #ifdef CONFIG_SMP */ 643#endif /* #ifdef CONFIG_SMP */
514 644
515#else /* #ifdef CONFIG_NO_HZ */
516
517#ifdef CONFIG_SMP
518
519static int dyntick_save_progress_counter(struct rcu_data *rdp)
520{
521 return 0;
522}
523
524static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
525{
526 return rcu_implicit_offline_qs(rdp);
527}
528
529#endif /* #ifdef CONFIG_SMP */
530
531#endif /* #else #ifdef CONFIG_NO_HZ */
532
533int rcu_cpu_stall_suppress __read_mostly;
534
535static void record_gp_stall_check_time(struct rcu_state *rsp) 645static void record_gp_stall_check_time(struct rcu_state *rsp)
536{ 646{
537 rsp->gp_start = jiffies; 647 rsp->gp_start = jiffies;
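
dyntick_save_progress_counter() now returns whether the sampled ->dynticks
value is even, so an already-idle CPU is credited with a quiescent state on
the spot; rcu_implicit_dynticks_qs() later compares against the snapshot to
detect a trip through idle. A simplified single-variable sketch (the real
code also copes with counter wrap and offline CPUs):

#include <assert.h>

static int remote_dynticks = 3;		/* odd: remote CPU is busy */
static int dynticks_snap;

static int save_progress_counter(void)
{
	dynticks_snap = remote_dynticks;
	return (dynticks_snap & 0x1) == 0;	/* idle right now? */
}

static int implicit_dynticks_qs(void)
{
	/* Counter moved, or is now even: the CPU passed through idle. */
	return remote_dynticks != dynticks_snap || !(remote_dynticks & 0x1);
}

int main(void)
{
	assert(!save_progress_counter());	/* busy at snapshot time */
	remote_dynticks++;			/* remote CPU enters idle */
	assert(implicit_dynticks_qs());
	return 0;
}
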
@@ -866,8 +976,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
866 /* Advance to a new grace period and initialize state. */ 976 /* Advance to a new grace period and initialize state. */
867 rsp->gpnum++; 977 rsp->gpnum++;
868 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); 978 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
869 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); 979 WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
870 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 980 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
871 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 981 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
872 record_gp_stall_check_time(rsp); 982 record_gp_stall_check_time(rsp);
873 983
@@ -877,7 +987,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
877 rnp->qsmask = rnp->qsmaskinit; 987 rnp->qsmask = rnp->qsmaskinit;
878 rnp->gpnum = rsp->gpnum; 988 rnp->gpnum = rsp->gpnum;
879 rnp->completed = rsp->completed; 989 rnp->completed = rsp->completed;
880 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 990 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
881 rcu_start_gp_per_cpu(rsp, rnp, rdp); 991 rcu_start_gp_per_cpu(rsp, rnp, rdp);
882 rcu_preempt_boost_start_gp(rnp); 992 rcu_preempt_boost_start_gp(rnp);
883 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 993 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
@@ -927,7 +1037,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
927 1037
928 rnp = rcu_get_root(rsp); 1038 rnp = rcu_get_root(rsp);
929 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1039 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
930 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 1040 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
931 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1041 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
932 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1042 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
933} 1043}
@@ -991,7 +1101,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
991 1101
992 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ 1102 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
993 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1103 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
994 rsp->signaled = RCU_GP_IDLE; 1104 rsp->fqs_state = RCU_GP_IDLE;
995 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 1105 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
996} 1106}
997 1107
@@ -1221,7 +1331,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1221 else 1331 else
1222 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1332 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1223 if (need_report & RCU_OFL_TASKS_EXP_GP) 1333 if (need_report & RCU_OFL_TASKS_EXP_GP)
1224 rcu_report_exp_rnp(rsp, rnp); 1334 rcu_report_exp_rnp(rsp, rnp, true);
1225 rcu_node_kthread_setaffinity(rnp, -1); 1335 rcu_node_kthread_setaffinity(rnp, -1);
1226} 1336}
1227 1337
@@ -1263,7 +1373,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1263 /* If no callbacks are ready, just return.*/ 1373 /* If no callbacks are ready, just return.*/
1264 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1374 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1265 trace_rcu_batch_start(rsp->name, 0, 0); 1375 trace_rcu_batch_start(rsp->name, 0, 0);
1266 trace_rcu_batch_end(rsp->name, 0); 1376 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
1377 need_resched(), is_idle_task(current),
1378 rcu_is_callbacks_kthread());
1267 return; 1379 return;
1268 } 1380 }
1269 1381
@@ -1291,12 +1403,17 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1291 debug_rcu_head_unqueue(list); 1403 debug_rcu_head_unqueue(list);
1292 __rcu_reclaim(rsp->name, list); 1404 __rcu_reclaim(rsp->name, list);
1293 list = next; 1405 list = next;
1294 if (++count >= bl) 1406 /* Stop only if limit reached and CPU has something to do. */
1407 if (++count >= bl &&
1408 (need_resched() ||
1409 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
1295 break; 1410 break;
1296 } 1411 }
1297 1412
1298 local_irq_save(flags); 1413 local_irq_save(flags);
1299 trace_rcu_batch_end(rsp->name, count); 1414 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
1415 is_idle_task(current),
1416 rcu_is_callbacks_kthread());
1300 1417
1301 /* Update count, and requeue any remaining callbacks. */ 1418 /* Update count, and requeue any remaining callbacks. */
1302 rdp->qlen -= count; 1419 rdp->qlen -= count;
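The reworked loop above stops at the batch limit bl only when the CPU has competing work; an idle CPU or the dedicated callbacks kthread keeps draining. A minimal userspace sketch of that termination rule, with the kernel predicates need_resched(), is_idle_task(), and rcu_is_callbacks_kthread() replaced by hypothetical stubs:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel predicates. */
static bool need_resched(void)         { return false; }
static bool running_idle_task(void)    { return true;  }
static bool is_callbacks_kthread(void) { return false; }

struct cb { struct cb *next; void (*func)(struct cb *); };

/* Invoke callbacks; honor the limit 'bl' only if the CPU has other work. */
static int do_batch(struct cb *list, int bl)
{
        int count = 0;

        while (list) {
                struct cb *next = list->next;

                list->func(list);
                list = next;
                /* Stop only if limit reached and CPU has something to do. */
                if (++count >= bl &&
                    (need_resched() ||
                     (!running_idle_task() && !is_callbacks_kthread())))
                        break;
        }
        return count;
}

static void noop(struct cb *c) { (void)c; }

int main(void)
{
        struct cb b = { NULL, noop }, a = { &b, noop };

        /* Idle CPU, limit 1: both callbacks still run -> prints 2. */
        printf("invoked %d callbacks\n", do_batch(&a, 1));
        return 0;
}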
@@ -1334,16 +1451,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1334 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 1451 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
1335 * Also schedule RCU core processing. 1452 * Also schedule RCU core processing.
1336 * 1453 *
1337 * This function must be called with hardirqs disabled. It is normally 1454 * This function must be called from hardirq context. It is normally
1338 * invoked from the scheduling-clock interrupt. If rcu_pending returns 1455 * invoked from the scheduling-clock interrupt. If rcu_pending returns
1339 * false, there is no point in invoking rcu_check_callbacks(). 1456 * false, there is no point in invoking rcu_check_callbacks().
1340 */ 1457 */
1341void rcu_check_callbacks(int cpu, int user) 1458void rcu_check_callbacks(int cpu, int user)
1342{ 1459{
1343 trace_rcu_utilization("Start scheduler-tick"); 1460 trace_rcu_utilization("Start scheduler-tick");
1344 if (user || 1461 if (user || rcu_is_cpu_rrupt_from_idle()) {
1345 (idle_cpu(cpu) && rcu_scheduler_active &&
1346 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
1347 1462
1348 /* 1463 /*
1349 * Get here if this CPU took its interrupt from user 1464 * Get here if this CPU took its interrupt from user
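The open-coded idle test is gone; rcu_is_cpu_rrupt_from_idle() folds it into a single dynticks-nesting comparison. A hedged model of that comparison — in the kernel the check is against the per-CPU rcu_dynticks structure, here a plain global stands in:

#include <stdbool.h>
#include <stdio.h>

/* Model: dynticks_nesting counts reasons for this CPU to be non-idle.
 * The irq-entry path bumps it, so a value of exactly 1 inside an
 * interrupt handler means the interrupt arrived from idle. */
static long long dynticks_nesting;

static bool rrupt_from_idle(void)
{
        return dynticks_nesting <= 1;
}

int main(void)
{
        dynticks_nesting = 1;                 /* irq taken from idle */
        printf("%d\n", rrupt_from_idle());    /* 1 */
        dynticks_nesting = (1LL << 40) + 1;   /* irq taken from a task */
        printf("%d\n", rrupt_from_idle());    /* 0 */
        return 0;
}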
@@ -1457,7 +1572,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1457 goto unlock_fqs_ret; /* no GP in progress, time updated. */ 1572 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1458 } 1573 }
1459 rsp->fqs_active = 1; 1574 rsp->fqs_active = 1;
1460 switch (rsp->signaled) { 1575 switch (rsp->fqs_state) {
1461 case RCU_GP_IDLE: 1576 case RCU_GP_IDLE:
1462 case RCU_GP_INIT: 1577 case RCU_GP_INIT:
1463 1578
@@ -1473,7 +1588,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1473 force_qs_rnp(rsp, dyntick_save_progress_counter); 1588 force_qs_rnp(rsp, dyntick_save_progress_counter);
1474 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1589 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1475 if (rcu_gp_in_progress(rsp)) 1590 if (rcu_gp_in_progress(rsp))
1476 rsp->signaled = RCU_FORCE_QS; 1591 rsp->fqs_state = RCU_FORCE_QS;
1477 break; 1592 break;
1478 1593
1479 case RCU_FORCE_QS: 1594 case RCU_FORCE_QS:
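Renaming signaled to fqs_state makes the switch read as what it is: a two-phase state machine. A compact, illustrative C model of the progression (the kernel's version also handles locking and retries):

#include <stdio.h>

enum fqs_state { RCU_GP_IDLE, RCU_GP_INIT, RCU_SAVE_DYNTICK, RCU_FORCE_QS };
#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK   /* unconditional without CONFIG_NO_HZ */

/* One force_quiescent_state() pass, reduced to the state transitions. */
static enum fqs_state force_qs_step(enum fqs_state s)
{
        switch (s) {
        case RCU_GP_IDLE:
        case RCU_GP_INIT:
                return s;                 /* no grace period to push along */
        case RCU_SAVE_DYNTICK:
                /* ...snapshot each CPU's dynticks counter here... */
                return RCU_FORCE_QS;
        case RCU_FORCE_QS:
                /* ...recheck holdout CPUs, send resched IPIs here... */
                return RCU_FORCE_QS;
        }
        return s;
}

int main(void)
{
        enum fqs_state s = RCU_SIGNAL_INIT;

        s = force_qs_step(s);             /* SAVE_DYNTICK -> FORCE_QS */
        printf("fqs_state=%d\n", s);      /* 3 */
        return 0;
}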
@@ -1812,7 +1927,7 @@ static int rcu_pending(int cpu)
1812 * by the current CPU, even if none need be done immediately, returning 1927 * by the current CPU, even if none need be done immediately, returning
1813 * 1 if so. 1928 * 1 if so.
1814 */ 1929 */
1815static int rcu_needs_cpu_quick_check(int cpu) 1930static int rcu_cpu_has_callbacks(int cpu)
1816{ 1931{
1817 /* RCU callbacks either ready or pending? */ 1932 /* RCU callbacks either ready or pending? */
1818 return per_cpu(rcu_sched_data, cpu).nxtlist || 1933 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1913,9 +2028,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1913 for (i = 0; i < RCU_NEXT_SIZE; i++) 2028 for (i = 0; i < RCU_NEXT_SIZE; i++)
1914 rdp->nxttail[i] = &rdp->nxtlist; 2029 rdp->nxttail[i] = &rdp->nxtlist;
1915 rdp->qlen = 0; 2030 rdp->qlen = 0;
1916#ifdef CONFIG_NO_HZ
1917 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2031 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1918#endif /* #ifdef CONFIG_NO_HZ */ 2032 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING);
2033 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
1919 rdp->cpu = cpu; 2034 rdp->cpu = cpu;
1920 rdp->rsp = rsp; 2035 rdp->rsp = rsp;
1921 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2036 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1942,6 +2057,10 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1942 rdp->qlen_last_fqs_check = 0; 2057 rdp->qlen_last_fqs_check = 0;
1943 rdp->n_force_qs_snap = rsp->n_force_qs; 2058 rdp->n_force_qs_snap = rsp->n_force_qs;
1944 rdp->blimit = blimit; 2059 rdp->blimit = blimit;
2060 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING;
2061 atomic_set(&rdp->dynticks->dynticks,
2062 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2063 rcu_prepare_for_idle_init(cpu);
1945 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2064 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1946 2065
1947 /* 2066 /*
@@ -2023,6 +2142,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2023 rcu_send_cbs_to_online(&rcu_bh_state); 2142 rcu_send_cbs_to_online(&rcu_bh_state);
2024 rcu_send_cbs_to_online(&rcu_sched_state); 2143 rcu_send_cbs_to_online(&rcu_sched_state);
2025 rcu_preempt_send_cbs_to_online(); 2144 rcu_preempt_send_cbs_to_online();
2145 rcu_cleanup_after_idle(cpu);
2026 break; 2146 break;
2027 case CPU_DEAD: 2147 case CPU_DEAD:
2028 case CPU_DEAD_FROZEN: 2148 case CPU_DEAD_FROZEN:
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 849ce9ec51fe..fddff92d6676 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,9 +84,10 @@
84 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
85 */ 85 */
86struct rcu_dynticks { 86struct rcu_dynticks {
87 int dynticks_nesting; /* Track irq/process nesting level. */ 87 long long dynticks_nesting; /* Track irq/process nesting level. */
88 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 88 /* Process level is worth LLONG_MAX/2. */
89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ 89 int dynticks_nmi_nesting; /* Track NMI nesting level. */
90 atomic_t dynticks; /* Even value for idle, else odd. */
90}; 91};
91 92
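The nesting counter is widened to long long because process-level residence is now recorded as a huge bias (the comment's LLONG_MAX/2), leaving room for irq nesting on top. A toy model of the two-counter invariant — even dynticks value means idle, odd means non-idle — with an illustrative one-unit bias instead of the kernel's:

#include <assert.h>
#include <stdio.h>

#define DYNTICK_TASK_NESTING 1LL   /* illustrative; the kernel uses a large bias */

struct dynticks {
        long long nesting;   /* irq/process nesting level */
        int counter;         /* even value for idle, else odd */
};

static void idle_enter(struct dynticks *d)
{
        d->nesting = 0;
        d->counter++;                      /* odd -> even */
        assert((d->counter & 1) == 0);
}

static void idle_exit(struct dynticks *d)
{
        d->nesting = DYNTICK_TASK_NESTING;
        d->counter++;                      /* even -> odd */
        assert(d->counter & 1);
}

int main(void)
{
        struct dynticks d = { DYNTICK_TASK_NESTING, 1 };   /* booted: non-idle */

        idle_enter(&d);
        idle_exit(&d);
        printf("nesting=%lld counter=%d\n", d.nesting, d.counter);
        return 0;
}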
92/* RCU's kthread states for tracing. */ 93/* RCU's kthread states for tracing. */
@@ -274,16 +275,12 @@ struct rcu_data {
274 /* did other CPU force QS recently? */ 275 /* did other CPU force QS recently? */
275 long blimit; /* Upper limit on a processed batch */ 276 long blimit; /* Upper limit on a processed batch */
276 277
277#ifdef CONFIG_NO_HZ
278 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
280 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
281#endif /* #ifdef CONFIG_NO_HZ */
282 281
283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 282 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
284#ifdef CONFIG_NO_HZ
285 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 283 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
286#endif /* #ifdef CONFIG_NO_HZ */
287 unsigned long offline_fqs; /* Kicked due to being offline. */ 284 unsigned long offline_fqs; /* Kicked due to being offline. */
288 unsigned long resched_ipi; /* Sent a resched IPI. */ 285 unsigned long resched_ipi; /* Sent a resched IPI. */
289 286
@@ -302,16 +299,12 @@ struct rcu_data {
302 struct rcu_state *rsp; 299 struct rcu_state *rsp;
303}; 300};
304 301
305/* Values for signaled field in struct rcu_state. */ 302/* Values for fqs_state field in struct rcu_state. */
306#define RCU_GP_IDLE 0 /* No grace period in progress. */ 303#define RCU_GP_IDLE 0 /* No grace period in progress. */
307#define RCU_GP_INIT 1 /* Grace period being initialized. */ 304#define RCU_GP_INIT 1 /* Grace period being initialized. */
308#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 305#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
309#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 306#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
310#ifdef CONFIG_NO_HZ
311#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 307#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
312#else /* #ifdef CONFIG_NO_HZ */
313#define RCU_SIGNAL_INIT RCU_FORCE_QS
314#endif /* #else #ifdef CONFIG_NO_HZ */
315 308
316#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 309#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
317 310
@@ -361,7 +354,7 @@ struct rcu_state {
361 354
362 /* The following fields are guarded by the root rcu_node's lock. */ 355 /* The following fields are guarded by the root rcu_node's lock. */
363 356
364 u8 signaled ____cacheline_internodealigned_in_smp; 357 u8 fqs_state ____cacheline_internodealigned_in_smp;
365 /* Force QS state. */ 358 /* Force QS state. */
366 u8 fqs_active; /* force_quiescent_state() */ 359 u8 fqs_active; /* force_quiescent_state() */
367 /* is running. */ 360 /* is running. */
@@ -451,7 +444,8 @@ static void rcu_preempt_check_callbacks(int cpu);
451static void rcu_preempt_process_callbacks(void); 444static void rcu_preempt_process_callbacks(void);
452void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 445void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
453#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 446#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
454static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); 447static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
448 bool wake);
455#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 449#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
456static int rcu_preempt_pending(int cpu); 450static int rcu_preempt_pending(int cpu);
457static int rcu_preempt_needs_cpu(int cpu); 451static int rcu_preempt_needs_cpu(int cpu);
@@ -461,6 +455,7 @@ static void __init __rcu_init_preempt(void);
461static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 455static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
462static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 456static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
463static void invoke_rcu_callbacks_kthread(void); 457static void invoke_rcu_callbacks_kthread(void);
458static bool rcu_is_callbacks_kthread(void);
464#ifdef CONFIG_RCU_BOOST 459#ifdef CONFIG_RCU_BOOST
465static void rcu_preempt_do_callbacks(void); 460static void rcu_preempt_do_callbacks(void);
466static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, 461static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
@@ -473,5 +468,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
473#endif /* #ifdef CONFIG_RCU_BOOST */ 468#endif /* #ifdef CONFIG_RCU_BOOST */
474static void rcu_cpu_kthread_setrt(int cpu, int to_rt); 469static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
475static void __cpuinit rcu_prepare_kthreads(int cpu); 470static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu);
476 474
477#endif /* #ifndef RCU_TREE_NONCORE */ 475#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 4b9b9f8a4184..8bb35d73e1f9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -312,6 +312,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
312{ 312{
313 int empty; 313 int empty;
314 int empty_exp; 314 int empty_exp;
315 int empty_exp_now;
315 unsigned long flags; 316 unsigned long flags;
316 struct list_head *np; 317 struct list_head *np;
317#ifdef CONFIG_RCU_BOOST 318#ifdef CONFIG_RCU_BOOST
@@ -382,8 +383,10 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
382 /* 383 /*
383 * If this was the last task on the current list, and if 384 * If this was the last task on the current list, and if
384 * we aren't waiting on any CPUs, report the quiescent state. 385 * we aren't waiting on any CPUs, report the quiescent state.
385 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 386 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
387 * so we must take a snapshot of the expedited state.
386 */ 388 */
389 empty_exp_now = !rcu_preempted_readers_exp(rnp);
387 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 390 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
388 trace_rcu_quiescent_state_report("preempt_rcu", 391 trace_rcu_quiescent_state_report("preempt_rcu",
389 rnp->gpnum, 392 rnp->gpnum,
@@ -406,8 +409,8 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
406 * If this was the last task on the expedited lists, 409 * If this was the last task on the expedited lists,
407 * then we need to report up the rcu_node hierarchy. 410 * then we need to report up the rcu_node hierarchy.
408 */ 411 */
409 if (!empty_exp && !rcu_preempted_readers_exp(rnp)) 412 if (!empty_exp && empty_exp_now)
410 rcu_report_exp_rnp(&rcu_preempt_state, rnp); 413 rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
411 } else { 414 } else {
412 local_irq_restore(flags); 415 local_irq_restore(flags);
413 } 416 }
@@ -729,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
729 * recursively up the tree. (Calm down, calm down, we do the recursion 732 * recursively up the tree. (Calm down, calm down, we do the recursion
730 * iteratively!) 733 * iteratively!)
731 * 734 *
735 * Most callers will set the "wake" flag, but the task initiating the
736 * expedited grace period need not wake itself.
737 *
732 * Caller must hold sync_rcu_preempt_exp_mutex. 738 * Caller must hold sync_rcu_preempt_exp_mutex.
733 */ 739 */
734static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) 740static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
741 bool wake)
735{ 742{
736 unsigned long flags; 743 unsigned long flags;
737 unsigned long mask; 744 unsigned long mask;
@@ -744,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
744 } 751 }
745 if (rnp->parent == NULL) { 752 if (rnp->parent == NULL) {
746 raw_spin_unlock_irqrestore(&rnp->lock, flags); 753 raw_spin_unlock_irqrestore(&rnp->lock, flags);
747 wake_up(&sync_rcu_preempt_exp_wq); 754 if (wake)
755 wake_up(&sync_rcu_preempt_exp_wq);
748 break; 756 break;
749 } 757 }
750 mask = rnp->grpmask; 758 mask = rnp->grpmask;
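A toy model of this propagation loop with the new wake flag: completion is reported upward through the parents, and the waiter at the root is woken only when the reporter is not the waiter itself. Names and structure are illustrative:

#include <stdbool.h>
#include <stdio.h>

struct node { struct node *parent; int tasks_blocked; };

static void wake_waiter(void) { puts("wake_up()"); }

static void report_exp(struct node *rnp, bool wake)
{
        for (;;) {
                if (rnp->tasks_blocked)
                        break;             /* this level is not done yet */
                if (!rnp->parent) {
                        if (wake)          /* the initiator skips self-wakeup */
                                wake_waiter();
                        break;
                }
                rnp = rnp->parent;         /* report one level up */
        }
}

int main(void)
{
        struct node root = { NULL, 0 }, leaf = { &root, 0 };

        report_exp(&leaf, true);    /* e.g. from rcu_read_unlock_special() */
        report_exp(&leaf, false);   /* e.g. from sync_rcu_preempt_exp_init() */
        return 0;
}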
@@ -777,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
777 must_wait = 1; 785 must_wait = 1;
778 } 786 }
779 if (!must_wait) 787 if (!must_wait)
780 rcu_report_exp_rnp(rsp, rnp); 788 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
781} 789}
782 790
783/* 791/*
@@ -1069,9 +1077,9 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1069 * report on tasks preempted in RCU read-side critical sections during 1077 * report on tasks preempted in RCU read-side critical sections during
1070 * expedited RCU grace periods. 1078 * expedited RCU grace periods.
1071 */ 1079 */
1072static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) 1080static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1081 bool wake)
1073{ 1082{
1074 return;
1075} 1083}
1076 1084
1077#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1085#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1157,8 +1165,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1157 1165
1158#endif /* #else #ifdef CONFIG_RCU_TRACE */ 1166#endif /* #else #ifdef CONFIG_RCU_TRACE */
1159 1167
1160static struct lock_class_key rcu_boost_class;
1161
1162/* 1168/*
1163 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1169 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1164 * or ->boost_tasks, advancing the pointer to the next task in the 1170 * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1221,15 +1227,13 @@ static int rcu_boost(struct rcu_node *rnp)
1221 */ 1227 */
1222 t = container_of(tb, struct task_struct, rcu_node_entry); 1228 t = container_of(tb, struct task_struct, rcu_node_entry);
1223 rt_mutex_init_proxy_locked(&mtx, t); 1229 rt_mutex_init_proxy_locked(&mtx, t);
1224 /* Avoid lockdep false positives. This rt_mutex is its own thing. */
1225 lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class,
1226 "rcu_boost_mutex");
1227 t->rcu_boost_mutex = &mtx; 1230 t->rcu_boost_mutex = &mtx;
1228 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1231 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1229 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1232 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1230 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1233 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1231 1234
1232 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; 1235 return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1236 ACCESS_ONCE(rnp->boost_tasks) != NULL;
1233} 1237}
1234 1238
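The return statement now rereads ->exp_tasks and ->boost_tasks through ACCESS_ONCE because rnp->lock has already been dropped. The macro is a volatile-qualified load that keeps the compiler from caching, refetching, or tearing the access; a minimal gcc/clang sketch:

#include <stddef.h>
#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static int *exp_tasks;   /* imagine another CPU updating this concurrently */

static int still_pending(void)
{
        /* Exactly one load; the compiler may not cache or refetch it. */
        return ACCESS_ONCE(exp_tasks) != NULL;
}

int main(void)
{
        int v = 0;

        exp_tasks = &v;
        printf("%d\n", still_pending());   /* 1 */
        return 0;
}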
1235/* 1239/*
@@ -1329,6 +1333,15 @@ static void invoke_rcu_callbacks_kthread(void)
1329} 1333}
1330 1334
1331/* 1335/*
1336 * Is the current CPU running the RCU-callbacks kthread?
1337 * Caller must have preemption disabled.
1338 */
1339static bool rcu_is_callbacks_kthread(void)
1340{
1341 return __get_cpu_var(rcu_cpu_kthread_task) == current;
1342}
1343
1344/*
1332 * Set the affinity of the boost kthread. The CPU-hotplug locks are 1345 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1333 * held, so no one should be messing with the existence of the boost 1346 * held, so no one should be messing with the existence of the boost
1334 * kthread. 1347 * kthread.
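rcu_is_callbacks_kthread() above is nothing more than a per-CPU task pointer compared against current. A userspace analogue with pthreads, where a global slot records which thread plays the callbacks kthread (entirely illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_t callbacks_kthread;   /* analogue of rcu_cpu_kthread_task */

static bool is_callbacks_kthread(void)
{
        return pthread_equal(pthread_self(), callbacks_kthread);
}

static void *worker(void *arg)
{
        (void)arg;
        printf("worker: %d\n", is_callbacks_kthread());   /* 0 */
        return NULL;
}

int main(void)
{
        pthread_t t;

        callbacks_kthread = pthread_self();   /* main plays the kthread */
        pthread_create(&t, NULL, worker, NULL);
        pthread_join(t, NULL);
        printf("main:   %d\n", is_callbacks_kthread());   /* 1 */
        return 0;
}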
@@ -1772,6 +1785,11 @@ static void invoke_rcu_callbacks_kthread(void)
1772 WARN_ON_ONCE(1); 1785 WARN_ON_ONCE(1);
1773} 1786}
1774 1787
1788static bool rcu_is_callbacks_kthread(void)
1789{
1790 return false;
1791}
1792
1775static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1793static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1776{ 1794{
1777} 1795}
@@ -1907,7 +1925,7 @@ void synchronize_sched_expedited(void)
1907 * grace period works for us. 1925 * grace period works for us.
1908 */ 1926 */
1909 get_online_cpus(); 1927 get_online_cpus();
1910 snap = atomic_read(&sync_sched_expedited_started) - 1; 1928 snap = atomic_read(&sync_sched_expedited_started);
1911 smp_mb(); /* ensure read is before try_stop_cpus(). */ 1929 smp_mb(); /* ensure read is before try_stop_cpus(). */
1912 } 1930 }
1913 1931
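Dropping the "- 1" means the retry path resamples the started counter exactly. The bookkeeping pattern, heavily simplified and single-threaded (the kernel's version uses atomics and memory barriers around try_stop_cpus()): snapshot a start ticket, and if a grace period completed at or after the snapshot, that grace period covers this caller:

#include <stdio.h>

static int started;   /* bumped when an expedited GP begins */
static int done;      /* set to the begin count when one completes */

/* True if no expedited GP has completed since our snapshot. */
static int need_full_gp(int snap)
{
        return done - snap < 0;
}

int main(void)
{
        int snap = started + 1;      /* ticket for the GP we want */

        started++;                   /* someone (maybe us) starts one... */
        done = started;              /* ...and it completes */
        puts(need_full_gp(snap) ? "retry" : "covered by that GP");
        return 0;
}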
@@ -1939,88 +1957,243 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1939 * 1 if so. This function is part of the RCU implementation; it is -not- 1957 * 1 if so. This function is part of the RCU implementation; it is -not-
1940 * an exported member of the RCU API. 1958 * an exported member of the RCU API.
1941 * 1959 *
1942 * Because we have preemptible RCU, just check whether this CPU needs	1960 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
1943 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption 1961 * any flavor of RCU.
1944 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
1945 */ 1962 */
1946int rcu_needs_cpu(int cpu) 1963int rcu_needs_cpu(int cpu)
1947{ 1964{
1948 return rcu_needs_cpu_quick_check(cpu); 1965 return rcu_cpu_has_callbacks(cpu);
1966}
1967
1968/*
1969 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1970 */
1971static void rcu_prepare_for_idle_init(int cpu)
1972{
1973}
1974
1975/*
1976 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1977 * after it.
1978 */
1979static void rcu_cleanup_after_idle(int cpu)
1980{
1981}
1982
1983/*
1984 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
1985 * is nothing.
1986 */
1987static void rcu_prepare_for_idle(int cpu)
1988{
1949} 1989}
1950 1990
1951#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1991#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1952 1992
1953#define RCU_NEEDS_CPU_FLUSHES 5 1993/*
1994 * This code is invoked when a CPU goes idle, at which point we want
1995 * to have the CPU do everything required for RCU so that it can enter
1996 * the energy-efficient dyntick-idle mode. This is handled by a
1997 * state machine implemented by rcu_prepare_for_idle() below.
1998 *
1999 * The following three preprocessor symbols control this state machine:
2000 *
2001 * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
2002 * to satisfy RCU. Beyond this point, it is better to incur a periodic
2003 * scheduling-clock interrupt than to loop through the state machine
2004 * at full power.
2005 * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
2006 * optional if RCU does not need anything immediately from this
2007 * CPU, even if this CPU still has RCU callbacks queued. The first
2008 * times through the state machine are mandatory: we need to give
2009 * the state machine a chance to communicate a quiescent state
2010 * to the RCU core.
2011 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
2012 * to sleep in dyntick-idle mode with RCU callbacks pending. This
2013 * is sized to be roughly one RCU grace period. Those energy-efficiency
2014 * benchmarkers who might otherwise be tempted to set this to a large
2015 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
2016 * system. And if you are -that- concerned about energy efficiency,
2017 * just power the system down and be done with it!
2018 *
2019 * The values below work well in practice. If future workloads require
2020 * adjustment, they can be converted into kernel config parameters, though
2021 * making the state machine smarter might be a better option.
2022 */
2023#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
2024#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
2025#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
2026
1954static DEFINE_PER_CPU(int, rcu_dyntick_drain); 2027static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1955static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 2028static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
2029static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
2030static ktime_t rcu_idle_gp_wait;
1956 2031
1957/* 2032/*
1958 * Check to see if any future RCU-related work will need to be done 2033 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1959 * by the current CPU, even if none need be done immediately, returning 2034 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1960 * 1 if so. This function is part of the RCU implementation; it is -not- 2035 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1961 * an exported member of the RCU API. 2036 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
2037 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
2038 * it is better to incur scheduling-clock interrupts than to spin
2039 * continuously for the same time duration!
2040 */
2041int rcu_needs_cpu(int cpu)
2042{
2043 /* If no callbacks, RCU doesn't need the CPU. */
2044 if (!rcu_cpu_has_callbacks(cpu))
2045 return 0;
2046 /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
2047 return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
2048}
2049
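The holdoff test compares a stashed jiffies value against the current one; equality means this CPU already tried and failed to go dyntick-idle during the current tick. A tiny model with jiffies faked as a plain counter:

#include <stdbool.h>
#include <stdio.h>

static unsigned long jiffies;            /* faked tick counter */
static unsigned long dyntick_holdoff;    /* jiffy of the last failed attempt */
static bool have_callbacks = true;

static int needs_cpu(void)
{
        if (!have_callbacks)
                return 0;                        /* nothing queued: may sleep */
        return dyntick_holdoff == jiffies;       /* tried and failed this tick */
}

int main(void)
{
        dyntick_holdoff = jiffies;       /* just gave up on dyntick-idle */
        printf("%d\n", needs_cpu());     /* 1: keep the scheduling tick */
        jiffies++;                       /* one tick later */
        printf("%d\n", needs_cpu());     /* 0: worth another idle attempt */
        return 0;
}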
2050/*
2051 * Timer handler used to force CPU to start pushing its remaining RCU
2052 * callbacks in the case where it entered dyntick-idle mode with callbacks
2053 * pending. The handler doesn't really need to do anything because the
2054 * real work is done upon re-entry to idle, or by the next scheduling-clock
2055 * interrupt should idle not be re-entered.
2056 */
2057static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
2058{
2059 trace_rcu_prep_idle("Timer");
2060 return HRTIMER_NORESTART;
2061}
2062
2063/*
2064 * Initialize the timer used to pull CPUs out of dyntick-idle mode.
2065 */
2066static void rcu_prepare_for_idle_init(int cpu)
2067{
2068 static int firsttime = 1;
2069 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
2070
2071 hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2072 hrtp->function = rcu_idle_gp_timer_func;
2073 if (firsttime) {
2074 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2075
2076 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2077 firsttime = 0;
2078 }
2079}
2080
2081/*
2082 * Clean up for exit from idle. Because we are exiting from idle, there
2083 * is no longer any point to rcu_idle_gp_timer, so cancel it. This will
2084 * do nothing if this timer is not active, so just cancel it unconditionally.
2085 */
2086static void rcu_cleanup_after_idle(int cpu)
2087{
2088 hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu));
2089}
2090
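The init/cleanup pair arms and cancels a per-CPU one-shot timer around idle. A rough userspace analogue with POSIX timers (link with -lrt; purely illustrative, since the kernel uses hrtimers and per-CPU data):

#include <signal.h>
#include <stdio.h>
#include <time.h>

static timer_t idle_gp_timer;

static void prepare_for_idle_init(void)
{
        struct sigevent sev = { .sigev_notify = SIGEV_NONE };

        timer_create(CLOCK_MONOTONIC, &sev, &idle_gp_timer);
}

static void arm_idle_gp_timer(long delay_ns)
{
        struct itimerspec its = { .it_value = { 0, delay_ns } };

        timer_settime(idle_gp_timer, 0, &its, NULL);   /* one-shot, relative */
}

static void cleanup_after_idle(void)
{
        struct itimerspec off = { .it_value = { 0, 0 } };

        timer_settime(idle_gp_timer, 0, &off, NULL);   /* harmless if unarmed */
}

int main(void)
{
        prepare_for_idle_init();
        arm_idle_gp_timer(6000000L);     /* ~6 ticks at HZ=1000 */
        cleanup_after_idle();
        puts("armed and cancelled");
        return 0;
}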
2091/*
2092 * Check to see if any RCU-related work can be done by the current CPU,
2093 * and if so, schedule a softirq to get it done. This function is part
2094 * of the RCU implementation; it is -not- an exported member of the RCU API.
1962 * 2095 *
1963 * Because we are not supporting preemptible RCU, attempt to accelerate 2096 * The idea is for the current CPU to clear out all work required by the
1964 * any current grace periods so that RCU no longer needs this CPU, but 2097 * RCU core for the current grace period, so that this CPU can be permitted
1965 * only if all other CPUs are already in dynticks-idle mode. This will 2098 * to enter dyntick-idle mode. In some cases, it will need to be awakened
1966 * allow the CPU cores to be powered down immediately, as opposed to after 2099 * at the end of the grace period by whatever CPU ends the grace period.
1967 * waiting many milliseconds for grace periods to elapse. 2100 * This allows CPUs to go dyntick-idle more quickly, and to reduce the
2101 * number of wakeups by a modest integer factor.
1968 * 2102 *
1969 * Because it is not legal to invoke rcu_process_callbacks() with irqs 2103 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1970 * disabled, we do one pass of force_quiescent_state(), then do a 2104 * disabled, we do one pass of force_quiescent_state(), then do a
1971 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked 2105 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1972 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. 2106 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
2107 *
2108 * The caller must have disabled interrupts.
1973 */ 2109 */
1974int rcu_needs_cpu(int cpu) 2110static void rcu_prepare_for_idle(int cpu)
1975{ 2111{
1976 int c = 0; 2112 unsigned long flags;
1977 int snap; 2113
1978 int thatcpu; 2114 local_irq_save(flags);
1979 2115
1980 /* Check for being in the holdoff period. */ 2116 /*
1981 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) 2117 * If there are no callbacks on this CPU, enter dyntick-idle mode.
1982 return rcu_needs_cpu_quick_check(cpu); 2118 * Also reset state to avoid prejudicing later attempts.
1983 2119 */
1984 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 2120 if (!rcu_cpu_has_callbacks(cpu)) {
1985 for_each_online_cpu(thatcpu) { 2121 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1986 if (thatcpu == cpu) 2122 per_cpu(rcu_dyntick_drain, cpu) = 0;
1987 continue; 2123 local_irq_restore(flags);
1988 snap = atomic_add_return(0, &per_cpu(rcu_dynticks, 2124 trace_rcu_prep_idle("No callbacks");
1989 thatcpu).dynticks); 2125 return;
1990 smp_mb(); /* Order sampling of snap with end of grace period. */ 2126 }
1991 if ((snap & 0x1) != 0) { 2127
1992 per_cpu(rcu_dyntick_drain, cpu) = 0; 2128 /*
1993 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2129 * If in holdoff mode, just return. We will presumably have
1994 return rcu_needs_cpu_quick_check(cpu); 2130 * refrained from disabling the scheduling-clock tick.
1995 } 2131 */
2132 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
2133 local_irq_restore(flags);
2134 trace_rcu_prep_idle("In holdoff");
2135 return;
1996 } 2136 }
1997 2137
1998 /* Check and update the rcu_dyntick_drain sequencing. */ 2138 /* Check and update the rcu_dyntick_drain sequencing. */
1999 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2139 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2000 /* First time through, initialize the counter. */ 2140 /* First time through, initialize the counter. */
2001 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; 2141 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;
2142 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES &&
2143 !rcu_pending(cpu)) {
2144 /* Can we go dyntick-idle despite still having callbacks? */
2145 trace_rcu_prep_idle("Dyntick with callbacks");
2146 per_cpu(rcu_dyntick_drain, cpu) = 0;
2147 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2148 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2149 rcu_idle_gp_wait, HRTIMER_MODE_REL);
2150 return; /* Nothing more to do immediately. */
2002 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2151 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2003 /* We have hit the limit, so time to give up. */ 2152 /* We have hit the limit, so time to give up. */
2004 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2153 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2005 return rcu_needs_cpu_quick_check(cpu); 2154 local_irq_restore(flags);
2155 trace_rcu_prep_idle("Begin holdoff");
2156 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2157 return;
2006 } 2158 }
2007 2159
2008 /* Do one step pushing remaining RCU callbacks through. */ 2160 /*
2161 * Do one step of pushing the remaining RCU callbacks through
2162 * the RCU core state machine.
2163 */
2164#ifdef CONFIG_TREE_PREEMPT_RCU
2165 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
2166 local_irq_restore(flags);
2167 rcu_preempt_qs(cpu);
2168 force_quiescent_state(&rcu_preempt_state, 0);
2169 local_irq_save(flags);
2170 }
2171#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2009 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 2172 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
2173 local_irq_restore(flags);
2010 rcu_sched_qs(cpu); 2174 rcu_sched_qs(cpu);
2011 force_quiescent_state(&rcu_sched_state, 0); 2175 force_quiescent_state(&rcu_sched_state, 0);
2012 c = c || per_cpu(rcu_sched_data, cpu).nxtlist; 2176 local_irq_save(flags);
2013 } 2177 }
2014 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 2178 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
2179 local_irq_restore(flags);
2015 rcu_bh_qs(cpu); 2180 rcu_bh_qs(cpu);
2016 force_quiescent_state(&rcu_bh_state, 0); 2181 force_quiescent_state(&rcu_bh_state, 0);
2017 c = c || per_cpu(rcu_bh_data, cpu).nxtlist; 2182 local_irq_save(flags);
2018 } 2183 }
2019 2184
2020 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 2185 /*
2021 if (c) 2186 * If RCU callbacks are still pending, RCU still needs this CPU.
2187 * So try forcing the callbacks through the grace period.
2188 */
2189 if (rcu_cpu_has_callbacks(cpu)) {
2190 local_irq_restore(flags);
2191 trace_rcu_prep_idle("More callbacks");
2022 invoke_rcu_core(); 2192 invoke_rcu_core();
2023 return c; 2193 } else {
2194 local_irq_restore(flags);
2195 trace_rcu_prep_idle("Callbacks drained");
2196 }
2024} 2197}
2025 2198
2026#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2199#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
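Pulling the pieces together, a compressed and hypothetical model of rcu_prepare_for_idle()'s decision sequence: the RCU_IDLE_* constants drive a per-CPU countdown whose exits are "go idle now", "go idle with the timer armed", and "enter holdoff". Here rcu_urgent stands in for rcu_pending(), and locking, tracing, and the actual callback pushes are omitted:

#include <stdio.h>

#define RCU_IDLE_FLUSHES     5   /* number of dyntick-idle tries */
#define RCU_IDLE_OPT_FLUSHES 3   /* optional dyntick-idle tries */

static unsigned long jiffies = 100;
static unsigned long holdoff;
static int drain;

/* One idle-entry attempt; returns what this CPU should do next. */
static const char *prepare_for_idle(int have_cbs, int rcu_urgent)
{
        if (!have_cbs) {
                holdoff = jiffies - 1;        /* reset for later attempts */
                drain = 0;
                return "idle: no callbacks";
        }
        if (holdoff == jiffies)
                return "holdoff: keep the tick";
        if (drain <= 0) {
                drain = RCU_IDLE_FLUSHES;     /* first try: arm the countdown */
        } else if (drain <= RCU_IDLE_OPT_FLUSHES && !rcu_urgent) {
                drain = 0;                    /* RCU can wait: park behind timer */
                holdoff = jiffies - 1;
                return "idle: callbacks parked behind the gp timer";
        } else if (--drain <= 0) {
                holdoff = jiffies;            /* give up until the next jiffy */
                return "holdoff begins";
        }
        return "push callbacks through, then retry";
}

int main(void)
{
        for (int i = 0; i < 6; i++)
                puts(prepare_for_idle(1, 1));
        return 0;
}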
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9feffa4c0695..654cfe67f0d1 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -67,13 +67,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
67 rdp->completed, rdp->gpnum, 67 rdp->completed, rdp->gpnum,
68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
69 rdp->qs_pending); 69 rdp->qs_pending);
70#ifdef CONFIG_NO_HZ 70 seq_printf(m, " dt=%d/%llx/%d df=%lu",
71 seq_printf(m, " dt=%d/%d/%d df=%lu",
72 atomic_read(&rdp->dynticks->dynticks), 71 atomic_read(&rdp->dynticks->dynticks),
73 rdp->dynticks->dynticks_nesting, 72 rdp->dynticks->dynticks_nesting,
74 rdp->dynticks->dynticks_nmi_nesting, 73 rdp->dynticks->dynticks_nmi_nesting,
75 rdp->dynticks_fqs); 74 rdp->dynticks_fqs);
76#endif /* #ifdef CONFIG_NO_HZ */
77 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 75 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
78 seq_printf(m, " ql=%ld qs=%c%c%c%c", 76 seq_printf(m, " ql=%ld qs=%c%c%c%c",
79 rdp->qlen, 77 rdp->qlen,
@@ -141,13 +139,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
141 rdp->completed, rdp->gpnum, 139 rdp->completed, rdp->gpnum,
142 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 140 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
143 rdp->qs_pending); 141 rdp->qs_pending);
144#ifdef CONFIG_NO_HZ 142 seq_printf(m, ",%d,%llx,%d,%lu",
145 seq_printf(m, ",%d,%d,%d,%lu",
146 atomic_read(&rdp->dynticks->dynticks), 143 atomic_read(&rdp->dynticks->dynticks),
147 rdp->dynticks->dynticks_nesting, 144 rdp->dynticks->dynticks_nesting,
148 rdp->dynticks->dynticks_nmi_nesting, 145 rdp->dynticks->dynticks_nmi_nesting,
149 rdp->dynticks_fqs); 146 rdp->dynticks_fqs);
150#endif /* #ifdef CONFIG_NO_HZ */
151 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 147 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
152 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, 148 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
153 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 149 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
@@ -171,9 +167,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
171static int show_rcudata_csv(struct seq_file *m, void *unused) 167static int show_rcudata_csv(struct seq_file *m, void *unused)
172{ 168{
173 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); 169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
174#ifdef CONFIG_NO_HZ
175 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
176#endif /* #ifdef CONFIG_NO_HZ */
177 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); 171 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
178#ifdef CONFIG_RCU_BOOST 172#ifdef CONFIG_RCU_BOOST
179 seq_puts(m, "\"kt\",\"ktl\""); 173 seq_puts(m, "\"kt\",\"ktl\"");
@@ -278,7 +272,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
278 gpnum = rsp->gpnum; 272 gpnum = rsp->gpnum;
279 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
280 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
281 rsp->completed, gpnum, rsp->signaled, 275 rsp->completed, gpnum, rsp->fqs_state,
282 (long)(rsp->jiffies_force_qs - jiffies), 276 (long)(rsp->jiffies_force_qs - jiffies),
283 (int)(jiffies & 0xffff), 277 (int)(jiffies & 0xffff),
284 rsp->n_force_qs, rsp->n_force_qs_ngp, 278 rsp->n_force_qs, rsp->n_force_qs_ngp,
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 8eafd1bd273e..16502d3a71c8 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -101,6 +101,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
101 101
102 printk("\n============================================\n"); 102 printk("\n============================================\n");
103 printk( "[ BUG: circular locking deadlock detected! ]\n"); 103 printk( "[ BUG: circular locking deadlock detected! ]\n");
104 printk("%s\n", print_tainted());
104 printk( "--------------------------------------------\n"); 105 printk( "--------------------------------------------\n");
105 printk("%s/%d is deadlocking current task %s/%d\n\n", 106 printk("%s/%d is deadlocking current task %s/%d\n\n",
106 task->comm, task_pid_nr(task), 107 task->comm, task_pid_nr(task),
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index f9d8482dd487..a242e691c993 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -579,7 +579,6 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
579 struct rt_mutex_waiter *waiter) 579 struct rt_mutex_waiter *waiter)
580{ 580{
581 int ret = 0; 581 int ret = 0;
582 int was_disabled;
583 582
584 for (;;) { 583 for (;;) {
585 /* Try to acquire the lock: */ 584 /* Try to acquire the lock: */
@@ -602,17 +601,10 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
602 601
603 raw_spin_unlock(&lock->wait_lock); 602 raw_spin_unlock(&lock->wait_lock);
604 603
605 was_disabled = irqs_disabled();
606 if (was_disabled)
607 local_irq_enable();
608
609 debug_rt_mutex_print_deadlock(waiter); 604 debug_rt_mutex_print_deadlock(waiter);
610 605
611 schedule_rt_mutex(lock); 606 schedule_rt_mutex(lock);
612 607
613 if (was_disabled)
614 local_irq_disable();
615
616 raw_spin_lock(&lock->wait_lock); 608 raw_spin_lock(&lock->wait_lock);
617 set_current_state(state); 609 set_current_state(state);
618 } 610 }
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
new file mode 100644
index 000000000000..9a7dd35102a3
--- /dev/null
+++ b/kernel/sched/Makefile
@@ -0,0 +1,20 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_clock.o = -pg
3endif
4
5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
6# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
7# needed for x86 only. Why this used to be enabled for all architectures is beyond
8# me. I suspect most platforms don't need this, but until we know that for sure
9# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
10# to get a correct value for the wait-channel (WCHAN in ps). --davidm
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif
13
14obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
20
diff --git a/kernel/sched_autogroup.c b/kernel/sched/auto_group.c
index 429242f3c484..e8a1f83ee0e7 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched/auto_group.c
@@ -1,15 +1,19 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include "sched.h"
4
3#include <linux/proc_fs.h> 5#include <linux/proc_fs.h>
4#include <linux/seq_file.h> 6#include <linux/seq_file.h>
5#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
6#include <linux/utsname.h> 8#include <linux/utsname.h>
9#include <linux/security.h>
10#include <linux/export.h>
7 11
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 12unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default; 13static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr; 14static atomic_t autogroup_seq_nr;
11 15
12static void __init autogroup_init(struct task_struct *init_task) 16void __init autogroup_init(struct task_struct *init_task)
13{ 17{
14 autogroup_default.tg = &root_task_group; 18 autogroup_default.tg = &root_task_group;
15 kref_init(&autogroup_default.kref); 19 kref_init(&autogroup_default.kref);
@@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task)
17 init_task->signal->autogroup = &autogroup_default; 21 init_task->signal->autogroup = &autogroup_default;
18} 22}
19 23
20static inline void autogroup_free(struct task_group *tg) 24void autogroup_free(struct task_group *tg)
21{ 25{
22 kfree(tg->autogroup); 26 kfree(tg->autogroup);
23} 27}
@@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
59 return ag; 63 return ag;
60} 64}
61 65
62#ifdef CONFIG_RT_GROUP_SCHED
63static void free_rt_sched_group(struct task_group *tg);
64#endif
65
66static inline struct autogroup *autogroup_create(void) 66static inline struct autogroup *autogroup_create(void)
67{ 67{
68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); 68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -108,8 +108,7 @@ out_fail:
108 return autogroup_kref_get(&autogroup_default); 108 return autogroup_kref_get(&autogroup_default);
109} 109}
110 110
111static inline bool 111bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
112task_wants_autogroup(struct task_struct *p, struct task_group *tg)
113{ 112{
114 if (tg != &root_task_group) 113 if (tg != &root_task_group)
115 return false; 114 return false;
@@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
127 return true; 126 return true;
128} 127}
129 128
130static inline bool task_group_is_autogroup(struct task_group *tg)
131{
132 return !!tg->autogroup;
133}
134
135static inline struct task_group *
136autogroup_task_group(struct task_struct *p, struct task_group *tg)
137{
138 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
139
140 if (enabled && task_wants_autogroup(p, tg))
141 return p->signal->autogroup->tg;
142
143 return tg;
144}
145
146static void 129static void
147autogroup_move_group(struct task_struct *p, struct autogroup *ag) 130autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148{ 131{
@@ -263,7 +246,7 @@ out:
263#endif /* CONFIG_PROC_FS */ 246#endif /* CONFIG_PROC_FS */
264 247
265#ifdef CONFIG_SCHED_DEBUG 248#ifdef CONFIG_SCHED_DEBUG
266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 249int autogroup_path(struct task_group *tg, char *buf, int buflen)
267{ 250{
268 if (!task_group_is_autogroup(tg)) 251 if (!task_group_is_autogroup(tg))
269 return 0; 252 return 0;
diff --git a/kernel/sched_autogroup.h b/kernel/sched/auto_group.h
index c2f0e7248dca..8bd047142816 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched/auto_group.h
@@ -1,5 +1,8 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include <linux/kref.h>
4#include <linux/rwsem.h>
5
3struct autogroup { 6struct autogroup {
4 /* 7 /*
5 * reference doesn't mean how many thread attach to this 8 * reference doesn't mean how many thread attach to this
@@ -13,9 +16,28 @@ struct autogroup {
13 int nice; 16 int nice;
14}; 17};
15 18
16static inline bool task_group_is_autogroup(struct task_group *tg); 19extern void autogroup_init(struct task_struct *init_task);
20extern void autogroup_free(struct task_group *tg);
21
22static inline bool task_group_is_autogroup(struct task_group *tg)
23{
24 return !!tg->autogroup;
25}
26
27extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
28
17static inline struct task_group * 29static inline struct task_group *
18autogroup_task_group(struct task_struct *p, struct task_group *tg); 30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
33
34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg;
36
37 return tg;
38}
39
40extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
19 41
20#else /* !CONFIG_SCHED_AUTOGROUP */ 42#else /* !CONFIG_SCHED_AUTOGROUP */
21 43
diff --git a/kernel/sched_clock.c b/kernel/sched/clock.c
index c685e31492df..c685e31492df 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched/clock.c
diff --git a/kernel/sched.c b/kernel/sched/core.c
index d6b149ccf925..4dbfd04a2148 100644
--- a/kernel/sched.c
+++ b/kernel/sched/core.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched.c 2 * kernel/sched/core.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
@@ -56,7 +56,6 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/proc_fs.h> 57#include <linux/proc_fs.h>
58#include <linux/seq_file.h> 58#include <linux/seq_file.h>
59#include <linux/stop_machine.h>
60#include <linux/sysctl.h> 59#include <linux/sysctl.h>
61#include <linux/syscalls.h> 60#include <linux/syscalls.h>
62#include <linux/times.h> 61#include <linux/times.h>
@@ -75,129 +74,17 @@
75 74
76#include <asm/tlb.h> 75#include <asm/tlb.h>
77#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
78#include <asm/mutex.h>
79#ifdef CONFIG_PARAVIRT 77#ifdef CONFIG_PARAVIRT
80#include <asm/paravirt.h> 78#include <asm/paravirt.h>
81#endif 79#endif
82 80
83#include "sched_cpupri.h" 81#include "sched.h"
84#include "workqueue_sched.h" 82#include "../workqueue_sched.h"
85#include "sched_autogroup.h"
86 83
87#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
88#include <trace/events/sched.h> 85#include <trace/events/sched.h>
89 86
90/* 87void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
91 * Convert user-nice values [ -20 ... 0 ... 19 ]
92 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
93 * and back.
94 */
95#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
96#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
97#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
98
99/*
100 * 'User priority' is the nice value converted to something we
101 * can work with better when scaling various scheduler parameters,
102 * it's a [ 0 ... 39 ] range.
103 */
104#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
105#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
106#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
107
108/*
109 * Helpers for converting nanosecond timing to jiffy resolution
110 */
111#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
112
113#define NICE_0_LOAD SCHED_LOAD_SCALE
114#define NICE_0_SHIFT SCHED_LOAD_SHIFT
115
116/*
117 * These are the 'tuning knobs' of the scheduler:
118 *
119 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
120 * Timeslices get refilled after they expire.
121 */
122#define DEF_TIMESLICE (100 * HZ / 1000)
123
124/*
125 * single value that denotes runtime == period, ie unlimited time.
126 */
127#define RUNTIME_INF ((u64)~0ULL)
128
129static inline int rt_policy(int policy)
130{
131 if (policy == SCHED_FIFO || policy == SCHED_RR)
132 return 1;
133 return 0;
134}
135
136static inline int task_has_rt_policy(struct task_struct *p)
137{
138 return rt_policy(p->policy);
139}
140
141/*
142 * This is the priority-queue data structure of the RT scheduling class:
143 */
144struct rt_prio_array {
145 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
146 struct list_head queue[MAX_RT_PRIO];
147};
148
149struct rt_bandwidth {
150 /* nests inside the rq lock: */
151 raw_spinlock_t rt_runtime_lock;
152 ktime_t rt_period;
153 u64 rt_runtime;
154 struct hrtimer rt_period_timer;
155};
156
157static struct rt_bandwidth def_rt_bandwidth;
158
159static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
160
161static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
162{
163 struct rt_bandwidth *rt_b =
164 container_of(timer, struct rt_bandwidth, rt_period_timer);
165 ktime_t now;
166 int overrun;
167 int idle = 0;
168
169 for (;;) {
170 now = hrtimer_cb_get_time(timer);
171 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
172
173 if (!overrun)
174 break;
175
176 idle = do_sched_rt_period_timer(rt_b, overrun);
177 }
178
179 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
180}
181
182static
183void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
184{
185 rt_b->rt_period = ns_to_ktime(period);
186 rt_b->rt_runtime = runtime;
187
188 raw_spin_lock_init(&rt_b->rt_runtime_lock);
189
190 hrtimer_init(&rt_b->rt_period_timer,
191 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
192 rt_b->rt_period_timer.function = sched_rt_period_timer;
193}
194
195static inline int rt_bandwidth_enabled(void)
196{
197 return sysctl_sched_rt_runtime >= 0;
198}
199
200static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
201{ 88{
202 unsigned long delta; 89 unsigned long delta;
203 ktime_t soft, hard, now; 90 ktime_t soft, hard, now;
@@ -217,580 +104,12 @@ static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
217 } 104 }
218} 105}
219 106
220static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 107DEFINE_MUTEX(sched_domains_mutex);
221{ 108DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
222 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
223 return;
224
225 if (hrtimer_active(&rt_b->rt_period_timer))
226 return;
227
228 raw_spin_lock(&rt_b->rt_runtime_lock);
229 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
230 raw_spin_unlock(&rt_b->rt_runtime_lock);
231}
232
233#ifdef CONFIG_RT_GROUP_SCHED
234static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
235{
236 hrtimer_cancel(&rt_b->rt_period_timer);
237}
238#endif
239
240/*
241 * sched_domains_mutex serializes calls to init_sched_domains,
242 * detach_destroy_domains and partition_sched_domains.
243 */
244static DEFINE_MUTEX(sched_domains_mutex);
245
246#ifdef CONFIG_CGROUP_SCHED
247
248#include <linux/cgroup.h>
249
250struct cfs_rq;
251
252static LIST_HEAD(task_groups);
253
254struct cfs_bandwidth {
255#ifdef CONFIG_CFS_BANDWIDTH
256 raw_spinlock_t lock;
257 ktime_t period;
258 u64 quota, runtime;
259 s64 hierarchal_quota;
260 u64 runtime_expires;
261
262 int idle, timer_active;
263 struct hrtimer period_timer, slack_timer;
264 struct list_head throttled_cfs_rq;
265
266 /* statistics */
267 int nr_periods, nr_throttled;
268 u64 throttled_time;
269#endif
270};
271
272/* task group related information */
273struct task_group {
274 struct cgroup_subsys_state css;
275
276#ifdef CONFIG_FAIR_GROUP_SCHED
277 /* schedulable entities of this group on each cpu */
278 struct sched_entity **se;
279 /* runqueue "owned" by this group on each cpu */
280 struct cfs_rq **cfs_rq;
281 unsigned long shares;
282
283 atomic_t load_weight;
284#endif
285
286#ifdef CONFIG_RT_GROUP_SCHED
287 struct sched_rt_entity **rt_se;
288 struct rt_rq **rt_rq;
289
290 struct rt_bandwidth rt_bandwidth;
291#endif
292
293 struct rcu_head rcu;
294 struct list_head list;
295
296 struct task_group *parent;
297 struct list_head siblings;
298 struct list_head children;
299
300#ifdef CONFIG_SCHED_AUTOGROUP
301 struct autogroup *autogroup;
302#endif
303
304 struct cfs_bandwidth cfs_bandwidth;
305};
306
307/* task_group_lock serializes the addition/removal of task groups */
308static DEFINE_SPINLOCK(task_group_lock);
309
310#ifdef CONFIG_FAIR_GROUP_SCHED
311
312# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
313
314/*
315 * A weight of 0 or 1 can cause arithmetics problems.
316 * A weight of a cfs_rq is the sum of weights of which entities
317 * are queued on this cfs_rq, so a weight of a entity should not be
318 * too large, so as the shares value of a task group.
319 * (The default weight is 1024 - so there's no practical
320 * limitation from this.)
321 */
322#define MIN_SHARES (1UL << 1)
323#define MAX_SHARES (1UL << 18)
324
325static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
326#endif
327
328/* Default task group.
329 * Every task in system belong to this group at bootup.
330 */
331struct task_group root_task_group;
332
333#endif /* CONFIG_CGROUP_SCHED */
334
335/* CFS-related fields in a runqueue */
336struct cfs_rq {
337 struct load_weight load;
338 unsigned long nr_running, h_nr_running;
339
340 u64 exec_clock;
341 u64 min_vruntime;
342#ifndef CONFIG_64BIT
343 u64 min_vruntime_copy;
344#endif
345
346 struct rb_root tasks_timeline;
347 struct rb_node *rb_leftmost;
348
349 struct list_head tasks;
350 struct list_head *balance_iterator;
351
352 /*
353 * 'curr' points to currently running entity on this cfs_rq.
354 * It is set to NULL otherwise (i.e when none are currently running).
355 */
356 struct sched_entity *curr, *next, *last, *skip;
357
358#ifdef CONFIG_SCHED_DEBUG
359 unsigned int nr_spread_over;
360#endif
361
362#ifdef CONFIG_FAIR_GROUP_SCHED
363 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
364
365 /*
366 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
367 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
368 * (like users, containers etc.)
369 *
370 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
371 * list is used during load balance.
372 */
373 int on_list;
374 struct list_head leaf_cfs_rq_list;
375 struct task_group *tg; /* group that "owns" this runqueue */
376
377#ifdef CONFIG_SMP
378 /*
379 * the part of load.weight contributed by tasks
380 */
381 unsigned long task_weight;
382
383 /*
384 * h_load = weight * f(tg)
385 *
386 * Where f(tg) is the recursive weight fraction assigned to
387 * this group.
388 */
389 unsigned long h_load;
390
391 /*
392 * Maintaining per-cpu shares distribution for group scheduling
393 *
394 * load_stamp is the last time we updated the load average
395 * load_last is the last time we updated the load average and saw load
396 * load_unacc_exec_time is currently unaccounted execution time
397 */
398 u64 load_avg;
399 u64 load_period;
400 u64 load_stamp, load_last, load_unacc_exec_time;
401
402 unsigned long load_contribution;
403#endif
404#ifdef CONFIG_CFS_BANDWIDTH
405 int runtime_enabled;
406 u64 runtime_expires;
407 s64 runtime_remaining;
408
409 u64 throttled_timestamp;
410 int throttled, throttle_count;
411 struct list_head throttled_list;
412#endif
413#endif
414};
415
416#ifdef CONFIG_FAIR_GROUP_SCHED
417#ifdef CONFIG_CFS_BANDWIDTH
418static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
419{
420 return &tg->cfs_bandwidth;
421}
422
423static inline u64 default_cfs_period(void);
424static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
425static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
426
427static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
428{
429 struct cfs_bandwidth *cfs_b =
430 container_of(timer, struct cfs_bandwidth, slack_timer);
431 do_sched_cfs_slack_timer(cfs_b);
432
433 return HRTIMER_NORESTART;
434}
435
436static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
437{
438 struct cfs_bandwidth *cfs_b =
439 container_of(timer, struct cfs_bandwidth, period_timer);
440 ktime_t now;
441 int overrun;
442 int idle = 0;
443
444 for (;;) {
445 now = hrtimer_cb_get_time(timer);
446 overrun = hrtimer_forward(timer, now, cfs_b->period);
447
448 if (!overrun)
449 break;
450
451 idle = do_sched_cfs_period_timer(cfs_b, overrun);
452 }
453
454 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
455}
456
457static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
458{
459 raw_spin_lock_init(&cfs_b->lock);
460 cfs_b->runtime = 0;
461 cfs_b->quota = RUNTIME_INF;
462 cfs_b->period = ns_to_ktime(default_cfs_period());
463
464 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
465 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
466 cfs_b->period_timer.function = sched_cfs_period_timer;
467 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
468 cfs_b->slack_timer.function = sched_cfs_slack_timer;
469}
470
471static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
472{
473 cfs_rq->runtime_enabled = 0;
474 INIT_LIST_HEAD(&cfs_rq->throttled_list);
475}
476
477/* requires cfs_b->lock, may release to reprogram timer */
478static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
479{
480 /*
481 * The timer may be active because we're trying to set a new bandwidth
482 * period or because we're racing with the tear-down path
483 * (timer_active==0 becomes visible before the hrtimer call-back
484 * terminates). In either case we ensure that it's re-programmed
485 */
486 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
487 raw_spin_unlock(&cfs_b->lock);
488 /* ensure cfs_b->lock is available while we wait */
489 hrtimer_cancel(&cfs_b->period_timer);
490
491 raw_spin_lock(&cfs_b->lock);
492 /* if someone else restarted the timer then we're done */
493 if (cfs_b->timer_active)
494 return;
495 }
496
497 cfs_b->timer_active = 1;
498 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
499}
500
501static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
502{
503 hrtimer_cancel(&cfs_b->period_timer);
504 hrtimer_cancel(&cfs_b->slack_timer);
505}
506#else
507static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
508static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
510
511static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
512{
513 return NULL;
514}
515#endif /* CONFIG_CFS_BANDWIDTH */
516#endif /* CONFIG_FAIR_GROUP_SCHED */
517
518/* Real-Time classes' related field in a runqueue: */
519struct rt_rq {
520 struct rt_prio_array active;
521 unsigned long rt_nr_running;
522#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
523 struct {
524 int curr; /* highest queued rt task prio */
525#ifdef CONFIG_SMP
526 int next; /* next highest */
527#endif
528 } highest_prio;
529#endif
530#ifdef CONFIG_SMP
531 unsigned long rt_nr_migratory;
532 unsigned long rt_nr_total;
533 int overloaded;
534 struct plist_head pushable_tasks;
535#endif
536 int rt_throttled;
537 u64 rt_time;
538 u64 rt_runtime;
539 /* Nests inside the rq lock: */
540 raw_spinlock_t rt_runtime_lock;
541
542#ifdef CONFIG_RT_GROUP_SCHED
543 unsigned long rt_nr_boosted;
544
545 struct rq *rq;
546 struct list_head leaf_rt_rq_list;
547 struct task_group *tg;
548#endif
549};
550
551#ifdef CONFIG_SMP
552
553/*
554 * We add the notion of a root-domain which will be used to define per-domain
555 * variables. Each exclusive cpuset essentially defines an island domain by
556 * fully partitioning the member cpus from any other cpuset. Whenever a new
557 * exclusive cpuset is created, we also create and attach a new root-domain
558 * object.
560 */
561struct root_domain {
562 atomic_t refcount;
563 atomic_t rto_count;
564 struct rcu_head rcu;
565 cpumask_var_t span;
566 cpumask_var_t online;
567
568 /*
569 * The "RT overload" flag: it gets set if a CPU has more than
570 * one runnable RT task.
571 */
572 cpumask_var_t rto_mask;
573 struct cpupri cpupri;
574};
575
576/*
577 * By default the system creates a single root-domain with all cpus as
578 * members (mimicking the global state we have today).
579 */
580static struct root_domain def_root_domain;
581
582#endif /* CONFIG_SMP */
583
584/*
585 * This is the main, per-CPU runqueue data structure.
586 *
587 * Locking rule: code paths that need to lock multiple runqueues
588 * (such as the load balancing or thread migration code) must
589 * acquire the locks in ascending &runqueue order.
590 */
591struct rq {
592 /* runqueue lock: */
593 raw_spinlock_t lock;
594
595 /*
596 * nr_running and cpu_load should be in the same cacheline because
597 * remote CPUs use both these fields when doing load calculation.
598 */
599 unsigned long nr_running;
600 #define CPU_LOAD_IDX_MAX 5
601 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
602 unsigned long last_load_update_tick;
603#ifdef CONFIG_NO_HZ
604 u64 nohz_stamp;
605 unsigned char nohz_balance_kick;
606#endif
607 int skip_clock_update;
608
609 /* capture load from *all* tasks on this cpu: */
610 struct load_weight load;
611 unsigned long nr_load_updates;
612 u64 nr_switches;
613
614 struct cfs_rq cfs;
615 struct rt_rq rt;
616
617#ifdef CONFIG_FAIR_GROUP_SCHED
618 /* list of leaf cfs_rq on this cpu: */
619 struct list_head leaf_cfs_rq_list;
620#endif
621#ifdef CONFIG_RT_GROUP_SCHED
622 struct list_head leaf_rt_rq_list;
623#endif
624
625 /*
626 * This is part of a global counter where only the total sum
627 * over all CPUs matters. A task can increase this counter on
628 * one CPU and, if it gets migrated afterwards, decrease
629 * it on another CPU. Always updated under the runqueue lock:
630 */
631 unsigned long nr_uninterruptible;
632
633 struct task_struct *curr, *idle, *stop;
634 unsigned long next_balance;
635 struct mm_struct *prev_mm;
636
637 u64 clock;
638 u64 clock_task;
639
640 atomic_t nr_iowait;
641
642#ifdef CONFIG_SMP
643 struct root_domain *rd;
644 struct sched_domain *sd;
645
646 unsigned long cpu_power;
647
648 unsigned char idle_balance;
649 /* For active balancing */
650 int post_schedule;
651 int active_balance;
652 int push_cpu;
653 struct cpu_stop_work active_balance_work;
654 /* cpu of this runqueue: */
655 int cpu;
656 int online;
657
658 u64 rt_avg;
659 u64 age_stamp;
660 u64 idle_stamp;
661 u64 avg_idle;
662#endif
663
664#ifdef CONFIG_IRQ_TIME_ACCOUNTING
665 u64 prev_irq_time;
666#endif
667#ifdef CONFIG_PARAVIRT
668 u64 prev_steal_time;
669#endif
670#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
671 u64 prev_steal_time_rq;
672#endif
673
674 /* calc_load related fields */
675 unsigned long calc_load_update;
676 long calc_load_active;
677
678#ifdef CONFIG_SCHED_HRTICK
679#ifdef CONFIG_SMP
680 int hrtick_csd_pending;
681 struct call_single_data hrtick_csd;
682#endif
683 struct hrtimer hrtick_timer;
684#endif
685
686#ifdef CONFIG_SCHEDSTATS
687 /* latency stats */
688 struct sched_info rq_sched_info;
689 unsigned long long rq_cpu_time;
690 /* could the above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime? */
691
692 /* sys_sched_yield() stats */
693 unsigned int yld_count;
694
695 /* schedule() stats */
696 unsigned int sched_switch;
697 unsigned int sched_count;
698 unsigned int sched_goidle;
699
700 /* try_to_wake_up() stats */
701 unsigned int ttwu_count;
702 unsigned int ttwu_local;
703#endif
704
705#ifdef CONFIG_SMP
706 struct llist_head wake_list;
707#endif
708};
709
710static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
711
712
713static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
714
715static inline int cpu_of(struct rq *rq)
716{
717#ifdef CONFIG_SMP
718 return rq->cpu;
719#else
720 return 0;
721#endif
722}
723
724#define rcu_dereference_check_sched_domain(p) \
725 rcu_dereference_check((p), \
726 lockdep_is_held(&sched_domains_mutex))
727
728/*
729 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
730 * See detach_destroy_domains: synchronize_sched for details.
731 *
732 * The domain tree of any CPU may only be accessed from within
733 * preempt-disabled sections.
734 */
735#define for_each_domain(cpu, __sd) \
736 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
737
738#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
739#define this_rq() (&__get_cpu_var(runqueues))
740#define task_rq(p) cpu_rq(task_cpu(p))
741#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
742#define raw_rq() (&__raw_get_cpu_var(runqueues))
743
744#ifdef CONFIG_CGROUP_SCHED
745
746/*
747 * Return the group to which this task belongs.
748 *
749 * We use task_subsys_state_check() and extend the RCU verification with
750 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
751 * task it moves into the cgroup. Therefore, by holding either of those locks,
752 * we pin the task to the current cgroup.
753 */
754static inline struct task_group *task_group(struct task_struct *p)
755{
756 struct task_group *tg;
757 struct cgroup_subsys_state *css;
758
759 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
760 lockdep_is_held(&p->pi_lock) ||
761 lockdep_is_held(&task_rq(p)->lock));
762 tg = container_of(css, struct task_group, css);
763
764 return autogroup_task_group(p, tg);
765}
766
767/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
768static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
769{
770#ifdef CONFIG_FAIR_GROUP_SCHED
771 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
772 p->se.parent = task_group(p)->se[cpu];
773#endif
774
775#ifdef CONFIG_RT_GROUP_SCHED
776 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
777 p->rt.parent = task_group(p)->rt_se[cpu];
778#endif
779}
780
781#else /* CONFIG_CGROUP_SCHED */
782
783static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
784static inline struct task_group *task_group(struct task_struct *p)
785{
786 return NULL;
787}
788
789#endif /* CONFIG_CGROUP_SCHED */
790 109
791static void update_rq_clock_task(struct rq *rq, s64 delta); 110static void update_rq_clock_task(struct rq *rq, s64 delta);
792 111
793static void update_rq_clock(struct rq *rq) 112void update_rq_clock(struct rq *rq)
794{ 113{
795 s64 delta; 114 s64 delta;
796 115
@@ -803,44 +122,14 @@ static void update_rq_clock(struct rq *rq)
803} 122}
804 123
805/* 124/*
806 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
807 */
808#ifdef CONFIG_SCHED_DEBUG
809# define const_debug __read_mostly
810#else
811# define const_debug static const
812#endif
813
814/**
815 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
816 * @cpu: the processor in question.
817 *
818 * This interface allows printk to be called with the runqueue lock
819 * held, and to know whether or not it is OK to wake up the klogd.
820 */
821int runqueue_is_locked(int cpu)
822{
823 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
824}
825
826/*
827 * Debugging: various feature bits 125 * Debugging: various feature bits
828 */ 126 */
829 127
830#define SCHED_FEAT(name, enabled) \ 128#define SCHED_FEAT(name, enabled) \
831 __SCHED_FEAT_##name ,
832
833enum {
834#include "sched_features.h"
835};
836
837#undef SCHED_FEAT
838
839#define SCHED_FEAT(name, enabled) \
840 (1UL << __SCHED_FEAT_##name) * enabled | 129 (1UL << __SCHED_FEAT_##name) * enabled |
841 130
842const_debug unsigned int sysctl_sched_features = 131const_debug unsigned int sysctl_sched_features =
843#include "sched_features.h" 132#include "features.h"
844 0; 133 0;
845 134
846#undef SCHED_FEAT 135#undef SCHED_FEAT
@@ -850,7 +139,7 @@ const_debug unsigned int sysctl_sched_features =
850 #name , 139 #name ,
851 140
852static __read_mostly char *sched_feat_names[] = { 141static __read_mostly char *sched_feat_names[] = {
853#include "sched_features.h" 142#include "features.h"
854 NULL 143 NULL
855}; 144};
856 145
@@ -860,7 +149,7 @@ static int sched_feat_show(struct seq_file *m, void *v)
860{ 149{
861 int i; 150 int i;
862 151
863 for (i = 0; sched_feat_names[i]; i++) { 152 for (i = 0; i < __SCHED_FEAT_NR; i++) {
864 if (!(sysctl_sched_features & (1UL << i))) 153 if (!(sysctl_sched_features & (1UL << i)))
865 seq_puts(m, "NO_"); 154 seq_puts(m, "NO_");
866 seq_printf(m, "%s ", sched_feat_names[i]); 155 seq_printf(m, "%s ", sched_feat_names[i]);
@@ -870,6 +159,36 @@ static int sched_feat_show(struct seq_file *m, void *v)
870 return 0; 159 return 0;
871} 160}
872 161
162#ifdef HAVE_JUMP_LABEL
163
164#define jump_label_key__true jump_label_key_enabled
165#define jump_label_key__false jump_label_key_disabled
166
167#define SCHED_FEAT(name, enabled) \
168 jump_label_key__##enabled ,
169
170struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
171#include "features.h"
172};
173
174#undef SCHED_FEAT
175
176static void sched_feat_disable(int i)
177{
178 if (jump_label_enabled(&sched_feat_keys[i]))
179 jump_label_dec(&sched_feat_keys[i]);
180}
181
182static void sched_feat_enable(int i)
183{
184 if (!jump_label_enabled(&sched_feat_keys[i]))
185 jump_label_inc(&sched_feat_keys[i]);
186}
187#else
188static void sched_feat_disable(int i) { };
189static void sched_feat_enable(int i) { };
190#endif /* HAVE_JUMP_LABEL */
191
873static ssize_t 192static ssize_t
874sched_feat_write(struct file *filp, const char __user *ubuf, 193sched_feat_write(struct file *filp, const char __user *ubuf,
875 size_t cnt, loff_t *ppos) 194 size_t cnt, loff_t *ppos)
@@ -893,17 +212,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
893 cmp += 3; 212 cmp += 3;
894 } 213 }
895 214
896 for (i = 0; sched_feat_names[i]; i++) { 215 for (i = 0; i < __SCHED_FEAT_NR; i++) {
897 if (strcmp(cmp, sched_feat_names[i]) == 0) { 216 if (strcmp(cmp, sched_feat_names[i]) == 0) {
898 if (neg) 217 if (neg) {
899 sysctl_sched_features &= ~(1UL << i); 218 sysctl_sched_features &= ~(1UL << i);
900 else 219 sched_feat_disable(i);
220 } else {
901 sysctl_sched_features |= (1UL << i); 221 sysctl_sched_features |= (1UL << i);
222 sched_feat_enable(i);
223 }
902 break; 224 break;
903 } 225 }
904 } 226 }
905 227
906 if (!sched_feat_names[i]) 228 if (i == __SCHED_FEAT_NR)
907 return -EINVAL; 229 return -EINVAL;
908 230
909 *ppos += cnt; 231 *ppos += cnt;
@@ -932,10 +254,7 @@ static __init int sched_init_debug(void)
932 return 0; 254 return 0;
933} 255}
934late_initcall(sched_init_debug); 256late_initcall(sched_init_debug);
935 257#endif /* CONFIG_SCHED_DEBUG */
936#endif
937
938#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
939 258
940/* 259/*
941 * Number of tasks to iterate in a single balance run. 260 * Number of tasks to iterate in a single balance run.
@@ -957,7 +276,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
957 */ 276 */
958unsigned int sysctl_sched_rt_period = 1000000; 277unsigned int sysctl_sched_rt_period = 1000000;
959 278
960static __read_mostly int scheduler_running; 279__read_mostly int scheduler_running;
961 280
962/* 281/*
963 * part of the period that we allow rt tasks to run in us. 282 * part of the period that we allow rt tasks to run in us.
@@ -965,112 +284,7 @@ static __read_mostly int scheduler_running;
965 */ 284 */
966int sysctl_sched_rt_runtime = 950000; 285int sysctl_sched_rt_runtime = 950000;
967 286
968static inline u64 global_rt_period(void)
969{
970 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
971}
972 287
973static inline u64 global_rt_runtime(void)
974{
975 if (sysctl_sched_rt_runtime < 0)
976 return RUNTIME_INF;
977
978 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
979}
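
For the defaults above this works out as follows: 950000us of runtime per 1000000us period caps realtime tasks at 95% of each period, leaving 5% for everything else, and a negative sysctl_sched_rt_runtime disables the cap via RUNTIME_INF. A quick check of the arithmetic, mirroring the us-to-ns conversion in the helpers above:

#include <stdio.h>

#define NSEC_PER_USEC 1000ULL

int main(void)
{
	unsigned long long period  = 1000000ULL * NSEC_PER_USEC; /* 1s    */
	unsigned long long runtime =  950000ULL * NSEC_PER_USEC; /* 0.95s */

	printf("rt share: %llu%%\n", runtime * 100 / period);	/* 95 */
	return 0;
}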
980
981#ifndef prepare_arch_switch
982# define prepare_arch_switch(next) do { } while (0)
983#endif
984#ifndef finish_arch_switch
985# define finish_arch_switch(prev) do { } while (0)
986#endif
987
988static inline int task_current(struct rq *rq, struct task_struct *p)
989{
990 return rq->curr == p;
991}
992
993static inline int task_running(struct rq *rq, struct task_struct *p)
994{
995#ifdef CONFIG_SMP
996 return p->on_cpu;
997#else
998 return task_current(rq, p);
999#endif
1000}
1001
1002#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1003static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1004{
1005#ifdef CONFIG_SMP
1006 /*
1007 * We can optimise this out completely for !SMP, because the
1008 * SMP rebalancing from interrupt is the only thing that cares
1009 * here.
1010 */
1011 next->on_cpu = 1;
1012#endif
1013}
1014
1015static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1016{
1017#ifdef CONFIG_SMP
1018 /*
1019 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1020 * We must ensure this doesn't happen until the switch is completely
1021 * finished.
1022 */
1023 smp_wmb();
1024 prev->on_cpu = 0;
1025#endif
1026#ifdef CONFIG_DEBUG_SPINLOCK
1027 /* this is a valid case when another task releases the spinlock */
1028 rq->lock.owner = current;
1029#endif
1030 /*
1031 * If we are tracking spinlock dependencies then we have to
1032 * fix up the runqueue lock - which gets 'carried over' from
1033 * prev into current:
1034 */
1035 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1036
1037 raw_spin_unlock_irq(&rq->lock);
1038}
1039
1040#else /* __ARCH_WANT_UNLOCKED_CTXSW */
1041static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1042{
1043#ifdef CONFIG_SMP
1044 /*
1045 * We can optimise this out completely for !SMP, because the
1046 * SMP rebalancing from interrupt is the only thing that cares
1047 * here.
1048 */
1049 next->on_cpu = 1;
1050#endif
1051#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1052 raw_spin_unlock_irq(&rq->lock);
1053#else
1054 raw_spin_unlock(&rq->lock);
1055#endif
1056}
1057
1058static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1059{
1060#ifdef CONFIG_SMP
1061 /*
1062 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1063 * We must ensure this doesn't happen until the switch is completely
1064 * finished.
1065 */
1066 smp_wmb();
1067 prev->on_cpu = 0;
1068#endif
1069#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1070 local_irq_enable();
1071#endif
1072}
1073#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1074 288
1075/* 289/*
1076 * __task_rq_lock - lock the rq @p resides on. 290 * __task_rq_lock - lock the rq @p resides on.
@@ -1153,20 +367,6 @@ static struct rq *this_rq_lock(void)
1153 * rq->lock. 367 * rq->lock.
1154 */ 368 */
1155 369
1156/*
1157 * Use hrtick when:
1158 * - enabled by features
1159 * - hrtimer is actually high res
1160 */
1161static inline int hrtick_enabled(struct rq *rq)
1162{
1163 if (!sched_feat(HRTICK))
1164 return 0;
1165 if (!cpu_active(cpu_of(rq)))
1166 return 0;
1167 return hrtimer_is_hres_active(&rq->hrtick_timer);
1168}
1169
1170static void hrtick_clear(struct rq *rq) 370static void hrtick_clear(struct rq *rq)
1171{ 371{
1172 if (hrtimer_active(&rq->hrtick_timer)) 372 if (hrtimer_active(&rq->hrtick_timer))
@@ -1210,7 +410,7 @@ static void __hrtick_start(void *arg)
1210 * 410 *
1211 * called with rq->lock held and irqs disabled 411 * called with rq->lock held and irqs disabled
1212 */ 412 */
1213static void hrtick_start(struct rq *rq, u64 delay) 413void hrtick_start(struct rq *rq, u64 delay)
1214{ 414{
1215 struct hrtimer *timer = &rq->hrtick_timer; 415 struct hrtimer *timer = &rq->hrtick_timer;
1216 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 416 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
@@ -1254,7 +454,7 @@ static __init void init_hrtick(void)
1254 * 454 *
1255 * called with rq->lock held and irqs disabled 455 * called with rq->lock held and irqs disabled
1256 */ 456 */
1257static void hrtick_start(struct rq *rq, u64 delay) 457void hrtick_start(struct rq *rq, u64 delay)
1258{ 458{
1259 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 459 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1260 HRTIMER_MODE_REL_PINNED, 0); 460 HRTIMER_MODE_REL_PINNED, 0);
@@ -1305,7 +505,7 @@ static inline void init_hrtick(void)
1305#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 505#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1306#endif 506#endif
1307 507
1308static void resched_task(struct task_struct *p) 508void resched_task(struct task_struct *p)
1309{ 509{
1310 int cpu; 510 int cpu;
1311 511
@@ -1326,7 +526,7 @@ static void resched_task(struct task_struct *p)
1326 smp_send_reschedule(cpu); 526 smp_send_reschedule(cpu);
1327} 527}
1328 528
1329static void resched_cpu(int cpu) 529void resched_cpu(int cpu)
1330{ 530{
1331 struct rq *rq = cpu_rq(cpu); 531 struct rq *rq = cpu_rq(cpu);
1332 unsigned long flags; 532 unsigned long flags;
@@ -1407,7 +607,8 @@ void wake_up_idle_cpu(int cpu)
1407 607
1408static inline bool got_nohz_idle_kick(void) 608static inline bool got_nohz_idle_kick(void)
1409{ 609{
1410 return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; 610 int cpu = smp_processor_id();
611 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
1411} 612}
1412 613
1413#else /* CONFIG_NO_HZ */ 614#else /* CONFIG_NO_HZ */
@@ -1419,12 +620,7 @@ static inline bool got_nohz_idle_kick(void)
1419 620
1420#endif /* CONFIG_NO_HZ */ 621#endif /* CONFIG_NO_HZ */
1421 622
1422static u64 sched_avg_period(void) 623void sched_avg_update(struct rq *rq)
1423{
1424 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1425}
1426
1427static void sched_avg_update(struct rq *rq)
1428{ 624{
1429 s64 period = sched_avg_period(); 625 s64 period = sched_avg_period();
1430 626
@@ -1440,193 +636,23 @@ static void sched_avg_update(struct rq *rq)
1440 } 636 }
1441} 637}
1442 638
1443static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1444{
1445 rq->rt_avg += rt_delta;
1446 sched_avg_update(rq);
1447}
1448
1449#else /* !CONFIG_SMP */ 639#else /* !CONFIG_SMP */
1450static void resched_task(struct task_struct *p) 640void resched_task(struct task_struct *p)
1451{ 641{
1452 assert_raw_spin_locked(&task_rq(p)->lock); 642 assert_raw_spin_locked(&task_rq(p)->lock);
1453 set_tsk_need_resched(p); 643 set_tsk_need_resched(p);
1454} 644}
1455
1456static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1457{
1458}
1459
1460static void sched_avg_update(struct rq *rq)
1461{
1462}
1463#endif /* CONFIG_SMP */ 645#endif /* CONFIG_SMP */
1464 646
1465#if BITS_PER_LONG == 32
1466# define WMULT_CONST (~0UL)
1467#else
1468# define WMULT_CONST (1UL << 32)
1469#endif
1470
1471#define WMULT_SHIFT 32
1472
1473/*
1474 * Shift right and round:
1475 */
1476#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1477
1478/*
1479 * delta *= weight / lw
1480 */
1481static unsigned long
1482calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1483 struct load_weight *lw)
1484{
1485 u64 tmp;
1486
1487 /*
1488 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1489 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1490 * 2^SCHED_LOAD_RESOLUTION.
1491 */
1492 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1493 tmp = (u64)delta_exec * scale_load_down(weight);
1494 else
1495 tmp = (u64)delta_exec;
1496
1497 if (!lw->inv_weight) {
1498 unsigned long w = scale_load_down(lw->weight);
1499
1500 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1501 lw->inv_weight = 1;
1502 else if (unlikely(!w))
1503 lw->inv_weight = WMULT_CONST;
1504 else
1505 lw->inv_weight = WMULT_CONST / w;
1506 }
1507
1508 /*
1509 * Check whether we'd overflow the 64-bit multiplication:
1510 */
1511 if (unlikely(tmp > WMULT_CONST))
1512 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1513 WMULT_SHIFT/2);
1514 else
1515 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1516
1517 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1518}
1519
1520static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1521{
1522 lw->weight += inc;
1523 lw->inv_weight = 0;
1524}
1525
1526static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1527{
1528 lw->weight -= dec;
1529 lw->inv_weight = 0;
1530}
1531
1532static inline void update_load_set(struct load_weight *lw, unsigned long w)
1533{
1534 lw->weight = w;
1535 lw->inv_weight = 0;
1536}
1537
1538/*
1539 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1540 * of tasks with abnormal "nice" values across CPUs, the contribution that
1541 * each task makes to its run queue's load is weighted according to its
1542 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1543 * scaled version of the new time slice allocation that they receive on time
1544 * slice expiry etc.
1545 */
1546
1547#define WEIGHT_IDLEPRIO 3
1548#define WMULT_IDLEPRIO 1431655765
1549
1550/*
1551 * Nice levels are multiplicative, with a gentle 10% change for every
1552 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1553 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1554 * that remained on nice 0.
1555 *
1556 * The "10% effect" is relative and cumulative: from _any_ nice level,
1557 * if you go up 1 level, it's -10% CPU usage; if you go down 1 level,
1558 * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25:
1559 * if a task goes up by ~10% and another task goes down by ~10%, then
1560 * the relative distance between them is ~25%.)
1561 */
1562static const int prio_to_weight[40] = {
1563 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1564 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1565 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1566 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1567 /* 0 */ 1024, 820, 655, 526, 423,
1568 /* 5 */ 335, 272, 215, 172, 137,
1569 /* 10 */ 110, 87, 70, 56, 45,
1570 /* 15 */ 36, 29, 23, 18, 15,
1571};
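
To make the table concrete, consider two CPU-bound tasks at nice 0 and nice 1 sharing one CPU: CFS distributes time in proportion to the weights 1024 and 820, which is exactly the ~10% penalty and ~1.25 step ratio described above. A worked example:

#include <stdio.h>

static const int weight[] = { 1024, 820 };	/* nice 0 and nice 1 */

int main(void)
{
	double total = weight[0] + weight[1];

	printf("nice 0 gets %.1f%% of the CPU\n", 100.0 * weight[0] / total);
	printf("nice 1 gets %.1f%% of the CPU\n", 100.0 * weight[1] / total);
	printf("step ratio: %.3f\n", (double)weight[0] / weight[1]); /* ~1.25 */
	return 0;
}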
1572
1573/*
1574 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1575 *
1576 * In cases where the weight does not change often, we can use the
1577 * precalculated inverse to speed up arithmetics by turning divisions
1578 * into multiplications:
1579 */
1580static const u32 prio_to_wmult[40] = {
1581 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1582 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1583 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1584 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1585 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1586 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1587 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1588 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1589};
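
The payoff is in calc_delta_mine() above: a division by a weight becomes a multiply by the precomputed 2^32/weight followed by a 32-bit shift, as in the vruntime scaling delta * NICE_0_LOAD / weight. A sketch of the arithmetic with values picked purely for illustration; the kernel additionally rounds and guards against 64-bit overflow with SRR():

#include <stdio.h>
#include <stdint.h>

#define WMULT_SHIFT 32

int main(void)
{
	uint64_t delta = 4000000;	/* ns of runtime, say */
	uint32_t weight = 820;		/* nice 1 */
	uint32_t inv = 5237765;		/* prio_to_wmult[] entry: 2^32/820 */

	/* delta * 1024 / weight, division-free: */
	uint64_t scaled = (delta * 1024 * (uint64_t)inv) >> WMULT_SHIFT;

	printf("scaled: %llu (exact: %llu)\n",	/* equal to within rounding */
	       (unsigned long long)scaled,
	       (unsigned long long)(delta * 1024 / weight));
	return 0;
}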
1590
1591/* Time spent by the tasks of the cpu accounting group executing in ... */
1592enum cpuacct_stat_index {
1593 CPUACCT_STAT_USER, /* ... user mode */
1594 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1595
1596 CPUACCT_STAT_NSTATS,
1597};
1598
1599#ifdef CONFIG_CGROUP_CPUACCT
1600static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1601static void cpuacct_update_stats(struct task_struct *tsk,
1602 enum cpuacct_stat_index idx, cputime_t val);
1603#else
1604static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1605static inline void cpuacct_update_stats(struct task_struct *tsk,
1606 enum cpuacct_stat_index idx, cputime_t val) {}
1607#endif
1608
1609static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1610{
1611 update_load_add(&rq->load, load);
1612}
1613
1614static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1615{
1616 update_load_sub(&rq->load, load);
1617}
1618
1619#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 647#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1620 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 648 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1621typedef int (*tg_visitor)(struct task_group *, void *);
1622
1623/* 649/*
1624 * Iterate task_group tree rooted at *from, calling @down when first entering a 650 * Iterate task_group tree rooted at *from, calling @down when first entering a
1625 * node and @up when leaving it for the final time. 651 * node and @up when leaving it for the final time.
1626 * 652 *
1627 * Caller must hold rcu_lock or sufficient equivalent. 653 * Caller must hold rcu_lock or sufficient equivalent.
1628 */ 654 */
1629static int walk_tg_tree_from(struct task_group *from, 655int walk_tg_tree_from(struct task_group *from,
1630 tg_visitor down, tg_visitor up, void *data) 656 tg_visitor down, tg_visitor up, void *data)
1631{ 657{
1632 struct task_group *parent, *child; 658 struct task_group *parent, *child;
@@ -1657,270 +683,13 @@ out:
1657 return ret; 683 return ret;
1658} 684}
1659 685
1660/* 686int tg_nop(struct task_group *tg, void *data)
1661 * Iterate the full tree, calling @down when first entering a node and @up when
1662 * leaving it for the final time.
1663 *
1664 * Caller must hold rcu_lock or sufficient equivalent.
1665 */
1666
1667static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1668{
1669 return walk_tg_tree_from(&root_task_group, down, up, data);
1670}
1671
1672static int tg_nop(struct task_group *tg, void *data)
1673{ 687{
1674 return 0; 688 return 0;
1675} 689}
1676#endif 690#endif
1677 691
1678#ifdef CONFIG_SMP 692void update_cpu_load(struct rq *this_rq);
1679/* Used instead of source_load when we know the type == 0 */
1680static unsigned long weighted_cpuload(const int cpu)
1681{
1682 return cpu_rq(cpu)->load.weight;
1683}
1684
1685/*
1686 * Return a low guess at the load of a migration-source cpu weighted
1687 * according to the scheduling class and "nice" value.
1688 *
1689 * We want to under-estimate the load of migration sources, to
1690 * balance conservatively.
1691 */
1692static unsigned long source_load(int cpu, int type)
1693{
1694 struct rq *rq = cpu_rq(cpu);
1695 unsigned long total = weighted_cpuload(cpu);
1696
1697 if (type == 0 || !sched_feat(LB_BIAS))
1698 return total;
1699
1700 return min(rq->cpu_load[type-1], total);
1701}
1702
1703/*
1704 * Return a high guess at the load of a migration-target cpu weighted
1705 * according to the scheduling class and "nice" value.
1706 */
1707static unsigned long target_load(int cpu, int type)
1708{
1709 struct rq *rq = cpu_rq(cpu);
1710 unsigned long total = weighted_cpuload(cpu);
1711
1712 if (type == 0 || !sched_feat(LB_BIAS))
1713 return total;
1714
1715 return max(rq->cpu_load[type-1], total);
1716}
1717
1718static unsigned long power_of(int cpu)
1719{
1720 return cpu_rq(cpu)->cpu_power;
1721}
1722
1723static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1724
1725static unsigned long cpu_avg_load_per_task(int cpu)
1726{
1727 struct rq *rq = cpu_rq(cpu);
1728 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1729
1730 if (nr_running)
1731 return rq->load.weight / nr_running;
1732
1733 return 0;
1734}
1735
1736#ifdef CONFIG_PREEMPT
1737
1738static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1739
1740/*
1741 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1742 * way at the expense of forcing extra atomic operations in all
1743 * invocations. This assures that the double_lock is acquired using the
1744 * same underlying policy as the spinlock_t on this architecture, which
1745 * reduces latency compared to the unfair variant below. However, it
1746 * also adds more overhead and therefore may reduce throughput.
1747 */
1748static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1749 __releases(this_rq->lock)
1750 __acquires(busiest->lock)
1751 __acquires(this_rq->lock)
1752{
1753 raw_spin_unlock(&this_rq->lock);
1754 double_rq_lock(this_rq, busiest);
1755
1756 return 1;
1757}
1758
1759#else
1760/*
1761 * Unfair double_lock_balance: Optimizes throughput at the expense of
1762 * latency by eliminating extra atomic operations when the locks are
1763 * already in proper order on entry. This favors lower cpu-ids and will
1764 * grant the double lock to lower cpus over higher ids under contention,
1765 * regardless of entry order into the function.
1766 */
1767static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1768 __releases(this_rq->lock)
1769 __acquires(busiest->lock)
1770 __acquires(this_rq->lock)
1771{
1772 int ret = 0;
1773
1774 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1775 if (busiest < this_rq) {
1776 raw_spin_unlock(&this_rq->lock);
1777 raw_spin_lock(&busiest->lock);
1778 raw_spin_lock_nested(&this_rq->lock,
1779 SINGLE_DEPTH_NESTING);
1780 ret = 1;
1781 } else
1782 raw_spin_lock_nested(&busiest->lock,
1783 SINGLE_DEPTH_NESTING);
1784 }
1785 return ret;
1786}
1787
1788#endif /* CONFIG_PREEMPT */
1789
1790/*
1791 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1792 */
1793static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1794{
1795 if (unlikely(!irqs_disabled())) {
1796 /* printk() doesn't work well under rq->lock */
1797 raw_spin_unlock(&this_rq->lock);
1798 BUG_ON(1);
1799 }
1800
1801 return _double_lock_balance(this_rq, busiest);
1802}
1803
1804static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 __releases(busiest->lock)
1806{
1807 raw_spin_unlock(&busiest->lock);
1808 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1809}
1810
1811/*
1812 * double_rq_lock - safely lock two runqueues
1813 *
1814 * Note this does not disable interrupts like task_rq_lock,
1815 * you need to do so manually before calling.
1816 */
1817static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1818 __acquires(rq1->lock)
1819 __acquires(rq2->lock)
1820{
1821 BUG_ON(!irqs_disabled());
1822 if (rq1 == rq2) {
1823 raw_spin_lock(&rq1->lock);
1824 __acquire(rq2->lock); /* Fake it out ;) */
1825 } else {
1826 if (rq1 < rq2) {
1827 raw_spin_lock(&rq1->lock);
1828 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1829 } else {
1830 raw_spin_lock(&rq2->lock);
1831 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1832 }
1833 }
1834}
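
Both variants enforce one invariant: any two runqueue locks are always taken in ascending address order, so two CPUs locking the same pair can never cross and deadlock. Stripped of the rq specifics, the pattern reduces to the sketch below; comparing unrelated pointers with < is technically implementation-defined in C, but it is exactly what the rq1 < rq2 test above does:

#include <pthread.h>

static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);	/* same queue: lock only once */
		return;
	}
	if (a < b) {			/* order by address */
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

int main(void)
{
	static pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	lock_pair(&m1, &m2);
	unlock_pair(&m1, &m2);
	return 0;
}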
1835
1836/*
1837 * double_rq_unlock - safely unlock two runqueues
1838 *
1839 * Note this does not restore interrupts like task_rq_unlock,
1840 * you need to do so manually after calling.
1841 */
1842static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1843 __releases(rq1->lock)
1844 __releases(rq2->lock)
1845{
1846 raw_spin_unlock(&rq1->lock);
1847 if (rq1 != rq2)
1848 raw_spin_unlock(&rq2->lock);
1849 else
1850 __release(rq2->lock);
1851}
1852
1853#else /* CONFIG_SMP */
1854
1855/*
1856 * double_rq_lock - safely lock two runqueues
1857 *
1858 * Note this does not disable interrupts like task_rq_lock,
1859 * you need to do so manually before calling.
1860 */
1861static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1862 __acquires(rq1->lock)
1863 __acquires(rq2->lock)
1864{
1865 BUG_ON(!irqs_disabled());
1866 BUG_ON(rq1 != rq2);
1867 raw_spin_lock(&rq1->lock);
1868 __acquire(rq2->lock); /* Fake it out ;) */
1869}
1870
1871/*
1872 * double_rq_unlock - safely unlock two runqueues
1873 *
1874 * Note this does not restore interrupts like task_rq_unlock,
1875 * you need to do so manually after calling.
1876 */
1877static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1878 __releases(rq1->lock)
1879 __releases(rq2->lock)
1880{
1881 BUG_ON(rq1 != rq2);
1882 raw_spin_unlock(&rq1->lock);
1883 __release(rq2->lock);
1884}
1885
1886#endif
1887
1888static void calc_load_account_idle(struct rq *this_rq);
1889static void update_sysctl(void);
1890static int get_update_sysctl_factor(void);
1891static void update_cpu_load(struct rq *this_rq);
1892
1893static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1894{
1895 set_task_rq(p, cpu);
1896#ifdef CONFIG_SMP
1897 /*
1898 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1899 * successfully executed on another CPU. We must ensure that updates of
1900 * per-task data have been completed by this moment.
1901 */
1902 smp_wmb();
1903 task_thread_info(p)->cpu = cpu;
1904#endif
1905}
1906
1907static const struct sched_class rt_sched_class;
1908
1909#define sched_class_highest (&stop_sched_class)
1910#define for_each_class(class) \
1911 for (class = sched_class_highest; class; class = class->next)
1912
1913#include "sched_stats.h"
1914
1915static void inc_nr_running(struct rq *rq)
1916{
1917 rq->nr_running++;
1918}
1919
1920static void dec_nr_running(struct rq *rq)
1921{
1922 rq->nr_running--;
1923}
1924 693
1925static void set_load_weight(struct task_struct *p) 694static void set_load_weight(struct task_struct *p)
1926{ 695{
@@ -1957,7 +726,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1957/* 726/*
1958 * activate_task - move a task to the runqueue. 727 * activate_task - move a task to the runqueue.
1959 */ 728 */
1960static void activate_task(struct rq *rq, struct task_struct *p, int flags) 729void activate_task(struct rq *rq, struct task_struct *p, int flags)
1961{ 730{
1962 if (task_contributes_to_load(p)) 731 if (task_contributes_to_load(p))
1963 rq->nr_uninterruptible--; 732 rq->nr_uninterruptible--;
@@ -1968,7 +737,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1968/* 737/*
1969 * deactivate_task - remove a task from the runqueue. 738 * deactivate_task - remove a task from the runqueue.
1970 */ 739 */
1971static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 740void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1972{ 741{
1973 if (task_contributes_to_load(p)) 742 if (task_contributes_to_load(p))
1974 rq->nr_uninterruptible++; 743 rq->nr_uninterruptible++;
@@ -2159,14 +928,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
2159#ifdef CONFIG_IRQ_TIME_ACCOUNTING 928#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2160static int irqtime_account_hi_update(void) 929static int irqtime_account_hi_update(void)
2161{ 930{
2162 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 931 u64 *cpustat = kcpustat_this_cpu->cpustat;
2163 unsigned long flags; 932 unsigned long flags;
2164 u64 latest_ns; 933 u64 latest_ns;
2165 int ret = 0; 934 int ret = 0;
2166 935
2167 local_irq_save(flags); 936 local_irq_save(flags);
2168 latest_ns = this_cpu_read(cpu_hardirq_time); 937 latest_ns = this_cpu_read(cpu_hardirq_time);
2169 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) 938 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
2170 ret = 1; 939 ret = 1;
2171 local_irq_restore(flags); 940 local_irq_restore(flags);
2172 return ret; 941 return ret;
@@ -2174,14 +943,14 @@ static int irqtime_account_hi_update(void)
2174 943
2175static int irqtime_account_si_update(void) 944static int irqtime_account_si_update(void)
2176{ 945{
2177 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 946 u64 *cpustat = kcpustat_this_cpu->cpustat;
2178 unsigned long flags; 947 unsigned long flags;
2179 u64 latest_ns; 948 u64 latest_ns;
2180 int ret = 0; 949 int ret = 0;
2181 950
2182 local_irq_save(flags); 951 local_irq_save(flags);
2183 latest_ns = this_cpu_read(cpu_softirq_time); 952 latest_ns = this_cpu_read(cpu_softirq_time);
2184 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) 953 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
2185 ret = 1; 954 ret = 1;
2186 local_irq_restore(flags); 955 local_irq_restore(flags);
2187 return ret; 956 return ret;
@@ -2193,15 +962,6 @@ static int irqtime_account_si_update(void)
2193 962
2194#endif 963#endif
2195 964
2196#include "sched_idletask.c"
2197#include "sched_fair.c"
2198#include "sched_rt.c"
2199#include "sched_autogroup.c"
2200#include "sched_stoptask.c"
2201#ifdef CONFIG_SCHED_DEBUG
2202# include "sched_debug.c"
2203#endif
2204
2205void sched_set_stop_task(int cpu, struct task_struct *stop) 965void sched_set_stop_task(int cpu, struct task_struct *stop)
2206{ 966{
2207 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 967 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -2299,7 +1059,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2299 p->sched_class->prio_changed(rq, p, oldprio); 1059 p->sched_class->prio_changed(rq, p, oldprio);
2300} 1060}
2301 1061
2302static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 1062void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2303{ 1063{
2304 const struct sched_class *class; 1064 const struct sched_class *class;
2305 1065
@@ -2325,38 +1085,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2325} 1085}
2326 1086
2327#ifdef CONFIG_SMP 1087#ifdef CONFIG_SMP
2328/*
2329 * Is this task likely cache-hot:
2330 */
2331static int
2332task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2333{
2334 s64 delta;
2335
2336 if (p->sched_class != &fair_sched_class)
2337 return 0;
2338
2339 if (unlikely(p->policy == SCHED_IDLE))
2340 return 0;
2341
2342 /*
2343 * Buddy candidates are cache hot:
2344 */
2345 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2346 (&p->se == cfs_rq_of(&p->se)->next ||
2347 &p->se == cfs_rq_of(&p->se)->last))
2348 return 1;
2349
2350 if (sysctl_sched_migration_cost == -1)
2351 return 1;
2352 if (sysctl_sched_migration_cost == 0)
2353 return 0;
2354
2355 delta = now - p->se.exec_start;
2356
2357 return delta < (s64)sysctl_sched_migration_cost;
2358}
2359
2360void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1088void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2361{ 1089{
2362#ifdef CONFIG_SCHED_DEBUG 1090#ifdef CONFIG_SCHED_DEBUG
@@ -2783,6 +1511,11 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2783 1511
2784} 1512}
2785#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1513#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1514
1515static inline int ttwu_share_cache(int this_cpu, int that_cpu)
1516{
1517 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1518}
2786#endif /* CONFIG_SMP */ 1519#endif /* CONFIG_SMP */
2787 1520
2788static void ttwu_queue(struct task_struct *p, int cpu) 1521static void ttwu_queue(struct task_struct *p, int cpu)
@@ -2790,7 +1523,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
2790 struct rq *rq = cpu_rq(cpu); 1523 struct rq *rq = cpu_rq(cpu);
2791 1524
2792#if defined(CONFIG_SMP) 1525#if defined(CONFIG_SMP)
2793 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { 1526 if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
2794 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1527 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2795 ttwu_queue_remote(p, cpu); 1528 ttwu_queue_remote(p, cpu);
2796 return; 1529 return;
@@ -3204,6 +1937,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3204 local_irq_enable(); 1937 local_irq_enable();
3205#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1938#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3206 finish_lock_switch(rq, prev); 1939 finish_lock_switch(rq, prev);
1940 trace_sched_stat_sleeptime(current, rq->clock);
3207 1941
3208 fire_sched_in_preempt_notifiers(current); 1942 fire_sched_in_preempt_notifiers(current);
3209 if (mm) 1943 if (mm)
@@ -3439,7 +2173,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
3439 */ 2173 */
3440static atomic_long_t calc_load_tasks_idle; 2174static atomic_long_t calc_load_tasks_idle;
3441 2175
3442static void calc_load_account_idle(struct rq *this_rq) 2176void calc_load_account_idle(struct rq *this_rq)
3443{ 2177{
3444 long delta; 2178 long delta;
3445 2179
@@ -3583,7 +2317,7 @@ static void calc_global_nohz(unsigned long ticks)
3583 */ 2317 */
3584} 2318}
3585#else 2319#else
3586static void calc_load_account_idle(struct rq *this_rq) 2320void calc_load_account_idle(struct rq *this_rq)
3587{ 2321{
3588} 2322}
3589 2323
@@ -3726,7 +2460,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3726 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2460 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3727 * every tick. We fix it up based on jiffies. 2461 * every tick. We fix it up based on jiffies.
3728 */ 2462 */
3729static void update_cpu_load(struct rq *this_rq) 2463void update_cpu_load(struct rq *this_rq)
3730{ 2464{
3731 unsigned long this_load = this_rq->load.weight; 2465 unsigned long this_load = this_rq->load.weight;
3732 unsigned long curr_jiffies = jiffies; 2466 unsigned long curr_jiffies = jiffies;
@@ -3804,8 +2538,10 @@ unlock:
3804#endif 2538#endif
3805 2539
3806DEFINE_PER_CPU(struct kernel_stat, kstat); 2540DEFINE_PER_CPU(struct kernel_stat, kstat);
2541DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
3807 2542
3808EXPORT_PER_CPU_SYMBOL(kstat); 2543EXPORT_PER_CPU_SYMBOL(kstat);
2544EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3809 2545
3810/* 2546/*
3811 * Return any ns on the sched_clock that have not yet been accounted in 2547 * Return any ns on the sched_clock that have not yet been accounted in
@@ -3858,6 +2594,42 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3858 return ns; 2594 return ns;
3859} 2595}
3860 2596
2597#ifdef CONFIG_CGROUP_CPUACCT
2598struct cgroup_subsys cpuacct_subsys;
2599struct cpuacct root_cpuacct;
2600#endif
2601
2602static inline void task_group_account_field(struct task_struct *p, int index,
2603 u64 tmp)
2604{
2605#ifdef CONFIG_CGROUP_CPUACCT
2606 struct kernel_cpustat *kcpustat;
2607 struct cpuacct *ca;
2608#endif
2609 /*
2610 * Since all updates are sure to touch the root cgroup, we
2611 * go ahead and touch it first. If the root cgroup
2612 * is the only cgroup, then nothing else should be necessary.
2613 */
2615 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2616
2617#ifdef CONFIG_CGROUP_CPUACCT
2618 if (unlikely(!cpuacct_subsys.active))
2619 return;
2620
2621 rcu_read_lock();
2622 ca = task_ca(p);
2623 while (ca && (ca != &root_cpuacct)) {
2624 kcpustat = this_cpu_ptr(ca->cpustat);
2625 kcpustat->cpustat[index] += tmp;
2626 ca = parent_ca(ca);
2627 }
2628 rcu_read_unlock();
2629#endif
2630}
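
The walk above can stop at root_cpuacct because the root was already charged unconditionally at the top. Reduced to generic, compile-able C, with struct acct_group standing in for struct cpuacct and a plain parent pointer standing in for parent_ca():

#include <stdint.h>

struct acct_group {
	uint64_t stat[4];
	struct acct_group *parent;	/* NULL for the root */
};

static struct acct_group root_group;

static void account_field(struct acct_group *leaf, int idx, uint64_t val)
{
	struct acct_group *g;

	root_group.stat[idx] += val;	/* the root always pays */
	for (g = leaf; g && g != &root_group; g = g->parent)
		g->stat[idx] += val;	/* then the chain below it */
}

int main(void)
{
	struct acct_group leaf = { .parent = &root_group };

	account_field(&leaf, 0, 42);	/* charges leaf and root */
	return 0;
}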
2631
2632
3861/* 2633/*
3862 * Account user cpu time to a process. 2634 * Account user cpu time to a process.
3863 * @p: the process that the cpu time gets accounted to 2635 * @p: the process that the cpu time gets accounted to
@@ -3867,22 +2639,18 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3867void account_user_time(struct task_struct *p, cputime_t cputime, 2639void account_user_time(struct task_struct *p, cputime_t cputime,
3868 cputime_t cputime_scaled) 2640 cputime_t cputime_scaled)
3869{ 2641{
3870 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2642 int index;
3871 cputime64_t tmp;
3872 2643
3873 /* Add user time to process. */ 2644 /* Add user time to process. */
3874 p->utime = cputime_add(p->utime, cputime); 2645 p->utime += cputime;
3875 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 2646 p->utimescaled += cputime_scaled;
3876 account_group_user_time(p, cputime); 2647 account_group_user_time(p, cputime);
3877 2648
2649 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2650
3878 /* Add user time to cpustat. */ 2651 /* Add user time to cpustat. */
3879 tmp = cputime_to_cputime64(cputime); 2652 task_group_account_field(p, index, (__force u64) cputime);
3880 if (TASK_NICE(p) > 0)
3881 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3882 else
3883 cpustat->user = cputime64_add(cpustat->user, tmp);
3884 2653
3885 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3886 /* Account for user time used */ 2654 /* Account for user time used */
3887 acct_update_integrals(p); 2655 acct_update_integrals(p);
3888} 2656}
@@ -3896,24 +2664,21 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
3896static void account_guest_time(struct task_struct *p, cputime_t cputime, 2664static void account_guest_time(struct task_struct *p, cputime_t cputime,
3897 cputime_t cputime_scaled) 2665 cputime_t cputime_scaled)
3898{ 2666{
3899 cputime64_t tmp; 2667 u64 *cpustat = kcpustat_this_cpu->cpustat;
3900 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3901
3902 tmp = cputime_to_cputime64(cputime);
3903 2668
3904 /* Add guest time to process. */ 2669 /* Add guest time to process. */
3905 p->utime = cputime_add(p->utime, cputime); 2670 p->utime += cputime;
3906 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 2671 p->utimescaled += cputime_scaled;
3907 account_group_user_time(p, cputime); 2672 account_group_user_time(p, cputime);
3908 p->gtime = cputime_add(p->gtime, cputime); 2673 p->gtime += cputime;
3909 2674
3910 /* Add guest time to cpustat. */ 2675 /* Add guest time to cpustat. */
3911 if (TASK_NICE(p) > 0) { 2676 if (TASK_NICE(p) > 0) {
3912 cpustat->nice = cputime64_add(cpustat->nice, tmp); 2677 cpustat[CPUTIME_NICE] += (__force u64) cputime;
3913 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 2678 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
3914 } else { 2679 } else {
3915 cpustat->user = cputime64_add(cpustat->user, tmp); 2680 cpustat[CPUTIME_USER] += (__force u64) cputime;
3916 cpustat->guest = cputime64_add(cpustat->guest, tmp); 2681 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
3917 } 2682 }
3918} 2683}
3919 2684
@@ -3926,18 +2691,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3926 */ 2691 */
3927static inline 2692static inline
3928void __account_system_time(struct task_struct *p, cputime_t cputime, 2693void __account_system_time(struct task_struct *p, cputime_t cputime,
3929 cputime_t cputime_scaled, cputime64_t *target_cputime64) 2694 cputime_t cputime_scaled, int index)
3930{ 2695{
3931 cputime64_t tmp = cputime_to_cputime64(cputime);
3932
3933 /* Add system time to process. */ 2696 /* Add system time to process. */
3934 p->stime = cputime_add(p->stime, cputime); 2697 p->stime += cputime;
3935 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 2698 p->stimescaled += cputime_scaled;
3936 account_group_system_time(p, cputime); 2699 account_group_system_time(p, cputime);
3937 2700
3938 /* Add system time to cpustat. */ 2701 /* Add system time to cpustat. */
3939 *target_cputime64 = cputime64_add(*target_cputime64, tmp); 2702 task_group_account_field(p, index, (__force u64) cputime);
3940 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3941 2703
3942 /* Account for system time used */ 2704 /* Account for system time used */
3943 acct_update_integrals(p); 2705 acct_update_integrals(p);
@@ -3953,8 +2715,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
3953void account_system_time(struct task_struct *p, int hardirq_offset, 2715void account_system_time(struct task_struct *p, int hardirq_offset,
3954 cputime_t cputime, cputime_t cputime_scaled) 2716 cputime_t cputime, cputime_t cputime_scaled)
3955{ 2717{
3956 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2718 int index;
3957 cputime64_t *target_cputime64;
3958 2719
3959 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 2720 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3960 account_guest_time(p, cputime, cputime_scaled); 2721 account_guest_time(p, cputime, cputime_scaled);
@@ -3962,13 +2723,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3962 } 2723 }
3963 2724
3964 if (hardirq_count() - hardirq_offset) 2725 if (hardirq_count() - hardirq_offset)
3965 target_cputime64 = &cpustat->irq; 2726 index = CPUTIME_IRQ;
3966 else if (in_serving_softirq()) 2727 else if (in_serving_softirq())
3967 target_cputime64 = &cpustat->softirq; 2728 index = CPUTIME_SOFTIRQ;
3968 else 2729 else
3969 target_cputime64 = &cpustat->system; 2730 index = CPUTIME_SYSTEM;
3970 2731
3971 __account_system_time(p, cputime, cputime_scaled, target_cputime64); 2732 __account_system_time(p, cputime, cputime_scaled, index);
3972} 2733}
3973 2734
3974/* 2735/*
@@ -3977,10 +2738,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3977 */ 2738 */
3978void account_steal_time(cputime_t cputime) 2739void account_steal_time(cputime_t cputime)
3979{ 2740{
3980 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2741 u64 *cpustat = kcpustat_this_cpu->cpustat;
3981 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3982 2742
3983 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 2743 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
3984} 2744}
3985 2745
3986/* 2746/*
@@ -3989,14 +2749,13 @@ void account_steal_time(cputime_t cputime)
3989 */ 2749 */
3990void account_idle_time(cputime_t cputime) 2750void account_idle_time(cputime_t cputime)
3991{ 2751{
3992 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2752 u64 *cpustat = kcpustat_this_cpu->cpustat;
3993 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3994 struct rq *rq = this_rq(); 2753 struct rq *rq = this_rq();
3995 2754
3996 if (atomic_read(&rq->nr_iowait) > 0) 2755 if (atomic_read(&rq->nr_iowait) > 0)
3997 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 2756 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
3998 else 2757 else
3999 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 2758 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
4000} 2759}
4001 2760
4002static __always_inline bool steal_account_process_tick(void) 2761static __always_inline bool steal_account_process_tick(void)
@@ -4046,16 +2805,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4046 struct rq *rq) 2805 struct rq *rq)
4047{ 2806{
4048 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 2807 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
4049 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 2808 u64 *cpustat = kcpustat_this_cpu->cpustat;
4050 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4051 2809
4052 if (steal_account_process_tick()) 2810 if (steal_account_process_tick())
4053 return; 2811 return;
4054 2812
4055 if (irqtime_account_hi_update()) { 2813 if (irqtime_account_hi_update()) {
4056 cpustat->irq = cputime64_add(cpustat->irq, tmp); 2814 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
4057 } else if (irqtime_account_si_update()) { 2815 } else if (irqtime_account_si_update()) {
4058 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 2816 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
4059 } else if (this_cpu_ksoftirqd() == p) { 2817 } else if (this_cpu_ksoftirqd() == p) {
4060 /* 2818 /*
4061 * ksoftirqd time does not get accounted in cpu_softirq_time. 2819
@@ -4063,7 +2821,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4063 * Also, p->stime needs to be updated for ksoftirqd. 2821 * Also, p->stime needs to be updated for ksoftirqd.
4064 */ 2822 */
4065 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2823 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4066 &cpustat->softirq); 2824 CPUTIME_SOFTIRQ);
4067 } else if (user_tick) { 2825 } else if (user_tick) {
4068 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 2826 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
4069 } else if (p == rq->idle) { 2827 } else if (p == rq->idle) {
@@ -4072,7 +2830,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4072 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 2830 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
4073 } else { 2831 } else {
4074 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2832 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4075 &cpustat->system); 2833 CPUTIME_SYSTEM);
4076 } 2834 }
4077} 2835}
4078 2836
@@ -4171,7 +2929,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4171 2929
4172void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 2930void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4173{ 2931{
4174 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 2932 cputime_t rtime, utime = p->utime, total = utime + p->stime;
4175 2933
4176 /* 2934 /*
4177 * Use CFS's precise accounting: 2935 * Use CFS's precise accounting:
@@ -4179,11 +2937,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4179 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 2937 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
4180 2938
4181 if (total) { 2939 if (total) {
4182 u64 temp = rtime; 2940 u64 temp = (__force u64) rtime;
4183 2941
4184 temp *= utime; 2942 temp *= (__force u64) utime;
4185 do_div(temp, total); 2943 do_div(temp, (__force u32) total);
4186 utime = (cputime_t)temp; 2944 utime = (__force cputime_t) temp;
4187 } else 2945 } else
4188 utime = rtime; 2946 utime = rtime;
4189 2947
@@ -4191,7 +2949,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4191 * Compare with previous values, to keep monotonicity: 2949 * Compare with previous values, to keep monotonicity:
4192 */ 2950 */
4193 p->prev_utime = max(p->prev_utime, utime); 2951 p->prev_utime = max(p->prev_utime, utime);
4194 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 2952 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
4195 2953
4196 *ut = p->prev_utime; 2954 *ut = p->prev_utime;
4197 *st = p->prev_stime; 2955 *st = p->prev_stime;
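
A worked example of the split in task_times(): the precise rtime is divided between user and system in the sampled utime:stime ratio, then clamped with max() so the values reported to userspace never go backwards between two reads. Numbers below are illustrative only:

#include <stdio.h>

int main(void)
{
	unsigned long long utime = 3, stime = 1;   /* sampled ticks */
	unsigned long long rtime = 100;            /* precise runtime */
	unsigned long long total = utime + stime;
	unsigned long long prev_utime = 80, prev_stime = 30;

	unsigned long long u = total ? rtime * utime / total : rtime;

	if (prev_utime < u)			   /* 80 > 75: stays 80 */
		prev_utime = u;
	if (prev_stime < rtime - prev_utime)	   /* max(30, 20) = 30 */
		prev_stime = rtime - prev_utime;

	printf("utime=%llu stime=%llu\n", prev_utime, prev_stime);
	return 0;
}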
@@ -4208,21 +2966,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4208 2966
4209 thread_group_cputime(p, &cputime); 2967 thread_group_cputime(p, &cputime);
4210 2968
4211 total = cputime_add(cputime.utime, cputime.stime); 2969 total = cputime.utime + cputime.stime;
4212 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 2970 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
4213 2971
4214 if (total) { 2972 if (total) {
4215 u64 temp = rtime; 2973 u64 temp = (__force u64) rtime;
4216 2974
4217 temp *= cputime.utime; 2975 temp *= (__force u64) cputime.utime;
4218 do_div(temp, total); 2976 do_div(temp, (__force u32) total);
4219 utime = (cputime_t)temp; 2977 utime = (__force cputime_t) temp;
4220 } else 2978 } else
4221 utime = rtime; 2979 utime = rtime;
4222 2980
4223 sig->prev_utime = max(sig->prev_utime, utime); 2981 sig->prev_utime = max(sig->prev_utime, utime);
4224 sig->prev_stime = max(sig->prev_stime, 2982 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
4225 cputime_sub(rtime, sig->prev_utime));
4226 2983
4227 *ut = sig->prev_utime; 2984 *ut = sig->prev_utime;
4228 *st = sig->prev_stime; 2985 *st = sig->prev_stime;
@@ -4321,6 +3078,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
4321{ 3078{
4322 struct pt_regs *regs = get_irq_regs(); 3079 struct pt_regs *regs = get_irq_regs();
4323 3080
3081 if (oops_in_progress)
3082 return;
3083
4324 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3084 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4325 prev->comm, prev->pid, preempt_count()); 3085 prev->comm, prev->pid, preempt_count());
4326 3086
@@ -5852,6 +4612,13 @@ again:
5852 */ 4612 */
5853 if (preempt && rq != p_rq) 4613 if (preempt && rq != p_rq)
5854 resched_task(p_rq->curr); 4614 resched_task(p_rq->curr);
4615 } else {
4616 /*
4617 * We might have set it in task_yield_fair(), but are
4618 * not going to schedule(), so don't want to skip
4619 * the next update.
4620 */
4621 rq->skip_clock_update = 0;
5855 } 4622 }
5856 4623
5857out: 4624out:
@@ -6019,7 +4786,7 @@ void sched_show_task(struct task_struct *p)
6019 free = stack_not_used(p); 4786 free = stack_not_used(p);
6020#endif 4787#endif
6021 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4788 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6022 task_pid_nr(p), task_pid_nr(p->real_parent), 4789 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
6023 (unsigned long)task_thread_info(p)->flags); 4790 (unsigned long)task_thread_info(p)->flags);
6024 4791
6025 show_stack(p, NULL); 4792 show_stack(p, NULL);
@@ -6118,53 +4885,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6118#endif 4885#endif
6119} 4886}
6120 4887
6121/*
6122 * Increase the granularity value when there are more CPUs,
6123 * because with more CPUs the 'effective latency' as visible
6124 * to users decreases. But the relationship is not linear,
6125 * so pick a second-best guess by going with the log2 of the
6126 * number of CPUs.
6127 *
6128 * This idea comes from the SD scheduler of Con Kolivas:
6129 */
6130static int get_update_sysctl_factor(void)
6131{
6132 unsigned int cpus = min_t(int, num_online_cpus(), 8);
6133 unsigned int factor;
6134
6135 switch (sysctl_sched_tunable_scaling) {
6136 case SCHED_TUNABLESCALING_NONE:
6137 factor = 1;
6138 break;
6139 case SCHED_TUNABLESCALING_LINEAR:
6140 factor = cpus;
6141 break;
6142 case SCHED_TUNABLESCALING_LOG:
6143 default:
6144 factor = 1 + ilog2(cpus);
6145 break;
6146 }
6147
6148 return factor;
6149}
6150
6151static void update_sysctl(void)
6152{
6153 unsigned int factor = get_update_sysctl_factor();
6154
6155#define SET_SYSCTL(name) \
6156 (sysctl_##name = (factor) * normalized_sysctl_##name)
6157 SET_SYSCTL(sched_min_granularity);
6158 SET_SYSCTL(sched_latency);
6159 SET_SYSCTL(sched_wakeup_granularity);
6160#undef SET_SYSCTL
6161}
6162
6163static inline void sched_init_granularity(void)
6164{
6165 update_sysctl();
6166}
6167
6168#ifdef CONFIG_SMP 4888#ifdef CONFIG_SMP
6169void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4889void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6170{ 4890{
@@ -6351,30 +5071,6 @@ static void calc_global_load_remove(struct rq *rq)
6351 rq->calc_load_active = 0; 5071 rq->calc_load_active = 0;
6352} 5072}
6353 5073
6354#ifdef CONFIG_CFS_BANDWIDTH
6355static void unthrottle_offline_cfs_rqs(struct rq *rq)
6356{
6357 struct cfs_rq *cfs_rq;
6358
6359 for_each_leaf_cfs_rq(rq, cfs_rq) {
6360 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6361
6362 if (!cfs_rq->runtime_enabled)
6363 continue;
6364
6365 /*
6366 * clock_task is not advancing so we just need to make sure
6367 * there's some valid quota amount
6368 */
6369 cfs_rq->runtime_remaining = cfs_b->quota;
6370 if (cfs_rq_throttled(cfs_rq))
6371 unthrottle_cfs_rq(cfs_rq);
6372 }
6373}
6374#else
6375static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6376#endif
6377
6378/* 5074/*
6379 * Migrate all tasks from the rq, sleeping tasks will be migrated by 5075 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6380 * try_to_wake_up()->select_task_rq(). 5076 * try_to_wake_up()->select_task_rq().
@@ -6980,6 +5676,12 @@ out:
6980 return -ENOMEM; 5676 return -ENOMEM;
6981} 5677}
6982 5678
5679/*
5680 * By default the system creates a single root-domain with all cpus as
5681 * members (mimicking the global state we have today).
5682 */
5683struct root_domain def_root_domain;
5684
6983static void init_defrootdomain(void) 5685static void init_defrootdomain(void)
6984{ 5686{
6985 init_rootdomain(&def_root_domain); 5687 init_rootdomain(&def_root_domain);
@@ -7051,6 +5753,31 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
7051} 5753}
7052 5754
7053/* 5755/*
5756 * Keep a special pointer to the highest sched_domain that has
5757 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
5758 * allows us to avoid some pointer chasing in select_idle_sibling().
5759 *
5760 * Also keep a unique ID per domain (we use the first cpu number in
5761 * the cpumask of the domain); this allows us to quickly tell if
5762 * two cpus are in the same cache domain, see ttwu_share_cache().
5763 */
5764DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5765DEFINE_PER_CPU(int, sd_llc_id);
5766
5767static void update_top_cache_domain(int cpu)
5768{
5769 struct sched_domain *sd;
5770 int id = cpu;
5771
5772 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5773 if (sd)
5774 id = cpumask_first(sched_domain_span(sd));
5775
5776 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5777 per_cpu(sd_llc_id, cpu) = id;
5778}
5779
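
A sketch of the one-comparison fast path this enables (the ttwu_share_cache() check the comment mentions), with a plain array standing in for the per-cpu sd_llc_id variable:

    #include <stdbool.h>

    #define NR_CPUS 8

    /* Each entry holds the first cpu of that cpu's last-level-cache
     * domain, as cached by update_top_cache_domain() above. */
    static int sd_llc_id[NR_CPUS];

    /* Two cpus share a cache domain exactly when their cached LLC
     * ids match: one load and compare instead of walking domains. */
    static bool cpus_share_llc(int this_cpu, int that_cpu)
    {
        return sd_llc_id[this_cpu] == sd_llc_id[that_cpu];
    }

    int main(void)
    {
        for (int c = 0; c < NR_CPUS; c++)
            sd_llc_id[c] = c < 4 ? 0 : 4;  /* 0-3 and 4-7 share LLCs */
        return !(cpus_share_llc(1, 3) && !cpus_share_llc(1, 5));
    }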
5780/*
7054 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5781 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
7055 * hold the hotplug lock. 5782 * hold the hotplug lock.
7056 */ 5783 */
@@ -7089,6 +5816,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
7089 tmp = rq->sd; 5816 tmp = rq->sd;
7090 rcu_assign_pointer(rq->sd, sd); 5817 rcu_assign_pointer(rq->sd, sd);
7091 destroy_sched_domains(tmp, cpu); 5818 destroy_sched_domains(tmp, cpu);
5819
5820 update_top_cache_domain(cpu);
7092} 5821}
7093 5822
7094/* cpus with isolated domains */ 5823/* cpus with isolated domains */
@@ -7248,7 +5977,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
7248 continue; 5977 continue;
7249 5978
7250 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5979 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7251 GFP_KERNEL, cpu_to_node(i)); 5980 GFP_KERNEL, cpu_to_node(cpu));
7252 5981
7253 if (!sg) 5982 if (!sg)
7254 goto fail; 5983 goto fail;
@@ -7386,6 +6115,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7386 return; 6115 return;
7387 6116
7388 update_group_power(sd, cpu); 6117 update_group_power(sd, cpu);
6118 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
6119}
6120
6121int __weak arch_sd_sibling_asym_packing(void)
6122{
6123 return 0*SD_ASYM_PACKING;
7389} 6124}
7390 6125
7391/* 6126/*
@@ -8023,29 +6758,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
8023 } 6758 }
8024} 6759}
8025 6760
8026static int update_runtime(struct notifier_block *nfb,
8027 unsigned long action, void *hcpu)
8028{
8029 int cpu = (int)(long)hcpu;
8030
8031 switch (action) {
8032 case CPU_DOWN_PREPARE:
8033 case CPU_DOWN_PREPARE_FROZEN:
8034 disable_runtime(cpu_rq(cpu));
8035 return NOTIFY_OK;
8036
8037 case CPU_DOWN_FAILED:
8038 case CPU_DOWN_FAILED_FROZEN:
8039 case CPU_ONLINE:
8040 case CPU_ONLINE_FROZEN:
8041 enable_runtime(cpu_rq(cpu));
8042 return NOTIFY_OK;
8043
8044 default:
8045 return NOTIFY_DONE;
8046 }
8047}
8048
8049void __init sched_init_smp(void) 6761void __init sched_init_smp(void)
8050{ 6762{
8051 cpumask_var_t non_isolated_cpus; 6763 cpumask_var_t non_isolated_cpus;
@@ -8094,104 +6806,11 @@ int in_sched_functions(unsigned long addr)
8094 && addr < (unsigned long)__sched_text_end); 6806 && addr < (unsigned long)__sched_text_end);
8095} 6807}
8096 6808
8097static void init_cfs_rq(struct cfs_rq *cfs_rq) 6809#ifdef CONFIG_CGROUP_SCHED
8098{ 6810struct task_group root_task_group;
8099 cfs_rq->tasks_timeline = RB_ROOT;
8100 INIT_LIST_HEAD(&cfs_rq->tasks);
8101 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8102#ifndef CONFIG_64BIT
8103 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8104#endif
8105}
8106
8107static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8108{
8109 struct rt_prio_array *array;
8110 int i;
8111
8112 array = &rt_rq->active;
8113 for (i = 0; i < MAX_RT_PRIO; i++) {
8114 INIT_LIST_HEAD(array->queue + i);
8115 __clear_bit(i, array->bitmap);
8116 }
8117 /* delimiter for bitsearch: */
8118 __set_bit(MAX_RT_PRIO, array->bitmap);
8119
8120#if defined CONFIG_SMP
8121 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8122 rt_rq->highest_prio.next = MAX_RT_PRIO;
8123 rt_rq->rt_nr_migratory = 0;
8124 rt_rq->overloaded = 0;
8125 plist_head_init(&rt_rq->pushable_tasks);
8126#endif
8127
8128 rt_rq->rt_time = 0;
8129 rt_rq->rt_throttled = 0;
8130 rt_rq->rt_runtime = 0;
8131 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
8132}
8133
8134#ifdef CONFIG_FAIR_GROUP_SCHED
8135static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8136 struct sched_entity *se, int cpu,
8137 struct sched_entity *parent)
8138{
8139 struct rq *rq = cpu_rq(cpu);
8140
8141 cfs_rq->tg = tg;
8142 cfs_rq->rq = rq;
8143#ifdef CONFIG_SMP
8144 /* allow initial update_cfs_load() to truncate */
8145 cfs_rq->load_stamp = 1;
8146#endif
8147 init_cfs_rq_runtime(cfs_rq);
8148
8149 tg->cfs_rq[cpu] = cfs_rq;
8150 tg->se[cpu] = se;
8151
8152 /* se could be NULL for root_task_group */
8153 if (!se)
8154 return;
8155
8156 if (!parent)
8157 se->cfs_rq = &rq->cfs;
8158 else
8159 se->cfs_rq = parent->my_q;
8160
8161 se->my_q = cfs_rq;
8162 update_load_set(&se->load, 0);
8163 se->parent = parent;
8164}
8165#endif 6811#endif
8166 6812
8167#ifdef CONFIG_RT_GROUP_SCHED 6813DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
8168static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8169 struct sched_rt_entity *rt_se, int cpu,
8170 struct sched_rt_entity *parent)
8171{
8172 struct rq *rq = cpu_rq(cpu);
8173
8174 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8175 rt_rq->rt_nr_boosted = 0;
8176 rt_rq->rq = rq;
8177 rt_rq->tg = tg;
8178
8179 tg->rt_rq[cpu] = rt_rq;
8180 tg->rt_se[cpu] = rt_se;
8181
8182 if (!rt_se)
8183 return;
8184
8185 if (!parent)
8186 rt_se->rt_rq = &rq->rt;
8187 else
8188 rt_se->rt_rq = parent->my_q;
8189
8190 rt_se->my_q = rt_rq;
8191 rt_se->parent = parent;
8192 INIT_LIST_HEAD(&rt_se->run_list);
8193}
8194#endif
8195 6814
8196void __init sched_init(void) 6815void __init sched_init(void)
8197{ 6816{
@@ -8249,9 +6868,17 @@ void __init sched_init(void)
8249#ifdef CONFIG_CGROUP_SCHED 6868#ifdef CONFIG_CGROUP_SCHED
8250 list_add(&root_task_group.list, &task_groups); 6869 list_add(&root_task_group.list, &task_groups);
8251 INIT_LIST_HEAD(&root_task_group.children); 6870 INIT_LIST_HEAD(&root_task_group.children);
6871 INIT_LIST_HEAD(&root_task_group.siblings);
8252 autogroup_init(&init_task); 6872 autogroup_init(&init_task);
6873
8253#endif /* CONFIG_CGROUP_SCHED */ 6874#endif /* CONFIG_CGROUP_SCHED */
8254 6875
6876#ifdef CONFIG_CGROUP_CPUACCT
6877 root_cpuacct.cpustat = &kernel_cpustat;
6878 root_cpuacct.cpuusage = alloc_percpu(u64);
6879 /* Too early, not expected to fail */
6880 BUG_ON(!root_cpuacct.cpuusage);
6881#endif
8255 for_each_possible_cpu(i) { 6882 for_each_possible_cpu(i) {
8256 struct rq *rq; 6883 struct rq *rq;
8257 6884
@@ -8263,7 +6890,7 @@ void __init sched_init(void)
8263 init_cfs_rq(&rq->cfs); 6890 init_cfs_rq(&rq->cfs);
8264 init_rt_rq(&rq->rt, rq); 6891 init_rt_rq(&rq->rt, rq);
8265#ifdef CONFIG_FAIR_GROUP_SCHED 6892#ifdef CONFIG_FAIR_GROUP_SCHED
8266 root_task_group.shares = root_task_group_load; 6893 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8267 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6894 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8268 /* 6895 /*
8269 * How much cpu bandwidth does root_task_group get? 6896 * How much cpu bandwidth does root_task_group get?
@@ -8313,7 +6940,7 @@ void __init sched_init(void)
8313 rq->avg_idle = 2*sysctl_sched_migration_cost; 6940 rq->avg_idle = 2*sysctl_sched_migration_cost;
8314 rq_attach_root(rq, &def_root_domain); 6941 rq_attach_root(rq, &def_root_domain);
8315#ifdef CONFIG_NO_HZ 6942#ifdef CONFIG_NO_HZ
8316 rq->nohz_balance_kick = 0; 6943 rq->nohz_flags = 0;
8317#endif 6944#endif
8318#endif 6945#endif
8319 init_rq_hrtick(rq); 6946 init_rq_hrtick(rq);
@@ -8326,10 +6953,6 @@ void __init sched_init(void)
8326 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6953 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8327#endif 6954#endif
8328 6955
8329#ifdef CONFIG_SMP
8330 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8331#endif
8332
8333#ifdef CONFIG_RT_MUTEXES 6956#ifdef CONFIG_RT_MUTEXES
8334 plist_head_init(&init_task.pi_waiters); 6957 plist_head_init(&init_task.pi_waiters);
8335#endif 6958#endif
@@ -8357,17 +6980,11 @@ void __init sched_init(void)
8357 6980
8358#ifdef CONFIG_SMP 6981#ifdef CONFIG_SMP
8359 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6982 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8360#ifdef CONFIG_NO_HZ
8361 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8362 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8363 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8364 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8365 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8366#endif
8367 /* May be allocated at isolcpus cmdline parse time */ 6983 /* May be allocated at isolcpus cmdline parse time */
8368 if (cpu_isolated_map == NULL) 6984 if (cpu_isolated_map == NULL)
8369 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6985 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8370#endif /* SMP */ 6986#endif
6987 init_sched_fair_class();
8371 6988
8372 scheduler_running = 1; 6989 scheduler_running = 1;
8373} 6990}
@@ -8519,169 +7136,14 @@ void set_curr_task(int cpu, struct task_struct *p)
8519 7136
8520#endif 7137#endif
8521 7138
8522#ifdef CONFIG_FAIR_GROUP_SCHED
8523static void free_fair_sched_group(struct task_group *tg)
8524{
8525 int i;
8526
8527 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8528
8529 for_each_possible_cpu(i) {
8530 if (tg->cfs_rq)
8531 kfree(tg->cfs_rq[i]);
8532 if (tg->se)
8533 kfree(tg->se[i]);
8534 }
8535
8536 kfree(tg->cfs_rq);
8537 kfree(tg->se);
8538}
8539
8540static
8541int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8542{
8543 struct cfs_rq *cfs_rq;
8544 struct sched_entity *se;
8545 int i;
8546
8547 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8548 if (!tg->cfs_rq)
8549 goto err;
8550 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8551 if (!tg->se)
8552 goto err;
8553
8554 tg->shares = NICE_0_LOAD;
8555
8556 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8557
8558 for_each_possible_cpu(i) {
8559 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8560 GFP_KERNEL, cpu_to_node(i));
8561 if (!cfs_rq)
8562 goto err;
8563
8564 se = kzalloc_node(sizeof(struct sched_entity),
8565 GFP_KERNEL, cpu_to_node(i));
8566 if (!se)
8567 goto err_free_rq;
8568
8569 init_cfs_rq(cfs_rq);
8570 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8571 }
8572
8573 return 1;
8574
8575err_free_rq:
8576 kfree(cfs_rq);
8577err:
8578 return 0;
8579}
8580
8581static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8582{
8583 struct rq *rq = cpu_rq(cpu);
8584 unsigned long flags;
8585
8586 /*
8587 * Only empty task groups can be destroyed; so we can speculatively
8588 * check on_list without danger of it being re-added.
8589 */
8590 if (!tg->cfs_rq[cpu]->on_list)
8591 return;
8592
8593 raw_spin_lock_irqsave(&rq->lock, flags);
8594 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8595 raw_spin_unlock_irqrestore(&rq->lock, flags);
8596}
8597#else /* !CONFIG_FAIR_GROUP_SCHED */
8598static inline void free_fair_sched_group(struct task_group *tg)
8599{
8600}
8601
8602static inline
8603int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8604{
8605 return 1;
8606}
8607
8608static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8609{
8610}
8611#endif /* CONFIG_FAIR_GROUP_SCHED */
8612
8613#ifdef CONFIG_RT_GROUP_SCHED 7139#ifdef CONFIG_RT_GROUP_SCHED
8614static void free_rt_sched_group(struct task_group *tg)
8615{
8616 int i;
8617
8618 if (tg->rt_se)
8619 destroy_rt_bandwidth(&tg->rt_bandwidth);
8620
8621 for_each_possible_cpu(i) {
8622 if (tg->rt_rq)
8623 kfree(tg->rt_rq[i]);
8624 if (tg->rt_se)
8625 kfree(tg->rt_se[i]);
8626 }
8627
8628 kfree(tg->rt_rq);
8629 kfree(tg->rt_se);
8630}
8631
8632static
8633int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8634{
8635 struct rt_rq *rt_rq;
8636 struct sched_rt_entity *rt_se;
8637 int i;
8638
8639 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8640 if (!tg->rt_rq)
8641 goto err;
8642 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8643 if (!tg->rt_se)
8644 goto err;
8645
8646 init_rt_bandwidth(&tg->rt_bandwidth,
8647 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8648
8649 for_each_possible_cpu(i) {
8650 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8651 GFP_KERNEL, cpu_to_node(i));
8652 if (!rt_rq)
8653 goto err;
8654
8655 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8656 GFP_KERNEL, cpu_to_node(i));
8657 if (!rt_se)
8658 goto err_free_rq;
8659
8660 init_rt_rq(rt_rq, cpu_rq(i));
8661 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8662 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8663 }
8664
8665 return 1;
8666
8667err_free_rq:
8668 kfree(rt_rq);
8669err:
8670 return 0;
8671}
8672#else /* !CONFIG_RT_GROUP_SCHED */ 7140#else /* !CONFIG_RT_GROUP_SCHED */
8673static inline void free_rt_sched_group(struct task_group *tg)
8674{
8675}
8676
8677static inline
8678int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8679{
8680 return 1;
8681}
8682#endif /* CONFIG_RT_GROUP_SCHED */ 7141#endif /* CONFIG_RT_GROUP_SCHED */
8683 7142
8684#ifdef CONFIG_CGROUP_SCHED 7143#ifdef CONFIG_CGROUP_SCHED
7144/* task_group_lock serializes the addition/removal of task groups */
7145static DEFINE_SPINLOCK(task_group_lock);
7146
8685static void free_sched_group(struct task_group *tg) 7147static void free_sched_group(struct task_group *tg)
8686{ 7148{
8687 free_fair_sched_group(tg); 7149 free_fair_sched_group(tg);
@@ -8787,47 +7249,6 @@ void sched_move_task(struct task_struct *tsk)
8787#endif /* CONFIG_CGROUP_SCHED */ 7249#endif /* CONFIG_CGROUP_SCHED */
8788 7250
8789#ifdef CONFIG_FAIR_GROUP_SCHED 7251#ifdef CONFIG_FAIR_GROUP_SCHED
8790static DEFINE_MUTEX(shares_mutex);
8791
8792int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8793{
8794 int i;
8795 unsigned long flags;
8796
8797 /*
8798 * We can't change the weight of the root cgroup.
8799 */
8800 if (!tg->se[0])
8801 return -EINVAL;
8802
8803 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8804
8805 mutex_lock(&shares_mutex);
8806 if (tg->shares == shares)
8807 goto done;
8808
8809 tg->shares = shares;
8810 for_each_possible_cpu(i) {
8811 struct rq *rq = cpu_rq(i);
8812 struct sched_entity *se;
8813
8814 se = tg->se[i];
8815 /* Propagate contribution to hierarchy */
8816 raw_spin_lock_irqsave(&rq->lock, flags);
8817 for_each_sched_entity(se)
8818 update_cfs_shares(group_cfs_rq(se));
8819 raw_spin_unlock_irqrestore(&rq->lock, flags);
8820 }
8821
8822done:
8823 mutex_unlock(&shares_mutex);
8824 return 0;
8825}
8826
8827unsigned long sched_group_shares(struct task_group *tg)
8828{
8829 return tg->shares;
8830}
8831#endif 7252#endif
8832 7253
8833#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 7254#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
@@ -8852,7 +7273,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
8852 struct task_struct *g, *p; 7273 struct task_struct *g, *p;
8853 7274
8854 do_each_thread(g, p) { 7275 do_each_thread(g, p) {
8855 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 7276 if (rt_task(p) && task_rq(p)->rt.tg == tg)
8856 return 1; 7277 return 1;
8857 } while_each_thread(g, p); 7278 } while_each_thread(g, p);
8858 7279
@@ -9203,8 +7624,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9203 7624
9204static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7625static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9205{ 7626{
9206 int i, ret = 0, runtime_enabled; 7627 int i, ret = 0, runtime_enabled, runtime_was_enabled;
9207 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7628 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9208 7629
9209 if (tg == &root_task_group) 7630 if (tg == &root_task_group)
9210 return -EINVAL; 7631 return -EINVAL;
@@ -9231,6 +7652,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9231 goto out_unlock; 7652 goto out_unlock;
9232 7653
9233 runtime_enabled = quota != RUNTIME_INF; 7654 runtime_enabled = quota != RUNTIME_INF;
7655 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7656 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
9234 raw_spin_lock_irq(&cfs_b->lock); 7657 raw_spin_lock_irq(&cfs_b->lock);
9235 cfs_b->period = ns_to_ktime(period); 7658 cfs_b->period = ns_to_ktime(period);
9236 cfs_b->quota = quota; 7659 cfs_b->quota = quota;
@@ -9246,13 +7669,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9246 7669
9247 for_each_possible_cpu(i) { 7670 for_each_possible_cpu(i) {
9248 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7671 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9249 struct rq *rq = rq_of(cfs_rq); 7672 struct rq *rq = cfs_rq->rq;
9250 7673
9251 raw_spin_lock_irq(&rq->lock); 7674 raw_spin_lock_irq(&rq->lock);
9252 cfs_rq->runtime_enabled = runtime_enabled; 7675 cfs_rq->runtime_enabled = runtime_enabled;
9253 cfs_rq->runtime_remaining = 0; 7676 cfs_rq->runtime_remaining = 0;
9254 7677
9255 if (cfs_rq_throttled(cfs_rq)) 7678 if (cfs_rq->throttled)
9256 unthrottle_cfs_rq(cfs_rq); 7679 unthrottle_cfs_rq(cfs_rq);
9257 raw_spin_unlock_irq(&rq->lock); 7680 raw_spin_unlock_irq(&rq->lock);
9258 } 7681 }
@@ -9266,7 +7689,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9266{ 7689{
9267 u64 quota, period; 7690 u64 quota, period;
9268 7691
9269 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7692 period = ktime_to_ns(tg->cfs_bandwidth.period);
9270 if (cfs_quota_us < 0) 7693 if (cfs_quota_us < 0)
9271 quota = RUNTIME_INF; 7694 quota = RUNTIME_INF;
9272 else 7695 else
@@ -9279,10 +7702,10 @@ long tg_get_cfs_quota(struct task_group *tg)
9279{ 7702{
9280 u64 quota_us; 7703 u64 quota_us;
9281 7704
9282 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) 7705 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
9283 return -1; 7706 return -1;
9284 7707
9285 quota_us = tg_cfs_bandwidth(tg)->quota; 7708 quota_us = tg->cfs_bandwidth.quota;
9286 do_div(quota_us, NSEC_PER_USEC); 7709 do_div(quota_us, NSEC_PER_USEC);
9287 7710
9288 return quota_us; 7711 return quota_us;
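
For illustration, a sketch of the unit handling in these two accessors: userspace speaks microseconds, with a negative quota meaning unlimited, while the scheduler stores nanoseconds with RUNTIME_INF as the sentinel (assumed here to be ~0ULL):

    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_USEC 1000ULL
    #define RUNTIME_INF   (~0ULL)            /* assumed sentinel */

    static uint64_t quota_us_to_ns(long cfs_quota_us)
    {
        if (cfs_quota_us < 0)
            return RUNTIME_INF;              /* "no limit" */
        return (uint64_t)cfs_quota_us * NSEC_PER_USEC;
    }

    int main(void)
    {
        /* 50ms of quota per 100ms period caps the group at half a cpu */
        printf("quota=%llu ns\n",
               (unsigned long long)quota_us_to_ns(50000));
        printf("unlimited=%d\n", quota_us_to_ns(-1) == RUNTIME_INF);
        return 0;
    }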
@@ -9293,10 +7716,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9293 u64 quota, period; 7716 u64 quota, period;
9294 7717
9295 period = (u64)cfs_period_us * NSEC_PER_USEC; 7718 period = (u64)cfs_period_us * NSEC_PER_USEC;
9296 quota = tg_cfs_bandwidth(tg)->quota; 7719 quota = tg->cfs_bandwidth.quota;
9297
9298 if (period <= 0)
9299 return -EINVAL;
9300 7720
9301 return tg_set_cfs_bandwidth(tg, period, quota); 7721 return tg_set_cfs_bandwidth(tg, period, quota);
9302} 7722}
@@ -9305,7 +7725,7 @@ long tg_get_cfs_period(struct task_group *tg)
9305{ 7725{
9306 u64 cfs_period_us; 7726 u64 cfs_period_us;
9307 7727
9308 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7728 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
9309 do_div(cfs_period_us, NSEC_PER_USEC); 7729 do_div(cfs_period_us, NSEC_PER_USEC);
9310 7730
9311 return cfs_period_us; 7731 return cfs_period_us;
@@ -9365,13 +7785,13 @@ static u64 normalize_cfs_quota(struct task_group *tg,
9365static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7785static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9366{ 7786{
9367 struct cfs_schedulable_data *d = data; 7787 struct cfs_schedulable_data *d = data;
9368 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7788 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9369 s64 quota = 0, parent_quota = -1; 7789 s64 quota = 0, parent_quota = -1;
9370 7790
9371 if (!tg->parent) { 7791 if (!tg->parent) {
9372 quota = RUNTIME_INF; 7792 quota = RUNTIME_INF;
9373 } else { 7793 } else {
9374 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); 7794 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
9375 7795
9376 quota = normalize_cfs_quota(tg, d); 7796 quota = normalize_cfs_quota(tg, d);
9377 parent_quota = parent_b->hierarchal_quota; 7797 parent_quota = parent_b->hierarchal_quota;
@@ -9415,7 +7835,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9415 struct cgroup_map_cb *cb) 7835 struct cgroup_map_cb *cb)
9416{ 7836{
9417 struct task_group *tg = cgroup_tg(cgrp); 7837 struct task_group *tg = cgroup_tg(cgrp);
9418 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7838 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9419 7839
9420 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7840 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9421 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7841 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
@@ -9516,38 +7936,16 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9516 * (balbir@in.ibm.com). 7936 * (balbir@in.ibm.com).
9517 */ 7937 */
9518 7938
9519/* track cpu usage of a group of tasks and its child groups */
9520struct cpuacct {
9521 struct cgroup_subsys_state css;
9522 /* cpuusage holds pointer to a u64-type object on every cpu */
9523 u64 __percpu *cpuusage;
9524 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9525 struct cpuacct *parent;
9526};
9527
9528struct cgroup_subsys cpuacct_subsys;
9529
9530/* return cpu accounting group corresponding to this container */
9531static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9532{
9533 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9534 struct cpuacct, css);
9535}
9536
9537/* return cpu accounting group to which this task belongs */
9538static inline struct cpuacct *task_ca(struct task_struct *tsk)
9539{
9540 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9541 struct cpuacct, css);
9542}
9543
9544/* create a new cpu accounting group */ 7939/* create a new cpu accounting group */
9545static struct cgroup_subsys_state *cpuacct_create( 7940static struct cgroup_subsys_state *cpuacct_create(
9546 struct cgroup_subsys *ss, struct cgroup *cgrp) 7941 struct cgroup_subsys *ss, struct cgroup *cgrp)
9547{ 7942{
9548 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 7943 struct cpuacct *ca;
9549 int i;
9550 7944
7945 if (!cgrp->parent)
7946 return &root_cpuacct.css;
7947
7948 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9551 if (!ca) 7949 if (!ca)
9552 goto out; 7950 goto out;
9553 7951
@@ -9555,18 +7953,13 @@ static struct cgroup_subsys_state *cpuacct_create(
9555 if (!ca->cpuusage) 7953 if (!ca->cpuusage)
9556 goto out_free_ca; 7954 goto out_free_ca;
9557 7955
9558 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7956 ca->cpustat = alloc_percpu(struct kernel_cpustat);
9559 if (percpu_counter_init(&ca->cpustat[i], 0)) 7957 if (!ca->cpustat)
9560 goto out_free_counters; 7958 goto out_free_cpuusage;
9561
9562 if (cgrp->parent)
9563 ca->parent = cgroup_ca(cgrp->parent);
9564 7959
9565 return &ca->css; 7960 return &ca->css;
9566 7961
9567out_free_counters: 7962out_free_cpuusage:
9568 while (--i >= 0)
9569 percpu_counter_destroy(&ca->cpustat[i]);
9570 free_percpu(ca->cpuusage); 7963 free_percpu(ca->cpuusage);
9571out_free_ca: 7964out_free_ca:
9572 kfree(ca); 7965 kfree(ca);
@@ -9579,10 +7972,8 @@ static void
9579cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 7972cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9580{ 7973{
9581 struct cpuacct *ca = cgroup_ca(cgrp); 7974 struct cpuacct *ca = cgroup_ca(cgrp);
9582 int i;
9583 7975
9584 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7976 free_percpu(ca->cpustat);
9585 percpu_counter_destroy(&ca->cpustat[i]);
9586 free_percpu(ca->cpuusage); 7977 free_percpu(ca->cpuusage);
9587 kfree(ca); 7978 kfree(ca);
9588} 7979}
@@ -9675,16 +8066,31 @@ static const char *cpuacct_stat_desc[] = {
9675}; 8066};
9676 8067
9677static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8068static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9678 struct cgroup_map_cb *cb) 8069 struct cgroup_map_cb *cb)
9679{ 8070{
9680 struct cpuacct *ca = cgroup_ca(cgrp); 8071 struct cpuacct *ca = cgroup_ca(cgrp);
9681 int i; 8072 int cpu;
8073 s64 val = 0;
8074
8075 for_each_online_cpu(cpu) {
8076 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8077 val += kcpustat->cpustat[CPUTIME_USER];
8078 val += kcpustat->cpustat[CPUTIME_NICE];
8079 }
8080 val = cputime64_to_clock_t(val);
8081 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
9682 8082
9683 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 8083 val = 0;
9684 s64 val = percpu_counter_read(&ca->cpustat[i]); 8084 for_each_online_cpu(cpu) {
9685 val = cputime64_to_clock_t(val); 8085 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
9686 cb->fill(cb, cpuacct_stat_desc[i], val); 8086 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8087 val += kcpustat->cpustat[CPUTIME_IRQ];
8088 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
9687 } 8089 }
8090
8091 val = cputime64_to_clock_t(val);
8092 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8093
9688 return 0; 8094 return 0;
9689} 8095}
9690 8096
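The rewritten cpuacct_stats_show() above derives the two cpuacct.stat lines by summing per-cpu kernel_cpustat buckets instead of reading per-state percpu_counters: USER+NICE on one side, SYSTEM+IRQ+SOFTIRQ on the other. A condensed sketch of that aggregation, with a stand-in enum for the CPUTIME_* indices:

    #include <stdint.h>

    enum { CT_USER, CT_NICE, CT_SYSTEM, CT_IRQ, CT_SOFTIRQ, CT_MAX };

    static void sum_stats(uint64_t stat[][CT_MAX], int ncpus,
                          uint64_t *user, uint64_t *sys)
    {
        *user = *sys = 0;
        for (int cpu = 0; cpu < ncpus; cpu++) {
            *user += stat[cpu][CT_USER] + stat[cpu][CT_NICE];
            *sys  += stat[cpu][CT_SYSTEM] + stat[cpu][CT_IRQ]
                   + stat[cpu][CT_SOFTIRQ];
        }
    }

    int main(void)
    {
        uint64_t stat[2][CT_MAX] = {
            { 5, 1, 3, 0, 1 },               /* cpu0 */
            { 2, 0, 4, 1, 0 },               /* cpu1 */
        };
        uint64_t user, sys;

        sum_stats(stat, 2, &user, &sys);
        return !(user == 8 && sys == 9);
    }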
@@ -9714,7 +8120,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9714 * 8120 *
9715 * called with rq->lock held. 8121 * called with rq->lock held.
9716 */ 8122 */
9717static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8123void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9718{ 8124{
9719 struct cpuacct *ca; 8125 struct cpuacct *ca;
9720 int cpu; 8126 int cpu;
@@ -9728,7 +8134,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9728 8134
9729 ca = task_ca(tsk); 8135 ca = task_ca(tsk);
9730 8136
9731 for (; ca; ca = ca->parent) { 8137 for (; ca; ca = parent_ca(ca)) {
9732 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8138 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9733 *cpuusage += cputime; 8139 *cpuusage += cputime;
9734 } 8140 }
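
The loop above charges every ancestor group as well, so a parent's cpuacct.usage always covers its children; only the accessor changed (parent_ca() instead of the removed ->parent field). A stripped-down sketch of the walk:

    struct ca {
        struct ca *parent;
        unsigned long long usage;
    };

    /* Charge a group and all of its ancestors, as the loop above
     * does for the task's cpuacct hierarchy. */
    static void charge(struct ca *ca, unsigned long long cputime)
    {
        for (; ca; ca = ca->parent)
            ca->usage += cputime;
    }

    int main(void)
    {
        struct ca root = { 0 };
        struct ca child = { &root, 0 };

        charge(&child, 100);
        return !(child.usage == 100 && root.usage == 100);
    }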
@@ -9736,45 +8142,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9736 rcu_read_unlock(); 8142 rcu_read_unlock();
9737} 8143}
9738 8144
9739/*
9740 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9741 * in cputime_t units. As a result, cpuacct_update_stats calls
9742 * percpu_counter_add with values large enough to always overflow the
9743 * per cpu batch limit causing bad SMP scalability.
9744 *
9745 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9746 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9747 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9748 */
9749#ifdef CONFIG_SMP
9750#define CPUACCT_BATCH \
9751 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9752#else
9753#define CPUACCT_BATCH 0
9754#endif
9755
9756/*
9757 * Charge the system/user time to the task's accounting group.
9758 */
9759static void cpuacct_update_stats(struct task_struct *tsk,
9760 enum cpuacct_stat_index idx, cputime_t val)
9761{
9762 struct cpuacct *ca;
9763 int batch = CPUACCT_BATCH;
9764
9765 if (unlikely(!cpuacct_subsys.active))
9766 return;
9767
9768 rcu_read_lock();
9769 ca = task_ca(tsk);
9770
9771 do {
9772 __percpu_counter_add(&ca->cpustat[idx], val, batch);
9773 ca = ca->parent;
9774 } while (ca);
9775 rcu_read_unlock();
9776}
9777
9778struct cgroup_subsys cpuacct_subsys = { 8145struct cgroup_subsys cpuacct_subsys = {
9779 .name = "cpuacct", 8146 .name = "cpuacct",
9780 .create = cpuacct_create, 8147 .create = cpuacct_create,
diff --git a/kernel/sched_cpupri.c b/kernel/sched/cpupri.c
index a86cf9d9eb11..b0d798eaf130 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched_cpupri.c 2 * kernel/sched/cpupri.c
3 * 3 *
4 * CPU priority management 4 * CPU priority management
5 * 5 *
@@ -28,7 +28,7 @@
28 */ 28 */
29 29
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include "sched_cpupri.h" 31#include "cpupri.h"
32 32
33/* Convert between a 140 based task->prio, and our 102 based cpupri */ 33/* Convert between a 140 based task->prio, and our 102 based cpupri */
34static int convert_prio(int prio) 34static int convert_prio(int prio)
diff --git a/kernel/sched_cpupri.h b/kernel/sched/cpupri.h
index f6d756173491..f6d756173491 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched/cpupri.h
diff --git a/kernel/sched_debug.c b/kernel/sched/debug.c
index a6710a112b4f..2a075e10004b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched/debug.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/time/sched_debug.c 2 * kernel/sched/debug.c
3 * 3 *
4 * Print the CFS rbtree 4 * Print the CFS rbtree
5 * 5 *
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19#include "sched.h"
20
19static DEFINE_SPINLOCK(sched_debug_lock); 21static DEFINE_SPINLOCK(sched_debug_lock);
20 22
21/* 23/*
@@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
373 return 0; 375 return 0;
374} 376}
375 377
376static void sysrq_sched_debug_show(void) 378void sysrq_sched_debug_show(void)
377{ 379{
378 sched_debug_show(NULL, NULL); 380 sched_debug_show(NULL, NULL);
379} 381}
diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c
index 8a39fa3e3c6c..8e42de9105f8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,13 @@
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/slab.h>
27#include <linux/profile.h>
28#include <linux/interrupt.h>
29
30#include <trace/events/sched.h>
31
32#include "sched.h"
26 33
27/* 34/*
28 * Targeted preemption latency for CPU-bound tasks: 35 * Targeted preemption latency for CPU-bound tasks:
@@ -103,7 +110,110 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 110unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif 111#endif
105 112
106static const struct sched_class fair_sched_class; 113/*
114 * Increase the granularity value when there are more CPUs,
115 * because with more CPUs the 'effective latency' as visible
116 * to users decreases. But the relationship is not linear,
117 * so pick a second-best guess by going with the log2 of the
118 * number of CPUs.
119 *
120 * This idea comes from the SD scheduler of Con Kolivas:
121 */
122static int get_update_sysctl_factor(void)
123{
124 unsigned int cpus = min_t(int, num_online_cpus(), 8);
125 unsigned int factor;
126
127 switch (sysctl_sched_tunable_scaling) {
128 case SCHED_TUNABLESCALING_NONE:
129 factor = 1;
130 break;
131 case SCHED_TUNABLESCALING_LINEAR:
132 factor = cpus;
133 break;
134 case SCHED_TUNABLESCALING_LOG:
135 default:
136 factor = 1 + ilog2(cpus);
137 break;
138 }
139
140 return factor;
141}
142
143static void update_sysctl(void)
144{
145 unsigned int factor = get_update_sysctl_factor();
146
147#define SET_SYSCTL(name) \
148 (sysctl_##name = (factor) * normalized_sysctl_##name)
149 SET_SYSCTL(sched_min_granularity);
150 SET_SYSCTL(sched_latency);
151 SET_SYSCTL(sched_wakeup_granularity);
152#undef SET_SYSCTL
153}
154
155void sched_init_granularity(void)
156{
157 update_sysctl();
158}
159
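
A worked example of the factor computation for the default SCHED_TUNABLESCALING_LOG case: with cpus capped at 8, the factor saturates at 1 + log2(8) = 4:

    #include <stdio.h>

    /* Minimal ilog2(): index of the highest set bit. */
    static unsigned int ilog2_u(unsigned int x)
    {
        unsigned int r = 0;

        while (x >>= 1)
            r++;
        return r;
    }

    static unsigned int log_factor(unsigned int online_cpus)
    {
        unsigned int cpus = online_cpus < 8 ? online_cpus : 8;

        return 1 + ilog2_u(cpus);
    }

    int main(void)
    {
        for (unsigned int n = 1; n <= 16; n *= 2)
            printf("%2u cpus -> factor %u\n", n, log_factor(n));
        /* 1->1, 2->2, 4->3, 8->4, 16->4 */
        return 0;
    }

So with the stock 6ms sched_latency baseline, a 4-way box ends up around 18ms, and no machine scales it past 4x.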
160#if BITS_PER_LONG == 32
161# define WMULT_CONST (~0UL)
162#else
163# define WMULT_CONST (1UL << 32)
164#endif
165
166#define WMULT_SHIFT 32
167
168/*
169 * Shift right and round:
170 */
171#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
172
173/*
174 * delta *= weight / lw
175 */
176static unsigned long
177calc_delta_mine(unsigned long delta_exec, unsigned long weight,
178 struct load_weight *lw)
179{
180 u64 tmp;
181
182 /*
183 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
184 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
185 * 2^SCHED_LOAD_RESOLUTION.
186 */
187 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
188 tmp = (u64)delta_exec * scale_load_down(weight);
189 else
190 tmp = (u64)delta_exec;
191
192 if (!lw->inv_weight) {
193 unsigned long w = scale_load_down(lw->weight);
194
195 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
196 lw->inv_weight = 1;
197 else if (unlikely(!w))
198 lw->inv_weight = WMULT_CONST;
199 else
200 lw->inv_weight = WMULT_CONST / w;
201 }
202
203 /*
204 * Check whether we'd overflow the 64-bit multiplication:
205 */
206 if (unlikely(tmp > WMULT_CONST))
207 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
208 WMULT_SHIFT/2);
209 else
210 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
211
212 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
213}
214
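calc_delta_mine() avoids a 64-bit division on the hot path: it multiplies by a cached reciprocal, 2^32 / lw->weight, and shifts back down, splitting the shift in two halves when the intermediate product would overflow. A self-contained sketch of the same arithmetic (nonzero divisor assumed; the kernel additionally special-cases w == 0 and very large weights):

    #include <stdint.h>
    #include <stdio.h>

    #define WMULT_CONST (1ULL << 32)
    #define WMULT_SHIFT 32

    /* Shift right and round, as in the SRR() macro above. */
    static uint64_t srr(uint64_t x, unsigned int y)
    {
        return (x + (1ULL << (y - 1))) >> y;
    }

    /* delta * weight / lw_weight via a cached reciprocal. */
    static uint64_t calc_delta(uint64_t delta, uint64_t weight,
                               uint64_t lw_weight)
    {
        uint64_t inv = WMULT_CONST / lw_weight; /* lw->inv_weight */
        uint64_t tmp = delta * weight;

        if (tmp > WMULT_CONST)                  /* avoid overflow */
            return srr(srr(tmp, WMULT_SHIFT / 2) * inv,
                       WMULT_SHIFT / 2);
        return srr(tmp * inv, WMULT_SHIFT);
    }

    int main(void)
    {
        /* 6ms of runtime for a weight-1024 entity on a queue of
         * total weight 2048: roughly half, i.e. ~3ms. */
        printf("%llu\n", (unsigned long long)
               calc_delta(6000000, 1024, 2048)); /* 3000000 */
        return 0;
    }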
215
216const struct sched_class fair_sched_class;
107 217
108/************************************************************** 218/**************************************************************
109 * CFS operations on generic schedulable entities: 219 * CFS operations on generic schedulable entities:
@@ -413,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
413 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 523 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
414} 524}
415 525
416static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) 526struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
417{ 527{
418 struct rb_node *left = cfs_rq->rb_leftmost; 528 struct rb_node *left = cfs_rq->rb_leftmost;
419 529
@@ -434,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
434} 544}
435 545
436#ifdef CONFIG_SCHED_DEBUG 546#ifdef CONFIG_SCHED_DEBUG
437static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 547struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
438{ 548{
439 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 549 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
440 550
@@ -684,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
684{ 794{
685 update_load_add(&cfs_rq->load, se->load.weight); 795 update_load_add(&cfs_rq->load, se->load.weight);
686 if (!parent_entity(se)) 796 if (!parent_entity(se))
687 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 797 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
688 if (entity_is_task(se)) { 798 if (entity_is_task(se)) {
689 add_cfs_task_weight(cfs_rq, se->load.weight); 799 add_cfs_task_weight(cfs_rq, se->load.weight);
690 list_add(&se->group_node, &cfs_rq->tasks); 800 list_add(&se->group_node, &cfs_rq->tasks);
@@ -697,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
697{ 807{
698 update_load_sub(&cfs_rq->load, se->load.weight); 808 update_load_sub(&cfs_rq->load, se->load.weight);
699 if (!parent_entity(se)) 809 if (!parent_entity(se))
700 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 810 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
701 if (entity_is_task(se)) { 811 if (entity_is_task(se)) {
702 add_cfs_task_weight(cfs_rq, -se->load.weight); 812 add_cfs_task_weight(cfs_rq, -se->load.weight);
703 list_del_init(&se->group_node); 813 list_del_init(&se->group_node);
@@ -893,7 +1003,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
893 if (unlikely(delta > se->statistics.sleep_max)) 1003 if (unlikely(delta > se->statistics.sleep_max))
894 se->statistics.sleep_max = delta; 1004 se->statistics.sleep_max = delta;
895 1005
896 se->statistics.sleep_start = 0;
897 se->statistics.sum_sleep_runtime += delta; 1006 se->statistics.sum_sleep_runtime += delta;
898 1007
899 if (tsk) { 1008 if (tsk) {
@@ -910,7 +1019,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
910 if (unlikely(delta > se->statistics.block_max)) 1019 if (unlikely(delta > se->statistics.block_max))
911 se->statistics.block_max = delta; 1020 se->statistics.block_max = delta;
912 1021
913 se->statistics.block_start = 0;
914 se->statistics.sum_sleep_runtime += delta; 1022 se->statistics.sum_sleep_runtime += delta;
915 1023
916 if (tsk) { 1024 if (tsk) {
@@ -920,6 +1028,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
920 trace_sched_stat_iowait(tsk, delta); 1028 trace_sched_stat_iowait(tsk, delta);
921 } 1029 }
922 1030
1031 trace_sched_stat_blocked(tsk, delta);
1032
923 /* 1033 /*
924 * Blocking time is in units of nanosecs, so shift by 1034 * Blocking time is in units of nanosecs, so shift by
925 * 20 to get a milliseconds-range estimation of the 1035 * 20 to get a milliseconds-range estimation of the
@@ -1287,6 +1397,32 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1287 */ 1397 */
1288 1398
1289#ifdef CONFIG_CFS_BANDWIDTH 1399#ifdef CONFIG_CFS_BANDWIDTH
1400
1401#ifdef HAVE_JUMP_LABEL
1402static struct jump_label_key __cfs_bandwidth_used;
1403
1404static inline bool cfs_bandwidth_used(void)
1405{
1406 return static_branch(&__cfs_bandwidth_used);
1407}
1408
1409void account_cfs_bandwidth_used(int enabled, int was_enabled)
1410{
1411 /* only need to count groups transitioning between enabled/!enabled */
1412 if (enabled && !was_enabled)
1413 jump_label_inc(&__cfs_bandwidth_used);
1414 else if (!enabled && was_enabled)
1415 jump_label_dec(&__cfs_bandwidth_used);
1416}
1417#else /* HAVE_JUMP_LABEL */
1418static bool cfs_bandwidth_used(void)
1419{
1420 return true;
1421}
1422
1423void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
1424#endif /* HAVE_JUMP_LABEL */
1425
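The jump label behaves like a reference count over constrained groups: it only moves on enabled/!enabled transitions, so the static branch stays patched out while no group has a bandwidth limit and the hot-path checks below cost nothing. Sketched with a plain counter:

    /* Stand-ins for __cfs_bandwidth_used and its helpers. */
    static int cfs_bandwidth_users;

    static void account_used(int enabled, int was_enabled)
    {
        if (enabled && !was_enabled)
            cfs_bandwidth_users++;       /* jump_label_inc() */
        else if (!enabled && was_enabled)
            cfs_bandwidth_users--;       /* jump_label_dec() */
    }

    static int bandwidth_used(void)
    {
        return cfs_bandwidth_users > 0;  /* static_branch() */
    }

    int main(void)
    {
        account_used(1, 0);              /* a group gains a quota */
        account_used(1, 1);              /* no transition: no-op */
        account_used(0, 1);              /* quota removed again */
        return bandwidth_used();         /* 0: branch stays off */
    }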
1290/* 1426/*
1291 * default period for cfs group bandwidth. 1427 * default period for cfs group bandwidth.
1292 * default: 0.1s, units: nanoseconds 1428 * default: 0.1s, units: nanoseconds
@@ -1308,7 +1444,7 @@ static inline u64 sched_cfs_bandwidth_slice(void)
1308 * 1444 *
1309 * requires cfs_b->lock 1445 * requires cfs_b->lock
1310 */ 1446 */
1311static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) 1447void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1312{ 1448{
1313 u64 now; 1449 u64 now;
1314 1450
@@ -1320,6 +1456,11 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1320 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); 1456 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1321} 1457}
1322 1458
1459static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1460{
1461 return &tg->cfs_bandwidth;
1462}
1463
1323/* returns 0 on failure to allocate runtime */ 1464/* returns 0 on failure to allocate runtime */
1324static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1465static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1325{ 1466{
@@ -1421,7 +1562,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1421static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 1562static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1422 unsigned long delta_exec) 1563 unsigned long delta_exec)
1423{ 1564{
1424 if (!cfs_rq->runtime_enabled) 1565 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1425 return; 1566 return;
1426 1567
1427 __account_cfs_rq_runtime(cfs_rq, delta_exec); 1568 __account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -1429,13 +1570,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1429 1570
1430static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 1571static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1431{ 1572{
1432 return cfs_rq->throttled; 1573 return cfs_bandwidth_used() && cfs_rq->throttled;
1433} 1574}
1434 1575
1435/* check whether cfs_rq, or any parent, is throttled */ 1576/* check whether cfs_rq, or any parent, is throttled */
1436static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) 1577static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1437{ 1578{
1438 return cfs_rq->throttle_count; 1579 return cfs_bandwidth_used() && cfs_rq->throttle_count;
1439} 1580}
1440 1581
1441/* 1582/*
@@ -1530,7 +1671,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1530 raw_spin_unlock(&cfs_b->lock); 1671 raw_spin_unlock(&cfs_b->lock);
1531} 1672}
1532 1673
1533static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) 1674void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1534{ 1675{
1535 struct rq *rq = rq_of(cfs_rq); 1676 struct rq *rq = rq_of(cfs_rq);
1536 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 1677 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
@@ -1756,6 +1897,9 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1756 1897
1757static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1898static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1758{ 1899{
1900 if (!cfs_bandwidth_used())
1901 return;
1902
1759 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) 1903 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1760 return; 1904 return;
1761 1905
@@ -1801,6 +1945,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1801 */ 1945 */
1802static void check_enqueue_throttle(struct cfs_rq *cfs_rq) 1946static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1803{ 1947{
1948 if (!cfs_bandwidth_used())
1949 return;
1950
1804 /* an active group must be handled by the update_curr()->put() path */ 1951 /* an active group must be handled by the update_curr()->put() path */
1805 if (!cfs_rq->runtime_enabled || cfs_rq->curr) 1952 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1806 return; 1953 return;
@@ -1818,6 +1965,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1818/* conditionally throttle active cfs_rq's from put_prev_entity() */ 1965/* conditionally throttle active cfs_rq's from put_prev_entity() */
1819static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1966static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1820{ 1967{
1968 if (!cfs_bandwidth_used())
1969 return;
1970
1821 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 1971 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1822 return; 1972 return;
1823 1973
@@ -1830,7 +1980,112 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1830 1980
1831 throttle_cfs_rq(cfs_rq); 1981 throttle_cfs_rq(cfs_rq);
1832} 1982}
1833#else 1983
1984static inline u64 default_cfs_period(void);
1985static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
1986static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
1987
1988static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
1989{
1990 struct cfs_bandwidth *cfs_b =
1991 container_of(timer, struct cfs_bandwidth, slack_timer);
1992 do_sched_cfs_slack_timer(cfs_b);
1993
1994 return HRTIMER_NORESTART;
1995}
1996
1997static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
1998{
1999 struct cfs_bandwidth *cfs_b =
2000 container_of(timer, struct cfs_bandwidth, period_timer);
2001 ktime_t now;
2002 int overrun;
2003 int idle = 0;
2004
2005 for (;;) {
2006 now = hrtimer_cb_get_time(timer);
2007 overrun = hrtimer_forward(timer, now, cfs_b->period);
2008
2009 if (!overrun)
2010 break;
2011
2012 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2013 }
2014
2015 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2016}
2017
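The for(;;) loop above leans on the hrtimer_forward() contract: push the timer's expiry forward in whole periods until it lies in the future, returning how many periods were skipped so each one is accounted exactly once even when the callback runs late. A sketch of that contract with plain nanosecond counters (the real hrtimer_forward() computes the overrun with a division rather than a loop):

    #include <stdint.h>
    #include <stdio.h>

    static int forward(uint64_t *expires, uint64_t now, uint64_t period)
    {
        int overrun = 0;

        while (*expires <= now) {
            *expires += period;
            overrun++;
        }
        return overrun;
    }

    int main(void)
    {
        uint64_t expires = 100, now = 350, period = 100;
        int overrun;

        while ((overrun = forward(&expires, now, period)))
            printf("refilling for %d period(s)\n", overrun);
        /* first call returns 3 (expiries 100, 200, 300), then 0 */
        return 0;
    }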
2018void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2019{
2020 raw_spin_lock_init(&cfs_b->lock);
2021 cfs_b->runtime = 0;
2022 cfs_b->quota = RUNTIME_INF;
2023 cfs_b->period = ns_to_ktime(default_cfs_period());
2024
2025 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2026 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2027 cfs_b->period_timer.function = sched_cfs_period_timer;
2028 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2029 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2030}
2031
2032static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2033{
2034 cfs_rq->runtime_enabled = 0;
2035 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2036}
2037
2038/* requires cfs_b->lock, may release to reprogram timer */
2039void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2040{
2041 /*
2042 * The timer may be active because we're trying to set a new bandwidth
2043 * period or because we're racing with the tear-down path
2042 * (timer_active==0 becomes visible before the hrtimer callback
2046 * terminates). In either case we ensure that it's re-programmed.
2046 */
2047 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2048 raw_spin_unlock(&cfs_b->lock);
2049 /* ensure cfs_b->lock is available while we wait */
2050 hrtimer_cancel(&cfs_b->period_timer);
2051
2052 raw_spin_lock(&cfs_b->lock);
2053 /* if someone else restarted the timer then we're done */
2054 if (cfs_b->timer_active)
2055 return;
2056 }
2057
2058 cfs_b->timer_active = 1;
2059 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2060}
2061
2062static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2063{
2064 hrtimer_cancel(&cfs_b->period_timer);
2065 hrtimer_cancel(&cfs_b->slack_timer);
2066}
2067
2068void unthrottle_offline_cfs_rqs(struct rq *rq)
2069{
2070 struct cfs_rq *cfs_rq;
2071
2072 for_each_leaf_cfs_rq(rq, cfs_rq) {
2073 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2074
2075 if (!cfs_rq->runtime_enabled)
2076 continue;
2077
2078 /*
2079 * clock_task is not advancing, so we just need to make sure
2080 * there's some valid quota amount.
2081 */
2082 cfs_rq->runtime_remaining = cfs_b->quota;
2083 if (cfs_rq_throttled(cfs_rq))
2084 unthrottle_cfs_rq(cfs_rq);
2085 }
2086}
2087
2088#else /* CONFIG_CFS_BANDWIDTH */
1834static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 2089static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1835 unsigned long delta_exec) {} 2090 unsigned long delta_exec) {}
1836static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2091static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -1852,8 +2107,22 @@ static inline int throttled_lb_pair(struct task_group *tg,
1852{ 2107{
1853 return 0; 2108 return 0;
1854} 2109}
2110
2111void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2112
2113#ifdef CONFIG_FAIR_GROUP_SCHED
2114static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1855#endif 2115#endif
1856 2116
2117static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2118{
2119 return NULL;
2120}
2121static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2122void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2123
2124#endif /* CONFIG_CFS_BANDWIDTH */
2125
1857/************************************************** 2126/**************************************************
1858 * CFS operations on tasks: 2127 * CFS operations on tasks:
1859 */ 2128 */
@@ -1866,7 +2135,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
1866 2135
1867 WARN_ON(task_rq(p) != rq); 2136 WARN_ON(task_rq(p) != rq);
1868 2137
1869 if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { 2138 if (cfs_rq->nr_running > 1) {
1870 u64 slice = sched_slice(cfs_rq, se); 2139 u64 slice = sched_slice(cfs_rq, se);
1871 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 2140 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1872 s64 delta = slice - ran; 2141 s64 delta = slice - ran;
@@ -1897,7 +2166,7 @@ static void hrtick_update(struct rq *rq)
1897{ 2166{
1898 struct task_struct *curr = rq->curr; 2167 struct task_struct *curr = rq->curr;
1899 2168
1900 if (curr->sched_class != &fair_sched_class) 2169 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
1901 return; 2170 return;
1902 2171
1903 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) 2172 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
@@ -2020,6 +2289,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2020} 2289}
2021 2290
2022#ifdef CONFIG_SMP 2291#ifdef CONFIG_SMP
2292/* Used instead of source_load when we know the type == 0 */
2293static unsigned long weighted_cpuload(const int cpu)
2294{
2295 return cpu_rq(cpu)->load.weight;
2296}
2297
2298/*
2299 * Return a low guess at the load of a migration-source cpu weighted
2300 * according to the scheduling class and "nice" value.
2301 *
2302 * We want to underestimate the load of migration sources, to
2303 * balance conservatively.
2304 */
2305static unsigned long source_load(int cpu, int type)
2306{
2307 struct rq *rq = cpu_rq(cpu);
2308 unsigned long total = weighted_cpuload(cpu);
2309
2310 if (type == 0 || !sched_feat(LB_BIAS))
2311 return total;
2312
2313 return min(rq->cpu_load[type-1], total);
2314}
2315
2316/*
2317 * Return a high guess at the load of a migration-target cpu weighted
2318 * according to the scheduling class and "nice" value.
2319 */
2320static unsigned long target_load(int cpu, int type)
2321{
2322 struct rq *rq = cpu_rq(cpu);
2323 unsigned long total = weighted_cpuload(cpu);
2324
2325 if (type == 0 || !sched_feat(LB_BIAS))
2326 return total;
2327
2328 return max(rq->cpu_load[type-1], total);
2329}
2330
2331static unsigned long power_of(int cpu)
2332{
2333 return cpu_rq(cpu)->cpu_power;
2334}
2335
2336static unsigned long cpu_avg_load_per_task(int cpu)
2337{
2338 struct rq *rq = cpu_rq(cpu);
2339 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2340
2341 if (nr_running)
2342 return rq->load.weight / nr_running;
2343
2344 return 0;
2345}
2346
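source_load() and target_load() bias the estimate in opposite directions so the balancer errs on the side of moving less work: take the lower of the cpu_load[] history and the instantaneous load for a migration source, the higher for a target. Condensed:

    /* LB_BIAS in one function: hist is rq->cpu_load[type-1], cur is
     * weighted_cpuload(); type == 0 would just return cur. */
    static unsigned long biased_load(unsigned long hist,
                                     unsigned long cur, int is_target)
    {
        if (is_target)
            return hist > cur ? hist : cur;  /* max(): high guess */
        return hist < cur ? hist : cur;      /* min(): low guess */
    }

    int main(void)
    {
        return !(biased_load(300, 500, 0) == 300 &&
                 biased_load(300, 500, 1) == 500);
    }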
2023 2347
2024static void task_waking_fair(struct task_struct *p) 2348static void task_waking_fair(struct task_struct *p)
2025{ 2349{
@@ -2327,7 +2651,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
2327 int prev_cpu = task_cpu(p); 2651 int prev_cpu = task_cpu(p);
2328 struct sched_domain *sd; 2652 struct sched_domain *sd;
2329 struct sched_group *sg; 2653 struct sched_group *sg;
2330 int i, smt = 0; 2654 int i;
2331 2655
2332 /* 2656 /*
2333 * If the task is going to be woken-up on this cpu and if it is 2657 * If the task is going to be woken-up on this cpu and if it is
@@ -2347,17 +2671,9 @@ static int select_idle_sibling(struct task_struct *p, int target)
2348 * Otherwise, iterate the domains and find an eligible idle cpu. 2672 * Otherwise, iterate the domains and find an eligible idle cpu.
2348 */ 2672 */
2349 rcu_read_lock(); 2673 rcu_read_lock();
2350again:
2351 for_each_domain(target, sd) {
2352 if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
2353 continue;
2354
2355 if (smt && !(sd->flags & SD_SHARE_CPUPOWER))
2356 break;
2357
2358 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
2359 break;
2360 2674
2675 sd = rcu_dereference(per_cpu(sd_llc, target));
2676 for_each_lower_domain(sd) {
2361 sg = sd->groups; 2677 sg = sd->groups;
2362 do { 2678 do {
2363 if (!cpumask_intersects(sched_group_cpus(sg), 2679 if (!cpumask_intersects(sched_group_cpus(sg),
@@ -2376,10 +2692,6 @@ next:
2376 sg = sg->next; 2692 sg = sg->next;
2377 } while (sg != sd->groups); 2693 } while (sg != sd->groups);
2378 } 2694 }
2379 if (!smt) {
2380 smt = 1;
2381 goto again;
2382 }
2383done: 2695done:
2384 rcu_read_unlock(); 2696 rcu_read_unlock();
2385 2697
@@ -2408,6 +2720,9 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2408 int want_sd = 1; 2720 int want_sd = 1;
2409 int sync = wake_flags & WF_SYNC; 2721 int sync = wake_flags & WF_SYNC;
2410 2722
2723 if (p->rt.nr_cpus_allowed == 1)
2724 return prev_cpu;
2725
2411 if (sd_flag & SD_BALANCE_WAKE) { 2726 if (sd_flag & SD_BALANCE_WAKE) {
2412 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) 2727 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
2413 want_affine = 1; 2728 want_affine = 1;
@@ -2692,7 +3007,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
2692 } while (cfs_rq); 3007 } while (cfs_rq);
2693 3008
2694 p = task_of(se); 3009 p = task_of(se);
2695 hrtick_start_fair(rq, p); 3010 if (hrtick_enabled(rq))
3011 hrtick_start_fair(rq, p);
2696 3012
2697 return p; 3013 return p;
2698} 3014}
@@ -2736,6 +3052,12 @@ static void yield_task_fair(struct rq *rq)
2736 * Update run-time statistics of the 'current'. 3052 * Update run-time statistics of the 'current'.
2737 */ 3053 */
2738 update_curr(cfs_rq); 3054 update_curr(cfs_rq);
3055 /*
3056 * Tell update_rq_clock() that we've just updated,
3057 * so we don't do microscopic update in schedule()
3058 * and double the fastpath cost.
3059 */
3060 rq->skip_clock_update = 1;
2739 } 3061 }
2740 3062
2741 set_skip_buddy(se); 3063 set_skip_buddy(se);
@@ -2776,12 +3098,48 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2776} 3098}
2777 3099
2778/* 3100/*
3101 * Is this task likely cache-hot:
3102 */
3103static int
3104task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3105{
3106 s64 delta;
3107
3108 if (p->sched_class != &fair_sched_class)
3109 return 0;
3110
3111 if (unlikely(p->policy == SCHED_IDLE))
3112 return 0;
3113
3114 /*
3115 * Buddy candidates are cache hot:
3116 */
3117 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3118 (&p->se == cfs_rq_of(&p->se)->next ||
3119 &p->se == cfs_rq_of(&p->se)->last))
3120 return 1;
3121
3122 if (sysctl_sched_migration_cost == -1)
3123 return 1;
3124 if (sysctl_sched_migration_cost == 0)
3125 return 0;
3126
3127 delta = now - p->se.exec_start;
3128
3129 return delta < (s64)sysctl_sched_migration_cost;
3130}
3131
3132#define LBF_ALL_PINNED 0x01
3133#define LBF_NEED_BREAK 0x02
3134#define LBF_ABORT 0x04
3135
3136/*
2779 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3137 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2780 */ 3138 */
2781static 3139static
2782int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 3140int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2783 struct sched_domain *sd, enum cpu_idle_type idle, 3141 struct sched_domain *sd, enum cpu_idle_type idle,
2784 int *all_pinned) 3142 int *lb_flags)
2785{ 3143{
2786 int tsk_cache_hot = 0; 3144 int tsk_cache_hot = 0;
2787 /* 3145 /*
@@ -2794,7 +3152,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2794 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3152 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
2795 return 0; 3153 return 0;
2796 } 3154 }
2797 *all_pinned = 0; 3155 *lb_flags &= ~LBF_ALL_PINNED;
2798 3156
2799 if (task_running(rq, p)) { 3157 if (task_running(rq, p)) {
2800 schedstat_inc(p, se.statistics.nr_failed_migrations_running); 3158 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
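can_migrate_task() now reports status through a small bitmask instead of the single all_pinned integer, so one out-parameter can carry several independent conditions. A self-contained sketch of the pattern (toy logic, not the kernel's):

#include <stdio.h>

#define LBF_ALL_PINNED  0x01
#define LBF_NEED_BREAK  0x02
#define LBF_ABORT       0x04

/* the callee sets/clears bits rather than writing one int */
static void scan_tasks(int movable, int *lb_flags)
{
        if (movable)
                *lb_flags &= ~LBF_ALL_PINNED;   /* found a movable task */
        else
                *lb_flags |= LBF_NEED_BREAK;    /* too much work, retry later */
}

int main(void)
{
        int lb_flags = LBF_ALL_PINNED;  /* pessimistic, like load_balance() */

        scan_tasks(1, &lb_flags);
        if (!(lb_flags & (LBF_NEED_BREAK | LBF_ABORT)))
                printf("balancing can proceed, flags=%#x\n", lb_flags);
        return 0;
}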
@@ -2868,7 +3226,7 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2868static unsigned long 3226static unsigned long
2869balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3227balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2870 unsigned long max_load_move, struct sched_domain *sd, 3228 unsigned long max_load_move, struct sched_domain *sd,
2871 enum cpu_idle_type idle, int *all_pinned, 3229 enum cpu_idle_type idle, int *lb_flags,
2872 struct cfs_rq *busiest_cfs_rq) 3230 struct cfs_rq *busiest_cfs_rq)
2873{ 3231{
2874 int loops = 0, pulled = 0; 3232 int loops = 0, pulled = 0;
@@ -2879,12 +3237,14 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2879 goto out; 3237 goto out;
2880 3238
2881 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 3239 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
2882 if (loops++ > sysctl_sched_nr_migrate) 3240 if (loops++ > sysctl_sched_nr_migrate) {
3241 *lb_flags |= LBF_NEED_BREAK;
2883 break; 3242 break;
3243 }
2884 3244
2885 if ((p->se.load.weight >> 1) > rem_load_move || 3245 if ((p->se.load.weight >> 1) > rem_load_move ||
2886 !can_migrate_task(p, busiest, this_cpu, sd, idle, 3246 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2887 all_pinned)) 3247 lb_flags))
2888 continue; 3248 continue;
2889 3249
2890 pull_task(busiest, p, this_rq, this_cpu); 3250 pull_task(busiest, p, this_rq, this_cpu);
@@ -2897,8 +3257,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2897 * kernels will stop after the first task is pulled to minimize 3257 * kernels will stop after the first task is pulled to minimize
2898 * the critical section. 3258 * the critical section.
2899 */ 3259 */
2900 if (idle == CPU_NEWLY_IDLE) 3260 if (idle == CPU_NEWLY_IDLE) {
3261 *lb_flags |= LBF_ABORT;
2901 break; 3262 break;
3263 }
2902#endif 3264#endif
2903 3265
2904 /* 3266 /*
@@ -3003,7 +3365,7 @@ static unsigned long
3003load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 3365load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3004 unsigned long max_load_move, 3366 unsigned long max_load_move,
3005 struct sched_domain *sd, enum cpu_idle_type idle, 3367 struct sched_domain *sd, enum cpu_idle_type idle,
3006 int *all_pinned) 3368 int *lb_flags)
3007{ 3369{
3008 long rem_load_move = max_load_move; 3370 long rem_load_move = max_load_move;
3009 struct cfs_rq *busiest_cfs_rq; 3371 struct cfs_rq *busiest_cfs_rq;
@@ -3016,6 +3378,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3016 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 3378 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
3017 u64 rem_load, moved_load; 3379 u64 rem_load, moved_load;
3018 3380
3381 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3382 break;
3383
3019 /* 3384 /*
3020 * empty group or part of a throttled hierarchy 3385 * empty group or part of a throttled hierarchy
3021 */ 3386 */
@@ -3027,7 +3392,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3027 rem_load = div_u64(rem_load, busiest_h_load + 1); 3392 rem_load = div_u64(rem_load, busiest_h_load + 1);
3028 3393
3029 moved_load = balance_tasks(this_rq, this_cpu, busiest, 3394 moved_load = balance_tasks(this_rq, this_cpu, busiest,
3030 rem_load, sd, idle, all_pinned, 3395 rem_load, sd, idle, lb_flags,
3031 busiest_cfs_rq); 3396 busiest_cfs_rq);
3032 3397
3033 if (!moved_load) 3398 if (!moved_load)
@@ -3053,10 +3418,10 @@ static unsigned long
3053load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 3418load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3054 unsigned long max_load_move, 3419 unsigned long max_load_move,
3055 struct sched_domain *sd, enum cpu_idle_type idle, 3420 struct sched_domain *sd, enum cpu_idle_type idle,
3056 int *all_pinned) 3421 int *lb_flags)
3057{ 3422{
3058 return balance_tasks(this_rq, this_cpu, busiest, 3423 return balance_tasks(this_rq, this_cpu, busiest,
3059 max_load_move, sd, idle, all_pinned, 3424 max_load_move, sd, idle, lb_flags,
3060 &busiest->cfs); 3425 &busiest->cfs);
3061} 3426}
3062#endif 3427#endif
@@ -3071,29 +3436,30 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3071static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3436static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3072 unsigned long max_load_move, 3437 unsigned long max_load_move,
3073 struct sched_domain *sd, enum cpu_idle_type idle, 3438 struct sched_domain *sd, enum cpu_idle_type idle,
3074 int *all_pinned) 3439 int *lb_flags)
3075{ 3440{
3076 unsigned long total_load_moved = 0, load_moved; 3441 unsigned long total_load_moved = 0, load_moved;
3077 3442
3078 do { 3443 do {
3079 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 3444 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
3080 max_load_move - total_load_moved, 3445 max_load_move - total_load_moved,
3081 sd, idle, all_pinned); 3446 sd, idle, lb_flags);
3082 3447
3083 total_load_moved += load_moved; 3448 total_load_moved += load_moved;
3084 3449
3450 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3451 break;
3452
3085#ifdef CONFIG_PREEMPT 3453#ifdef CONFIG_PREEMPT
3086 /* 3454 /*
3087 * NEWIDLE balancing is a source of latency, so preemptible 3455 * NEWIDLE balancing is a source of latency, so preemptible
3088 * kernels will stop after the first task is pulled to minimize 3456 * kernels will stop after the first task is pulled to minimize
3089 * the critical section. 3457 * the critical section.
3090 */ 3458 */
3091 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3459 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
3092 break; 3460 *lb_flags |= LBF_ABORT;
3093
3094 if (raw_spin_is_contended(&this_rq->lock) ||
3095 raw_spin_is_contended(&busiest->lock))
3096 break; 3461 break;
3462 }
3097#endif 3463#endif
3098 } while (load_moved && max_load_move > total_load_moved); 3464 } while (load_moved && max_load_move > total_load_moved);
3099 3465
@@ -3155,15 +3521,6 @@ struct sg_lb_stats {
3155}; 3521};
3156 3522
3157/** 3523/**
3158 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3159 * @group: The group whose first cpu is to be returned.
3160 */
3161static inline unsigned int group_first_cpu(struct sched_group *group)
3162{
3163 return cpumask_first(sched_group_cpus(group));
3164}
3165
3166/**
3167 * get_sd_load_idx - Obtain the load index for a given sched domain. 3524 * get_sd_load_idx - Obtain the load index for a given sched domain.
3168 * @sd: The sched_domain whose load_idx is to be obtained. 3525 * @sd: The sched_domain whose load_idx is to be obtained.
3169 * @idle: The Idle status of the CPU for whose sd load_idx is obtained. 3526 * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
@@ -3412,7 +3769,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
3412 sdg->sgp->power = power; 3769 sdg->sgp->power = power;
3413} 3770}
3414 3771
3415static void update_group_power(struct sched_domain *sd, int cpu) 3772void update_group_power(struct sched_domain *sd, int cpu)
3416{ 3773{
3417 struct sched_domain *child = sd->child; 3774 struct sched_domain *child = sd->child;
3418 struct sched_group *group, *sdg = sd->groups; 3775 struct sched_group *group, *sdg = sd->groups;
@@ -3678,11 +4035,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3678 } while (sg != sd->groups); 4035 } while (sg != sd->groups);
3679} 4036}
3680 4037
3681int __weak arch_sd_sibling_asym_packing(void)
3682{
3683 return 0*SD_ASYM_PACKING;
3684}
3685
3686/** 4038/**
3687 * check_asym_packing - Check to see if the group is packed into the 4039 * check_asym_packing - Check to see if the group is packed into the
3688 * sched domain. 4040 * sched domain.
@@ -4046,7 +4398,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4046#define MAX_PINNED_INTERVAL 512 4398#define MAX_PINNED_INTERVAL 512
4047 4399
4048/* Working cpumask for load_balance and load_balance_newidle. */ 4400/* Working cpumask for load_balance and load_balance_newidle. */
4049static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4401DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4050 4402
4051static int need_active_balance(struct sched_domain *sd, int idle, 4403static int need_active_balance(struct sched_domain *sd, int idle,
4052 int busiest_cpu, int this_cpu) 4404 int busiest_cpu, int this_cpu)
@@ -4097,7 +4449,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4097 struct sched_domain *sd, enum cpu_idle_type idle, 4449 struct sched_domain *sd, enum cpu_idle_type idle,
4098 int *balance) 4450 int *balance)
4099{ 4451{
4100 int ld_moved, all_pinned = 0, active_balance = 0; 4452 int ld_moved, lb_flags = 0, active_balance = 0;
4101 struct sched_group *group; 4453 struct sched_group *group;
4102 unsigned long imbalance; 4454 unsigned long imbalance;
4103 struct rq *busiest; 4455 struct rq *busiest;
@@ -4138,11 +4490,11 @@ redo:
4138 * still unbalanced. ld_moved simply stays zero, so it is 4490 * still unbalanced. ld_moved simply stays zero, so it is
4139 * correctly treated as an imbalance. 4491 * correctly treated as an imbalance.
4140 */ 4492 */
4141 all_pinned = 1; 4493 lb_flags |= LBF_ALL_PINNED;
4142 local_irq_save(flags); 4494 local_irq_save(flags);
4143 double_rq_lock(this_rq, busiest); 4495 double_rq_lock(this_rq, busiest);
4144 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4496 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4145 imbalance, sd, idle, &all_pinned); 4497 imbalance, sd, idle, &lb_flags);
4146 double_rq_unlock(this_rq, busiest); 4498 double_rq_unlock(this_rq, busiest);
4147 local_irq_restore(flags); 4499 local_irq_restore(flags);
4148 4500
@@ -4152,8 +4504,16 @@ redo:
4152 if (ld_moved && this_cpu != smp_processor_id()) 4504 if (ld_moved && this_cpu != smp_processor_id())
4153 resched_cpu(this_cpu); 4505 resched_cpu(this_cpu);
4154 4506
4507 if (lb_flags & LBF_ABORT)
4508 goto out_balanced;
4509
4510 if (lb_flags & LBF_NEED_BREAK) {
4511 lb_flags &= ~LBF_NEED_BREAK;
4512 goto redo;
4513 }
4514
4155 /* All tasks on this runqueue were pinned by CPU affinity */ 4515 /* All tasks on this runqueue were pinned by CPU affinity */
4156 if (unlikely(all_pinned)) { 4516 if (unlikely(lb_flags & LBF_ALL_PINNED)) {
4157 cpumask_clear_cpu(cpu_of(busiest), cpus); 4517 cpumask_clear_cpu(cpu_of(busiest), cpus);
4158 if (!cpumask_empty(cpus)) 4518 if (!cpumask_empty(cpus))
4159 goto redo; 4519 goto redo;
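The LBF_NEED_BREAK path above is a consume-and-retry idiom: the flag must be cleared before jumping back to redo, or the loop would never terminate. A toy, self-contained illustration of chunked work driven by the same idiom (numbers invented):

#include <stdio.h>

#define LBF_NEED_BREAK 0x02

static int do_chunk(int *flags, int *work_left)
{
        /* pretend only 4 items may be processed per lock-hold */
        int n = *work_left > 4 ? 4 : *work_left;

        *work_left -= n;
        if (*work_left)
                *flags |= LBF_NEED_BREAK;   /* more to do, drop locks first */
        return n;
}

int main(void)
{
        int flags = 0, work = 10, moved = 0;
redo:
        moved += do_chunk(&flags, &work);
        if (flags & LBF_NEED_BREAK) {
                flags &= ~LBF_NEED_BREAK;   /* must clear, or we spin forever */
                goto redo;
        }
        printf("moved %d items\n", moved);
        return 0;
}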
@@ -4183,7 +4543,7 @@ redo:
4183 tsk_cpus_allowed(busiest->curr))) { 4543 tsk_cpus_allowed(busiest->curr))) {
4184 raw_spin_unlock_irqrestore(&busiest->lock, 4544 raw_spin_unlock_irqrestore(&busiest->lock,
4185 flags); 4545 flags);
4186 all_pinned = 1; 4546 lb_flags |= LBF_ALL_PINNED;
4187 goto out_one_pinned; 4547 goto out_one_pinned;
4188 } 4548 }
4189 4549
@@ -4236,7 +4596,8 @@ out_balanced:
4236 4596
4237out_one_pinned: 4597out_one_pinned:
4238 /* tune up the balancing interval */ 4598 /* tune up the balancing interval */
4239 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 4599 if (((lb_flags & LBF_ALL_PINNED) &&
4600 sd->balance_interval < MAX_PINNED_INTERVAL) ||
4240 (sd->balance_interval < sd->max_interval)) 4601 (sd->balance_interval < sd->max_interval))
4241 sd->balance_interval *= 2; 4602 sd->balance_interval *= 2;
4242 4603
@@ -4249,7 +4610,7 @@ out:
4249 * idle_balance is called by schedule() if this_cpu is about to become 4610 * idle_balance is called by schedule() if this_cpu is about to become
4250 * idle. Attempts to pull tasks from other CPUs. 4611 * idle. Attempts to pull tasks from other CPUs.
4251 */ 4612 */
4252static void idle_balance(int this_cpu, struct rq *this_rq) 4613void idle_balance(int this_cpu, struct rq *this_rq)
4253{ 4614{
4254 struct sched_domain *sd; 4615 struct sched_domain *sd;
4255 int pulled_task = 0; 4616 int pulled_task = 0;
@@ -4364,28 +4725,16 @@ out_unlock:
4364#ifdef CONFIG_NO_HZ 4725#ifdef CONFIG_NO_HZ
4365/* 4726/*
4366 * idle load balancing details 4727 * idle load balancing details
4367 * - One of the idle CPUs nominates itself as idle load_balancer, while
4368 * entering idle.
4369 * - This idle load balancer CPU will also go into tickless mode when
4370 * it is idle, just like all other idle CPUs
4371 * - When one of the busy CPUs notices that there may be an idle rebalancing 4728 * - When one of the busy CPUs notices that there may be an idle rebalancing
4372 * needed, they will kick the idle load balancer, which then does idle 4729 * needed, they will kick the idle load balancer, which then does idle
4373 * load balancing for all the idle CPUs. 4730 * load balancing for all the idle CPUs.
4374 */ 4731 */
4375static struct { 4732static struct {
4376 atomic_t load_balancer;
4377 atomic_t first_pick_cpu;
4378 atomic_t second_pick_cpu;
4379 cpumask_var_t idle_cpus_mask; 4733 cpumask_var_t idle_cpus_mask;
4380 cpumask_var_t grp_idle_mask; 4734 atomic_t nr_cpus;
4381 unsigned long next_balance; /* in jiffy units */ 4735 unsigned long next_balance; /* in jiffy units */
4382} nohz ____cacheline_aligned; 4736} nohz ____cacheline_aligned;
4383 4737
4384int get_nohz_load_balancer(void)
4385{
4386 return atomic_read(&nohz.load_balancer);
4387}
4388
4389#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4738#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4390/** 4739/**
4391 * lowest_flag_domain - Return lowest sched_domain containing flag. 4740 * lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -4422,33 +4771,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4422 (sd && (sd->flags & flag)); sd = sd->parent) 4771 (sd && (sd->flags & flag)); sd = sd->parent)
4423 4772
4424/** 4773/**
4425 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4426 * @ilb_group: group to be checked for semi-idleness
4427 *
4428 * Returns: 1 if the group is semi-idle. 0 otherwise.
4429 *
4430 * We define a sched_group to be semi idle if it has atleast one idle-CPU
4431 * and atleast one non-idle CPU. This helper function checks if the given
4432 * sched_group is semi-idle or not.
4433 */
4434static inline int is_semi_idle_group(struct sched_group *ilb_group)
4435{
4436 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
4437 sched_group_cpus(ilb_group));
4438
4439 /*
4440 * A sched_group is semi-idle when it has atleast one busy cpu
4441 * and atleast one idle cpu.
4442 */
4443 if (cpumask_empty(nohz.grp_idle_mask))
4444 return 0;
4445
4446 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
4447 return 0;
4448
4449 return 1;
4450}
4451/**
4452 * find_new_ilb - Finds the optimum idle load balancer for nomination. 4774 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4453 * @cpu: The cpu which is nominating a new idle_load_balancer. 4775 * @cpu: The cpu which is nominating a new idle_load_balancer.
4454 * 4776 *
@@ -4462,9 +4784,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group)
4462 */ 4784 */
4463static int find_new_ilb(int cpu) 4785static int find_new_ilb(int cpu)
4464{ 4786{
4787 int ilb = cpumask_first(nohz.idle_cpus_mask);
4788 struct sched_group *ilbg;
4465 struct sched_domain *sd; 4789 struct sched_domain *sd;
4466 struct sched_group *ilb_group;
4467 int ilb = nr_cpu_ids;
4468 4790
4469 /* 4791 /*
4470 * Have idle load balancer selection from semi-idle packages only 4792 * Have idle load balancer selection from semi-idle packages only
@@ -4482,23 +4804,28 @@ static int find_new_ilb(int cpu)
4482 4804
4483 rcu_read_lock(); 4805 rcu_read_lock();
4484 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 4806 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4485 ilb_group = sd->groups; 4807 ilbg = sd->groups;
4486 4808
4487 do { 4809 do {
4488 if (is_semi_idle_group(ilb_group)) { 4810 if (ilbg->group_weight !=
4489 ilb = cpumask_first(nohz.grp_idle_mask); 4811 atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4812 ilb = cpumask_first_and(nohz.idle_cpus_mask,
4813 sched_group_cpus(ilbg));
4490 goto unlock; 4814 goto unlock;
4491 } 4815 }
4492 4816
4493 ilb_group = ilb_group->next; 4817 ilbg = ilbg->next;
4494 4818
4495 } while (ilb_group != sd->groups); 4819 } while (ilbg != sd->groups);
4496 } 4820 }
4497unlock: 4821unlock:
4498 rcu_read_unlock(); 4822 rcu_read_unlock();
4499 4823
4500out_done: 4824out_done:
4501 return ilb; 4825 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4826 return ilb;
4827
4828 return nr_cpu_ids;
4502} 4829}
4503#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 4830#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4504static inline int find_new_ilb(int call_cpu) 4831static inline int find_new_ilb(int call_cpu)
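With nr_busy_cpus now maintained per group, a group is "semi-idle" exactly when its weight differs from its busy count, and the candidate ilb is the first cpu that is both idle and inside the group. A userspace sketch of the cpumask_first_and() arithmetic on toy bitmask cpumasks (the kernel uses struct cpumask and nr_cpu_ids):

#include <stdio.h>

static int first_and(unsigned long a, unsigned long b)
{
        unsigned long both = a & b;

        return both ? __builtin_ctzl(both) : 64;   /* 64 ~ nr_cpu_ids */
}

int main(void)
{
        unsigned long idle_cpus = 0x0c;  /* cpus 2,3 idle */
        unsigned long group     = 0x0f;  /* group spans cpus 0-3 */

        /* group weight 4, busy count 2 -> semi-idle; pick cpu 2 */
        printf("ilb = cpu %d\n", first_and(idle_cpus, group));
        return 0;
}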
@@ -4518,99 +4845,68 @@ static void nohz_balancer_kick(int cpu)
4518 4845
4519 nohz.next_balance++; 4846 nohz.next_balance++;
4520 4847
4521 ilb_cpu = get_nohz_load_balancer(); 4848 ilb_cpu = find_new_ilb(cpu);
4522
4523 if (ilb_cpu >= nr_cpu_ids) {
4524 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
4525 if (ilb_cpu >= nr_cpu_ids)
4526 return;
4527 }
4528 4849
4529 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { 4850 if (ilb_cpu >= nr_cpu_ids)
4530 cpu_rq(ilb_cpu)->nohz_balance_kick = 1; 4851 return;
4531 4852
4532 smp_mb(); 4853 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
4533 /* 4854 return;
4534 * Use smp_send_reschedule() instead of resched_cpu(). 4855 /*
4535 * This way we generate a sched IPI on the target cpu which 4856 * Use smp_send_reschedule() instead of resched_cpu().
4536 * is idle. And the softirq performing nohz idle load balance 4857 * This way we generate a sched IPI on the target cpu which
4537 * will be run before returning from the IPI. 4858 * is idle. And the softirq performing nohz idle load balance
4538 */ 4859 * will be run before returning from the IPI.
4539 smp_send_reschedule(ilb_cpu); 4860 */
4540 } 4861 smp_send_reschedule(ilb_cpu);
4541 return; 4862 return;
4542} 4863}
4543 4864
4544/* 4865static inline void set_cpu_sd_state_busy(void)
4545 * This routine will try to nominate the ilb (idle load balancing)
4546 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4547 * load balancing on behalf of all those cpus.
4548 *
4549 * When the ilb owner becomes busy, we will not have new ilb owner until some
4550 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
4551 * idle load balancing by kicking one of the idle CPUs.
4552 *
4553 * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
4554 * ilb owner CPU in future (when there is a need for idle load balancing on
4555 * behalf of all idle CPUs).
4556 */
4557void select_nohz_load_balancer(int stop_tick)
4558{ 4866{
4867 struct sched_domain *sd;
4559 int cpu = smp_processor_id(); 4868 int cpu = smp_processor_id();
4560 4869
4561 if (stop_tick) { 4870 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4562 if (!cpu_active(cpu)) { 4871 return;
4563 if (atomic_read(&nohz.load_balancer) != cpu) 4872 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
4564 return;
4565
4566 /*
4567 * If we are going offline and still the leader,
4568 * give up!
4569 */
4570 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
4571 nr_cpu_ids) != cpu)
4572 BUG();
4573 4873
4574 return; 4874 rcu_read_lock();
4575 } 4875 for_each_domain(cpu, sd)
4876 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
4877 rcu_read_unlock();
4878}
4576 4879
4577 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 4880void set_cpu_sd_state_idle(void)
4881{
4882 struct sched_domain *sd;
4883 int cpu = smp_processor_id();
4578 4884
4579 if (atomic_read(&nohz.first_pick_cpu) == cpu) 4885 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4580 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); 4886 return;
4581 if (atomic_read(&nohz.second_pick_cpu) == cpu) 4887 set_bit(NOHZ_IDLE, nohz_flags(cpu));
4582 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
4583 4888
4584 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { 4889 rcu_read_lock();
4585 int new_ilb; 4890 for_each_domain(cpu, sd)
4891 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
4892 rcu_read_unlock();
4893}
4586 4894
4587 /* make me the ilb owner */ 4895/*
4588 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, 4896 * This routine will record that this cpu is going idle with tick stopped.
4589 cpu) != nr_cpu_ids) 4897 * This info will be used in performing idle load balancing in the future.
4590 return; 4898 */
4899void select_nohz_load_balancer(int stop_tick)
4900{
4901 int cpu = smp_processor_id();
4591 4902
4592 /* 4903 if (stop_tick) {
4593 * Check to see if there is a more power-efficient 4904 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4594 * ilb.
4595 */
4596 new_ilb = find_new_ilb(cpu);
4597 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4598 atomic_set(&nohz.load_balancer, nr_cpu_ids);
4599 resched_cpu(new_ilb);
4600 return;
4601 }
4602 return;
4603 }
4604 } else {
4605 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
4606 return; 4905 return;
4607 4906
4608 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 4907 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4609 4908 atomic_inc(&nohz.nr_cpus);
4610 if (atomic_read(&nohz.load_balancer) == cpu) 4909 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4611 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
4612 nr_cpu_ids) != cpu)
4613 BUG();
4614 } 4910 }
4615 return; 4911 return;
4616} 4912}
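The state that used to live in rq->nohz_balance_kick and the nohz atomics is now a set of per-cpu bits (NOHZ_TICK_STOPPED, NOHZ_BALANCE_KICK, NOHZ_IDLE) manipulated with test_and_set_bit() and friends; their definitions live in the new kernel/sched/sched.h, not shown in this hunk. A non-atomic userspace sketch of the lifecycle (bit numbers assumed for illustration):

#include <stdio.h>

enum { NOHZ_TICK_STOPPED, NOHZ_BALANCE_KICK, NOHZ_IDLE };  /* bit numbers */

int main(void)
{
        unsigned long flags = 0;

        flags |= 1UL << NOHZ_TICK_STOPPED;      /* cpu entered tickless idle */

        /* kicker side: mimics test_and_set_bit(NOHZ_BALANCE_KICK, ...) */
        if (!(flags & (1UL << NOHZ_BALANCE_KICK))) {
                flags |= 1UL << NOHZ_BALANCE_KICK;
                printf("kick: send reschedule IPI to the idle cpu\n");
        }

        flags &= ~(1UL << NOHZ_BALANCE_KICK);   /* cleared after balancing */
        return 0;
}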
@@ -4624,7 +4920,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4624 * Scale the max load_balance interval with the number of CPUs in the system. 4920 * Scale the max load_balance interval with the number of CPUs in the system.
4625 * This trades load-balance latency on larger machines for less cross talk. 4921 * This trades load-balance latency on larger machines for less cross talk.
4626 */ 4922 */
4627static void update_max_interval(void) 4923void update_max_interval(void)
4628{ 4924{
4629 max_load_balance_interval = HZ*num_online_cpus()/10; 4925 max_load_balance_interval = HZ*num_online_cpus()/10;
4630} 4926}
@@ -4716,11 +5012,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4716 struct rq *rq; 5012 struct rq *rq;
4717 int balance_cpu; 5013 int balance_cpu;
4718 5014
4719 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) 5015 if (idle != CPU_IDLE ||
4720 return; 5016 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5017 goto end;
4721 5018
4722 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { 5019 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
4723 if (balance_cpu == this_cpu) 5020 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
4724 continue; 5021 continue;
4725 5022
4726 /* 5023 /*
@@ -4728,10 +5025,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4728 * work being done for other cpus. Next load 5025 * work being done for other cpus. Next load
4729 * balancing owner will pick it up. 5026 * balancing owner will pick it up.
4730 */ 5027 */
4731 if (need_resched()) { 5028 if (need_resched())
4732 this_rq->nohz_balance_kick = 0;
4733 break; 5029 break;
4734 }
4735 5030
4736 raw_spin_lock_irq(&this_rq->lock); 5031 raw_spin_lock_irq(&this_rq->lock);
4737 update_rq_clock(this_rq); 5032 update_rq_clock(this_rq);
@@ -4745,53 +5040,75 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4745 this_rq->next_balance = rq->next_balance; 5040 this_rq->next_balance = rq->next_balance;
4746 } 5041 }
4747 nohz.next_balance = this_rq->next_balance; 5042 nohz.next_balance = this_rq->next_balance;
4748 this_rq->nohz_balance_kick = 0; 5043end:
5044 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
4749} 5045}
4750 5046
4751/* 5047/*
4752 * Current heuristic for kicking the idle load balancer 5048 * Current heuristic for kicking the idle load balancer in the presence
4753 * - first_pick_cpu is one of the busy CPUs. It will kick 5049 * of an idle cpu in the system.
4754 * idle load balancer when it has more than one process active. This 5050 * - This rq has more than one task.
4755 * eliminates the need for idle load balancing altogether when we have 5051 * - At any scheduler domain level, this cpu's scheduler group has multiple
4756 * only one running process in the system (common case). 5052 * busy cpus exceeding the group's power.
4757 * - If there is more than one busy CPU, idle load balancer may have 5053 * - For SD_ASYM_PACKING, if the lower numbered cpus in the scheduler
4758 * to run for active_load_balance to happen (i.e., two busy CPUs are 5054 * domain span are idle.
4759 * SMT or core siblings and can run better if they move to different
4760 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
4761 * which will kick idle load balancer as soon as it has any load.
4762 */ 5055 */
4763static inline int nohz_kick_needed(struct rq *rq, int cpu) 5056static inline int nohz_kick_needed(struct rq *rq, int cpu)
4764{ 5057{
4765 unsigned long now = jiffies; 5058 unsigned long now = jiffies;
4766 int ret; 5059 struct sched_domain *sd;
4767 int first_pick_cpu, second_pick_cpu;
4768 5060
4769 if (time_before(now, nohz.next_balance)) 5061 if (unlikely(idle_cpu(cpu)))
4770 return 0; 5062 return 0;
4771 5063
4772 if (idle_cpu(cpu)) 5064 /*
4773 return 0; 5065 * We may recently have been in ticked or tickless idle mode. At the first
5066 * busy tick after returning from idle, we will update the busy stats.
5067 */
5068 set_cpu_sd_state_busy();
5069 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5070 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5071 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5072 atomic_dec(&nohz.nr_cpus);
5073 }
4774 5074
4775 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 5075 /*
4776 second_pick_cpu = atomic_read(&nohz.second_pick_cpu); 5076 * None are in tickless mode and hence no need for NOHZ idle load
5077 * balancing.
5078 */
5079 if (likely(!atomic_read(&nohz.nr_cpus)))
5080 return 0;
4777 5081
4778 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && 5082 if (time_before(now, nohz.next_balance))
4779 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
4780 return 0; 5083 return 0;
4781 5084
4782 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); 5085 if (rq->nr_running >= 2)
4783 if (ret == nr_cpu_ids || ret == cpu) { 5086 goto need_kick;
4784 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); 5087
4785 if (rq->nr_running > 1) 5088 rcu_read_lock();
4786 return 1; 5089 for_each_domain(cpu, sd) {
4787 } else { 5090 struct sched_group *sg = sd->groups;
4788 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); 5091 struct sched_group_power *sgp = sg->sgp;
4789 if (ret == nr_cpu_ids || ret == cpu) { 5092 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
4790 if (rq->nr_running) 5093
4791 return 1; 5094 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
4792 } 5095 goto need_kick_unlock;
5096
5097 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5098 && (cpumask_first_and(nohz.idle_cpus_mask,
5099 sched_domain_span(sd)) < cpu))
5100 goto need_kick_unlock;
5101
5102 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5103 break;
4793 } 5104 }
5105 rcu_read_unlock();
4794 return 0; 5106 return 0;
5107
5108need_kick_unlock:
5109 rcu_read_unlock();
5110need_kick:
5111 return 1;
4795} 5112}
4796#else 5113#else
4797static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } 5114static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
@@ -4826,14 +5143,14 @@ static inline int on_null_domain(int cpu)
4826/* 5143/*
4827 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 5144 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4828 */ 5145 */
4829static inline void trigger_load_balance(struct rq *rq, int cpu) 5146void trigger_load_balance(struct rq *rq, int cpu)
4830{ 5147{
4831 /* Don't need to rebalance while attached to NULL domain */ 5148 /* Don't need to rebalance while attached to NULL domain */
4832 if (time_after_eq(jiffies, rq->next_balance) && 5149 if (time_after_eq(jiffies, rq->next_balance) &&
4833 likely(!on_null_domain(cpu))) 5150 likely(!on_null_domain(cpu)))
4834 raise_softirq(SCHED_SOFTIRQ); 5151 raise_softirq(SCHED_SOFTIRQ);
4835#ifdef CONFIG_NO_HZ 5152#ifdef CONFIG_NO_HZ
4836 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 5153 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
4837 nohz_balancer_kick(cpu); 5154 nohz_balancer_kick(cpu);
4838#endif 5155#endif
4839} 5156}
@@ -4848,15 +5165,6 @@ static void rq_offline_fair(struct rq *rq)
4848 update_sysctl(); 5165 update_sysctl();
4849} 5166}
4850 5167
4851#else /* CONFIG_SMP */
4852
4853/*
4854 * on UP we do not need to balance between CPUs:
4855 */
4856static inline void idle_balance(int cpu, struct rq *rq)
4857{
4858}
4859
4860#endif /* CONFIG_SMP */ 5168#endif /* CONFIG_SMP */
4861 5169
4862/* 5170/*
@@ -4880,8 +5188,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4880 */ 5188 */
4881static void task_fork_fair(struct task_struct *p) 5189static void task_fork_fair(struct task_struct *p)
4882{ 5190{
4883 struct cfs_rq *cfs_rq = task_cfs_rq(current); 5191 struct cfs_rq *cfs_rq;
4884 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 5192 struct sched_entity *se = &p->se, *curr;
4885 int this_cpu = smp_processor_id(); 5193 int this_cpu = smp_processor_id();
4886 struct rq *rq = this_rq(); 5194 struct rq *rq = this_rq();
4887 unsigned long flags; 5195 unsigned long flags;
@@ -4890,6 +5198,9 @@ static void task_fork_fair(struct task_struct *p)
4890 5198
4891 update_rq_clock(rq); 5199 update_rq_clock(rq);
4892 5200
5201 cfs_rq = task_cfs_rq(current);
5202 curr = cfs_rq->curr;
5203
4893 if (unlikely(task_cpu(p) != this_cpu)) { 5204 if (unlikely(task_cpu(p) != this_cpu)) {
4894 rcu_read_lock(); 5205 rcu_read_lock();
4895 __set_task_cpu(p, this_cpu); 5206 __set_task_cpu(p, this_cpu);
@@ -4999,6 +5310,16 @@ static void set_curr_task_fair(struct rq *rq)
4999 } 5310 }
5000} 5311}
5001 5312
5313void init_cfs_rq(struct cfs_rq *cfs_rq)
5314{
5315 cfs_rq->tasks_timeline = RB_ROOT;
5316 INIT_LIST_HEAD(&cfs_rq->tasks);
5317 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5318#ifndef CONFIG_64BIT
5319 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5320#endif
5321}
5322
5002#ifdef CONFIG_FAIR_GROUP_SCHED 5323#ifdef CONFIG_FAIR_GROUP_SCHED
5003static void task_move_group_fair(struct task_struct *p, int on_rq) 5324static void task_move_group_fair(struct task_struct *p, int on_rq)
5004{ 5325{
@@ -5015,13 +5336,182 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
5015 * to another cgroup's rq. This does somewhat interfere with the 5336 * to another cgroup's rq. This does somewhat interfere with the
5016 * fair sleeper stuff for the first placement, but who cares. 5337 * fair sleeper stuff for the first placement, but who cares.
5017 */ 5338 */
5339 /*
5340 * When !on_rq, vruntime of the task has usually NOT been normalized.
5341 * But there are some cases where it has already been normalized:
5342 *
5343 * - Moving a forked child which is waiting to be woken up by
5344 * wake_up_new_task().
5345 * - Moving a task which has been woken up by try_to_wake_up() and is
5346 * waiting to actually be woken up by sched_ttwu_pending().
5347 *
5348 * To prevent a boost or penalty in the new cfs_rq caused by the delta
5349 * in min_vruntime between the two cfs_rqs, we skip the vruntime adjustment.
5350 */
5351 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5352 on_rq = 1;
5353
5018 if (!on_rq) 5354 if (!on_rq)
5019 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 5355 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5020 set_task_rq(p, task_cpu(p)); 5356 set_task_rq(p, task_cpu(p));
5021 if (!on_rq) 5357 if (!on_rq)
5022 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; 5358 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
5023} 5359}
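A worked example of the subtract-then-add renormalization above (illustrative numbers):

/*
 * A sleeping task with vruntime 1,000,500 leaves a cfs_rq whose
 * min_vruntime is 1,000,000 and joins one whose min_vruntime is 50,000:
 *
 *     relative lag = 1,000,500 - 1,000,000 = 500
 *     new vruntime = 50,000 + 500 = 50,500
 *
 * Without this, the task would be boosted or starved by the (arbitrary)
 * difference between the two runqueues' clocks.
 */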
5360
5361void free_fair_sched_group(struct task_group *tg)
5362{
5363 int i;
5364
5365 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5366
5367 for_each_possible_cpu(i) {
5368 if (tg->cfs_rq)
5369 kfree(tg->cfs_rq[i]);
5370 if (tg->se)
5371 kfree(tg->se[i]);
5372 }
5373
5374 kfree(tg->cfs_rq);
5375 kfree(tg->se);
5376}
5377
5378int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5379{
5380 struct cfs_rq *cfs_rq;
5381 struct sched_entity *se;
5382 int i;
5383
5384 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5385 if (!tg->cfs_rq)
5386 goto err;
5387 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5388 if (!tg->se)
5389 goto err;
5390
5391 tg->shares = NICE_0_LOAD;
5392
5393 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5394
5395 for_each_possible_cpu(i) {
5396 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5397 GFP_KERNEL, cpu_to_node(i));
5398 if (!cfs_rq)
5399 goto err;
5400
5401 se = kzalloc_node(sizeof(struct sched_entity),
5402 GFP_KERNEL, cpu_to_node(i));
5403 if (!se)
5404 goto err_free_rq;
5405
5406 init_cfs_rq(cfs_rq);
5407 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5408 }
5409
5410 return 1;
5411
5412err_free_rq:
5413 kfree(cfs_rq);
5414err:
5415 return 0;
5416}
5417
5418void unregister_fair_sched_group(struct task_group *tg, int cpu)
5419{
5420 struct rq *rq = cpu_rq(cpu);
5421 unsigned long flags;
5422
5423 /*
5424 * Only empty task groups can be destroyed; so we can speculatively
5425 * check on_list without danger of it being re-added.
5426 */
5427 if (!tg->cfs_rq[cpu]->on_list)
5428 return;
5429
5430 raw_spin_lock_irqsave(&rq->lock, flags);
5431 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
5432 raw_spin_unlock_irqrestore(&rq->lock, flags);
5433}
5434
5435void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5436 struct sched_entity *se, int cpu,
5437 struct sched_entity *parent)
5438{
5439 struct rq *rq = cpu_rq(cpu);
5440
5441 cfs_rq->tg = tg;
5442 cfs_rq->rq = rq;
5443#ifdef CONFIG_SMP
5444 /* allow initial update_cfs_load() to truncate */
5445 cfs_rq->load_stamp = 1;
5024#endif 5446#endif
5447 init_cfs_rq_runtime(cfs_rq);
5448
5449 tg->cfs_rq[cpu] = cfs_rq;
5450 tg->se[cpu] = se;
5451
5452 /* se could be NULL for root_task_group */
5453 if (!se)
5454 return;
5455
5456 if (!parent)
5457 se->cfs_rq = &rq->cfs;
5458 else
5459 se->cfs_rq = parent->my_q;
5460
5461 se->my_q = cfs_rq;
5462 update_load_set(&se->load, 0);
5463 se->parent = parent;
5464}
5465
5466static DEFINE_MUTEX(shares_mutex);
5467
5468int sched_group_set_shares(struct task_group *tg, unsigned long shares)
5469{
5470 int i;
5471 unsigned long flags;
5472
5473 /*
5474 * We can't change the weight of the root cgroup.
5475 */
5476 if (!tg->se[0])
5477 return -EINVAL;
5478
5479 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
5480
5481 mutex_lock(&shares_mutex);
5482 if (tg->shares == shares)
5483 goto done;
5484
5485 tg->shares = shares;
5486 for_each_possible_cpu(i) {
5487 struct rq *rq = cpu_rq(i);
5488 struct sched_entity *se;
5489
5490 se = tg->se[i];
5491 /* Propagate contribution to hierarchy */
5492 raw_spin_lock_irqsave(&rq->lock, flags);
5493 for_each_sched_entity(se)
5494 update_cfs_shares(group_cfs_rq(se));
5495 raw_spin_unlock_irqrestore(&rq->lock, flags);
5496 }
5497
5498done:
5499 mutex_unlock(&shares_mutex);
5500 return 0;
5501}
5502#else /* CONFIG_FAIR_GROUP_SCHED */
5503
5504void free_fair_sched_group(struct task_group *tg) { }
5505
5506int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5507{
5508 return 1;
5509}
5510
5511void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
5512
5513#endif /* CONFIG_FAIR_GROUP_SCHED */
5514
5025 5515
5026static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 5516static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
5027{ 5517{
@@ -5041,7 +5531,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
5041/* 5531/*
5042 * All the scheduling class methods: 5532 * All the scheduling class methods:
5043 */ 5533 */
5044static const struct sched_class fair_sched_class = { 5534const struct sched_class fair_sched_class = {
5045 .next = &idle_sched_class, 5535 .next = &idle_sched_class,
5046 .enqueue_task = enqueue_task_fair, 5536 .enqueue_task = enqueue_task_fair,
5047 .dequeue_task = dequeue_task_fair, 5537 .dequeue_task = dequeue_task_fair,
@@ -5078,7 +5568,7 @@ static const struct sched_class fair_sched_class = {
5078}; 5568};
5079 5569
5080#ifdef CONFIG_SCHED_DEBUG 5570#ifdef CONFIG_SCHED_DEBUG
5081static void print_cfs_stats(struct seq_file *m, int cpu) 5571void print_cfs_stats(struct seq_file *m, int cpu)
5082{ 5572{
5083 struct cfs_rq *cfs_rq; 5573 struct cfs_rq *cfs_rq;
5084 5574
@@ -5088,3 +5578,15 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
5088 rcu_read_unlock(); 5578 rcu_read_unlock();
5089} 5579}
5090#endif 5580#endif
5581
5582__init void init_sched_fair_class(void)
5583{
5584#ifdef CONFIG_SMP
5585 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5586
5587#ifdef CONFIG_NO_HZ
5588 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5589#endif
5590#endif /* SMP */
5591
5592}
diff --git a/kernel/sched_features.h b/kernel/sched/features.h
index 84802245abd2..e61fd73913d0 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched/features.h
@@ -3,13 +3,13 @@
3 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
4 * rip the spread apart. 4 * rip the spread apart.
5 */ 5 */
6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
7 7
8/* 8/*
9 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
10 * tasks 10 * tasks
11 */ 11 */
12SCHED_FEAT(START_DEBIT, 1) 12SCHED_FEAT(START_DEBIT, true)
13 13
14/* 14/*
15 * Based on load and program behaviour, see if it makes sense to place 15 * Based on load and program behaviour, see if it makes sense to place
@@ -17,54 +17,54 @@ SCHED_FEAT(START_DEBIT, 1)
17 * improve cache locality. Typically used with SYNC wakeups as 17 * improve cache locality. Typically used with SYNC wakeups as
18 * generated by pipes and the like, see also SYNC_WAKEUPS. 18 * generated by pipes and the like, see also SYNC_WAKEUPS.
19 */ 19 */
20SCHED_FEAT(AFFINE_WAKEUPS, 1) 20SCHED_FEAT(AFFINE_WAKEUPS, true)
21 21
22/* 22/*
23 * Prefer to schedule the task we woke last (assuming it failed 23 * Prefer to schedule the task we woke last (assuming it failed
24 * wakeup-preemption), since it's likely going to consume data we 24 * wakeup-preemption), since it's likely going to consume data we
25 * touched, increases cache locality. 25 * touched, increases cache locality.
26 */ 26 */
27SCHED_FEAT(NEXT_BUDDY, 0) 27SCHED_FEAT(NEXT_BUDDY, false)
28 28
29/* 29/*
30 * Prefer to schedule the task that ran last (when we did 30 * Prefer to schedule the task that ran last (when we did
31 * wake-preempt) as that likely will touch the same data, increases 31 * wake-preempt) as that likely will touch the same data, increases
32 * cache locality. 32 * cache locality.
33 */ 33 */
34SCHED_FEAT(LAST_BUDDY, 1) 34SCHED_FEAT(LAST_BUDDY, true)
35 35
36/* 36/*
37 * Consider buddies to be cache hot, decreases the likelyness of a 37 * Consider buddies to be cache hot, decreases the likelyness of a
38 * cache buddy being migrated away, increases cache locality. 38 * cache buddy being migrated away, increases cache locality.
39 */ 39 */
40SCHED_FEAT(CACHE_HOT_BUDDY, 1) 40SCHED_FEAT(CACHE_HOT_BUDDY, true)
41 41
42/* 42/*
43 * Use arch dependent cpu power functions 43 * Use arch dependent cpu power functions
44 */ 44 */
45SCHED_FEAT(ARCH_POWER, 0) 45SCHED_FEAT(ARCH_POWER, false)
46 46
47SCHED_FEAT(HRTICK, 0) 47SCHED_FEAT(HRTICK, false)
48SCHED_FEAT(DOUBLE_TICK, 0) 48SCHED_FEAT(DOUBLE_TICK, false)
49SCHED_FEAT(LB_BIAS, 1) 49SCHED_FEAT(LB_BIAS, true)
50 50
51/* 51/*
52 * Spin-wait on mutex acquisition when the mutex owner is running on 52 * Spin-wait on mutex acquisition when the mutex owner is running on
53 * another cpu -- assumes that when the owner is running, it will soon 53 * another cpu -- assumes that when the owner is running, it will soon
54 * release the lock. Decreases scheduling overhead. 54 * release the lock. Decreases scheduling overhead.
55 */ 55 */
56SCHED_FEAT(OWNER_SPIN, 1) 56SCHED_FEAT(OWNER_SPIN, true)
57 57
58/* 58/*
59 * Decrement CPU power based on time not spent running tasks 59 * Decrement CPU power based on time not spent running tasks
60 */ 60 */
61SCHED_FEAT(NONTASK_POWER, 1) 61SCHED_FEAT(NONTASK_POWER, true)
62 62
63/* 63/*
64 * Queue remote wakeups on the target CPU and process them 64 * Queue remote wakeups on the target CPU and process them
65 * using the scheduler IPI. Reduces rq->lock contention/bounces. 65 * using the scheduler IPI. Reduces rq->lock contention/bounces.
66 */ 66 */
67SCHED_FEAT(TTWU_QUEUE, 1) 67SCHED_FEAT(TTWU_QUEUE, true)
68 68
69SCHED_FEAT(FORCE_SD_OVERLAP, 0) 69SCHED_FEAT(FORCE_SD_OVERLAP, false)
70SCHED_FEAT(RT_RUNTIME_SHARE, 1) 70SCHED_FEAT(RT_RUNTIME_SHARE, true)
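The 0/1 → false/true change goes hand in hand with the features file being consumed as an X-macro: the scheduler headers include it more than once with different SCHED_FEAT() definitions, building both an enum of feature bits and the default mask. A condensed, self-contained sketch of that pattern (feature names reused, machinery simplified):

#include <stdio.h>

/* features.h-style list, expanded twice with different SCHED_FEAT bodies */
#define FEATURES(F)        \
        F(START_DEBIT, 1)  \
        F(HRTICK, 0)

#define F_ENUM(name, on) FEAT_##name,
enum { FEATURES(F_ENUM) FEAT_NR };

#define F_MASK(name, on) | ((on) << FEAT_##name)
static const unsigned int default_features = 0 FEATURES(F_MASK);

#define sched_feat(x) (default_features & (1U << FEAT_##x))

int main(void)
{
        printf("START_DEBIT=%d HRTICK=%d\n",
               !!sched_feat(START_DEBIT), !!sched_feat(HRTICK));
        return 0;
}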
diff --git a/kernel/sched_idletask.c b/kernel/sched/idle_task.c
index 0a51882534ea..91b4c957f289 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched/idle_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * idle-task scheduling class. 4 * idle-task scheduling class.
3 * 5 *
@@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
71/* 73/*
72 * Simple, special scheduling class for the per-CPU idle tasks: 74 * Simple, special scheduling class for the per-CPU idle tasks:
73 */ 75 */
74static const struct sched_class idle_sched_class = { 76const struct sched_class idle_sched_class = {
75 /* .next is NULL */ 77 /* .next is NULL */
76 /* no enqueue/yield_task for idle tasks */ 78 /* no enqueue/yield_task for idle tasks */
77 79
diff --git a/kernel/sched_rt.c b/kernel/sched/rt.c
index 583a1368afe6..3640ebbb466b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched/rt.c
@@ -3,7 +3,92 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#include "sched.h"
7
8#include <linux/slab.h>
9
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11
12struct rt_bandwidth def_rt_bandwidth;
13
14static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
15{
16 struct rt_bandwidth *rt_b =
17 container_of(timer, struct rt_bandwidth, rt_period_timer);
18 ktime_t now;
19 int overrun;
20 int idle = 0;
21
22 for (;;) {
23 now = hrtimer_cb_get_time(timer);
24 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
25
26 if (!overrun)
27 break;
28
29 idle = do_sched_rt_period_timer(rt_b, overrun);
30 }
31
32 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
33}
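The loop above is the standard hrtimer_forward() idiom: each call advances the expiry by one whole period and returns how many periods elapsed, so looping until it returns 0 catches up after a late callback without drifting. A kernel-context sketch of the same shape (demo_do_period() is a hypothetical helper, not a real API):

extern int demo_do_period(int overrun);         /* hypothetical */

static enum hrtimer_restart demo_timer_fn(struct hrtimer *timer)
{
        int overrun, idle = 0;

        for (;;) {
                ktime_t now = hrtimer_cb_get_time(timer);

                /* advance expiry by whole 1s periods, count what elapsed */
                overrun = hrtimer_forward(timer, now, ktime_set(1, 0));
                if (!overrun)
                        break;
                idle = demo_do_period(overrun);
        }
        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}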
34
35void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
36{
37 rt_b->rt_period = ns_to_ktime(period);
38 rt_b->rt_runtime = runtime;
39
40 raw_spin_lock_init(&rt_b->rt_runtime_lock);
41
42 hrtimer_init(&rt_b->rt_period_timer,
43 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
44 rt_b->rt_period_timer.function = sched_rt_period_timer;
45}
46
47static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
48{
49 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
50 return;
51
52 if (hrtimer_active(&rt_b->rt_period_timer))
53 return;
54
55 raw_spin_lock(&rt_b->rt_runtime_lock);
56 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
57 raw_spin_unlock(&rt_b->rt_runtime_lock);
58}
59
60void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
61{
62 struct rt_prio_array *array;
63 int i;
64
65 array = &rt_rq->active;
66 for (i = 0; i < MAX_RT_PRIO; i++) {
67 INIT_LIST_HEAD(array->queue + i);
68 __clear_bit(i, array->bitmap);
69 }
70 /* delimiter for bitsearch: */
71 __set_bit(MAX_RT_PRIO, array->bitmap);
72
73#if defined CONFIG_SMP
74 rt_rq->highest_prio.curr = MAX_RT_PRIO;
75 rt_rq->highest_prio.next = MAX_RT_PRIO;
76 rt_rq->rt_nr_migratory = 0;
77 rt_rq->overloaded = 0;
78 plist_head_init(&rt_rq->pushable_tasks);
79#endif
80
81 rt_rq->rt_time = 0;
82 rt_rq->rt_throttled = 0;
83 rt_rq->rt_runtime = 0;
84 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
85}
86
6#ifdef CONFIG_RT_GROUP_SCHED 87#ifdef CONFIG_RT_GROUP_SCHED
88static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
89{
90 hrtimer_cancel(&rt_b->rt_period_timer);
91}
7 92
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) 93#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9 94
@@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
25 return rt_se->rt_rq; 110 return rt_se->rt_rq;
26} 111}
27 112
113void free_rt_sched_group(struct task_group *tg)
114{
115 int i;
116
117 if (tg->rt_se)
118 destroy_rt_bandwidth(&tg->rt_bandwidth);
119
120 for_each_possible_cpu(i) {
121 if (tg->rt_rq)
122 kfree(tg->rt_rq[i]);
123 if (tg->rt_se)
124 kfree(tg->rt_se[i]);
125 }
126
127 kfree(tg->rt_rq);
128 kfree(tg->rt_se);
129}
130
131void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
132 struct sched_rt_entity *rt_se, int cpu,
133 struct sched_rt_entity *parent)
134{
135 struct rq *rq = cpu_rq(cpu);
136
137 rt_rq->highest_prio.curr = MAX_RT_PRIO;
138 rt_rq->rt_nr_boosted = 0;
139 rt_rq->rq = rq;
140 rt_rq->tg = tg;
141
142 tg->rt_rq[cpu] = rt_rq;
143 tg->rt_se[cpu] = rt_se;
144
145 if (!rt_se)
146 return;
147
148 if (!parent)
149 rt_se->rt_rq = &rq->rt;
150 else
151 rt_se->rt_rq = parent->my_q;
152
153 rt_se->my_q = rt_rq;
154 rt_se->parent = parent;
155 INIT_LIST_HEAD(&rt_se->run_list);
156}
157
158int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
159{
160 struct rt_rq *rt_rq;
161 struct sched_rt_entity *rt_se;
162 int i;
163
164 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
165 if (!tg->rt_rq)
166 goto err;
167 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
168 if (!tg->rt_se)
169 goto err;
170
171 init_rt_bandwidth(&tg->rt_bandwidth,
172 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
173
174 for_each_possible_cpu(i) {
175 rt_rq = kzalloc_node(sizeof(struct rt_rq),
176 GFP_KERNEL, cpu_to_node(i));
177 if (!rt_rq)
178 goto err;
179
180 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
181 GFP_KERNEL, cpu_to_node(i));
182 if (!rt_se)
183 goto err_free_rq;
184
185 init_rt_rq(rt_rq, cpu_rq(i));
186 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
187 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
188 }
189
190 return 1;
191
192err_free_rq:
193 kfree(rt_rq);
194err:
195 return 0;
196}
197
28#else /* CONFIG_RT_GROUP_SCHED */ 198#else /* CONFIG_RT_GROUP_SCHED */
29 199
30#define rt_entity_is_task(rt_se) (1) 200#define rt_entity_is_task(rt_se) (1)
@@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
47 return &rq->rt; 217 return &rq->rt;
48} 218}
49 219
220void free_rt_sched_group(struct task_group *tg) { }
221
222int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
223{
224 return 1;
225}
50#endif /* CONFIG_RT_GROUP_SCHED */ 226#endif /* CONFIG_RT_GROUP_SCHED */
51 227
52#ifdef CONFIG_SMP 228#ifdef CONFIG_SMP
@@ -556,6 +732,28 @@ static void enable_runtime(struct rq *rq)
556 raw_spin_unlock_irqrestore(&rq->lock, flags); 732 raw_spin_unlock_irqrestore(&rq->lock, flags);
557} 733}
558 734
735int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
736{
737 int cpu = (int)(long)hcpu;
738
739 switch (action) {
740 case CPU_DOWN_PREPARE:
741 case CPU_DOWN_PREPARE_FROZEN:
742 disable_runtime(cpu_rq(cpu));
743 return NOTIFY_OK;
744
745 case CPU_DOWN_FAILED:
746 case CPU_DOWN_FAILED_FROZEN:
747 case CPU_ONLINE:
748 case CPU_ONLINE_FROZEN:
749 enable_runtime(cpu_rq(cpu));
750 return NOTIFY_OK;
751
752 default:
753 return NOTIFY_DONE;
754 }
755}
756
559static int balance_runtime(struct rt_rq *rt_rq) 757static int balance_runtime(struct rt_rq *rt_rq)
560{ 758{
561 int more = 0; 759 int more = 0;
@@ -648,7 +846,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
648 if (rt_rq->rt_throttled) 846 if (rt_rq->rt_throttled)
649 return rt_rq_throttled(rt_rq); 847 return rt_rq_throttled(rt_rq);
650 848
651 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) 849 if (runtime >= sched_rt_period(rt_rq))
652 return 0; 850 return 0;
653 851
654 balance_runtime(rt_rq); 852 balance_runtime(rt_rq);
@@ -957,8 +1155,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
957} 1155}
958 1156
959/* 1157/*
960 * Put task to the end of the run list without the overhead of dequeue 1158 * Put task to the head or the end of the run list without the overhead of
961 * followed by enqueue. 1159 * dequeue followed by enqueue.
962 */ 1160 */
963static void 1161static void
964requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) 1162requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
@@ -1002,6 +1200,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1002 1200
1003 cpu = task_cpu(p); 1201 cpu = task_cpu(p);
1004 1202
1203 if (p->rt.nr_cpus_allowed == 1)
1204 goto out;
1205
1005 /* For anything but wake ups, just return the task_cpu */ 1206 /* For anything but wake ups, just return the task_cpu */
1006 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 1207 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1007 goto out; 1208 goto out;
@@ -1178,8 +1379,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1178/* Only try algorithms three times */ 1379/* Only try algorithms three times */
1179#define RT_MAX_TRIES 3 1380#define RT_MAX_TRIES 3
1180 1381
1181static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1182
1183static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1382static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1184{ 1383{
1185 if (!task_running(rq, p) && 1384 if (!task_running(rq, p) &&
@@ -1653,13 +1852,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1653 pull_rt_task(rq); 1852 pull_rt_task(rq);
1654} 1853}
1655 1854
1656static inline void init_sched_rt_class(void) 1855void init_sched_rt_class(void)
1657{ 1856{
1658 unsigned int i; 1857 unsigned int i;
1659 1858
1660 for_each_possible_cpu(i) 1859 for_each_possible_cpu(i) {
1661 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 1860 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1662 GFP_KERNEL, cpu_to_node(i)); 1861 GFP_KERNEL, cpu_to_node(i));
1862 }
1663} 1863}
1664#endif /* CONFIG_SMP */ 1864#endif /* CONFIG_SMP */
1665 1865
@@ -1800,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1800 return 0; 2000 return 0;
1801} 2001}
1802 2002
1803static const struct sched_class rt_sched_class = { 2003const struct sched_class rt_sched_class = {
1804 .next = &fair_sched_class, 2004 .next = &fair_sched_class,
1805 .enqueue_task = enqueue_task_rt, 2005 .enqueue_task = enqueue_task_rt,
1806 .dequeue_task = dequeue_task_rt, 2006 .dequeue_task = dequeue_task_rt,
@@ -1835,7 +2035,7 @@ static const struct sched_class rt_sched_class = {
1835#ifdef CONFIG_SCHED_DEBUG 2035#ifdef CONFIG_SCHED_DEBUG
1836extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); 2036extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1837 2037
1838static void print_rt_stats(struct seq_file *m, int cpu) 2038void print_rt_stats(struct seq_file *m, int cpu)
1839{ 2039{
1840 rt_rq_iter_t iter; 2040 rt_rq_iter_t iter;
1841 struct rt_rq *rt_rq; 2041 struct rt_rq *rt_rq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
new file mode 100644
index 000000000000..98c0c2623db8
--- /dev/null
+++ b/kernel/sched/sched.h
@@ -0,0 +1,1166 @@
1
2#include <linux/sched.h>
3#include <linux/mutex.h>
4#include <linux/spinlock.h>
5#include <linux/stop_machine.h>
6
7#include "cpupri.h"
8
9extern __read_mostly int scheduler_running;
10
11/*
12 * Convert user-nice values [ -20 ... 0 ... 19 ]
13 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
14 * and back.
15 */
16#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
17#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
18#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
19
20/*
21 * 'User priority' is the nice value converted to something we
22 * can work with better when scaling various scheduler parameters,
23 * it's a [ 0 ... 39 ] range.
24 */
25#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
26#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
27#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
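A worked example of these conversions, using the usual MAX_RT_PRIO == 100 and MAX_PRIO == 140:

/*
 *   NICE_TO_PRIO(-20) == 100 + (-20) + 20 == 100   (highest CFS priority)
 *   NICE_TO_PRIO(0)   == 100 +   0  + 20 == 120   (default)
 *   NICE_TO_PRIO(19)  == 100 +  19  + 20 == 139   (== MAX_PRIO - 1)
 *
 *   PRIO_TO_NICE(120) == 120 - 100 - 20  == 0
 *   USER_PRIO(120)    == 120 - 100       == 20
 *   MAX_USER_PRIO     == USER_PRIO(140)  == 40
 */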
28
29/*
30 * Helpers for converting nanosecond timing to jiffy resolution
31 */
32#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
33
34#define NICE_0_LOAD SCHED_LOAD_SCALE
35#define NICE_0_SHIFT SCHED_LOAD_SHIFT
36
37/*
38 * These are the 'tuning knobs' of the scheduler:
39 *
40 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
41 * Timeslices get refilled after they expire.
42 */
43#define DEF_TIMESLICE (100 * HZ / 1000)
44
45/*
46 * single value that denotes runtime == period, ie unlimited time.
47 */
48#define RUNTIME_INF ((u64)~0ULL)
49
50static inline int rt_policy(int policy)
51{
52 if (policy == SCHED_FIFO || policy == SCHED_RR)
53 return 1;
54 return 0;
55}
56
57static inline int task_has_rt_policy(struct task_struct *p)
58{
59 return rt_policy(p->policy);
60}
61
62/*
63 * This is the priority-queue data structure of the RT scheduling class:
64 */
65struct rt_prio_array {
66 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
67 struct list_head queue[MAX_RT_PRIO];
68};
69
70struct rt_bandwidth {
71 /* nests inside the rq lock: */
72 raw_spinlock_t rt_runtime_lock;
73 ktime_t rt_period;
74 u64 rt_runtime;
75 struct hrtimer rt_period_timer;
76};
77
78extern struct mutex sched_domains_mutex;
79
80#ifdef CONFIG_CGROUP_SCHED
81
82#include <linux/cgroup.h>
83
84struct cfs_rq;
85struct rt_rq;
86
87static LIST_HEAD(task_groups);
88
89struct cfs_bandwidth {
90#ifdef CONFIG_CFS_BANDWIDTH
91 raw_spinlock_t lock;
92 ktime_t period;
93 u64 quota, runtime;
94 s64 hierarchal_quota;
95 u64 runtime_expires;
96
97 int idle, timer_active;
98 struct hrtimer period_timer, slack_timer;
99 struct list_head throttled_cfs_rq;
100
101 /* statistics */
102 int nr_periods, nr_throttled;
103 u64 throttled_time;
104#endif
105};
106
107/* task group related information */
108struct task_group {
109 struct cgroup_subsys_state css;
110
111#ifdef CONFIG_FAIR_GROUP_SCHED
112 /* schedulable entities of this group on each cpu */
113 struct sched_entity **se;
114 /* runqueue "owned" by this group on each cpu */
115 struct cfs_rq **cfs_rq;
116 unsigned long shares;
117
118 atomic_t load_weight;
119#endif
120
121#ifdef CONFIG_RT_GROUP_SCHED
122 struct sched_rt_entity **rt_se;
123 struct rt_rq **rt_rq;
124
125 struct rt_bandwidth rt_bandwidth;
126#endif
127
128 struct rcu_head rcu;
129 struct list_head list;
130
131 struct task_group *parent;
132 struct list_head siblings;
133 struct list_head children;
134
135#ifdef CONFIG_SCHED_AUTOGROUP
136 struct autogroup *autogroup;
137#endif
138
139 struct cfs_bandwidth cfs_bandwidth;
140};
141
142#ifdef CONFIG_FAIR_GROUP_SCHED
143#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
144
145/*
146 * A weight of 0 or 1 can cause arithmetic problems.
147 * The weight of a cfs_rq is the sum of the weights of the entities
148 * queued on it, so the weight of an entity should not be too large,
149 * and neither should the shares value of a task group.
150 * (The default weight is 1024 - so there's no practical
151 * limitation from this.)
152 */
153#define MIN_SHARES (1UL << 1)
154#define MAX_SHARES (1UL << 18)
155#endif
156
157/* Default task group.
158 * Every task in the system belongs to this group at bootup.
159 */
160extern struct task_group root_task_group;
161
162typedef int (*tg_visitor)(struct task_group *, void *);
163
164extern int walk_tg_tree_from(struct task_group *from,
165 tg_visitor down, tg_visitor up, void *data);
166
167/*
168 * Iterate the full tree, calling @down when first entering a node and @up when
169 * leaving it for the final time.
170 *
171 * Caller must hold rcu_lock or sufficient equivalent.
172 */
173static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
174{
175 return walk_tg_tree_from(&root_task_group, down, up, data);
176}
177
178extern int tg_nop(struct task_group *tg, void *data);
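
/*
 * Example (a sketch, not part of this file): a down-visitor that counts
 * the task groups in the tree, with tg_nop() as the up-visitor since
 * nothing needs to happen when leaving a node. The helper names are
 * hypothetical.
 */
static int tg_count_one(struct task_group *tg, void *data)
{
	(*(int *)data)++;
	return 0;		/* a non-zero return would abort the walk */
}

static inline int count_task_groups(void)
{
	int count = 0;

	rcu_read_lock();	/* walk_tg_tree() callers must hold rcu_lock */
	walk_tg_tree(tg_count_one, tg_nop, &count);
	rcu_read_unlock();
	return count;
}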
179
180extern void free_fair_sched_group(struct task_group *tg);
181extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
182extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
183extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
184 struct sched_entity *se, int cpu,
185 struct sched_entity *parent);
186extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
187extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
188
189extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
190extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
191extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
192
193extern void free_rt_sched_group(struct task_group *tg);
194extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
195extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
196 struct sched_rt_entity *rt_se, int cpu,
197 struct sched_rt_entity *parent);
198
199#else /* CONFIG_CGROUP_SCHED */
200
201struct cfs_bandwidth { };
202
203#endif /* CONFIG_CGROUP_SCHED */
204
205/* CFS-related fields in a runqueue */
206struct cfs_rq {
207 struct load_weight load;
208 unsigned long nr_running, h_nr_running;
209
210 u64 exec_clock;
211 u64 min_vruntime;
212#ifndef CONFIG_64BIT
213 u64 min_vruntime_copy;
214#endif
215
216 struct rb_root tasks_timeline;
217 struct rb_node *rb_leftmost;
218
219 struct list_head tasks;
220 struct list_head *balance_iterator;
221
222 /*
223 * 'curr' points to the currently running entity on this cfs_rq.
224 * It is set to NULL otherwise (i.e. when none is currently running).
225 */
226 struct sched_entity *curr, *next, *last, *skip;
227
228#ifdef CONFIG_SCHED_DEBUG
229 unsigned int nr_spread_over;
230#endif
231
232#ifdef CONFIG_FAIR_GROUP_SCHED
233 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
234
235 /*
236 * leaf cfs_rqs are those that hold tasks (the lowest schedulable entity
237 * in a hierarchy). Non-leaf cfs_rqs hold other, higher schedulable
238 * entities (like users, containers etc.)
239 *
240 * leaf_cfs_rq_list ties together the list of leaf cfs_rqs on a cpu.
241 * This list is used during load balancing.
242 */
243 int on_list;
244 struct list_head leaf_cfs_rq_list;
245 struct task_group *tg; /* group that "owns" this runqueue */
246
247#ifdef CONFIG_SMP
248 /*
249 * the part of load.weight contributed by tasks
250 */
251 unsigned long task_weight;
252
253 /*
254 * h_load = weight * f(tg)
255 *
256 * Where f(tg) is the recursive weight fraction assigned to
257 * this group.
258 */
259 unsigned long h_load;
260
261 /*
262 * Maintaining per-cpu shares distribution for group scheduling
263 *
264 * load_stamp is the last time we updated the load average
265 * load_last is the last time we updated the load average and saw load
266 * load_unacc_exec_time is currently unaccounted execution time
267 */
268 u64 load_avg;
269 u64 load_period;
270 u64 load_stamp, load_last, load_unacc_exec_time;
271
272 unsigned long load_contribution;
273#endif /* CONFIG_SMP */
274#ifdef CONFIG_CFS_BANDWIDTH
275 int runtime_enabled;
276 u64 runtime_expires;
277 s64 runtime_remaining;
278
279 u64 throttled_timestamp;
280 int throttled, throttle_count;
281 struct list_head throttled_list;
282#endif /* CONFIG_CFS_BANDWIDTH */
283#endif /* CONFIG_FAIR_GROUP_SCHED */
284};
285
286static inline int rt_bandwidth_enabled(void)
287{
288 return sysctl_sched_rt_runtime >= 0;
289}
290
291/* Real-time classes' related fields in a runqueue: */
292struct rt_rq {
293 struct rt_prio_array active;
294 unsigned long rt_nr_running;
295#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
296 struct {
297 int curr; /* highest queued rt task prio */
298#ifdef CONFIG_SMP
299 int next; /* next highest */
300#endif
301 } highest_prio;
302#endif
303#ifdef CONFIG_SMP
304 unsigned long rt_nr_migratory;
305 unsigned long rt_nr_total;
306 int overloaded;
307 struct plist_head pushable_tasks;
308#endif
309 int rt_throttled;
310 u64 rt_time;
311 u64 rt_runtime;
312 /* Nests inside the rq lock: */
313 raw_spinlock_t rt_runtime_lock;
314
315#ifdef CONFIG_RT_GROUP_SCHED
316 unsigned long rt_nr_boosted;
317
318 struct rq *rq;
319 struct list_head leaf_rt_rq_list;
320 struct task_group *tg;
321#endif
322};
323
324#ifdef CONFIG_SMP
325
326/*
327 * We add the notion of a root-domain which will be used to define per-domain
328 * variables. Each exclusive cpuset essentially defines an island domain by
329 * fully partitioning the member cpus from any other cpuset. Whenever a new
330 * exclusive cpuset is created, we also create and attach a new root-domain
331 * object.
332 *
333 */
334struct root_domain {
335 atomic_t refcount;
336 atomic_t rto_count;
337 struct rcu_head rcu;
338 cpumask_var_t span;
339 cpumask_var_t online;
340
341 /*
342 * The "RT overload" flag: it gets set if a CPU has more than
343 * one runnable RT task.
344 */
345 cpumask_var_t rto_mask;
346 struct cpupri cpupri;
347};
348
349extern struct root_domain def_root_domain;
350
351#endif /* CONFIG_SMP */
352
353/*
354 * This is the main, per-CPU runqueue data structure.
355 *
356 * Locking rule: code that needs to lock multiple runqueues (such as
357 * the load balancing or thread migration code) must acquire the
358 * locks in ascending &runqueue order.
359 */
360struct rq {
361 /* runqueue lock: */
362 raw_spinlock_t lock;
363
364 /*
365 * nr_running and cpu_load should be in the same cacheline because
366 * remote CPUs use both these fields when doing load calculation.
367 */
368 unsigned long nr_running;
369 #define CPU_LOAD_IDX_MAX 5
370 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
371 unsigned long last_load_update_tick;
372#ifdef CONFIG_NO_HZ
373 u64 nohz_stamp;
374 unsigned long nohz_flags;
375#endif
376 int skip_clock_update;
377
378 /* capture load from *all* tasks on this cpu: */
379 struct load_weight load;
380 unsigned long nr_load_updates;
381 u64 nr_switches;
382
383 struct cfs_rq cfs;
384 struct rt_rq rt;
385
386#ifdef CONFIG_FAIR_GROUP_SCHED
387 /* list of leaf cfs_rq on this cpu: */
388 struct list_head leaf_cfs_rq_list;
389#endif
390#ifdef CONFIG_RT_GROUP_SCHED
391 struct list_head leaf_rt_rq_list;
392#endif
393
394 /*
395 * This is part of a global counter where only the total sum
396 * over all CPUs matters. A task can increase this counter on
397 * one CPU and if it got migrated afterwards it may decrease
398 * it on another CPU. Always updated under the runqueue lock:
399 */
400 unsigned long nr_uninterruptible;
401
402 struct task_struct *curr, *idle, *stop;
403 unsigned long next_balance;
404 struct mm_struct *prev_mm;
405
406 u64 clock;
407 u64 clock_task;
408
409 atomic_t nr_iowait;
410
411#ifdef CONFIG_SMP
412 struct root_domain *rd;
413 struct sched_domain *sd;
414
415 unsigned long cpu_power;
416
417 unsigned char idle_balance;
418 /* For active balancing */
419 int post_schedule;
420 int active_balance;
421 int push_cpu;
422 struct cpu_stop_work active_balance_work;
423 /* cpu of this runqueue: */
424 int cpu;
425 int online;
426
427 u64 rt_avg;
428 u64 age_stamp;
429 u64 idle_stamp;
430 u64 avg_idle;
431#endif
432
433#ifdef CONFIG_IRQ_TIME_ACCOUNTING
434 u64 prev_irq_time;
435#endif
436#ifdef CONFIG_PARAVIRT
437 u64 prev_steal_time;
438#endif
439#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
440 u64 prev_steal_time_rq;
441#endif
442
443 /* calc_load related fields */
444 unsigned long calc_load_update;
445 long calc_load_active;
446
447#ifdef CONFIG_SCHED_HRTICK
448#ifdef CONFIG_SMP
449 int hrtick_csd_pending;
450 struct call_single_data hrtick_csd;
451#endif
452 struct hrtimer hrtick_timer;
453#endif
454
455#ifdef CONFIG_SCHEDSTATS
456 /* latency stats */
457 struct sched_info rq_sched_info;
458 unsigned long long rq_cpu_time;
459 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
460
461 /* sys_sched_yield() stats */
462 unsigned int yld_count;
463
464 /* schedule() stats */
465 unsigned int sched_switch;
466 unsigned int sched_count;
467 unsigned int sched_goidle;
468
469 /* try_to_wake_up() stats */
470 unsigned int ttwu_count;
471 unsigned int ttwu_local;
472#endif
473
474#ifdef CONFIG_SMP
475 struct llist_head wake_list;
476#endif
477};
478
479static inline int cpu_of(struct rq *rq)
480{
481#ifdef CONFIG_SMP
482 return rq->cpu;
483#else
484 return 0;
485#endif
486}
487
488DECLARE_PER_CPU(struct rq, runqueues);
489
490#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
491#define this_rq() (&__get_cpu_var(runqueues))
492#define task_rq(p) cpu_rq(task_cpu(p))
493#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
494#define raw_rq() (&__raw_get_cpu_var(runqueues))
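
/*
 * Illustrative use of the accessors above (hypothetical helper, not part
 * of this file): read the runnable-task count of the runqueue a given
 * task is currently assigned to.
 */
static inline unsigned long task_rq_nr_running(struct task_struct *p)
{
	return task_rq(p)->nr_running;
}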
495
496#ifdef CONFIG_SMP
497
498#define rcu_dereference_check_sched_domain(p) \
499 rcu_dereference_check((p), \
500 lockdep_is_held(&sched_domains_mutex))
501
502/*
503 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
504 * See detach_destroy_domains: synchronize_sched for details.
505 *
506 * The domain tree of any CPU may only be accessed from within
507 * preempt-disabled sections.
508 */
509#define for_each_domain(cpu, __sd) \
510 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
511 __sd; __sd = __sd->parent)
512
513#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
514
515/**
516 * highest_flag_domain - Return highest sched_domain containing flag.
517 * @cpu: The cpu whose highest level of sched domain is to
518 * be returned.
519 * @flag: The flag to check for the highest sched_domain
520 * for the given cpu.
521 *
522 * Returns the highest sched_domain of a cpu which contains the given flag.
523 */
524static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
525{
526 struct sched_domain *sd, *hsd = NULL;
527
528 for_each_domain(cpu, sd) {
529 if (!(sd->flags & flag))
530 break;
531 hsd = sd;
532 }
533
534 return hsd;
535}
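
/*
 * Example (sketch): the last-level-cache domain of a cpu can be looked
 * up by asking for the highest domain that still shares package
 * resources:
 *
 *	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
 *
 * which is how the sd_llc/sd_llc_id per-cpu variables below are expected
 * to be populated when the domains are built.
 */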
536
537DECLARE_PER_CPU(struct sched_domain *, sd_llc);
538DECLARE_PER_CPU(int, sd_llc_id);
539
540#endif /* CONFIG_SMP */
541
542#include "stats.h"
543#include "auto_group.h"
544
545#ifdef CONFIG_CGROUP_SCHED
546
547/*
548 * Return the group to which this task belongs.
549 *
550 * We use task_subsys_state_check() and extend the RCU verification with
551 * p->pi_lock and rq->lock because cpu_cgroup_attach() holds those locks
552 * for each task it moves into the cgroup. Therefore by holding either of
553 * those locks, we pin the task to the current cgroup.
554 */
555static inline struct task_group *task_group(struct task_struct *p)
556{
557 struct task_group *tg;
558 struct cgroup_subsys_state *css;
559
560 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
561 lockdep_is_held(&p->pi_lock) ||
562 lockdep_is_held(&task_rq(p)->lock));
563 tg = container_of(css, struct task_group, css);
564
565 return autogroup_task_group(p, tg);
566}
567
568/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
569static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
570{
571#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
572 struct task_group *tg = task_group(p);
573#endif
574
575#ifdef CONFIG_FAIR_GROUP_SCHED
576 p->se.cfs_rq = tg->cfs_rq[cpu];
577 p->se.parent = tg->se[cpu];
578#endif
579
580#ifdef CONFIG_RT_GROUP_SCHED
581 p->rt.rt_rq = tg->rt_rq[cpu];
582 p->rt.parent = tg->rt_se[cpu];
583#endif
584}
585
586#else /* CONFIG_CGROUP_SCHED */
587
588static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
589static inline struct task_group *task_group(struct task_struct *p)
590{
591 return NULL;
592}
593
594#endif /* CONFIG_CGROUP_SCHED */
595
596static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
597{
598 set_task_rq(p, cpu);
599#ifdef CONFIG_SMP
600 /*
601 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
602 * successfully executed on another CPU. We must ensure that updates of
603 * per-task data have been completed by this moment.
604 */
605 smp_wmb();
606 task_thread_info(p)->cpu = cpu;
607#endif
608}
609
610/*
611 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
612 */
613#ifdef CONFIG_SCHED_DEBUG
614# include <linux/jump_label.h>
615# define const_debug __read_mostly
616#else
617# define const_debug const
618#endif
619
620extern const_debug unsigned int sysctl_sched_features;
621
622#define SCHED_FEAT(name, enabled) \
623 __SCHED_FEAT_##name ,
624
625enum {
626#include "features.h"
627 __SCHED_FEAT_NR,
628};
629
630#undef SCHED_FEAT
631
632#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
633static __always_inline bool static_branch__true(struct jump_label_key *key)
634{
635 return likely(static_branch(key)); /* Not out of line branch. */
636}
637
638static __always_inline bool static_branch__false(struct jump_label_key *key)
639{
640 return unlikely(static_branch(key)); /* Out of line branch. */
641}
642
643#define SCHED_FEAT(name, enabled) \
644static __always_inline bool static_branch_##name(struct jump_label_key *key) \
645{ \
646 return static_branch__##enabled(key); \
647}
648
649#include "features.h"
650
651#undef SCHED_FEAT
652
653extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR];
654#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
655#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
656#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
657#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
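
/*
 * Example usage: a sched_feat() test compiles down to either a jump
 * label or a bitmask test, depending on the #if above:
 *
 *	if (sched_feat(HRTICK))
 *		...program the high resolution tick...
 *
 * (HRTICK is one of the names generated from features.h; see
 * hrtick_enabled() below for a real user.)
 */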
658
659static inline u64 global_rt_period(void)
660{
661 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
662}
663
664static inline u64 global_rt_runtime(void)
665{
666 if (sysctl_sched_rt_runtime < 0)
667 return RUNTIME_INF;
668
669 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
670}
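
/*
 * With the usual sysctl defaults (sched_rt_period_us = 1000000,
 * sched_rt_runtime_us = 950000) the helpers above describe a 1s period
 * of which 950ms may be consumed by RT tasks; writing -1 to the runtime
 * sysctl makes global_rt_runtime() return RUNTIME_INF (no limit).
 */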
671
672
673
674static inline int task_current(struct rq *rq, struct task_struct *p)
675{
676 return rq->curr == p;
677}
678
679static inline int task_running(struct rq *rq, struct task_struct *p)
680{
681#ifdef CONFIG_SMP
682 return p->on_cpu;
683#else
684 return task_current(rq, p);
685#endif
686}
687
688
689#ifndef prepare_arch_switch
690# define prepare_arch_switch(next) do { } while (0)
691#endif
692#ifndef finish_arch_switch
693# define finish_arch_switch(prev) do { } while (0)
694#endif
695
696#ifndef __ARCH_WANT_UNLOCKED_CTXSW
697static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
698{
699#ifdef CONFIG_SMP
700 /*
701 * We can optimise this out completely for !SMP, because the
702 * SMP rebalancing from interrupt is the only thing that cares
703 * here.
704 */
705 next->on_cpu = 1;
706#endif
707}
708
709static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
710{
711#ifdef CONFIG_SMP
712 /*
713 * After ->on_cpu is cleared, the task can be moved to a different CPU.
714 * We must ensure this doesn't happen until the switch is completely
715 * finished.
716 */
717 smp_wmb();
718 prev->on_cpu = 0;
719#endif
720#ifdef CONFIG_DEBUG_SPINLOCK
721 /* this is a valid case when another task releases the spinlock */
722 rq->lock.owner = current;
723#endif
724 /*
725 * If we are tracking spinlock dependencies then we have to
726 * fix up the runqueue lock - which gets 'carried over' from
727 * prev into current:
728 */
729 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
730
731 raw_spin_unlock_irq(&rq->lock);
732}
733
734#else /* __ARCH_WANT_UNLOCKED_CTXSW */
735static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
736{
737#ifdef CONFIG_SMP
738 /*
739 * We can optimise this out completely for !SMP, because the
740 * SMP rebalancing from interrupt is the only thing that cares
741 * here.
742 */
743 next->on_cpu = 1;
744#endif
745#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
746 raw_spin_unlock_irq(&rq->lock);
747#else
748 raw_spin_unlock(&rq->lock);
749#endif
750}
751
752static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
753{
754#ifdef CONFIG_SMP
755 /*
756 * After ->on_cpu is cleared, the task can be moved to a different CPU.
757 * We must ensure this doesn't happen until the switch is completely
758 * finished.
759 */
760 smp_wmb();
761 prev->on_cpu = 0;
762#endif
763#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
764 local_irq_enable();
765#endif
766}
767#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
768
769
770static inline void update_load_add(struct load_weight *lw, unsigned long inc)
771{
772 lw->weight += inc;
773 lw->inv_weight = 0;
774}
775
776static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
777{
778 lw->weight -= dec;
779 lw->inv_weight = 0;
780}
781
782static inline void update_load_set(struct load_weight *lw, unsigned long w)
783{
784 lw->weight = w;
785 lw->inv_weight = 0;
786}
787
788/*
789 * To aid in avoiding the subversion of "niceness" due to uneven
790 * distribution of tasks with abnormal "nice" values across CPUs, the
791 * contribution each task makes to its run queue's load is weighted
792 * according to its scheduling class and "nice" value. For SCHED_NORMAL
793 * tasks this is just a scaled version of the new time slice allocation
794 * they receive on time slice expiry etc.
795 */
796
797#define WEIGHT_IDLEPRIO 3
798#define WMULT_IDLEPRIO 1431655765
799
800/*
801 * Nice levels are multiplicative, with a gentle 10% change for every
802 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
803 * nice 1, it will get ~10% less CPU time than another CPU-bound task
804 * that remained on nice 0.
805 *
806 * The "10% effect" is relative and cumulative: from _any_ nice level,
807 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
808 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
809 * If a task goes up by ~10% and another task goes down by ~10% then
810 * the relative distance between them is ~25%.)
811 */
812static const int prio_to_weight[40] = {
813 /* -20 */ 88761, 71755, 56483, 46273, 36291,
814 /* -15 */ 29154, 23254, 18705, 14949, 11916,
815 /* -10 */ 9548, 7620, 6100, 4904, 3906,
816 /* -5 */ 3121, 2501, 1991, 1586, 1277,
817 /* 0 */ 1024, 820, 655, 526, 423,
818 /* 5 */ 335, 272, 215, 172, 137,
819 /* 10 */ 110, 87, 70, 56, 45,
820 /* 15 */ 36, 29, 23, 18, 15,
821};
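
/*
 * Worked example of the ~10% rule: two CPU-hogs at nice 0 weigh 1024
 * each and get 1024/2048 = 50% apiece. If one of them moves to nice 1
 * (weight 820), the split becomes 1024/1844 ~= 55.5% vs 820/1844 ~= 44.5%,
 * i.e. the intended one-level step, since each nice level scales the
 * weight by roughly 1.25.
 */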
822
823/*
824 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
825 *
826 * In cases where the weight does not change often, we can use the
827 * precalculated inverse to speed up arithmetics by turning divisions
828 * into multiplications:
829 */
830static const u32 prio_to_wmult[40] = {
831 /* -20 */ 48388, 59856, 76040, 92818, 118348,
832 /* -15 */ 147320, 184698, 229616, 287308, 360437,
833 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
834 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
835 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
836 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
837 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
838 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
839};
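
/*
 * Sanity check of the inverses: for nice 0, 2^32 / 1024 = 4194304, which
 * matches the table entry above. A division by the weight can then be
 * replaced by a multiply and a shift:
 *
 *	x / 1024 == ((u64)x * prio_to_wmult[20]) >> 32
 *
 * (index 20 corresponds to nice 0, i.e. nice + 20).
 */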
840
841/* Time spent by the tasks of the cpu accounting group executing in ... */
842enum cpuacct_stat_index {
843 CPUACCT_STAT_USER, /* ... user mode */
844 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
845
846 CPUACCT_STAT_NSTATS,
847};
848
849
850#define sched_class_highest (&stop_sched_class)
851#define for_each_class(class) \
852 for (class = sched_class_highest; class; class = class->next)
853
854extern const struct sched_class stop_sched_class;
855extern const struct sched_class rt_sched_class;
856extern const struct sched_class fair_sched_class;
857extern const struct sched_class idle_sched_class;
858
859
860#ifdef CONFIG_SMP
861
862extern void trigger_load_balance(struct rq *rq, int cpu);
863extern void idle_balance(int this_cpu, struct rq *this_rq);
864
865#else /* CONFIG_SMP */
866
867static inline void idle_balance(int cpu, struct rq *rq)
868{
869}
870
871#endif
872
873extern void sysrq_sched_debug_show(void);
874extern void sched_init_granularity(void);
875extern void update_max_interval(void);
876extern void update_group_power(struct sched_domain *sd, int cpu);
877extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
878extern void init_sched_rt_class(void);
879extern void init_sched_fair_class(void);
880
881extern void resched_task(struct task_struct *p);
882extern void resched_cpu(int cpu);
883
884extern struct rt_bandwidth def_rt_bandwidth;
885extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
886
887extern void update_cpu_load(struct rq *this_rq);
888
889#ifdef CONFIG_CGROUP_CPUACCT
890#include <linux/cgroup.h>
891/* track cpu usage of a group of tasks and its child groups */
892struct cpuacct {
893 struct cgroup_subsys_state css;
894 /* cpuusage holds pointer to a u64-type object on every cpu */
895 u64 __percpu *cpuusage;
896 struct kernel_cpustat __percpu *cpustat;
897};
898
899/* return cpu accounting group corresponding to this container */
900static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
901{
902 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
903 struct cpuacct, css);
904}
905
906/* return cpu accounting group to which this task belongs */
907static inline struct cpuacct *task_ca(struct task_struct *tsk)
908{
909 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
910 struct cpuacct, css);
911}
912
913static inline struct cpuacct *parent_ca(struct cpuacct *ca)
914{
915 if (!ca || !ca->css.cgroup->parent)
916 return NULL;
917 return cgroup_ca(ca->css.cgroup->parent);
918}
919
920extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
921#else
922static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
923#endif
924
925static inline void inc_nr_running(struct rq *rq)
926{
927 rq->nr_running++;
928}
929
930static inline void dec_nr_running(struct rq *rq)
931{
932 rq->nr_running--;
933}
934
935extern void update_rq_clock(struct rq *rq);
936
937extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
938extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
939
940extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
941
942extern const_debug unsigned int sysctl_sched_time_avg;
943extern const_debug unsigned int sysctl_sched_nr_migrate;
944extern const_debug unsigned int sysctl_sched_migration_cost;
945
946static inline u64 sched_avg_period(void)
947{
948 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
949}
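
/*
 * Example: sysctl_sched_time_avg defaults to MSEC_PER_SEC (1000), so
 * sched_avg_period() is 500ms worth of nanoseconds, i.e. half of the
 * averaging window.
 */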
950
951void calc_load_account_idle(struct rq *this_rq);
952
953#ifdef CONFIG_SCHED_HRTICK
954
955/*
956 * Use hrtick when:
957 * - enabled by features
958 * - hrtimer is actually high res
959 */
960static inline int hrtick_enabled(struct rq *rq)
961{
962 if (!sched_feat(HRTICK))
963 return 0;
964 if (!cpu_active(cpu_of(rq)))
965 return 0;
966 return hrtimer_is_hres_active(&rq->hrtick_timer);
967}
968
969void hrtick_start(struct rq *rq, u64 delay);
970
971#else
972
973static inline int hrtick_enabled(struct rq *rq)
974{
975 return 0;
976}
977
978#endif /* CONFIG_SCHED_HRTICK */
979
980#ifdef CONFIG_SMP
981extern void sched_avg_update(struct rq *rq);
982static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
983{
984 rq->rt_avg += rt_delta;
985 sched_avg_update(rq);
986}
987#else
988static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
989static inline void sched_avg_update(struct rq *rq) { }
990#endif
991
992extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
993
994#ifdef CONFIG_SMP
995#ifdef CONFIG_PREEMPT
996
997static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
998
999/*
1000 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1001 * way at the expense of forcing extra atomic operations in all
1002 * invocations. This assures that the double_lock is acquired using the
1003 * same underlying policy as the spinlock_t on this architecture, which
1004 * reduces latency compared to the unfair variant below. However, it
1005 * also adds more overhead and therefore may reduce throughput.
1006 */
1007static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1008 __releases(this_rq->lock)
1009 __acquires(busiest->lock)
1010 __acquires(this_rq->lock)
1011{
1012 raw_spin_unlock(&this_rq->lock);
1013 double_rq_lock(this_rq, busiest);
1014
1015 return 1;
1016}
1017
1018#else
1019/*
1020 * Unfair double_lock_balance: Optimizes throughput at the expense of
1021 * latency by eliminating extra atomic operations when the locks are
1022 * already in proper order on entry. This favors lower cpu-ids and will
1023 * grant the double lock to lower cpus over higher ids under contention,
1024 * regardless of entry order into the function.
1025 */
1026static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1027 __releases(this_rq->lock)
1028 __acquires(busiest->lock)
1029 __acquires(this_rq->lock)
1030{
1031 int ret = 0;
1032
1033 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1034 if (busiest < this_rq) {
1035 raw_spin_unlock(&this_rq->lock);
1036 raw_spin_lock(&busiest->lock);
1037 raw_spin_lock_nested(&this_rq->lock,
1038 SINGLE_DEPTH_NESTING);
1039 ret = 1;
1040 } else
1041 raw_spin_lock_nested(&busiest->lock,
1042 SINGLE_DEPTH_NESTING);
1043 }
1044 return ret;
1045}
1046
1047#endif /* CONFIG_PREEMPT */
1048
1049/*
1050 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1051 */
1052static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1053{
1054 if (unlikely(!irqs_disabled())) {
1055 /* printk() doesn't work well under rq->lock */
1056 raw_spin_unlock(&this_rq->lock);
1057 BUG_ON(1);
1058 }
1059
1060 return _double_lock_balance(this_rq, busiest);
1061}
1062
1063static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1064 __releases(busiest->lock)
1065{
1066 raw_spin_unlock(&busiest->lock);
1067 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1068}
1069
1070/*
1071 * double_rq_lock - safely lock two runqueues
1072 *
1073 * Note this does not disable interrupts like task_rq_lock;
1074 * you need to do so manually before calling.
1075 */
1076static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1077 __acquires(rq1->lock)
1078 __acquires(rq2->lock)
1079{
1080 BUG_ON(!irqs_disabled());
1081 if (rq1 == rq2) {
1082 raw_spin_lock(&rq1->lock);
1083 __acquire(rq2->lock); /* Fake it out ;) */
1084 } else {
1085 if (rq1 < rq2) {
1086 raw_spin_lock(&rq1->lock);
1087 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1088 } else {
1089 raw_spin_lock(&rq2->lock);
1090 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1091 }
1092 }
1093}
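
/*
 * Example of the ordering rule above: if CPU0 calls
 * double_rq_lock(rq0, rq1) while CPU1 calls double_rq_lock(rq1, rq0),
 * both paths take the lock at the lower address first (assumed here to
 * be &rq0->lock), so the classic AB-BA deadlock cannot occur.
 */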
1094
1095/*
1096 * double_rq_unlock - safely unlock two runqueues
1097 *
1098 * Note this does not restore interrupts like task_rq_unlock;
1099 * you need to do so manually after calling.
1100 */
1101static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1102 __releases(rq1->lock)
1103 __releases(rq2->lock)
1104{
1105 raw_spin_unlock(&rq1->lock);
1106 if (rq1 != rq2)
1107 raw_spin_unlock(&rq2->lock);
1108 else
1109 __release(rq2->lock);
1110}
1111
1112#else /* CONFIG_SMP */
1113
1114/*
1115 * double_rq_lock - safely lock two runqueues
1116 *
1117 * Note this does not disable interrupts like task_rq_lock;
1118 * you need to do so manually before calling.
1119 */
1120static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1121 __acquires(rq1->lock)
1122 __acquires(rq2->lock)
1123{
1124 BUG_ON(!irqs_disabled());
1125 BUG_ON(rq1 != rq2);
1126 raw_spin_lock(&rq1->lock);
1127 __acquire(rq2->lock); /* Fake it out ;) */
1128}
1129
1130/*
1131 * double_rq_unlock - safely unlock two runqueues
1132 *
1133 * Note this does not restore interrupts like task_rq_unlock;
1134 * you need to do so manually after calling.
1135 */
1136static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1137 __releases(rq1->lock)
1138 __releases(rq2->lock)
1139{
1140 BUG_ON(rq1 != rq2);
1141 raw_spin_unlock(&rq1->lock);
1142 __release(rq2->lock);
1143}
1144
1145#endif
1146
1147extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1148extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1149extern void print_cfs_stats(struct seq_file *m, int cpu);
1150extern void print_rt_stats(struct seq_file *m, int cpu);
1151
1152extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1153extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1154extern void unthrottle_offline_cfs_rqs(struct rq *rq);
1155
1156extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1157
1158#ifdef CONFIG_NO_HZ
1159enum rq_nohz_flag_bits {
1160 NOHZ_TICK_STOPPED,
1161 NOHZ_BALANCE_KICK,
1162 NOHZ_IDLE,
1163};
1164
1165#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1166#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
new file mode 100644
index 000000000000..2a581ba8e190
--- /dev/null
+++ b/kernel/sched/stats.c
@@ -0,0 +1,111 @@
1
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/seq_file.h>
5#include <linux/proc_fs.h>
6
7#include "sched.h"
8
9/*
10 * bump this up when changing the output format or the meaning of an existing
11 * format, so that tools can adapt (or abort)
12 */
13#define SCHEDSTAT_VERSION 15
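
/*
 * Illustrative /proc/schedstat output produced below (all numbers made
 * up):
 *
 *	version 15
 *	timestamp 4294892870
 *	cpu0 0 0 3214 1051 1267 849 18418851638 16445072447 12783
 *	domain0 00000003 1290 1280 5 ...
 */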
14
15static int show_schedstat(struct seq_file *seq, void *v)
16{
17 int cpu;
18 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
19 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
20
21 if (mask_str == NULL)
22 return -ENOMEM;
23
24 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
25 seq_printf(seq, "timestamp %lu\n", jiffies);
26 for_each_online_cpu(cpu) {
27 struct rq *rq = cpu_rq(cpu);
28#ifdef CONFIG_SMP
29 struct sched_domain *sd;
30 int dcount = 0;
31#endif
32
33 /* runqueue-specific stats */
34 seq_printf(seq,
35 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
36 cpu, rq->yld_count,
37 rq->sched_switch, rq->sched_count, rq->sched_goidle,
38 rq->ttwu_count, rq->ttwu_local,
39 rq->rq_cpu_time,
40 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
41
42 seq_printf(seq, "\n");
43
44#ifdef CONFIG_SMP
45 /* domain-specific stats */
46 rcu_read_lock();
47 for_each_domain(cpu, sd) {
48 enum cpu_idle_type itype;
49
50 cpumask_scnprintf(mask_str, mask_len,
51 sched_domain_span(sd));
52 seq_printf(seq, "domain%d %s", dcount++, mask_str);
53 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
54 itype++) {
55 seq_printf(seq, " %u %u %u %u %u %u %u %u",
56 sd->lb_count[itype],
57 sd->lb_balanced[itype],
58 sd->lb_failed[itype],
59 sd->lb_imbalance[itype],
60 sd->lb_gained[itype],
61 sd->lb_hot_gained[itype],
62 sd->lb_nobusyq[itype],
63 sd->lb_nobusyg[itype]);
64 }
65 seq_printf(seq,
66 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
67 sd->alb_count, sd->alb_failed, sd->alb_pushed,
68 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
69 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
70 sd->ttwu_wake_remote, sd->ttwu_move_affine,
71 sd->ttwu_move_balance);
72 }
73 rcu_read_unlock();
74#endif
75 }
76 kfree(mask_str);
77 return 0;
78}
79
80static int schedstat_open(struct inode *inode, struct file *file)
81{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86
87 if (!buf)
88 return -ENOMEM;
89 res = single_open(file, show_schedstat, NULL);
90 if (!res) {
91 m = file->private_data;
92 m->buf = buf;
93 m->size = size;
94 } else
95 kfree(buf);
96 return res;
97}
98
99static const struct file_operations proc_schedstat_operations = {
100 .open = schedstat_open,
101 .read = seq_read,
102 .llseek = seq_lseek,
103 .release = single_release,
104};
105
106static int __init proc_schedstat_init(void)
107{
108 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
109 return 0;
110}
111module_init(proc_schedstat_init);
diff --git a/kernel/sched_stats.h b/kernel/sched/stats.h
index 87f9e36ea56e..2ef90a51ec5e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched/stats.h
@@ -1,108 +1,5 @@
1 1
2#ifdef CONFIG_SCHEDSTATS 2#ifdef CONFIG_SCHEDSTATS
3/*
4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort)
6 */
7#define SCHEDSTAT_VERSION 15
8
9static int show_schedstat(struct seq_file *seq, void *v)
10{
11 int cpu;
12 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
13 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
14
15 if (mask_str == NULL)
16 return -ENOMEM;
17
18 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
19 seq_printf(seq, "timestamp %lu\n", jiffies);
20 for_each_online_cpu(cpu) {
21 struct rq *rq = cpu_rq(cpu);
22#ifdef CONFIG_SMP
23 struct sched_domain *sd;
24 int dcount = 0;
25#endif
26
27 /* runqueue-specific stats */
28 seq_printf(seq,
29 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
30 cpu, rq->yld_count,
31 rq->sched_switch, rq->sched_count, rq->sched_goidle,
32 rq->ttwu_count, rq->ttwu_local,
33 rq->rq_cpu_time,
34 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
35
36 seq_printf(seq, "\n");
37
38#ifdef CONFIG_SMP
39 /* domain-specific stats */
40 rcu_read_lock();
41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype;
43
44 cpumask_scnprintf(mask_str, mask_len,
45 sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) {
49 seq_printf(seq, " %u %u %u %u %u %u %u %u",
50 sd->lb_count[itype],
51 sd->lb_balanced[itype],
52 sd->lb_failed[itype],
53 sd->lb_imbalance[itype],
54 sd->lb_gained[itype],
55 sd->lb_hot_gained[itype],
56 sd->lb_nobusyq[itype],
57 sd->lb_nobusyg[itype]);
58 }
59 seq_printf(seq,
60 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
61 sd->alb_count, sd->alb_failed, sd->alb_pushed,
62 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
63 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance);
66 }
67 rcu_read_unlock();
68#endif
69 }
70 kfree(mask_str);
71 return 0;
72}
73
74static int schedstat_open(struct inode *inode, struct file *file)
75{
76 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
77 char *buf = kmalloc(size, GFP_KERNEL);
78 struct seq_file *m;
79 int res;
80
81 if (!buf)
82 return -ENOMEM;
83 res = single_open(file, show_schedstat, NULL);
84 if (!res) {
85 m = file->private_data;
86 m->buf = buf;
87 m->size = size;
88 } else
89 kfree(buf);
90 return res;
91}
92
93static const struct file_operations proc_schedstat_operations = {
94 .open = schedstat_open,
95 .read = seq_read,
96 .llseek = seq_lseek,
97 .release = single_release,
98};
99
100static int __init proc_schedstat_init(void)
101{
102 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
103 return 0;
104}
105module_init(proc_schedstat_init);
106 3
107/* 4/*
108 * Expects runqueue lock to be held for atomicity of update 5 * Expects runqueue lock to be held for atomicity of update
@@ -283,8 +180,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
283 return; 180 return;
284 181
285 raw_spin_lock(&cputimer->lock); 182 raw_spin_lock(&cputimer->lock);
286 cputimer->cputime.utime = 183 cputimer->cputime.utime += cputime;
287 cputime_add(cputimer->cputime.utime, cputime);
288 raw_spin_unlock(&cputimer->lock); 184 raw_spin_unlock(&cputimer->lock);
289} 185}
290 186
@@ -307,8 +203,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
307 return; 203 return;
308 204
309 raw_spin_lock(&cputimer->lock); 205 raw_spin_lock(&cputimer->lock);
310 cputimer->cputime.stime = 206 cputimer->cputime.stime += cputime;
311 cputime_add(cputimer->cputime.stime, cputime);
312 raw_spin_unlock(&cputimer->lock); 207 raw_spin_unlock(&cputimer->lock);
313} 208}
314 209
diff --git a/kernel/sched_stoptask.c b/kernel/sched/stop_task.c
index 8b44e7fa7fb3..7b386e86fd23 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched/stop_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * stop-task scheduling class. 4 * stop-task scheduling class.
3 * 5 *
@@ -80,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
80/* 82/*
81 * Simple, special scheduling class for the per-CPU stop tasks: 83 * Simple, special scheduling class for the per-CPU stop tasks:
82 */ 84 */
83static const struct sched_class stop_sched_class = { 85const struct sched_class stop_sched_class = {
84 .next = &rt_sched_class, 86 .next = &rt_sched_class,
85 87
86 .enqueue_task = enqueue_task_stop, 88 .enqueue_task = enqueue_task_stop,
diff --git a/kernel/signal.c b/kernel/signal.c
index b3f78d09a105..56ce3a618b28 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1629,10 +1629,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1629 info.si_uid = __task_cred(tsk)->uid; 1629 info.si_uid = __task_cred(tsk)->uid;
1630 rcu_read_unlock(); 1630 rcu_read_unlock();
1631 1631
1632 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, 1632 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
1633 tsk->signal->utime)); 1633 info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime);
1634 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1635 tsk->signal->stime));
1636 1634
1637 info.si_status = tsk->exit_code & 0x7f; 1635 info.si_status = tsk->exit_code & 0x7f;
1638 if (tsk->exit_code & 0x80) 1636 if (tsk->exit_code & 0x80)
@@ -1994,8 +1992,6 @@ static bool do_signal_stop(int signr)
1994 */ 1992 */
1995 if (!(sig->flags & SIGNAL_STOP_STOPPED)) 1993 if (!(sig->flags & SIGNAL_STOP_STOPPED))
1996 sig->group_exit_code = signr; 1994 sig->group_exit_code = signr;
1997 else
1998 WARN_ON_ONCE(!current->ptrace);
1999 1995
2000 sig->group_stop_count = 0; 1996 sig->group_stop_count = 0;
2001 1997
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 2c71d91efff0..4eb3a0fa351e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -347,12 +347,12 @@ void irq_exit(void)
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
348 invoke_softirq(); 348 invoke_softirq();
349 349
350 rcu_irq_exit();
351#ifdef CONFIG_NO_HZ 350#ifdef CONFIG_NO_HZ
352 /* Make sure that timer wheel updates are propagated */ 351 /* Make sure that timer wheel updates are propagated */
353 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 352 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
354 tick_nohz_stop_sched_tick(0); 353 tick_nohz_irq_exit();
355#endif 354#endif
355 rcu_irq_exit();
356 preempt_enable_no_resched(); 356 preempt_enable_no_resched();
357} 357}
358 358
diff --git a/kernel/sys.c b/kernel/sys.c
index 481611fbd079..ddf8155bf3f8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1605,7 +1605,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1605 unsigned long maxrss = 0; 1605 unsigned long maxrss = 0;
1606 1606
1607 memset((char *) r, 0, sizeof *r); 1607 memset((char *) r, 0, sizeof *r);
1608 utime = stime = cputime_zero; 1608 utime = stime = 0;
1609 1609
1610 if (who == RUSAGE_THREAD) { 1610 if (who == RUSAGE_THREAD) {
1611 task_times(current, &utime, &stime); 1611 task_times(current, &utime, &stime);
@@ -1635,8 +1635,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1635 1635
1636 case RUSAGE_SELF: 1636 case RUSAGE_SELF:
1637 thread_group_times(p, &tgutime, &tgstime); 1637 thread_group_times(p, &tgutime, &tgstime);
1638 utime = cputime_add(utime, tgutime); 1638 utime += tgutime;
1639 stime = cputime_add(stime, tgstime); 1639 stime += tgstime;
1640 r->ru_nvcsw += p->signal->nvcsw; 1640 r->ru_nvcsw += p->signal->nvcsw;
1641 r->ru_nivcsw += p->signal->nivcsw; 1641 r->ru_nivcsw += p->signal->nivcsw;
1642 r->ru_minflt += p->signal->min_flt; 1642 r->ru_minflt += p->signal->min_flt;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index c4eb71c8b2ea..1ecd6ba36d6c 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -387,7 +387,6 @@ void clockevents_exchange_device(struct clock_event_device *old,
387 * released list and do a notify add later. 387 * released list and do a notify add later.
388 */ 388 */
389 if (old) { 389 if (old) {
390 old->event_handler = clockevents_handle_noop;
391 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); 390 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
392 list_del(&old->list); 391 list_del(&old->list);
393 list_add(&old->list, &clockevents_released); 392 list_add(&old->list, &clockevents_released);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 40420644d0ba..7656642e4b8e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -275,42 +275,17 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
275} 275}
276EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 276EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
277 277
278/** 278static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
279 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
280 *
281 * When the next event is more than a tick into the future, stop the idle tick
282 * Called either from the idle loop or from irq_exit() when an idle period was
283 * just interrupted by an interrupt which did not cause a reschedule.
284 */
285void tick_nohz_stop_sched_tick(int inidle)
286{ 279{
287 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 280 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
288 struct tick_sched *ts;
289 ktime_t last_update, expires, now; 281 ktime_t last_update, expires, now;
290 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 282 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
291 u64 time_delta; 283 u64 time_delta;
292 int cpu; 284 int cpu;
293 285
294 local_irq_save(flags);
295
296 cpu = smp_processor_id(); 286 cpu = smp_processor_id();
297 ts = &per_cpu(tick_cpu_sched, cpu); 287 ts = &per_cpu(tick_cpu_sched, cpu);
298 288
299 /*
300 * Call to tick_nohz_start_idle stops the last_update_time from being
301 * updated. Thus, it must not be called in the event we are called from
302 * irq_exit() with the prior state different than idle.
303 */
304 if (!inidle && !ts->inidle)
305 goto end;
306
307 /*
308 * Set ts->inidle unconditionally. Even if the system did not
309 * switch to NOHZ mode the cpu frequency governers rely on the
310 * update of the idle time accounting in tick_nohz_start_idle().
311 */
312 ts->inidle = 1;
313
314 now = tick_nohz_start_idle(cpu, ts); 289 now = tick_nohz_start_idle(cpu, ts);
315 290
316 /* 291 /*
@@ -326,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle)
326 } 301 }
327 302
328 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 303 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
329 goto end; 304 return;
330 305
331 if (need_resched()) 306 if (need_resched())
332 goto end; 307 return;
333 308
334 if (unlikely(local_softirq_pending() && cpu_online(cpu))) { 309 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
335 static int ratelimit; 310 static int ratelimit;
@@ -339,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle)
339 (unsigned int) local_softirq_pending()); 314 (unsigned int) local_softirq_pending());
340 ratelimit++; 315 ratelimit++;
341 } 316 }
342 goto end; 317 return;
343 } 318 }
344 319
345 ts->idle_calls++; 320 ts->idle_calls++;
@@ -434,7 +409,6 @@ void tick_nohz_stop_sched_tick(int inidle)
434 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 409 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
435 ts->tick_stopped = 1; 410 ts->tick_stopped = 1;
436 ts->idle_jiffies = last_jiffies; 411 ts->idle_jiffies = last_jiffies;
437 rcu_enter_nohz();
438 } 412 }
439 413
440 ts->idle_sleeps++; 414 ts->idle_sleeps++;
@@ -472,8 +446,64 @@ out:
472 ts->next_jiffies = next_jiffies; 446 ts->next_jiffies = next_jiffies;
473 ts->last_jiffies = last_jiffies; 447 ts->last_jiffies = last_jiffies;
474 ts->sleep_length = ktime_sub(dev->next_event, now); 448 ts->sleep_length = ktime_sub(dev->next_event, now);
475end: 449}
476 local_irq_restore(flags); 450
451/**
452 * tick_nohz_idle_enter - stop the idle tick from the idle task
453 *
454 * When the next event is more than a tick into the future, stop the idle tick
455 * Called when we start the idle loop.
456 *
457 * The arch is responsible for calling:
458 *
459 * - rcu_idle_enter() after its last use of RCU before the CPU is put
460 * to sleep.
461 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
462 */
463void tick_nohz_idle_enter(void)
464{
465 struct tick_sched *ts;
466
467 WARN_ON_ONCE(irqs_disabled());
468
469 /*
470 * Update the idle state in the scheduler domain hierarchy
471 * when tick_nohz_stop_sched_tick() is called from the idle loop.
472 * State will be updated to busy during the first busy tick after
473 * exiting idle.
474 */
475 set_cpu_sd_state_idle();
476
477 local_irq_disable();
478
479 ts = &__get_cpu_var(tick_cpu_sched);
480 /*
481 * Set ts->inidle unconditionally. Even if the system did not
482 * switch to NOHZ mode the cpu frequency governors rely on the
483 * update of the idle time accounting in tick_nohz_start_idle().
484 */
485 ts->inidle = 1;
486 tick_nohz_stop_sched_tick(ts);
487
488 local_irq_enable();
489}
490
491/**
492 * tick_nohz_irq_exit - update next tick event from interrupt exit
493 *
494 * When an interrupt fires while we are idle and it doesn't cause
495 * a reschedule, it may still add, modify or delete a timer, enqueue
496 * an RCU callback, etc...
497 * So we need to re-calculate and reprogram the next tick event.
498 */
499void tick_nohz_irq_exit(void)
500{
501 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
502
503 if (!ts->inidle)
504 return;
505
506 tick_nohz_stop_sched_tick(ts);
477} 507}
478 508
479/** 509/**
@@ -515,11 +545,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
515} 545}
516 546
517/** 547/**
518 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task 548 * tick_nohz_idle_exit - restart the idle tick from the idle task
519 * 549 *
520 * Restart the idle tick when the CPU is woken up from idle 550 * Restart the idle tick when the CPU is woken up from idle
551 * This also exits the RCU extended quiescent state. The CPU
552 * can use RCU again after this function is called.
521 */ 553 */
522void tick_nohz_restart_sched_tick(void) 554void tick_nohz_idle_exit(void)
523{ 555{
524 int cpu = smp_processor_id(); 556 int cpu = smp_processor_id();
525 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 557 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -529,6 +561,7 @@ void tick_nohz_restart_sched_tick(void)
529 ktime_t now; 561 ktime_t now;
530 562
531 local_irq_disable(); 563 local_irq_disable();
564
532 if (ts->idle_active || (ts->inidle && ts->tick_stopped)) 565 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
533 now = ktime_get(); 566 now = ktime_get();
534 567
@@ -543,8 +576,6 @@ void tick_nohz_restart_sched_tick(void)
543 576
544 ts->inidle = 0; 577 ts->inidle = 0;
545 578
546 rcu_exit_nohz();
547
548 /* Update jiffies first */ 579 /* Update jiffies first */
549 select_nohz_load_balancer(0); 580 select_nohz_load_balancer(0);
550 tick_do_update_jiffies64(now); 581 tick_do_update_jiffies64(now);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 237841378c03..0c6358186401 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void)
131 /* calculate the delta since the last update_wall_time: */ 131 /* calculate the delta since the last update_wall_time: */
132 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 132 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
133 133
134 /* return delta convert to nanoseconds using ntp adjusted mult. */ 134 /* return delta convert to nanoseconds. */
135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
136} 136}
137 137
@@ -813,11 +813,11 @@ static void timekeeping_adjust(s64 offset)
813 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. 813 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
814 * 814 *
815 * Note we subtract one in the shift, so that error is really error*2. 815 * Note we subtract one in the shift, so that error is really error*2.
816 * This "saves" dividing(shifting) intererval twice, but keeps the 816 * This "saves" dividing(shifting) interval twice, but keeps the
817 * (error > interval) comparision as still measuring if error is 817 * (error > interval) comparison as still measuring if error is
818 * larger then half an interval. 818 * larger then half an interval.
819 * 819 *
820 * Note: It does not "save" on aggrivation when reading the code. 820 * Note: It does not "save" on aggravation when reading the code.
821 */ 821 */
822 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); 822 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
823 if (error > interval) { 823 if (error > interval) {
@@ -833,7 +833,7 @@ static void timekeeping_adjust(s64 offset)
833 * nanosecond, and store the amount rounded up into 833 * nanosecond, and store the amount rounded up into
834 * the error. This causes the likely below to be unlikely. 834 * the error. This causes the likely below to be unlikely.
835 * 835 *
836 * The properfix is to avoid rounding up by using 836 * The proper fix is to avoid rounding up by using
837 * the high precision timekeeper.xtime_nsec instead of 837 * the high precision timekeeper.xtime_nsec instead of
838 * xtime.tv_nsec everywhere. Fixing this will take some 838 * xtime.tv_nsec everywhere. Fixing this will take some
839 * time. 839 * time.
diff --git a/kernel/timer.c b/kernel/timer.c
index 9c3c62b0c4bc..a297ffcf888e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -427,6 +427,12 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
427 } 427 }
428} 428}
429 429
430/* Stub timer callback for improperly used timers. */
431static void stub_timer(unsigned long data)
432{
433 WARN_ON(1);
434}
435
430/* 436/*
431 * fixup_activate is called when: 437 * fixup_activate is called when:
432 * - an active object is activated 438 * - an active object is activated
@@ -450,7 +456,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
450 debug_object_activate(timer, &timer_debug_descr); 456 debug_object_activate(timer, &timer_debug_descr);
451 return 0; 457 return 0;
452 } else { 458 } else {
453 WARN_ON_ONCE(1); 459 setup_timer(timer, stub_timer, 0);
460 return 1;
454 } 461 }
455 return 0; 462 return 0;
456 463
@@ -480,12 +487,40 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
480 } 487 }
481} 488}
482 489
490/*
491 * fixup_assert_init is called when:
492 * - an untracked/uninit-ed object is found
493 */
494static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
495{
496 struct timer_list *timer = addr;
497
498 switch (state) {
499 case ODEBUG_STATE_NOTAVAILABLE:
500 if (timer->entry.prev == TIMER_ENTRY_STATIC) {
501 /*
502 * This is not really a fixup. The timer was
503 * statically initialized. We just make sure that it
504 * is tracked in the object tracker.
505 */
506 debug_object_init(timer, &timer_debug_descr);
507 return 0;
508 } else {
509 setup_timer(timer, stub_timer, 0);
510 return 1;
511 }
512 default:
513 return 0;
514 }
515}
516
483static struct debug_obj_descr timer_debug_descr = { 517static struct debug_obj_descr timer_debug_descr = {
484 .name = "timer_list", 518 .name = "timer_list",
485 .debug_hint = timer_debug_hint, 519 .debug_hint = timer_debug_hint,
486 .fixup_init = timer_fixup_init, 520 .fixup_init = timer_fixup_init,
487 .fixup_activate = timer_fixup_activate, 521 .fixup_activate = timer_fixup_activate,
488 .fixup_free = timer_fixup_free, 522 .fixup_free = timer_fixup_free,
523 .fixup_assert_init = timer_fixup_assert_init,
489}; 524};
490 525
491static inline void debug_timer_init(struct timer_list *timer) 526static inline void debug_timer_init(struct timer_list *timer)
@@ -508,6 +543,11 @@ static inline void debug_timer_free(struct timer_list *timer)
508 debug_object_free(timer, &timer_debug_descr); 543 debug_object_free(timer, &timer_debug_descr);
509} 544}
510 545
546static inline void debug_timer_assert_init(struct timer_list *timer)
547{
548 debug_object_assert_init(timer, &timer_debug_descr);
549}
550
511static void __init_timer(struct timer_list *timer, 551static void __init_timer(struct timer_list *timer,
512 const char *name, 552 const char *name,
513 struct lock_class_key *key); 553 struct lock_class_key *key);
@@ -531,6 +571,7 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
531static inline void debug_timer_init(struct timer_list *timer) { } 571static inline void debug_timer_init(struct timer_list *timer) { }
532static inline void debug_timer_activate(struct timer_list *timer) { } 572static inline void debug_timer_activate(struct timer_list *timer) { }
533static inline void debug_timer_deactivate(struct timer_list *timer) { } 573static inline void debug_timer_deactivate(struct timer_list *timer) { }
574static inline void debug_timer_assert_init(struct timer_list *timer) { }
534#endif 575#endif
535 576
536static inline void debug_init(struct timer_list *timer) 577static inline void debug_init(struct timer_list *timer)
@@ -552,6 +593,11 @@ static inline void debug_deactivate(struct timer_list *timer)
552 trace_timer_cancel(timer); 593 trace_timer_cancel(timer);
553} 594}
554 595
596static inline void debug_assert_init(struct timer_list *timer)
597{
598 debug_timer_assert_init(timer);
599}
600
555static void __init_timer(struct timer_list *timer, 601static void __init_timer(struct timer_list *timer,
556 const char *name, 602 const char *name,
557 struct lock_class_key *key) 603 struct lock_class_key *key)
@@ -902,6 +948,8 @@ int del_timer(struct timer_list *timer)
902 unsigned long flags; 948 unsigned long flags;
903 int ret = 0; 949 int ret = 0;
904 950
951 debug_assert_init(timer);
952
905 timer_stats_timer_clear_start_info(timer); 953 timer_stats_timer_clear_start_info(timer);
906 if (timer_pending(timer)) { 954 if (timer_pending(timer)) {
907 base = lock_timer_base(timer, &flags); 955 base = lock_timer_base(timer, &flags);
@@ -932,6 +980,8 @@ int try_to_del_timer_sync(struct timer_list *timer)
932 unsigned long flags; 980 unsigned long flags;
933 int ret = -1; 981 int ret = -1;
934 982
983 debug_assert_init(timer);
984
935 base = lock_timer_base(timer, &flags); 985 base = lock_timer_base(timer, &flags);
936 986
937 if (base->running_timer == timer) 987 if (base->running_timer == timer)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f2bd275bb60f..91dc4bc8bf72 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -338,7 +338,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
338/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
342 TRACE_ITER_IRQ_INFO;
342 343
343static int trace_stop_count; 344static int trace_stop_count;
344static DEFINE_RAW_SPINLOCK(tracing_start_lock); 345static DEFINE_RAW_SPINLOCK(tracing_start_lock);
@@ -426,6 +427,7 @@ static const char *trace_options[] = {
426 "record-cmd", 427 "record-cmd",
427 "overwrite", 428 "overwrite",
428 "disable_on_free", 429 "disable_on_free",
430 "irq-info",
429 NULL 431 NULL
430}; 432};
431 433
@@ -1843,6 +1845,33 @@ static void s_stop(struct seq_file *m, void *p)
1843 trace_event_read_unlock(); 1845 trace_event_read_unlock();
1844} 1846}
1845 1847
1848static void
1849get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries)
1850{
1851 unsigned long count;
1852 int cpu;
1853
1854 *total = 0;
1855 *entries = 0;
1856
1857 for_each_tracing_cpu(cpu) {
1858 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1859 /*
1860 * If this buffer has skipped entries, then we hold all
1861 * entries for the trace and we need to ignore the
1862 * ones before the time stamp.
1863 */
1864 if (tr->data[cpu]->skipped_entries) {
1865 count -= tr->data[cpu]->skipped_entries;
1866 /* total is the same as the entries */
1867 *total += count;
1868 } else
1869 *total += count +
1870 ring_buffer_overrun_cpu(tr->buffer, cpu);
1871 *entries += count;
1872 }
1873}
1874
1846static void print_lat_help_header(struct seq_file *m) 1875static void print_lat_help_header(struct seq_file *m)
1847{ 1876{
1848 seq_puts(m, "# _------=> CPU# \n"); 1877 seq_puts(m, "# _------=> CPU# \n");
@@ -1855,12 +1884,35 @@ static void print_lat_help_header(struct seq_file *m)
1855 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1884 seq_puts(m, "# \\ / ||||| \\ | / \n");
1856} 1885}
1857 1886
1858static void print_func_help_header(struct seq_file *m) 1887static void print_event_info(struct trace_array *tr, struct seq_file *m)
1888{
1889 unsigned long total;
1890 unsigned long entries;
1891
1892 get_total_entries(tr, &total, &entries);
1893 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
1894 entries, total, num_online_cpus());
1895 seq_puts(m, "#\n");
1896}
1897
1898static void print_func_help_header(struct trace_array *tr, struct seq_file *m)
1859{ 1899{
1860 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 1900 print_event_info(tr, m);
1901 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
1861 seq_puts(m, "# | | | | |\n"); 1902 seq_puts(m, "# | | | | |\n");
1862} 1903}
1863 1904
1905static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m)
1906{
1907 print_event_info(tr, m);
1908 seq_puts(m, "# _-----=> irqs-off\n");
1909 seq_puts(m, "# / _----=> need-resched\n");
1910 seq_puts(m, "# | / _---=> hardirq/softirq\n");
1911 seq_puts(m, "# || / _--=> preempt-depth\n");
1912 seq_puts(m, "# ||| / delay\n");
1913 seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
1914 seq_puts(m, "# | | | |||| | |\n");
1915}
1864 1916
1865void 1917void
1866print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1918print_trace_header(struct seq_file *m, struct trace_iterator *iter)
@@ -1869,32 +1921,14 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1869 struct trace_array *tr = iter->tr; 1921 struct trace_array *tr = iter->tr;
1870 struct trace_array_cpu *data = tr->data[tr->cpu]; 1922 struct trace_array_cpu *data = tr->data[tr->cpu];
1871 struct tracer *type = current_trace; 1923 struct tracer *type = current_trace;
1872 unsigned long entries = 0; 1924 unsigned long entries;
1873 unsigned long total = 0; 1925 unsigned long total;
1874 unsigned long count;
1875 const char *name = "preemption"; 1926 const char *name = "preemption";
1876 int cpu;
1877 1927
1878 if (type) 1928 if (type)
1879 name = type->name; 1929 name = type->name;
1880 1930
1881 1931 get_total_entries(tr, &total, &entries);
1882 for_each_tracing_cpu(cpu) {
1883 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1884 /*
1885 * If this buffer has skipped entries, then we hold all
1886 * entries for the trace and we need to ignore the
1887 * ones before the time stamp.
1888 */
1889 if (tr->data[cpu]->skipped_entries) {
1890 count -= tr->data[cpu]->skipped_entries;
1891 /* total is the same as the entries */
1892 total += count;
1893 } else
1894 total += count +
1895 ring_buffer_overrun_cpu(tr->buffer, cpu);
1896 entries += count;
1897 }
1898 1932
1899 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 1933 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1900 name, UTS_RELEASE); 1934 name, UTS_RELEASE);
@@ -2140,6 +2174,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2140 return print_trace_fmt(iter); 2174 return print_trace_fmt(iter);
2141} 2175}
2142 2176
2177void trace_latency_header(struct seq_file *m)
2178{
2179 struct trace_iterator *iter = m->private;
2180
2181 /* print nothing if the buffers are empty */
2182 if (trace_empty(iter))
2183 return;
2184
2185 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
2186 print_trace_header(m, iter);
2187
2188 if (!(trace_flags & TRACE_ITER_VERBOSE))
2189 print_lat_help_header(m);
2190}
2191
2143void trace_default_header(struct seq_file *m) 2192void trace_default_header(struct seq_file *m)
2144{ 2193{
2145 struct trace_iterator *iter = m->private; 2194 struct trace_iterator *iter = m->private;
@@ -2155,8 +2204,12 @@ void trace_default_header(struct seq_file *m)
2155 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2204 if (!(trace_flags & TRACE_ITER_VERBOSE))
2156 print_lat_help_header(m); 2205 print_lat_help_header(m);
2157 } else { 2206 } else {
2158 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2207 if (!(trace_flags & TRACE_ITER_VERBOSE)) {
2159 print_func_help_header(m); 2208 if (trace_flags & TRACE_ITER_IRQ_INFO)
2209 print_func_help_header_irq(iter->tr, m);
2210 else
2211 print_func_help_header(iter->tr, m);
2212 }
2160 } 2213 }
2161} 2214}
2162 2215
@@ -4775,6 +4828,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4775{ 4828{
4776 __ftrace_dump(true, oops_dump_mode); 4829 __ftrace_dump(true, oops_dump_mode);
4777} 4830}
4831EXPORT_SYMBOL_GPL(ftrace_dump);
4778 4832
4779__init static int tracer_alloc_buffers(void) 4833__init static int tracer_alloc_buffers(void)
4780{ 4834{
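get_total_entries() factors the per-cpu ring buffer accounting out of print_trace_header() so the new print_event_info() banner can reuse it. A stand-alone sketch of the same arithmetic for a single cpu; sketch_total_on_cpu() is illustrative and not part of the patch:

/*
 * "entries" is what is still readable in the buffer; "total" also
 * counts records lost to overruns, except when the cpu buffer was
 * reset mid-trace (skipped_entries set), where the two must agree.
 */
static unsigned long sketch_total_on_cpu(struct trace_array *tr, int cpu)
{
	unsigned long count = ring_buffer_entries_cpu(tr->buffer, cpu);

	if (tr->data[cpu]->skipped_entries)
		return count - tr->data[cpu]->skipped_entries;

	return count + ring_buffer_overrun_cpu(tr->buffer, cpu);
}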
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 092e1f8d18dc..2c2657462ac3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr,
370 unsigned long ip, 370 unsigned long ip,
371 unsigned long parent_ip, 371 unsigned long parent_ip,
372 unsigned long flags, int pc); 372 unsigned long flags, int pc);
373void trace_latency_header(struct seq_file *m);
373void trace_default_header(struct seq_file *m); 374void trace_default_header(struct seq_file *m);
374void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 375void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
375int trace_empty(struct trace_iterator *iter); 376int trace_empty(struct trace_iterator *iter);
@@ -654,6 +655,7 @@ enum trace_iterator_flags {
654 TRACE_ITER_RECORD_CMD = 0x100000, 655 TRACE_ITER_RECORD_CMD = 0x100000,
655 TRACE_ITER_OVERWRITE = 0x200000, 656 TRACE_ITER_OVERWRITE = 0x200000,
656 TRACE_ITER_STOP_ON_FREE = 0x400000, 657 TRACE_ITER_STOP_ON_FREE = 0x400000,
658 TRACE_ITER_IRQ_INFO = 0x800000,
657}; 659};
658 660
659/* 661/*
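The new TRACE_ITER_IRQ_INFO bit has to stay in step with the "irq-info" string appended to trace_options[] in trace.c, because an option's index in that array is its bit position. A hypothetical compile-time cross-check, not part of the patch:

static inline void sketch_check_irq_info_bit(void)
{
	/* 0x800000 is bit 23, matching the 24th trace_options[] entry */
	BUILD_BUG_ON(TRACE_ITER_IRQ_INFO != (1 << 23));
}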
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 95dc31efd6dd..f04cc3136bd3 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -27,6 +27,12 @@
27#include "trace.h" 27#include "trace.h"
28#include "trace_output.h" 28#include "trace_output.h"
29 29
30#define DEFAULT_SYS_FILTER_MESSAGE \
31 "### global filter ###\n" \
32 "# Use this to set filters for multiple events.\n" \
33 "# Only events with the given fields will be affected.\n" \
34 "# If no events are modified, an error message will be displayed here"
35
30enum filter_op_ids 36enum filter_op_ids
31{ 37{
32 OP_OR, 38 OP_OR,
@@ -646,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
646 if (filter && filter->filter_string) 652 if (filter && filter->filter_string)
647 trace_seq_printf(s, "%s\n", filter->filter_string); 653 trace_seq_printf(s, "%s\n", filter->filter_string);
648 else 654 else
649 trace_seq_printf(s, "none\n"); 655 trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n");
650 mutex_unlock(&event_mutex); 656 mutex_unlock(&event_mutex);
651} 657}
652 658
@@ -1838,7 +1844,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1838 if (!filter) 1844 if (!filter)
1839 goto out; 1845 goto out;
1840 1846
1841 replace_filter_string(filter, filter_string); 1847 /* System filters just show a default message */
1848 kfree(filter->filter_string);
1849 filter->filter_string = NULL;
1850
1842 /* 1851 /*
1843 * No event actually uses the system filter 1852 * No event actually uses the system filter
1844 * we can free it without synchronize_sched(). 1853 * we can free it without synchronize_sched().
@@ -1848,14 +1857,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1848 1857
1849 parse_init(ps, filter_ops, filter_string); 1858 parse_init(ps, filter_ops, filter_string);
1850 err = filter_parse(ps); 1859 err = filter_parse(ps);
1851 if (err) { 1860 if (err)
1852 append_filter_err(ps, system->filter); 1861 goto err_filter;
1853 goto out;
1854 }
1855 1862
1856 err = replace_system_preds(system, ps, filter_string); 1863 err = replace_system_preds(system, ps, filter_string);
1857 if (err) 1864 if (err)
1858 append_filter_err(ps, system->filter); 1865 goto err_filter;
1859 1866
1860out: 1867out:
1861 filter_opstack_clear(ps); 1868 filter_opstack_clear(ps);
@@ -1865,6 +1872,11 @@ out_unlock:
1865 mutex_unlock(&event_mutex); 1872 mutex_unlock(&event_mutex);
1866 1873
1867 return err; 1874 return err;
1875
1876err_filter:
1877 replace_filter_string(filter, filter_string);
1878 append_filter_err(ps, system->filter);
1879 goto out;
1868} 1880}
1869 1881
1870#ifdef CONFIG_PERF_EVENTS 1882#ifdef CONFIG_PERF_EVENTS
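The rework above drops the stored system filter string up front, so readers of the filter file only ever see DEFAULT_SYS_FILTER_MESSAGE, and funnels both parse and apply failures through a single err_filter label that re-installs the user's string purely so append_filter_err() can quote what failed. A condensed sketch of the idiom; sketch_parse(), sketch_install() and sketch_cleanup() are illustrative stand-ins:

static int sketch_apply(struct event_filter *filter, const char *string)
{
	int err;

	err = sketch_parse(string);
	if (err)
		goto err_report;

	err = sketch_install(filter, string);
	if (err)
		goto err_report;
out:
	sketch_cleanup();
	return err;

err_report:
	/* put the failed string back so the error output can show it */
	replace_filter_string(filter, string);
	goto out;
}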
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 20dad0d7a163..99d20e920368 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
280} 280}
281 281
282static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } 282static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
283static void irqsoff_print_header(struct seq_file *s) { }
284static void irqsoff_trace_open(struct trace_iterator *iter) { } 283static void irqsoff_trace_open(struct trace_iterator *iter) { }
285static void irqsoff_trace_close(struct trace_iterator *iter) { } 284static void irqsoff_trace_close(struct trace_iterator *iter) { }
285
286#ifdef CONFIG_FUNCTION_TRACER
287static void irqsoff_print_header(struct seq_file *s)
288{
289 trace_default_header(s);
290}
291#else
292static void irqsoff_print_header(struct seq_file *s)
293{
294 trace_latency_header(s);
295}
296#endif /* CONFIG_FUNCTION_TRACER */
286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 297#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
287 298
288/* 299/*
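With this change irqsoff_print_header() is only a trace_default_header() wrapper when the function tracer is available; otherwise it falls back to the new trace_latency_header(), so the latency banner still appears without CONFIG_FUNCTION_TRACER. A sketch of the resulting three-way dispatch; sketch_graph_header() stands in for the unchanged graph-tracer path above:

static void sketch_print_header(struct seq_file *s)
{
#if defined(CONFIG_FUNCTION_GRAPH_TRACER)
	sketch_graph_header(s);		/* graph output, unchanged */
#elif defined(CONFIG_FUNCTION_TRACER)
	trace_default_header(s);	/* irq-info aware function headers */
#else
	trace_latency_header(s);	/* latency banner plus lat columns */
#endif
}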
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 51999309a6cf..0d6ff3555942 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -627,11 +627,23 @@ int trace_print_context(struct trace_iterator *iter)
627 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 627 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
628 unsigned long secs = (unsigned long)t; 628 unsigned long secs = (unsigned long)t;
629 char comm[TASK_COMM_LEN]; 629 char comm[TASK_COMM_LEN];
630 int ret;
630 631
631 trace_find_cmdline(entry->pid, comm); 632 trace_find_cmdline(entry->pid, comm);
632 633
633 return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", 634 ret = trace_seq_printf(s, "%16s-%-5d [%03d] ",
634 comm, entry->pid, iter->cpu, secs, usec_rem); 635 comm, entry->pid, iter->cpu);
636 if (!ret)
637 return 0;
638
639 if (trace_flags & TRACE_ITER_IRQ_INFO) {
640 ret = trace_print_lat_fmt(s, entry);
641 if (!ret)
642 return 0;
643 }
644
645 return trace_seq_printf(s, " %5lu.%06lu: ",
646 secs, usec_rem);
635} 647}
636 648
637int trace_print_lat_context(struct trace_iterator *iter) 649int trace_print_lat_context(struct trace_iterator *iter)
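trace_seq_printf() returns 0 once the seq buffer is full, so the context line is now emitted in checked stages: task/pid/cpu, the optional latency fields, then the timestamp. A compressed sketch of the pattern, with "comm" standing in for the name trace_find_cmdline() looks up:

static int sketch_print_ctx(struct trace_seq *s, struct trace_entry *entry,
			    int cpu, unsigned long secs, unsigned long usec_rem)
{
	if (!trace_seq_printf(s, "%16s-%-5d [%03d] ", "comm", entry->pid, cpu))
		return 0;

	if ((trace_flags & TRACE_ITER_IRQ_INFO) &&
	    !trace_print_lat_fmt(s, entry))
		return 0;

	return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
}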
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e4a70c0c71b6..ff791ea48b57 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
280} 280}
281 281
282static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } 282static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
283static void wakeup_print_header(struct seq_file *s) { }
284static void wakeup_trace_open(struct trace_iterator *iter) { } 283static void wakeup_trace_open(struct trace_iterator *iter) { }
285static void wakeup_trace_close(struct trace_iterator *iter) { } 284static void wakeup_trace_close(struct trace_iterator *iter) { }
285
286#ifdef CONFIG_FUNCTION_TRACER
287static void wakeup_print_header(struct seq_file *s)
288{
289 trace_default_header(s);
290}
291#else
292static void wakeup_print_header(struct seq_file *s)
293{
294 trace_latency_header(s);
295}
296#endif /* CONFIG_FUNCTION_TRACER */
286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 297#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
287 298
288/* 299/*
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 5bbfac85866e..23b4d784ebdd 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -127,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk)
127 127
128 local_irq_save(flags); 128 local_irq_save(flags);
129 time = tsk->stime + tsk->utime; 129 time = tsk->stime + tsk->utime;
130 dtime = cputime_sub(time, tsk->acct_timexpd); 130 dtime = time - tsk->acct_timexpd;
131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
132 delta = value.tv_sec; 132 delta = value.tv_sec;
133 delta = delta * USEC_PER_SEC + value.tv_usec; 133 delta = delta * USEC_PER_SEC + value.tv_usec;
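This hunk is part of the tree-wide cputime cleanup visible in the arch cputime.h entries of the diffstat: cputime_t becomes a plain scalar, so wrappers like cputime_sub() reduce to ordinary operators. A minimal sketch of the equivalence, assuming the post-cleanup types:

#include <linux/sched.h>

static cputime_t sketch_unaccounted_time(struct task_struct *tsk)
{
	cputime_t time = tsk->stime + tsk->utime;

	return time - tsk->acct_timexpd;	/* was cputime_sub(time, ...) */
}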
diff --git a/kernel/wait.c b/kernel/wait.c
index 26fa7797f90f..7fdd9eaca2c3 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,10 +10,10 @@
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) 13void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
14{ 14{
15 spin_lock_init(&q->lock); 15 spin_lock_init(&q->lock);
16 lockdep_set_class(&q->lock, key); 16 lockdep_set_class_and_name(&q->lock, key, name);
17 INIT_LIST_HEAD(&q->task_list); 17 INIT_LIST_HEAD(&q->task_list);
18} 18}
19 19
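The extra name argument lets lockdep report each waitqueue lock under the variable's own name instead of one shared class for every waitqueue. A sketch of the expected caller side, assuming <linux/wait.h> stringifies the queue in the usual way:

#define init_waitqueue_head(q)					\
	do {							\
		static struct lock_class_key __key;		\
								\
		__init_waitqueue_head((q), #q, &__key);		\
	} while (0)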
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index a78b7c6e042c..77cb245f8e7b 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -268,12 +268,16 @@ static void debug_print_object(struct debug_obj *obj, char *msg)
268 * Try to repair the damage, so we have a better chance to get useful 268 * Try to repair the damage, so we have a better chance to get useful
269 * debug output. 269 * debug output.
270 */ 270 */
271static void 271static int
272debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state), 272debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state),
273 void * addr, enum debug_obj_state state) 273 void * addr, enum debug_obj_state state)
274{ 274{
275 int fixed = 0;
276
275 if (fixup) 277 if (fixup)
276 debug_objects_fixups += fixup(addr, state); 278 fixed = fixup(addr, state);
279 debug_objects_fixups += fixed;
280 return fixed;
277} 281}
278 282
279static void debug_object_is_on_stack(void *addr, int onstack) 283static void debug_object_is_on_stack(void *addr, int onstack)
@@ -386,6 +390,9 @@ void debug_object_activate(void *addr, struct debug_obj_descr *descr)
386 struct debug_bucket *db; 390 struct debug_bucket *db;
387 struct debug_obj *obj; 391 struct debug_obj *obj;
388 unsigned long flags; 392 unsigned long flags;
393 struct debug_obj o = { .object = addr,
394 .state = ODEBUG_STATE_NOTAVAILABLE,
395 .descr = descr };
389 396
390 if (!debug_objects_enabled) 397 if (!debug_objects_enabled)
391 return; 398 return;
@@ -425,8 +432,9 @@ void debug_object_activate(void *addr, struct debug_obj_descr *descr)
425 * let the type specific code decide whether this is 432 * let the type specific code decide whether this is
426 * true or not. 433 * true or not.
427 */ 434 */
428 debug_object_fixup(descr->fixup_activate, addr, 435 if (debug_object_fixup(descr->fixup_activate, addr,
429 ODEBUG_STATE_NOTAVAILABLE); 436 ODEBUG_STATE_NOTAVAILABLE))
437 debug_print_object(&o, "activate");
430} 438}
431 439
432/** 440/**
@@ -563,6 +571,44 @@ out_unlock:
563} 571}
564 572
565/** 573/**
574 * debug_object_assert_init - debug checks when object should be init-ed
575 * @addr: address of the object
576 * @descr: pointer to an object specific debug description structure
577 */
578void debug_object_assert_init(void *addr, struct debug_obj_descr *descr)
579{
580 struct debug_bucket *db;
581 struct debug_obj *obj;
582 unsigned long flags;
583
584 if (!debug_objects_enabled)
585 return;
586
587 db = get_bucket((unsigned long) addr);
588
589 raw_spin_lock_irqsave(&db->lock, flags);
590
591 obj = lookup_object(addr, db);
592 if (!obj) {
593 struct debug_obj o = { .object = addr,
594 .state = ODEBUG_STATE_NOTAVAILABLE,
595 .descr = descr };
596
597 raw_spin_unlock_irqrestore(&db->lock, flags);
598 /*
599 * Maybe the object is static. Let the type specific
600 * code decide what to do.
601 */
602 if (debug_object_fixup(descr->fixup_assert_init, addr,
603 ODEBUG_STATE_NOTAVAILABLE))
604 debug_print_object(&o, "assert_init");
605 return;
606 }
607
608 raw_spin_unlock_irqrestore(&db->lock, flags);
609}
610
611/**
566 * debug_object_active_state - debug checks object usage state machine 612 * debug_object_active_state - debug checks object usage state machine
567 * @addr: address of the object 613 * @addr: address of the object
568 * @descr: pointer to an object specific debug description structure 614 * @descr: pointer to an object specific debug description structure
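debug_object_assert_init() gives object types a chance to legitimize untracked, typically statically allocated, objects through the new fixup_assert_init descriptor hook; per the hunk above, a non-zero return from the fixup is what triggers the "assert_init" report. A sketch of such a hook with illustrative names; sketch_is_static_object() is hypothetical:

static struct debug_obj_descr sketch_descr;	/* hypothetical descriptor */

static int sketch_fixup_assert_init(void *addr, enum debug_obj_state state)
{
	if (state != ODEBUG_STATE_NOTAVAILABLE)
		return 0;

	if (sketch_is_static_object(addr)) {
		/* quietly start tracking a known-good static object */
		debug_object_init(addr, &sketch_descr);
		return 0;
	}

	return 1;	/* not static either: let "assert_init" be reported */
}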
diff --git a/mm/Kconfig b/mm/Kconfig
index 011b110365c8..e338407f1225 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -131,6 +131,12 @@ config SPARSEMEM_VMEMMAP
131config HAVE_MEMBLOCK 131config HAVE_MEMBLOCK
132 boolean 132 boolean
133 133
134config HAVE_MEMBLOCK_NODE_MAP
135 boolean
136
137config ARCH_DISCARD_MEMBLOCK
138 boolean
139
134config NO_BOOTMEM 140config NO_BOOTMEM
135 boolean 141 boolean
136 142
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 73f17c0293c0..2316840b337a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -901,7 +901,6 @@ retry:
901 h->resv_huge_pages += delta; 901 h->resv_huge_pages += delta;
902 ret = 0; 902 ret = 0;
903 903
904 spin_unlock(&hugetlb_lock);
905 /* Free the needed pages to the hugetlb pool */ 904 /* Free the needed pages to the hugetlb pool */
906 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 905 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
907 if ((--needed) < 0) 906 if ((--needed) < 0)
@@ -915,6 +914,7 @@ retry:
915 VM_BUG_ON(page_count(page)); 914 VM_BUG_ON(page_count(page));
916 enqueue_huge_page(h, page); 915 enqueue_huge_page(h, page);
917 } 916 }
917 spin_unlock(&hugetlb_lock);
918 918
919 /* Free unnecessary surplus pages to the buddy allocator */ 919 /* Free unnecessary surplus pages to the buddy allocator */
920free: 920free:
diff --git a/mm/memblock.c b/mm/memblock.c
index 84bec4969ed5..2f55f19b7c86 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,12 +20,23 @@
20#include <linux/seq_file.h> 20#include <linux/seq_file.h>
21#include <linux/memblock.h> 21#include <linux/memblock.h>
22 22
23struct memblock memblock __initdata_memblock; 23static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
24static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
25
26struct memblock memblock __initdata_memblock = {
27 .memory.regions = memblock_memory_init_regions,
28 .memory.cnt = 1, /* empty dummy entry */
29 .memory.max = INIT_MEMBLOCK_REGIONS,
30
31 .reserved.regions = memblock_reserved_init_regions,
32 .reserved.cnt = 1, /* empty dummy entry */
33 .reserved.max = INIT_MEMBLOCK_REGIONS,
34
35 .current_limit = MEMBLOCK_ALLOC_ANYWHERE,
36};
24 37
25int memblock_debug __initdata_memblock; 38int memblock_debug __initdata_memblock;
26int memblock_can_resize __initdata_memblock; 39static int memblock_can_resize __initdata_memblock;
27static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
28static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
29 40
30/* inline so we don't get a warning when pr_debug is compiled out */ 41/* inline so we don't get a warning when pr_debug is compiled out */
31static inline const char *memblock_type_name(struct memblock_type *type) 42static inline const char *memblock_type_name(struct memblock_type *type)
@@ -38,20 +49,15 @@ static inline const char *memblock_type_name(struct memblock_type *type)
38 return "unknown"; 49 return "unknown";
39} 50}
40 51
41/* 52/* adjust *@size so that (@base + *@size) doesn't overflow, return new size */
42 * Address comparison utilities 53static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
43 */
44
45static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size)
46{
47 return addr & ~(size - 1);
48}
49
50static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size)
51{ 54{
52 return (addr + (size - 1)) & ~(size - 1); 55 return *size = min(*size, (phys_addr_t)ULLONG_MAX - base);
53} 56}
54 57
58/*
59 * Address comparison utilities
60 */
55static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, 61static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
56 phys_addr_t base2, phys_addr_t size2) 62 phys_addr_t base2, phys_addr_t size2)
57{ 63{
@@ -73,83 +79,66 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
73 return (i < type->cnt) ? i : -1; 79 return (i < type->cnt) ? i : -1;
74} 80}
75 81
76/* 82/**
77 * Find, allocate, deallocate or reserve unreserved regions. All allocations 83 * memblock_find_in_range_node - find free area in given range and node
78 * are top-down. 84 * @start: start of candidate range
85 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
86 * @size: size of free area to find
87 * @align: alignment of free area to find
88 * @nid: nid of the free area to find, %MAX_NUMNODES for any node
89 *
90 * Find @size free area aligned to @align in the specified range and node.
91 *
92 * RETURNS:
93 * Found address on success, %0 on failure.
79 */ 94 */
80 95phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
81static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end, 96 phys_addr_t end, phys_addr_t size,
82 phys_addr_t size, phys_addr_t align) 97 phys_addr_t align, int nid)
83{ 98{
84 phys_addr_t base, res_base; 99 phys_addr_t this_start, this_end, cand;
85 long j; 100 u64 i;
86
87 /* In case, huge size is requested */
88 if (end < size)
89 return MEMBLOCK_ERROR;
90
91 base = memblock_align_down((end - size), align);
92 101
93 /* Prevent allocations returning 0 as it's also used to 102 /* align @size to avoid excessive fragmentation on reserved array */
94 * indicate an allocation failure 103 size = round_up(size, align);
95 */
96 if (start == 0)
97 start = PAGE_SIZE;
98
99 while (start <= base) {
100 j = memblock_overlaps_region(&memblock.reserved, base, size);
101 if (j < 0)
102 return base;
103 res_base = memblock.reserved.regions[j].base;
104 if (res_base < size)
105 break;
106 base = memblock_align_down(res_base - size, align);
107 }
108 104
109 return MEMBLOCK_ERROR; 105 /* pump up @end */
110}
111
112static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
113 phys_addr_t align, phys_addr_t start, phys_addr_t end)
114{
115 long i;
116
117 BUG_ON(0 == size);
118
119 /* Pump up max_addr */
120 if (end == MEMBLOCK_ALLOC_ACCESSIBLE) 106 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
121 end = memblock.current_limit; 107 end = memblock.current_limit;
122 108
123 /* We do a top-down search, this tends to limit memory 109 /* adjust @start to avoid underflow and allocating the first page */
124 * fragmentation by keeping early boot allocs near the 110 start = max3(start, size, (phys_addr_t)PAGE_SIZE);
125 * top of memory 111 end = max(start, end);
126 */
127 for (i = memblock.memory.cnt - 1; i >= 0; i--) {
128 phys_addr_t memblockbase = memblock.memory.regions[i].base;
129 phys_addr_t memblocksize = memblock.memory.regions[i].size;
130 phys_addr_t bottom, top, found;
131 112
132 if (memblocksize < size) 113 for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
133 continue; 114 this_start = clamp(this_start, start, end);
134 if ((memblockbase + memblocksize) <= start) 115 this_end = clamp(this_end, start, end);
135 break; 116
136 bottom = max(memblockbase, start); 117 cand = round_down(this_end - size, align);
137 top = min(memblockbase + memblocksize, end); 118 if (cand >= this_start)
138 if (bottom >= top) 119 return cand;
139 continue;
140 found = memblock_find_region(bottom, top, size, align);
141 if (found != MEMBLOCK_ERROR)
142 return found;
143 } 120 }
144 return MEMBLOCK_ERROR; 121 return 0;
145} 122}
146 123
147/* 124/**
148 * Find a free area with specified alignment in a specific range. 125 * memblock_find_in_range - find free area in given range
126 * @start: start of candidate range
127 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
128 * @size: size of free area to find
129 * @align: alignment of free area to find
130 *
131 * Find @size free area aligned to @align in the specified range.
132 *
133 * RETURNS:
134 * Found address on success, %0 on failure.
149 */ 135 */
150u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align) 136phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
137 phys_addr_t end, phys_addr_t size,
138 phys_addr_t align)
151{ 139{
152 return memblock_find_base(size, align, start, end); 140 return memblock_find_in_range_node(start, end, size, align,
141 MAX_NUMNODES);
153} 142}
154 143
155/* 144/*
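A usage sketch for the rewritten allocator core, not part of the patch: find, then separately reserve, a 1MiB page-aligned block anywhere below the current limit. Failure is now signalled by 0 rather than MEMBLOCK_ERROR:

static int __init sketch_grab_buffer(void)
{
	phys_addr_t where;

	where = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE,
				       1 << 20, PAGE_SIZE);
	if (!where)
		return -ENOMEM;		/* 0 now means "no free area" */

	return memblock_reserve(where, 1 << 20);
}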
@@ -178,25 +167,21 @@ int __init_memblock memblock_reserve_reserved_regions(void)
178 167
179static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) 168static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
180{ 169{
181 unsigned long i; 170 type->total_size -= type->regions[r].size;
182 171 memmove(&type->regions[r], &type->regions[r + 1],
183 for (i = r; i < type->cnt - 1; i++) { 172 (type->cnt - (r + 1)) * sizeof(type->regions[r]));
184 type->regions[i].base = type->regions[i + 1].base;
185 type->regions[i].size = type->regions[i + 1].size;
186 }
187 type->cnt--; 173 type->cnt--;
188 174
189 /* Special case for empty arrays */ 175 /* Special case for empty arrays */
190 if (type->cnt == 0) { 176 if (type->cnt == 0) {
177 WARN_ON(type->total_size != 0);
191 type->cnt = 1; 178 type->cnt = 1;
192 type->regions[0].base = 0; 179 type->regions[0].base = 0;
193 type->regions[0].size = 0; 180 type->regions[0].size = 0;
181 memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
194 } 182 }
195} 183}
196 184
197/* Defined below but needed now */
198static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size);
199
200static int __init_memblock memblock_double_array(struct memblock_type *type) 185static int __init_memblock memblock_double_array(struct memblock_type *type)
201{ 186{
202 struct memblock_region *new_array, *old_array; 187 struct memblock_region *new_array, *old_array;
@@ -226,10 +211,10 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
226 */ 211 */
227 if (use_slab) { 212 if (use_slab) {
228 new_array = kmalloc(new_size, GFP_KERNEL); 213 new_array = kmalloc(new_size, GFP_KERNEL);
229 addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array); 214 addr = new_array ? __pa(new_array) : 0;
230 } else 215 } else
231 addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE); 216 addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t));
232 if (addr == MEMBLOCK_ERROR) { 217 if (!addr) {
233 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", 218 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
234 memblock_type_name(type), type->max, type->max * 2); 219 memblock_type_name(type), type->max, type->max * 2);
235 return -1; 220 return -1;
@@ -254,7 +239,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
254 return 0; 239 return 0;
255 240
256 /* Add the new reserved region now. Should not fail ! */ 241 /* Add the new reserved region now. Should not fail ! */
257 BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size)); 242 BUG_ON(memblock_reserve(addr, new_size));
258 243
259 /* If the array wasn't our static init one, then free it. We only do 244 /* If the array wasn't our static init one, then free it. We only do
260 * that before SLAB is available as later on, we don't know whether 245 * that before SLAB is available as later on, we don't know whether
@@ -268,343 +253,514 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
268 return 0; 253 return 0;
269} 254}
270 255
271int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, 256/**
272 phys_addr_t addr2, phys_addr_t size2) 257 * memblock_merge_regions - merge neighboring compatible regions
273{ 258 * @type: memblock type to scan
274 return 1; 259 *
275} 260 * Scan @type and merge neighboring compatible regions.
276 261 */
277static long __init_memblock memblock_add_region(struct memblock_type *type, 262static void __init_memblock memblock_merge_regions(struct memblock_type *type)
278 phys_addr_t base, phys_addr_t size)
279{ 263{
280 phys_addr_t end = base + size; 264 int i = 0;
281 int i, slot = -1;
282
283 /* First try and coalesce this MEMBLOCK with others */
284 for (i = 0; i < type->cnt; i++) {
285 struct memblock_region *rgn = &type->regions[i];
286 phys_addr_t rend = rgn->base + rgn->size;
287 265
288 /* Exit if there's no possible hits */ 266 /* cnt never goes below 1 */
289 if (rgn->base > end || rgn->size == 0) 267 while (i < type->cnt - 1) {
290 break; 268 struct memblock_region *this = &type->regions[i];
269 struct memblock_region *next = &type->regions[i + 1];
291 270
292 /* Check if we are fully enclosed within an existing 271 if (this->base + this->size != next->base ||
293 * block 272 memblock_get_region_node(this) !=
294 */ 273 memblock_get_region_node(next)) {
295 if (rgn->base <= base && rend >= end) 274 BUG_ON(this->base + this->size > next->base);
296 return 0; 275 i++;
276 continue;
277 }
297 278
298 /* Check if we overlap or are adjacent with the bottom 279 this->size += next->size;
299 * of a block. 280 memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next));
300 */ 281 type->cnt--;
301 if (base < rgn->base && end >= rgn->base) { 282 }
302 /* If we can't coalesce, create a new block */ 283}
303 if (!memblock_memory_can_coalesce(base, size,
304 rgn->base,
305 rgn->size)) {
306 /* Overlap & can't coalesce are mutually
307 * exclusive, if you do that, be prepared
308 * for trouble
309 */
310 WARN_ON(end != rgn->base);
311 goto new_block;
312 }
313 /* We extend the bottom of the block down to our
314 * base
315 */
316 rgn->base = base;
317 rgn->size = rend - base;
318 284
319 /* Return if we have nothing else to allocate 285/**
320 * (fully coalesced) 286 * memblock_insert_region - insert new memblock region
321 */ 287 * @type: memblock type to insert into
322 if (rend >= end) 288 * @idx: index for the insertion point
323 return 0; 289 * @base: base address of the new region
290 * @size: size of the new region
291 *
292 * Insert new memblock region [@base,@base+@size) into @type at @idx.
293 * @type must already have extra room to accommodate the new region.
294 */
295static void __init_memblock memblock_insert_region(struct memblock_type *type,
296 int idx, phys_addr_t base,
297 phys_addr_t size, int nid)
298{
299 struct memblock_region *rgn = &type->regions[idx];
324 300
325 /* We continue processing from the end of the 301 BUG_ON(type->cnt >= type->max);
326 * coalesced block. 302 memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
327 */ 303 rgn->base = base;
328 base = rend; 304 rgn->size = size;
329 size = end - base; 305 memblock_set_region_node(rgn, nid);
330 } 306 type->cnt++;
307 type->total_size += size;
308}
331 309
332 /* Now check if we overlap or are adjacent with the 310/**
333 * top of a block 311 * memblock_add_region - add new memblock region
334 */ 312 * @type: memblock type to add new region into
335 if (base <= rend && end >= rend) { 313 * @base: base address of the new region
336 /* If we can't coalesce, create a new block */ 314 * @size: size of the new region
337 if (!memblock_memory_can_coalesce(rgn->base, 315 * @nid: nid of the new region
338 rgn->size, 316 *
339 base, size)) { 317 * Add new memblock region [@base,@base+@size) into @type. The new region
340 /* Overlap & can't coalesce are mutually 318 * is allowed to overlap with existing ones - overlaps don't affect already
341 * exclusive, if you do that, be prepared 319 * existing regions. @type is guaranteed to be minimal (all neighbouring
342 * for trouble 320 * compatible regions are merged) after the addition.
343 */ 321 *
344 WARN_ON(rend != base); 322 * RETURNS:
345 goto new_block; 323 * 0 on success, -errno on failure.
346 } 324 */
347 /* We adjust our base down to enclose the 325static int __init_memblock memblock_add_region(struct memblock_type *type,
348 * original block and destroy it. It will be 326 phys_addr_t base, phys_addr_t size, int nid)
349 * part of our new allocation. Since we've 327{
350 * freed an entry, we know we won't fail 328 bool insert = false;
351 * to allocate one later, so we won't risk 329 phys_addr_t obase = base;
352 * losing the original block allocation. 330 phys_addr_t end = base + memblock_cap_size(base, &size);
353 */ 331 int i, nr_new;
354 size += (base - rgn->base);
355 base = rgn->base;
356 memblock_remove_region(type, i--);
357 }
358 }
359 332
360 /* If the array is empty, special case, replace the fake 333 /* special case for empty array */
361 * filler region and return 334 if (type->regions[0].size == 0) {
362 */ 335 WARN_ON(type->cnt != 1 || type->total_size);
363 if ((type->cnt == 1) && (type->regions[0].size == 0)) {
364 type->regions[0].base = base; 336 type->regions[0].base = base;
365 type->regions[0].size = size; 337 type->regions[0].size = size;
338 memblock_set_region_node(&type->regions[0], nid);
339 type->total_size = size;
366 return 0; 340 return 0;
367 } 341 }
368 342repeat:
369 new_block: 343 /*
370 /* If we are out of space, we fail. It's too late to resize the array 344 * The following is executed twice. Once with %false @insert and
371 * but then this shouldn't have happened in the first place. 345 * then with %true. The first counts the number of regions needed
346 * to accommodate the new area. The second actually inserts them.
372 */ 347 */
373 if (WARN_ON(type->cnt >= type->max)) 348 base = obase;
374 return -1; 349 nr_new = 0;
375 350
376 /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */ 351 for (i = 0; i < type->cnt; i++) {
377 for (i = type->cnt - 1; i >= 0; i--) { 352 struct memblock_region *rgn = &type->regions[i];
378 if (base < type->regions[i].base) { 353 phys_addr_t rbase = rgn->base;
379 type->regions[i+1].base = type->regions[i].base; 354 phys_addr_t rend = rbase + rgn->size;
380 type->regions[i+1].size = type->regions[i].size; 355
381 } else { 356 if (rbase >= end)
382 type->regions[i+1].base = base;
383 type->regions[i+1].size = size;
384 slot = i + 1;
385 break; 357 break;
358 if (rend <= base)
359 continue;
360 /*
361 * @rgn overlaps. If it separates the lower part of new
362 * area, insert that portion.
363 */
364 if (rbase > base) {
365 nr_new++;
366 if (insert)
367 memblock_insert_region(type, i++, base,
368 rbase - base, nid);
386 } 369 }
370 /* area below @rend is dealt with, forget about it */
371 base = min(rend, end);
387 } 372 }
388 if (base < type->regions[0].base) { 373
389 type->regions[0].base = base; 374 /* insert the remaining portion */
390 type->regions[0].size = size; 375 if (base < end) {
391 slot = 0; 376 nr_new++;
377 if (insert)
378 memblock_insert_region(type, i, base, end - base, nid);
392 } 379 }
393 type->cnt++;
394 380
395 /* The array is full ? Try to resize it. If that fails, we undo 381 /*
396 * our allocation and return an error 382 * If this was the first round, resize array and repeat for actual
383 * insertions; otherwise, merge and return.
397 */ 384 */
398 if (type->cnt == type->max && memblock_double_array(type)) { 385 if (!insert) {
399 BUG_ON(slot < 0); 386 while (type->cnt + nr_new > type->max)
400 memblock_remove_region(type, slot); 387 if (memblock_double_array(type) < 0)
401 return -1; 388 return -ENOMEM;
389 insert = true;
390 goto repeat;
391 } else {
392 memblock_merge_regions(type);
393 return 0;
402 } 394 }
403
404 return 0;
405} 395}
406 396
407long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) 397int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
398 int nid)
408{ 399{
409 return memblock_add_region(&memblock.memory, base, size); 400 return memblock_add_region(&memblock.memory, base, size, nid);
401}
410 402
403int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
404{
405 return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES);
411} 406}
412 407
413static long __init_memblock __memblock_remove(struct memblock_type *type, 408/**
414 phys_addr_t base, phys_addr_t size) 409 * memblock_isolate_range - isolate given range into disjoint memblocks
410 * @type: memblock type to isolate range for
411 * @base: base of range to isolate
412 * @size: size of range to isolate
413 * @start_rgn: out parameter for the start of isolated region
414 * @end_rgn: out parameter for the end of isolated region
415 *
416 * Walk @type and ensure that regions don't cross the boundaries defined by
417 * [@base,@base+@size). Crossing regions are split at the boundaries,
418 * which may create at most two more regions. The index of the first
419 * region inside the range is returned in *@start_rgn and end in *@end_rgn.
420 *
421 * RETURNS:
422 * 0 on success, -errno on failure.
423 */
424static int __init_memblock memblock_isolate_range(struct memblock_type *type,
425 phys_addr_t base, phys_addr_t size,
426 int *start_rgn, int *end_rgn)
415{ 427{
416 phys_addr_t end = base + size; 428 phys_addr_t end = base + memblock_cap_size(base, &size);
417 int i; 429 int i;
418 430
419 /* Walk through the array for collisions */ 431 *start_rgn = *end_rgn = 0;
432
433 /* we'll create at most two more regions */
434 while (type->cnt + 2 > type->max)
435 if (memblock_double_array(type) < 0)
436 return -ENOMEM;
437
420 for (i = 0; i < type->cnt; i++) { 438 for (i = 0; i < type->cnt; i++) {
421 struct memblock_region *rgn = &type->regions[i]; 439 struct memblock_region *rgn = &type->regions[i];
422 phys_addr_t rend = rgn->base + rgn->size; 440 phys_addr_t rbase = rgn->base;
441 phys_addr_t rend = rbase + rgn->size;
423 442
424 /* Nothing more to do, exit */ 443 if (rbase >= end)
425 if (rgn->base > end || rgn->size == 0)
426 break; 444 break;
427 445 if (rend <= base)
428 /* If we fully enclose the block, drop it */
429 if (base <= rgn->base && end >= rend) {
430 memblock_remove_region(type, i--);
431 continue; 446 continue;
432 }
433 447
434 /* If we are fully enclosed within a block 448 if (rbase < base) {
435 * then we need to split it and we are done 449 /*
436 */ 450 * @rgn intersects from below. Split and continue
437 if (base > rgn->base && end < rend) { 451 * to process the next region - the new top half.
438 rgn->size = base - rgn->base; 452 */
439 if (!memblock_add_region(type, end, rend - end)) 453 rgn->base = base;
440 return 0; 454 rgn->size -= base - rbase;
441 /* Failure to split is bad, we at least 455 type->total_size -= base - rbase;
442 * restore the block before erroring 456 memblock_insert_region(type, i, rbase, base - rbase,
457 memblock_get_region_node(rgn));
458 } else if (rend > end) {
459 /*
460 * @rgn intersects from above. Split and redo the
461 * current region - the new bottom half.
443 */ 462 */
444 rgn->size = rend - rgn->base;
445 WARN_ON(1);
446 return -1;
447 }
448
449 /* Check if we need to trim the bottom of a block */
450 if (rgn->base < end && rend > end) {
451 rgn->size -= end - rgn->base;
452 rgn->base = end; 463 rgn->base = end;
453 break; 464 rgn->size -= end - rbase;
465 type->total_size -= end - rbase;
466 memblock_insert_region(type, i--, rbase, end - rbase,
467 memblock_get_region_node(rgn));
468 } else {
469 /* @rgn is fully contained, record it */
470 if (!*end_rgn)
471 *start_rgn = i;
472 *end_rgn = i + 1;
454 } 473 }
474 }
455 475
456 /* And check if we need to trim the top of a block */ 476 return 0;
457 if (base < rend) 477}
458 rgn->size -= rend - base;
459 478
460 } 479static int __init_memblock __memblock_remove(struct memblock_type *type,
480 phys_addr_t base, phys_addr_t size)
481{
482 int start_rgn, end_rgn;
483 int i, ret;
484
485 ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
486 if (ret)
487 return ret;
488
489 for (i = end_rgn - 1; i >= start_rgn; i--)
490 memblock_remove_region(type, i);
461 return 0; 491 return 0;
462} 492}
463 493
464long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) 494int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
465{ 495{
466 return __memblock_remove(&memblock.memory, base, size); 496 return __memblock_remove(&memblock.memory, base, size);
467} 497}
468 498
469long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) 499int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
470{ 500{
501 memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n",
502 (unsigned long long)base,
503 (unsigned long long)base + size,
504 (void *)_RET_IP_);
505
471 return __memblock_remove(&memblock.reserved, base, size); 506 return __memblock_remove(&memblock.reserved, base, size);
472} 507}
473 508
474long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) 509int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
475{ 510{
476 struct memblock_type *_rgn = &memblock.reserved; 511 struct memblock_type *_rgn = &memblock.reserved;
477 512
513 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n",
514 (unsigned long long)base,
515 (unsigned long long)base + size,
516 (void *)_RET_IP_);
478 BUG_ON(0 == size); 517 BUG_ON(0 == size);
479 518
480 return memblock_add_region(_rgn, base, size); 519 return memblock_add_region(_rgn, base, size, MAX_NUMNODES);
481} 520}
482 521
483phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 522/**
523 * __next_free_mem_range - next function for for_each_free_mem_range()
524 * @idx: pointer to u64 loop variable
525 * @nid: node selector, %MAX_NUMNODES for all nodes
526 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
527 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
528 * @p_nid: ptr to int for nid of the range, can be %NULL
529 *
530 * Find the first free area from *@idx which matches @nid, fill the out
531 * parameters, and update *@idx for the next iteration. The lower 32bit of
532 * *@idx contains index into memory region and the upper 32bit indexes the
533 * areas before each reserved region. For example, if reserved regions
534 * look like the following,
535 *
536 * 0:[0-16), 1:[32-48), 2:[128-130)
537 *
538 * The upper 32bit indexes the following regions.
539 *
540 * 0:[0-0), 1:[16-32), 2:[48-128), 3:[130-MAX)
541 *
542 * As both region arrays are sorted, the function advances the two indices
543 * in lockstep and returns each intersection.
544 */
545void __init_memblock __next_free_mem_range(u64 *idx, int nid,
546 phys_addr_t *out_start,
547 phys_addr_t *out_end, int *out_nid)
484{ 548{
485 phys_addr_t found; 549 struct memblock_type *mem = &memblock.memory;
550 struct memblock_type *rsv = &memblock.reserved;
551 int mi = *idx & 0xffffffff;
552 int ri = *idx >> 32;
486 553
487 /* We align the size to limit fragmentation. Without this, a lot of 554 for ( ; mi < mem->cnt; mi++) {
488 * small allocs quickly eat up the whole reserve array on sparc 555 struct memblock_region *m = &mem->regions[mi];
489 */ 556 phys_addr_t m_start = m->base;
490 size = memblock_align_up(size, align); 557 phys_addr_t m_end = m->base + m->size;
491 558
492 found = memblock_find_base(size, align, 0, max_addr); 559 /* only memory regions are associated with nodes, check it */
493 if (found != MEMBLOCK_ERROR && 560 if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
494 !memblock_add_region(&memblock.reserved, found, size)) 561 continue;
495 return found;
496 562
497 return 0; 563 /* scan areas before each reservation for intersection */
564 for ( ; ri < rsv->cnt + 1; ri++) {
565 struct memblock_region *r = &rsv->regions[ri];
566 phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0;
567 phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX;
568
569 /* if ri advanced past mi, break out to advance mi */
570 if (r_start >= m_end)
571 break;
572 /* if the two regions intersect, we're done */
573 if (m_start < r_end) {
574 if (out_start)
575 *out_start = max(m_start, r_start);
576 if (out_end)
577 *out_end = min(m_end, r_end);
578 if (out_nid)
579 *out_nid = memblock_get_region_node(m);
580 /*
581 * The region which ends first is advanced
582 * for the next iteration.
583 */
584 if (m_end <= r_end)
585 mi++;
586 else
587 ri++;
588 *idx = (u32)mi | (u64)ri << 32;
589 return;
590 }
591 }
592 }
593
594 /* signal end of iteration */
595 *idx = ULLONG_MAX;
498} 596}
499 597
500phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 598/**
599 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
600 * @idx: pointer to u64 loop variable
601 * @nid: node selector, %MAX_NUMNODES for all nodes
602 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
603 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
604 * @p_nid: ptr to int for nid of the range, can be %NULL
605 *
606 * Reverse of __next_free_mem_range().
607 */
608void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
609 phys_addr_t *out_start,
610 phys_addr_t *out_end, int *out_nid)
501{ 611{
502 phys_addr_t alloc; 612 struct memblock_type *mem = &memblock.memory;
613 struct memblock_type *rsv = &memblock.reserved;
614 int mi = *idx & 0xffffffff;
615 int ri = *idx >> 32;
503 616
504 alloc = __memblock_alloc_base(size, align, max_addr); 617 if (*idx == (u64)ULLONG_MAX) {
618 mi = mem->cnt - 1;
619 ri = rsv->cnt;
620 }
505 621
506 if (alloc == 0) 622 for ( ; mi >= 0; mi--) {
507 panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", 623 struct memblock_region *m = &mem->regions[mi];
508 (unsigned long long) size, (unsigned long long) max_addr); 624 phys_addr_t m_start = m->base;
625 phys_addr_t m_end = m->base + m->size;
509 626
510 return alloc; 627 /* only memory regions are associated with nodes, check it */
511} 628 if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
629 continue;
512 630
513phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) 631 /* scan areas before each reservation for intersection */
514{ 632 for ( ; ri >= 0; ri--) {
515 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); 633 struct memblock_region *r = &rsv->regions[ri];
516} 634 phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0;
635 phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX;
636
637 /* if ri advanced past mi, break out to advance mi */
638 if (r_end <= m_start)
639 break;
640 /* if the two regions intersect, we're done */
641 if (m_end > r_start) {
642 if (out_start)
643 *out_start = max(m_start, r_start);
644 if (out_end)
645 *out_end = min(m_end, r_end);
646 if (out_nid)
647 *out_nid = memblock_get_region_node(m);
648
649 if (m_start >= r_start)
650 mi--;
651 else
652 ri--;
653 *idx = (u32)mi | (u64)ri << 32;
654 return;
655 }
656 }
657 }
517 658
659 *idx = ULLONG_MAX;
660}
518 661
662#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
519/* 663/*
520 * Additional node-local allocators. Search for node memory is bottom up 664 * Common iterator interface used to define for_each_mem_range().
521 * and walks memblock regions within that node bottom-up as well, but allocation
522 * within an memblock region is top-down. XXX I plan to fix that at some stage
523 *
524 * WARNING: Only available after early_node_map[] has been populated,
525 * on some architectures, that is after all the calls to add_active_range()
526 * have been done to populate it.
527 */ 665 */
528 666void __init_memblock __next_mem_pfn_range(int *idx, int nid,
529phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid) 667 unsigned long *out_start_pfn,
668 unsigned long *out_end_pfn, int *out_nid)
530{ 669{
531#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 670 struct memblock_type *type = &memblock.memory;
532 /* 671 struct memblock_region *r;
533 * This code originates from sparc which really wants use to walk by addresses
534 * and returns the nid. This is not very convenient for early_pfn_map[] users
535 * as the map isn't sorted yet, and it really wants to be walked by nid.
536 *
537 * For now, I implement the inefficient method below which walks the early
538 * map multiple times. Eventually we may want to use an ARCH config option
539 * to implement a completely different method for both case.
540 */
541 unsigned long start_pfn, end_pfn;
542 int i;
543 672
544 for (i = 0; i < MAX_NUMNODES; i++) { 673 while (++*idx < type->cnt) {
545 get_pfn_range_for_nid(i, &start_pfn, &end_pfn); 674 r = &type->regions[*idx];
546 if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn)) 675
676 if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size))
547 continue; 677 continue;
548 *nid = i; 678 if (nid == MAX_NUMNODES || nid == r->nid)
549 return min(end, PFN_PHYS(end_pfn)); 679 break;
680 }
681 if (*idx >= type->cnt) {
682 *idx = -1;
683 return;
550 } 684 }
551#endif
552 *nid = 0;
553 685
554 return end; 686 if (out_start_pfn)
687 *out_start_pfn = PFN_UP(r->base);
688 if (out_end_pfn)
689 *out_end_pfn = PFN_DOWN(r->base + r->size);
690 if (out_nid)
691 *out_nid = r->nid;
555} 692}
556 693
557static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp, 694/**
558 phys_addr_t size, 695 * memblock_set_node - set node ID on memblock regions
559 phys_addr_t align, int nid) 696 * @base: base of area to set node ID for
697 * @size: size of area to set node ID for
698 * @nid: node ID to set
699 *
700 * Set the nid of memblock memory regions in [@base,@base+@size) to @nid.
701 * Regions which cross the area boundaries are split as necessary.
702 *
703 * RETURNS:
704 * 0 on success, -errno on failure.
705 */
706int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
707 int nid)
560{ 708{
561 phys_addr_t start, end; 709 struct memblock_type *type = &memblock.memory;
710 int start_rgn, end_rgn;
711 int i, ret;
562 712
563 start = mp->base; 713 ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
564 end = start + mp->size; 714 if (ret)
715 return ret;
565 716
566 start = memblock_align_up(start, align); 717 for (i = start_rgn; i < end_rgn; i++)
567 while (start < end) { 718 type->regions[i].nid = nid;
568 phys_addr_t this_end;
569 int this_nid;
570 719
571 this_end = memblock_nid_range(start, end, &this_nid); 720 memblock_merge_regions(type);
572 if (this_nid == nid) { 721 return 0;
573 phys_addr_t ret = memblock_find_region(start, this_end, size, align); 722}
574 if (ret != MEMBLOCK_ERROR && 723#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
575 !memblock_add_region(&memblock.reserved, ret, size)) 724
576 return ret; 725static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
577 } 726 phys_addr_t align, phys_addr_t max_addr,
578 start = this_end; 727 int nid)
579 } 728{
729 phys_addr_t found;
580 730
581 return MEMBLOCK_ERROR; 731 found = memblock_find_in_range_node(0, max_addr, size, align, nid);
732 if (found && !memblock_reserve(found, size))
733 return found;
734
735 return 0;
582} 736}
583 737
584phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) 738phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
585{ 739{
586 struct memblock_type *mem = &memblock.memory; 740 return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
587 int i; 741}
588 742
589 BUG_ON(0 == size); 743phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
744{
745 return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES);
746}
590 747
591 /* We align the size to limit fragmentation. Without this, a lot of 748phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
592 * small allocs quickly eat up the whole reserve array on sparc 749{
593 */ 750 phys_addr_t alloc;
594 size = memblock_align_up(size, align);
595 751
596 /* We do a bottom-up search for a region with the right 752 alloc = __memblock_alloc_base(size, align, max_addr);
597 * nid since that's easier considering how memblock_nid_range()
598 * works
599 */
600 for (i = 0; i < mem->cnt; i++) {
601 phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i],
602 size, align, nid);
603 if (ret != MEMBLOCK_ERROR)
604 return ret;
605 }
606 753
607 return 0; 754 if (alloc == 0)
755 panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
756 (unsigned long long) size, (unsigned long long) max_addr);
757
758 return alloc;
759}
760
761phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
762{
763 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
608} 764}
609 765
610phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) 766phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
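A usage sketch for the new free-range iterator defined in this hunk; as the __next_free_mem_range() comment explains, the u64 cursor packs the memory-region index in its low 32 bits and the reserved-region index in its high 32 bits. for_each_free_mem_range() is assumed to be the wrapper macro this series adds to <linux/memblock.h>:

static void __init sketch_dump_free_ranges(void)
{
	phys_addr_t start, end;
	u64 i;

	/* every [start, end) that is in memory but not reserved, any node */
	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
		pr_info("free: [%#016llx-%#016llx]\n",
			(unsigned long long)start,
			(unsigned long long)end);
}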
@@ -613,7 +769,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
613 769
614 if (res) 770 if (res)
615 return res; 771 return res;
616 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE); 772 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
617} 773}
618 774
619 775
@@ -621,10 +777,9 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
621 * Remaining API functions 777 * Remaining API functions
622 */ 778 */
623 779
624/* You must call memblock_analyze() before this. */
625phys_addr_t __init memblock_phys_mem_size(void) 780phys_addr_t __init memblock_phys_mem_size(void)
626{ 781{
627 return memblock.memory_size; 782 return memblock.memory.total_size;
628} 783}
629 784
630/* lowest address */ 785/* lowest address */
@@ -640,45 +795,28 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)
640 return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); 795 return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
641} 796}
642 797
643/* You must call memblock_analyze() after this. */ 798void __init memblock_enforce_memory_limit(phys_addr_t limit)
644void __init memblock_enforce_memory_limit(phys_addr_t memory_limit)
645{ 799{
646 unsigned long i; 800 unsigned long i;
647 phys_addr_t limit; 801 phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
648 struct memblock_region *p;
649 802
650 if (!memory_limit) 803 if (!limit)
651 return; 804 return;
652 805
653 /* Truncate the memblock regions to satisfy the memory limit. */ 806 /* find out max address */
654 limit = memory_limit;
655 for (i = 0; i < memblock.memory.cnt; i++) { 807 for (i = 0; i < memblock.memory.cnt; i++) {
656 if (limit > memblock.memory.regions[i].size) { 808 struct memblock_region *r = &memblock.memory.regions[i];
657 limit -= memblock.memory.regions[i].size;
658 continue;
659 }
660
661 memblock.memory.regions[i].size = limit;
662 memblock.memory.cnt = i + 1;
663 break;
664 }
665
666 memory_limit = memblock_end_of_DRAM();
667 809
668 /* And truncate any reserves above the limit also. */ 810 if (limit <= r->size) {
669 for (i = 0; i < memblock.reserved.cnt; i++) { 811 max_addr = r->base + limit;
670 p = &memblock.reserved.regions[i]; 812 break;
671
672 if (p->base > memory_limit)
673 p->size = 0;
674 else if ((p->base + p->size) > memory_limit)
675 p->size = memory_limit - p->base;
676
677 if (p->size == 0) {
678 memblock_remove_region(&memblock.reserved, i);
679 i--;
680 } 813 }
814 limit -= r->size;
681 } 815 }
816
817 /* truncate both memory and reserved regions */
818 __memblock_remove(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX);
819 __memblock_remove(&memblock.reserved, max_addr, (phys_addr_t)ULLONG_MAX);
682} 820}
683 821
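The rewritten memblock_enforce_memory_limit() walks the regions spending `limit` bytes until it finds the cut-off address, which naturally accounts for holes between banks. A standalone model of that walk, with invented region data:

#include <stdio.h>
#include <stdint.h>

struct region { uint64_t base, size; };

int main(void)
{
	/* two 512 MB banks with a hole between them (invented layout) */
	struct region mem[] = {
		{ 0x00000000, 0x20000000 },
		{ 0x40000000, 0x20000000 },
	};
	uint64_t limit = 0x30000000;	/* e.g. mem=768M on the command line */
	uint64_t max_addr = UINT64_MAX;

	for (unsigned int i = 0; i < 2; i++) {
		if (limit <= mem[i].size) {
			max_addr = mem[i].base + limit;
			break;
		}
		limit -= mem[i].size;
	}
	/* the kernel then removes [max_addr, ULLONG_MAX] from both
	 * memblock.memory and memblock.reserved */
	printf("truncate above %#llx\n", (unsigned long long)max_addr);
	return 0;
}

Because the hole is skipped rather than counted, the 768 MB limit lands at 0x50000000, 256 MB into the second bank.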
684static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) 822static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
@@ -712,16 +850,18 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
712int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) 850int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
713{ 851{
714 int idx = memblock_search(&memblock.memory, base); 852 int idx = memblock_search(&memblock.memory, base);
853 phys_addr_t end = base + memblock_cap_size(base, &size);
715 854
716 if (idx == -1) 855 if (idx == -1)
717 return 0; 856 return 0;
718 return memblock.memory.regions[idx].base <= base && 857 return memblock.memory.regions[idx].base <= base &&
719 (memblock.memory.regions[idx].base + 858 (memblock.memory.regions[idx].base +
720 memblock.memory.regions[idx].size) >= (base + size); 859 memblock.memory.regions[idx].size) >= end;
721} 860}
722 861
723int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) 862int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
724{ 863{
864 memblock_cap_size(base, &size);
725 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; 865 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
726} 866}
727 867
@@ -731,86 +871,45 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit)
731 memblock.current_limit = limit; 871 memblock.current_limit = limit;
732} 872}
733 873
734static void __init_memblock memblock_dump(struct memblock_type *region, char *name) 874static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
735{ 875{
736 unsigned long long base, size; 876 unsigned long long base, size;
737 int i; 877 int i;
738 878
739 pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); 879 pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);
740 880
741 for (i = 0; i < region->cnt; i++) { 881 for (i = 0; i < type->cnt; i++) {
742 base = region->regions[i].base; 882 struct memblock_region *rgn = &type->regions[i];
743 size = region->regions[i].size; 883 char nid_buf[32] = "";
744 884
745 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n", 885 base = rgn->base;
746 name, i, base, base + size - 1, size); 886 size = rgn->size;
887#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
888 if (memblock_get_region_node(rgn) != MAX_NUMNODES)
889 snprintf(nid_buf, sizeof(nid_buf), " on node %d",
890 memblock_get_region_node(rgn));
891#endif
892 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n",
893 name, i, base, base + size - 1, size, nid_buf);
747 } 894 }
748} 895}
749 896
750void __init_memblock memblock_dump_all(void) 897void __init_memblock __memblock_dump_all(void)
751{ 898{
752 if (!memblock_debug)
753 return;
754
755 pr_info("MEMBLOCK configuration:\n"); 899 pr_info("MEMBLOCK configuration:\n");
756 pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size); 900 pr_info(" memory size = %#llx reserved size = %#llx\n",
901 (unsigned long long)memblock.memory.total_size,
902 (unsigned long long)memblock.reserved.total_size);
757 903
758 memblock_dump(&memblock.memory, "memory"); 904 memblock_dump(&memblock.memory, "memory");
759 memblock_dump(&memblock.reserved, "reserved"); 905 memblock_dump(&memblock.reserved, "reserved");
760} 906}
761 907
762void __init memblock_analyze(void) 908void __init memblock_allow_resize(void)
763{ 909{
764 int i;
765
766 /* Check marker in the unused last array entry */
767 WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
768 != MEMBLOCK_INACTIVE);
769 WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
770 != MEMBLOCK_INACTIVE);
771
772 memblock.memory_size = 0;
773
774 for (i = 0; i < memblock.memory.cnt; i++)
775 memblock.memory_size += memblock.memory.regions[i].size;
776
777 /* We allow resizing from there */
778 memblock_can_resize = 1; 910 memblock_can_resize = 1;
779} 911}
780 912
781void __init memblock_init(void)
782{
783 static int init_done __initdata = 0;
784
785 if (init_done)
786 return;
787 init_done = 1;
788
789 /* Hookup the initial arrays */
790 memblock.memory.regions = memblock_memory_init_regions;
791 memblock.memory.max = INIT_MEMBLOCK_REGIONS;
792 memblock.reserved.regions = memblock_reserved_init_regions;
793 memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
794
795 /* Write a marker in the unused last array entry */
796 memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
797 memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
798
799 /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
800 * This simplifies the memblock_add() code below...
801 */
802 memblock.memory.regions[0].base = 0;
803 memblock.memory.regions[0].size = 0;
804 memblock.memory.cnt = 1;
805
806 /* Ditto. */
807 memblock.reserved.regions[0].base = 0;
808 memblock.reserved.regions[0].size = 0;
809 memblock.reserved.cnt = 1;
810
811 memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE;
812}
813
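With memblock_init() removed, the structure has to be valid before any runtime setup runs. A sketch of the compile-time initializer that presumably replaces it; the initializer itself lives outside this hunk, so the exact fields here are an assumption:

/* Assumed shape of the static replacement for memblock_init(). */
struct memblock memblock __initdata_memblock = {
	.memory.regions		= memblock_memory_init_regions,
	.memory.cnt		= 1,	/* the empty dummy entry */
	.memory.max		= INIT_MEMBLOCK_REGIONS,

	.reserved.regions	= memblock_reserved_init_regions,
	.reserved.cnt		= 1,	/* the empty dummy entry */
	.reserved.max		= INIT_MEMBLOCK_REGIONS,

	.current_limit		= MEMBLOCK_ALLOC_ANYWHERE,
};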
814static int __init early_memblock(char *p) 913static int __init early_memblock(char *p)
815{ 914{
816 if (p && strstr(p, "debug")) 915 if (p && strstr(p, "debug"))
@@ -819,7 +918,7 @@ static int __init early_memblock(char *p)
819} 918}
820early_param("memblock", early_memblock); 919early_param("memblock", early_memblock);
821 920
822#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK) 921#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK)
823 922
824static int memblock_debug_show(struct seq_file *m, void *private) 923static int memblock_debug_show(struct seq_file *m, void *private)
825{ 924{
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index adc395481813..c3fdbcb17658 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -636,6 +636,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
636 struct vm_area_struct *prev; 636 struct vm_area_struct *prev;
637 struct vm_area_struct *vma; 637 struct vm_area_struct *vma;
638 int err = 0; 638 int err = 0;
639 pgoff_t pgoff;
639 unsigned long vmstart; 640 unsigned long vmstart;
640 unsigned long vmend; 641 unsigned long vmend;
641 642
@@ -643,13 +644,21 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
643 if (!vma || vma->vm_start > start) 644 if (!vma || vma->vm_start > start)
644 return -EFAULT; 645 return -EFAULT;
645 646
647 if (start > vma->vm_start)
648 prev = vma;
649
646 for (; vma && vma->vm_start < end; prev = vma, vma = next) { 650 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
647 next = vma->vm_next; 651 next = vma->vm_next;
648 vmstart = max(start, vma->vm_start); 652 vmstart = max(start, vma->vm_start);
649 vmend = min(end, vma->vm_end); 653 vmend = min(end, vma->vm_end);
650 654
655 if (mpol_equal(vma_policy(vma), new_pol))
656 continue;
657
658 pgoff = vma->vm_pgoff +
659 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
651 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, 660 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
652 vma->anon_vma, vma->vm_file, vma->vm_pgoff, 661 vma->anon_vma, vma->vm_file, pgoff,
653 new_pol); 662 new_pol);
654 if (prev) { 663 if (prev) {
655 vma = prev; 664 vma = prev;
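The added pgoff computation matters when the mbind() range starts in the middle of a vma: vma_merge() checks file-offset continuity, so passing the vma's own vm_pgoff for a later vmstart could merge vmas whose pages map the wrong file offsets. A standalone illustration of the arithmetic, with invented numbers:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long vm_start = 0x700000000000UL;	/* vma start */
	unsigned long vm_pgoff = 16;	/* file page backing vm_start */
	unsigned long vmstart  = 0x700000003000UL;	/* range starts 3 pages in */

	unsigned long pgoff = vm_pgoff + ((vmstart - vm_start) >> PAGE_SHIFT);

	printf("offset for vma_merge(): %lu (the old code passed %lu)\n",
	       pgoff, vm_pgoff);
	return 0;
}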
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 7fa41b4a07bf..24f0fc1a56d6 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -41,14 +41,13 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
41 if (limit > memblock.current_limit) 41 if (limit > memblock.current_limit)
42 limit = memblock.current_limit; 42 limit = memblock.current_limit;
43 43
44 addr = find_memory_core_early(nid, size, align, goal, limit); 44 addr = memblock_find_in_range_node(goal, limit, size, align, nid);
45 45 if (!addr)
46 if (addr == MEMBLOCK_ERROR)
47 return NULL; 46 return NULL;
48 47
49 ptr = phys_to_virt(addr); 48 ptr = phys_to_virt(addr);
50 memset(ptr, 0, size); 49 memset(ptr, 0, size);
51 memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); 50 memblock_reserve(addr, size);
52 /* 51 /*
53 * The min_count is set to 0 so that bootmem allocated blocks 52 * The min_count is set to 0 so that bootmem allocated blocks
54 * are never reported as leaks. 53 * are never reported as leaks.
@@ -107,23 +106,27 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
107 __free_pages_bootmem(pfn_to_page(i), 0); 106 __free_pages_bootmem(pfn_to_page(i), 0);
108} 107}
109 108
110unsigned long __init free_all_memory_core_early(int nodeid) 109unsigned long __init free_low_memory_core_early(int nodeid)
111{ 110{
112 int i;
113 u64 start, end;
114 unsigned long count = 0; 111 unsigned long count = 0;
115 struct range *range = NULL; 112 phys_addr_t start, end;
116 int nr_range; 113 u64 i;
117 114
118 nr_range = get_free_all_memory_range(&range, nodeid); 115 /* free reserved array temporarily so that it's treated as free area */
119 116 memblock_free_reserved_regions();
120 for (i = 0; i < nr_range; i++) { 117
121 start = range[i].start; 118 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
122 end = range[i].end; 119 unsigned long start_pfn = PFN_UP(start);
123 count += end - start; 120 unsigned long end_pfn = min_t(unsigned long,
124 __free_pages_memory(start, end); 121 PFN_DOWN(end), max_low_pfn);
122 if (start_pfn < end_pfn) {
123 __free_pages_memory(start_pfn, end_pfn);
124 count += end_pfn - start_pfn;
125 }
125 } 126 }
126 127
128 /* put region array back? */
129 memblock_reserve_reserved_regions();
127 return count; 130 return count;
128} 131}
129 132
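free_low_memory_core_early() now leans on for_each_free_mem_range(), which walks memory minus reserved; PFN_UP()/PFN_DOWN() round partial pages inward so only whole free pages are released. A hedged sketch of the same iteration pattern (the function name is invented; max_low_pfn as used above):

static unsigned long __init count_free_low_pages(void)
{
	unsigned long count = 0;
	phys_addr_t start, end;
	u64 i;

	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
		unsigned long s = PFN_UP(start);	/* round edges inward */
		unsigned long e = min_t(unsigned long, PFN_DOWN(end),
					max_low_pfn);

		if (s < e)
			count += e - s;
	}
	return count;
}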
@@ -137,7 +140,7 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
137{ 140{
138 register_page_bootmem_info_node(pgdat); 141 register_page_bootmem_info_node(pgdat);
139 142
140 /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ 143 /* free_low_memory_core_early(MAX_NUMNODES) will be called later */
141 return 0; 144 return 0;
142} 145}
143 146
@@ -155,7 +158,7 @@ unsigned long __init free_all_bootmem(void)
155 * Using MAX_NUMNODES makes sure all ranges in early_node_map[] 158 * Using MAX_NUMNODES makes sure all ranges in early_node_map[]
156 * are used instead of only Node0-related ones 159 * are used instead of only Node0-related ones
157 */ 160 */
158 return free_all_memory_core_early(MAX_NUMNODES); 161 return free_low_memory_core_early(MAX_NUMNODES);
159} 162}
160 163
161/** 164/**
@@ -172,7 +175,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
172 unsigned long size) 175 unsigned long size)
173{ 176{
174 kmemleak_free_part(__va(physaddr), size); 177 kmemleak_free_part(__va(physaddr), size);
175 memblock_x86_free_range(physaddr, physaddr + size); 178 memblock_free(physaddr, size);
176} 179}
177 180
178/** 181/**
@@ -187,7 +190,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
187void __init free_bootmem(unsigned long addr, unsigned long size) 190void __init free_bootmem(unsigned long addr, unsigned long size)
188{ 191{
189 kmemleak_free_part(__va(addr), size); 192 kmemleak_free_part(__va(addr), size);
190 memblock_x86_free_range(addr, addr + size); 193 memblock_free(addr, size);
191} 194}
192 195
193static void * __init ___alloc_bootmem_nopanic(unsigned long size, 196static void * __init ___alloc_bootmem_nopanic(unsigned long size,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2b8ba3aebf6e..bdc804c2d99c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -181,39 +181,17 @@ static unsigned long __meminitdata nr_kernel_pages;
181static unsigned long __meminitdata nr_all_pages; 181static unsigned long __meminitdata nr_all_pages;
182static unsigned long __meminitdata dma_reserve; 182static unsigned long __meminitdata dma_reserve;
183 183
184#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 184#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
185 /* 185static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
186 * MAX_ACTIVE_REGIONS determines the maximum number of distinct 186static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
187 * ranges of memory (RAM) that may be registered with add_active_range(). 187static unsigned long __initdata required_kernelcore;
188 * Ranges passed to add_active_range() will be merged if possible 188static unsigned long __initdata required_movablecore;
189 * so the number of times add_active_range() can be called is 189static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
190 * related to the number of nodes and the number of holes 190
191 */ 191/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
192 #ifdef CONFIG_MAX_ACTIVE_REGIONS 192int movable_zone;
193 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ 193EXPORT_SYMBOL(movable_zone);
194 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS 194#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
195 #else
196 #if MAX_NUMNODES >= 32
197 /* If there can be many nodes, allow up to 50 holes per node */
198 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
199 #else
200 /* By default, allow up to 256 distinct regions */
201 #define MAX_ACTIVE_REGIONS 256
202 #endif
203 #endif
204
205 static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
206 static int __meminitdata nr_nodemap_entries;
207 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
208 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
209 static unsigned long __initdata required_kernelcore;
210 static unsigned long __initdata required_movablecore;
211 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
212
213 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
214 int movable_zone;
215 EXPORT_SYMBOL(movable_zone);
216#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
217 195
218#if MAX_NUMNODES > 1 196#if MAX_NUMNODES > 1
219int nr_node_ids __read_mostly = MAX_NUMNODES; 197int nr_node_ids __read_mostly = MAX_NUMNODES;
@@ -706,10 +684,10 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
706 int loop; 684 int loop;
707 685
708 prefetchw(page); 686 prefetchw(page);
709 for (loop = 0; loop < BITS_PER_LONG; loop++) { 687 for (loop = 0; loop < (1 << order); loop++) {
710 struct page *p = &page[loop]; 688 struct page *p = &page[loop];
711 689
712 if (loop + 1 < BITS_PER_LONG) 690 if (loop + 1 < (1 << order))
713 prefetchw(p + 1); 691 prefetchw(p + 1);
714 __ClearPageReserved(p); 692 __ClearPageReserved(p);
715 set_page_count(p, 0); 693 set_page_count(p, 0);
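The loop bound fix matters because an order-N block spans 1 << N pages; BITS_PER_LONG coincides with that for exactly one order, so the old code freed the wrong number of pages for every other order. A standalone illustration (assuming a 64-bit build):

#include <stdio.h>

#define BITS_PER_LONG 64	/* assuming a 64-bit build */

int main(void)
{
	for (int order = 0; order <= 6; order++)
		printf("order %d: %2d pages to free, old loop touched %d\n",
		       order, 1 << order, BITS_PER_LONG);
	return 0;
}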
@@ -3737,35 +3715,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
3737 return 0; 3715 return 0;
3738} 3716}
3739 3717
3740#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 3718#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
3741/*
3742 * Basic iterator support. Return the first range of PFNs for a node
3743 * Note: nid == MAX_NUMNODES returns first region regardless of node
3744 */
3745static int __meminit first_active_region_index_in_nid(int nid)
3746{
3747 int i;
3748
3749 for (i = 0; i < nr_nodemap_entries; i++)
3750 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
3751 return i;
3752
3753 return -1;
3754}
3755
3756/*
3757 * Basic iterator support. Return the next active range of PFNs for a node
3758 * Note: nid == MAX_NUMNODES returns next region regardless of node
3759 */
3760static int __meminit next_active_region_index_in_nid(int index, int nid)
3761{
3762 for (index = index + 1; index < nr_nodemap_entries; index++)
3763 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
3764 return index;
3765
3766 return -1;
3767}
3768
3769#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 3719#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
3770/* 3720/*
3771 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 3721 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
@@ -3775,15 +3725,12 @@ static int __meminit next_active_region_index_in_nid(int index, int nid)
3775 */ 3725 */
3776int __meminit __early_pfn_to_nid(unsigned long pfn) 3726int __meminit __early_pfn_to_nid(unsigned long pfn)
3777{ 3727{
3778 int i; 3728 unsigned long start_pfn, end_pfn;
3779 3729 int i, nid;
3780 for (i = 0; i < nr_nodemap_entries; i++) {
3781 unsigned long start_pfn = early_node_map[i].start_pfn;
3782 unsigned long end_pfn = early_node_map[i].end_pfn;
3783 3730
3731 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
3784 if (start_pfn <= pfn && pfn < end_pfn) 3732 if (start_pfn <= pfn && pfn < end_pfn)
3785 return early_node_map[i].nid; 3733 return nid;
3786 }
3787 /* This is a memory hole */ 3734 /* This is a memory hole */
3788 return -1; 3735 return -1;
3789} 3736}
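Several hunks below repeat this conversion, so the iterator's shape is worth spelling out: for_each_mem_pfn_range() yields each registered range as [start_pfn, end_pfn) plus its node id, replacing open-coded early_node_map[] walks. A hedged usage sketch (the function name is invented):

static void __init dump_early_ranges(void)
{
	unsigned long start_pfn, end_pfn;
	int i, nid;

	/* MAX_NUMNODES means "all nodes", as in the hunk above */
	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
		pr_info("node %d: pfns %#lx-%#lx\n", nid, start_pfn, end_pfn);
}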
@@ -3812,11 +3759,6 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
3812} 3759}
3813#endif 3760#endif
3814 3761
3815/* Basic iterator support to walk early_node_map[] */
3816#define for_each_active_range_index_in_nid(i, nid) \
3817 for (i = first_active_region_index_in_nid(nid); i != -1; \
3818 i = next_active_region_index_in_nid(i, nid))
3819
3820/** 3762/**
3821 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 3763 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
3822 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 3764 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
@@ -3826,122 +3768,34 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
3826 * add_active_ranges() contain no holes and may be freed, 3768 * add_active_ranges() contain no holes and may be freed,
3827 * this function may be used instead of calling free_bootmem() manually. 3769 * this function may be used instead of calling free_bootmem() manually.
3828 */ 3770 */
3829void __init free_bootmem_with_active_regions(int nid, 3771void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
3830 unsigned long max_low_pfn)
3831{
3832 int i;
3833
3834 for_each_active_range_index_in_nid(i, nid) {
3835 unsigned long size_pages = 0;
3836 unsigned long end_pfn = early_node_map[i].end_pfn;
3837
3838 if (early_node_map[i].start_pfn >= max_low_pfn)
3839 continue;
3840
3841 if (end_pfn > max_low_pfn)
3842 end_pfn = max_low_pfn;
3843
3844 size_pages = end_pfn - early_node_map[i].start_pfn;
3845 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
3846 PFN_PHYS(early_node_map[i].start_pfn),
3847 size_pages << PAGE_SHIFT);
3848 }
3849}
3850
3851#ifdef CONFIG_HAVE_MEMBLOCK
3852/*
3853 * Basic iterator support. Return the last range of PFNs for a node
3854 * Note: nid == MAX_NUMNODES returns last region regardless of node
3855 */
3856static int __meminit last_active_region_index_in_nid(int nid)
3857{ 3772{
3858 int i; 3773 unsigned long start_pfn, end_pfn;
3859 3774 int i, this_nid;
3860 for (i = nr_nodemap_entries - 1; i >= 0; i--)
3861 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
3862 return i;
3863
3864 return -1;
3865}
3866
3867/*
3868 * Basic iterator support. Return the previous active range of PFNs for a node
3869 * Note: nid == MAX_NUMNODES returns previous region regardless of node
3870 */
3871static int __meminit previous_active_region_index_in_nid(int index, int nid)
3872{
3873 for (index = index - 1; index >= 0; index--)
3874 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
3875 return index;
3876
3877 return -1;
3878}
3879
3880#define for_each_active_range_index_in_nid_reverse(i, nid) \
3881 for (i = last_active_region_index_in_nid(nid); i != -1; \
3882 i = previous_active_region_index_in_nid(i, nid))
3883
3884u64 __init find_memory_core_early(int nid, u64 size, u64 align,
3885 u64 goal, u64 limit)
3886{
3887 int i;
3888
3889 /* Need to go over early_node_map to find out good range for node */
3890 for_each_active_range_index_in_nid_reverse(i, nid) {
3891 u64 addr;
3892 u64 ei_start, ei_last;
3893 u64 final_start, final_end;
3894
3895 ei_last = early_node_map[i].end_pfn;
3896 ei_last <<= PAGE_SHIFT;
3897 ei_start = early_node_map[i].start_pfn;
3898 ei_start <<= PAGE_SHIFT;
3899
3900 final_start = max(ei_start, goal);
3901 final_end = min(ei_last, limit);
3902
3903 if (final_start >= final_end)
3904 continue;
3905
3906 addr = memblock_find_in_range(final_start, final_end, size, align);
3907 3775
3908 if (addr == MEMBLOCK_ERROR) 3776 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
3909 continue; 3777 start_pfn = min(start_pfn, max_low_pfn);
3778 end_pfn = min(end_pfn, max_low_pfn);
3910 3779
3911 return addr; 3780 if (start_pfn < end_pfn)
3781 free_bootmem_node(NODE_DATA(this_nid),
3782 PFN_PHYS(start_pfn),
3783 (end_pfn - start_pfn) << PAGE_SHIFT);
3912 } 3784 }
3913
3914 return MEMBLOCK_ERROR;
3915} 3785}
3916#endif
3917 3786
3918int __init add_from_early_node_map(struct range *range, int az, 3787int __init add_from_early_node_map(struct range *range, int az,
3919 int nr_range, int nid) 3788 int nr_range, int nid)
3920{ 3789{
3790 unsigned long start_pfn, end_pfn;
3921 int i; 3791 int i;
3922 u64 start, end;
3923 3792
3924 /* need to go over early_node_map to find out good range for node */ 3793 /* need to go over early_node_map to find out good range for node */
3925 for_each_active_range_index_in_nid(i, nid) { 3794 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
3926 start = early_node_map[i].start_pfn; 3795 nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
3927 end = early_node_map[i].end_pfn;
3928 nr_range = add_range(range, az, nr_range, start, end);
3929 }
3930 return nr_range; 3796 return nr_range;
3931} 3797}
3932 3798
3933void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3934{
3935 int i;
3936 int ret;
3937
3938 for_each_active_range_index_in_nid(i, nid) {
3939 ret = work_fn(early_node_map[i].start_pfn,
3940 early_node_map[i].end_pfn, data);
3941 if (ret)
3942 break;
3943 }
3944}
3945/** 3799/**
3946 * sparse_memory_present_with_active_regions - Call memory_present for each active range 3800 * sparse_memory_present_with_active_regions - Call memory_present for each active range
3947 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 3801 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -3952,12 +3806,11 @@ void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3952 */ 3806 */
3953void __init sparse_memory_present_with_active_regions(int nid) 3807void __init sparse_memory_present_with_active_regions(int nid)
3954{ 3808{
3955 int i; 3809 unsigned long start_pfn, end_pfn;
3810 int i, this_nid;
3956 3811
3957 for_each_active_range_index_in_nid(i, nid) 3812 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
3958 memory_present(early_node_map[i].nid, 3813 memory_present(this_nid, start_pfn, end_pfn);
3959 early_node_map[i].start_pfn,
3960 early_node_map[i].end_pfn);
3961} 3814}
3962 3815
3963/** 3816/**
@@ -3974,13 +3827,15 @@ void __init sparse_memory_present_with_active_regions(int nid)
3974void __meminit get_pfn_range_for_nid(unsigned int nid, 3827void __meminit get_pfn_range_for_nid(unsigned int nid,
3975 unsigned long *start_pfn, unsigned long *end_pfn) 3828 unsigned long *start_pfn, unsigned long *end_pfn)
3976{ 3829{
3830 unsigned long this_start_pfn, this_end_pfn;
3977 int i; 3831 int i;
3832
3978 *start_pfn = -1UL; 3833 *start_pfn = -1UL;
3979 *end_pfn = 0; 3834 *end_pfn = 0;
3980 3835
3981 for_each_active_range_index_in_nid(i, nid) { 3836 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
3982 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); 3837 *start_pfn = min(*start_pfn, this_start_pfn);
3983 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); 3838 *end_pfn = max(*end_pfn, this_end_pfn);
3984 } 3839 }
3985 3840
3986 if (*start_pfn == -1UL) 3841 if (*start_pfn == -1UL)
@@ -4083,46 +3938,16 @@ unsigned long __meminit __absent_pages_in_range(int nid,
4083 unsigned long range_start_pfn, 3938 unsigned long range_start_pfn,
4084 unsigned long range_end_pfn) 3939 unsigned long range_end_pfn)
4085{ 3940{
4086 int i = 0; 3941 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4087 unsigned long prev_end_pfn = 0, hole_pages = 0; 3942 unsigned long start_pfn, end_pfn;
4088 unsigned long start_pfn; 3943 int i;
4089
4090 /* Find the end_pfn of the first active range of pfns in the node */
4091 i = first_active_region_index_in_nid(nid);
4092 if (i == -1)
4093 return 0;
4094
4095 prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
4096
4097 /* Account for ranges before physical memory on this node */
4098 if (early_node_map[i].start_pfn > range_start_pfn)
4099 hole_pages = prev_end_pfn - range_start_pfn;
4100
4101 /* Find all holes for the zone within the node */
4102 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
4103
4104 /* No need to continue if prev_end_pfn is outside the zone */
4105 if (prev_end_pfn >= range_end_pfn)
4106 break;
4107
4108 /* Make sure the end of the zone is not within the hole */
4109 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
4110 prev_end_pfn = max(prev_end_pfn, range_start_pfn);
4111 3944
4112 /* Update the hole size count and move on */ 3945 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4113 if (start_pfn > range_start_pfn) { 3946 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4114 BUG_ON(prev_end_pfn > start_pfn); 3947 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4115 hole_pages += start_pfn - prev_end_pfn; 3948 nr_absent -= end_pfn - start_pfn;
4116 }
4117 prev_end_pfn = early_node_map[i].end_pfn;
4118 } 3949 }
4119 3950 return nr_absent;
4120 /* Account for ranges past physical memory on this node */
4121 if (range_end_pfn > prev_end_pfn)
4122 hole_pages += range_end_pfn -
4123 max(range_start_pfn, prev_end_pfn);
4124
4125 return hole_pages;
4126} 3951}
4127 3952
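The rewritten hole accounting starts from the full range and subtracts each present intersection; clamp() makes disjoint ranges contribute zero pages. A standalone model with invented ranges:

#include <stdio.h>

static unsigned long clamp_ul(unsigned long v, unsigned long lo,
			      unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	unsigned long range_start = 100, range_end = 1000;
	unsigned long present[][2] = { { 0, 300 }, { 500, 800 } };
	unsigned long nr_absent = range_end - range_start;

	for (unsigned int i = 0; i < 2; i++) {
		unsigned long s = clamp_ul(present[i][0], range_start, range_end);
		unsigned long e = clamp_ul(present[i][1], range_start, range_end);

		nr_absent -= e - s;	/* subtract the present overlap */
	}
	printf("absent pages: %lu\n", nr_absent);	/* 900 - 200 - 300 = 400 */
	return 0;
}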
4128/** 3953/**
@@ -4143,14 +3968,14 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
4143 unsigned long zone_type, 3968 unsigned long zone_type,
4144 unsigned long *ignored) 3969 unsigned long *ignored)
4145{ 3970{
3971 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
3972 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4146 unsigned long node_start_pfn, node_end_pfn; 3973 unsigned long node_start_pfn, node_end_pfn;
4147 unsigned long zone_start_pfn, zone_end_pfn; 3974 unsigned long zone_start_pfn, zone_end_pfn;
4148 3975
4149 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 3976 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4150 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], 3977 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4151 node_start_pfn); 3978 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4152 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
4153 node_end_pfn);
4154 3979
4155 adjust_zone_range_for_zone_movable(nid, zone_type, 3980 adjust_zone_range_for_zone_movable(nid, zone_type,
4156 node_start_pfn, node_end_pfn, 3981 node_start_pfn, node_end_pfn,
@@ -4158,7 +3983,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
4158 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 3983 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4159} 3984}
4160 3985
4161#else 3986#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4162static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 3987static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4163 unsigned long zone_type, 3988 unsigned long zone_type,
4164 unsigned long *zones_size) 3989 unsigned long *zones_size)
@@ -4176,7 +4001,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4176 return zholes_size[zone_type]; 4001 return zholes_size[zone_type];
4177} 4002}
4178 4003
4179#endif 4004#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4180 4005
4181static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4006static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4182 unsigned long *zones_size, unsigned long *zholes_size) 4007 unsigned long *zones_size, unsigned long *zholes_size)
@@ -4399,10 +4224,10 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4399 */ 4224 */
4400 if (pgdat == NODE_DATA(0)) { 4225 if (pgdat == NODE_DATA(0)) {
4401 mem_map = NODE_DATA(0)->node_mem_map; 4226 mem_map = NODE_DATA(0)->node_mem_map;
4402#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 4227#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4403 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4228 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4404 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4229 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4405#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 4230#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4406 } 4231 }
4407#endif 4232#endif
4408#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4233#endif /* CONFIG_FLAT_NODE_MEM_MAP */
@@ -4427,7 +4252,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4427 free_area_init_core(pgdat, zones_size, zholes_size); 4252 free_area_init_core(pgdat, zones_size, zholes_size);
4428} 4253}
4429 4254
4430#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 4255#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4431 4256
4432#if MAX_NUMNODES > 1 4257#if MAX_NUMNODES > 1
4433/* 4258/*
@@ -4449,170 +4274,6 @@ static inline void setup_nr_node_ids(void)
4449#endif 4274#endif
4450 4275
4451/** 4276/**
4452 * add_active_range - Register a range of PFNs backed by physical memory
4453 * @nid: The node ID the range resides on
4454 * @start_pfn: The start PFN of the available physical memory
4455 * @end_pfn: The end PFN of the available physical memory
4456 *
4457 * These ranges are stored in an early_node_map[] and later used by
4458 * free_area_init_nodes() to calculate zone sizes and holes. If the
4459 * range spans a memory hole, it is up to the architecture to ensure
4460 * the memory is not freed by the bootmem allocator. If possible
4461 * the range being registered will be merged with existing ranges.
4462 */
4463void __init add_active_range(unsigned int nid, unsigned long start_pfn,
4464 unsigned long end_pfn)
4465{
4466 int i;
4467
4468 mminit_dprintk(MMINIT_TRACE, "memory_register",
4469 "Entering add_active_range(%d, %#lx, %#lx) "
4470 "%d entries of %d used\n",
4471 nid, start_pfn, end_pfn,
4472 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
4473
4474 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
4475
4476 /* Merge with existing active regions if possible */
4477 for (i = 0; i < nr_nodemap_entries; i++) {
4478 if (early_node_map[i].nid != nid)
4479 continue;
4480
4481 /* Skip if an existing region covers this new one */
4482 if (start_pfn >= early_node_map[i].start_pfn &&
4483 end_pfn <= early_node_map[i].end_pfn)
4484 return;
4485
4486 /* Merge forward if suitable */
4487 if (start_pfn <= early_node_map[i].end_pfn &&
4488 end_pfn > early_node_map[i].end_pfn) {
4489 early_node_map[i].end_pfn = end_pfn;
4490 return;
4491 }
4492
4493 /* Merge backward if suitable */
4494 if (start_pfn < early_node_map[i].start_pfn &&
4495 end_pfn >= early_node_map[i].start_pfn) {
4496 early_node_map[i].start_pfn = start_pfn;
4497 return;
4498 }
4499 }
4500
4501 /* Check that early_node_map is large enough */
4502 if (i >= MAX_ACTIVE_REGIONS) {
4503 printk(KERN_CRIT "More than %d memory regions, truncating\n",
4504 MAX_ACTIVE_REGIONS);
4505 return;
4506 }
4507
4508 early_node_map[i].nid = nid;
4509 early_node_map[i].start_pfn = start_pfn;
4510 early_node_map[i].end_pfn = end_pfn;
4511 nr_nodemap_entries = i + 1;
4512}
4513
4514/**
4515 * remove_active_range - Shrink an existing registered range of PFNs
4516 * @nid: The node id the range is on that should be shrunk
4517 * @start_pfn: The new PFN of the range
4518 * @end_pfn: The new PFN of the range
4519 *
4520 * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
4521 * The map is kept near the end physical page range that has already been
4522 * registered. This function allows an arch to shrink an existing registered
4523 * range.
4524 */
4525void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
4526 unsigned long end_pfn)
4527{
4528 int i, j;
4529 int removed = 0;
4530
4531 printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n",
4532 nid, start_pfn, end_pfn);
4533
4534 /* Find the old active region end and shrink */
4535 for_each_active_range_index_in_nid(i, nid) {
4536 if (early_node_map[i].start_pfn >= start_pfn &&
4537 early_node_map[i].end_pfn <= end_pfn) {
4538 /* clear it */
4539 early_node_map[i].start_pfn = 0;
4540 early_node_map[i].end_pfn = 0;
4541 removed = 1;
4542 continue;
4543 }
4544 if (early_node_map[i].start_pfn < start_pfn &&
4545 early_node_map[i].end_pfn > start_pfn) {
4546 unsigned long temp_end_pfn = early_node_map[i].end_pfn;
4547 early_node_map[i].end_pfn = start_pfn;
4548 if (temp_end_pfn > end_pfn)
4549 add_active_range(nid, end_pfn, temp_end_pfn);
4550 continue;
4551 }
4552 if (early_node_map[i].start_pfn >= start_pfn &&
4553 early_node_map[i].end_pfn > end_pfn &&
4554 early_node_map[i].start_pfn < end_pfn) {
4555 early_node_map[i].start_pfn = end_pfn;
4556 continue;
4557 }
4558 }
4559
4560 if (!removed)
4561 return;
4562
4563 /* remove the blank ones */
4564 for (i = nr_nodemap_entries - 1; i > 0; i--) {
4565 if (early_node_map[i].nid != nid)
4566 continue;
4567 if (early_node_map[i].end_pfn)
4568 continue;
4569 /* we found it, get rid of it */
4570 for (j = i; j < nr_nodemap_entries - 1; j++)
4571 memcpy(&early_node_map[j], &early_node_map[j+1],
4572 sizeof(early_node_map[j]));
4573 j = nr_nodemap_entries - 1;
4574 memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
4575 nr_nodemap_entries--;
4576 }
4577}
4578
4579/**
4580 * remove_all_active_ranges - Remove all currently registered regions
4581 *
4582 * During discovery, it may be found that a table like SRAT is invalid
4583 * and an alternative discovery method must be used. This function removes
4584 * all currently registered regions.
4585 */
4586void __init remove_all_active_ranges(void)
4587{
4588 memset(early_node_map, 0, sizeof(early_node_map));
4589 nr_nodemap_entries = 0;
4590}
4591
4592/* Compare two active node_active_regions */
4593static int __init cmp_node_active_region(const void *a, const void *b)
4594{
4595 struct node_active_region *arange = (struct node_active_region *)a;
4596 struct node_active_region *brange = (struct node_active_region *)b;
4597
4598 /* Done this way to avoid overflows */
4599 if (arange->start_pfn > brange->start_pfn)
4600 return 1;
4601 if (arange->start_pfn < brange->start_pfn)
4602 return -1;
4603
4604 return 0;
4605}
4606
4607/* sort the node_map by start_pfn */
4608void __init sort_node_map(void)
4609{
4610 sort(early_node_map, (size_t)nr_nodemap_entries,
4611 sizeof(struct node_active_region),
4612 cmp_node_active_region, NULL);
4613}
4614
4615/**
4616 * node_map_pfn_alignment - determine the maximum internode alignment 4277 * node_map_pfn_alignment - determine the maximum internode alignment
4617 * 4278 *
4618 * This function should be called after node map is populated and sorted. 4279 * This function should be called after node map is populated and sorted.
@@ -4634,15 +4295,11 @@ void __init sort_node_map(void)
4634unsigned long __init node_map_pfn_alignment(void) 4295unsigned long __init node_map_pfn_alignment(void)
4635{ 4296{
4636 unsigned long accl_mask = 0, last_end = 0; 4297 unsigned long accl_mask = 0, last_end = 0;
4298 unsigned long start, end, mask;
4637 int last_nid = -1; 4299 int last_nid = -1;
4638 int i; 4300 int i, nid;
4639
4640 for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
4641 int nid = early_node_map[i].nid;
4642 unsigned long start = early_node_map[i].start_pfn;
4643 unsigned long end = early_node_map[i].end_pfn;
4644 unsigned long mask;
4645 4301
4302 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4646 if (!start || last_nid < 0 || last_nid == nid) { 4303 if (!start || last_nid < 0 || last_nid == nid) {
4647 last_nid = nid; 4304 last_nid = nid;
4648 last_end = end; 4305 last_end = end;
@@ -4669,12 +4326,12 @@ unsigned long __init node_map_pfn_alignment(void)
4669/* Find the lowest pfn for a node */ 4326/* Find the lowest pfn for a node */
4670static unsigned long __init find_min_pfn_for_node(int nid) 4327static unsigned long __init find_min_pfn_for_node(int nid)
4671{ 4328{
4672 int i;
4673 unsigned long min_pfn = ULONG_MAX; 4329 unsigned long min_pfn = ULONG_MAX;
4330 unsigned long start_pfn;
4331 int i;
4674 4332
4675 /* Assuming a sorted map, the first range found has the starting pfn */ 4333 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
4676 for_each_active_range_index_in_nid(i, nid) 4334 min_pfn = min(min_pfn, start_pfn);
4677 min_pfn = min(min_pfn, early_node_map[i].start_pfn);
4678 4335
4679 if (min_pfn == ULONG_MAX) { 4336 if (min_pfn == ULONG_MAX) {
4680 printk(KERN_WARNING 4337 printk(KERN_WARNING
@@ -4703,15 +4360,16 @@ unsigned long __init find_min_pfn_with_active_regions(void)
4703 */ 4360 */
4704static unsigned long __init early_calculate_totalpages(void) 4361static unsigned long __init early_calculate_totalpages(void)
4705{ 4362{
4706 int i;
4707 unsigned long totalpages = 0; 4363 unsigned long totalpages = 0;
4364 unsigned long start_pfn, end_pfn;
4365 int i, nid;
4366
4367 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
4368 unsigned long pages = end_pfn - start_pfn;
4708 4369
4709 for (i = 0; i < nr_nodemap_entries; i++) {
4710 unsigned long pages = early_node_map[i].end_pfn -
4711 early_node_map[i].start_pfn;
4712 totalpages += pages; 4370 totalpages += pages;
4713 if (pages) 4371 if (pages)
4714 node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); 4372 node_set_state(nid, N_HIGH_MEMORY);
4715 } 4373 }
4716 return totalpages; 4374 return totalpages;
4717} 4375}
@@ -4766,6 +4424,8 @@ restart:
4766 /* Spread kernelcore memory as evenly as possible throughout nodes */ 4424 /* Spread kernelcore memory as evenly as possible throughout nodes */
4767 kernelcore_node = required_kernelcore / usable_nodes; 4425 kernelcore_node = required_kernelcore / usable_nodes;
4768 for_each_node_state(nid, N_HIGH_MEMORY) { 4426 for_each_node_state(nid, N_HIGH_MEMORY) {
4427 unsigned long start_pfn, end_pfn;
4428
4769 /* 4429 /*
4770 * Recalculate kernelcore_node if the division per node 4430 * Recalculate kernelcore_node if the division per node
4771 * now exceeds what is necessary to satisfy the requested 4431 * now exceeds what is necessary to satisfy the requested
@@ -4782,13 +4442,10 @@ restart:
4782 kernelcore_remaining = kernelcore_node; 4442 kernelcore_remaining = kernelcore_node;
4783 4443
4784 /* Go through each range of PFNs within this node */ 4444 /* Go through each range of PFNs within this node */
4785 for_each_active_range_index_in_nid(i, nid) { 4445 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4786 unsigned long start_pfn, end_pfn;
4787 unsigned long size_pages; 4446 unsigned long size_pages;
4788 4447
4789 start_pfn = max(early_node_map[i].start_pfn, 4448 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
4790 zone_movable_pfn[nid]);
4791 end_pfn = early_node_map[i].end_pfn;
4792 if (start_pfn >= end_pfn) 4449 if (start_pfn >= end_pfn)
4793 continue; 4450 continue;
4794 4451
@@ -4890,11 +4547,8 @@ static void check_for_regular_memory(pg_data_t *pgdat)
4890 */ 4547 */
4891void __init free_area_init_nodes(unsigned long *max_zone_pfn) 4548void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4892{ 4549{
4893 unsigned long nid; 4550 unsigned long start_pfn, end_pfn;
4894 int i; 4551 int i, nid;
4895
4896 /* Sort early_node_map as initialisation assumes it is sorted */
4897 sort_node_map();
4898 4552
4899 /* Record where the zone boundaries are */ 4553 /* Record where the zone boundaries are */
4900 memset(arch_zone_lowest_possible_pfn, 0, 4554 memset(arch_zone_lowest_possible_pfn, 0,
@@ -4941,11 +4595,9 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4941 } 4595 }
4942 4596
4943 /* Print out the early_node_map[] */ 4597 /* Print out the early_node_map[] */
4944 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); 4598 printk("Early memory PFN ranges\n");
4945 for (i = 0; i < nr_nodemap_entries; i++) 4599 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4946 printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid, 4600 printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn);
4947 early_node_map[i].start_pfn,
4948 early_node_map[i].end_pfn);
4949 4601
4950 /* Initialise every node */ 4602 /* Initialise every node */
4951 mminit_verify_pageflags_layout(); 4603 mminit_verify_pageflags_layout();
@@ -4998,7 +4650,7 @@ static int __init cmdline_parse_movablecore(char *p)
4998early_param("kernelcore", cmdline_parse_kernelcore); 4650early_param("kernelcore", cmdline_parse_kernelcore);
4999early_param("movablecore", cmdline_parse_movablecore); 4651early_param("movablecore", cmdline_parse_movablecore);
5000 4652
5001#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 4653#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5002 4654
5003/** 4655/**
5004 * set_dma_reserve - set the specified number of pages reserved in the first zone 4656 * set_dma_reserve - set the specified number of pages reserved in the first zone
diff --git a/mm/slub.c b/mm/slub.c
index ed3334d9b6da..09ccee8fb58e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -368,7 +368,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
368 VM_BUG_ON(!irqs_disabled()); 368 VM_BUG_ON(!irqs_disabled());
369#ifdef CONFIG_CMPXCHG_DOUBLE 369#ifdef CONFIG_CMPXCHG_DOUBLE
370 if (s->flags & __CMPXCHG_DOUBLE) { 370 if (s->flags & __CMPXCHG_DOUBLE) {
371 if (cmpxchg_double(&page->freelist, 371 if (cmpxchg_double(&page->freelist, &page->counters,
372 freelist_old, counters_old, 372 freelist_old, counters_old,
373 freelist_new, counters_new)) 373 freelist_new, counters_new))
374 return 1; 374 return 1;
@@ -402,7 +402,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
402{ 402{
403#ifdef CONFIG_CMPXCHG_DOUBLE 403#ifdef CONFIG_CMPXCHG_DOUBLE
404 if (s->flags & __CMPXCHG_DOUBLE) { 404 if (s->flags & __CMPXCHG_DOUBLE) {
405 if (cmpxchg_double(&page->freelist, 405 if (cmpxchg_double(&page->freelist, &page->counters,
406 freelist_old, counters_old, 406 freelist_old, counters_old,
407 freelist_new, counters_new)) 407 freelist_new, counters_new))
408 return 1; 408 return 1;
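The one-line fix supplies the missing &page->counters argument: cmpxchg_double() atomically compares and swaps two adjacent machine words, so it needs both addresses. A rough userspace analogue using a 16-byte CAS; this is not kernel code and typically needs gcc -mcx16 or libatomic:

#include <stdio.h>
#include <stdbool.h>

struct pair {
	void *freelist;
	unsigned long counters;
} __attribute__((aligned(16)));

static bool cas_pair(struct pair *p, struct pair old, struct pair new)
{
	/* compare-and-swap both words as one 16-byte unit */
	return __atomic_compare_exchange(p, &old, &new, false,
					 __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main(void)
{
	struct pair p = { NULL, 1 };
	bool ok = cas_pair(&p, (struct pair){ NULL, 1 },
			   (struct pair){ (void *)0x10, 2 });

	printf("swapped: %d, counters now %lu\n", ok, p.counters);
	return 0;
}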
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index be84ae33ae36..b84458dcc226 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -613,7 +613,7 @@ static int hci_dev_do_close(struct hci_dev *hdev)
613 if (!test_bit(HCI_RAW, &hdev->flags)) { 613 if (!test_bit(HCI_RAW, &hdev->flags)) {
614 set_bit(HCI_INIT, &hdev->flags); 614 set_bit(HCI_INIT, &hdev->flags);
615 __hci_request(hdev, hci_reset_req, 0, 615 __hci_request(hdev, hci_reset_req, 0,
616 msecs_to_jiffies(HCI_INIT_TIMEOUT)); 616 msecs_to_jiffies(250));
617 clear_bit(HCI_INIT, &hdev->flags); 617 clear_bit(HCI_INIT, &hdev->flags);
618 } 618 }
619 619
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 12571fb2881c..29fa5badde75 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -616,7 +616,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
616 if ((cp) && (!cp->dest)) { 616 if ((cp) && (!cp->dest)) {
617 dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, 617 dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
618 cp->dport, &cp->vaddr, cp->vport, 618 cp->dport, &cp->vaddr, cp->vport,
619 cp->protocol, cp->fwmark); 619 cp->protocol, cp->fwmark, cp->flags);
620 ip_vs_bind_dest(cp, dest); 620 ip_vs_bind_dest(cp, dest);
621 return dest; 621 return dest;
622 } else 622 } else
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 008bf97cc91a..e1a66cf37f9a 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -619,15 +619,21 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
619 const union nf_inet_addr *daddr, 619 const union nf_inet_addr *daddr,
620 __be16 dport, 620 __be16 dport,
621 const union nf_inet_addr *vaddr, 621 const union nf_inet_addr *vaddr,
622 __be16 vport, __u16 protocol, __u32 fwmark) 622 __be16 vport, __u16 protocol, __u32 fwmark,
623 __u32 flags)
623{ 624{
624 struct ip_vs_dest *dest; 625 struct ip_vs_dest *dest;
625 struct ip_vs_service *svc; 626 struct ip_vs_service *svc;
627 __be16 port = dport;
626 628
627 svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport); 629 svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
628 if (!svc) 630 if (!svc)
629 return NULL; 631 return NULL;
630 dest = ip_vs_lookup_dest(svc, daddr, dport); 632 if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
633 port = 0;
634 dest = ip_vs_lookup_dest(svc, daddr, port);
635 if (!dest)
636 dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
631 if (dest) 637 if (dest)
632 atomic_inc(&dest->refcnt); 638 atomic_inc(&dest->refcnt);
633 ip_vs_service_put(svc); 639 ip_vs_service_put(svc);
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 3cdd479f9b5d..2b6678c0ce14 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -740,7 +740,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
740 * but still handled. 740 * but still handled.
741 */ 741 */
742 dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, 742 dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
743 param->vport, protocol, fwmark); 743 param->vport, protocol, fwmark, flags);
744 744
745 /* Set the appropriate activity flag */ 745 /* Set the appropriate activity flag */
746 if (protocol == IPPROTO_TCP) { 746 if (protocol == IPPROTO_TCP) {
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index ef21b221f036..257e77256c5c 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -135,7 +135,7 @@ nla_put_failure:
135static inline int 135static inline int
136ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) 136ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
137{ 137{
138 long timeout = (ct->timeout.expires - jiffies) / HZ; 138 long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ;
139 139
140 if (timeout < 0) 140 if (timeout < 0)
141 timeout = 0; 141 timeout = 0;
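The casts fix a wraparound: expires and jiffies are unsigned, so once a timer has already expired the difference wraps to a huge positive value instead of going negative where the clamp below can catch it. A standalone demonstration (HZ stood in by 100):

#include <stdio.h>

#define HZ 100	/* stand-in value */

int main(void)
{
	unsigned long jiffies = 1000;
	unsigned long expires = 400;	/* timer expired 600 ticks ago */

	long wrong = (expires - jiffies) / HZ;	/* unsigned math wraps */
	long right = ((long)expires - (long)jiffies) / HZ;

	printf("wrong: %ld  right: %ld\n", wrong, right);
	return 0;
}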
@@ -1358,12 +1358,15 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
1358 nf_ct_protonum(ct)); 1358 nf_ct_protonum(ct));
1359 if (helper == NULL) { 1359 if (helper == NULL) {
1360 rcu_read_unlock(); 1360 rcu_read_unlock();
1361 spin_unlock_bh(&nf_conntrack_lock);
1361#ifdef CONFIG_MODULES 1362#ifdef CONFIG_MODULES
1362 if (request_module("nfct-helper-%s", helpname) < 0) { 1363 if (request_module("nfct-helper-%s", helpname) < 0) {
1364 spin_lock_bh(&nf_conntrack_lock);
1363 err = -EOPNOTSUPP; 1365 err = -EOPNOTSUPP;
1364 goto err1; 1366 goto err1;
1365 } 1367 }
1366 1368
1369 spin_lock_bh(&nf_conntrack_lock);
1367 rcu_read_lock(); 1370 rcu_read_lock();
1368 helper = __nf_conntrack_helper_find(helpname, 1371 helper = __nf_conntrack_helper_find(helpname,
1369 nf_ct_l3num(ct), 1372 nf_ct_l3num(ct),
@@ -1638,7 +1641,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
1638 const struct nf_conntrack_expect *exp) 1641 const struct nf_conntrack_expect *exp)
1639{ 1642{
1640 struct nf_conn *master = exp->master; 1643 struct nf_conn *master = exp->master;
1641 long timeout = (exp->timeout.expires - jiffies) / HZ; 1644 long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ;
1642 struct nf_conn_help *help; 1645 struct nf_conn_help *help;
1643 1646
1644 if (timeout < 0) 1647 if (timeout < 0)
@@ -1869,25 +1872,30 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
1869 1872
1870 err = -ENOMEM; 1873 err = -ENOMEM;
1871 skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 1874 skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1872 if (skb2 == NULL) 1875 if (skb2 == NULL) {
1876 nf_ct_expect_put(exp);
1873 goto out; 1877 goto out;
1878 }
1874 1879
1875 rcu_read_lock(); 1880 rcu_read_lock();
1876 err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, 1881 err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid,
1877 nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp); 1882 nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp);
1878 rcu_read_unlock(); 1883 rcu_read_unlock();
1884 nf_ct_expect_put(exp);
1879 if (err <= 0) 1885 if (err <= 0)
1880 goto free; 1886 goto free;
1881 1887
1882 nf_ct_expect_put(exp); 1888 err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
1889 if (err < 0)
1890 goto out;
1883 1891
1884 return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); 1892 return 0;
1885 1893
1886free: 1894free:
1887 kfree_skb(skb2); 1895 kfree_skb(skb2);
1888out: 1896out:
1889 nf_ct_expect_put(exp); 1897 /* this avoids a loop in nfnetlink. */
1890 return err; 1898 return err == -EAGAIN ? -ENOBUFS : err;
1891} 1899}
1892 1900
1893static int 1901static int
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3891702b81df..d9d4970b9b07 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2448,8 +2448,12 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
2448{ 2448{
2449 struct packet_sock *po = pkt_sk(sk); 2449 struct packet_sock *po = pkt_sk(sk);
2450 2450
2451 if (po->fanout) 2451 if (po->fanout) {
2452 if (dev)
2453 dev_put(dev);
2454
2452 return -EINVAL; 2455 return -EINVAL;
2456 }
2453 2457
2454 lock_sock(sk); 2458 lock_sock(sk);
2455 2459
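The added dev_put() closes a reference leak: packet_do_bind() is handed a device reference taken by its caller and must release it on every exit, including this early error. A hedged sketch of the convention, with an invented function name and elisions where the real function continues:

/* Illustrative only -- mirrors the rule, not the full function. */
static int bind_consumes_dev_ref(struct sock *sk, struct net_device *dev)
{
	if (pkt_sk(sk)->fanout) {
		if (dev)
			dev_put(dev);	/* drop the caller's dev_hold() */
		return -EINVAL;
	}
	/* ... the normal path releases dev when done with it, too ... */
	return 0;
}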
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index eb3b9a86c6ed..a4ab207cdc59 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -488,7 +488,7 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
488 return -EINVAL; 488 return -EINVAL;
489 489
490 s = sizeof(struct disttable) + n * sizeof(s16); 490 s = sizeof(struct disttable) + n * sizeof(s16);
491 d = kmalloc(s, GFP_KERNEL); 491 d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
492 if (!d) 492 if (!d)
493 d = vmalloc(s); 493 d = vmalloc(s);
494 if (!d) 494 if (!d)
@@ -501,9 +501,10 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
501 root_lock = qdisc_root_sleeping_lock(sch); 501 root_lock = qdisc_root_sleeping_lock(sch);
502 502
503 spin_lock_bh(root_lock); 503 spin_lock_bh(root_lock);
504 dist_free(q->delay_dist); 504 swap(q->delay_dist, d);
505 q->delay_dist = d;
506 spin_unlock_bh(root_lock); 505 spin_unlock_bh(root_lock);
506
507 dist_free(d);
507 return 0; 508 return 0;
508} 509}
509 510
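Two things happen in this hunk: the kmalloc() attempt is silenced with __GFP_NOWARN before falling back to vmalloc() for large tables, and the old table is now published out with swap() and freed only after the qdisc lock is dropped, since dist_free() may end in vfree(), which must not run under the BH spinlock. A hedged sketch of the allocation pairing (helper names invented):

static void *alloc_table(size_t size)
{
	/* try the slab quietly; large tables fall back to vmalloc */
	void *p = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);

	if (!p)
		p = vmalloc(size);
	return p;
}

static void free_table(void *p)
{
	/* free with the routine matching the allocator actually used */
	if (is_vmalloc_addr(p))
		vfree(p);
	else
		kfree(p);
}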
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 103343408593..7b0325459e71 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -817,11 +817,11 @@ skip_unblock:
817static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) 817static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
818{ 818{
819 unsigned long mask; 819 unsigned long mask;
820 uint32_t limit, roundedF; 820 u64 limit, roundedF;
821 int slot_shift = cl->grp->slot_shift; 821 int slot_shift = cl->grp->slot_shift;
822 822
823 roundedF = qfq_round_down(cl->F, slot_shift); 823 roundedF = qfq_round_down(cl->F, slot_shift);
824 limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift); 824 limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
825 825
826 if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { 826 if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
827 /* timestamp was stale */ 827 /* timestamp was stale */
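Widening limit and roundedF to u64 avoids truncation: q->V is a 64-bit virtual time, so on 32-bit arithmetic both the rounded value and the 1UL << slot_shift addend can wrap. A standalone model, with qfq_round_down() modeled as clearing the low slot bits and invented values:

#include <stdio.h>
#include <stdint.h>

static uint64_t round_down_shift(uint64_t t, int shift)
{
	return (t >> shift) << shift;	/* model of qfq_round_down() */
}

int main(void)
{
	int slot_shift = 30;
	uint64_t V = 0x180000000ULL;	/* 64-bit virtual time */

	uint32_t limit32 = (uint32_t)round_down_shift(V, slot_shift)
			   + (1UL << slot_shift);	/* truncates */
	uint64_t limit64 = round_down_shift(V, slot_shift)
			   + (1ULL << slot_shift);	/* correct */

	printf("32-bit limit %#x vs 64-bit limit %#llx\n",
	       limit32, (unsigned long long)limit64);
	return 0;
}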
diff --git a/net/socket.c b/net/socket.c
index 2877647f347b..a0053750e37a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2883,7 +2883,7 @@ static int bond_ioctl(struct net *net, unsigned int cmd,
2883 2883
2884 return dev_ioctl(net, cmd, uifr); 2884 return dev_ioctl(net, cmd, uifr);
2885 default: 2885 default:
2886 return -EINVAL; 2886 return -ENOIOCTLCMD;
2887 } 2887 }
2888} 2888}
2889 2889
@@ -3210,20 +3210,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
3210 return sock_do_ioctl(net, sock, cmd, arg); 3210 return sock_do_ioctl(net, sock, cmd, arg);
3211 } 3211 }
3212 3212
3213 /* Prevent warning from compat_sys_ioctl, these always
3214 * result in -EINVAL in the native case anyway. */
3215 switch (cmd) {
3216 case SIOCRTMSG:
3217 case SIOCGIFCOUNT:
3218 case SIOCSRARP:
3219 case SIOCGRARP:
3220 case SIOCDRARP:
3221 case SIOCSIFLINK:
3222 case SIOCGIFSLAVE:
3223 case SIOCSIFSLAVE:
3224 return -EINVAL;
3225 }
3226
3227 return -ENOIOCTLCMD; 3213 return -ENOIOCTLCMD;
3228} 3214}
3229 3215
diff --git a/security/security.c b/security/security.c
index 0c6cc69c8f86..e2f684aeb70c 100644
--- a/security/security.c
+++ b/security/security.c
@@ -381,7 +381,7 @@ int security_old_inode_init_security(struct inode *inode, struct inode *dir,
381 void **value, size_t *len) 381 void **value, size_t *len)
382{ 382{
383 if (unlikely(IS_PRIVATE(inode))) 383 if (unlikely(IS_PRIVATE(inode)))
384 return 0; 384 return -EOPNOTSUPP;
385 return security_ops->inode_init_security(inode, dir, qstr, name, value, 385 return security_ops->inode_init_security(inode, dir, qstr, name, value,
386 len); 386 len);
387} 387}
diff --git a/sound/soc/codecs/wm8776.c b/sound/soc/codecs/wm8776.c
index bfdc52370ad0..d3b0a20744f1 100644
--- a/sound/soc/codecs/wm8776.c
+++ b/sound/soc/codecs/wm8776.c
@@ -235,6 +235,7 @@ static int wm8776_hw_params(struct snd_pcm_substream *substream,
235 switch (snd_pcm_format_width(params_format(params))) { 235 switch (snd_pcm_format_width(params_format(params))) {
236 case 16: 236 case 16:
237 iface = 0; 237 iface = 0;
238 break;
238 case 20: 239 case 20:
239 iface = 0x10; 240 iface = 0x10;
240 break; 241 break;
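The added break closes an unintended fall-through: a 16-bit stream would continue into the 20-bit case and overwrite iface. A standalone demonstration of the bug being fixed:

#include <stdio.h>

int main(void)
{
	int width = 16, iface = -1;

	switch (width) {
	case 16:
		iface = 0;
		break;		/* the added statement */
	case 20:
		iface = 0x10;	/* without the break, 16-bit lands here too */
		break;
	}
	printf("iface = %#x\n", iface);
	return 0;
}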
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
index fe6762ed56bd..c89f9e1453f7 100644
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -22,7 +22,7 @@ OPTIONS
22------- 22-------
23-i:: 23-i::
24--input=:: 24--input=::
25 Input file name. (default: perf.data) 25 Input file name. (default: perf.data unless stdin is a fifo)
26 26
27-d:: 27-d::
28--dsos=<dso[,dso...]>:: 28--dsos=<dso[,dso...]>::
@@ -66,7 +66,7 @@ OPTIONS
66 used. This interface starts by centering on the line with more 66 used. This interface starts by centering on the line with more
67 samples, TAB/UNTAB cycles through the lines with more samples. 67 samples, TAB/UNTAB cycles through the lines with more samples.
68 68
69-c:: 69-C::
70--cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can 70--cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can
71 be provided as a comma-separated list with no space: 0,1. Ranges of 71 be provided as a comma-separated list with no space: 0,1. Ranges of
72 CPUs are specified with -: 0-2. Default is to report samples on all 72 CPUs are specified with -: 0-2. Default is to report samples on all
diff --git a/tools/perf/Documentation/perf-buildid-list.txt b/tools/perf/Documentation/perf-buildid-list.txt
index cc22325ffd1b..25c52efcc7f0 100644
--- a/tools/perf/Documentation/perf-buildid-list.txt
+++ b/tools/perf/Documentation/perf-buildid-list.txt
@@ -26,7 +26,7 @@ OPTIONS
26 Show only DSOs with hits. 26 Show only DSOs with hits.
27-i:: 27-i::
28--input=:: 28--input=::
29 Input file name. (default: perf.data) 29 Input file name. (default: perf.data unless stdin is a fifo)
30-f:: 30-f::
31--force:: 31--force::
32 Don't do ownership validation. 32 Don't do ownership validation.
diff --git a/tools/perf/Documentation/perf-evlist.txt b/tools/perf/Documentation/perf-evlist.txt
index 0cada9e053dc..0507ec7bad71 100644
--- a/tools/perf/Documentation/perf-evlist.txt
+++ b/tools/perf/Documentation/perf-evlist.txt
@@ -18,7 +18,7 @@ OPTIONS
18------- 18-------
19-i:: 19-i::
20--input=:: 20--input=::
21 Input file name. (default: perf.data) 21 Input file name. (default: perf.data unless stdin is a fifo)
22 22
23SEE ALSO 23SEE ALSO
24-------- 24--------
diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt
index a52fcde894c7..7c8fbbf3f61c 100644
--- a/tools/perf/Documentation/perf-kmem.txt
+++ b/tools/perf/Documentation/perf-kmem.txt
@@ -23,7 +23,7 @@ OPTIONS
23------- 23-------
24-i <file>:: 24-i <file>::
25--input=<file>:: 25--input=<file>::
26 Select the input file (default: perf.data) 26 Select the input file (default: perf.data unless stdin is a fifo)
27 27
28--caller:: 28--caller::
29 Show per-callsite statistics 29 Show per-callsite statistics
diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt
index 4a26a2f3a6a3..d6b2a4f2108b 100644
--- a/tools/perf/Documentation/perf-lock.txt
+++ b/tools/perf/Documentation/perf-lock.txt
@@ -29,7 +29,7 @@ COMMON OPTIONS
29 29
30-i:: 30-i::
31--input=<file>:: 31--input=<file>::
32 Input file name. 32 Input file name. (default: perf.data unless stdin is a fifo)
33 33
34-v:: 34-v::
35--verbose:: 35--verbose::
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 5a520f825295..2937f7e14bb7 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -89,7 +89,7 @@ OPTIONS
89 89
90-m:: 90-m::
91--mmap-pages=:: 91--mmap-pages=::
92 Number of mmap data pages. 92 Number of mmap data pages. Must be a power of two.
93 93
94-g:: 94-g::
95--call-graph:: 95--call-graph::
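
The mmap buffer perf sets up is one metadata page followed by a power-of-two number of data pages, hence the constraint now documented for -m/--mmap-pages; the builtin-record.c hunk later in this diff rejects other values with is_power_of_2(). The test reduces to a one-liner:

    /* True for 1, 2, 4, 8, ...; false for 0 and every non-power of two. */
    static inline int is_power_of_two(unsigned long n)
    {
            return n != 0 && (n & (n - 1)) == 0;
    }
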
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 212f24d672e1..9b430e98712e 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -19,7 +19,7 @@ OPTIONS
19------- 19-------
20-i:: 20-i::
21--input=:: 21--input=::
22 Input file name. (default: perf.data) 22 Input file name. (default: perf.data unless stdin is a fifo)
23 23
24-v:: 24-v::
25--verbose:: 25--verbose::
@@ -39,7 +39,7 @@ OPTIONS
39-T:: 39-T::
40--threads:: 40--threads::
41 Show per-thread event counters 41 Show per-thread event counters
42-C:: 42-c::
43--comms=:: 43--comms=::
44 Only consider symbols in these comms. CSV that understands 44 Only consider symbols in these comms. CSV that understands
45 file://filename entries. 45 file://filename entries.
@@ -80,9 +80,10 @@ OPTIONS
80--dump-raw-trace:: 80--dump-raw-trace::
81 Dump raw trace in ASCII. 81 Dump raw trace in ASCII.
82 82
83-g [type,min,order]:: 83-g [type,min[,limit],order]::
84--call-graph:: 84--call-graph::
85 Display call chains using type, min percent threshold and order. 85 Display call chains using type, min percent threshold, optional print
86 limit and order.
86 type can be either: 87 type can be either:
87 - flat: single column, linear exposure of call chains. 88 - flat: single column, linear exposure of call chains.
88 - graph: use a graph tree, displaying absolute overhead rates. 89 - graph: use a graph tree, displaying absolute overhead rates.
@@ -128,7 +129,7 @@ OPTIONS
128--symfs=<directory>:: 129--symfs=<directory>::
129 Look for files with symbols relative to this directory. 130 Look for files with symbols relative to this directory.
130 131
131-c:: 132-C::
132--cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can 133--cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can
133 be provided as a comma-separated list with no space: 0,1. Ranges of 134 be provided as a comma-separated list with no space: 0,1. Ranges of
134 CPUs are specified with -: 0-2. Default is to report samples on all 135 CPUs are specified with -: 0-2. Default is to report samples on all
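
With the optional print limit in place, a report invocation can cap how many entries of each call chain get printed. An illustrative command line (the values are examples, not defaults):

    perf report -g graph,0.5,10,caller

i.e. graph-style chains, a 0.5 percent minimum threshold, at most 10 printed entries per chain, caller order.
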
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 5b212b57f70b..8ff4df956951 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -40,7 +40,7 @@ OPTIONS
40------- 40-------
41-i:: 41-i::
42--input=<file>:: 42--input=<file>::
43 Input file name. (default: perf.data) 43 Input file name. (default: perf.data unless stdin is a fifo)
44 44
45-v:: 45-v::
46--verbose:: 46--verbose::
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index dec87ecb530e..2f6cef43da25 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -106,7 +106,7 @@ OPTIONS
106 106
107-i:: 107-i::
108--input=:: 108--input=::
109 Input file name. 109 Input file name. (default: perf.data unless stdin is a fifo)
110 110
111-d:: 111-d::
112--debug-mode:: 112--debug-mode::
@@ -182,12 +182,17 @@ OPTIONS
182--hide-call-graph:: 182--hide-call-graph::
183 When printing symbols do not display call chain. 183 When printing symbols do not display call chain.
184 184
185-c:: 185-C::
186--cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can 186--cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can
187 be provided as a comma-separated list with no space: 0,1. Ranges of 187 be provided as a comma-separated list with no space: 0,1. Ranges of
188 CPUs are specified with -: 0-2. Default is to report samples on all 188 CPUs are specified with -: 0-2. Default is to report samples on all
189 CPUs. 189 CPUs.
190 190
191-c::
192--comms=::
193 Only display events for these comms. CSV that understands
194 file://filename entries.
195
191-I:: 196-I::
192--show-info:: 197--show-info::
193 Display extended information about the perf.data file. This adds 198 Display extended information about the perf.data file. This adds
diff --git a/tools/perf/Documentation/perf-test.txt b/tools/perf/Documentation/perf-test.txt
index 2c3b462f64b0..b24ac40fcd58 100644
--- a/tools/perf/Documentation/perf-test.txt
+++ b/tools/perf/Documentation/perf-test.txt
@@ -8,13 +8,19 @@ perf-test - Runs sanity tests.
8SYNOPSIS 8SYNOPSIS
9-------- 9--------
10[verse] 10[verse]
11'perf test <options>' 11'perf test [<options>] [{list <test-name-fragment>|[<test-name-fragments>|<test-numbers>]}]'
12 12
13DESCRIPTION 13DESCRIPTION
14----------- 14-----------
15This command does assorted sanity tests, initially through linked routines but 15This command does assorted sanity tests, initially through linked routines but
16will also look for a directory with more tests in the form of scripts. 16will also look for a directory with more tests in the form of scripts.
17 17
18To get a list of available tests use 'perf test list'; specifying a test name
19fragment will show all tests that contain it.
20
21To run only specific tests, pass test name fragments or the numbers obtained
22from 'perf test list'.
23
18OPTIONS 24OPTIONS
19------- 25-------
20-v:: 26-v::
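
Concretely, the new synopsis allows invocations like the following (illustrative):

    perf test list              # show all tests with their numbers
    perf test list vmlinux      # show only tests whose name contains "vmlinux"
    perf test vmlinux           # run every test matching the fragment
    perf test 1 2               # run tests number 1 and 2
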
diff --git a/tools/perf/Documentation/perf-timechart.txt b/tools/perf/Documentation/perf-timechart.txt
index d7b79e2ba2ad..1632b0efc757 100644
--- a/tools/perf/Documentation/perf-timechart.txt
+++ b/tools/perf/Documentation/perf-timechart.txt
@@ -27,7 +27,7 @@ OPTIONS
27 Select the output file (default: output.svg) 27 Select the output file (default: output.svg)
28-i:: 28-i::
29--input=:: 29--input=::
30 Select the input file (default: perf.data) 30 Select the input file (default: perf.data unless stdin is a fifo)
31-w:: 31-w::
32--width=:: 32--width=::
33 Select the width of the SVG file (default: 1000) 33 Select the width of the SVG file (default: 1000)
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index b98e3075646b..ac86d67b636e 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -278,6 +278,7 @@ LIB_H += util/strbuf.h
278LIB_H += util/strlist.h 278LIB_H += util/strlist.h
279LIB_H += util/strfilter.h 279LIB_H += util/strfilter.h
280LIB_H += util/svghelper.h 280LIB_H += util/svghelper.h
281LIB_H += util/tool.h
281LIB_H += util/run-command.h 282LIB_H += util/run-command.h
282LIB_H += util/sigchain.h 283LIB_H += util/sigchain.h
283LIB_H += util/symbol.h 284LIB_H += util/symbol.h
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 46b4c24f338e..214ba7f9f577 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -27,32 +27,32 @@
27#include "util/sort.h" 27#include "util/sort.h"
28#include "util/hist.h" 28#include "util/hist.h"
29#include "util/session.h" 29#include "util/session.h"
30#include "util/tool.h"
30 31
31#include <linux/bitmap.h> 32#include <linux/bitmap.h>
32 33
33static char const *input_name = "perf.data"; 34struct perf_annotate {
34 35 struct perf_tool tool;
35static bool force, use_tui, use_stdio; 36 char const *input_name;
36 37 bool force, use_tui, use_stdio;
37static bool full_paths; 38 bool full_paths;
38 39 bool print_line;
39static bool print_line; 40 const char *sym_hist_filter;
40 41 const char *cpu_list;
41static const char *sym_hist_filter; 42 DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
42 43};
43static const char *cpu_list;
44static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
45 44
46static int perf_evlist__add_sample(struct perf_evlist *evlist, 45static int perf_evsel__add_sample(struct perf_evsel *evsel,
47 struct perf_sample *sample, 46 struct perf_sample *sample,
48 struct perf_evsel *evsel, 47 struct addr_location *al,
49 struct addr_location *al) 48 struct perf_annotate *ann)
50{ 49{
51 struct hist_entry *he; 50 struct hist_entry *he;
52 int ret; 51 int ret;
53 52
54 if (sym_hist_filter != NULL && 53 if (ann->sym_hist_filter != NULL &&
55 (al->sym == NULL || strcmp(sym_hist_filter, al->sym->name) != 0)) { 54 (al->sym == NULL ||
55 strcmp(ann->sym_hist_filter, al->sym->name) != 0)) {
56 /* We're only interested in a symbol named sym_hist_filter */ 56 /* We're only interested in a symbol named sym_hist_filter */
57 if (al->sym != NULL) { 57 if (al->sym != NULL) {
58 rb_erase(&al->sym->rb_node, 58 rb_erase(&al->sym->rb_node,
@@ -69,8 +69,7 @@ static int perf_evlist__add_sample(struct perf_evlist *evlist,
69 ret = 0; 69 ret = 0;
70 if (he->ms.sym != NULL) { 70 if (he->ms.sym != NULL) {
71 struct annotation *notes = symbol__annotation(he->ms.sym); 71 struct annotation *notes = symbol__annotation(he->ms.sym);
72 if (notes->src == NULL && 72 if (notes->src == NULL && symbol__alloc_hist(he->ms.sym) < 0)
73 symbol__alloc_hist(he->ms.sym, evlist->nr_entries) < 0)
74 return -ENOMEM; 73 return -ENOMEM;
75 74
76 ret = hist_entry__inc_addr_samples(he, evsel->idx, al->addr); 75 ret = hist_entry__inc_addr_samples(he, evsel->idx, al->addr);
@@ -81,25 +80,26 @@ static int perf_evlist__add_sample(struct perf_evlist *evlist,
81 return ret; 80 return ret;
82} 81}
83 82
84static int process_sample_event(union perf_event *event, 83static int process_sample_event(struct perf_tool *tool,
84 union perf_event *event,
85 struct perf_sample *sample, 85 struct perf_sample *sample,
86 struct perf_evsel *evsel, 86 struct perf_evsel *evsel,
87 struct perf_session *session) 87 struct machine *machine)
88{ 88{
89 struct perf_annotate *ann = container_of(tool, struct perf_annotate, tool);
89 struct addr_location al; 90 struct addr_location al;
90 91
91 if (perf_event__preprocess_sample(event, session, &al, sample, 92 if (perf_event__preprocess_sample(event, machine, &al, sample,
92 symbol__annotate_init) < 0) { 93 symbol__annotate_init) < 0) {
93 pr_warning("problem processing %d event, skipping it.\n", 94 pr_warning("problem processing %d event, skipping it.\n",
94 event->header.type); 95 event->header.type);
95 return -1; 96 return -1;
96 } 97 }
97 98
98 if (cpu_list && !test_bit(sample->cpu, cpu_bitmap)) 99 if (ann->cpu_list && !test_bit(sample->cpu, ann->cpu_bitmap))
99 return 0; 100 return 0;
100 101
101 if (!al.filtered && 102 if (!al.filtered && perf_evsel__add_sample(evsel, sample, &al, ann)) {
102 perf_evlist__add_sample(session->evlist, sample, evsel, &al)) {
103 pr_warning("problem incrementing symbol count, " 103 pr_warning("problem incrementing symbol count, "
104 "skipping event\n"); 104 "skipping event\n");
105 return -1; 105 return -1;
@@ -108,14 +108,15 @@ static int process_sample_event(union perf_event *event,
108 return 0; 108 return 0;
109} 109}
110 110
111static int hist_entry__tty_annotate(struct hist_entry *he, int evidx) 111static int hist_entry__tty_annotate(struct hist_entry *he, int evidx,
112 struct perf_annotate *ann)
112{ 113{
113 return symbol__tty_annotate(he->ms.sym, he->ms.map, evidx, 114 return symbol__tty_annotate(he->ms.sym, he->ms.map, evidx,
114 print_line, full_paths, 0, 0); 115 ann->print_line, ann->full_paths, 0, 0);
115} 116}
116 117
117static void hists__find_annotations(struct hists *self, int evidx, 118static void hists__find_annotations(struct hists *self, int evidx,
118 int nr_events) 119 struct perf_annotate *ann)
119{ 120{
120 struct rb_node *nd = rb_first(&self->entries), *next; 121 struct rb_node *nd = rb_first(&self->entries), *next;
121 int key = K_RIGHT; 122 int key = K_RIGHT;
@@ -138,8 +139,7 @@ find_next:
138 } 139 }
139 140
140 if (use_browser > 0) { 141 if (use_browser > 0) {
141 key = hist_entry__tui_annotate(he, evidx, nr_events, 142 key = hist_entry__tui_annotate(he, evidx, NULL, NULL, 0);
142 NULL, NULL, 0);
143 switch (key) { 143 switch (key) {
144 case K_RIGHT: 144 case K_RIGHT:
145 next = rb_next(nd); 145 next = rb_next(nd);
@@ -154,7 +154,7 @@ find_next:
154 if (next != NULL) 154 if (next != NULL)
155 nd = next; 155 nd = next;
156 } else { 156 } else {
157 hist_entry__tty_annotate(he, evidx); 157 hist_entry__tty_annotate(he, evidx, ann);
158 nd = rb_next(nd); 158 nd = rb_next(nd);
159 /* 159 /*
160 * Since we have a hist_entry per IP for the same 160 * Since we have a hist_entry per IP for the same
@@ -167,33 +167,26 @@ find_next:
167 } 167 }
168} 168}
169 169
170static struct perf_event_ops event_ops = { 170static int __cmd_annotate(struct perf_annotate *ann)
171 .sample = process_sample_event,
172 .mmap = perf_event__process_mmap,
173 .comm = perf_event__process_comm,
174 .fork = perf_event__process_task,
175 .ordered_samples = true,
176 .ordering_requires_timestamps = true,
177};
178
179static int __cmd_annotate(void)
180{ 171{
181 int ret; 172 int ret;
182 struct perf_session *session; 173 struct perf_session *session;
183 struct perf_evsel *pos; 174 struct perf_evsel *pos;
184 u64 total_nr_samples; 175 u64 total_nr_samples;
185 176
186 session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops); 177 session = perf_session__new(ann->input_name, O_RDONLY,
178 ann->force, false, &ann->tool);
187 if (session == NULL) 179 if (session == NULL)
188 return -ENOMEM; 180 return -ENOMEM;
189 181
190 if (cpu_list) { 182 if (ann->cpu_list) {
191 ret = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap); 183 ret = perf_session__cpu_bitmap(session, ann->cpu_list,
184 ann->cpu_bitmap);
192 if (ret) 185 if (ret)
193 goto out_delete; 186 goto out_delete;
194 } 187 }
195 188
196 ret = perf_session__process_events(session, &event_ops); 189 ret = perf_session__process_events(session, &ann->tool);
197 if (ret) 190 if (ret)
198 goto out_delete; 191 goto out_delete;
199 192
@@ -217,13 +210,12 @@ static int __cmd_annotate(void)
217 total_nr_samples += nr_samples; 210 total_nr_samples += nr_samples;
218 hists__collapse_resort(hists); 211 hists__collapse_resort(hists);
219 hists__output_resort(hists); 212 hists__output_resort(hists);
220 hists__find_annotations(hists, pos->idx, 213 hists__find_annotations(hists, pos->idx, ann);
221 session->evlist->nr_entries);
222 } 214 }
223 } 215 }
224 216
225 if (total_nr_samples == 0) { 217 if (total_nr_samples == 0) {
226 ui__warning("The %s file has no samples!\n", input_name); 218 ui__warning("The %s file has no samples!\n", session->filename);
227 goto out_delete; 219 goto out_delete;
228 } 220 }
229out_delete: 221out_delete:
@@ -247,29 +239,41 @@ static const char * const annotate_usage[] = {
247 NULL 239 NULL
248}; 240};
249 241
250static const struct option options[] = { 242int cmd_annotate(int argc, const char **argv, const char *prefix __used)
251 OPT_STRING('i', "input", &input_name, "file", 243{
244 struct perf_annotate annotate = {
245 .tool = {
246 .sample = process_sample_event,
247 .mmap = perf_event__process_mmap,
248 .comm = perf_event__process_comm,
249 .fork = perf_event__process_task,
250 .ordered_samples = true,
251 .ordering_requires_timestamps = true,
252 },
253 };
254 const struct option options[] = {
255 OPT_STRING('i', "input", &annotate.input_name, "file",
252 "input file name"), 256 "input file name"),
253 OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]", 257 OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
254 "only consider symbols in these dsos"), 258 "only consider symbols in these dsos"),
255 OPT_STRING('s', "symbol", &sym_hist_filter, "symbol", 259 OPT_STRING('s', "symbol", &annotate.sym_hist_filter, "symbol",
256 "symbol to annotate"), 260 "symbol to annotate"),
257 OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), 261 OPT_BOOLEAN('f', "force", &annotate.force, "don't complain, do it"),
258 OPT_INCR('v', "verbose", &verbose, 262 OPT_INCR('v', "verbose", &verbose,
259 "be more verbose (show symbol address, etc)"), 263 "be more verbose (show symbol address, etc)"),
260 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, 264 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
261 "dump raw trace in ASCII"), 265 "dump raw trace in ASCII"),
262 OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"), 266 OPT_BOOLEAN(0, "tui", &annotate.use_tui, "Use the TUI interface"),
263 OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"), 267 OPT_BOOLEAN(0, "stdio", &annotate.use_stdio, "Use the stdio interface"),
264 OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, 268 OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
265 "file", "vmlinux pathname"), 269 "file", "vmlinux pathname"),
266 OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules, 270 OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
267 "load module symbols - WARNING: use only with -k and LIVE kernel"), 271 "load module symbols - WARNING: use only with -k and LIVE kernel"),
268 OPT_BOOLEAN('l', "print-line", &print_line, 272 OPT_BOOLEAN('l', "print-line", &annotate.print_line,
269 "print matching source lines (may be slow)"), 273 "print matching source lines (may be slow)"),
270 OPT_BOOLEAN('P', "full-paths", &full_paths, 274 OPT_BOOLEAN('P', "full-paths", &annotate.full_paths,
271 "Don't shorten the displayed pathnames"), 275 "Don't shorten the displayed pathnames"),
272 OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"), 276 OPT_STRING('C', "cpu", &annotate.cpu_list, "cpu", "list of cpus to profile"),
273 OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", 277 OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
274 "Look for files with symbols relative to this directory"), 278 "Look for files with symbols relative to this directory"),
275 OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src, 279 OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src,
@@ -279,15 +283,13 @@ static const struct option options[] = {
279 OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style", 283 OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style",
280 "Specify disassembler style (e.g. -M intel for intel syntax)"), 284 "Specify disassembler style (e.g. -M intel for intel syntax)"),
281 OPT_END() 285 OPT_END()
282}; 286 };
283 287
284int cmd_annotate(int argc, const char **argv, const char *prefix __used)
285{
286 argc = parse_options(argc, argv, options, annotate_usage, 0); 288 argc = parse_options(argc, argv, options, annotate_usage, 0);
287 289
288 if (use_stdio) 290 if (annotate.use_stdio)
289 use_browser = 0; 291 use_browser = 0;
290 else if (use_tui) 292 else if (annotate.use_tui)
291 use_browser = 1; 293 use_browser = 1;
292 294
293 setup_browser(true); 295 setup_browser(true);
@@ -308,7 +310,7 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used)
308 if (argc > 1) 310 if (argc > 1)
309 usage_with_options(annotate_usage, options); 311 usage_with_options(annotate_usage, options);
310 312
311 sym_hist_filter = argv[0]; 313 annotate.sym_hist_filter = argv[0];
312 } 314 }
313 315
314 if (field_sep && *field_sep == '.') { 316 if (field_sep && *field_sep == '.') {
@@ -316,5 +318,5 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used)
316 return -1; 318 return -1;
317 } 319 }
318 320
319 return __cmd_annotate(); 321 return __cmd_annotate(&annotate);
320} 322}
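
The annotate conversion above is the template the rest of this series follows: the former file-scope globals move into a struct whose first member embeds struct perf_tool, and each callback recovers the outer object with container_of(). A self-contained userspace sketch of that pattern (struct perf_tool reduced to a stand-in):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct perf_tool { int placeholder; }; /* stand-in for the real struct */

    struct perf_annotate {
            struct perf_tool tool;         /* embedded, not a pointer */
            const char *input_name;        /* former global, now per-tool */
    };

    static int process_sample_event(struct perf_tool *tool)
    {
            struct perf_annotate *ann =
                    container_of(tool, struct perf_annotate, tool);

            printf("input: %s\n", ann->input_name);
            return 0;
    }

    int main(void)
    {
            struct perf_annotate ann = { .input_name = "perf.data" };

            /* Only the perf_tool pointer crosses the callback boundary. */
            return process_sample_event(&ann.tool);
    }
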
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c
index cb690a65bf02..52480467e9ff 100644
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@@ -18,7 +18,7 @@
18 18
19#include <libelf.h> 19#include <libelf.h>
20 20
21static char const *input_name = "perf.data"; 21static const char *input_name;
22static bool force; 22static bool force;
23static bool show_kernel; 23static bool show_kernel;
24static bool with_hits; 24static bool with_hits;
@@ -39,24 +39,6 @@ static const struct option options[] = {
39 OPT_END() 39 OPT_END()
40}; 40};
41 41
42static int perf_session__list_build_ids(void)
43{
44 struct perf_session *session;
45
46 session = perf_session__new(input_name, O_RDONLY, force, false,
47 &build_id__mark_dso_hit_ops);
48 if (session == NULL)
49 return -1;
50
51 if (with_hits)
52 perf_session__process_events(session, &build_id__mark_dso_hit_ops);
53
54 perf_session__fprintf_dsos_buildid(session, stdout, with_hits);
55
56 perf_session__delete(session);
57 return 0;
58}
59
60static int sysfs__fprintf_build_id(FILE *fp) 42static int sysfs__fprintf_build_id(FILE *fp)
61{ 43{
62 u8 kallsyms_build_id[BUILD_ID_SIZE]; 44 u8 kallsyms_build_id[BUILD_ID_SIZE];
@@ -85,17 +67,36 @@ static int filename__fprintf_build_id(const char *name, FILE *fp)
85 return fprintf(fp, "%s\n", sbuild_id); 67 return fprintf(fp, "%s\n", sbuild_id);
86} 68}
87 69
88static int __cmd_buildid_list(void) 70static int perf_session__list_build_ids(void)
89{ 71{
90 if (show_kernel) 72 struct perf_session *session;
91 return sysfs__fprintf_build_id(stdout);
92 73
93 elf_version(EV_CURRENT); 74 elf_version(EV_CURRENT);
75
76 session = perf_session__new(input_name, O_RDONLY, force, false,
77 &build_id__mark_dso_hit_ops);
78 if (session == NULL)
79 return -1;
80
94 /* 81 /*
95 * See if this is an ELF file first: 82 * See if this is an ELF file first:
96 */ 83 */
97 if (filename__fprintf_build_id(input_name, stdout)) 84 if (filename__fprintf_build_id(session->filename, stdout))
98 return 0; 85 goto out;
86
87 if (with_hits)
88 perf_session__process_events(session, &build_id__mark_dso_hit_ops);
89
90 perf_session__fprintf_dsos_buildid(session, stdout, with_hits);
91out:
92 perf_session__delete(session);
93 return 0;
94}
95
96static int __cmd_buildid_list(void)
97{
98 if (show_kernel)
99 return sysfs__fprintf_build_id(stdout);
99 100
100 return perf_session__list_build_ids(); 101 return perf_session__list_build_ids();
101} 102}
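
libelf requires elf_version(EV_CURRENT) to run before any other elf_* function; the hunk hoists that call to the top of perf_session__list_build_ids() so it precedes the filename__fprintf_build_id() ELF probe in the new session-based flow. A minimal sketch of the rule:

    #include <libelf.h>
    #include <stdio.h>

    int main(void)
    {
            /* Must run before every other libelf call. */
            if (elf_version(EV_CURRENT) == EV_NONE) {
                    fprintf(stderr, "libelf initialization failed\n");
                    return 1;
            }
            /* elf_begin()/elf_getehdr()/... are now safe to use. */
            return 0;
    }
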
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index b39f3a1ee7dc..4f19513d7dda 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -9,7 +9,9 @@
9#include "util/debug.h" 9#include "util/debug.h"
10#include "util/event.h" 10#include "util/event.h"
11#include "util/hist.h" 11#include "util/hist.h"
12#include "util/evsel.h"
12#include "util/session.h" 13#include "util/session.h"
14#include "util/tool.h"
13#include "util/sort.h" 15#include "util/sort.h"
14#include "util/symbol.h" 16#include "util/symbol.h"
15#include "util/util.h" 17#include "util/util.h"
@@ -30,14 +32,15 @@ static int hists__add_entry(struct hists *self,
30 return -ENOMEM; 32 return -ENOMEM;
31} 33}
32 34
33static int diff__process_sample_event(union perf_event *event, 35static int diff__process_sample_event(struct perf_tool *tool __used,
36 union perf_event *event,
34 struct perf_sample *sample, 37 struct perf_sample *sample,
35 struct perf_evsel *evsel __used, 38 struct perf_evsel *evsel __used,
36 struct perf_session *session) 39 struct machine *machine)
37{ 40{
38 struct addr_location al; 41 struct addr_location al;
39 42
40 if (perf_event__preprocess_sample(event, session, &al, sample, NULL) < 0) { 43 if (perf_event__preprocess_sample(event, machine, &al, sample, NULL) < 0) {
41 pr_warning("problem processing %d event, skipping it.\n", 44 pr_warning("problem processing %d event, skipping it.\n",
42 event->header.type); 45 event->header.type);
43 return -1; 46 return -1;
@@ -46,16 +49,16 @@ static int diff__process_sample_event(union perf_event *event,
46 if (al.filtered || al.sym == NULL) 49 if (al.filtered || al.sym == NULL)
47 return 0; 50 return 0;
48 51
49 if (hists__add_entry(&session->hists, &al, sample->period)) { 52 if (hists__add_entry(&evsel->hists, &al, sample->period)) {
50 pr_warning("problem incrementing symbol period, skipping event\n"); 53 pr_warning("problem incrementing symbol period, skipping event\n");
51 return -1; 54 return -1;
52 } 55 }
53 56
54 session->hists.stats.total_period += sample->period; 57 evsel->hists.stats.total_period += sample->period;
55 return 0; 58 return 0;
56} 59}
57 60
58static struct perf_event_ops event_ops = { 61static struct perf_tool perf_diff = {
59 .sample = diff__process_sample_event, 62 .sample = diff__process_sample_event,
60 .mmap = perf_event__process_mmap, 63 .mmap = perf_event__process_mmap,
61 .comm = perf_event__process_comm, 64 .comm = perf_event__process_comm,
@@ -145,13 +148,13 @@ static int __cmd_diff(void)
145 int ret, i; 148 int ret, i;
146 struct perf_session *session[2]; 149 struct perf_session *session[2];
147 150
148 session[0] = perf_session__new(input_old, O_RDONLY, force, false, &event_ops); 151 session[0] = perf_session__new(input_old, O_RDONLY, force, false, &perf_diff);
149 session[1] = perf_session__new(input_new, O_RDONLY, force, false, &event_ops); 152 session[1] = perf_session__new(input_new, O_RDONLY, force, false, &perf_diff);
150 if (session[0] == NULL || session[1] == NULL) 153 if (session[0] == NULL || session[1] == NULL)
151 return -ENOMEM; 154 return -ENOMEM;
152 155
153 for (i = 0; i < 2; ++i) { 156 for (i = 0; i < 2; ++i) {
154 ret = perf_session__process_events(session[i], &event_ops); 157 ret = perf_session__process_events(session[i], &perf_diff);
155 if (ret) 158 if (ret)
156 goto out_delete; 159 goto out_delete;
157 } 160 }
diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c
index 4c5e9e04a41f..26760322c4f4 100644
--- a/tools/perf/builtin-evlist.c
+++ b/tools/perf/builtin-evlist.c
@@ -15,7 +15,7 @@
15#include "util/parse-options.h" 15#include "util/parse-options.h"
16#include "util/session.h" 16#include "util/session.h"
17 17
18static char const *input_name = "perf.data"; 18static const char *input_name;
19 19
20static int __cmd_evlist(void) 20static int __cmd_evlist(void)
21{ 21{
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 8dfc12bb119b..09c106193e65 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -9,6 +9,7 @@
9 9
10#include "perf.h" 10#include "perf.h"
11#include "util/session.h" 11#include "util/session.h"
12#include "util/tool.h"
12#include "util/debug.h" 13#include "util/debug.h"
13 14
14#include "util/parse-options.h" 15#include "util/parse-options.h"
@@ -16,8 +17,9 @@
16static char const *input_name = "-"; 17static char const *input_name = "-";
17static bool inject_build_ids; 18static bool inject_build_ids;
18 19
19static int perf_event__repipe_synth(union perf_event *event, 20static int perf_event__repipe_synth(struct perf_tool *tool __used,
20 struct perf_session *session __used) 21 union perf_event *event,
22 struct machine *machine __used)
21{ 23{
22 uint32_t size; 24 uint32_t size;
23 void *buf = event; 25 void *buf = event;
@@ -36,41 +38,70 @@ static int perf_event__repipe_synth(union perf_event *event,
36 return 0; 38 return 0;
37} 39}
38 40
39static int perf_event__repipe(union perf_event *event, 41static int perf_event__repipe_op2_synth(struct perf_tool *tool,
42 union perf_event *event,
43 struct perf_session *session __used)
44{
45 return perf_event__repipe_synth(tool, event, NULL);
46}
47
48static int perf_event__repipe_event_type_synth(struct perf_tool *tool,
49 union perf_event *event)
50{
51 return perf_event__repipe_synth(tool, event, NULL);
52}
53
54static int perf_event__repipe_tracing_data_synth(union perf_event *event,
55 struct perf_session *session __used)
56{
57 return perf_event__repipe_synth(NULL, event, NULL);
58}
59
60static int perf_event__repipe_attr(union perf_event *event,
61 struct perf_evlist **pevlist __used)
62{
63 return perf_event__repipe_synth(NULL, event, NULL);
64}
65
66static int perf_event__repipe(struct perf_tool *tool,
67 union perf_event *event,
40 struct perf_sample *sample __used, 68 struct perf_sample *sample __used,
41 struct perf_session *session) 69 struct machine *machine)
42{ 70{
43 return perf_event__repipe_synth(event, session); 71 return perf_event__repipe_synth(tool, event, machine);
44} 72}
45 73
46static int perf_event__repipe_sample(union perf_event *event, 74static int perf_event__repipe_sample(struct perf_tool *tool,
75 union perf_event *event,
47 struct perf_sample *sample __used, 76 struct perf_sample *sample __used,
48 struct perf_evsel *evsel __used, 77 struct perf_evsel *evsel __used,
49 struct perf_session *session) 78 struct machine *machine)
50{ 79{
51 return perf_event__repipe_synth(event, session); 80 return perf_event__repipe_synth(tool, event, machine);
52} 81}
53 82
54static int perf_event__repipe_mmap(union perf_event *event, 83static int perf_event__repipe_mmap(struct perf_tool *tool,
84 union perf_event *event,
55 struct perf_sample *sample, 85 struct perf_sample *sample,
56 struct perf_session *session) 86 struct machine *machine)
57{ 87{
58 int err; 88 int err;
59 89
60 err = perf_event__process_mmap(event, sample, session); 90 err = perf_event__process_mmap(tool, event, sample, machine);
61 perf_event__repipe(event, sample, session); 91 perf_event__repipe(tool, event, sample, machine);
62 92
63 return err; 93 return err;
64} 94}
65 95
66static int perf_event__repipe_task(union perf_event *event, 96static int perf_event__repipe_task(struct perf_tool *tool,
97 union perf_event *event,
67 struct perf_sample *sample, 98 struct perf_sample *sample,
68 struct perf_session *session) 99 struct machine *machine)
69{ 100{
70 int err; 101 int err;
71 102
72 err = perf_event__process_task(event, sample, session); 103 err = perf_event__process_task(tool, event, sample, machine);
73 perf_event__repipe(event, sample, session); 104 perf_event__repipe(tool, event, sample, machine);
74 105
75 return err; 106 return err;
76} 107}
@@ -80,7 +111,7 @@ static int perf_event__repipe_tracing_data(union perf_event *event,
80{ 111{
81 int err; 112 int err;
82 113
83 perf_event__repipe_synth(event, session); 114 perf_event__repipe_synth(NULL, event, NULL);
84 err = perf_event__process_tracing_data(event, session); 115 err = perf_event__process_tracing_data(event, session);
85 116
86 return err; 117 return err;
@@ -100,10 +131,10 @@ static int dso__read_build_id(struct dso *self)
100 return -1; 131 return -1;
101} 132}
102 133
103static int dso__inject_build_id(struct dso *self, struct perf_session *session) 134static int dso__inject_build_id(struct dso *self, struct perf_tool *tool,
135 struct machine *machine)
104{ 136{
105 u16 misc = PERF_RECORD_MISC_USER; 137 u16 misc = PERF_RECORD_MISC_USER;
106 struct machine *machine;
107 int err; 138 int err;
108 139
109 if (dso__read_build_id(self) < 0) { 140 if (dso__read_build_id(self) < 0) {
@@ -111,17 +142,11 @@ static int dso__inject_build_id(struct dso *self, struct perf_session *session)
111 return -1; 142 return -1;
112 } 143 }
113 144
114 machine = perf_session__find_host_machine(session);
115 if (machine == NULL) {
116 pr_err("Can't find machine for session\n");
117 return -1;
118 }
119
120 if (self->kernel) 145 if (self->kernel)
121 misc = PERF_RECORD_MISC_KERNEL; 146 misc = PERF_RECORD_MISC_KERNEL;
122 147
123 err = perf_event__synthesize_build_id(self, misc, perf_event__repipe, 148 err = perf_event__synthesize_build_id(tool, self, misc, perf_event__repipe,
124 machine, session); 149 machine);
125 if (err) { 150 if (err) {
126 pr_err("Can't synthesize build_id event for %s\n", self->long_name); 151 pr_err("Can't synthesize build_id event for %s\n", self->long_name);
127 return -1; 152 return -1;
@@ -130,10 +155,11 @@ static int dso__inject_build_id(struct dso *self, struct perf_session *session)
130 return 0; 155 return 0;
131} 156}
132 157
133static int perf_event__inject_buildid(union perf_event *event, 158static int perf_event__inject_buildid(struct perf_tool *tool,
159 union perf_event *event,
134 struct perf_sample *sample, 160 struct perf_sample *sample,
135 struct perf_evsel *evsel __used, 161 struct perf_evsel *evsel __used,
136 struct perf_session *session) 162 struct machine *machine)
137{ 163{
138 struct addr_location al; 164 struct addr_location al;
139 struct thread *thread; 165 struct thread *thread;
@@ -141,21 +167,21 @@ static int perf_event__inject_buildid(union perf_event *event,
141 167
142 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; 168 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
143 169
144 thread = perf_session__findnew(session, event->ip.pid); 170 thread = machine__findnew_thread(machine, event->ip.pid);
145 if (thread == NULL) { 171 if (thread == NULL) {
146 pr_err("problem processing %d event, skipping it.\n", 172 pr_err("problem processing %d event, skipping it.\n",
147 event->header.type); 173 event->header.type);
148 goto repipe; 174 goto repipe;
149 } 175 }
150 176
151 thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION, 177 thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION,
152 event->ip.pid, event->ip.ip, &al); 178 event->ip.ip, &al);
153 179
154 if (al.map != NULL) { 180 if (al.map != NULL) {
155 if (!al.map->dso->hit) { 181 if (!al.map->dso->hit) {
156 al.map->dso->hit = 1; 182 al.map->dso->hit = 1;
157 if (map__load(al.map, NULL) >= 0) { 183 if (map__load(al.map, NULL) >= 0) {
158 dso__inject_build_id(al.map->dso, session); 184 dso__inject_build_id(al.map->dso, tool, machine);
159 /* 185 /*
160 * If this fails, too bad, let the other side 186 * If this fails, too bad, let the other side
161 * account this as unresolved. 187 * account this as unresolved.
@@ -168,24 +194,24 @@ static int perf_event__inject_buildid(union perf_event *event,
168 } 194 }
169 195
170repipe: 196repipe:
171 perf_event__repipe(event, sample, session); 197 perf_event__repipe(tool, event, sample, machine);
172 return 0; 198 return 0;
173} 199}
174 200
175struct perf_event_ops inject_ops = { 201struct perf_tool perf_inject = {
176 .sample = perf_event__repipe_sample, 202 .sample = perf_event__repipe_sample,
177 .mmap = perf_event__repipe, 203 .mmap = perf_event__repipe,
178 .comm = perf_event__repipe, 204 .comm = perf_event__repipe,
179 .fork = perf_event__repipe, 205 .fork = perf_event__repipe,
180 .exit = perf_event__repipe, 206 .exit = perf_event__repipe,
181 .lost = perf_event__repipe, 207 .lost = perf_event__repipe,
182 .read = perf_event__repipe, 208 .read = perf_event__repipe_sample,
183 .throttle = perf_event__repipe, 209 .throttle = perf_event__repipe,
184 .unthrottle = perf_event__repipe, 210 .unthrottle = perf_event__repipe,
185 .attr = perf_event__repipe_synth, 211 .attr = perf_event__repipe_attr,
186 .event_type = perf_event__repipe_synth, 212 .event_type = perf_event__repipe_event_type_synth,
187 .tracing_data = perf_event__repipe_synth, 213 .tracing_data = perf_event__repipe_tracing_data_synth,
188 .build_id = perf_event__repipe_synth, 214 .build_id = perf_event__repipe_op2_synth,
189}; 215};
190 216
191extern volatile int session_done; 217extern volatile int session_done;
@@ -203,17 +229,17 @@ static int __cmd_inject(void)
203 signal(SIGINT, sig_handler); 229 signal(SIGINT, sig_handler);
204 230
205 if (inject_build_ids) { 231 if (inject_build_ids) {
206 inject_ops.sample = perf_event__inject_buildid; 232 perf_inject.sample = perf_event__inject_buildid;
207 inject_ops.mmap = perf_event__repipe_mmap; 233 perf_inject.mmap = perf_event__repipe_mmap;
208 inject_ops.fork = perf_event__repipe_task; 234 perf_inject.fork = perf_event__repipe_task;
209 inject_ops.tracing_data = perf_event__repipe_tracing_data; 235 perf_inject.tracing_data = perf_event__repipe_tracing_data;
210 } 236 }
211 237
212 session = perf_session__new(input_name, O_RDONLY, false, true, &inject_ops); 238 session = perf_session__new(input_name, O_RDONLY, false, true, &perf_inject);
213 if (session == NULL) 239 if (session == NULL)
214 return -ENOMEM; 240 return -ENOMEM;
215 241
216 ret = perf_session__process_events(session, &inject_ops); 242 ret = perf_session__process_events(session, &perf_inject);
217 243
218 perf_session__delete(session); 244 perf_session__delete(session);
219 245
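
All of the inject repipe variants funnel into perf_event__repipe_synth(), whose job is simply to write the raw event bytes back out, short writes included. A reduced userspace sketch of that core loop (the tool itself die()s on error rather than returning):

    #include <stdint.h>
    #include <unistd.h>

    /* Repipe an event verbatim; size comes from event->header.size. */
    static int repipe_bytes(const void *event, uint32_t size)
    {
            const char *buf = event;

            while (size) {
                    ssize_t ret = write(STDOUT_FILENO, buf, size);

                    if (ret < 0)
                            return -1;
                    buf += ret;
                    size -= ret;
            }
            return 0;
    }
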
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 225e963df105..fe1ad8f21961 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -7,6 +7,7 @@
7#include "util/thread.h" 7#include "util/thread.h"
8#include "util/header.h" 8#include "util/header.h"
9#include "util/session.h" 9#include "util/session.h"
10#include "util/tool.h"
10 11
11#include "util/parse-options.h" 12#include "util/parse-options.h"
12#include "util/trace-event.h" 13#include "util/trace-event.h"
@@ -18,7 +19,7 @@
18struct alloc_stat; 19struct alloc_stat;
19typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *); 20typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);
20 21
21static char const *input_name = "perf.data"; 22static const char *input_name;
22 23
23static int alloc_flag; 24static int alloc_flag;
24static int caller_flag; 25static int caller_flag;
@@ -303,12 +304,13 @@ static void process_raw_event(union perf_event *raw_event __used, void *data,
303 } 304 }
304} 305}
305 306
306static int process_sample_event(union perf_event *event, 307static int process_sample_event(struct perf_tool *tool __used,
308 union perf_event *event,
307 struct perf_sample *sample, 309 struct perf_sample *sample,
308 struct perf_evsel *evsel __used, 310 struct perf_evsel *evsel __used,
309 struct perf_session *session) 311 struct machine *machine)
310{ 312{
311 struct thread *thread = perf_session__findnew(session, event->ip.pid); 313 struct thread *thread = machine__findnew_thread(machine, event->ip.pid);
312 314
313 if (thread == NULL) { 315 if (thread == NULL) {
314 pr_debug("problem processing %d event, skipping it.\n", 316 pr_debug("problem processing %d event, skipping it.\n",
@@ -324,7 +326,7 @@ static int process_sample_event(union perf_event *event,
324 return 0; 326 return 0;
325} 327}
326 328
327static struct perf_event_ops event_ops = { 329static struct perf_tool perf_kmem = {
328 .sample = process_sample_event, 330 .sample = process_sample_event,
329 .comm = perf_event__process_comm, 331 .comm = perf_event__process_comm,
330 .ordered_samples = true, 332 .ordered_samples = true,
@@ -483,7 +485,7 @@ static int __cmd_kmem(void)
483{ 485{
484 int err = -EINVAL; 486 int err = -EINVAL;
485 struct perf_session *session = perf_session__new(input_name, O_RDONLY, 487 struct perf_session *session = perf_session__new(input_name, O_RDONLY,
486 0, false, &event_ops); 488 0, false, &perf_kmem);
487 if (session == NULL) 489 if (session == NULL)
488 return -ENOMEM; 490 return -ENOMEM;
489 491
@@ -494,7 +496,7 @@ static int __cmd_kmem(void)
494 goto out_delete; 496 goto out_delete;
495 497
496 setup_pager(); 498 setup_pager();
497 err = perf_session__process_events(session, &event_ops); 499 err = perf_session__process_events(session, &perf_kmem);
498 if (err != 0) 500 if (err != 0)
499 goto out_delete; 501 goto out_delete;
500 sort_result(); 502 sort_result();
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index 34d1e853829d..032324a76b87 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -38,7 +38,7 @@ static const struct option kvm_options[] = {
38 OPT_BOOLEAN(0, "guest", &perf_guest, 38 OPT_BOOLEAN(0, "guest", &perf_guest,
39 "Collect guest os data"), 39 "Collect guest os data"),
40 OPT_BOOLEAN(0, "host", &perf_host, 40 OPT_BOOLEAN(0, "host", &perf_host,
41 "Collect guest os data"), 41 "Collect host os data"),
42 OPT_STRING(0, "guestmount", &symbol_conf.guestmount, "directory", 42 OPT_STRING(0, "guestmount", &symbol_conf.guestmount, "directory",
43 "guest mount directory under which every guest os" 43 "guest mount directory under which every guest os"
44 " instance has a subdir"), 44 " instance has a subdir"),
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 899080ace267..2296c391d0f5 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -12,6 +12,7 @@
12 12
13#include "util/debug.h" 13#include "util/debug.h"
14#include "util/session.h" 14#include "util/session.h"
15#include "util/tool.h"
15 16
16#include <sys/types.h> 17#include <sys/types.h>
17#include <sys/prctl.h> 18#include <sys/prctl.h>
@@ -325,7 +326,7 @@ alloc_failed:
325 die("memory allocation failed\n"); 326 die("memory allocation failed\n");
326} 327}
327 328
328static char const *input_name = "perf.data"; 329static const char *input_name;
329 330
330struct raw_event_sample { 331struct raw_event_sample {
331 u32 size; 332 u32 size;
@@ -845,12 +846,13 @@ static void dump_info(void)
845 die("Unknown type of information\n"); 846 die("Unknown type of information\n");
846} 847}
847 848
848static int process_sample_event(union perf_event *event, 849static int process_sample_event(struct perf_tool *tool __used,
850 union perf_event *event,
849 struct perf_sample *sample, 851 struct perf_sample *sample,
850 struct perf_evsel *evsel __used, 852 struct perf_evsel *evsel __used,
851 struct perf_session *s) 853 struct machine *machine)
852{ 854{
853 struct thread *thread = perf_session__findnew(s, sample->tid); 855 struct thread *thread = machine__findnew_thread(machine, sample->tid);
854 856
855 if (thread == NULL) { 857 if (thread == NULL) {
856 pr_debug("problem processing %d event, skipping it.\n", 858 pr_debug("problem processing %d event, skipping it.\n",
@@ -863,7 +865,7 @@ static int process_sample_event(union perf_event *event,
863 return 0; 865 return 0;
864} 866}
865 867
866static struct perf_event_ops eops = { 868static struct perf_tool eops = {
867 .sample = process_sample_event, 869 .sample = process_sample_event,
868 .comm = perf_event__process_comm, 870 .comm = perf_event__process_comm,
869 .ordered_samples = true, 871 .ordered_samples = true,
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index 710ae3d0a489..59d43abfbfec 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -46,7 +46,6 @@
46 46
47#define DEFAULT_VAR_FILTER "!__k???tab_* & !__crc_*" 47#define DEFAULT_VAR_FILTER "!__k???tab_* & !__crc_*"
48#define DEFAULT_FUNC_FILTER "!_*" 48#define DEFAULT_FUNC_FILTER "!_*"
49#define MAX_PATH_LEN 256
50 49
51/* Session management structure */ 50/* Session management structure */
52static struct { 51static struct {
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 6ab58cc99d53..0abfb18b911f 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -22,6 +22,7 @@
22#include "util/evsel.h" 22#include "util/evsel.h"
23#include "util/debug.h" 23#include "util/debug.h"
24#include "util/session.h" 24#include "util/session.h"
25#include "util/tool.h"
25#include "util/symbol.h" 26#include "util/symbol.h"
26#include "util/cpumap.h" 27#include "util/cpumap.h"
27#include "util/thread_map.h" 28#include "util/thread_map.h"
@@ -35,55 +36,36 @@ enum write_mode_t {
35 WRITE_APPEND 36 WRITE_APPEND
36}; 37};
37 38
38static u64 user_interval = ULLONG_MAX; 39struct perf_record {
39static u64 default_interval = 0; 40 struct perf_tool tool;
40 41 struct perf_record_opts opts;
41static unsigned int page_size; 42 u64 bytes_written;
42static unsigned int mmap_pages = UINT_MAX; 43 const char *output_name;
43static unsigned int user_freq = UINT_MAX; 44 struct perf_evlist *evlist;
44static int freq = 1000; 45 struct perf_session *session;
45static int output; 46 const char *progname;
46static int pipe_output = 0; 47 int output;
47static const char *output_name = NULL; 48 unsigned int page_size;
48static bool group = false; 49 int realtime_prio;
49static int realtime_prio = 0; 50 enum write_mode_t write_mode;
50static bool nodelay = false; 51 bool no_buildid;
51static bool raw_samples = false; 52 bool no_buildid_cache;
52static bool sample_id_all_avail = true; 53 bool force;
53static bool system_wide = false; 54 bool file_new;
54static pid_t target_pid = -1; 55 bool append_file;
55static pid_t target_tid = -1; 56 long samples;
56static pid_t child_pid = -1; 57 off_t post_processing_offset;
57static bool no_inherit = false; 58};
58static enum write_mode_t write_mode = WRITE_FORCE; 59
59static bool call_graph = false; 60static void advance_output(struct perf_record *rec, size_t size)
60static bool inherit_stat = false;
61static bool no_samples = false;
62static bool sample_address = false;
63static bool sample_time = false;
64static bool no_buildid = false;
65static bool no_buildid_cache = false;
66static struct perf_evlist *evsel_list;
67
68static long samples = 0;
69static u64 bytes_written = 0;
70
71static int file_new = 1;
72static off_t post_processing_offset;
73
74static struct perf_session *session;
75static const char *cpu_list;
76static const char *progname;
77
78static void advance_output(size_t size)
79{ 61{
80 bytes_written += size; 62 rec->bytes_written += size;
81} 63}
82 64
83static void write_output(void *buf, size_t size) 65static void write_output(struct perf_record *rec, void *buf, size_t size)
84{ 66{
85 while (size) { 67 while (size) {
86 int ret = write(output, buf, size); 68 int ret = write(rec->output, buf, size);
87 69
88 if (ret < 0) 70 if (ret < 0)
89 die("failed to write"); 71 die("failed to write");
@@ -91,30 +73,33 @@ static void write_output(void *buf, size_t size)
91 size -= ret; 73 size -= ret;
92 buf += ret; 74 buf += ret;
93 75
94 bytes_written += ret; 76 rec->bytes_written += ret;
95 } 77 }
96} 78}
97 79
98static int process_synthesized_event(union perf_event *event, 80static int process_synthesized_event(struct perf_tool *tool,
81 union perf_event *event,
99 struct perf_sample *sample __used, 82 struct perf_sample *sample __used,
100 struct perf_session *self __used) 83 struct machine *machine __used)
101{ 84{
102 write_output(event, event->header.size); 85 struct perf_record *rec = container_of(tool, struct perf_record, tool);
86 write_output(rec, event, event->header.size);
103 return 0; 87 return 0;
104} 88}
105 89
106static void mmap_read(struct perf_mmap *md) 90static void perf_record__mmap_read(struct perf_record *rec,
91 struct perf_mmap *md)
107{ 92{
108 unsigned int head = perf_mmap__read_head(md); 93 unsigned int head = perf_mmap__read_head(md);
109 unsigned int old = md->prev; 94 unsigned int old = md->prev;
110 unsigned char *data = md->base + page_size; 95 unsigned char *data = md->base + rec->page_size;
111 unsigned long size; 96 unsigned long size;
112 void *buf; 97 void *buf;
113 98
114 if (old == head) 99 if (old == head)
115 return; 100 return;
116 101
117 samples++; 102 rec->samples++;
118 103
119 size = head - old; 104 size = head - old;
120 105
@@ -123,14 +108,14 @@ static void mmap_read(struct perf_mmap *md)
123 size = md->mask + 1 - (old & md->mask); 108 size = md->mask + 1 - (old & md->mask);
124 old += size; 109 old += size;
125 110
126 write_output(buf, size); 111 write_output(rec, buf, size);
127 } 112 }
128 113
129 buf = &data[old & md->mask]; 114 buf = &data[old & md->mask];
130 size = head - old; 115 size = head - old;
131 old += size; 116 old += size;
132 117
133 write_output(buf, size); 118 write_output(rec, buf, size);
134 119
135 md->prev = old; 120 md->prev = old;
136 perf_mmap__write_tail(md, old); 121 perf_mmap__write_tail(md, old);
@@ -149,17 +134,18 @@ static void sig_handler(int sig)
149 signr = sig; 134 signr = sig;
150} 135}
151 136
152static void sig_atexit(void) 137static void perf_record__sig_exit(int exit_status __used, void *arg)
153{ 138{
139 struct perf_record *rec = arg;
154 int status; 140 int status;
155 141
156 if (child_pid > 0) { 142 if (rec->evlist->workload.pid > 0) {
157 if (!child_finished) 143 if (!child_finished)
158 kill(child_pid, SIGTERM); 144 kill(rec->evlist->workload.pid, SIGTERM);
159 145
160 wait(&status); 146 wait(&status);
161 if (WIFSIGNALED(status)) 147 if (WIFSIGNALED(status))
162 psignal(WTERMSIG(status), progname); 148 psignal(WTERMSIG(status), rec->progname);
163 } 149 }
164 150
165 if (signr == -1 || signr == SIGUSR1) 151 if (signr == -1 || signr == SIGUSR1)
@@ -169,78 +155,6 @@ static void sig_atexit(void)
169 kill(getpid(), signr); 155 kill(getpid(), signr);
170} 156}
171 157
172static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
173{
174 struct perf_event_attr *attr = &evsel->attr;
175 int track = !evsel->idx; /* only the first counter needs these */
176
177 attr->disabled = 1;
178 attr->inherit = !no_inherit;
179 attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
180 PERF_FORMAT_TOTAL_TIME_RUNNING |
181 PERF_FORMAT_ID;
182
183 attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
184
185 if (evlist->nr_entries > 1)
186 attr->sample_type |= PERF_SAMPLE_ID;
187
188 /*
189 * We default some events to a 1 default interval. But keep
190 * it a weak assumption overridable by the user.
191 */
192 if (!attr->sample_period || (user_freq != UINT_MAX &&
193 user_interval != ULLONG_MAX)) {
194 if (freq) {
195 attr->sample_type |= PERF_SAMPLE_PERIOD;
196 attr->freq = 1;
197 attr->sample_freq = freq;
198 } else {
199 attr->sample_period = default_interval;
200 }
201 }
202
203 if (no_samples)
204 attr->sample_freq = 0;
205
206 if (inherit_stat)
207 attr->inherit_stat = 1;
208
209 if (sample_address) {
210 attr->sample_type |= PERF_SAMPLE_ADDR;
211 attr->mmap_data = track;
212 }
213
214 if (call_graph)
215 attr->sample_type |= PERF_SAMPLE_CALLCHAIN;
216
217 if (system_wide)
218 attr->sample_type |= PERF_SAMPLE_CPU;
219
220 if (sample_id_all_avail &&
221 (sample_time || system_wide || !no_inherit || cpu_list))
222 attr->sample_type |= PERF_SAMPLE_TIME;
223
224 if (raw_samples) {
225 attr->sample_type |= PERF_SAMPLE_TIME;
226 attr->sample_type |= PERF_SAMPLE_RAW;
227 attr->sample_type |= PERF_SAMPLE_CPU;
228 }
229
230 if (nodelay) {
231 attr->watermark = 0;
232 attr->wakeup_events = 1;
233 }
234
235 attr->mmap = track;
236 attr->comm = track;
237
238 if (target_pid == -1 && target_tid == -1 && !system_wide) {
239 attr->disabled = 1;
240 attr->enable_on_exec = 1;
241 }
242}
243
244static bool perf_evlist__equal(struct perf_evlist *evlist, 158static bool perf_evlist__equal(struct perf_evlist *evlist,
245 struct perf_evlist *other) 159 struct perf_evlist *other)
246{ 160{
@@ -260,15 +174,17 @@ static bool perf_evlist__equal(struct perf_evlist *evlist,
260 return true; 174 return true;
261} 175}
262 176
263static void open_counters(struct perf_evlist *evlist) 177static void perf_record__open(struct perf_record *rec)
264{ 178{
265 struct perf_evsel *pos, *first; 179 struct perf_evsel *pos, *first;
266 180 struct perf_evlist *evlist = rec->evlist;
267 if (evlist->cpus->map[0] < 0) 181 struct perf_session *session = rec->session;
268 no_inherit = true; 182 struct perf_record_opts *opts = &rec->opts;
269 183
270 first = list_entry(evlist->entries.next, struct perf_evsel, node); 184 first = list_entry(evlist->entries.next, struct perf_evsel, node);
271 185
186 perf_evlist__config_attrs(evlist, opts);
187
272 list_for_each_entry(pos, &evlist->entries, node) { 188 list_for_each_entry(pos, &evlist->entries, node) {
273 struct perf_event_attr *attr = &pos->attr; 189 struct perf_event_attr *attr = &pos->attr;
274 struct xyarray *group_fd = NULL; 190 struct xyarray *group_fd = NULL;
@@ -286,29 +202,27 @@ static void open_counters(struct perf_evlist *evlist)
286 */ 202 */
287 bool time_needed = attr->sample_type & PERF_SAMPLE_TIME; 203 bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
288 204
289 if (group && pos != first) 205 if (opts->group && pos != first)
290 group_fd = first->fd; 206 group_fd = first->fd;
291
292 config_attr(pos, evlist);
293retry_sample_id: 207retry_sample_id:
294 attr->sample_id_all = sample_id_all_avail ? 1 : 0; 208 attr->sample_id_all = opts->sample_id_all_avail ? 1 : 0;
295try_again: 209try_again:
296 if (perf_evsel__open(pos, evlist->cpus, evlist->threads, group, 210 if (perf_evsel__open(pos, evlist->cpus, evlist->threads,
297 group_fd) < 0) { 211 opts->group, group_fd) < 0) {
298 int err = errno; 212 int err = errno;
299 213
300 if (err == EPERM || err == EACCES) { 214 if (err == EPERM || err == EACCES) {
301 ui__error_paranoid(); 215 ui__error_paranoid();
302 exit(EXIT_FAILURE); 216 exit(EXIT_FAILURE);
303 } else if (err == ENODEV && cpu_list) { 217 } else if (err == ENODEV && opts->cpu_list) {
304 die("No such device - did you specify" 218 die("No such device - did you specify"
305 " an out-of-range profile CPU?\n"); 219 " an out-of-range profile CPU?\n");
306 } else if (err == EINVAL && sample_id_all_avail) { 220 } else if (err == EINVAL && opts->sample_id_all_avail) {
307 /* 221 /*
308 * Old kernel, no attr->sample_id_type_all field 222 * Old kernel, no attr->sample_id_type_all field
309 */ 223 */
310 sample_id_all_avail = false; 224 opts->sample_id_all_avail = false;
311 if (!sample_time && !raw_samples && !time_needed) 225 if (!opts->sample_time && !opts->raw_samples && !time_needed)
312 attr->sample_type &= ~PERF_SAMPLE_TIME; 226 attr->sample_type &= ~PERF_SAMPLE_TIME;
313 227
314 goto retry_sample_id; 228 goto retry_sample_id;
@@ -358,10 +272,20 @@ try_again:
358 exit(-1); 272 exit(-1);
359 } 273 }
360 274
361 if (perf_evlist__mmap(evlist, mmap_pages, false) < 0) 275 if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
276 if (errno == EPERM)
277 die("Permission error mapping pages.\n"
278 "Consider increasing "
279 "/proc/sys/kernel/perf_event_mlock_kb,\n"
280 "or try again with a smaller value of -m/--mmap_pages.\n"
281 "(current value: %d)\n", opts->mmap_pages);
282 else if (!is_power_of_2(opts->mmap_pages))
283 die("--mmap_pages/-m value must be a power of two.");
284
362 die("failed to mmap with %d (%s)\n", errno, strerror(errno)); 285 die("failed to mmap with %d (%s)\n", errno, strerror(errno));
286 }
363 287
364 if (file_new) 288 if (rec->file_new)
365 session->evlist = evlist; 289 session->evlist = evlist;
366 else { 290 else {
367 if (!perf_evlist__equal(session->evlist, evlist)) { 291 if (!perf_evlist__equal(session->evlist, evlist)) {
@@ -373,29 +297,32 @@ try_again:
373 perf_session__update_sample_type(session); 297 perf_session__update_sample_type(session);
374} 298}
375 299
376static int process_buildids(void) 300static int process_buildids(struct perf_record *rec)
377{ 301{
378 u64 size = lseek(output, 0, SEEK_CUR); 302 u64 size = lseek(rec->output, 0, SEEK_CUR);
379 303
380 if (size == 0) 304 if (size == 0)
381 return 0; 305 return 0;
382 306
383 session->fd = output; 307 rec->session->fd = rec->output;
384 return __perf_session__process_events(session, post_processing_offset, 308 return __perf_session__process_events(rec->session, rec->post_processing_offset,
385 size - post_processing_offset, 309 size - rec->post_processing_offset,
386 size, &build_id__mark_dso_hit_ops); 310 size, &build_id__mark_dso_hit_ops);
387} 311}
388 312
389static void atexit_header(void) 313static void perf_record__exit(int status __used, void *arg)
390{ 314{
391 if (!pipe_output) { 315 struct perf_record *rec = arg;
392 session->header.data_size += bytes_written; 316
393 317 if (!rec->opts.pipe_output) {
394 if (!no_buildid) 318 rec->session->header.data_size += rec->bytes_written;
395 process_buildids(); 319
396 perf_session__write_header(session, evsel_list, output, true); 320 if (!rec->no_buildid)
397 perf_session__delete(session); 321 process_buildids(rec);
398 perf_evlist__delete(evsel_list); 322 perf_session__write_header(rec->session, rec->evlist,
323 rec->output, true);
324 perf_session__delete(rec->session);
325 perf_evlist__delete(rec->evlist);
399 symbol__exit(); 326 symbol__exit();
400 } 327 }
401} 328}
@@ -403,7 +330,7 @@ static void atexit_header(void)
403static void perf_event__synthesize_guest_os(struct machine *machine, void *data) 330static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
404{ 331{
405 int err; 332 int err;
406 struct perf_session *psession = data; 333 struct perf_tool *tool = data;
407 334
408 if (machine__is_host(machine)) 335 if (machine__is_host(machine))
409 return; 336 return;
@@ -416,8 +343,8 @@ static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
416 *method is used to avoid symbol missing when the first addr is 343 *method is used to avoid symbol missing when the first addr is
417 *in module instead of in guest kernel. 344 *in module instead of in guest kernel.
418 */ 345 */
419 err = perf_event__synthesize_modules(process_synthesized_event, 346 err = perf_event__synthesize_modules(tool, process_synthesized_event,
420 psession, machine); 347 machine);
421 if (err < 0) 348 if (err < 0)
422 pr_err("Couldn't record guest kernel [%d]'s reference" 349 pr_err("Couldn't record guest kernel [%d]'s reference"
423 " relocation symbol.\n", machine->pid); 350 " relocation symbol.\n", machine->pid);
@@ -426,12 +353,11 @@ static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
426 * We use _stext for guest kernel because guest kernel's /proc/kallsyms 353 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
427 * have no _text sometimes. 354 * have no _text sometimes.
428 */ 355 */
429 err = perf_event__synthesize_kernel_mmap(process_synthesized_event, 356 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
430 psession, machine, "_text"); 357 machine, "_text");
431 if (err < 0) 358 if (err < 0)
432 err = perf_event__synthesize_kernel_mmap(process_synthesized_event, 359 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
433 psession, machine, 360 machine, "_stext");
434 "_stext");
435 if (err < 0) 361 if (err < 0)
436 pr_err("Couldn't record guest kernel [%d]'s reference" 362 pr_err("Couldn't record guest kernel [%d]'s reference"
437 " relocation symbol.\n", machine->pid); 363 " relocation symbol.\n", machine->pid);
@@ -442,73 +368,71 @@ static struct perf_event_header finished_round_event = {
442 .type = PERF_RECORD_FINISHED_ROUND, 368 .type = PERF_RECORD_FINISHED_ROUND,
443}; 369};
444 370
445static void mmap_read_all(void) 371static void perf_record__mmap_read_all(struct perf_record *rec)
446{ 372{
447 int i; 373 int i;
448 374
449 for (i = 0; i < evsel_list->nr_mmaps; i++) { 375 for (i = 0; i < rec->evlist->nr_mmaps; i++) {
450 if (evsel_list->mmap[i].base) 376 if (rec->evlist->mmap[i].base)
451 mmap_read(&evsel_list->mmap[i]); 377 perf_record__mmap_read(rec, &rec->evlist->mmap[i]);
452 } 378 }
453 379
454 if (perf_header__has_feat(&session->header, HEADER_TRACE_INFO)) 380 if (perf_header__has_feat(&rec->session->header, HEADER_TRACE_INFO))
455 write_output(&finished_round_event, sizeof(finished_round_event)); 381 write_output(rec, &finished_round_event, sizeof(finished_round_event));
456} 382}
457 383
458static int __cmd_record(int argc, const char **argv) 384static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
459{ 385{
460 struct stat st; 386 struct stat st;
461 int flags; 387 int flags;
462 int err; 388 int err, output;
463 unsigned long waking = 0; 389 unsigned long waking = 0;
464 int child_ready_pipe[2], go_pipe[2];
465 const bool forks = argc > 0; 390 const bool forks = argc > 0;
466 char buf;
467 struct machine *machine; 391 struct machine *machine;
392 struct perf_tool *tool = &rec->tool;
393 struct perf_record_opts *opts = &rec->opts;
394 struct perf_evlist *evsel_list = rec->evlist;
395 const char *output_name = rec->output_name;
396 struct perf_session *session;
468 397
469 progname = argv[0]; 398 rec->progname = argv[0];
470 399
471 page_size = sysconf(_SC_PAGE_SIZE); 400 rec->page_size = sysconf(_SC_PAGE_SIZE);
472 401
473 atexit(sig_atexit); 402 on_exit(perf_record__sig_exit, rec);
474 signal(SIGCHLD, sig_handler); 403 signal(SIGCHLD, sig_handler);
475 signal(SIGINT, sig_handler); 404 signal(SIGINT, sig_handler);
476 signal(SIGUSR1, sig_handler); 405 signal(SIGUSR1, sig_handler);
477 406
478 if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
479 perror("failed to create pipes");
480 exit(-1);
481 }
482
483 if (!output_name) { 407 if (!output_name) {
484 if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode)) 408 if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
485 pipe_output = 1; 409 opts->pipe_output = true;
486 else 410 else
487 output_name = "perf.data"; 411 rec->output_name = output_name = "perf.data";
488 } 412 }
489 if (output_name) { 413 if (output_name) {
490 if (!strcmp(output_name, "-")) 414 if (!strcmp(output_name, "-"))
491 pipe_output = 1; 415 opts->pipe_output = true;
492 else if (!stat(output_name, &st) && st.st_size) { 416 else if (!stat(output_name, &st) && st.st_size) {
493 if (write_mode == WRITE_FORCE) { 417 if (rec->write_mode == WRITE_FORCE) {
494 char oldname[PATH_MAX]; 418 char oldname[PATH_MAX];
495 snprintf(oldname, sizeof(oldname), "%s.old", 419 snprintf(oldname, sizeof(oldname), "%s.old",
496 output_name); 420 output_name);
497 unlink(oldname); 421 unlink(oldname);
498 rename(output_name, oldname); 422 rename(output_name, oldname);
499 } 423 }
500 } else if (write_mode == WRITE_APPEND) { 424 } else if (rec->write_mode == WRITE_APPEND) {
501 write_mode = WRITE_FORCE; 425 rec->write_mode = WRITE_FORCE;
502 } 426 }
503 } 427 }
504 428
505 flags = O_CREAT|O_RDWR; 429 flags = O_CREAT|O_RDWR;
506 if (write_mode == WRITE_APPEND) 430 if (rec->write_mode == WRITE_APPEND)
507 file_new = 0; 431 rec->file_new = 0;
508 else 432 else
509 flags |= O_TRUNC; 433 flags |= O_TRUNC;
510 434
511 if (pipe_output) 435 if (opts->pipe_output)
512 output = STDOUT_FILENO; 436 output = STDOUT_FILENO;
513 else 437 else
514 output = open(output_name, flags, S_IRUSR | S_IWUSR); 438 output = open(output_name, flags, S_IRUSR | S_IWUSR);
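Output selection above hinges on whether stdout is a FIFO: fstat(2) plus S_ISFIFO() detects invocations like 'perf record ... | perf inject' and sets opts->pipe_output rather than defaulting to perf.data. A minimal sketch of that probe, with a hypothetical helper name:

    #include <stdbool.h>
    #include <sys/stat.h>
    #include <unistd.h>

    /* True when stdout is a pipe/FIFO, i.e. the command sits in a shell
     * pipeline and output should be streamed, not written to a file. */
    static bool stdout_is_fifo(void)
    {
            struct stat st;

            return fstat(STDOUT_FILENO, &st) == 0 && S_ISFIFO(st.st_mode);
    }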
@@ -517,17 +441,21 @@ static int __cmd_record(int argc, const char **argv)
517 exit(-1); 441 exit(-1);
518 } 442 }
519 443
444 rec->output = output;
445
520 session = perf_session__new(output_name, O_WRONLY, 446 session = perf_session__new(output_name, O_WRONLY,
521 write_mode == WRITE_FORCE, false, NULL); 447 rec->write_mode == WRITE_FORCE, false, NULL);
522 if (session == NULL) { 448 if (session == NULL) {
523 pr_err("Not enough memory for reading perf file header\n"); 449 pr_err("Not enough memory for reading perf file header\n");
524 return -1; 450 return -1;
525 } 451 }
526 452
527 if (!no_buildid) 453 rec->session = session;
454
455 if (!rec->no_buildid)
528 perf_header__set_feat(&session->header, HEADER_BUILD_ID); 456 perf_header__set_feat(&session->header, HEADER_BUILD_ID);
529 457
530 if (!file_new) { 458 if (!rec->file_new) {
531 err = perf_session__read_header(session, output); 459 err = perf_session__read_header(session, output);
532 if (err < 0) 460 if (err < 0)
533 goto out_delete_session; 461 goto out_delete_session;
@@ -549,94 +477,57 @@ static int __cmd_record(int argc, const char **argv)
549 perf_header__set_feat(&session->header, HEADER_NUMA_TOPOLOGY); 477 perf_header__set_feat(&session->header, HEADER_NUMA_TOPOLOGY);
550 perf_header__set_feat(&session->header, HEADER_CPUID); 478 perf_header__set_feat(&session->header, HEADER_CPUID);
551 479
552 /* 512 kiB: default amount of unprivileged mlocked memory */
553 if (mmap_pages == UINT_MAX)
554 mmap_pages = (512 * 1024) / page_size;
555
556 if (forks) { 480 if (forks) {
557 child_pid = fork(); 481 err = perf_evlist__prepare_workload(evsel_list, opts, argv);
558 if (child_pid < 0) { 482 if (err < 0) {
559 perror("failed to fork"); 483 pr_err("Couldn't run the workload!\n");
560 exit(-1); 484 goto out_delete_session;
561 }
562
563 if (!child_pid) {
564 if (pipe_output)
565 dup2(2, 1);
566 close(child_ready_pipe[0]);
567 close(go_pipe[1]);
568 fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
569
570 /*
571 * Do a dummy execvp to get the PLT entry resolved,
572 * so we avoid the resolver overhead on the real
573 * execvp call.
574 */
575 execvp("", (char **)argv);
576
577 /*
578 * Tell the parent we're ready to go
579 */
580 close(child_ready_pipe[1]);
581
582 /*
583 * Wait until the parent tells us to go.
584 */
585 if (read(go_pipe[0], &buf, 1) == -1)
586 perror("unable to read pipe");
587
588 execvp(argv[0], (char **)argv);
589
590 perror(argv[0]);
591 kill(getppid(), SIGUSR1);
592 exit(-1);
593 }
594
595 if (!system_wide && target_tid == -1 && target_pid == -1)
596 evsel_list->threads->map[0] = child_pid;
597
598 close(child_ready_pipe[1]);
599 close(go_pipe[0]);
600 /*
601 * wait for child to settle
602 */
603 if (read(child_ready_pipe[0], &buf, 1) == -1) {
604 perror("unable to read pipe");
605 exit(-1);
606 } 485 }
607 close(child_ready_pipe[0]);
608 } 486 }
609 487
610 open_counters(evsel_list); 488 perf_record__open(rec);
611 489
612 /* 490 /*
613 * perf_session__delete(session) will be called at atexit_header() 491 * perf_session__delete(session) will be called at perf_record__exit()
614 */ 492 */
615 atexit(atexit_header); 493 on_exit(perf_record__exit, rec);
616 494
617 if (pipe_output) { 495 if (opts->pipe_output) {
618 err = perf_header__write_pipe(output); 496 err = perf_header__write_pipe(output);
619 if (err < 0) 497 if (err < 0)
620 return err; 498 return err;
621 } else if (file_new) { 499 } else if (rec->file_new) {
622 err = perf_session__write_header(session, evsel_list, 500 err = perf_session__write_header(session, evsel_list,
623 output, false); 501 output, false);
624 if (err < 0) 502 if (err < 0)
625 return err; 503 return err;
626 } 504 }
627 505
628 post_processing_offset = lseek(output, 0, SEEK_CUR); 506 if (!rec->no_buildid
507 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
508 pr_err("Couldn't generating buildids. "
509 "Use --no-buildid to profile anyway.\n");
510 return -1;
511 }
629 512
630 if (pipe_output) { 513 rec->post_processing_offset = lseek(output, 0, SEEK_CUR);
631 err = perf_session__synthesize_attrs(session, 514
632 process_synthesized_event); 515 machine = perf_session__find_host_machine(session);
516 if (!machine) {
517 pr_err("Couldn't find native kernel information.\n");
518 return -1;
519 }
520
521 if (opts->pipe_output) {
522 err = perf_event__synthesize_attrs(tool, session,
523 process_synthesized_event);
633 if (err < 0) { 524 if (err < 0) {
634 pr_err("Couldn't synthesize attrs.\n"); 525 pr_err("Couldn't synthesize attrs.\n");
635 return err; 526 return err;
636 } 527 }
637 528
638 err = perf_event__synthesize_event_types(process_synthesized_event, 529 err = perf_event__synthesize_event_types(tool, process_synthesized_event,
639 session); 530 machine);
640 if (err < 0) { 531 if (err < 0) {
641 pr_err("Couldn't synthesize event_types.\n"); 532 pr_err("Couldn't synthesize event_types.\n");
642 return err; 533 return err;
@@ -651,56 +542,49 @@ static int __cmd_record(int argc, const char **argv)
651 * return this more properly and also 542 * return this more properly and also
652 * propagate errors that now are calling die() 543 * propagate errors that now are calling die()
653 */ 544 */
654 err = perf_event__synthesize_tracing_data(output, evsel_list, 545 err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
655 process_synthesized_event, 546 process_synthesized_event);
656 session);
657 if (err <= 0) { 547 if (err <= 0) {
658 pr_err("Couldn't record tracing data.\n"); 548 pr_err("Couldn't record tracing data.\n");
659 return err; 549 return err;
660 } 550 }
661 advance_output(err); 551 advance_output(rec, err);
662 } 552 }
663 } 553 }
664 554
665 machine = perf_session__find_host_machine(session); 555 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
666 if (!machine) { 556 machine, "_text");
667 pr_err("Couldn't find native kernel information.\n");
668 return -1;
669 }
670
671 err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
672 session, machine, "_text");
673 if (err < 0) 557 if (err < 0)
674 err = perf_event__synthesize_kernel_mmap(process_synthesized_event, 558 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
675 session, machine, "_stext"); 559 machine, "_stext");
676 if (err < 0) 560 if (err < 0)
677 pr_err("Couldn't record kernel reference relocation symbol\n" 561 pr_err("Couldn't record kernel reference relocation symbol\n"
678 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" 562 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
679 "Check /proc/kallsyms permission or run as root.\n"); 563 "Check /proc/kallsyms permission or run as root.\n");
680 564
681 err = perf_event__synthesize_modules(process_synthesized_event, 565 err = perf_event__synthesize_modules(tool, process_synthesized_event,
682 session, machine); 566 machine);
683 if (err < 0) 567 if (err < 0)
684 pr_err("Couldn't record kernel module information.\n" 568 pr_err("Couldn't record kernel module information.\n"
685 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" 569 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
686 "Check /proc/modules permission or run as root.\n"); 570 "Check /proc/modules permission or run as root.\n");
687 571
688 if (perf_guest) 572 if (perf_guest)
689 perf_session__process_machines(session, 573 perf_session__process_machines(session, tool,
690 perf_event__synthesize_guest_os); 574 perf_event__synthesize_guest_os);
691 575
692 if (!system_wide) 576 if (!opts->system_wide)
693 perf_event__synthesize_thread_map(evsel_list->threads, 577 perf_event__synthesize_thread_map(tool, evsel_list->threads,
694 process_synthesized_event, 578 process_synthesized_event,
695 session); 579 machine);
696 else 580 else
697 perf_event__synthesize_threads(process_synthesized_event, 581 perf_event__synthesize_threads(tool, process_synthesized_event,
698 session); 582 machine);
699 583
700 if (realtime_prio) { 584 if (rec->realtime_prio) {
701 struct sched_param param; 585 struct sched_param param;
702 586
703 param.sched_priority = realtime_prio; 587 param.sched_priority = rec->realtime_prio;
704 if (sched_setscheduler(0, SCHED_FIFO, &param)) { 588 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
705 pr_err("Could not set realtime priority.\n"); 589 pr_err("Could not set realtime priority.\n");
706 exit(-1); 590 exit(-1);
@@ -713,14 +597,14 @@ static int __cmd_record(int argc, const char **argv)
713 * Let the child rip 597 * Let the child rip
714 */ 598 */
715 if (forks) 599 if (forks)
716 close(go_pipe[1]); 600 perf_evlist__start_workload(evsel_list);
717 601
718 for (;;) { 602 for (;;) {
719 int hits = samples; 603 int hits = rec->samples;
720 604
721 mmap_read_all(); 605 perf_record__mmap_read_all(rec);
722 606
723 if (hits == samples) { 607 if (hits == rec->samples) {
724 if (done) 608 if (done)
725 break; 609 break;
726 err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1); 610 err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
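The loop above drains every mmap ring, then blocks in poll(2) on the counter file descriptors only when a pass produced no new samples ('done' is flipped by the signal handlers). A reduced, self-contained sketch of that wait shape, polling stdin in place of the perf fds:

    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            struct pollfd pfd = { .fd = STDIN_FILENO, .events = POLLIN };

            /* Block indefinitely until the fd is readable, mirroring
             * poll(evlist->pollfd, evlist->nr_fds, -1) above. */
            if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
                    printf("data ready\n");
            return 0;
    }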
@@ -741,9 +625,9 @@ static int __cmd_record(int argc, const char **argv)
741 */ 625 */
742 fprintf(stderr, 626 fprintf(stderr,
743 "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n", 627 "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
744 (double)bytes_written / 1024.0 / 1024.0, 628 (double)rec->bytes_written / 1024.0 / 1024.0,
745 output_name, 629 output_name,
746 bytes_written / 24); 630 rec->bytes_written / 24);
747 631
748 return 0; 632 return 0;
749 633
@@ -758,58 +642,89 @@ static const char * const record_usage[] = {
758 NULL 642 NULL
759}; 643};
760 644
761static bool force, append_file; 645/*
646 * XXX Ideally would be local to cmd_record() and passed to a perf_record__new
647 * because we need to have access to it in perf_record__exit, which is called
648 * after cmd_record() exits, but since record_options need to be accessible to
649 * builtin-script, leave it here.
650 *
651 * At least we don't touch it in all the other functions here directly.
652 *
653 * Just say no to tons of global variables, sigh.
654 */
655static struct perf_record record = {
656 .opts = {
657 .target_pid = -1,
658 .target_tid = -1,
659 .mmap_pages = UINT_MAX,
660 .user_freq = UINT_MAX,
661 .user_interval = ULLONG_MAX,
662 .freq = 1000,
663 .sample_id_all_avail = true,
664 },
665 .write_mode = WRITE_FORCE,
666 .file_new = true,
667};
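The designated initializer above seeds user_freq and user_interval with UINT_MAX and ULLONG_MAX as "not set on the command line" sentinels; the cmd_record() hunks near the end of this file copy them over freq and default_interval only when the user changed them. A tiny sketch of the idiom, using hypothetical names:

    #include <limits.h>
    #include <stdio.h>

    struct opts {
            unsigned int user_freq; /* UINT_MAX means -F was not given */
            unsigned int freq;      /* effective value */
    };

    static void resolve_freq(struct opts *o)
    {
            if (o->user_freq != UINT_MAX)
                    o->freq = o->user_freq;
    }

    int main(void)
    {
            struct opts o = { .user_freq = UINT_MAX, .freq = 1000 };

            resolve_freq(&o);
            printf("%u\n", o.freq); /* 1000: the default survived */
            return 0;
    }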
762 668
669/*
670 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
671 * with it and switch to use the library functions in perf_evlist that came
672 * from builtin-record.c, i.e. use perf_record_opts,
673 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
674 * using pipes, etc.
675 */
763const struct option record_options[] = { 676const struct option record_options[] = {
764 OPT_CALLBACK('e', "event", &evsel_list, "event", 677 OPT_CALLBACK('e', "event", &record.evlist, "event",
765 "event selector. use 'perf list' to list available events", 678 "event selector. use 'perf list' to list available events",
766 parse_events_option), 679 parse_events_option),
767 OPT_CALLBACK(0, "filter", &evsel_list, "filter", 680 OPT_CALLBACK(0, "filter", &record.evlist, "filter",
768 "event filter", parse_filter), 681 "event filter", parse_filter),
769 OPT_INTEGER('p', "pid", &target_pid, 682 OPT_INTEGER('p', "pid", &record.opts.target_pid,
770 "record events on existing process id"), 683 "record events on existing process id"),
771 OPT_INTEGER('t', "tid", &target_tid, 684 OPT_INTEGER('t', "tid", &record.opts.target_tid,
772 "record events on existing thread id"), 685 "record events on existing thread id"),
773 OPT_INTEGER('r', "realtime", &realtime_prio, 686 OPT_INTEGER('r', "realtime", &record.realtime_prio,
774 "collect data with this RT SCHED_FIFO priority"), 687 "collect data with this RT SCHED_FIFO priority"),
775 OPT_BOOLEAN('D', "no-delay", &nodelay, 688 OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
776 "collect data without buffering"), 689 "collect data without buffering"),
777 OPT_BOOLEAN('R', "raw-samples", &raw_samples, 690 OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
778 "collect raw sample records from all opened counters"), 691 "collect raw sample records from all opened counters"),
779 OPT_BOOLEAN('a', "all-cpus", &system_wide, 692 OPT_BOOLEAN('a', "all-cpus", &record.opts.system_wide,
780 "system-wide collection from all CPUs"), 693 "system-wide collection from all CPUs"),
781 OPT_BOOLEAN('A', "append", &append_file, 694 OPT_BOOLEAN('A', "append", &record.append_file,
782 "append to the output file to do incremental profiling"), 695 "append to the output file to do incremental profiling"),
783 OPT_STRING('C', "cpu", &cpu_list, "cpu", 696 OPT_STRING('C', "cpu", &record.opts.cpu_list, "cpu",
784 "list of cpus to monitor"), 697 "list of cpus to monitor"),
785 OPT_BOOLEAN('f', "force", &force, 698 OPT_BOOLEAN('f', "force", &record.force,
786 "overwrite existing data file (deprecated)"), 699 "overwrite existing data file (deprecated)"),
787 OPT_U64('c', "count", &user_interval, "event period to sample"), 700 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
788 OPT_STRING('o', "output", &output_name, "file", 701 OPT_STRING('o', "output", &record.output_name, "file",
789 "output file name"), 702 "output file name"),
790 OPT_BOOLEAN('i', "no-inherit", &no_inherit, 703 OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
791 "child tasks do not inherit counters"), 704 "child tasks do not inherit counters"),
792 OPT_UINTEGER('F', "freq", &user_freq, "profile at this frequency"), 705 OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
793 OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"), 706 OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
794 OPT_BOOLEAN(0, "group", &group, 707 "number of mmap data pages"),
708 OPT_BOOLEAN(0, "group", &record.opts.group,
795 "put the counters into a counter group"), 709 "put the counters into a counter group"),
796 OPT_BOOLEAN('g', "call-graph", &call_graph, 710 OPT_BOOLEAN('g', "call-graph", &record.opts.call_graph,
797 "do call-graph (stack chain/backtrace) recording"), 711 "do call-graph (stack chain/backtrace) recording"),
798 OPT_INCR('v', "verbose", &verbose, 712 OPT_INCR('v', "verbose", &verbose,
799 "be more verbose (show counter open errors, etc)"), 713 "be more verbose (show counter open errors, etc)"),
800 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"), 714 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
801 OPT_BOOLEAN('s', "stat", &inherit_stat, 715 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
802 "per thread counts"), 716 "per thread counts"),
803 OPT_BOOLEAN('d', "data", &sample_address, 717 OPT_BOOLEAN('d', "data", &record.opts.sample_address,
804 "Sample addresses"), 718 "Sample addresses"),
805 OPT_BOOLEAN('T', "timestamp", &sample_time, "Sample timestamps"), 719 OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
806 OPT_BOOLEAN('n', "no-samples", &no_samples, 720 OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
721 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
807 "don't sample"), 722 "don't sample"),
808 OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache, 723 OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
809 "do not update the buildid cache"), 724 "do not update the buildid cache"),
810 OPT_BOOLEAN('B', "no-buildid", &no_buildid, 725 OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
811 "do not collect buildids in perf.data"), 726 "do not collect buildids in perf.data"),
812 OPT_CALLBACK('G', "cgroup", &evsel_list, "name", 727 OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
813 "monitor event in cgroup name only", 728 "monitor event in cgroup name only",
814 parse_cgroups), 729 parse_cgroups),
815 OPT_END() 730 OPT_END()
@@ -819,6 +734,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
819{ 734{
820 int err = -ENOMEM; 735 int err = -ENOMEM;
821 struct perf_evsel *pos; 736 struct perf_evsel *pos;
737 struct perf_evlist *evsel_list;
738 struct perf_record *rec = &record;
822 739
823 perf_header__set_cmdline(argc, argv); 740 perf_header__set_cmdline(argc, argv);
824 741
@@ -826,23 +743,25 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
826 if (evsel_list == NULL) 743 if (evsel_list == NULL)
827 return -ENOMEM; 744 return -ENOMEM;
828 745
746 rec->evlist = evsel_list;
747
829 argc = parse_options(argc, argv, record_options, record_usage, 748 argc = parse_options(argc, argv, record_options, record_usage,
830 PARSE_OPT_STOP_AT_NON_OPTION); 749 PARSE_OPT_STOP_AT_NON_OPTION);
831 if (!argc && target_pid == -1 && target_tid == -1 && 750 if (!argc && rec->opts.target_pid == -1 && rec->opts.target_tid == -1 &&
832 !system_wide && !cpu_list) 751 !rec->opts.system_wide && !rec->opts.cpu_list)
833 usage_with_options(record_usage, record_options); 752 usage_with_options(record_usage, record_options);
834 753
835 if (force && append_file) { 754 if (rec->force && rec->append_file) {
836 fprintf(stderr, "Can't overwrite and append at the same time." 755 fprintf(stderr, "Can't overwrite and append at the same time."
837 " You need to choose between -f and -A"); 756 " You need to choose between -f and -A");
838 usage_with_options(record_usage, record_options); 757 usage_with_options(record_usage, record_options);
839 } else if (append_file) { 758 } else if (rec->append_file) {
840 write_mode = WRITE_APPEND; 759 rec->write_mode = WRITE_APPEND;
841 } else { 760 } else {
842 write_mode = WRITE_FORCE; 761 rec->write_mode = WRITE_FORCE;
843 } 762 }
844 763
845 if (nr_cgroups && !system_wide) { 764 if (nr_cgroups && !rec->opts.system_wide) {
846 fprintf(stderr, "cgroup monitoring only available in" 765 fprintf(stderr, "cgroup monitoring only available in"
847 " system-wide mode\n"); 766 " system-wide mode\n");
848 usage_with_options(record_usage, record_options); 767 usage_with_options(record_usage, record_options);
@@ -860,7 +779,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
860"If some relocation was applied (e.g. kexec) symbols may be misresolved\n" 779"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
861"even with a suitable vmlinux or kallsyms file.\n\n"); 780"even with a suitable vmlinux or kallsyms file.\n\n");
862 781
863 if (no_buildid_cache || no_buildid) 782 if (rec->no_buildid_cache || rec->no_buildid)
864 disable_buildid_cache(); 783 disable_buildid_cache();
865 784
866 if (evsel_list->nr_entries == 0 && 785 if (evsel_list->nr_entries == 0 &&
@@ -869,43 +788,37 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
869 goto out_symbol_exit; 788 goto out_symbol_exit;
870 } 789 }
871 790
872 if (target_pid != -1) 791 if (rec->opts.target_pid != -1)
873 target_tid = target_pid; 792 rec->opts.target_tid = rec->opts.target_pid;
874 793
875 if (perf_evlist__create_maps(evsel_list, target_pid, 794 if (perf_evlist__create_maps(evsel_list, rec->opts.target_pid,
876 target_tid, cpu_list) < 0) 795 rec->opts.target_tid, rec->opts.cpu_list) < 0)
877 usage_with_options(record_usage, record_options); 796 usage_with_options(record_usage, record_options);
878 797
879 list_for_each_entry(pos, &evsel_list->entries, node) { 798 list_for_each_entry(pos, &evsel_list->entries, node) {
880 if (perf_evsel__alloc_fd(pos, evsel_list->cpus->nr,
881 evsel_list->threads->nr) < 0)
882 goto out_free_fd;
883 if (perf_header__push_event(pos->attr.config, event_name(pos))) 799 if (perf_header__push_event(pos->attr.config, event_name(pos)))
884 goto out_free_fd; 800 goto out_free_fd;
885 } 801 }
886 802
887 if (perf_evlist__alloc_pollfd(evsel_list) < 0) 803 if (rec->opts.user_interval != ULLONG_MAX)
888 goto out_free_fd; 804 rec->opts.default_interval = rec->opts.user_interval;
889 805 if (rec->opts.user_freq != UINT_MAX)
890 if (user_interval != ULLONG_MAX) 806 rec->opts.freq = rec->opts.user_freq;
891 default_interval = user_interval;
892 if (user_freq != UINT_MAX)
893 freq = user_freq;
894 807
895 /* 808 /*
896 * User specified count overrides default frequency. 809 * User specified count overrides default frequency.
897 */ 810 */
898 if (default_interval) 811 if (rec->opts.default_interval)
899 freq = 0; 812 rec->opts.freq = 0;
900 else if (freq) { 813 else if (rec->opts.freq) {
901 default_interval = freq; 814 rec->opts.default_interval = rec->opts.freq;
902 } else { 815 } else {
903 fprintf(stderr, "frequency and count are zero, aborting\n"); 816 fprintf(stderr, "frequency and count are zero, aborting\n");
904 err = -EINVAL; 817 err = -EINVAL;
905 goto out_free_fd; 818 goto out_free_fd;
906 } 819 }
907 820
908 err = __cmd_record(argc, argv); 821 err = __cmd_record(&record, argc, argv);
909out_free_fd: 822out_free_fd:
910 perf_evlist__delete_maps(evsel_list); 823 perf_evlist__delete_maps(evsel_list);
911out_symbol_exit: 824out_symbol_exit:
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 4d7c8340c326..25d34d483e49 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -25,6 +25,7 @@
25#include "util/evsel.h" 25#include "util/evsel.h"
26#include "util/header.h" 26#include "util/header.h"
27#include "util/session.h" 27#include "util/session.h"
28#include "util/tool.h"
28 29
29#include "util/parse-options.h" 30#include "util/parse-options.h"
30#include "util/parse-events.h" 31#include "util/parse-events.h"
@@ -35,38 +36,35 @@
35 36
36#include <linux/bitmap.h> 37#include <linux/bitmap.h>
37 38
38static char const *input_name = "perf.data"; 39struct perf_report {
39 40 struct perf_tool tool;
40static bool force, use_tui, use_stdio; 41 struct perf_session *session;
41static bool hide_unresolved; 42 char const *input_name;
42static bool dont_use_callchains; 43 bool force, use_tui, use_stdio;
43static bool show_full_info; 44 bool hide_unresolved;
44 45 bool dont_use_callchains;
45static bool show_threads; 46 bool show_full_info;
46static struct perf_read_values show_threads_values; 47 bool show_threads;
47 48 bool inverted_callchain;
48static const char default_pretty_printing_style[] = "normal"; 49 struct perf_read_values show_threads_values;
49static const char *pretty_printing_style = default_pretty_printing_style; 50 const char *pretty_printing_style;
50 51 symbol_filter_t annotate_init;
51static char callchain_default_opt[] = "fractal,0.5,callee"; 52 const char *cpu_list;
52static bool inverted_callchain; 53 DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
53static symbol_filter_t annotate_init; 54};
54
55static const char *cpu_list;
56static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
57 55
58static int perf_session__add_hist_entry(struct perf_session *session, 56static int perf_evsel__add_hist_entry(struct perf_evsel *evsel,
59 struct addr_location *al, 57 struct addr_location *al,
60 struct perf_sample *sample, 58 struct perf_sample *sample,
61 struct perf_evsel *evsel) 59 struct machine *machine)
62{ 60{
63 struct symbol *parent = NULL; 61 struct symbol *parent = NULL;
64 int err = 0; 62 int err = 0;
65 struct hist_entry *he; 63 struct hist_entry *he;
66 64
67 if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) { 65 if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) {
68 err = perf_session__resolve_callchain(session, al->thread, 66 err = machine__resolve_callchain(machine, evsel, al->thread,
69 sample->callchain, &parent); 67 sample->callchain, &parent);
70 if (err) 68 if (err)
71 return err; 69 return err;
72 } 70 }
@@ -76,7 +74,8 @@ static int perf_session__add_hist_entry(struct perf_session *session,
76 return -ENOMEM; 74 return -ENOMEM;
77 75
78 if (symbol_conf.use_callchain) { 76 if (symbol_conf.use_callchain) {
79 err = callchain_append(he->callchain, &session->callchain_cursor, 77 err = callchain_append(he->callchain,
78 &evsel->hists.callchain_cursor,
80 sample->period); 79 sample->period);
81 if (err) 80 if (err)
82 return err; 81 return err;
@@ -92,8 +91,7 @@ static int perf_session__add_hist_entry(struct perf_session *session,
92 assert(evsel != NULL); 91 assert(evsel != NULL);
93 92
94 err = -ENOMEM; 93 err = -ENOMEM;
95 if (notes->src == NULL && 94 if (notes->src == NULL && symbol__alloc_hist(he->ms.sym) < 0)
96 symbol__alloc_hist(he->ms.sym, session->evlist->nr_entries) < 0)
97 goto out; 95 goto out;
98 96
99 err = hist_entry__inc_addr_samples(he, evsel->idx, al->addr); 97 err = hist_entry__inc_addr_samples(he, evsel->idx, al->addr);
@@ -106,30 +104,32 @@ out:
106} 104}
107 105
108 106
109static int process_sample_event(union perf_event *event, 107static int process_sample_event(struct perf_tool *tool,
108 union perf_event *event,
110 struct perf_sample *sample, 109 struct perf_sample *sample,
111 struct perf_evsel *evsel, 110 struct perf_evsel *evsel,
112 struct perf_session *session) 111 struct machine *machine)
113{ 112{
113 struct perf_report *rep = container_of(tool, struct perf_report, tool);
114 struct addr_location al; 114 struct addr_location al;
115 115
116 if (perf_event__preprocess_sample(event, session, &al, sample, 116 if (perf_event__preprocess_sample(event, machine, &al, sample,
117 annotate_init) < 0) { 117 rep->annotate_init) < 0) {
118 fprintf(stderr, "problem processing %d event, skipping it.\n", 118 fprintf(stderr, "problem processing %d event, skipping it.\n",
119 event->header.type); 119 event->header.type);
120 return -1; 120 return -1;
121 } 121 }
122 122
123 if (al.filtered || (hide_unresolved && al.sym == NULL)) 123 if (al.filtered || (rep->hide_unresolved && al.sym == NULL))
124 return 0; 124 return 0;
125 125
126 if (cpu_list && !test_bit(sample->cpu, cpu_bitmap)) 126 if (rep->cpu_list && !test_bit(sample->cpu, rep->cpu_bitmap))
127 return 0; 127 return 0;
128 128
129 if (al.map != NULL) 129 if (al.map != NULL)
130 al.map->dso->hit = 1; 130 al.map->dso->hit = 1;
131 131
132 if (perf_session__add_hist_entry(session, &al, sample, evsel)) { 132 if (perf_evsel__add_hist_entry(evsel, &al, sample, machine)) {
133 pr_debug("problem incrementing symbol period, skipping event\n"); 133 pr_debug("problem incrementing symbol period, skipping event\n");
134 return -1; 134 return -1;
135 } 135 }
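process_sample_event() now receives the embedded struct perf_tool and recovers the enclosing struct perf_report with container_of(), so the report-specific flags stop being globals. A sketch of the idiom, with a local container_of definition equivalent to the kernel's:

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct tool { int unused; };            /* stand-in for perf_tool */

    struct report {
            struct tool tool;               /* embedded, as in the patch */
            int hide_unresolved;
    };

    int main(void)
    {
            struct report rep = { .hide_unresolved = 1 };
            struct tool *t = &rep.tool;     /* what the callback is handed */
            struct report *r = container_of(t, struct report, tool);

            printf("%d\n", r->hide_unresolved);     /* prints 1 */
            return 0;
    }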
@@ -137,15 +137,17 @@ static int process_sample_event(union perf_event *event,
137 return 0; 137 return 0;
138} 138}
139 139
140static int process_read_event(union perf_event *event, 140static int process_read_event(struct perf_tool *tool,
141 union perf_event *event,
141 struct perf_sample *sample __used, 142 struct perf_sample *sample __used,
142 struct perf_session *session) 143 struct perf_evsel *evsel,
144 struct machine *machine __used)
143{ 145{
144 struct perf_evsel *evsel = perf_evlist__id2evsel(session->evlist, 146 struct perf_report *rep = container_of(tool, struct perf_report, tool);
145 event->read.id); 147
146 if (show_threads) { 148 if (rep->show_threads) {
147 const char *name = evsel ? event_name(evsel) : "unknown"; 149 const char *name = evsel ? event_name(evsel) : "unknown";
148 perf_read_values_add_value(&show_threads_values, 150 perf_read_values_add_value(&rep->show_threads_values,
149 event->read.pid, event->read.tid, 151 event->read.pid, event->read.tid,
150 event->read.id, 152 event->read.id,
151 name, 153 name,
@@ -159,8 +161,10 @@ static int process_read_event(union perf_event *event,
159 return 0; 161 return 0;
160} 162}
161 163
162static int perf_session__setup_sample_type(struct perf_session *self) 164static int perf_report__setup_sample_type(struct perf_report *rep)
163{ 165{
166 struct perf_session *self = rep->session;
167
164 if (!(self->sample_type & PERF_SAMPLE_CALLCHAIN)) { 168 if (!(self->sample_type & PERF_SAMPLE_CALLCHAIN)) {
165 if (sort__has_parent) { 169 if (sort__has_parent) {
166 ui__warning("Selected --sort parent, but no " 170 ui__warning("Selected --sort parent, but no "
@@ -173,7 +177,8 @@ static int perf_session__setup_sample_type(struct perf_session *self)
173 "you call 'perf record' without -g?\n"); 177 "you call 'perf record' without -g?\n");
174 return -1; 178 return -1;
175 } 179 }
176 } else if (!dont_use_callchains && callchain_param.mode != CHAIN_NONE && 180 } else if (!rep->dont_use_callchains &&
181 callchain_param.mode != CHAIN_NONE &&
177 !symbol_conf.use_callchain) { 182 !symbol_conf.use_callchain) {
178 symbol_conf.use_callchain = true; 183 symbol_conf.use_callchain = true;
179 if (callchain_register_param(&callchain_param) < 0) { 184 if (callchain_register_param(&callchain_param) < 0) {
@@ -186,22 +191,6 @@ static int perf_session__setup_sample_type(struct perf_session *self)
186 return 0; 191 return 0;
187} 192}
188 193
189static struct perf_event_ops event_ops = {
190 .sample = process_sample_event,
191 .mmap = perf_event__process_mmap,
192 .comm = perf_event__process_comm,
193 .exit = perf_event__process_task,
194 .fork = perf_event__process_task,
195 .lost = perf_event__process_lost,
196 .read = process_read_event,
197 .attr = perf_event__process_attr,
198 .event_type = perf_event__process_event_type,
199 .tracing_data = perf_event__process_tracing_data,
200 .build_id = perf_event__process_build_id,
201 .ordered_samples = true,
202 .ordering_requires_timestamps = true,
203};
204
205extern volatile int session_done; 194extern volatile int session_done;
206 195
207static void sig_handler(int sig __used) 196static void sig_handler(int sig __used)
@@ -224,6 +213,7 @@ static size_t hists__fprintf_nr_sample_events(struct hists *self,
224} 213}
225 214
226static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist, 215static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
216 struct perf_report *rep,
227 const char *help) 217 const char *help)
228{ 218{
229 struct perf_evsel *pos; 219 struct perf_evsel *pos;
@@ -241,18 +231,18 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
241 parent_pattern == default_parent_pattern) { 231 parent_pattern == default_parent_pattern) {
242 fprintf(stdout, "#\n# (%s)\n#\n", help); 232 fprintf(stdout, "#\n# (%s)\n#\n", help);
243 233
244 if (show_threads) { 234 if (rep->show_threads) {
245 bool style = !strcmp(pretty_printing_style, "raw"); 235 bool style = !strcmp(rep->pretty_printing_style, "raw");
246 perf_read_values_display(stdout, &show_threads_values, 236 perf_read_values_display(stdout, &rep->show_threads_values,
247 style); 237 style);
248 perf_read_values_destroy(&show_threads_values); 238 perf_read_values_destroy(&rep->show_threads_values);
249 } 239 }
250 } 240 }
251 241
252 return 0; 242 return 0;
253} 243}
254 244
255static int __cmd_report(void) 245static int __cmd_report(struct perf_report *rep)
256{ 246{
257 int ret = -EINVAL; 247 int ret = -EINVAL;
258 u64 nr_samples; 248 u64 nr_samples;
@@ -264,27 +254,31 @@ static int __cmd_report(void)
264 254
265 signal(SIGINT, sig_handler); 255 signal(SIGINT, sig_handler);
266 256
267 session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops); 257 session = perf_session__new(rep->input_name, O_RDONLY,
258 rep->force, false, &rep->tool);
268 if (session == NULL) 259 if (session == NULL)
269 return -ENOMEM; 260 return -ENOMEM;
270 261
271 if (cpu_list) { 262 rep->session = session;
272 ret = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap); 263
264 if (rep->cpu_list) {
265 ret = perf_session__cpu_bitmap(session, rep->cpu_list,
266 rep->cpu_bitmap);
273 if (ret) 267 if (ret)
274 goto out_delete; 268 goto out_delete;
275 } 269 }
276 270
277 if (use_browser <= 0) 271 if (use_browser <= 0)
278 perf_session__fprintf_info(session, stdout, show_full_info); 272 perf_session__fprintf_info(session, stdout, rep->show_full_info);
279 273
280 if (show_threads) 274 if (rep->show_threads)
281 perf_read_values_init(&show_threads_values); 275 perf_read_values_init(&rep->show_threads_values);
282 276
283 ret = perf_session__setup_sample_type(session); 277 ret = perf_report__setup_sample_type(rep);
284 if (ret) 278 if (ret)
285 goto out_delete; 279 goto out_delete;
286 280
287 ret = perf_session__process_events(session, &event_ops); 281 ret = perf_session__process_events(session, &rep->tool);
288 if (ret) 282 if (ret)
289 goto out_delete; 283 goto out_delete;
290 284
@@ -327,7 +321,7 @@ static int __cmd_report(void)
327 } 321 }
328 322
329 if (nr_samples == 0) { 323 if (nr_samples == 0) {
330 ui__warning("The %s file has no samples!\n", input_name); 324 ui__warning("The %s file has no samples!\n", session->filename);
331 goto out_delete; 325 goto out_delete;
332 } 326 }
333 327
@@ -335,7 +329,7 @@ static int __cmd_report(void)
335 perf_evlist__tui_browse_hists(session->evlist, help, 329 perf_evlist__tui_browse_hists(session->evlist, help,
336 NULL, NULL, 0); 330 NULL, NULL, 0);
337 } else 331 } else
338 perf_evlist__tty_browse_hists(session->evlist, help); 332 perf_evlist__tty_browse_hists(session->evlist, rep, help);
339 333
340out_delete: 334out_delete:
341 /* 335 /*
@@ -354,9 +348,9 @@ out_delete:
354} 348}
355 349
356static int 350static int
357parse_callchain_opt(const struct option *opt __used, const char *arg, 351parse_callchain_opt(const struct option *opt, const char *arg, int unset)
358 int unset)
359{ 352{
353 struct perf_report *rep = (struct perf_report *)opt->value;
360 char *tok, *tok2; 354 char *tok, *tok2;
361 char *endptr; 355 char *endptr;
362 356
@@ -364,7 +358,7 @@ parse_callchain_opt(const struct option *opt __used, const char *arg,
364 * --no-call-graph 358 * --no-call-graph
365 */ 359 */
366 if (unset) { 360 if (unset) {
367 dont_use_callchains = true; 361 rep->dont_use_callchains = true;
368 return 0; 362 return 0;
369 } 363 }
370 364
@@ -412,7 +406,7 @@ parse_callchain_opt(const struct option *opt __used, const char *arg,
412 goto setup; 406 goto setup;
413 407
414 if (tok2[0] != 'c') { 408 if (tok2[0] != 'c') {
415 callchain_param.print_limit = strtod(tok2, &endptr); 409 callchain_param.print_limit = strtoul(tok2, &endptr, 0);
416 tok2 = strtok(NULL, ","); 410 tok2 = strtok(NULL, ",");
417 if (!tok2) 411 if (!tok2)
418 goto setup; 412 goto setup;
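The print_limit parse above also moves from strtod() to strtoul() with base 0: the limit is an integer count, and base 0 auto-detects decimal, octal (leading 0) and hex (leading 0x) input. A quick illustration:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            char *end;

            printf("%lu\n", strtoul("0x20", &end, 0));      /* 32 */
            printf("%lu\n", strtoul("020", &end, 0));       /* 16 */
            printf("%lu\n", strtoul("20", &end, 0));        /* 20 */
            return 0;
    }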
@@ -433,13 +427,34 @@ setup:
433 return 0; 427 return 0;
434} 428}
435 429
436static const char * const report_usage[] = { 430int cmd_report(int argc, const char **argv, const char *prefix __used)
437 "perf report [<options>] <command>", 431{
438 NULL 432 struct stat st;
439}; 433 char callchain_default_opt[] = "fractal,0.5,callee";
440 434 const char * const report_usage[] = {
441static const struct option options[] = { 435 "perf report [<options>]",
442 OPT_STRING('i', "input", &input_name, "file", 436 NULL
437 };
438 struct perf_report report = {
439 .tool = {
440 .sample = process_sample_event,
441 .mmap = perf_event__process_mmap,
442 .comm = perf_event__process_comm,
443 .exit = perf_event__process_task,
444 .fork = perf_event__process_task,
445 .lost = perf_event__process_lost,
446 .read = process_read_event,
447 .attr = perf_event__process_attr,
448 .event_type = perf_event__process_event_type,
449 .tracing_data = perf_event__process_tracing_data,
450 .build_id = perf_event__process_build_id,
451 .ordered_samples = true,
452 .ordering_requires_timestamps = true,
453 },
454 .pretty_printing_style = "normal",
455 };
456 const struct option options[] = {
457 OPT_STRING('i', "input", &report.input_name, "file",
443 "input file name"), 458 "input file name"),
444 OPT_INCR('v', "verbose", &verbose, 459 OPT_INCR('v', "verbose", &verbose,
445 "be more verbose (show symbol address, etc)"), 460 "be more verbose (show symbol address, etc)"),
@@ -449,17 +464,18 @@ static const struct option options[] = {
449 "file", "vmlinux pathname"), 464 "file", "vmlinux pathname"),
450 OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, 465 OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
451 "file", "kallsyms pathname"), 466 "file", "kallsyms pathname"),
452 OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), 467 OPT_BOOLEAN('f', "force", &report.force, "don't complain, do it"),
453 OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules, 468 OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
454 "load module symbols - WARNING: use only with -k and LIVE kernel"), 469 "load module symbols - WARNING: use only with -k and LIVE kernel"),
455 OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples, 470 OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples,
456 "Show a column with the number of samples"), 471 "Show a column with the number of samples"),
457 OPT_BOOLEAN('T', "threads", &show_threads, 472 OPT_BOOLEAN('T', "threads", &report.show_threads,
458 "Show per-thread event counters"), 473 "Show per-thread event counters"),
459 OPT_STRING(0, "pretty", &pretty_printing_style, "key", 474 OPT_STRING(0, "pretty", &report.pretty_printing_style, "key",
460 "pretty printing style key: normal raw"), 475 "pretty printing style key: normal raw"),
461 OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"), 476 OPT_BOOLEAN(0, "tui", &report.use_tui, "Use the TUI interface"),
462 OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"), 477 OPT_BOOLEAN(0, "stdio", &report.use_stdio,
478 "Use the stdio interface"),
463 OPT_STRING('s', "sort", &sort_order, "key[,key2...]", 479 OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
464 "sort by key(s): pid, comm, dso, symbol, parent"), 480 "sort by key(s): pid, comm, dso, symbol, parent"),
465 OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization, 481 OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization,
@@ -468,13 +484,14 @@ static const struct option options[] = {
468 "regex filter to identify parent, see: '--sort parent'"), 484 "regex filter to identify parent, see: '--sort parent'"),
469 OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other, 485 OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other,
470 "Only display entries with parent-match"), 486 "Only display entries with parent-match"),
471 OPT_CALLBACK_DEFAULT('g', "call-graph", NULL, "output_type,min_percent, call_order", 487 OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order",
472 "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold and callchain order. " 488 "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit and callchain order. "
473 "Default: fractal,0.5,callee", &parse_callchain_opt, callchain_default_opt), 489 "Default: fractal,0.5,callee", &parse_callchain_opt, callchain_default_opt),
474 OPT_BOOLEAN('G', "inverted", &inverted_callchain, "alias for inverted call graph"), 490 OPT_BOOLEAN('G', "inverted", &report.inverted_callchain,
491 "alias for inverted call graph"),
475 OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]", 492 OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
476 "only consider symbols in these dsos"), 493 "only consider symbols in these dsos"),
477 OPT_STRING('C', "comms", &symbol_conf.comm_list_str, "comm[,comm...]", 494 OPT_STRING('c', "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
478 "only consider symbols in these comms"), 495 "only consider symbols in these comms"),
479 OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]", 496 OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
480 "only consider these symbols"), 497 "only consider these symbols"),
@@ -484,12 +501,13 @@ static const struct option options[] = {
484 OPT_STRING('t', "field-separator", &symbol_conf.field_sep, "separator", 501 OPT_STRING('t', "field-separator", &symbol_conf.field_sep, "separator",
485 "separator for columns, no spaces will be added between " 502 "separator for columns, no spaces will be added between "
486 "columns '.' is reserved."), 503 "columns '.' is reserved."),
487 OPT_BOOLEAN('U', "hide-unresolved", &hide_unresolved, 504 OPT_BOOLEAN('U', "hide-unresolved", &report.hide_unresolved,
488 "Only display entries resolved to a symbol"), 505 "Only display entries resolved to a symbol"),
489 OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", 506 OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
490 "Look for files with symbols relative to this directory"), 507 "Look for files with symbols relative to this directory"),
491 OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"), 508 OPT_STRING('C', "cpu", &report.cpu_list, "cpu",
492 OPT_BOOLEAN('I', "show-info", &show_full_info, 509 "list of cpus to profile"),
510 OPT_BOOLEAN('I', "show-info", &report.show_full_info,
493 "Display extended information about perf.data file"), 511 "Display extended information about perf.data file"),
494 OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src, 512 OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src,
495 "Interleave source code with assembly code (default)"), 513 "Interleave source code with assembly code (default)"),
@@ -500,24 +518,30 @@ static const struct option options[] = {
500 OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period, 518 OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
501 "Show a column with the sum of periods"), 519 "Show a column with the sum of periods"),
502 OPT_END() 520 OPT_END()
503}; 521 };
504 522
505int cmd_report(int argc, const char **argv, const char *prefix __used)
506{
507 argc = parse_options(argc, argv, options, report_usage, 0); 523 argc = parse_options(argc, argv, options, report_usage, 0);
508 524
509 if (use_stdio) 525 if (report.use_stdio)
510 use_browser = 0; 526 use_browser = 0;
511 else if (use_tui) 527 else if (report.use_tui)
512 use_browser = 1; 528 use_browser = 1;
513 529
514 if (inverted_callchain) 530 if (report.inverted_callchain)
515 callchain_param.order = ORDER_CALLER; 531 callchain_param.order = ORDER_CALLER;
516 532
517 if (strcmp(input_name, "-") != 0) 533 if (!report.input_name || !strlen(report.input_name)) {
534 if (!fstat(STDIN_FILENO, &st) && S_ISFIFO(st.st_mode))
535 report.input_name = "-";
536 else
537 report.input_name = "perf.data";
538 }
539
540 if (strcmp(report.input_name, "-") != 0)
518 setup_browser(true); 541 setup_browser(true);
519 else 542 else
520 use_browser = 0; 543 use_browser = 0;
544
521 /* 545 /*
522 * Only in the newt browser we are doing integrated annotation, 546 * Only in the newt browser we are doing integrated annotation,
523 * so don't allocate extra space that won't be used in the stdio 547 * so don't allocate extra space that won't be used in the stdio
@@ -525,7 +549,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
525 */ 549 */
526 if (use_browser > 0) { 550 if (use_browser > 0) {
527 symbol_conf.priv_size = sizeof(struct annotation); 551 symbol_conf.priv_size = sizeof(struct annotation);
528 annotate_init = symbol__annotate_init; 552 report.annotate_init = symbol__annotate_init;
529 /* 553 /*
530 * For searching by name on the "Browse map details". 554 * For searching by name on the "Browse map details".
531 * providing it only in verbose mode not to bloat too 555 * providing it only in verbose mode not to bloat too
@@ -572,5 +596,5 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
572 sort_entry__setup_elide(&sort_comm, symbol_conf.comm_list, "comm", stdout); 596 sort_entry__setup_elide(&sort_comm, symbol_conf.comm_list, "comm", stdout);
573 sort_entry__setup_elide(&sort_sym, symbol_conf.sym_list, "symbol", stdout); 597 sort_entry__setup_elide(&sort_sym, symbol_conf.sym_list, "symbol", stdout);
574 598
575 return __cmd_report(); 599 return __cmd_report(&report);
576} 600}
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 5177964943e7..fb8b5f83b4a0 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -2,11 +2,14 @@
2#include "perf.h" 2#include "perf.h"
3 3
4#include "util/util.h" 4#include "util/util.h"
5#include "util/evlist.h"
5#include "util/cache.h" 6#include "util/cache.h"
7#include "util/evsel.h"
6#include "util/symbol.h" 8#include "util/symbol.h"
7#include "util/thread.h" 9#include "util/thread.h"
8#include "util/header.h" 10#include "util/header.h"
9#include "util/session.h" 11#include "util/session.h"
12#include "util/tool.h"
10 13
11#include "util/parse-options.h" 14#include "util/parse-options.h"
12#include "util/trace-event.h" 15#include "util/trace-event.h"
@@ -19,7 +22,7 @@
19#include <pthread.h> 22#include <pthread.h>
20#include <math.h> 23#include <math.h>
21 24
22static char const *input_name = "perf.data"; 25static const char *input_name;
23 26
24static char default_sort_order[] = "avg, max, switch, runtime"; 27static char default_sort_order[] = "avg, max, switch, runtime";
25static const char *sort_order = default_sort_order; 28static const char *sort_order = default_sort_order;
@@ -723,21 +726,21 @@ struct trace_migrate_task_event {
723 726
724struct trace_sched_handler { 727struct trace_sched_handler {
725 void (*switch_event)(struct trace_switch_event *, 728 void (*switch_event)(struct trace_switch_event *,
726 struct perf_session *, 729 struct machine *,
727 struct event *, 730 struct event *,
728 int cpu, 731 int cpu,
729 u64 timestamp, 732 u64 timestamp,
730 struct thread *thread); 733 struct thread *thread);
731 734
732 void (*runtime_event)(struct trace_runtime_event *, 735 void (*runtime_event)(struct trace_runtime_event *,
733 struct perf_session *, 736 struct machine *,
734 struct event *, 737 struct event *,
735 int cpu, 738 int cpu,
736 u64 timestamp, 739 u64 timestamp,
737 struct thread *thread); 740 struct thread *thread);
738 741
739 void (*wakeup_event)(struct trace_wakeup_event *, 742 void (*wakeup_event)(struct trace_wakeup_event *,
740 struct perf_session *, 743 struct machine *,
741 struct event *, 744 struct event *,
742 int cpu, 745 int cpu,
743 u64 timestamp, 746 u64 timestamp,
@@ -750,7 +753,7 @@ struct trace_sched_handler {
750 struct thread *thread); 753 struct thread *thread);
751 754
752 void (*migrate_task_event)(struct trace_migrate_task_event *, 755 void (*migrate_task_event)(struct trace_migrate_task_event *,
753 struct perf_session *session, 756 struct machine *machine,
754 struct event *, 757 struct event *,
755 int cpu, 758 int cpu,
756 u64 timestamp, 759 u64 timestamp,
@@ -760,7 +763,7 @@ struct trace_sched_handler {
760 763
761static void 764static void
762replay_wakeup_event(struct trace_wakeup_event *wakeup_event, 765replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
763 struct perf_session *session __used, 766 struct machine *machine __used,
764 struct event *event, 767 struct event *event,
765 int cpu __used, 768 int cpu __used,
766 u64 timestamp __used, 769 u64 timestamp __used,
@@ -787,7 +790,7 @@ static u64 cpu_last_switched[MAX_CPUS];
787 790
788static void 791static void
789replay_switch_event(struct trace_switch_event *switch_event, 792replay_switch_event(struct trace_switch_event *switch_event,
790 struct perf_session *session __used, 793 struct machine *machine __used,
791 struct event *event, 794 struct event *event,
792 int cpu, 795 int cpu,
793 u64 timestamp, 796 u64 timestamp,
@@ -1021,7 +1024,7 @@ add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
1021 1024
1022static void 1025static void
1023latency_switch_event(struct trace_switch_event *switch_event, 1026latency_switch_event(struct trace_switch_event *switch_event,
1024 struct perf_session *session, 1027 struct machine *machine,
1025 struct event *event __used, 1028 struct event *event __used,
1026 int cpu, 1029 int cpu,
1027 u64 timestamp, 1030 u64 timestamp,
@@ -1045,8 +1048,8 @@ latency_switch_event(struct trace_switch_event *switch_event,
1045 die("hm, delta: %" PRIu64 " < 0 ?\n", delta); 1048 die("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1046 1049
1047 1050
1048 sched_out = perf_session__findnew(session, switch_event->prev_pid); 1051 sched_out = machine__findnew_thread(machine, switch_event->prev_pid);
1049 sched_in = perf_session__findnew(session, switch_event->next_pid); 1052 sched_in = machine__findnew_thread(machine, switch_event->next_pid);
1050 1053
1051 out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid); 1054 out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
1052 if (!out_events) { 1055 if (!out_events) {
@@ -1074,13 +1077,13 @@ latency_switch_event(struct trace_switch_event *switch_event,
1074 1077
1075static void 1078static void
1076latency_runtime_event(struct trace_runtime_event *runtime_event, 1079latency_runtime_event(struct trace_runtime_event *runtime_event,
1077 struct perf_session *session, 1080 struct machine *machine,
1078 struct event *event __used, 1081 struct event *event __used,
1079 int cpu, 1082 int cpu,
1080 u64 timestamp, 1083 u64 timestamp,
1081 struct thread *this_thread __used) 1084 struct thread *this_thread __used)
1082{ 1085{
1083 struct thread *thread = perf_session__findnew(session, runtime_event->pid); 1086 struct thread *thread = machine__findnew_thread(machine, runtime_event->pid);
1084 struct work_atoms *atoms = thread_atoms_search(&atom_root, thread, &cmp_pid); 1087 struct work_atoms *atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
1085 1088
1086 BUG_ON(cpu >= MAX_CPUS || cpu < 0); 1089 BUG_ON(cpu >= MAX_CPUS || cpu < 0);
@@ -1097,7 +1100,7 @@ latency_runtime_event(struct trace_runtime_event *runtime_event,
1097 1100
1098static void 1101static void
1099latency_wakeup_event(struct trace_wakeup_event *wakeup_event, 1102latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
1100 struct perf_session *session, 1103 struct machine *machine,
1101 struct event *__event __used, 1104 struct event *__event __used,
1102 int cpu __used, 1105 int cpu __used,
1103 u64 timestamp, 1106 u64 timestamp,
@@ -1111,7 +1114,7 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
1111 if (!wakeup_event->success) 1114 if (!wakeup_event->success)
1112 return; 1115 return;
1113 1116
1114 wakee = perf_session__findnew(session, wakeup_event->pid); 1117 wakee = machine__findnew_thread(machine, wakeup_event->pid);
1115 atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid); 1118 atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
1116 if (!atoms) { 1119 if (!atoms) {
1117 thread_atoms_insert(wakee); 1120 thread_atoms_insert(wakee);
@@ -1145,7 +1148,7 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
1145 1148
1146static void 1149static void
1147latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event, 1150latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
1148 struct perf_session *session, 1151 struct machine *machine,
1149 struct event *__event __used, 1152 struct event *__event __used,
1150 int cpu __used, 1153 int cpu __used,
1151 u64 timestamp, 1154 u64 timestamp,
@@ -1161,7 +1164,7 @@ latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
1161 if (profile_cpu == -1) 1164 if (profile_cpu == -1)
1162 return; 1165 return;
1163 1166
1164 migrant = perf_session__findnew(session, migrate_task_event->pid); 1167 migrant = machine__findnew_thread(machine, migrate_task_event->pid);
1165 atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid); 1168 atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
1166 if (!atoms) { 1169 if (!atoms) {
1167 thread_atoms_insert(migrant); 1170 thread_atoms_insert(migrant);
@@ -1356,12 +1359,13 @@ static void sort_lat(void)
1356static struct trace_sched_handler *trace_handler; 1359static struct trace_sched_handler *trace_handler;
1357 1360
1358static void 1361static void
1359process_sched_wakeup_event(void *data, struct perf_session *session, 1362process_sched_wakeup_event(struct perf_tool *tool __used,
1360 struct event *event, 1363 struct event *event,
1361 int cpu __used, 1364 struct perf_sample *sample,
1362 u64 timestamp __used, 1365 struct machine *machine,
1363 struct thread *thread __used) 1366 struct thread *thread)
1364{ 1367{
1368 void *data = sample->raw_data;
1365 struct trace_wakeup_event wakeup_event; 1369 struct trace_wakeup_event wakeup_event;
1366 1370
1367 FILL_COMMON_FIELDS(wakeup_event, event, data); 1371 FILL_COMMON_FIELDS(wakeup_event, event, data);
@@ -1373,8 +1377,8 @@ process_sched_wakeup_event(void *data, struct perf_session *session,
1373 FILL_FIELD(wakeup_event, cpu, event, data); 1377 FILL_FIELD(wakeup_event, cpu, event, data);
1374 1378
1375 if (trace_handler->wakeup_event) 1379 if (trace_handler->wakeup_event)
1376 trace_handler->wakeup_event(&wakeup_event, session, event, 1380 trace_handler->wakeup_event(&wakeup_event, machine, event,
1377 cpu, timestamp, thread); 1381 sample->cpu, sample->time, thread);
1378} 1382}
1379 1383
1380/* 1384/*
@@ -1392,7 +1396,7 @@ static char next_shortname2 = '0';
1392 1396
1393static void 1397static void
1394map_switch_event(struct trace_switch_event *switch_event, 1398map_switch_event(struct trace_switch_event *switch_event,
1395 struct perf_session *session, 1399 struct machine *machine,
1396 struct event *event __used, 1400 struct event *event __used,
1397 int this_cpu, 1401 int this_cpu,
1398 u64 timestamp, 1402 u64 timestamp,
@@ -1420,8 +1424,8 @@ map_switch_event(struct trace_switch_event *switch_event,
1420 die("hm, delta: %" PRIu64 " < 0 ?\n", delta); 1424 die("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1421 1425
1422 1426
1423 sched_out = perf_session__findnew(session, switch_event->prev_pid); 1427 sched_out = machine__findnew_thread(machine, switch_event->prev_pid);
1424 sched_in = perf_session__findnew(session, switch_event->next_pid); 1428 sched_in = machine__findnew_thread(machine, switch_event->next_pid);
1425 1429
1426 curr_thread[this_cpu] = sched_in; 1430 curr_thread[this_cpu] = sched_in;
1427 1431
@@ -1469,14 +1473,15 @@ map_switch_event(struct trace_switch_event *switch_event,
1469 } 1473 }
1470} 1474}
1471 1475
1472
1473static void 1476static void
1474process_sched_switch_event(void *data, struct perf_session *session, 1477process_sched_switch_event(struct perf_tool *tool __used,
1475 struct event *event, 1478 struct event *event,
1476 int this_cpu, 1479 struct perf_sample *sample,
1477 u64 timestamp __used, 1480 struct machine *machine,
1478 struct thread *thread __used) 1481 struct thread *thread)
1479{ 1482{
1483 int this_cpu = sample->cpu;
1484 void *data = sample->raw_data;
1480 struct trace_switch_event switch_event; 1485 struct trace_switch_event switch_event;
1481 1486
1482 FILL_COMMON_FIELDS(switch_event, event, data); 1487 FILL_COMMON_FIELDS(switch_event, event, data);
@@ -1498,19 +1503,20 @@ process_sched_switch_event(void *data, struct perf_session *session,
1498 nr_context_switch_bugs++; 1503 nr_context_switch_bugs++;
1499 } 1504 }
1500 if (trace_handler->switch_event) 1505 if (trace_handler->switch_event)
1501 trace_handler->switch_event(&switch_event, session, event, 1506 trace_handler->switch_event(&switch_event, machine, event,
1502 this_cpu, timestamp, thread); 1507 this_cpu, sample->time, thread);
1503 1508
1504 curr_pid[this_cpu] = switch_event.next_pid; 1509 curr_pid[this_cpu] = switch_event.next_pid;
1505} 1510}
1506 1511
1507static void 1512static void
1508process_sched_runtime_event(void *data, struct perf_session *session, 1513process_sched_runtime_event(struct perf_tool *tool __used,
1509 struct event *event, 1514 struct event *event,
1510 int cpu __used, 1515 struct perf_sample *sample,
1511 u64 timestamp __used, 1516 struct machine *machine,
1512 struct thread *thread __used) 1517 struct thread *thread)
1513{ 1518{
1519 void *data = sample->raw_data;
1514 struct trace_runtime_event runtime_event; 1520 struct trace_runtime_event runtime_event;
1515 1521
1516 FILL_ARRAY(runtime_event, comm, event, data); 1522 FILL_ARRAY(runtime_event, comm, event, data);
@@ -1519,16 +1525,18 @@ process_sched_runtime_event(void *data, struct perf_session *session,
1519 FILL_FIELD(runtime_event, vruntime, event, data); 1525 FILL_FIELD(runtime_event, vruntime, event, data);
1520 1526
1521 if (trace_handler->runtime_event) 1527 if (trace_handler->runtime_event)
1522 trace_handler->runtime_event(&runtime_event, session, event, cpu, timestamp, thread); 1528 trace_handler->runtime_event(&runtime_event, machine, event,
1529 sample->cpu, sample->time, thread);
1523} 1530}
1524 1531
1525static void 1532static void
1526process_sched_fork_event(void *data, 1533process_sched_fork_event(struct perf_tool *tool __used,
1527 struct event *event, 1534 struct event *event,
1528 int cpu __used, 1535 struct perf_sample *sample,
1529 u64 timestamp __used, 1536 struct machine *machine __used,
1530 struct thread *thread __used) 1537 struct thread *thread)
1531{ 1538{
1539 void *data = sample->raw_data;
1532 struct trace_fork_event fork_event; 1540 struct trace_fork_event fork_event;
1533 1541
1534 FILL_COMMON_FIELDS(fork_event, event, data); 1542 FILL_COMMON_FIELDS(fork_event, event, data);
@@ -1540,13 +1548,14 @@ process_sched_fork_event(void *data,
1540 1548
1541 if (trace_handler->fork_event) 1549 if (trace_handler->fork_event)
1542 trace_handler->fork_event(&fork_event, event, 1550 trace_handler->fork_event(&fork_event, event,
1543 cpu, timestamp, thread); 1551 sample->cpu, sample->time, thread);
1544} 1552}
1545 1553
1546static void 1554static void
1547process_sched_exit_event(struct event *event, 1555process_sched_exit_event(struct perf_tool *tool __used,
1548 int cpu __used, 1556 struct event *event,
1549 u64 timestamp __used, 1557 struct perf_sample *sample __used,
1558 struct machine *machine __used,
1550 struct thread *thread __used) 1559 struct thread *thread __used)
1551{ 1560{
1552 if (verbose) 1561 if (verbose)
@@ -1554,12 +1563,13 @@ process_sched_exit_event(struct event *event,
1554} 1563}
1555 1564
1556static void 1565static void
1557process_sched_migrate_task_event(void *data, struct perf_session *session, 1566process_sched_migrate_task_event(struct perf_tool *tool __used,
1558 struct event *event, 1567 struct event *event,
1559 int cpu __used, 1568 struct perf_sample *sample,
1560 u64 timestamp __used, 1569 struct machine *machine,
1561 struct thread *thread __used) 1570 struct thread *thread)
1562{ 1571{
1572 void *data = sample->raw_data;
1563 struct trace_migrate_task_event migrate_task_event; 1573 struct trace_migrate_task_event migrate_task_event;
1564 1574
1565 FILL_COMMON_FIELDS(migrate_task_event, event, data); 1575 FILL_COMMON_FIELDS(migrate_task_event, event, data);
@@ -1570,67 +1580,47 @@ process_sched_migrate_task_event(void *data, struct perf_session *session,
1570 FILL_FIELD(migrate_task_event, cpu, event, data); 1580 FILL_FIELD(migrate_task_event, cpu, event, data);
1571 1581
1572 if (trace_handler->migrate_task_event) 1582 if (trace_handler->migrate_task_event)
1573 trace_handler->migrate_task_event(&migrate_task_event, session, 1583 trace_handler->migrate_task_event(&migrate_task_event, machine,
1574 event, cpu, timestamp, thread); 1584 event, sample->cpu,
1585 sample->time, thread);
1575} 1586}
1576 1587
1577static void process_raw_event(union perf_event *raw_event __used, 1588typedef void (*tracepoint_handler)(struct perf_tool *tool, struct event *event,
1578 struct perf_session *session, void *data, int cpu, 1589 struct perf_sample *sample,
1579 u64 timestamp, struct thread *thread) 1590 struct machine *machine,
1580{ 1591 struct thread *thread);
1581 struct event *event;
1582 int type;
1583
1584
1585 type = trace_parse_common_type(data);
1586 event = trace_find_event(type);
1587
1588 if (!strcmp(event->name, "sched_switch"))
1589 process_sched_switch_event(data, session, event, cpu, timestamp, thread);
1590 if (!strcmp(event->name, "sched_stat_runtime"))
1591 process_sched_runtime_event(data, session, event, cpu, timestamp, thread);
1592 if (!strcmp(event->name, "sched_wakeup"))
1593 process_sched_wakeup_event(data, session, event, cpu, timestamp, thread);
1594 if (!strcmp(event->name, "sched_wakeup_new"))
1595 process_sched_wakeup_event(data, session, event, cpu, timestamp, thread);
1596 if (!strcmp(event->name, "sched_process_fork"))
1597 process_sched_fork_event(data, event, cpu, timestamp, thread);
1598 if (!strcmp(event->name, "sched_process_exit"))
1599 process_sched_exit_event(event, cpu, timestamp, thread);
1600 if (!strcmp(event->name, "sched_migrate_task"))
1601 process_sched_migrate_task_event(data, session, event, cpu, timestamp, thread);
1602}
1603 1592
1604static int process_sample_event(union perf_event *event, 1593static int perf_sched__process_tracepoint_sample(struct perf_tool *tool,
1605 struct perf_sample *sample, 1594 union perf_event *event __used,
1606 struct perf_evsel *evsel __used, 1595 struct perf_sample *sample,
1607 struct perf_session *session) 1596 struct perf_evsel *evsel,
1597 struct machine *machine)
1608{ 1598{
1609 struct thread *thread; 1599 struct thread *thread = machine__findnew_thread(machine, sample->pid);
1610
1611 if (!(session->sample_type & PERF_SAMPLE_RAW))
1612 return 0;
1613 1600
1614 thread = perf_session__findnew(session, sample->pid);
1615 if (thread == NULL) { 1601 if (thread == NULL) {
1616 pr_debug("problem processing %d event, skipping it.\n", 1602 pr_debug("problem processing %s event, skipping it.\n",
1617 event->header.type); 1603 evsel->name);
1618 return -1; 1604 return -1;
1619 } 1605 }
1620 1606
1621 dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid); 1607 evsel->hists.stats.total_period += sample->period;
1608 hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
1622 1609
1623 if (profile_cpu != -1 && profile_cpu != (int)sample->cpu) 1610 if (evsel->handler.func != NULL) {
1624 return 0; 1611 tracepoint_handler f = evsel->handler.func;
1625 1612
1626 process_raw_event(event, session, sample->raw_data, sample->cpu, 1613 if (evsel->handler.data == NULL)
1627 sample->time, thread); 1614 evsel->handler.data = trace_find_event(evsel->attr.config);
1615
1616 f(tool, evsel->handler.data, sample, machine, thread);
1617 }
1628 1618
1629 return 0; 1619 return 0;
1630} 1620}
1631 1621
1632static struct perf_event_ops event_ops = { 1622static struct perf_tool perf_sched = {
1633 .sample = process_sample_event, 1623 .sample = perf_sched__process_tracepoint_sample,
1634 .comm = perf_event__process_comm, 1624 .comm = perf_event__process_comm,
1635 .lost = perf_event__process_lost, 1625 .lost = perf_event__process_lost,
1636 .fork = perf_event__process_task, 1626 .fork = perf_event__process_task,
@@ -1640,13 +1630,25 @@ static struct perf_event_ops event_ops = {
1640static void read_events(bool destroy, struct perf_session **psession) 1630static void read_events(bool destroy, struct perf_session **psession)
1641{ 1631{
1642 int err = -EINVAL; 1632 int err = -EINVAL;
1633 const struct perf_evsel_str_handler handlers[] = {
1634 { "sched:sched_switch", process_sched_switch_event, },
1635 { "sched:sched_stat_runtime", process_sched_runtime_event, },
1636 { "sched:sched_wakeup", process_sched_wakeup_event, },
1637 { "sched:sched_wakeup_new", process_sched_wakeup_event, },
1638 { "sched:sched_process_fork", process_sched_fork_event, },
1639 { "sched:sched_process_exit", process_sched_exit_event, },
1640 { "sched:sched_migrate_task", process_sched_migrate_task_event, },
1641 };
1643 struct perf_session *session = perf_session__new(input_name, O_RDONLY, 1642 struct perf_session *session = perf_session__new(input_name, O_RDONLY,
1644 0, false, &event_ops); 1643 0, false, &perf_sched);
1645 if (session == NULL) 1644 if (session == NULL)
1646 die("No Memory"); 1645 die("No Memory");
1647 1646
1647 err = perf_evlist__set_tracepoints_handlers_array(session->evlist, handlers);
1648 assert(err == 0);
1649
1648 if (perf_session__has_traces(session, "record -R")) { 1650 if (perf_session__has_traces(session, "record -R")) {
1649 err = perf_session__process_events(session, &event_ops); 1651 err = perf_session__process_events(session, &perf_sched);
1650 if (err) 1652 if (err)
1651 die("Failed to process events, error %d", err); 1653 die("Failed to process events, error %d", err);
1652 1654
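The net effect of the builtin-sched.c changes above: the old process_raw_event() dispatcher, which ran a chain of strcmp() calls on the event name for every single sample, is gone. Each sched tracepoint is now bound to a tracepoint_handler once, at session setup, through the handlers[] array and perf_evlist__set_tracepoints_handlers_array(), and perf_sched__process_tracepoint_sample() dispatches through evsel->handler.func with no per-sample string comparison. A minimal sketch of that dispatch core (the wrapper name is hypothetical; the real code is inlined in the sample callback above):

        /* per-evsel dispatch: one handler lookup, no strcmp() per sample */
        static int dispatch_tracepoint(struct perf_tool *tool, struct perf_evsel *evsel,
                                       struct perf_sample *sample,
                                       struct machine *machine, struct thread *thread)
        {
                tracepoint_handler f = evsel->handler.func;

                if (f == NULL)                   /* no handler registered for this evsel */
                        return 0;
                if (evsel->handler.data == NULL) /* cache the tracepoint format once */
                        evsel->handler.data = trace_find_event(evsel->attr.config);
                f(tool, evsel->handler.data, sample, machine, thread);
                return 0;
        }

The same conversion also swaps struct perf_session for struct machine in every handler signature, so thread lookup goes through machine__findnew_thread() instead of perf_session__findnew().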
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 2f62a2952269..fd1909afcfd6 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -7,6 +7,7 @@
7#include "util/header.h" 7#include "util/header.h"
8#include "util/parse-options.h" 8#include "util/parse-options.h"
9#include "util/session.h" 9#include "util/session.h"
10#include "util/tool.h"
10#include "util/symbol.h" 11#include "util/symbol.h"
11#include "util/thread.h" 12#include "util/thread.h"
12#include "util/trace-event.h" 13#include "util/trace-event.h"
@@ -23,6 +24,7 @@ static u64 nr_unordered;
23extern const struct option record_options[]; 24extern const struct option record_options[];
24static bool no_callchain; 25static bool no_callchain;
25static bool show_full_info; 26static bool show_full_info;
27static bool system_wide;
26static const char *cpu_list; 28static const char *cpu_list;
27static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); 29static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
28 30
@@ -315,7 +317,7 @@ static bool sample_addr_correlates_sym(struct perf_event_attr *attr)
315 317
316static void print_sample_addr(union perf_event *event, 318static void print_sample_addr(union perf_event *event,
317 struct perf_sample *sample, 319 struct perf_sample *sample,
318 struct perf_session *session, 320 struct machine *machine,
319 struct thread *thread, 321 struct thread *thread,
320 struct perf_event_attr *attr) 322 struct perf_event_attr *attr)
321{ 323{
@@ -328,11 +330,11 @@ static void print_sample_addr(union perf_event *event,
328 if (!sample_addr_correlates_sym(attr)) 330 if (!sample_addr_correlates_sym(attr))
329 return; 331 return;
330 332
331 thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION, 333 thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION,
332 event->ip.pid, sample->addr, &al); 334 sample->addr, &al);
333 if (!al.map) 335 if (!al.map)
334 thread__find_addr_map(thread, session, cpumode, MAP__VARIABLE, 336 thread__find_addr_map(thread, machine, cpumode, MAP__VARIABLE,
335 event->ip.pid, sample->addr, &al); 337 sample->addr, &al);
336 338
337 al.cpu = sample->cpu; 339 al.cpu = sample->cpu;
338 al.sym = NULL; 340 al.sym = NULL;
@@ -362,7 +364,7 @@ static void print_sample_addr(union perf_event *event,
362static void process_event(union perf_event *event __unused, 364static void process_event(union perf_event *event __unused,
363 struct perf_sample *sample, 365 struct perf_sample *sample,
364 struct perf_evsel *evsel, 366 struct perf_evsel *evsel,
365 struct perf_session *session, 367 struct machine *machine,
366 struct thread *thread) 368 struct thread *thread)
367{ 369{
368 struct perf_event_attr *attr = &evsel->attr; 370 struct perf_event_attr *attr = &evsel->attr;
@@ -377,15 +379,15 @@ static void process_event(union perf_event *event __unused,
377 sample->raw_size); 379 sample->raw_size);
378 380
379 if (PRINT_FIELD(ADDR)) 381 if (PRINT_FIELD(ADDR))
380 print_sample_addr(event, sample, session, thread, attr); 382 print_sample_addr(event, sample, machine, thread, attr);
381 383
382 if (PRINT_FIELD(IP)) { 384 if (PRINT_FIELD(IP)) {
383 if (!symbol_conf.use_callchain) 385 if (!symbol_conf.use_callchain)
384 printf(" "); 386 printf(" ");
385 else 387 else
386 printf("\n"); 388 printf("\n");
387 perf_session__print_ip(event, sample, session, 389 perf_event__print_ip(event, sample, machine, evsel,
388 PRINT_FIELD(SYM), PRINT_FIELD(DSO)); 390 PRINT_FIELD(SYM), PRINT_FIELD(DSO));
389 } 391 }
390 392
391 printf("\n"); 393 printf("\n");
@@ -432,14 +434,16 @@ static int cleanup_scripting(void)
432 return scripting_ops->stop_script(); 434 return scripting_ops->stop_script();
433} 435}
434 436
435static char const *input_name = "perf.data"; 437static const char *input_name;
436 438
437static int process_sample_event(union perf_event *event, 439static int process_sample_event(struct perf_tool *tool __used,
440 union perf_event *event,
438 struct perf_sample *sample, 441 struct perf_sample *sample,
439 struct perf_evsel *evsel, 442 struct perf_evsel *evsel,
440 struct perf_session *session) 443 struct machine *machine)
441{ 444{
442 struct thread *thread = perf_session__findnew(session, event->ip.pid); 445 struct addr_location al;
446 struct thread *thread = machine__findnew_thread(machine, event->ip.tid);
443 447
444 if (thread == NULL) { 448 if (thread == NULL) {
445 pr_debug("problem processing %d event, skipping it.\n", 449 pr_debug("problem processing %d event, skipping it.\n",
@@ -458,16 +462,25 @@ static int process_sample_event(union perf_event *event,
458 return 0; 462 return 0;
459 } 463 }
460 464
465 if (perf_event__preprocess_sample(event, machine, &al, sample, 0) < 0) {
466 pr_err("problem processing %d event, skipping it.\n",
467 event->header.type);
468 return -1;
469 }
470
471 if (al.filtered)
472 return 0;
473
461 if (cpu_list && !test_bit(sample->cpu, cpu_bitmap)) 474 if (cpu_list && !test_bit(sample->cpu, cpu_bitmap))
462 return 0; 475 return 0;
463 476
464 scripting_ops->process_event(event, sample, evsel, session, thread); 477 scripting_ops->process_event(event, sample, evsel, machine, thread);
465 478
466 session->hists.stats.total_period += sample->period; 479 evsel->hists.stats.total_period += sample->period;
467 return 0; 480 return 0;
468} 481}
469 482
470static struct perf_event_ops event_ops = { 483static struct perf_tool perf_script = {
471 .sample = process_sample_event, 484 .sample = process_sample_event,
472 .mmap = perf_event__process_mmap, 485 .mmap = perf_event__process_mmap,
473 .comm = perf_event__process_comm, 486 .comm = perf_event__process_comm,
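With the hunk above, builtin-script.c resolves every sample through perf_event__preprocess_sample() before handing it to the scripting engine, so entries rejected by symbol filtering (al.filtered) are dropped up front, alongside the existing --cpu bitmap check. Two smaller shifts ride along: the thread lookup now keys on event->ip.tid rather than event->ip.pid, and the period accounting moves from the session-wide hists into the per-evsel hists, matching the perf_tool conversion.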
@@ -494,7 +507,7 @@ static int __cmd_script(struct perf_session *session)
494 507
495 signal(SIGINT, sig_handler); 508 signal(SIGINT, sig_handler);
496 509
497 ret = perf_session__process_events(session, &event_ops); 510 ret = perf_session__process_events(session, &perf_script);
498 511
499 if (debug_mode) 512 if (debug_mode)
500 pr_err("Misordered timestamps: %" PRIu64 "\n", nr_unordered); 513 pr_err("Misordered timestamps: %" PRIu64 "\n", nr_unordered);
@@ -523,12 +536,6 @@ static struct script_spec *script_spec__new(const char *spec,
523 return s; 536 return s;
524} 537}
525 538
526static void script_spec__delete(struct script_spec *s)
527{
528 free(s->spec);
529 free(s);
530}
531
532static void script_spec__add(struct script_spec *s) 539static void script_spec__add(struct script_spec *s)
533{ 540{
534 list_add_tail(&s->node, &script_specs); 541 list_add_tail(&s->node, &script_specs);
@@ -554,16 +561,11 @@ static struct script_spec *script_spec__findnew(const char *spec,
554 561
555 s = script_spec__new(spec, ops); 562 s = script_spec__new(spec, ops);
556 if (!s) 563 if (!s)
557 goto out_delete_spec; 564 return NULL;
558 565
559 script_spec__add(s); 566 script_spec__add(s);
560 567
561 return s; 568 return s;
562
563out_delete_spec:
564 script_spec__delete(s);
565
566 return NULL;
567} 569}
568 570
569int script_spec_register(const char *spec, struct scripting_ops *ops) 571int script_spec_register(const char *spec, struct scripting_ops *ops)
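Dropping script_spec__delete() is more than cleanup. In the old script_spec__findnew(), the failure path jumped to out_delete_spec with s == NULL and then ran

        static void script_spec__delete(struct script_spec *s)
        {
                free(s->spec);  /* s is NULL on this path: NULL pointer dereference */
                free(s);
        }

so an allocation failure dereferenced a null pointer before the function could return NULL. Returning NULL directly from the !s branch removes the broken error label along with the now-unused helper.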
@@ -681,7 +683,8 @@ static int parse_output_fields(const struct option *opt __used,
681 type = PERF_TYPE_RAW; 683 type = PERF_TYPE_RAW;
682 else { 684 else {
683 fprintf(stderr, "Invalid event type in field string.\n"); 685 fprintf(stderr, "Invalid event type in field string.\n");
684 return -EINVAL; 686 rc = -EINVAL;
687 goto out;
685 } 688 }
686 689
687 if (output[type].user_set) 690 if (output[type].user_set)
@@ -923,6 +926,24 @@ static int read_script_info(struct script_desc *desc, const char *filename)
923 return 0; 926 return 0;
924} 927}
925 928
929static char *get_script_root(struct dirent *script_dirent, const char *suffix)
930{
931 char *script_root, *str;
932
933 script_root = strdup(script_dirent->d_name);
934 if (!script_root)
935 return NULL;
936
937 str = (char *)ends_with(script_root, suffix);
938 if (!str) {
939 free(script_root);
940 return NULL;
941 }
942
943 *str = '\0';
944 return script_root;
945}
946
926static int list_available_scripts(const struct option *opt __used, 947static int list_available_scripts(const struct option *opt __used,
927 const char *s __used, int unset __used) 948 const char *s __used, int unset __used)
928{ 949{
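get_script_root() factors out the strdup()/ends_with()/truncate sequence that was previously open-coded in both list_available_scripts() and get_script_path(); the callers in the hunks below shrink to a NULL check plus a free(). Typical use, taken from the next hunk:

        script_root = get_script_root(&script_dirent, REPORT_SUFFIX);
        if (script_root) {
                desc = script_desc__findnew(script_root);
                snprintf(script_path, MAXPATHLEN, "%s/%s",
                         lang_path, script_dirent.d_name);
                read_script_info(desc, script_path);
                free(script_root);
        }

Since ends_with() returns a pointer into the duplicated string, writing '\0' through it strips the suffix in place, with no second allocation.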
@@ -934,7 +955,6 @@ static int list_available_scripts(const struct option *opt __used,
934 struct script_desc *desc; 955 struct script_desc *desc;
935 char first_half[BUFSIZ]; 956 char first_half[BUFSIZ];
936 char *script_root; 957 char *script_root;
937 char *str;
938 958
939 snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path()); 959 snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
940 960
@@ -950,16 +970,14 @@ static int list_available_scripts(const struct option *opt __used,
950 continue; 970 continue;
951 971
952 for_each_script(lang_path, lang_dir, script_dirent, script_next) { 972 for_each_script(lang_path, lang_dir, script_dirent, script_next) {
953 script_root = strdup(script_dirent.d_name); 973 script_root = get_script_root(&script_dirent, REPORT_SUFFIX);
954 str = (char *)ends_with(script_root, REPORT_SUFFIX); 974 if (script_root) {
955 if (str) {
956 *str = '\0';
957 desc = script_desc__findnew(script_root); 975 desc = script_desc__findnew(script_root);
958 snprintf(script_path, MAXPATHLEN, "%s/%s", 976 snprintf(script_path, MAXPATHLEN, "%s/%s",
959 lang_path, script_dirent.d_name); 977 lang_path, script_dirent.d_name);
960 read_script_info(desc, script_path); 978 read_script_info(desc, script_path);
979 free(script_root);
961 } 980 }
962 free(script_root);
963 } 981 }
964 } 982 }
965 983
@@ -981,8 +999,7 @@ static char *get_script_path(const char *script_root, const char *suffix)
981 char script_path[MAXPATHLEN]; 999 char script_path[MAXPATHLEN];
982 DIR *scripts_dir, *lang_dir; 1000 DIR *scripts_dir, *lang_dir;
983 char lang_path[MAXPATHLEN]; 1001 char lang_path[MAXPATHLEN];
984 char *str, *__script_root; 1002 char *__script_root;
985 char *path = NULL;
986 1003
987 snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path()); 1004 snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
988 1005
@@ -998,23 +1015,18 @@ static char *get_script_path(const char *script_root, const char *suffix)
998 continue; 1015 continue;
999 1016
1000 for_each_script(lang_path, lang_dir, script_dirent, script_next) { 1017 for_each_script(lang_path, lang_dir, script_dirent, script_next) {
1001 __script_root = strdup(script_dirent.d_name); 1018 __script_root = get_script_root(&script_dirent, suffix);
1002 str = (char *)ends_with(__script_root, suffix); 1019 if (__script_root && !strcmp(script_root, __script_root)) {
1003 if (str) { 1020 free(__script_root);
1004 *str = '\0';
1005 if (strcmp(__script_root, script_root))
1006 continue;
1007 snprintf(script_path, MAXPATHLEN, "%s/%s", 1021 snprintf(script_path, MAXPATHLEN, "%s/%s",
1008 lang_path, script_dirent.d_name); 1022 lang_path, script_dirent.d_name);
1009 path = strdup(script_path); 1023 return strdup(script_path);
1010 free(__script_root);
1011 break;
1012 } 1024 }
1013 free(__script_root); 1025 free(__script_root);
1014 } 1026 }
1015 } 1027 }
1016 1028
1017 return path; 1029 return NULL;
1018} 1030}
1019 1031
1020static bool is_top_script(const char *script_path) 1032static bool is_top_script(const char *script_path)
@@ -1083,7 +1095,11 @@ static const struct option options[] = {
1083 OPT_CALLBACK('f', "fields", NULL, "str", 1095 OPT_CALLBACK('f', "fields", NULL, "str",
1084 "comma separated output fields prepend with 'type:'. Valid types: hw,sw,trace,raw. Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,addr", 1096 "comma separated output fields prepend with 'type:'. Valid types: hw,sw,trace,raw. Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,addr",
1085 parse_output_fields), 1097 parse_output_fields),
1086 OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"), 1098 OPT_BOOLEAN('a', "all-cpus", &system_wide,
1099 "system-wide collection from all CPUs"),
1100 OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
1101 OPT_STRING('c', "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
1102 "only display events for these comms"),
1087 OPT_BOOLEAN('I', "show-info", &show_full_info, 1103 OPT_BOOLEAN('I', "show-info", &show_full_info,
1088 "display extended information from perf.data file"), 1104 "display extended information from perf.data file"),
1089 OPT_END() 1105 OPT_END()
@@ -1110,7 +1126,6 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
1110 struct perf_session *session; 1126 struct perf_session *session;
1111 char *script_path = NULL; 1127 char *script_path = NULL;
1112 const char **__argv; 1128 const char **__argv;
1113 bool system_wide;
1114 int i, j, err; 1129 int i, j, err;
1115 1130
1116 setup_scripting(); 1131 setup_scripting();
@@ -1178,15 +1193,17 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
1178 } 1193 }
1179 1194
1180 if (!pid) { 1195 if (!pid) {
1181 system_wide = true;
1182 j = 0; 1196 j = 0;
1183 1197
1184 dup2(live_pipe[1], 1); 1198 dup2(live_pipe[1], 1);
1185 close(live_pipe[0]); 1199 close(live_pipe[0]);
1186 1200
1187 if (!is_top_script(argv[0])) 1201 if (is_top_script(argv[0])) {
1202 system_wide = true;
1203 } else if (!system_wide) {
1188 system_wide = !have_cmd(argc - rep_args, 1204 system_wide = !have_cmd(argc - rep_args,
1189 &argv[rep_args]); 1205 &argv[rep_args]);
1206 }
1190 1207
1191 __argv = malloc((argc + 6) * sizeof(const char *)); 1208 __argv = malloc((argc + 6) * sizeof(const char *));
1192 if (!__argv) 1209 if (!__argv)
@@ -1234,10 +1251,11 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
1234 script_path = rep_script_path; 1251 script_path = rep_script_path;
1235 1252
1236 if (script_path) { 1253 if (script_path) {
1237 system_wide = false;
1238 j = 0; 1254 j = 0;
1239 1255
1240 if (rec_script_path) 1256 if (!rec_script_path)
1257 system_wide = false;
1258 else if (!system_wide)
1241 system_wide = !have_cmd(argc - 1, &argv[1]); 1259 system_wide = !have_cmd(argc - 1, &argv[1]);
1242 1260
1243 __argv = malloc((argc + 2) * sizeof(const char *)); 1261 __argv = malloc((argc + 2) * sizeof(const char *));
@@ -1261,7 +1279,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
1261 if (!script_name) 1279 if (!script_name)
1262 setup_pager(); 1280 setup_pager();
1263 1281
1264 session = perf_session__new(input_name, O_RDONLY, 0, false, &event_ops); 1282 session = perf_session__new(input_name, O_RDONLY, 0, false, &perf_script);
1265 if (session == NULL) 1283 if (session == NULL)
1266 return -ENOMEM; 1284 return -ENOMEM;
1267 1285
@@ -1287,7 +1305,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
1287 return -1; 1305 return -1;
1288 } 1306 }
1289 1307
1290 input = open(input_name, O_RDONLY); 1308 input = open(session->filename, O_RDONLY); /* input_name */
1291 if (input < 0) { 1309 if (input < 0) {
1292 perror("failed to open file"); 1310 perror("failed to open file");
1293 exit(-1); 1311 exit(-1);
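Note that input_name changes here from the literal "perf.data" to NULL (builtin-timechart.c below gets the same treatment), and the live-mode open() switches to session->filename, which perf_session__new() fills in. Presumably the session constructor supplies the default when no name is given, along the lines of:

        /* assumed fallback inside perf_session__new(), for illustration only */
        if (filename == NULL)
                filename = "perf.data";

so individual builtins no longer hard-code the default path.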
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 955930e0a5c3..f5d2a63eba66 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -578,6 +578,33 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
578 avg / avg_stats(&walltime_nsecs_stats)); 578 avg / avg_stats(&walltime_nsecs_stats));
579} 579}
580 580
581/* used for get_ratio_color() */
582enum grc_type {
583 GRC_STALLED_CYCLES_FE,
584 GRC_STALLED_CYCLES_BE,
585 GRC_CACHE_MISSES,
586 GRC_MAX_NR
587};
588
589static const char *get_ratio_color(enum grc_type type, double ratio)
590{
591 static const double grc_table[GRC_MAX_NR][3] = {
592 [GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 },
593 [GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 },
594 [GRC_CACHE_MISSES] = { 20.0, 10.0, 5.0 },
595 };
596 const char *color = PERF_COLOR_NORMAL;
597
598 if (ratio > grc_table[type][0])
599 color = PERF_COLOR_RED;
600 else if (ratio > grc_table[type][1])
601 color = PERF_COLOR_MAGENTA;
602 else if (ratio > grc_table[type][2])
603 color = PERF_COLOR_YELLOW;
604
605 return color;
606}
607
581static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg) 608static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg)
582{ 609{
583 double total, ratio = 0.0; 610 double total, ratio = 0.0;
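get_ratio_color() collapses seven identical red/magenta/yellow ladders into a single table lookup: each grc_table row holds the red, magenta and yellow thresholds for a ratio in percent, checked in descending order. Every print_*() helper that follows reduces to the same three lines:

        color = get_ratio_color(GRC_CACHE_MISSES, ratio);
        fprintf(output, " # ");
        color_fprintf(output, color, "%6.2f%%", ratio);

Adding a new metric class now takes one enum grc_type entry plus one matching grc_table row, instead of another copy of the if/else chain.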
@@ -588,13 +615,7 @@ static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __us
588 if (total) 615 if (total)
589 ratio = avg / total * 100.0; 616 ratio = avg / total * 100.0;
590 617
591 color = PERF_COLOR_NORMAL; 618 color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);
592 if (ratio > 50.0)
593 color = PERF_COLOR_RED;
594 else if (ratio > 30.0)
595 color = PERF_COLOR_MAGENTA;
596 else if (ratio > 10.0)
597 color = PERF_COLOR_YELLOW;
598 619
599 fprintf(output, " # "); 620 fprintf(output, " # ");
600 color_fprintf(output, color, "%6.2f%%", ratio); 621 color_fprintf(output, color, "%6.2f%%", ratio);
@@ -611,13 +632,7 @@ static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __use
611 if (total) 632 if (total)
612 ratio = avg / total * 100.0; 633 ratio = avg / total * 100.0;
613 634
614 color = PERF_COLOR_NORMAL; 635 color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);
615 if (ratio > 75.0)
616 color = PERF_COLOR_RED;
617 else if (ratio > 50.0)
618 color = PERF_COLOR_MAGENTA;
619 else if (ratio > 20.0)
620 color = PERF_COLOR_YELLOW;
621 636
622 fprintf(output, " # "); 637 fprintf(output, " # ");
623 color_fprintf(output, color, "%6.2f%%", ratio); 638 color_fprintf(output, color, "%6.2f%%", ratio);
@@ -634,13 +649,7 @@ static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double
634 if (total) 649 if (total)
635 ratio = avg / total * 100.0; 650 ratio = avg / total * 100.0;
636 651
637 color = PERF_COLOR_NORMAL; 652 color = get_ratio_color(GRC_CACHE_MISSES, ratio);
638 if (ratio > 20.0)
639 color = PERF_COLOR_RED;
640 else if (ratio > 10.0)
641 color = PERF_COLOR_MAGENTA;
642 else if (ratio > 5.0)
643 color = PERF_COLOR_YELLOW;
644 653
645 fprintf(output, " # "); 654 fprintf(output, " # ");
646 color_fprintf(output, color, "%6.2f%%", ratio); 655 color_fprintf(output, color, "%6.2f%%", ratio);
@@ -657,13 +666,7 @@ static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, dou
657 if (total) 666 if (total)
658 ratio = avg / total * 100.0; 667 ratio = avg / total * 100.0;
659 668
660 color = PERF_COLOR_NORMAL; 669 color = get_ratio_color(GRC_CACHE_MISSES, ratio);
661 if (ratio > 20.0)
662 color = PERF_COLOR_RED;
663 else if (ratio > 10.0)
664 color = PERF_COLOR_MAGENTA;
665 else if (ratio > 5.0)
666 color = PERF_COLOR_YELLOW;
667 670
668 fprintf(output, " # "); 671 fprintf(output, " # ");
669 color_fprintf(output, color, "%6.2f%%", ratio); 672 color_fprintf(output, color, "%6.2f%%", ratio);
@@ -680,13 +683,7 @@ static void print_l1_icache_misses(int cpu, struct perf_evsel *evsel __used, dou
680 if (total) 683 if (total)
681 ratio = avg / total * 100.0; 684 ratio = avg / total * 100.0;
682 685
683 color = PERF_COLOR_NORMAL; 686 color = get_ratio_color(GRC_CACHE_MISSES, ratio);
684 if (ratio > 20.0)
685 color = PERF_COLOR_RED;
686 else if (ratio > 10.0)
687 color = PERF_COLOR_MAGENTA;
688 else if (ratio > 5.0)
689 color = PERF_COLOR_YELLOW;
690 687
691 fprintf(output, " # "); 688 fprintf(output, " # ");
692 color_fprintf(output, color, "%6.2f%%", ratio); 689 color_fprintf(output, color, "%6.2f%%", ratio);
@@ -703,13 +700,7 @@ static void print_dtlb_cache_misses(int cpu, struct perf_evsel *evsel __used, do
703 if (total) 700 if (total)
704 ratio = avg / total * 100.0; 701 ratio = avg / total * 100.0;
705 702
706 color = PERF_COLOR_NORMAL; 703 color = get_ratio_color(GRC_CACHE_MISSES, ratio);
707 if (ratio > 20.0)
708 color = PERF_COLOR_RED;
709 else if (ratio > 10.0)
710 color = PERF_COLOR_MAGENTA;
711 else if (ratio > 5.0)
712 color = PERF_COLOR_YELLOW;
713 704
714 fprintf(output, " # "); 705 fprintf(output, " # ");
715 color_fprintf(output, color, "%6.2f%%", ratio); 706 color_fprintf(output, color, "%6.2f%%", ratio);
@@ -726,13 +717,7 @@ static void print_itlb_cache_misses(int cpu, struct perf_evsel *evsel __used, do
726 if (total) 717 if (total)
727 ratio = avg / total * 100.0; 718 ratio = avg / total * 100.0;
728 719
729 color = PERF_COLOR_NORMAL; 720 color = get_ratio_color(GRC_CACHE_MISSES, ratio);
730 if (ratio > 20.0)
731 color = PERF_COLOR_RED;
732 else if (ratio > 10.0)
733 color = PERF_COLOR_MAGENTA;
734 else if (ratio > 5.0)
735 color = PERF_COLOR_YELLOW;
736 721
737 fprintf(output, " # "); 722 fprintf(output, " # ");
738 color_fprintf(output, color, "%6.2f%%", ratio); 723 color_fprintf(output, color, "%6.2f%%", ratio);
@@ -749,13 +734,7 @@ static void print_ll_cache_misses(int cpu, struct perf_evsel *evsel __used, doub
749 if (total) 734 if (total)
750 ratio = avg / total * 100.0; 735 ratio = avg / total * 100.0;
751 736
752 color = PERF_COLOR_NORMAL; 737 color = get_ratio_color(GRC_CACHE_MISSES, ratio);
753 if (ratio > 20.0)
754 color = PERF_COLOR_RED;
755 else if (ratio > 10.0)
756 color = PERF_COLOR_MAGENTA;
757 else if (ratio > 5.0)
758 color = PERF_COLOR_YELLOW;
759 738
760 fprintf(output, " # "); 739 fprintf(output, " # ");
761 color_fprintf(output, color, "%6.2f%%", ratio); 740 color_fprintf(output, color, "%6.2f%%", ratio);
@@ -1108,22 +1087,13 @@ static const struct option options[] = {
1108 */ 1087 */
1109static int add_default_attributes(void) 1088static int add_default_attributes(void)
1110{ 1089{
1111 struct perf_evsel *pos;
1112 size_t attr_nr = 0;
1113 size_t c;
1114
1115 /* Set attrs if no event is selected and !null_run: */ 1090 /* Set attrs if no event is selected and !null_run: */
1116 if (null_run) 1091 if (null_run)
1117 return 0; 1092 return 0;
1118 1093
1119 if (!evsel_list->nr_entries) { 1094 if (!evsel_list->nr_entries) {
1120 for (c = 0; c < ARRAY_SIZE(default_attrs); c++) { 1095 if (perf_evlist__add_attrs_array(evsel_list, default_attrs) < 0)
1121 pos = perf_evsel__new(default_attrs + c, c + attr_nr); 1096 return -1;
1122 if (pos == NULL)
1123 return -1;
1124 perf_evlist__add(evsel_list, pos);
1125 }
1126 attr_nr += c;
1127 } 1097 }
1128 1098
1129 /* Detailed events get appended to the event list: */ 1099 /* Detailed events get appended to the event list: */
@@ -1132,38 +1102,21 @@ static int add_default_attributes(void)
1132 return 0; 1102 return 0;
1133 1103
1134 /* Append detailed run extra attributes: */ 1104 /* Append detailed run extra attributes: */
1135 for (c = 0; c < ARRAY_SIZE(detailed_attrs); c++) { 1105 if (perf_evlist__add_attrs_array(evsel_list, detailed_attrs) < 0)
1136 pos = perf_evsel__new(detailed_attrs + c, c + attr_nr); 1106 return -1;
1137 if (pos == NULL)
1138 return -1;
1139 perf_evlist__add(evsel_list, pos);
1140 }
1141 attr_nr += c;
1142 1107
1143 if (detailed_run < 2) 1108 if (detailed_run < 2)
1144 return 0; 1109 return 0;
1145 1110
1146 /* Append very detailed run extra attributes: */ 1111 /* Append very detailed run extra attributes: */
1147 for (c = 0; c < ARRAY_SIZE(very_detailed_attrs); c++) { 1112 if (perf_evlist__add_attrs_array(evsel_list, very_detailed_attrs) < 0)
1148 pos = perf_evsel__new(very_detailed_attrs + c, c + attr_nr); 1113 return -1;
1149 if (pos == NULL)
1150 return -1;
1151 perf_evlist__add(evsel_list, pos);
1152 }
1153 1114
1154 if (detailed_run < 3) 1115 if (detailed_run < 3)
1155 return 0; 1116 return 0;
1156 1117
1157 /* Append very, very detailed run extra attributes: */ 1118 /* Append very, very detailed run extra attributes: */
1158 for (c = 0; c < ARRAY_SIZE(very_very_detailed_attrs); c++) { 1119 return perf_evlist__add_attrs_array(evsel_list, very_very_detailed_attrs);
1159 pos = perf_evsel__new(very_very_detailed_attrs + c, c + attr_nr);
1160 if (pos == NULL)
1161 return -1;
1162 perf_evlist__add(evsel_list, pos);
1163 }
1164
1165
1166 return 0;
1167} 1120}
1168 1121
1169int cmd_stat(int argc, const char **argv, const char *prefix __used) 1122int cmd_stat(int argc, const char **argv, const char *prefix __used)
@@ -1267,8 +1220,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
1267 1220
1268 list_for_each_entry(pos, &evsel_list->entries, node) { 1221 list_for_each_entry(pos, &evsel_list->entries, node) {
1269 if (perf_evsel__alloc_stat_priv(pos) < 0 || 1222 if (perf_evsel__alloc_stat_priv(pos) < 0 ||
1270 perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0 || 1223 perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0)
1271 perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, evsel_list->threads->nr) < 0)
1272 goto out_free_fd; 1224 goto out_free_fd;
1273 } 1225 }
1274 1226
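In add_default_attributes(), four open-coded loops over the *_attrs arrays become calls to perf_evlist__add_attrs_array(). Judging from the removed code, the helper must wrap the same perf_evsel__new() + perf_evlist__add() sequence for a whole array, roughly:

        /* sketch modeled on the removed loop; the real helper may differ */
        static int add_attrs_array(struct perf_evlist *evlist,
                                   struct perf_event_attr *attrs, size_t nr_attrs)
        {
                size_t i;

                for (i = 0; i < nr_attrs; i++) {
                        /* assumes perf_evlist__add() bumps evlist->nr_entries */
                        struct perf_evsel *evsel =
                                perf_evsel__new(attrs + i, evlist->nr_entries);

                        if (evsel == NULL)
                                return -1;
                        perf_evlist__add(evlist, evsel);
                }
                return 0;
        }

The dropped perf_evsel__alloc_fd() call in cmd_stat() suggests the fd arrays are now allocated when the events are opened, rather than up front.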
diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 831d1baeac37..2b9a7f497a20 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -7,6 +7,7 @@
7 7
8#include "util/cache.h" 8#include "util/cache.h"
9#include "util/debug.h" 9#include "util/debug.h"
10#include "util/debugfs.h"
10#include "util/evlist.h" 11#include "util/evlist.h"
11#include "util/parse-options.h" 12#include "util/parse-options.h"
12#include "util/parse-events.h" 13#include "util/parse-events.h"
@@ -14,8 +15,6 @@
14#include "util/thread_map.h" 15#include "util/thread_map.h"
15#include "../../include/linux/hw_breakpoint.h" 16#include "../../include/linux/hw_breakpoint.h"
16 17
17static long page_size;
18
19static int vmlinux_matches_kallsyms_filter(struct map *map __used, struct symbol *sym) 18static int vmlinux_matches_kallsyms_filter(struct map *map __used, struct symbol *sym)
20{ 19{
21 bool *visited = symbol__priv(sym); 20 bool *visited = symbol__priv(sym);
@@ -31,6 +30,7 @@ static int test__vmlinux_matches_kallsyms(void)
31 struct map *kallsyms_map, *vmlinux_map; 30 struct map *kallsyms_map, *vmlinux_map;
32 struct machine kallsyms, vmlinux; 31 struct machine kallsyms, vmlinux;
33 enum map_type type = MAP__FUNCTION; 32 enum map_type type = MAP__FUNCTION;
33 long page_size = sysconf(_SC_PAGE_SIZE);
34 struct ref_reloc_sym ref_reloc_sym = { .name = "_stext", }; 34 struct ref_reloc_sym ref_reloc_sym = { .name = "_stext", };
35 35
36 /* 36 /*
@@ -247,7 +247,7 @@ static int trace_event__id(const char *evname)
247 247
248 if (asprintf(&filename, 248 if (asprintf(&filename,
249 "%s/syscalls/%s/id", 249 "%s/syscalls/%s/id",
250 debugfs_path, evname) < 0) 250 tracing_events_path, evname) < 0)
251 return -1; 251 return -1;
252 252
253 fd = open(filename, O_RDONLY); 253 fd = open(filename, O_RDONLY);
@@ -603,7 +603,7 @@ out_free_threads:
603 603
604#define TEST_ASSERT_VAL(text, cond) \ 604#define TEST_ASSERT_VAL(text, cond) \
605do { \ 605do { \
606 if (!cond) { \ 606 if (!(cond)) { \
607 pr_debug("FAILED %s:%d %s\n", __FILE__, __LINE__, text); \ 607 pr_debug("FAILED %s:%d %s\n", __FILE__, __LINE__, text); \
608 return -1; \ 608 return -1; \
609 } \ 609 } \
@@ -759,6 +759,103 @@ static int test__checkevent_breakpoint_w(struct perf_evlist *evlist)
759 return 0; 759 return 0;
760} 760}
761 761
762static int test__checkevent_tracepoint_modifier(struct perf_evlist *evlist)
763{
764 struct perf_evsel *evsel = list_entry(evlist->entries.next,
765 struct perf_evsel, node);
766
767 TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
768 TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
769 TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
770 TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
771
772 return test__checkevent_tracepoint(evlist);
773}
774
775static int
776test__checkevent_tracepoint_multi_modifier(struct perf_evlist *evlist)
777{
778 struct perf_evsel *evsel;
779
780 TEST_ASSERT_VAL("wrong number of entries", evlist->nr_entries > 1);
781
782 list_for_each_entry(evsel, &evlist->entries, node) {
783 TEST_ASSERT_VAL("wrong exclude_user",
784 !evsel->attr.exclude_user);
785 TEST_ASSERT_VAL("wrong exclude_kernel",
786 evsel->attr.exclude_kernel);
787 TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
788 TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
789 }
790
791 return test__checkevent_tracepoint_multi(evlist);
792}
793
794static int test__checkevent_raw_modifier(struct perf_evlist *evlist)
795{
796 struct perf_evsel *evsel = list_entry(evlist->entries.next,
797 struct perf_evsel, node);
798
799 TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
800 TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
801 TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
802 TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip);
803
804 return test__checkevent_raw(evlist);
805}
806
807static int test__checkevent_numeric_modifier(struct perf_evlist *evlist)
808{
809 struct perf_evsel *evsel = list_entry(evlist->entries.next,
810 struct perf_evsel, node);
811
812 TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
813 TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
814 TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
815 TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip);
816
817 return test__checkevent_numeric(evlist);
818}
819
820static int test__checkevent_symbolic_name_modifier(struct perf_evlist *evlist)
821{
822 struct perf_evsel *evsel = list_entry(evlist->entries.next,
823 struct perf_evsel, node);
824
825 TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
826 TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
827 TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
828 TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
829
830 return test__checkevent_symbolic_name(evlist);
831}
832
833static int test__checkevent_symbolic_alias_modifier(struct perf_evlist *evlist)
834{
835 struct perf_evsel *evsel = list_entry(evlist->entries.next,
836 struct perf_evsel, node);
837
838 TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
839 TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
840 TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
841 TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
842
843 return test__checkevent_symbolic_alias(evlist);
844}
845
846static int test__checkevent_genhw_modifier(struct perf_evlist *evlist)
847{
848 struct perf_evsel *evsel = list_entry(evlist->entries.next,
849 struct perf_evsel, node);
850
851 TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
852 TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
853 TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
854 TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip);
855
856 return test__checkevent_genhw(evlist);
857}
858
762static struct test__event_st { 859static struct test__event_st {
763 const char *name; 860 const char *name;
764 __u32 type; 861 __u32 type;
@@ -808,6 +905,34 @@ static struct test__event_st {
808 .name = "mem:0:w", 905 .name = "mem:0:w",
809 .check = test__checkevent_breakpoint_w, 906 .check = test__checkevent_breakpoint_w,
810 }, 907 },
908 {
909 .name = "syscalls:sys_enter_open:k",
910 .check = test__checkevent_tracepoint_modifier,
911 },
912 {
913 .name = "syscalls:*:u",
914 .check = test__checkevent_tracepoint_multi_modifier,
915 },
916 {
917 .name = "r1:kp",
918 .check = test__checkevent_raw_modifier,
919 },
920 {
921 .name = "1:1:hp",
922 .check = test__checkevent_numeric_modifier,
923 },
924 {
925 .name = "instructions:h",
926 .check = test__checkevent_symbolic_name_modifier,
927 },
928 {
929 .name = "faults:u",
930 .check = test__checkevent_symbolic_alias_modifier,
931 },
932 {
933 .name = "L1-dcache-load-miss:kp",
934 .check = test__checkevent_genhw_modifier,
935 },
811}; 936};
812 937
813#define TEST__EVENTS_CNT (sizeof(test__events) / sizeof(struct test__event_st)) 938#define TEST__EVENTS_CNT (sizeof(test__events) / sizeof(struct test__event_st))
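The new entries exercise the event-modifier suffixes across every event class. Reading the expectations back, the modifiers map onto perf_event_attr as: 'u' counts user space only (exclude_kernel and exclude_hv set), 'k' kernel only (exclude_user and exclude_hv set), 'h' hypervisor only (exclude_user and exclude_kernel set), and 'p' requests precise_ip. For instance, "syscalls:sys_enter_open:k" is expected to yield:

        /* attr bits the ":k" modifier test checks for, shown for illustration */
        attr.exclude_user   = 1;
        attr.exclude_hv     = 1;
        attr.exclude_kernel = 0;
        attr.precise_ip     = 0;

while "r1:kp" expects the same exclusions with a non-zero precise_ip.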
@@ -841,6 +966,336 @@ static int test__parse_events(void)
841 966
842 return ret; 967 return ret;
843} 968}
969
970static int sched__get_first_possible_cpu(pid_t pid, cpu_set_t **maskp,
971 size_t *sizep)
972{
973 cpu_set_t *mask;
974 size_t size;
975 int i, cpu = -1, nrcpus = 1024;
976realloc:
977 mask = CPU_ALLOC(nrcpus);
978 size = CPU_ALLOC_SIZE(nrcpus);
979 CPU_ZERO_S(size, mask);
980
981 if (sched_getaffinity(pid, size, mask) == -1) {
982 CPU_FREE(mask);
983 if (errno == EINVAL && nrcpus < (1024 << 8)) {
984 nrcpus = nrcpus << 2;
985 goto realloc;
986 }
987 perror("sched_getaffinity");
988 return -1;
989 }
990
991 for (i = 0; i < nrcpus; i++) {
992 if (CPU_ISSET_S(i, size, mask)) {
993 if (cpu == -1) {
994 cpu = i;
995 *maskp = mask;
996 *sizep = size;
997 } else
998 CPU_CLR_S(i, size, mask);
999 }
1000 }
1001
1002 if (cpu == -1)
1003 CPU_FREE(mask);
1004
1005 return cpu;
1006}
1007
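sched__get_first_possible_cpu() sizes its cpu_set_t dynamically: when sched_getaffinity() fails with EINVAL the mask was too small for the kernel's CPU count, so the buffer is quadrupled (capped at 1024 << 8 CPUs) and the call retried. On success it keeps only the first set CPU, clearing the rest with CPU_CLR_S(), so the very same mask can be handed back to the scheduler later in the test:

        /* pin the forked workload to the single remaining CPU in the mask */
        if (sched_setaffinity(evlist->workload.pid, cpu_mask_size, cpu_mask) < 0) {
                pr_debug("sched_setaffinity: %s\n", strerror(errno));
                goto out_free_cpu_mask;
        }

which is what lets the test assert sample.cpu == cpu for every sample it reads back.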
1008static int test__PERF_RECORD(void)
1009{
1010 struct perf_record_opts opts = {
1011 .target_pid = -1,
1012 .target_tid = -1,
1013 .no_delay = true,
1014 .freq = 10,
1015 .mmap_pages = 256,
1016 .sample_id_all_avail = true,
1017 };
1018 cpu_set_t *cpu_mask = NULL;
1019 size_t cpu_mask_size = 0;
1020 struct perf_evlist *evlist = perf_evlist__new(NULL, NULL);
1021 struct perf_evsel *evsel;
1022 struct perf_sample sample;
1023 const char *cmd = "sleep";
1024 const char *argv[] = { cmd, "1", NULL, };
1025 char *bname;
1026 u64 sample_type, prev_time = 0;
1027 bool found_cmd_mmap = false,
1028 found_libc_mmap = false,
1029 found_vdso_mmap = false,
1030 found_ld_mmap = false;
1031 int err = -1, errs = 0, i, wakeups = 0, sample_size;
1032 u32 cpu;
1033 int total_events = 0, nr_events[PERF_RECORD_MAX] = { 0, };
1034
1035 if (evlist == NULL || argv == NULL) {
1036 pr_debug("Not enough memory to create evlist\n");
1037 goto out;
1038 }
1039
1040 /*
1041 * We need at least one evsel in the evlist, use the default
1042 * one: "cycles".
1043 */
1044 err = perf_evlist__add_default(evlist);
1045 if (err < 0) {
1046 pr_debug("Not enough memory to create evsel\n");
1047 goto out_delete_evlist;
1048 }
1049
1050 /*
1051 * Create maps of threads and cpus to monitor. In this case
1052 * we start with all threads and cpus (-1, -1) but then in
1053 * perf_evlist__prepare_workload we'll fill in the only thread
1054 * we're monitoring, the one forked there.
1055 */
1056 err = perf_evlist__create_maps(evlist, opts.target_pid,
1057 opts.target_tid, opts.cpu_list);
1058 if (err < 0) {
1059 pr_debug("Not enough memory to create thread/cpu maps\n");
1060 goto out_delete_evlist;
1061 }
1062
1063 /*
1064 * Prepare the workload in argv[] to run: this forks it, then waits
1065 * for perf_evlist__start_workload() to exec it. This is done this way
1066 * so that we have time to open the evlist (calling sys_perf_event_open
1067 * on all the fds) and then mmap them.
1068 */
1069 err = perf_evlist__prepare_workload(evlist, &opts, argv);
1070 if (err < 0) {
1071 pr_debug("Couldn't run the workload!\n");
1072 goto out_delete_evlist;
1073 }
1074
1075 /*
1076 * Config the evsels, setting attr->comm on the first one, etc.
1077 */
1078 evsel = list_entry(evlist->entries.next, struct perf_evsel, node);
1079 evsel->attr.sample_type |= PERF_SAMPLE_CPU;
1080 evsel->attr.sample_type |= PERF_SAMPLE_TID;
1081 evsel->attr.sample_type |= PERF_SAMPLE_TIME;
1082 perf_evlist__config_attrs(evlist, &opts);
1083
1084 err = sched__get_first_possible_cpu(evlist->workload.pid, &cpu_mask,
1085 &cpu_mask_size);
1086 if (err < 0) {
1087 pr_debug("sched__get_first_possible_cpu: %s\n", strerror(errno));
1088 goto out_delete_evlist;
1089 }
1090
1091 cpu = err;
1092
1093 /*
1094 * So that we can check perf_sample.cpu on all the samples.
1095 */
1096 if (sched_setaffinity(evlist->workload.pid, cpu_mask_size, cpu_mask) < 0) {
1097 pr_debug("sched_setaffinity: %s\n", strerror(errno));
1098 goto out_free_cpu_mask;
1099 }
1100
1101 /*
1102 * Call sys_perf_event_open on all the fds on all the evsels,
1103 * grouping them if asked to.
1104 */
1105 err = perf_evlist__open(evlist, opts.group);
1106 if (err < 0) {
1107 pr_debug("perf_evlist__open: %s\n", strerror(errno));
1108 goto out_delete_evlist;
1109 }
1110
1111 /*
1112 * mmap the first fd on a given CPU and ask for events for the other
1113 * fds in the same CPU to be injected in the same mmap ring buffer
1114 * (using ioctl(PERF_EVENT_IOC_SET_OUTPUT)).
1115 */
1116 err = perf_evlist__mmap(evlist, opts.mmap_pages, false);
1117 if (err < 0) {
1118 pr_debug("perf_evlist__mmap: %s\n", strerror(errno));
1119 goto out_delete_evlist;
1120 }
1121
1122 /*
1123 * We'll need these two to parse the PERF_SAMPLE_* fields in each
1124 * event.
1125 */
1126 sample_type = perf_evlist__sample_type(evlist);
1127 sample_size = __perf_evsel__sample_size(sample_type);
1128
1129 /*
1130 * Now that all is properly set up, enable the events; they will
1131 * count only on workload.pid, which will start...
1132 */
1133 perf_evlist__enable(evlist);
1134
1135 /*
1136 * Now!
1137 */
1138 perf_evlist__start_workload(evlist);
1139
1140 while (1) {
1141 int before = total_events;
1142
1143 for (i = 0; i < evlist->nr_mmaps; i++) {
1144 union perf_event *event;
1145
1146 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
1147 const u32 type = event->header.type;
1148 const char *name = perf_event__name(type);
1149
1150 ++total_events;
1151 if (type < PERF_RECORD_MAX)
1152 nr_events[type]++;
1153
1154 err = perf_event__parse_sample(event, sample_type,
1155 sample_size, true,
1156 &sample, false);
1157 if (err < 0) {
1158 if (verbose)
1159 perf_event__fprintf(event, stderr);
1160 pr_debug("Couldn't parse sample\n");
1161 goto out_err;
1162 }
1163
1164 if (verbose) {
1165 pr_info("%" PRIu64" %d ", sample.time, sample.cpu);
1166 perf_event__fprintf(event, stderr);
1167 }
1168
1169 if (prev_time > sample.time) {
1170 pr_debug("%s going backwards in time, prev=%" PRIu64 ", curr=%" PRIu64 "\n",
1171 name, prev_time, sample.time);
1172 ++errs;
1173 }
1174
1175 prev_time = sample.time;
1176
1177 if (sample.cpu != cpu) {
1178 pr_debug("%s with unexpected cpu, expected %d, got %d\n",
1179 name, cpu, sample.cpu);
1180 ++errs;
1181 }
1182
1183 if ((pid_t)sample.pid != evlist->workload.pid) {
1184 pr_debug("%s with unexpected pid, expected %d, got %d\n",
1185 name, evlist->workload.pid, sample.pid);
1186 ++errs;
1187 }
1188
1189 if ((pid_t)sample.tid != evlist->workload.pid) {
1190 pr_debug("%s with unexpected tid, expected %d, got %d\n",
1191 name, evlist->workload.pid, sample.tid);
1192 ++errs;
1193 }
1194
1195 if ((type == PERF_RECORD_COMM ||
1196 type == PERF_RECORD_MMAP ||
1197 type == PERF_RECORD_FORK ||
1198 type == PERF_RECORD_EXIT) &&
1199 (pid_t)event->comm.pid != evlist->workload.pid) {
1200 pr_debug("%s with unexpected pid/tid\n", name);
1201 ++errs;
1202 }
1203
1204 if ((type == PERF_RECORD_COMM ||
1205 type == PERF_RECORD_MMAP) &&
1206 event->comm.pid != event->comm.tid) {
1207 pr_debug("%s with different pid/tid!\n", name);
1208 ++errs;
1209 }
1210
1211 switch (type) {
1212 case PERF_RECORD_COMM:
1213 if (strcmp(event->comm.comm, cmd)) {
1214 pr_debug("%s with unexpected comm!\n", name);
1215 ++errs;
1216 }
1217 break;
1218 case PERF_RECORD_EXIT:
1219 goto found_exit;
1220 case PERF_RECORD_MMAP:
1221 bname = strrchr(event->mmap.filename, '/');
1222 if (bname != NULL) {
1223 if (!found_cmd_mmap)
1224 found_cmd_mmap = !strcmp(bname + 1, cmd);
1225 if (!found_libc_mmap)
1226 found_libc_mmap = !strncmp(bname + 1, "libc", 4);
1227 if (!found_ld_mmap)
1228 found_ld_mmap = !strncmp(bname + 1, "ld", 2);
1229 } else if (!found_vdso_mmap)
1230 found_vdso_mmap = !strcmp(event->mmap.filename, "[vdso]");
1231 break;
1232
1233 case PERF_RECORD_SAMPLE:
1234 /* Just ignore samples for now */
1235 break;
1236 default:
1237 pr_debug("Unexpected perf_event->header.type %d!\n",
1238 type);
1239 ++errs;
1240 }
1241 }
1242 }
1243
1244 /*
1245 * We don't use poll here because, at least as of 3.1, the
1246 * PERF_RECORD_{!SAMPLE} events don't honour
1247 * perf_event_attr.wakeup_events; only PERF_RECORD_SAMPLE does.
1248 */
1249 if (total_events == before && false)
1250 poll(evlist->pollfd, evlist->nr_fds, -1);
1251
1252 sleep(1);
1253 if (++wakeups > 5) {
1254 pr_debug("No PERF_RECORD_EXIT event!\n");
1255 break;
1256 }
1257 }
1258
1259found_exit:
1260 if (nr_events[PERF_RECORD_COMM] > 1) {
1261 pr_debug("Excessive number of PERF_RECORD_COMM events!\n");
1262 ++errs;
1263 }
1264
1265 if (nr_events[PERF_RECORD_COMM] == 0) {
1266 pr_debug("Missing PERF_RECORD_COMM for %s!\n", cmd);
1267 ++errs;
1268 }
1269
1270 if (!found_cmd_mmap) {
1271 pr_debug("PERF_RECORD_MMAP for %s missing!\n", cmd);
1272 ++errs;
1273 }
1274
1275 if (!found_libc_mmap) {
1276 pr_debug("PERF_RECORD_MMAP for %s missing!\n", "libc");
1277 ++errs;
1278 }
1279
1280 if (!found_ld_mmap) {
1281 pr_debug("PERF_RECORD_MMAP for %s missing!\n", "ld");
1282 ++errs;
1283 }
1284
1285 if (!found_vdso_mmap) {
1286 pr_debug("PERF_RECORD_MMAP for %s missing!\n", "[vdso]");
1287 ++errs;
1288 }
1289out_err:
1290 perf_evlist__munmap(evlist);
1291out_free_cpu_mask:
1292 CPU_FREE(cpu_mask);
1293out_delete_evlist:
1294 perf_evlist__delete(evlist);
1295out:
1296 return (err < 0 || errs > 0) ? -1 : 0;
1297}
1298
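Taken as a whole, test__PERF_RECORD is a miniature 'perf record'. It builds an evlist with the default cycles event, forks a "sleep 1" workload pinned to one CPU, opens and mmaps the events, and then validates everything it pulls out of the ring buffers: timestamps must not go backwards, cpu/pid/tid must match the pinned workload, and the expected PERF_RECORD_COMM/MMAP/EXIT records must all show up. The consume loop is the standard evlist idiom:

        for (i = 0; i < evlist->nr_mmaps; i++) {
                union perf_event *event;

                while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
                        /* parse with perf_event__parse_sample() and check fields */
                }
        }

The poll() call is deliberately compiled out with '&& false': per the in-loop comment, non-sample records did not honour wakeup_events at the time, so the test falls back to sleep(1) with a five-wakeup cap before declaring the EXIT record missing.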
844static struct test { 1299static struct test {
845 const char *desc; 1300 const char *desc;
846 int (*func)(void); 1301 int (*func)(void);
@@ -866,45 +1321,89 @@ static struct test {
866 .func = test__parse_events, 1321 .func = test__parse_events,
867 }, 1322 },
868 { 1323 {
1324 .desc = "Validate PERF_RECORD_* events & perf_sample fields",
1325 .func = test__PERF_RECORD,
1326 },
1327 {
869 .func = NULL, 1328 .func = NULL,
870 }, 1329 },
871}; 1330};
872 1331
873static int __cmd_test(void) 1332static bool perf_test__matches(int curr, int argc, const char *argv[])
874{ 1333{
875 int i = 0; 1334 int i;
1335
1336 if (argc == 0)
1337 return true;
876 1338
877 page_size = sysconf(_SC_PAGE_SIZE); 1339 for (i = 0; i < argc; ++i) {
1340 char *end;
1341 long nr = strtoul(argv[i], &end, 10);
1342
1343 if (*end == '\0') {
1344 if (nr == curr + 1)
1345 return true;
1346 continue;
1347 }
1348
1349 if (strstr(tests[curr].desc, argv[i]))
1350 return true;
1351 }
1352
1353 return false;
1354}
1355
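perf_test__matches() lets 'perf test' select tests either by 1-based number or by name fragment: an argument that parses fully as a number selects by position (nr == curr + 1), anything else is matched with strstr() against the test description, and no arguments runs everything. Combined with the 'list' subcommand handled in cmd_test() below, this enables invocations such as 'perf test list', 'perf test 1', or 'perf test PERF_RECORD'.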
1356static int __cmd_test(int argc, const char *argv[])
1357{
1358 int i = 0;
878 1359
879 while (tests[i].func) { 1360 while (tests[i].func) {
880 int err; 1361 int curr = i++, err;
881 pr_info("%2d: %s:", i + 1, tests[i].desc); 1362
1363 if (!perf_test__matches(curr, argc, argv))
1364 continue;
1365
1366 pr_info("%2d: %s:", i, tests[curr].desc);
882 pr_debug("\n--- start ---\n"); 1367 pr_debug("\n--- start ---\n");
883 err = tests[i].func(); 1368 err = tests[curr].func();
884 pr_debug("---- end ----\n%s:", tests[i].desc); 1369 pr_debug("---- end ----\n%s:", tests[curr].desc);
885 pr_info(" %s\n", err ? "FAILED!\n" : "Ok"); 1370 pr_info(" %s\n", err ? "FAILED!\n" : "Ok");
886 ++i;
887 } 1371 }
888 1372
889 return 0; 1373 return 0;
890} 1374}
891 1375
892static const char * const test_usage[] = { 1376static int perf_test__list(int argc, const char **argv)
893 "perf test [<options>]", 1377{
894 NULL, 1378 int i = 0;
895}; 1379
1380 while (tests[i].func) {
1381 int curr = i++;
896 1382
897static const struct option test_options[] = { 1383 if (argc > 1 && !strstr(tests[curr].desc, argv[1]))
1384 continue;
1385
1386 pr_info("%2d: %s\n", i, tests[curr].desc);
1387 }
1388
1389 return 0;
1390}
1391
1392int cmd_test(int argc, const char **argv, const char *prefix __used)
1393{
1394 const char * const test_usage[] = {
1395 "perf test [<options>] [{list <test-name-fragment>|[<test-name-fragments>|<test-numbers>]}]",
1396 NULL,
1397 };
1398 const struct option test_options[] = {
898 OPT_INTEGER('v', "verbose", &verbose, 1399 OPT_INTEGER('v', "verbose", &verbose,
899 "be more verbose (show symbol address, etc)"), 1400 "be more verbose (show symbol address, etc)"),
900 OPT_END() 1401 OPT_END()
901}; 1402 };
902 1403
903int cmd_test(int argc, const char **argv, const char *prefix __used)
904{
905 argc = parse_options(argc, argv, test_options, test_usage, 0); 1404 argc = parse_options(argc, argv, test_options, test_usage, 0);
906 if (argc) 1405 if (argc >= 1 && !strcmp(argv[0], "list"))
907 usage_with_options(test_usage, test_options); 1406 return perf_test__list(argc, argv);
908 1407
909 symbol_conf.priv_size = sizeof(int); 1408 symbol_conf.priv_size = sizeof(int);
910 symbol_conf.sort_by_name = true; 1409 symbol_conf.sort_by_name = true;
@@ -915,5 +1414,5 @@ int cmd_test(int argc, const char **argv, const char *prefix __used)
915 1414
916 setup_pager(); 1415 setup_pager();
917 1416
918 return __cmd_test(); 1417 return __cmd_test(argc, argv);
919} 1418}
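
The hunk above teaches perf test to accept either test numbers or
description fragments as arguments, and adds a list subcommand. What
follows is a minimal standalone sketch of that selection rule; the
names are hypothetical stand-ins, not the real perf helpers:

/*
 * An argument that parses fully as a number selects a test by its
 * 1-based index; anything else is matched as a substring of the
 * test description, mirroring perf_test__matches() above.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static bool matches(const char *desc, int idx, const char *arg)
{
	char *end;
	long nr = strtol(arg, &end, 10);

	if (*end == '\0')			/* pure number */
		return nr == idx + 1;
	return strstr(desc, arg) != NULL;	/* name fragment */
}

int main(void)
{
	const char *descs[] = { "vmlinux symtab matches kallsyms",
				"parse events", "Validate PERF_RECORD_*" };
	int i;

	for (i = 0; i < 3; i++)
		if (matches(descs[i], i, "parse"))
			printf("%2d: %s\n", i + 1, descs[i]);
	return 0;
}
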
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index aa26f4d66d10..3b75b2e21ea5 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -19,6 +19,7 @@
19#include "util/color.h" 19#include "util/color.h"
20#include <linux/list.h> 20#include <linux/list.h>
21#include "util/cache.h" 21#include "util/cache.h"
22#include "util/evsel.h"
22#include <linux/rbtree.h> 23#include <linux/rbtree.h>
23#include "util/symbol.h" 24#include "util/symbol.h"
24#include "util/callchain.h" 25#include "util/callchain.h"
@@ -31,13 +32,14 @@
31#include "util/event.h" 32#include "util/event.h"
32#include "util/session.h" 33#include "util/session.h"
33#include "util/svghelper.h" 34#include "util/svghelper.h"
35#include "util/tool.h"
34 36
35#define SUPPORT_OLD_POWER_EVENTS 1 37#define SUPPORT_OLD_POWER_EVENTS 1
36#define PWR_EVENT_EXIT -1 38#define PWR_EVENT_EXIT -1
37 39
38 40
39static char const *input_name = "perf.data"; 41static const char *input_name;
40static char const *output_name = "output.svg"; 42static const char *output_name = "output.svg";
41 43
42static unsigned int numcpus; 44static unsigned int numcpus;
43static u64 min_freq; /* Lowest CPU frequency seen */ 45static u64 min_freq; /* Lowest CPU frequency seen */
@@ -273,25 +275,28 @@ static int cpus_cstate_state[MAX_CPUS];
273static u64 cpus_pstate_start_times[MAX_CPUS]; 275static u64 cpus_pstate_start_times[MAX_CPUS];
274static u64 cpus_pstate_state[MAX_CPUS]; 276static u64 cpus_pstate_state[MAX_CPUS];
275 277
276static int process_comm_event(union perf_event *event, 278static int process_comm_event(struct perf_tool *tool __used,
279 union perf_event *event,
277 struct perf_sample *sample __used, 280 struct perf_sample *sample __used,
278 struct perf_session *session __used) 281 struct machine *machine __used)
279{ 282{
280 pid_set_comm(event->comm.tid, event->comm.comm); 283 pid_set_comm(event->comm.tid, event->comm.comm);
281 return 0; 284 return 0;
282} 285}
283 286
284static int process_fork_event(union perf_event *event, 287static int process_fork_event(struct perf_tool *tool __used,
288 union perf_event *event,
285 struct perf_sample *sample __used, 289 struct perf_sample *sample __used,
286 struct perf_session *session __used) 290 struct machine *machine __used)
287{ 291{
288 pid_fork(event->fork.pid, event->fork.ppid, event->fork.time); 292 pid_fork(event->fork.pid, event->fork.ppid, event->fork.time);
289 return 0; 293 return 0;
290} 294}
291 295
292static int process_exit_event(union perf_event *event, 296static int process_exit_event(struct perf_tool *tool __used,
297 union perf_event *event,
293 struct perf_sample *sample __used, 298 struct perf_sample *sample __used,
294 struct perf_session *session __used) 299 struct machine *machine __used)
295{ 300{
296 pid_exit(event->fork.pid, event->fork.time); 301 pid_exit(event->fork.pid, event->fork.time);
297 return 0; 302 return 0;
@@ -486,14 +491,15 @@ static void sched_switch(int cpu, u64 timestamp, struct trace_entry *te)
486} 491}
487 492
488 493
489static int process_sample_event(union perf_event *event __used, 494static int process_sample_event(struct perf_tool *tool __used,
495 union perf_event *event __used,
490 struct perf_sample *sample, 496 struct perf_sample *sample,
491 struct perf_evsel *evsel __used, 497 struct perf_evsel *evsel,
492 struct perf_session *session) 498 struct machine *machine __used)
493{ 499{
494 struct trace_entry *te; 500 struct trace_entry *te;
495 501
496 if (session->sample_type & PERF_SAMPLE_TIME) { 502 if (evsel->attr.sample_type & PERF_SAMPLE_TIME) {
497 if (!first_time || first_time > sample->time) 503 if (!first_time || first_time > sample->time)
498 first_time = sample->time; 504 first_time = sample->time;
499 if (last_time < sample->time) 505 if (last_time < sample->time)
@@ -501,7 +507,7 @@ static int process_sample_event(union perf_event *event __used,
501 } 507 }
502 508
503 te = (void *)sample->raw_data; 509 te = (void *)sample->raw_data;
504 if (session->sample_type & PERF_SAMPLE_RAW && sample->raw_size > 0) { 510 if ((evsel->attr.sample_type & PERF_SAMPLE_RAW) && sample->raw_size > 0) {
505 char *event_str; 511 char *event_str;
506#ifdef SUPPORT_OLD_POWER_EVENTS 512#ifdef SUPPORT_OLD_POWER_EVENTS
507 struct power_entry_old *peo; 513 struct power_entry_old *peo;
@@ -974,7 +980,7 @@ static void write_svg_file(const char *filename)
974 svg_close(); 980 svg_close();
975} 981}
976 982
977static struct perf_event_ops event_ops = { 983static struct perf_tool perf_timechart = {
978 .comm = process_comm_event, 984 .comm = process_comm_event,
979 .fork = process_fork_event, 985 .fork = process_fork_event,
980 .exit = process_exit_event, 986 .exit = process_exit_event,
@@ -985,7 +991,7 @@ static struct perf_event_ops event_ops = {
985static int __cmd_timechart(void) 991static int __cmd_timechart(void)
986{ 992{
987 struct perf_session *session = perf_session__new(input_name, O_RDONLY, 993 struct perf_session *session = perf_session__new(input_name, O_RDONLY,
988 0, false, &event_ops); 994 0, false, &perf_timechart);
989 int ret = -EINVAL; 995 int ret = -EINVAL;
990 996
991 if (session == NULL) 997 if (session == NULL)
@@ -994,7 +1000,7 @@ static int __cmd_timechart(void)
994 if (!perf_session__has_traces(session, "timechart record")) 1000 if (!perf_session__has_traces(session, "timechart record"))
995 goto out_delete; 1001 goto out_delete;
996 1002
997 ret = perf_session__process_events(session, &event_ops); 1003 ret = perf_session__process_events(session, &perf_timechart);
998 if (ret) 1004 if (ret)
999 goto out_delete; 1005 goto out_delete;
1000 1006
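
The timechart conversion above is part of the perf_event_ops ->
perf_tool rename: each handler now takes the tool pointer first and a
struct machine instead of the whole session, and per-event state such
as sample_type is read from the evsel. A rough sketch of the callback
table pattern, using stand-in types rather than the real perf ones:

#include <stdio.h>

struct tool;				/* stand-in for struct perf_tool */
struct event { int pid; };		/* stand-in for union perf_event */

struct tool {
	int (*comm)(struct tool *tool, struct event *ev);
};

static int on_comm(struct tool *tool, struct event *ev)
{
	(void)tool;	/* real handlers recover state from this */
	printf("comm event for pid %d\n", ev->pid);
	return 0;
}

int main(void)
{
	struct tool timechart = { .comm = on_comm };
	struct event ev = { .pid = 42 };

	return timechart.comm(&timechart, &ev);
}
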
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index c9cdedb58134..4f81eeb99875 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -64,44 +64,6 @@
64#include <linux/unistd.h> 64#include <linux/unistd.h>
65#include <linux/types.h> 65#include <linux/types.h>
66 66
67static struct perf_top top = {
68 .count_filter = 5,
69 .delay_secs = 2,
70 .target_pid = -1,
71 .target_tid = -1,
72 .freq = 1000, /* 1 KHz */
73};
74
75static bool system_wide = false;
76
77static bool use_tui, use_stdio;
78
79static bool sort_has_symbols;
80
81static bool dont_use_callchains;
82static char callchain_default_opt[] = "fractal,0.5,callee";
83
84
85static int default_interval = 0;
86
87static bool kptr_restrict_warned;
88static bool vmlinux_warned;
89static bool inherit = false;
90static int realtime_prio = 0;
91static bool group = false;
92static bool sample_id_all_avail = true;
93static unsigned int mmap_pages = 128;
94
95static bool dump_symtab = false;
96
97static struct winsize winsize;
98
99static const char *sym_filter = NULL;
100static int sym_pcnt_filter = 5;
101
102/*
103 * Source functions
104 */
105 67
106void get_term_dimensions(struct winsize *ws) 68void get_term_dimensions(struct winsize *ws)
107{ 69{
@@ -125,21 +87,23 @@ void get_term_dimensions(struct winsize *ws)
125 ws->ws_col = 80; 87 ws->ws_col = 80;
126} 88}
127 89
128static void update_print_entries(struct winsize *ws) 90static void perf_top__update_print_entries(struct perf_top *top)
129{ 91{
130 top.print_entries = ws->ws_row; 92 top->print_entries = top->winsize.ws_row;
131 93
132 if (top.print_entries > 9) 94 if (top->print_entries > 9)
133 top.print_entries -= 9; 95 top->print_entries -= 9;
134} 96}
135 97
136static void sig_winch_handler(int sig __used) 98static void perf_top__sig_winch(int sig __used, siginfo_t *info __used, void *arg)
137{ 99{
138 get_term_dimensions(&winsize); 100 struct perf_top *top = arg;
139 update_print_entries(&winsize); 101
102 get_term_dimensions(&top->winsize);
103 perf_top__update_print_entries(top);
140} 104}
141 105
142static int parse_source(struct hist_entry *he) 106static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he)
143{ 107{
144 struct symbol *sym; 108 struct symbol *sym;
145 struct annotation *notes; 109 struct annotation *notes;
@@ -170,7 +134,7 @@ static int parse_source(struct hist_entry *he)
170 134
171 pthread_mutex_lock(&notes->lock); 135 pthread_mutex_lock(&notes->lock);
172 136
173 if (symbol__alloc_hist(sym, top.evlist->nr_entries) < 0) { 137 if (symbol__alloc_hist(sym) < 0) {
174 pthread_mutex_unlock(&notes->lock); 138 pthread_mutex_unlock(&notes->lock);
175 pr_err("Not enough memory for annotating '%s' symbol!\n", 139 pr_err("Not enough memory for annotating '%s' symbol!\n",
176 sym->name); 140 sym->name);
@@ -181,7 +145,7 @@ static int parse_source(struct hist_entry *he)
181 err = symbol__annotate(sym, map, 0); 145 err = symbol__annotate(sym, map, 0);
182 if (err == 0) { 146 if (err == 0) {
183out_assign: 147out_assign:
184 top.sym_filter_entry = he; 148 top->sym_filter_entry = he;
185 } 149 }
186 150
187 pthread_mutex_unlock(&notes->lock); 151 pthread_mutex_unlock(&notes->lock);
@@ -194,14 +158,16 @@ static void __zero_source_counters(struct hist_entry *he)
194 symbol__annotate_zero_histograms(sym); 158 symbol__annotate_zero_histograms(sym);
195} 159}
196 160
197static void record_precise_ip(struct hist_entry *he, int counter, u64 ip) 161static void perf_top__record_precise_ip(struct perf_top *top,
162 struct hist_entry *he,
163 int counter, u64 ip)
198{ 164{
199 struct annotation *notes; 165 struct annotation *notes;
200 struct symbol *sym; 166 struct symbol *sym;
201 167
202 if (he == NULL || he->ms.sym == NULL || 168 if (he == NULL || he->ms.sym == NULL ||
203 ((top.sym_filter_entry == NULL || 169 ((top->sym_filter_entry == NULL ||
204 top.sym_filter_entry->ms.sym != he->ms.sym) && use_browser != 1)) 170 top->sym_filter_entry->ms.sym != he->ms.sym) && use_browser != 1))
205 return; 171 return;
206 172
207 sym = he->ms.sym; 173 sym = he->ms.sym;
@@ -210,8 +176,7 @@ static void record_precise_ip(struct hist_entry *he, int counter, u64 ip)
210 if (pthread_mutex_trylock(&notes->lock)) 176 if (pthread_mutex_trylock(&notes->lock))
211 return; 177 return;
212 178
213 if (notes->src == NULL && 179 if (notes->src == NULL && symbol__alloc_hist(sym) < 0) {
214 symbol__alloc_hist(sym, top.evlist->nr_entries) < 0) {
215 pthread_mutex_unlock(&notes->lock); 180 pthread_mutex_unlock(&notes->lock);
216 pr_err("Not enough memory for annotating '%s' symbol!\n", 181 pr_err("Not enough memory for annotating '%s' symbol!\n",
217 sym->name); 182 sym->name);
@@ -225,8 +190,9 @@ static void record_precise_ip(struct hist_entry *he, int counter, u64 ip)
225 pthread_mutex_unlock(&notes->lock); 190 pthread_mutex_unlock(&notes->lock);
226} 191}
227 192
228static void show_details(struct hist_entry *he) 193static void perf_top__show_details(struct perf_top *top)
229{ 194{
195 struct hist_entry *he = top->sym_filter_entry;
230 struct annotation *notes; 196 struct annotation *notes;
231 struct symbol *symbol; 197 struct symbol *symbol;
232 int more; 198 int more;
@@ -242,15 +208,15 @@ static void show_details(struct hist_entry *he)
242 if (notes->src == NULL) 208 if (notes->src == NULL)
243 goto out_unlock; 209 goto out_unlock;
244 210
245 printf("Showing %s for %s\n", event_name(top.sym_evsel), symbol->name); 211 printf("Showing %s for %s\n", event_name(top->sym_evsel), symbol->name);
246 printf(" Events Pcnt (>=%d%%)\n", sym_pcnt_filter); 212 printf(" Events Pcnt (>=%d%%)\n", top->sym_pcnt_filter);
247 213
248 more = symbol__annotate_printf(symbol, he->ms.map, top.sym_evsel->idx, 214 more = symbol__annotate_printf(symbol, he->ms.map, top->sym_evsel->idx,
249 0, sym_pcnt_filter, top.print_entries, 4); 215 0, top->sym_pcnt_filter, top->print_entries, 4);
250 if (top.zero) 216 if (top->zero)
251 symbol__annotate_zero_histogram(symbol, top.sym_evsel->idx); 217 symbol__annotate_zero_histogram(symbol, top->sym_evsel->idx);
252 else 218 else
253 symbol__annotate_decay_histogram(symbol, top.sym_evsel->idx); 219 symbol__annotate_decay_histogram(symbol, top->sym_evsel->idx);
254 if (more != 0) 220 if (more != 0)
255 printf("%d lines not displayed, maybe increase display entries [e]\n", more); 221 printf("%d lines not displayed, maybe increase display entries [e]\n", more);
256out_unlock: 222out_unlock:
@@ -259,11 +225,9 @@ out_unlock:
259 225
260static const char CONSOLE_CLEAR[] = ""; 226static const char CONSOLE_CLEAR[] = "";
261 227
262static struct hist_entry * 228static struct hist_entry *perf_evsel__add_hist_entry(struct perf_evsel *evsel,
263 perf_session__add_hist_entry(struct perf_session *session, 229 struct addr_location *al,
264 struct addr_location *al, 230 struct perf_sample *sample)
265 struct perf_sample *sample,
266 struct perf_evsel *evsel)
267{ 231{
268 struct hist_entry *he; 232 struct hist_entry *he;
269 233
@@ -271,50 +235,51 @@ static struct hist_entry *
271 if (he == NULL) 235 if (he == NULL)
272 return NULL; 236 return NULL;
273 237
274 session->hists.stats.total_period += sample->period; 238 evsel->hists.stats.total_period += sample->period;
275 hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE); 239 hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
276 return he; 240 return he;
277} 241}
278 242
279static void print_sym_table(void) 243static void perf_top__print_sym_table(struct perf_top *top)
280{ 244{
281 char bf[160]; 245 char bf[160];
282 int printed = 0; 246 int printed = 0;
283 const int win_width = winsize.ws_col - 1; 247 const int win_width = top->winsize.ws_col - 1;
284 248
285 puts(CONSOLE_CLEAR); 249 puts(CONSOLE_CLEAR);
286 250
287 perf_top__header_snprintf(&top, bf, sizeof(bf)); 251 perf_top__header_snprintf(top, bf, sizeof(bf));
288 printf("%s\n", bf); 252 printf("%s\n", bf);
289 253
290 perf_top__reset_sample_counters(&top); 254 perf_top__reset_sample_counters(top);
291 255
292 printf("%-*.*s\n", win_width, win_width, graph_dotted_line); 256 printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
293 257
294 if (top.sym_evsel->hists.stats.nr_lost_warned != 258 if (top->sym_evsel->hists.stats.nr_lost_warned !=
295 top.sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST]) { 259 top->sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST]) {
296 top.sym_evsel->hists.stats.nr_lost_warned = 260 top->sym_evsel->hists.stats.nr_lost_warned =
297 top.sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST]; 261 top->sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST];
298 color_fprintf(stdout, PERF_COLOR_RED, 262 color_fprintf(stdout, PERF_COLOR_RED,
299 "WARNING: LOST %d chunks, Check IO/CPU overload", 263 "WARNING: LOST %d chunks, Check IO/CPU overload",
300 top.sym_evsel->hists.stats.nr_lost_warned); 264 top->sym_evsel->hists.stats.nr_lost_warned);
301 ++printed; 265 ++printed;
302 } 266 }
303 267
304 if (top.sym_filter_entry) { 268 if (top->sym_filter_entry) {
305 show_details(top.sym_filter_entry); 269 perf_top__show_details(top);
306 return; 270 return;
307 } 271 }
308 272
309 hists__collapse_resort_threaded(&top.sym_evsel->hists); 273 hists__collapse_resort_threaded(&top->sym_evsel->hists);
310 hists__output_resort_threaded(&top.sym_evsel->hists); 274 hists__output_resort_threaded(&top->sym_evsel->hists);
311 hists__decay_entries_threaded(&top.sym_evsel->hists, 275 hists__decay_entries_threaded(&top->sym_evsel->hists,
312 top.hide_user_symbols, 276 top->hide_user_symbols,
313 top.hide_kernel_symbols); 277 top->hide_kernel_symbols);
314 hists__output_recalc_col_len(&top.sym_evsel->hists, winsize.ws_row - 3); 278 hists__output_recalc_col_len(&top->sym_evsel->hists,
279 top->winsize.ws_row - 3);
315 putchar('\n'); 280 putchar('\n');
316 hists__fprintf(&top.sym_evsel->hists, NULL, false, false, 281 hists__fprintf(&top->sym_evsel->hists, NULL, false, false,
317 winsize.ws_row - 4 - printed, win_width, stdout); 282 top->winsize.ws_row - 4 - printed, win_width, stdout);
318} 283}
319 284
320static void prompt_integer(int *target, const char *msg) 285static void prompt_integer(int *target, const char *msg)
@@ -352,17 +317,17 @@ static void prompt_percent(int *target, const char *msg)
352 *target = tmp; 317 *target = tmp;
353} 318}
354 319
355static void prompt_symbol(struct hist_entry **target, const char *msg) 320static void perf_top__prompt_symbol(struct perf_top *top, const char *msg)
356{ 321{
357 char *buf = malloc(0), *p; 322 char *buf = malloc(0), *p;
358 struct hist_entry *syme = *target, *n, *found = NULL; 323 struct hist_entry *syme = top->sym_filter_entry, *n, *found = NULL;
359 struct rb_node *next; 324 struct rb_node *next;
360 size_t dummy = 0; 325 size_t dummy = 0;
361 326
362 /* zero counters of active symbol */ 327 /* zero counters of active symbol */
363 if (syme) { 328 if (syme) {
364 __zero_source_counters(syme); 329 __zero_source_counters(syme);
365 *target = NULL; 330 top->sym_filter_entry = NULL;
366 } 331 }
367 332
368 fprintf(stdout, "\n%s: ", msg); 333 fprintf(stdout, "\n%s: ", msg);
@@ -373,7 +338,7 @@ static void prompt_symbol(struct hist_entry **target, const char *msg)
373 if (p) 338 if (p)
374 *p = 0; 339 *p = 0;
375 340
376 next = rb_first(&top.sym_evsel->hists.entries); 341 next = rb_first(&top->sym_evsel->hists.entries);
377 while (next) { 342 while (next) {
378 n = rb_entry(next, struct hist_entry, rb_node); 343 n = rb_entry(next, struct hist_entry, rb_node);
379 if (n->ms.sym && !strcmp(buf, n->ms.sym->name)) { 344 if (n->ms.sym && !strcmp(buf, n->ms.sym->name)) {
@@ -386,47 +351,46 @@ static void prompt_symbol(struct hist_entry **target, const char *msg)
386 if (!found) { 351 if (!found) {
387 fprintf(stderr, "Sorry, %s is not active.\n", buf); 352 fprintf(stderr, "Sorry, %s is not active.\n", buf);
388 sleep(1); 353 sleep(1);
389 return;
390 } else 354 } else
391 parse_source(found); 355 perf_top__parse_source(top, found);
392 356
393out_free: 357out_free:
394 free(buf); 358 free(buf);
395} 359}
396 360
397static void print_mapped_keys(void) 361static void perf_top__print_mapped_keys(struct perf_top *top)
398{ 362{
399 char *name = NULL; 363 char *name = NULL;
400 364
401 if (top.sym_filter_entry) { 365 if (top->sym_filter_entry) {
402 struct symbol *sym = top.sym_filter_entry->ms.sym; 366 struct symbol *sym = top->sym_filter_entry->ms.sym;
403 name = sym->name; 367 name = sym->name;
404 } 368 }
405 369
406 fprintf(stdout, "\nMapped keys:\n"); 370 fprintf(stdout, "\nMapped keys:\n");
407 fprintf(stdout, "\t[d] display refresh delay. \t(%d)\n", top.delay_secs); 371 fprintf(stdout, "\t[d] display refresh delay. \t(%d)\n", top->delay_secs);
408 fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", top.print_entries); 372 fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", top->print_entries);
409 373
410 if (top.evlist->nr_entries > 1) 374 if (top->evlist->nr_entries > 1)
411 fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(top.sym_evsel)); 375 fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(top->sym_evsel));
412 376
413 fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", top.count_filter); 377 fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", top->count_filter);
414 378
415 fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", sym_pcnt_filter); 379 fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", top->sym_pcnt_filter);
416 fprintf(stdout, "\t[s] annotate symbol. \t(%s)\n", name?: "NULL"); 380 fprintf(stdout, "\t[s] annotate symbol. \t(%s)\n", name?: "NULL");
417 fprintf(stdout, "\t[S] stop annotation.\n"); 381 fprintf(stdout, "\t[S] stop annotation.\n");
418 382
419 fprintf(stdout, 383 fprintf(stdout,
420 "\t[K] hide kernel_symbols symbols. \t(%s)\n", 384 "\t[K] hide kernel_symbols symbols. \t(%s)\n",
421 top.hide_kernel_symbols ? "yes" : "no"); 385 top->hide_kernel_symbols ? "yes" : "no");
422 fprintf(stdout, 386 fprintf(stdout,
423 "\t[U] hide user symbols. \t(%s)\n", 387 "\t[U] hide user symbols. \t(%s)\n",
424 top.hide_user_symbols ? "yes" : "no"); 388 top->hide_user_symbols ? "yes" : "no");
425 fprintf(stdout, "\t[z] toggle sample zeroing. \t(%d)\n", top.zero ? 1 : 0); 389 fprintf(stdout, "\t[z] toggle sample zeroing. \t(%d)\n", top->zero ? 1 : 0);
426 fprintf(stdout, "\t[qQ] quit.\n"); 390 fprintf(stdout, "\t[qQ] quit.\n");
427} 391}
428 392
429static int key_mapped(int c) 393static int perf_top__key_mapped(struct perf_top *top, int c)
430{ 394{
431 switch (c) { 395 switch (c) {
432 case 'd': 396 case 'd':
@@ -442,7 +406,7 @@ static int key_mapped(int c)
442 case 'S': 406 case 'S':
443 return 1; 407 return 1;
444 case 'E': 408 case 'E':
445 return top.evlist->nr_entries > 1 ? 1 : 0; 409 return top->evlist->nr_entries > 1 ? 1 : 0;
446 default: 410 default:
447 break; 411 break;
448 } 412 }
@@ -450,13 +414,13 @@ static int key_mapped(int c)
450 return 0; 414 return 0;
451} 415}
452 416
453static void handle_keypress(int c) 417static void perf_top__handle_keypress(struct perf_top *top, int c)
454{ 418{
455 if (!key_mapped(c)) { 419 if (!perf_top__key_mapped(top, c)) {
456 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; 420 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
457 struct termios tc, save; 421 struct termios tc, save;
458 422
459 print_mapped_keys(); 423 perf_top__print_mapped_keys(top);
460 fprintf(stdout, "\nEnter selection, or unmapped key to continue: "); 424 fprintf(stdout, "\nEnter selection, or unmapped key to continue: ");
461 fflush(stdout); 425 fflush(stdout);
462 426
@@ -471,81 +435,86 @@ static void handle_keypress(int c)
471 c = getc(stdin); 435 c = getc(stdin);
472 436
473 tcsetattr(0, TCSAFLUSH, &save); 437 tcsetattr(0, TCSAFLUSH, &save);
474 if (!key_mapped(c)) 438 if (!perf_top__key_mapped(top, c))
475 return; 439 return;
476 } 440 }
477 441
478 switch (c) { 442 switch (c) {
479 case 'd': 443 case 'd':
480 prompt_integer(&top.delay_secs, "Enter display delay"); 444 prompt_integer(&top->delay_secs, "Enter display delay");
481 if (top.delay_secs < 1) 445 if (top->delay_secs < 1)
482 top.delay_secs = 1; 446 top->delay_secs = 1;
483 break; 447 break;
484 case 'e': 448 case 'e':
485 prompt_integer(&top.print_entries, "Enter display entries (lines)"); 449 prompt_integer(&top->print_entries, "Enter display entries (lines)");
486 if (top.print_entries == 0) { 450 if (top->print_entries == 0) {
487 sig_winch_handler(SIGWINCH); 451 struct sigaction act = {
488 signal(SIGWINCH, sig_winch_handler); 452 .sa_sigaction = perf_top__sig_winch,
453 .sa_flags = SA_SIGINFO,
454 };
455 perf_top__sig_winch(SIGWINCH, NULL, top);
456 sigaction(SIGWINCH, &act, NULL);
489 } else 457 } else
490 signal(SIGWINCH, SIG_DFL); 458 signal(SIGWINCH, SIG_DFL);
491 break; 459 break;
492 case 'E': 460 case 'E':
493 if (top.evlist->nr_entries > 1) { 461 if (top->evlist->nr_entries > 1) {
494 /* Select 0 as the default event: */ 462 /* Select 0 as the default event: */
495 int counter = 0; 463 int counter = 0;
496 464
497 fprintf(stderr, "\nAvailable events:"); 465 fprintf(stderr, "\nAvailable events:");
498 466
499 list_for_each_entry(top.sym_evsel, &top.evlist->entries, node) 467 list_for_each_entry(top->sym_evsel, &top->evlist->entries, node)
500 fprintf(stderr, "\n\t%d %s", top.sym_evsel->idx, event_name(top.sym_evsel)); 468 fprintf(stderr, "\n\t%d %s", top->sym_evsel->idx, event_name(top->sym_evsel));
501 469
502 prompt_integer(&counter, "Enter details event counter"); 470 prompt_integer(&counter, "Enter details event counter");
503 471
504 if (counter >= top.evlist->nr_entries) { 472 if (counter >= top->evlist->nr_entries) {
505 top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node); 473 top->sym_evsel = list_entry(top->evlist->entries.next, struct perf_evsel, node);
506 fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(top.sym_evsel)); 474 fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(top->sym_evsel));
507 sleep(1); 475 sleep(1);
508 break; 476 break;
509 } 477 }
510 list_for_each_entry(top.sym_evsel, &top.evlist->entries, node) 478 list_for_each_entry(top->sym_evsel, &top->evlist->entries, node)
511 if (top.sym_evsel->idx == counter) 479 if (top->sym_evsel->idx == counter)
512 break; 480 break;
513 } else 481 } else
514 top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node); 482 top->sym_evsel = list_entry(top->evlist->entries.next, struct perf_evsel, node);
515 break; 483 break;
516 case 'f': 484 case 'f':
517 prompt_integer(&top.count_filter, "Enter display event count filter"); 485 prompt_integer(&top->count_filter, "Enter display event count filter");
518 break; 486 break;
519 case 'F': 487 case 'F':
520 prompt_percent(&sym_pcnt_filter, "Enter details display event filter (percent)"); 488 prompt_percent(&top->sym_pcnt_filter,
489 "Enter details display event filter (percent)");
521 break; 490 break;
522 case 'K': 491 case 'K':
523 top.hide_kernel_symbols = !top.hide_kernel_symbols; 492 top->hide_kernel_symbols = !top->hide_kernel_symbols;
524 break; 493 break;
525 case 'q': 494 case 'q':
526 case 'Q': 495 case 'Q':
527 printf("exiting.\n"); 496 printf("exiting.\n");
528 if (dump_symtab) 497 if (top->dump_symtab)
529 perf_session__fprintf_dsos(top.session, stderr); 498 perf_session__fprintf_dsos(top->session, stderr);
530 exit(0); 499 exit(0);
531 case 's': 500 case 's':
532 prompt_symbol(&top.sym_filter_entry, "Enter details symbol"); 501 perf_top__prompt_symbol(top, "Enter details symbol");
533 break; 502 break;
534 case 'S': 503 case 'S':
535 if (!top.sym_filter_entry) 504 if (!top->sym_filter_entry)
536 break; 505 break;
537 else { 506 else {
538 struct hist_entry *syme = top.sym_filter_entry; 507 struct hist_entry *syme = top->sym_filter_entry;
539 508
540 top.sym_filter_entry = NULL; 509 top->sym_filter_entry = NULL;
541 __zero_source_counters(syme); 510 __zero_source_counters(syme);
542 } 511 }
543 break; 512 break;
544 case 'U': 513 case 'U':
545 top.hide_user_symbols = !top.hide_user_symbols; 514 top->hide_user_symbols = !top->hide_user_symbols;
546 break; 515 break;
547 case 'z': 516 case 'z':
548 top.zero = !top.zero; 517 top->zero = !top->zero;
549 break; 518 break;
550 default: 519 default:
551 break; 520 break;
@@ -563,28 +532,30 @@ static void perf_top__sort_new_samples(void *arg)
563 hists__collapse_resort_threaded(&t->sym_evsel->hists); 532 hists__collapse_resort_threaded(&t->sym_evsel->hists);
564 hists__output_resort_threaded(&t->sym_evsel->hists); 533 hists__output_resort_threaded(&t->sym_evsel->hists);
565 hists__decay_entries_threaded(&t->sym_evsel->hists, 534 hists__decay_entries_threaded(&t->sym_evsel->hists,
566 top.hide_user_symbols, 535 t->hide_user_symbols,
567 top.hide_kernel_symbols); 536 t->hide_kernel_symbols);
568} 537}
569 538
570static void *display_thread_tui(void *arg __used) 539static void *display_thread_tui(void *arg)
571{ 540{
541 struct perf_top *top = arg;
572 const char *help = "For a higher level overview, try: perf top --sort comm,dso"; 542 const char *help = "For a higher level overview, try: perf top --sort comm,dso";
573 543
574 perf_top__sort_new_samples(&top); 544 perf_top__sort_new_samples(top);
575 perf_evlist__tui_browse_hists(top.evlist, help, 545 perf_evlist__tui_browse_hists(top->evlist, help,
576 perf_top__sort_new_samples, 546 perf_top__sort_new_samples,
577 &top, top.delay_secs); 547 top, top->delay_secs);
578 548
579 exit_browser(0); 549 exit_browser(0);
580 exit(0); 550 exit(0);
581 return NULL; 551 return NULL;
582} 552}
583 553
584static void *display_thread(void *arg __used) 554static void *display_thread(void *arg)
585{ 555{
586 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; 556 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
587 struct termios tc, save; 557 struct termios tc, save;
558 struct perf_top *top = arg;
588 int delay_msecs, c; 559 int delay_msecs, c;
589 560
590 tcgetattr(0, &save); 561 tcgetattr(0, &save);
@@ -595,13 +566,13 @@ static void *display_thread(void *arg __used)
595 566
596 pthread__unblock_sigwinch(); 567 pthread__unblock_sigwinch();
597repeat: 568repeat:
598 delay_msecs = top.delay_secs * 1000; 569 delay_msecs = top->delay_secs * 1000;
599 tcsetattr(0, TCSANOW, &tc); 570 tcsetattr(0, TCSANOW, &tc);
600 /* trash return*/ 571 /* trash return*/
601 getc(stdin); 572 getc(stdin);
602 573
603 while (1) { 574 while (1) {
604 print_sym_table(); 575 perf_top__print_sym_table(top);
605 /* 576 /*
606 * Either timeout expired or we got an EINTR due to SIGWINCH, 577 * Either timeout expired or we got an EINTR due to SIGWINCH,
607 * refresh screen in both cases. 578 * refresh screen in both cases.
@@ -621,7 +592,7 @@ process_hotkey:
621 c = getc(stdin); 592 c = getc(stdin);
622 tcsetattr(0, TCSAFLUSH, &save); 593 tcsetattr(0, TCSAFLUSH, &save);
623 594
624 handle_keypress(c); 595 perf_top__handle_keypress(top, c);
625 goto repeat; 596 goto repeat;
626 597
627 return NULL; 598 return NULL;
@@ -673,47 +644,17 @@ static int symbol_filter(struct map *map __used, struct symbol *sym)
673 return 0; 644 return 0;
674} 645}
675 646
676static void perf_event__process_sample(const union perf_event *event, 647static void perf_event__process_sample(struct perf_tool *tool,
648 const union perf_event *event,
677 struct perf_evsel *evsel, 649 struct perf_evsel *evsel,
678 struct perf_sample *sample, 650 struct perf_sample *sample,
679 struct perf_session *session) 651 struct machine *machine)
680{ 652{
653 struct perf_top *top = container_of(tool, struct perf_top, tool);
681 struct symbol *parent = NULL; 654 struct symbol *parent = NULL;
682 u64 ip = event->ip.ip; 655 u64 ip = event->ip.ip;
683 struct addr_location al; 656 struct addr_location al;
684 struct machine *machine;
685 int err; 657 int err;
686 u8 origin = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
687
688 ++top.samples;
689
690 switch (origin) {
691 case PERF_RECORD_MISC_USER:
692 ++top.us_samples;
693 if (top.hide_user_symbols)
694 return;
695 machine = perf_session__find_host_machine(session);
696 break;
697 case PERF_RECORD_MISC_KERNEL:
698 ++top.kernel_samples;
699 if (top.hide_kernel_symbols)
700 return;
701 machine = perf_session__find_host_machine(session);
702 break;
703 case PERF_RECORD_MISC_GUEST_KERNEL:
704 ++top.guest_kernel_samples;
705 machine = perf_session__find_machine(session, event->ip.pid);
706 break;
707 case PERF_RECORD_MISC_GUEST_USER:
708 ++top.guest_us_samples;
709 /*
710 * TODO: we don't process guest user from host side
711 * except simple counting.
712 */
713 return;
714 default:
715 return;
716 }
717 658
718 if (!machine && perf_guest) { 659 if (!machine && perf_guest) {
719 pr_err("Can't find guest [%d]'s kernel information\n", 660 pr_err("Can't find guest [%d]'s kernel information\n",
@@ -722,14 +663,14 @@ static void perf_event__process_sample(const union perf_event *event,
722 } 663 }
723 664
724 if (event->header.misc & PERF_RECORD_MISC_EXACT_IP) 665 if (event->header.misc & PERF_RECORD_MISC_EXACT_IP)
725 top.exact_samples++; 666 top->exact_samples++;
726 667
727 if (perf_event__preprocess_sample(event, session, &al, sample, 668 if (perf_event__preprocess_sample(event, machine, &al, sample,
728 symbol_filter) < 0 || 669 symbol_filter) < 0 ||
729 al.filtered) 670 al.filtered)
730 return; 671 return;
731 672
732 if (!kptr_restrict_warned && 673 if (!top->kptr_restrict_warned &&
733 symbol_conf.kptr_restrict && 674 symbol_conf.kptr_restrict &&
734 al.cpumode == PERF_RECORD_MISC_KERNEL) { 675 al.cpumode == PERF_RECORD_MISC_KERNEL) {
735 ui__warning( 676 ui__warning(
@@ -740,7 +681,7 @@ static void perf_event__process_sample(const union perf_event *event,
740 " modules" : ""); 681 " modules" : "");
741 if (use_browser <= 0) 682 if (use_browser <= 0)
742 sleep(5); 683 sleep(5);
743 kptr_restrict_warned = true; 684 top->kptr_restrict_warned = true;
744 } 685 }
745 686
746 if (al.sym == NULL) { 687 if (al.sym == NULL) {
@@ -756,7 +697,7 @@ static void perf_event__process_sample(const union perf_event *event,
756 * --hide-kernel-symbols, even if the user specifies an 697 * --hide-kernel-symbols, even if the user specifies an
757 * invalid --vmlinux ;-) 698 * invalid --vmlinux ;-)
758 */ 699 */
759 if (!kptr_restrict_warned && !vmlinux_warned && 700 if (!top->kptr_restrict_warned && !top->vmlinux_warned &&
760 al.map == machine->vmlinux_maps[MAP__FUNCTION] && 701 al.map == machine->vmlinux_maps[MAP__FUNCTION] &&
761 RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) { 702 RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) {
762 if (symbol_conf.vmlinux_name) { 703 if (symbol_conf.vmlinux_name) {
@@ -769,7 +710,7 @@ static void perf_event__process_sample(const union perf_event *event,
769 710
770 if (use_browser <= 0) 711 if (use_browser <= 0)
771 sleep(5); 712 sleep(5);
772 vmlinux_warned = true; 713 top->vmlinux_warned = true;
773 } 714 }
774 } 715 }
775 716
@@ -778,70 +719,109 @@ static void perf_event__process_sample(const union perf_event *event,
778 719
779 if ((sort__has_parent || symbol_conf.use_callchain) && 720 if ((sort__has_parent || symbol_conf.use_callchain) &&
780 sample->callchain) { 721 sample->callchain) {
781 err = perf_session__resolve_callchain(session, al.thread, 722 err = machine__resolve_callchain(machine, evsel, al.thread,
782 sample->callchain, &parent); 723 sample->callchain, &parent);
783 if (err) 724 if (err)
784 return; 725 return;
785 } 726 }
786 727
787 he = perf_session__add_hist_entry(session, &al, sample, evsel); 728 he = perf_evsel__add_hist_entry(evsel, &al, sample);
788 if (he == NULL) { 729 if (he == NULL) {
789 pr_err("Problem incrementing symbol period, skipping event\n"); 730 pr_err("Problem incrementing symbol period, skipping event\n");
790 return; 731 return;
791 } 732 }
792 733
793 if (symbol_conf.use_callchain) { 734 if (symbol_conf.use_callchain) {
794 err = callchain_append(he->callchain, &session->callchain_cursor, 735 err = callchain_append(he->callchain, &evsel->hists.callchain_cursor,
795 sample->period); 736 sample->period);
796 if (err) 737 if (err)
797 return; 738 return;
798 } 739 }
799 740
800 if (sort_has_symbols) 741 if (top->sort_has_symbols)
801 record_precise_ip(he, evsel->idx, ip); 742 perf_top__record_precise_ip(top, he, evsel->idx, ip);
802 } 743 }
803 744
804 return; 745 return;
805} 746}
806 747
807static void perf_session__mmap_read_idx(struct perf_session *self, int idx) 748static void perf_top__mmap_read_idx(struct perf_top *top, int idx)
808{ 749{
809 struct perf_sample sample; 750 struct perf_sample sample;
810 struct perf_evsel *evsel; 751 struct perf_evsel *evsel;
752 struct perf_session *session = top->session;
811 union perf_event *event; 753 union perf_event *event;
754 struct machine *machine;
755 u8 origin;
812 int ret; 756 int ret;
813 757
814 while ((event = perf_evlist__mmap_read(top.evlist, idx)) != NULL) { 758 while ((event = perf_evlist__mmap_read(top->evlist, idx)) != NULL) {
815 ret = perf_session__parse_sample(self, event, &sample); 759 ret = perf_session__parse_sample(session, event, &sample);
816 if (ret) { 760 if (ret) {
817 pr_err("Can't parse sample, err = %d\n", ret); 761 pr_err("Can't parse sample, err = %d\n", ret);
818 continue; 762 continue;
819 } 763 }
820 764
821 evsel = perf_evlist__id2evsel(self->evlist, sample.id); 765 evsel = perf_evlist__id2evsel(session->evlist, sample.id);
822 assert(evsel != NULL); 766 assert(evsel != NULL);
823 767
768 origin = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
769
824 if (event->header.type == PERF_RECORD_SAMPLE) 770 if (event->header.type == PERF_RECORD_SAMPLE)
825 perf_event__process_sample(event, evsel, &sample, self); 771 ++top->samples;
826 else if (event->header.type < PERF_RECORD_MAX) { 772
773 switch (origin) {
774 case PERF_RECORD_MISC_USER:
775 ++top->us_samples;
776 if (top->hide_user_symbols)
777 continue;
778 machine = perf_session__find_host_machine(session);
779 break;
780 case PERF_RECORD_MISC_KERNEL:
781 ++top->kernel_samples;
782 if (top->hide_kernel_symbols)
783 continue;
784 machine = perf_session__find_host_machine(session);
785 break;
786 case PERF_RECORD_MISC_GUEST_KERNEL:
787 ++top->guest_kernel_samples;
788 machine = perf_session__find_machine(session, event->ip.pid);
789 break;
790 case PERF_RECORD_MISC_GUEST_USER:
791 ++top->guest_us_samples;
792 /*
793 * TODO: we don't process guest user from host side
794 * except simple counting.
795 */
796 /* Fall thru */
797 default:
798 continue;
799 }
800
801
802 if (event->header.type == PERF_RECORD_SAMPLE) {
803 perf_event__process_sample(&top->tool, event, evsel,
804 &sample, machine);
805 } else if (event->header.type < PERF_RECORD_MAX) {
827 hists__inc_nr_events(&evsel->hists, event->header.type); 806 hists__inc_nr_events(&evsel->hists, event->header.type);
828 perf_event__process(event, &sample, self); 807 perf_event__process(&top->tool, event, &sample, machine);
829 } else 808 } else
830 ++self->hists.stats.nr_unknown_events; 809 ++session->hists.stats.nr_unknown_events;
831 } 810 }
832} 811}
833 812
834static void perf_session__mmap_read(struct perf_session *self) 813static void perf_top__mmap_read(struct perf_top *top)
835{ 814{
836 int i; 815 int i;
837 816
838 for (i = 0; i < top.evlist->nr_mmaps; i++) 817 for (i = 0; i < top->evlist->nr_mmaps; i++)
839 perf_session__mmap_read_idx(self, i); 818 perf_top__mmap_read_idx(top, i);
840} 819}
841 820
842static void start_counters(struct perf_evlist *evlist) 821static void perf_top__start_counters(struct perf_top *top)
843{ 822{
844 struct perf_evsel *counter, *first; 823 struct perf_evsel *counter, *first;
824 struct perf_evlist *evlist = top->evlist;
845 825
846 first = list_entry(evlist->entries.next, struct perf_evsel, node); 826 first = list_entry(evlist->entries.next, struct perf_evsel, node);
847 827
@@ -849,15 +829,15 @@ static void start_counters(struct perf_evlist *evlist)
849 struct perf_event_attr *attr = &counter->attr; 829 struct perf_event_attr *attr = &counter->attr;
850 struct xyarray *group_fd = NULL; 830 struct xyarray *group_fd = NULL;
851 831
852 if (group && counter != first) 832 if (top->group && counter != first)
853 group_fd = first->fd; 833 group_fd = first->fd;
854 834
855 attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; 835 attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
856 836
857 if (top.freq) { 837 if (top->freq) {
858 attr->sample_type |= PERF_SAMPLE_PERIOD; 838 attr->sample_type |= PERF_SAMPLE_PERIOD;
859 attr->freq = 1; 839 attr->freq = 1;
860 attr->sample_freq = top.freq; 840 attr->sample_freq = top->freq;
861 } 841 }
862 842
863 if (evlist->nr_entries > 1) { 843 if (evlist->nr_entries > 1) {
@@ -870,23 +850,23 @@ static void start_counters(struct perf_evlist *evlist)
870 850
871 attr->mmap = 1; 851 attr->mmap = 1;
872 attr->comm = 1; 852 attr->comm = 1;
873 attr->inherit = inherit; 853 attr->inherit = top->inherit;
874retry_sample_id: 854retry_sample_id:
875 attr->sample_id_all = sample_id_all_avail ? 1 : 0; 855 attr->sample_id_all = top->sample_id_all_avail ? 1 : 0;
876try_again: 856try_again:
877 if (perf_evsel__open(counter, top.evlist->cpus, 857 if (perf_evsel__open(counter, top->evlist->cpus,
878 top.evlist->threads, group, 858 top->evlist->threads, top->group,
879 group_fd) < 0) { 859 group_fd) < 0) {
880 int err = errno; 860 int err = errno;
881 861
882 if (err == EPERM || err == EACCES) { 862 if (err == EPERM || err == EACCES) {
883 ui__error_paranoid(); 863 ui__error_paranoid();
884 goto out_err; 864 goto out_err;
885 } else if (err == EINVAL && sample_id_all_avail) { 865 } else if (err == EINVAL && top->sample_id_all_avail) {
886 /* 866 /*
887 * Old kernel, no attr->sample_id_type_all field 867 * Old kernel, no attr->sample_id_type_all field
888 */ 868 */
889 sample_id_all_avail = false; 869 top->sample_id_all_avail = false;
890 goto retry_sample_id; 870 goto retry_sample_id;
891 } 871 }
892 /* 872 /*
@@ -920,7 +900,7 @@ try_again:
920 } 900 }
921 } 901 }
922 902
923 if (perf_evlist__mmap(evlist, mmap_pages, false) < 0) { 903 if (perf_evlist__mmap(evlist, top->mmap_pages, false) < 0) {
924 ui__warning("Failed to mmap with %d (%s)\n", 904 ui__warning("Failed to mmap with %d (%s)\n",
925 errno, strerror(errno)); 905 errno, strerror(errno));
926 goto out_err; 906 goto out_err;
@@ -933,14 +913,14 @@ out_err:
933 exit(0); 913 exit(0);
934} 914}
935 915
936static int setup_sample_type(void) 916static int perf_top__setup_sample_type(struct perf_top *top)
937{ 917{
938 if (!sort_has_symbols) { 918 if (!top->sort_has_symbols) {
939 if (symbol_conf.use_callchain) { 919 if (symbol_conf.use_callchain) {
940 ui__warning("Selected -g but \"sym\" not present in --sort/-s."); 920 ui__warning("Selected -g but \"sym\" not present in --sort/-s.");
941 return -EINVAL; 921 return -EINVAL;
942 } 922 }
943 } else if (!dont_use_callchains && callchain_param.mode != CHAIN_NONE) { 923 } else if (!top->dont_use_callchains && callchain_param.mode != CHAIN_NONE) {
944 if (callchain_register_param(&callchain_param) < 0) { 924 if (callchain_register_param(&callchain_param) < 0) {
945 ui__warning("Can't register callchain params.\n"); 925 ui__warning("Can't register callchain params.\n");
946 return -EINVAL; 926 return -EINVAL;
@@ -950,7 +930,7 @@ static int setup_sample_type(void)
950 return 0; 930 return 0;
951} 931}
952 932
953static int __cmd_top(void) 933static int __cmd_top(struct perf_top *top)
954{ 934{
955 pthread_t thread; 935 pthread_t thread;
956 int ret; 936 int ret;
@@ -958,39 +938,40 @@ static int __cmd_top(void)
958 * FIXME: perf_session__new should allow passing a O_MMAP, so that all this 938 * FIXME: perf_session__new should allow passing a O_MMAP, so that all this
959 * mmap reading, etc is encapsulated in it. Use O_WRONLY for now. 939 * mmap reading, etc is encapsulated in it. Use O_WRONLY for now.
960 */ 940 */
961 top.session = perf_session__new(NULL, O_WRONLY, false, false, NULL); 941 top->session = perf_session__new(NULL, O_WRONLY, false, false, NULL);
962 if (top.session == NULL) 942 if (top->session == NULL)
963 return -ENOMEM; 943 return -ENOMEM;
964 944
965 ret = setup_sample_type(); 945 ret = perf_top__setup_sample_type(top);
966 if (ret) 946 if (ret)
967 goto out_delete; 947 goto out_delete;
968 948
969 if (top.target_tid != -1) 949 if (top->target_tid != -1)
970 perf_event__synthesize_thread_map(top.evlist->threads, 950 perf_event__synthesize_thread_map(&top->tool, top->evlist->threads,
971 perf_event__process, top.session); 951 perf_event__process,
952 &top->session->host_machine);
972 else 953 else
973 perf_event__synthesize_threads(perf_event__process, top.session); 954 perf_event__synthesize_threads(&top->tool, perf_event__process,
974 955 &top->session->host_machine);
975 start_counters(top.evlist); 956 perf_top__start_counters(top);
976 top.session->evlist = top.evlist; 957 top->session->evlist = top->evlist;
977 perf_session__update_sample_type(top.session); 958 perf_session__update_sample_type(top->session);
978 959
979 /* Wait for a minimal set of events before starting the snapshot */ 960 /* Wait for a minimal set of events before starting the snapshot */
980 poll(top.evlist->pollfd, top.evlist->nr_fds, 100); 961 poll(top->evlist->pollfd, top->evlist->nr_fds, 100);
981 962
982 perf_session__mmap_read(top.session); 963 perf_top__mmap_read(top);
983 964
984 if (pthread_create(&thread, NULL, (use_browser > 0 ? display_thread_tui : 965 if (pthread_create(&thread, NULL, (use_browser > 0 ? display_thread_tui :
985 display_thread), NULL)) { 966 display_thread), top)) {
986 printf("Could not create display thread.\n"); 967 printf("Could not create display thread.\n");
987 exit(-1); 968 exit(-1);
988 } 969 }
989 970
990 if (realtime_prio) { 971 if (top->realtime_prio) {
991 struct sched_param param; 972 struct sched_param param;
992 973
993 param.sched_priority = realtime_prio; 974 param.sched_priority = top->realtime_prio;
994 if (sched_setscheduler(0, SCHED_FIFO, &param)) { 975 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
995 printf("Could not set realtime priority.\n"); 976 printf("Could not set realtime priority.\n");
996 exit(-1); 977 exit(-1);
@@ -998,25 +979,25 @@ static int __cmd_top(void)
998 } 979 }
999 980
1000 while (1) { 981 while (1) {
1001 u64 hits = top.samples; 982 u64 hits = top->samples;
1002 983
1003 perf_session__mmap_read(top.session); 984 perf_top__mmap_read(top);
1004 985
1005 if (hits == top.samples) 986 if (hits == top->samples)
1006 ret = poll(top.evlist->pollfd, top.evlist->nr_fds, 100); 987 ret = poll(top->evlist->pollfd, top->evlist->nr_fds, 100);
1007 } 988 }
1008 989
1009out_delete: 990out_delete:
1010 perf_session__delete(top.session); 991 perf_session__delete(top->session);
1011 top.session = NULL; 992 top->session = NULL;
1012 993
1013 return 0; 994 return 0;
1014} 995}
1015 996
1016static int 997static int
1017parse_callchain_opt(const struct option *opt __used, const char *arg, 998parse_callchain_opt(const struct option *opt, const char *arg, int unset)
1018 int unset)
1019{ 999{
1000 struct perf_top *top = (struct perf_top *)opt->value;
1020 char *tok, *tok2; 1001 char *tok, *tok2;
1021 char *endptr; 1002 char *endptr;
1022 1003
@@ -1024,7 +1005,7 @@ parse_callchain_opt(const struct option *opt __used, const char *arg,
1024 * --no-call-graph 1005 * --no-call-graph
1025 */ 1006 */
1026 if (unset) { 1007 if (unset) {
1027 dont_use_callchains = true; 1008 top->dont_use_callchains = true;
1028 return 0; 1009 return 0;
1029 } 1010 }
1030 1011
@@ -1052,9 +1033,7 @@ parse_callchain_opt(const struct option *opt __used, const char *arg,
1052 symbol_conf.use_callchain = false; 1033 symbol_conf.use_callchain = false;
1053 1034
1054 return 0; 1035 return 0;
1055 } 1036 } else
1056
1057 else
1058 return -1; 1037 return -1;
1059 1038
1060 /* get the min percentage */ 1039 /* get the min percentage */
@@ -1098,17 +1077,32 @@ static const char * const top_usage[] = {
1098 NULL 1077 NULL
1099}; 1078};
1100 1079
1101static const struct option options[] = { 1080int cmd_top(int argc, const char **argv, const char *prefix __used)
1081{
1082 struct perf_evsel *pos;
1083 int status = -ENOMEM;
1084 struct perf_top top = {
1085 .count_filter = 5,
1086 .delay_secs = 2,
1087 .target_pid = -1,
1088 .target_tid = -1,
1089 .freq = 1000, /* 1 KHz */
1090 .sample_id_all_avail = true,
1091 .mmap_pages = 128,
1092 .sym_pcnt_filter = 5,
1093 };
1094 char callchain_default_opt[] = "fractal,0.5,callee";
1095 const struct option options[] = {
1102 OPT_CALLBACK('e', "event", &top.evlist, "event", 1096 OPT_CALLBACK('e', "event", &top.evlist, "event",
1103 "event selector. use 'perf list' to list available events", 1097 "event selector. use 'perf list' to list available events",
1104 parse_events_option), 1098 parse_events_option),
1105 OPT_INTEGER('c', "count", &default_interval, 1099 OPT_INTEGER('c', "count", &top.default_interval,
1106 "event period to sample"), 1100 "event period to sample"),
1107 OPT_INTEGER('p', "pid", &top.target_pid, 1101 OPT_INTEGER('p', "pid", &top.target_pid,
1108 "profile events on existing process id"), 1102 "profile events on existing process id"),
1109 OPT_INTEGER('t', "tid", &top.target_tid, 1103 OPT_INTEGER('t', "tid", &top.target_tid,
1110 "profile events on existing thread id"), 1104 "profile events on existing thread id"),
1111 OPT_BOOLEAN('a', "all-cpus", &system_wide, 1105 OPT_BOOLEAN('a', "all-cpus", &top.system_wide,
1112 "system-wide collection from all CPUs"), 1106 "system-wide collection from all CPUs"),
1113 OPT_STRING('C', "cpu", &top.cpu_list, "cpu", 1107 OPT_STRING('C', "cpu", &top.cpu_list, "cpu",
1114 "list of cpus to monitor"), 1108 "list of cpus to monitor"),
@@ -1116,20 +1110,20 @@ static const struct option options[] = {
1116 "file", "vmlinux pathname"), 1110 "file", "vmlinux pathname"),
1117 OPT_BOOLEAN('K', "hide_kernel_symbols", &top.hide_kernel_symbols, 1111 OPT_BOOLEAN('K', "hide_kernel_symbols", &top.hide_kernel_symbols,
1118 "hide kernel symbols"), 1112 "hide kernel symbols"),
1119 OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"), 1113 OPT_UINTEGER('m', "mmap-pages", &top.mmap_pages, "number of mmap data pages"),
1120 OPT_INTEGER('r', "realtime", &realtime_prio, 1114 OPT_INTEGER('r', "realtime", &top.realtime_prio,
1121 "collect data with this RT SCHED_FIFO priority"), 1115 "collect data with this RT SCHED_FIFO priority"),
1122 OPT_INTEGER('d', "delay", &top.delay_secs, 1116 OPT_INTEGER('d', "delay", &top.delay_secs,
1123 "number of seconds to delay between refreshes"), 1117 "number of seconds to delay between refreshes"),
1124 OPT_BOOLEAN('D', "dump-symtab", &dump_symtab, 1118 OPT_BOOLEAN('D', "dump-symtab", &top.dump_symtab,
1125 "dump the symbol table used for profiling"), 1119 "dump the symbol table used for profiling"),
1126 OPT_INTEGER('f', "count-filter", &top.count_filter, 1120 OPT_INTEGER('f', "count-filter", &top.count_filter,
1127 "only display functions with more events than this"), 1121 "only display functions with more events than this"),
1128 OPT_BOOLEAN('g', "group", &group, 1122 OPT_BOOLEAN('g', "group", &top.group,
1129 "put the counters into a counter group"), 1123 "put the counters into a counter group"),
1130 OPT_BOOLEAN('i', "inherit", &inherit, 1124 OPT_BOOLEAN('i', "inherit", &top.inherit,
1131 "child tasks inherit counters"), 1125 "child tasks inherit counters"),
1132 OPT_STRING(0, "sym-annotate", &sym_filter, "symbol name", 1126 OPT_STRING(0, "sym-annotate", &top.sym_filter, "symbol name",
1133 "symbol to annotate"), 1127 "symbol to annotate"),
1134 OPT_BOOLEAN('z', "zero", &top.zero, 1128 OPT_BOOLEAN('z', "zero", &top.zero,
1135 "zero history across updates"), 1129 "zero history across updates"),
@@ -1139,15 +1133,15 @@ static const struct option options[] = {
1139 "display this many functions"), 1133 "display this many functions"),
1140 OPT_BOOLEAN('U', "hide_user_symbols", &top.hide_user_symbols, 1134 OPT_BOOLEAN('U', "hide_user_symbols", &top.hide_user_symbols,
1141 "hide user symbols"), 1135 "hide user symbols"),
1142 OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"), 1136 OPT_BOOLEAN(0, "tui", &top.use_tui, "Use the TUI interface"),
1143 OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"), 1137 OPT_BOOLEAN(0, "stdio", &top.use_stdio, "Use the stdio interface"),
1144 OPT_INCR('v', "verbose", &verbose, 1138 OPT_INCR('v', "verbose", &verbose,
1145 "be more verbose (show counter open errors, etc)"), 1139 "be more verbose (show counter open errors, etc)"),
1146 OPT_STRING('s', "sort", &sort_order, "key[,key2...]", 1140 OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
1147 "sort by key(s): pid, comm, dso, symbol, parent"), 1141 "sort by key(s): pid, comm, dso, symbol, parent"),
1148 OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples, 1142 OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples,
1149 "Show a column with the number of samples"), 1143 "Show a column with the number of samples"),
1150 OPT_CALLBACK_DEFAULT('G', "call-graph", NULL, "output_type,min_percent, call_order", 1144 OPT_CALLBACK_DEFAULT('G', "call-graph", &top, "output_type,min_percent, call_order",
1151 "Display callchains using output_type (graph, flat, fractal, or none), min percent threshold and callchain order. " 1145 "Display callchains using output_type (graph, flat, fractal, or none), min percent threshold and callchain order. "
1152 "Default: fractal,0.5,callee", &parse_callchain_opt, 1146 "Default: fractal,0.5,callee", &parse_callchain_opt,
1153 callchain_default_opt), 1147 callchain_default_opt),
@@ -1166,12 +1160,7 @@ static const struct option options[] = {
1166 OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style", 1160 OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style",
1167 "Specify disassembler style (e.g. -M intel for intel syntax)"), 1161 "Specify disassembler style (e.g. -M intel for intel syntax)"),
1168 OPT_END() 1162 OPT_END()
1169}; 1163 };
1170
1171int cmd_top(int argc, const char **argv, const char *prefix __used)
1172{
1173 struct perf_evsel *pos;
1174 int status = -ENOMEM;
1175 1164
1176 top.evlist = perf_evlist__new(NULL, NULL); 1165 top.evlist = perf_evlist__new(NULL, NULL);
1177 if (top.evlist == NULL) 1166 if (top.evlist == NULL)
@@ -1188,9 +1177,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
1188 1177
1189 setup_sorting(top_usage, options); 1178 setup_sorting(top_usage, options);
1190 1179
1191 if (use_stdio) 1180 if (top.use_stdio)
1192 use_browser = 0; 1181 use_browser = 0;
1193 else if (use_tui) 1182 else if (top.use_tui)
1194 use_browser = 1; 1183 use_browser = 1;
1195 1184
1196 setup_browser(false); 1185 setup_browser(false);
@@ -1215,38 +1204,31 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
1215 return -ENOMEM; 1204 return -ENOMEM;
1216 } 1205 }
1217 1206
1207 symbol_conf.nr_events = top.evlist->nr_entries;
1208
1218 if (top.delay_secs < 1) 1209 if (top.delay_secs < 1)
1219 top.delay_secs = 1; 1210 top.delay_secs = 1;
1220 1211
1221 /* 1212 /*
1222 * User specified count overrides default frequency. 1213 * User specified count overrides default frequency.
1223 */ 1214 */
1224 if (default_interval) 1215 if (top.default_interval)
1225 top.freq = 0; 1216 top.freq = 0;
1226 else if (top.freq) { 1217 else if (top.freq) {
1227 default_interval = top.freq; 1218 top.default_interval = top.freq;
1228 } else { 1219 } else {
1229 fprintf(stderr, "frequency and count are zero, aborting\n"); 1220 fprintf(stderr, "frequency and count are zero, aborting\n");
1230 exit(EXIT_FAILURE); 1221 exit(EXIT_FAILURE);
1231 } 1222 }
1232 1223
1233 list_for_each_entry(pos, &top.evlist->entries, node) { 1224 list_for_each_entry(pos, &top.evlist->entries, node) {
1234 if (perf_evsel__alloc_fd(pos, top.evlist->cpus->nr,
1235 top.evlist->threads->nr) < 0)
1236 goto out_free_fd;
1237 /* 1225 /*
1238 * Fill in the ones not specifically initialized via -c: 1226 * Fill in the ones not specifically initialized via -c:
1239 */ 1227 */
1240 if (pos->attr.sample_period) 1228 if (!pos->attr.sample_period)
1241 continue; 1229 pos->attr.sample_period = top.default_interval;
1242
1243 pos->attr.sample_period = default_interval;
1244 } 1230 }
1245 1231
1246 if (perf_evlist__alloc_pollfd(top.evlist) < 0 ||
1247 perf_evlist__alloc_mmap(top.evlist) < 0)
1248 goto out_free_fd;
1249
1250 top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node); 1232 top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node);
1251 1233
1252 symbol_conf.priv_size = sizeof(struct annotation); 1234 symbol_conf.priv_size = sizeof(struct annotation);
@@ -1263,16 +1245,20 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
1263 * Avoid annotation data structures overhead when symbols aren't on the 1245 * Avoid annotation data structures overhead when symbols aren't on the
1264 * sort list. 1246 * sort list.
1265 */ 1247 */
1266 sort_has_symbols = sort_sym.list.next != NULL; 1248 top.sort_has_symbols = sort_sym.list.next != NULL;
1267 1249
1268 get_term_dimensions(&winsize); 1250 get_term_dimensions(&top.winsize);
1269 if (top.print_entries == 0) { 1251 if (top.print_entries == 0) {
1270 update_print_entries(&winsize); 1252 struct sigaction act = {
1271 signal(SIGWINCH, sig_winch_handler); 1253 .sa_sigaction = perf_top__sig_winch,
1254 .sa_flags = SA_SIGINFO,
1255 };
1256 perf_top__update_print_entries(&top);
1257 sigaction(SIGWINCH, &act, NULL);
1272 } 1258 }
1273 1259
1274 status = __cmd_top(); 1260 status = __cmd_top(&top);
1275out_free_fd: 1261
1276 perf_evlist__delete(top.evlist); 1262 perf_evlist__delete(top.evlist);
1277 1263
1278 return status; 1264 return status;
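
Most of the builtin-top.c churn above folds file-scope globals into
struct perf_top and threads that pointer through every helper; the
sample handler then recovers it from the embedded tool with
container_of(). A compilable sketch of that step, with minimal
stand-in types and the usual container_of() definition:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct tool { int unused; };	/* stand-in for struct perf_tool */

struct top {
	struct tool tool;	/* embedded: callbacks get &top->tool */
	int samples;
};

static void on_sample(struct tool *t)
{
	struct top *top = container_of(t, struct top, tool);

	top->samples++;
}

int main(void)
{
	struct top top = { .samples = 0 };

	on_sample(&top.tool);
	printf("samples = %d\n", top.samples);
	return 0;
}
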
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 73d0cac8b67e..2b2e225a4d4c 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -29,8 +29,6 @@ struct pager_config {
29 int val; 29 int val;
30}; 30};
31 31
32static char debugfs_mntpt[MAXPATHLEN];
33
34static int pager_command_config(const char *var, const char *value, void *data) 32static int pager_command_config(const char *var, const char *value, void *data)
35{ 33{
36 struct pager_config *c = data; 34 struct pager_config *c = data;
@@ -81,15 +79,6 @@ static void commit_pager_choice(void)
81 } 79 }
82} 80}
83 81
84static void set_debugfs_path(void)
85{
86 char *path;
87
88 path = getenv(PERF_DEBUGFS_ENVIRONMENT);
89 snprintf(debugfs_path, MAXPATHLEN, "%s/%s", path ?: debugfs_mntpt,
90 "tracing/events");
91}
92
93static int handle_options(const char ***argv, int *argc, int *envchanged) 82static int handle_options(const char ***argv, int *argc, int *envchanged)
94{ 83{
95 int handled = 0; 84 int handled = 0;
@@ -161,15 +150,14 @@ static int handle_options(const char ***argv, int *argc, int *envchanged)
161 fprintf(stderr, "No directory given for --debugfs-dir.\n"); 150 fprintf(stderr, "No directory given for --debugfs-dir.\n");
162 usage(perf_usage_string); 151 usage(perf_usage_string);
163 } 152 }
164 strncpy(debugfs_mntpt, (*argv)[1], MAXPATHLEN); 153 debugfs_set_path((*argv)[1]);
165 debugfs_mntpt[MAXPATHLEN - 1] = '\0';
166 if (envchanged) 154 if (envchanged)
167 *envchanged = 1; 155 *envchanged = 1;
168 (*argv)++; 156 (*argv)++;
169 (*argc)--; 157 (*argc)--;
170 } else if (!prefixcmp(cmd, CMD_DEBUGFS_DIR)) { 158 } else if (!prefixcmp(cmd, CMD_DEBUGFS_DIR)) {
171 strncpy(debugfs_mntpt, cmd + strlen(CMD_DEBUGFS_DIR), MAXPATHLEN); 159 debugfs_set_path(cmd + strlen(CMD_DEBUGFS_DIR));
172 debugfs_mntpt[MAXPATHLEN - 1] = '\0'; 160 fprintf(stderr, "dir: %s\n", debugfs_mountpoint);
173 if (envchanged) 161 if (envchanged)
174 *envchanged = 1; 162 *envchanged = 1;
175 } else { 163 } else {
@@ -281,7 +269,6 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
281 if (use_pager == -1 && p->option & USE_PAGER) 269 if (use_pager == -1 && p->option & USE_PAGER)
282 use_pager = 1; 270 use_pager = 1;
283 commit_pager_choice(); 271 commit_pager_choice();
284 set_debugfs_path();
285 272
286 status = p->fn(argc, argv, prefix); 273 status = p->fn(argc, argv, prefix);
287 exit_browser(status); 274 exit_browser(status);
@@ -416,17 +403,6 @@ static int run_argv(int *argcp, const char ***argv)
416 return done_alias; 403 return done_alias;
417} 404}
418 405
419/* mini /proc/mounts parser: searching for "^blah /mount/point debugfs" */
420static void get_debugfs_mntpt(void)
421{
422 const char *path = debugfs_mount(NULL);
423
424 if (path)
425 strncpy(debugfs_mntpt, path, sizeof(debugfs_mntpt));
426 else
427 debugfs_mntpt[0] = '\0';
428}
429
430static void pthread__block_sigwinch(void) 406static void pthread__block_sigwinch(void)
431{ 407{
432 sigset_t set; 408 sigset_t set;
@@ -453,7 +429,7 @@ int main(int argc, const char **argv)
453 if (!cmd) 429 if (!cmd)
454 cmd = "perf-help"; 430 cmd = "perf-help";
455 /* get debugfs mount point from /proc/mounts */ 431 /* get debugfs mount point from /proc/mounts */
456 get_debugfs_mntpt(); 432 debugfs_mount(NULL);
457 /* 433 /*
458 * "perf-xxxx" is the same as "perf xxxx", but we obviously: 434 * "perf-xxxx" is the same as "perf xxxx", but we obviously:
459 * 435 *
@@ -476,7 +452,6 @@ int main(int argc, const char **argv)
476 argc--; 452 argc--;
477 handle_options(&argv, &argc, NULL); 453 handle_options(&argv, &argc, NULL);
478 commit_pager_choice(); 454 commit_pager_choice();
479 set_debugfs_path();
480 set_buildid_dir(); 455 set_buildid_dir();
481 456
482 if (argc > 0) { 457 if (argc > 0) {
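
The perf.c changes above delete the command's private debugfs_mntpt buffer along with set_debugfs_path() and get_debugfs_mntpt(): a single debugfs_mount(NULL) call at startup now both locates (or mounts) debugfs and refreshes the shared path globals kept in util/debugfs.c, and --debugfs-dir simply forwards to debugfs_set_path(). A hedged sketch of the simplified startup flow:

    #include <stdio.h>

    char *debugfs_mount(const char *mountpoint);   /* from util/debugfs.c */

    static void init_debugfs(void)
    {
        /* NULL: reuse a pre-existing mount if one can be found */
        if (debugfs_mount(NULL) == NULL)
            fprintf(stderr, "debugfs could not be located or mounted\n");
    }
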
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 914c895510f7..64f8bee31ced 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -185,4 +185,28 @@ extern const char perf_version_string[];
185 185
186void pthread__unblock_sigwinch(void); 186void pthread__unblock_sigwinch(void);
187 187
188struct perf_record_opts {
189 pid_t target_pid;
190 pid_t target_tid;
191 bool call_graph;
192 bool group;
193 bool inherit_stat;
194 bool no_delay;
195 bool no_inherit;
196 bool no_samples;
197 bool pipe_output;
198 bool raw_samples;
199 bool sample_address;
200 bool sample_time;
201 bool sample_id_all_avail;
202 bool system_wide;
203 bool period;
204 unsigned int freq;
205 unsigned int mmap_pages;
206 unsigned int user_freq;
207 u64 default_interval;
208 u64 user_interval;
209 const char *cpu_list;
210};
211
188#endif 212#endif
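
The new struct perf_record_opts gathers what used to be a pile of file-scope variables into one structure that can be passed around (and shared between perf record and perf top). A hedged initialization sketch; the values are illustrative defaults, not taken from this patch:

    #include <limits.h>     /* ULLONG_MAX */
    #include "perf.h"       /* struct perf_record_opts */

    static struct perf_record_opts opts = {
        .target_pid          = -1,          /* no pid filter */
        .target_tid          = -1,
        .freq                = 4000,        /* illustrative sampling frequency */
        .mmap_pages          = 128,
        .user_interval       = ULLONG_MAX,  /* assumed "unset" sentinel */
        .sample_id_all_avail = true,
    };
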
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 119e996035c8..011ed2676604 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -25,17 +25,17 @@ int symbol__annotate_init(struct map *map __used, struct symbol *sym)
25 return 0; 25 return 0;
26} 26}
27 27
28int symbol__alloc_hist(struct symbol *sym, int nevents) 28int symbol__alloc_hist(struct symbol *sym)
29{ 29{
30 struct annotation *notes = symbol__annotation(sym); 30 struct annotation *notes = symbol__annotation(sym);
31 size_t sizeof_sym_hist = (sizeof(struct sym_hist) + 31 size_t sizeof_sym_hist = (sizeof(struct sym_hist) +
32 (sym->end - sym->start) * sizeof(u64)); 32 (sym->end - sym->start) * sizeof(u64));
33 33
34 notes->src = zalloc(sizeof(*notes->src) + nevents * sizeof_sym_hist); 34 notes->src = zalloc(sizeof(*notes->src) + symbol_conf.nr_events * sizeof_sym_hist);
35 if (notes->src == NULL) 35 if (notes->src == NULL)
36 return -1; 36 return -1;
37 notes->src->sizeof_sym_hist = sizeof_sym_hist; 37 notes->src->sizeof_sym_hist = sizeof_sym_hist;
38 notes->src->nr_histograms = nevents; 38 notes->src->nr_histograms = symbol_conf.nr_events;
39 INIT_LIST_HEAD(&notes->src->source); 39 INIT_LIST_HEAD(&notes->src->source);
40 return 0; 40 return 0;
41} 41}
@@ -334,7 +334,7 @@ fallback:
334 disassembler_style ? "-M " : "", 334 disassembler_style ? "-M " : "",
335 disassembler_style ? disassembler_style : "", 335 disassembler_style ? disassembler_style : "",
336 map__rip_2objdump(map, sym->start), 336 map__rip_2objdump(map, sym->start),
337 map__rip_2objdump(map, sym->end), 337 map__rip_2objdump(map, sym->end+1),
338 symbol_conf.annotate_asm_raw ? "" : "--no-show-raw", 338 symbol_conf.annotate_asm_raw ? "" : "--no-show-raw",
339 symbol_conf.annotate_src ? "-S" : "", 339 symbol_conf.annotate_src ? "-S" : "",
340 symfs_filename, filename); 340 symfs_filename, filename);
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index d9072523d342..efa5dc82bfae 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -72,7 +72,7 @@ static inline struct annotation *symbol__annotation(struct symbol *sym)
72 72
73int symbol__inc_addr_samples(struct symbol *sym, struct map *map, 73int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
74 int evidx, u64 addr); 74 int evidx, u64 addr);
75int symbol__alloc_hist(struct symbol *sym, int nevents); 75int symbol__alloc_hist(struct symbol *sym);
76void symbol__annotate_zero_histograms(struct symbol *sym); 76void symbol__annotate_zero_histograms(struct symbol *sym);
77 77
78int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize); 78int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize);
@@ -99,8 +99,7 @@ static inline int symbol__tui_annotate(struct symbol *sym __used,
99} 99}
100#else 100#else
101int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx, 101int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx,
102 int nr_events, void(*timer)(void *arg), void *arg, 102 void(*timer)(void *arg), void *arg, int delay_secs);
103 int delay_secs);
104#endif 103#endif
105 104
106extern const char *disassembler_style; 105extern const char *disassembler_style;
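
symbol__alloc_hist() no longer takes an nevents argument; the histogram count is read from the global symbol_conf.nr_events, which every caller was passing anyway. The allocation stays one flat block: the source header followed by nr_events histograms of sizeof_sym_hist bytes each, one u64 slot per byte of the symbol. (The annotate.c hunk also widens the objdump range to sym->end+1, since objdump's stop address is exclusive and the symbol's last instruction was being cut off.) A standalone sketch of the layout arithmetic, with the struct names simplified:

    #include <stddef.h>

    typedef unsigned long long u64;

    struct sym_hist { u64 sum; u64 addr[]; };                       /* simplified */
    struct src      { size_t sizeof_sym_hist; int nr; char histograms[]; };

    /* the evidx-th histogram inside the flat allocation */
    static struct sym_hist *hist_at(struct src *src, int evidx)
    {
        return (struct sym_hist *)(src->histograms +
                                   evidx * src->sizeof_sym_hist);
    }
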
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index a91cd99f26ea..dff9c7a725f4 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -13,15 +13,18 @@
13#include "symbol.h" 13#include "symbol.h"
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include "debug.h" 15#include "debug.h"
16#include "session.h"
17#include "tool.h"
16 18
17static int build_id__mark_dso_hit(union perf_event *event, 19static int build_id__mark_dso_hit(struct perf_tool *tool __used,
20 union perf_event *event,
18 struct perf_sample *sample __used, 21 struct perf_sample *sample __used,
19 struct perf_evsel *evsel __used, 22 struct perf_evsel *evsel __used,
20 struct perf_session *session) 23 struct machine *machine)
21{ 24{
22 struct addr_location al; 25 struct addr_location al;
23 u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; 26 u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
24 struct thread *thread = perf_session__findnew(session, event->ip.pid); 27 struct thread *thread = machine__findnew_thread(machine, event->ip.pid);
25 28
26 if (thread == NULL) { 29 if (thread == NULL) {
27 pr_err("problem processing %d event, skipping it.\n", 30 pr_err("problem processing %d event, skipping it.\n",
@@ -29,8 +32,8 @@ static int build_id__mark_dso_hit(union perf_event *event,
29 return -1; 32 return -1;
30 } 33 }
31 34
32 thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION, 35 thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION,
33 event->ip.pid, event->ip.ip, &al); 36 event->ip.ip, &al);
34 37
35 if (al.map != NULL) 38 if (al.map != NULL)
36 al.map->dso->hit = 1; 39 al.map->dso->hit = 1;
@@ -38,25 +41,26 @@ static int build_id__mark_dso_hit(union perf_event *event,
38 return 0; 41 return 0;
39} 42}
40 43
41static int perf_event__exit_del_thread(union perf_event *event, 44static int perf_event__exit_del_thread(struct perf_tool *tool __used,
45 union perf_event *event,
42 struct perf_sample *sample __used, 46 struct perf_sample *sample __used,
43 struct perf_session *session) 47 struct machine *machine)
44{ 48{
45 struct thread *thread = perf_session__findnew(session, event->fork.tid); 49 struct thread *thread = machine__findnew_thread(machine, event->fork.tid);
46 50
47 dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid, 51 dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid,
48 event->fork.ppid, event->fork.ptid); 52 event->fork.ppid, event->fork.ptid);
49 53
50 if (thread) { 54 if (thread) {
51 rb_erase(&thread->rb_node, &session->threads); 55 rb_erase(&thread->rb_node, &machine->threads);
52 session->last_match = NULL; 56 machine->last_match = NULL;
53 thread__delete(thread); 57 thread__delete(thread);
54 } 58 }
55 59
56 return 0; 60 return 0;
57} 61}
58 62
59struct perf_event_ops build_id__mark_dso_hit_ops = { 63struct perf_tool build_id__mark_dso_hit_ops = {
60 .sample = build_id__mark_dso_hit, 64 .sample = build_id__mark_dso_hit,
61 .mmap = perf_event__process_mmap, 65 .mmap = perf_event__process_mmap,
62 .fork = perf_event__process_task, 66 .fork = perf_event__process_task,
diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h
index 5dafb00eaa06..a993ba87d996 100644
--- a/tools/perf/util/build-id.h
+++ b/tools/perf/util/build-id.h
@@ -3,7 +3,7 @@
3 3
4#include "session.h" 4#include "session.h"
5 5
6extern struct perf_event_ops build_id__mark_dso_hit_ops; 6extern struct perf_tool build_id__mark_dso_hit_ops;
7 7
8char *dso__build_id_filename(struct dso *self, char *bf, size_t size); 8char *dso__build_id_filename(struct dso *self, char *bf, size_t size);
9 9
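
This is the pattern repeated across the rest of the series: struct perf_event_ops becomes struct perf_tool, and each callback receives the tool itself plus the resolved struct machine instead of the whole perf_session, so event consumers stop reaching into session internals. A hedged sketch of a minimal tool using the sample-handler signature shown in the build_id__mark_dso_hit() hunk above:

    /* illustrative only; the callback signature is copied from the
     * build-id.c hunk, the handler body is not from the patch */
    static int count_sample(struct perf_tool *tool __used,
                            union perf_event *event __used,
                            struct perf_sample *sample __used,
                            struct perf_evsel *evsel __used,
                            struct machine *machine __used)
    {
        return 0;       /* 0 keeps processing going */
    }

    struct perf_tool counting_tool = {
        .sample = count_sample,
    };
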
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 9b4ff16cac96..7f9c0f1ae3a9 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -101,6 +101,9 @@ int callchain_append(struct callchain_root *root,
101int callchain_merge(struct callchain_cursor *cursor, 101int callchain_merge(struct callchain_cursor *cursor,
102 struct callchain_root *dst, struct callchain_root *src); 102 struct callchain_root *dst, struct callchain_root *src);
103 103
104struct ip_callchain;
105union perf_event;
106
104bool ip_callchain__valid(struct ip_callchain *chain, 107bool ip_callchain__valid(struct ip_callchain *chain,
105 const union perf_event *event); 108 const union perf_event *event);
106/* 109/*
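
The two forward declarations let callchain.h stop depending on event.h: ip_callchain__valid() only mentions these types through pointers, so incomplete types suffice, and translation units that never touch callchains avoid the extra include. The same decoupling trick in miniature:

    /* header.h: no need to include the full definitions */
    struct big_struct;                          /* incomplete type */
    int inspect(const struct big_struct *s);    /* pointer use only */
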
diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
index 96bee5c46008..dbe2f16b1a1a 100644
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -3,7 +3,6 @@
3#include "parse-options.h" 3#include "parse-options.h"
4#include "evsel.h" 4#include "evsel.h"
5#include "cgroup.h" 5#include "cgroup.h"
6#include "debugfs.h" /* MAX_PATH, STR() */
7#include "evlist.h" 6#include "evlist.h"
8 7
9int nr_cgroups; 8int nr_cgroups;
@@ -12,7 +11,7 @@ static int
12cgroupfs_find_mountpoint(char *buf, size_t maxlen) 11cgroupfs_find_mountpoint(char *buf, size_t maxlen)
13{ 12{
14 FILE *fp; 13 FILE *fp;
15 char mountpoint[MAX_PATH+1], tokens[MAX_PATH+1], type[MAX_PATH+1]; 14 char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1];
16 char *token, *saved_ptr = NULL; 15 char *token, *saved_ptr = NULL;
17 int found = 0; 16 int found = 0;
18 17
@@ -25,8 +24,8 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
25 * and inspect every cgroupfs mount point to find one that has 24 * and inspect every cgroupfs mount point to find one that has
26 * perf_event subsystem 25 * perf_event subsystem
27 */ 26 */
28 while (fscanf(fp, "%*s %"STR(MAX_PATH)"s %"STR(MAX_PATH)"s %" 27 while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %"
29 STR(MAX_PATH)"s %*d %*d\n", 28 STR(PATH_MAX)"s %*d %*d\n",
30 mountpoint, type, tokens) == 3) { 29 mountpoint, type, tokens) == 3) {
31 30
32 if (!strcmp(type, "cgroup")) { 31 if (!strcmp(type, "cgroup")) {
@@ -57,15 +56,15 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
57 56
58static int open_cgroup(char *name) 57static int open_cgroup(char *name)
59{ 58{
60 char path[MAX_PATH+1]; 59 char path[PATH_MAX + 1];
61 char mnt[MAX_PATH+1]; 60 char mnt[PATH_MAX + 1];
62 int fd; 61 int fd;
63 62
64 63
65 if (cgroupfs_find_mountpoint(mnt, MAX_PATH+1)) 64 if (cgroupfs_find_mountpoint(mnt, PATH_MAX + 1))
66 return -1; 65 return -1;
67 66
68 snprintf(path, MAX_PATH, "%s/%s", mnt, name); 67 snprintf(path, PATH_MAX, "%s/%s", mnt, name);
69 68
70 fd = open(path, O_RDONLY); 69 fd = open(path, O_RDONLY);
71 if (fd == -1) 70 if (fd == -1)
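
cgroup.c switches from the homegrown MAX_PATH (256, formerly defined in debugfs.h and removed below) to the system PATH_MAX, and keeps using the STR() stringizer, presumably now supplied by a shared header, to bound the fscanf conversions. A standalone sketch of that trick:

    #include <limits.h>   /* PATH_MAX */
    #include <stdio.h>

    #define _STR(x) #x
    #define STR(x)  _STR(x)     /* STR(PATH_MAX) -> e.g. "4096" */

    /* "%4096s" caps the conversion so fscanf cannot overrun buf,
     * which must be PATH_MAX + 1 bytes. */
    static int read_field(FILE *fp, char *buf)
    {
        return fscanf(fp, "%" STR(PATH_MAX) "s", buf);
    }
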
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 80d9598db31a..0deac6a14b65 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -1,5 +1,8 @@
1/* 1/*
2 * GIT - The information manager from hell 2 * config.c
3 *
4 * Helper functions for parsing config items.
5 * Originally copied from GIT source.
3 * 6 *
4 * Copyright (C) Linus Torvalds, 2005 7 * Copyright (C) Linus Torvalds, 2005
5 * Copyright (C) Johannes Schindelin, 2005 8 * Copyright (C) Johannes Schindelin, 2005
diff --git a/tools/perf/util/debugfs.c b/tools/perf/util/debugfs.c
index a88fefc0cc0a..ffc35e748e89 100644
--- a/tools/perf/util/debugfs.c
+++ b/tools/perf/util/debugfs.c
@@ -2,8 +2,12 @@
2#include "debugfs.h" 2#include "debugfs.h"
3#include "cache.h" 3#include "cache.h"
4 4
5#include <linux/kernel.h>
6#include <sys/mount.h>
7
5static int debugfs_premounted; 8static int debugfs_premounted;
6static char debugfs_mountpoint[MAX_PATH+1]; 9char debugfs_mountpoint[PATH_MAX + 1] = "/sys/kernel/debug";
10char tracing_events_path[PATH_MAX + 1] = "/sys/kernel/debug/tracing/events";
7 11
8static const char *debugfs_known_mountpoints[] = { 12static const char *debugfs_known_mountpoints[] = {
9 "/sys/kernel/debug/", 13 "/sys/kernel/debug/",
@@ -62,11 +66,9 @@ const char *debugfs_find_mountpoint(void)
62 /* give up and parse /proc/mounts */ 66 /* give up and parse /proc/mounts */
63 fp = fopen("/proc/mounts", "r"); 67 fp = fopen("/proc/mounts", "r");
64 if (fp == NULL) 68 if (fp == NULL)
65 die("Can't open /proc/mounts for read"); 69 return NULL;
66 70
67 while (fscanf(fp, "%*s %" 71 while (fscanf(fp, "%*s %" STR(PATH_MAX) "s %99s %*s %*d %*d\n",
68 STR(MAX_PATH)
69 "s %99s %*s %*d %*d\n",
70 debugfs_mountpoint, type) == 2) { 72 debugfs_mountpoint, type) == 2) {
71 if (strcmp(type, "debugfs") == 0) 73 if (strcmp(type, "debugfs") == 0)
72 break; 74 break;
@@ -106,6 +108,12 @@ int debugfs_valid_entry(const char *path)
106 return 0; 108 return 0;
107} 109}
108 110
111static void debugfs_set_tracing_events_path(const char *mountpoint)
112{
113 snprintf(tracing_events_path, sizeof(tracing_events_path), "%s/%s",
114 mountpoint, "tracing/events");
115}
116
109/* mount the debugfs somewhere if it's not mounted */ 117/* mount the debugfs somewhere if it's not mounted */
110 118
111char *debugfs_mount(const char *mountpoint) 119char *debugfs_mount(const char *mountpoint)
@@ -113,7 +121,7 @@ char *debugfs_mount(const char *mountpoint)
113 /* see if it's already mounted */ 121 /* see if it's already mounted */
114 if (debugfs_find_mountpoint()) { 122 if (debugfs_find_mountpoint()) {
115 debugfs_premounted = 1; 123 debugfs_premounted = 1;
116 return debugfs_mountpoint; 124 goto out;
117 } 125 }
118 126
119 /* if not mounted and no argument */ 127 /* if not mounted and no argument */
@@ -129,12 +137,19 @@ char *debugfs_mount(const char *mountpoint)
129 return NULL; 137 return NULL;
130 138
131 /* save the mountpoint */ 139 /* save the mountpoint */
132 strncpy(debugfs_mountpoint, mountpoint, sizeof(debugfs_mountpoint));
133 debugfs_found = 1; 140 debugfs_found = 1;
134 141 strncpy(debugfs_mountpoint, mountpoint, sizeof(debugfs_mountpoint));
142out:
143 debugfs_set_tracing_events_path(debugfs_mountpoint);
135 return debugfs_mountpoint; 144 return debugfs_mountpoint;
136} 145}
137 146
147void debugfs_set_path(const char *mountpoint)
148{
149 snprintf(debugfs_mountpoint, sizeof(debugfs_mountpoint), "%s", mountpoint);
150 debugfs_set_tracing_events_path(mountpoint);
151}
152
138/* umount the debugfs */ 153/* umount the debugfs */
139 154
140int debugfs_umount(void) 155int debugfs_umount(void)
@@ -158,7 +173,7 @@ int debugfs_umount(void)
158 173
159int debugfs_write(const char *entry, const char *value) 174int debugfs_write(const char *entry, const char *value)
160{ 175{
161 char path[MAX_PATH+1]; 176 char path[PATH_MAX + 1];
162 int ret, count; 177 int ret, count;
163 int fd; 178 int fd;
164 179
@@ -203,7 +218,7 @@ int debugfs_write(const char *entry, const char *value)
203 */ 218 */
204int debugfs_read(const char *entry, char *buffer, size_t size) 219int debugfs_read(const char *entry, char *buffer, size_t size)
205{ 220{
206 char path[MAX_PATH+1]; 221 char path[PATH_MAX + 1];
207 int ret; 222 int ret;
208 int fd; 223 int fd;
209 224
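
util/debugfs.c now exports debugfs_mountpoint alongside the new tracing_events_path, both statically initialized to the conventional /sys/kernel/debug locations so they are usable before any probing runs; debugfs_mount() and the new debugfs_set_path() keep the pair in sync, and the /proc/mounts fallback returns NULL instead of die()ing. A hedged usage sketch:

    #include <stdio.h>

    extern char debugfs_mountpoint[];
    extern char tracing_events_path[];
    void debugfs_set_path(const char *mountpoint);

    int main(void)
    {
        debugfs_set_path("/mnt/dbg");
        /* prints "/mnt/dbg/tracing/events" */
        printf("%s\n", tracing_events_path);
        return 0;
    }
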
diff --git a/tools/perf/util/debugfs.h b/tools/perf/util/debugfs.h
index 83a02879745f..4a878f735eb0 100644
--- a/tools/perf/util/debugfs.h
+++ b/tools/perf/util/debugfs.h
@@ -1,25 +1,18 @@
1#ifndef __DEBUGFS_H__ 1#ifndef __DEBUGFS_H__
2#define __DEBUGFS_H__ 2#define __DEBUGFS_H__
3 3
4#include <sys/mount.h> 4const char *debugfs_find_mountpoint(void);
5int debugfs_valid_mountpoint(const char *debugfs);
6int debugfs_valid_entry(const char *path);
7char *debugfs_mount(const char *mountpoint);
8int debugfs_umount(void);
9void debugfs_set_path(const char *mountpoint);
10int debugfs_write(const char *entry, const char *value);
11int debugfs_read(const char *entry, char *buffer, size_t size);
12void debugfs_force_cleanup(void);
13int debugfs_make_path(const char *element, char *buffer, int size);
5 14
6#ifndef MAX_PATH 15extern char debugfs_mountpoint[];
7# define MAX_PATH 256 16extern char tracing_events_path[];
8#endif
9
10#ifndef STR
11# define _STR(x) #x
12# define STR(x) _STR(x)
13#endif
14
15extern const char *debugfs_find_mountpoint(void);
16extern int debugfs_valid_mountpoint(const char *debugfs);
17extern int debugfs_valid_entry(const char *path);
18extern char *debugfs_mount(const char *mountpoint);
19extern int debugfs_umount(void);
20extern int debugfs_write(const char *entry, const char *value);
21extern int debugfs_read(const char *entry, char *buffer, size_t size);
22extern void debugfs_force_cleanup(void);
23extern int debugfs_make_path(const char *element, char *buffer, int size);
24 17
25#endif /* __DEBUGFS_H__ */ 18#endif /* __DEBUGFS_H__ */
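
Beyond moving MAX_PATH and STR() out, the header drops the extern keyword from the function prototypes; in C a function declaration has external linkage by default, so the two forms below are identical and the shorter one wins. extern stays, correctly, on the array declarations, where it is required:

    int debugfs_umount(void);          /* after: implicit extern */
    extern int debugfs_umount(void);   /* before: same meaning */

    extern char debugfs_mountpoint[];  /* needed: declares, does not define */
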
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 437f8ca679a0..73ddaf06b8e7 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -1,7 +1,6 @@
1#include <linux/types.h> 1#include <linux/types.h>
2#include "event.h" 2#include "event.h"
3#include "debug.h" 3#include "debug.h"
4#include "session.h"
5#include "sort.h" 4#include "sort.h"
6#include "string.h" 5#include "string.h"
7#include "strlist.h" 6#include "strlist.h"
@@ -44,36 +43,27 @@ static struct perf_sample synth_sample = {
44 .period = 1, 43 .period = 1,
45}; 44};
46 45
47static pid_t perf_event__synthesize_comm(union perf_event *event, pid_t pid, 46static pid_t perf_event__get_comm_tgid(pid_t pid, char *comm, size_t len)
48 int full, perf_event__handler_t process,
49 struct perf_session *session)
50{ 47{
51 char filename[PATH_MAX]; 48 char filename[PATH_MAX];
52 char bf[BUFSIZ]; 49 char bf[BUFSIZ];
53 FILE *fp; 50 FILE *fp;
54 size_t size = 0; 51 size_t size = 0;
55 DIR *tasks; 52 pid_t tgid = -1;
56 struct dirent dirent, *next;
57 pid_t tgid = 0;
58 53
59 snprintf(filename, sizeof(filename), "/proc/%d/status", pid); 54 snprintf(filename, sizeof(filename), "/proc/%d/status", pid);
60 55
61 fp = fopen(filename, "r"); 56 fp = fopen(filename, "r");
62 if (fp == NULL) { 57 if (fp == NULL) {
63out_race:
64 /*
65 * We raced with a task exiting - just return:
66 */
67 pr_debug("couldn't open %s\n", filename); 58 pr_debug("couldn't open %s\n", filename);
68 return 0; 59 return 0;
69 } 60 }
70 61
71 memset(&event->comm, 0, sizeof(event->comm)); 62 while (!comm[0] || (tgid < 0)) {
72
73 while (!event->comm.comm[0] || !event->comm.pid) {
74 if (fgets(bf, sizeof(bf), fp) == NULL) { 63 if (fgets(bf, sizeof(bf), fp) == NULL) {
75 pr_warning("couldn't get COMM and pgid, malformed %s\n", filename); 64 pr_warning("couldn't get COMM and pgid, malformed %s\n",
76 goto out; 65 filename);
66 break;
77 } 67 }
78 68
79 if (memcmp(bf, "Name:", 5) == 0) { 69 if (memcmp(bf, "Name:", 5) == 0) {
@@ -81,33 +71,65 @@ out_race:
81 while (*name && isspace(*name)) 71 while (*name && isspace(*name))
82 ++name; 72 ++name;
83 size = strlen(name) - 1; 73 size = strlen(name) - 1;
84 memcpy(event->comm.comm, name, size++); 74 if (size >= len)
75 size = len - 1;
76 memcpy(comm, name, size);
77
85 } else if (memcmp(bf, "Tgid:", 5) == 0) { 78 } else if (memcmp(bf, "Tgid:", 5) == 0) {
86 char *tgids = bf + 5; 79 char *tgids = bf + 5;
87 while (*tgids && isspace(*tgids)) 80 while (*tgids && isspace(*tgids))
88 ++tgids; 81 ++tgids;
89 tgid = event->comm.pid = atoi(tgids); 82 tgid = atoi(tgids);
90 } 83 }
91 } 84 }
92 85
86 fclose(fp);
87
88 return tgid;
89}
90
91static pid_t perf_event__synthesize_comm(struct perf_tool *tool,
92 union perf_event *event, pid_t pid,
93 int full,
94 perf_event__handler_t process,
95 struct machine *machine)
96{
97 char filename[PATH_MAX];
98 size_t size;
99 DIR *tasks;
100 struct dirent dirent, *next;
101 pid_t tgid;
102
103 memset(&event->comm, 0, sizeof(event->comm));
104
105 tgid = perf_event__get_comm_tgid(pid, event->comm.comm,
106 sizeof(event->comm.comm));
107 if (tgid < 0)
108 goto out;
109
110 event->comm.pid = tgid;
93 event->comm.header.type = PERF_RECORD_COMM; 111 event->comm.header.type = PERF_RECORD_COMM;
112
113 size = strlen(event->comm.comm) + 1;
94 size = ALIGN(size, sizeof(u64)); 114 size = ALIGN(size, sizeof(u64));
95 memset(event->comm.comm + size, 0, session->id_hdr_size); 115 memset(event->comm.comm + size, 0, machine->id_hdr_size);
96 event->comm.header.size = (sizeof(event->comm) - 116 event->comm.header.size = (sizeof(event->comm) -
97 (sizeof(event->comm.comm) - size) + 117 (sizeof(event->comm.comm) - size) +
98 session->id_hdr_size); 118 machine->id_hdr_size);
99 if (!full) { 119 if (!full) {
100 event->comm.tid = pid; 120 event->comm.tid = pid;
101 121
102 process(event, &synth_sample, session); 122 process(tool, event, &synth_sample, machine);
103 goto out; 123 goto out;
104 } 124 }
105 125
106 snprintf(filename, sizeof(filename), "/proc/%d/task", pid); 126 snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
107 127
108 tasks = opendir(filename); 128 tasks = opendir(filename);
109 if (tasks == NULL) 129 if (tasks == NULL) {
110 goto out_race; 130 pr_debug("couldn't open %s\n", filename);
131 return 0;
132 }
111 133
112 while (!readdir_r(tasks, &dirent, &next) && next) { 134 while (!readdir_r(tasks, &dirent, &next) && next) {
113 char *end; 135 char *end;
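
The /proc/<pid>/status parsing is split out of perf_event__synthesize_comm() into perf_event__get_comm_tgid(), which fills a caller-supplied comm buffer (now with an explicit bound against len) and returns the thread group id, or -1 for a malformed file. A minimal standalone sketch of the same parse, assuming only the standard Name:/Tgid: lines of the status file:

    #include <stdio.h>
    #include <sys/types.h>

    static pid_t comm_tgid(pid_t pid, char comm[16])
    {
        char path[64], line[256];
        pid_t tgid = -1;
        FILE *fp;

        snprintf(path, sizeof(path), "/proc/%d/status", pid);
        fp = fopen(path, "r");
        if (fp == NULL)
            return -1;
        while (fgets(line, sizeof(line), fp)) {
            /* %15s bounds the copy to the 16-byte comm buffer */
            if (sscanf(line, "Name: %15s", comm) == 1)
                continue;
            if (sscanf(line, "Tgid: %d", &tgid) == 1)
                break;
        }
        fclose(fp);
        return tgid;
    }
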
@@ -115,22 +137,32 @@ out_race:
115 if (*end) 137 if (*end)
116 continue; 138 continue;
117 139
 140 /* already have tgid; just want to update the comm */
141 (void) perf_event__get_comm_tgid(pid, event->comm.comm,
142 sizeof(event->comm.comm));
143
144 size = strlen(event->comm.comm) + 1;
145 size = ALIGN(size, sizeof(u64));
146 memset(event->comm.comm + size, 0, machine->id_hdr_size);
147 event->comm.header.size = (sizeof(event->comm) -
148 (sizeof(event->comm.comm) - size) +
149 machine->id_hdr_size);
150
118 event->comm.tid = pid; 151 event->comm.tid = pid;
119 152
120 process(event, &synth_sample, session); 153 process(tool, event, &synth_sample, machine);
121 } 154 }
122 155
123 closedir(tasks); 156 closedir(tasks);
124out: 157out:
125 fclose(fp);
126
127 return tgid; 158 return tgid;
128} 159}
129 160
130static int perf_event__synthesize_mmap_events(union perf_event *event, 161static int perf_event__synthesize_mmap_events(struct perf_tool *tool,
162 union perf_event *event,
131 pid_t pid, pid_t tgid, 163 pid_t pid, pid_t tgid,
132 perf_event__handler_t process, 164 perf_event__handler_t process,
133 struct perf_session *session) 165 struct machine *machine)
134{ 166{
135 char filename[PATH_MAX]; 167 char filename[PATH_MAX];
136 FILE *fp; 168 FILE *fp;
@@ -193,12 +225,12 @@ static int perf_event__synthesize_mmap_events(union perf_event *event,
193 event->mmap.len -= event->mmap.start; 225 event->mmap.len -= event->mmap.start;
194 event->mmap.header.size = (sizeof(event->mmap) - 226 event->mmap.header.size = (sizeof(event->mmap) -
195 (sizeof(event->mmap.filename) - size)); 227 (sizeof(event->mmap.filename) - size));
196 memset(event->mmap.filename + size, 0, session->id_hdr_size); 228 memset(event->mmap.filename + size, 0, machine->id_hdr_size);
197 event->mmap.header.size += session->id_hdr_size; 229 event->mmap.header.size += machine->id_hdr_size;
198 event->mmap.pid = tgid; 230 event->mmap.pid = tgid;
199 event->mmap.tid = pid; 231 event->mmap.tid = pid;
200 232
201 process(event, &synth_sample, session); 233 process(tool, event, &synth_sample, machine);
202 } 234 }
203 } 235 }
204 236
@@ -206,14 +238,14 @@ static int perf_event__synthesize_mmap_events(union perf_event *event,
206 return 0; 238 return 0;
207} 239}
208 240
209int perf_event__synthesize_modules(perf_event__handler_t process, 241int perf_event__synthesize_modules(struct perf_tool *tool,
210 struct perf_session *session, 242 perf_event__handler_t process,
211 struct machine *machine) 243 struct machine *machine)
212{ 244{
213 struct rb_node *nd; 245 struct rb_node *nd;
214 struct map_groups *kmaps = &machine->kmaps; 246 struct map_groups *kmaps = &machine->kmaps;
215 union perf_event *event = zalloc((sizeof(event->mmap) + 247 union perf_event *event = zalloc((sizeof(event->mmap) +
216 session->id_hdr_size)); 248 machine->id_hdr_size));
217 if (event == NULL) { 249 if (event == NULL) {
218 pr_debug("Not enough memory synthesizing mmap event " 250 pr_debug("Not enough memory synthesizing mmap event "
219 "for kernel modules\n"); 251 "for kernel modules\n");
@@ -243,15 +275,15 @@ int perf_event__synthesize_modules(perf_event__handler_t process,
243 event->mmap.header.type = PERF_RECORD_MMAP; 275 event->mmap.header.type = PERF_RECORD_MMAP;
244 event->mmap.header.size = (sizeof(event->mmap) - 276 event->mmap.header.size = (sizeof(event->mmap) -
245 (sizeof(event->mmap.filename) - size)); 277 (sizeof(event->mmap.filename) - size));
246 memset(event->mmap.filename + size, 0, session->id_hdr_size); 278 memset(event->mmap.filename + size, 0, machine->id_hdr_size);
247 event->mmap.header.size += session->id_hdr_size; 279 event->mmap.header.size += machine->id_hdr_size;
248 event->mmap.start = pos->start; 280 event->mmap.start = pos->start;
249 event->mmap.len = pos->end - pos->start; 281 event->mmap.len = pos->end - pos->start;
250 event->mmap.pid = machine->pid; 282 event->mmap.pid = machine->pid;
251 283
252 memcpy(event->mmap.filename, pos->dso->long_name, 284 memcpy(event->mmap.filename, pos->dso->long_name,
253 pos->dso->long_name_len + 1); 285 pos->dso->long_name_len + 1);
254 process(event, &synth_sample, session); 286 process(tool, event, &synth_sample, machine);
255 } 287 }
256 288
257 free(event); 289 free(event);
@@ -260,40 +292,69 @@ int perf_event__synthesize_modules(perf_event__handler_t process,
260 292
261static int __event__synthesize_thread(union perf_event *comm_event, 293static int __event__synthesize_thread(union perf_event *comm_event,
262 union perf_event *mmap_event, 294 union perf_event *mmap_event,
263 pid_t pid, perf_event__handler_t process, 295 pid_t pid, int full,
264 struct perf_session *session) 296 perf_event__handler_t process,
297 struct perf_tool *tool,
298 struct machine *machine)
265{ 299{
266 pid_t tgid = perf_event__synthesize_comm(comm_event, pid, 1, process, 300 pid_t tgid = perf_event__synthesize_comm(tool, comm_event, pid, full,
267 session); 301 process, machine);
268 if (tgid == -1) 302 if (tgid == -1)
269 return -1; 303 return -1;
270 return perf_event__synthesize_mmap_events(mmap_event, pid, tgid, 304 return perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid,
271 process, session); 305 process, machine);
272} 306}
273 307
274int perf_event__synthesize_thread_map(struct thread_map *threads, 308int perf_event__synthesize_thread_map(struct perf_tool *tool,
309 struct thread_map *threads,
275 perf_event__handler_t process, 310 perf_event__handler_t process,
276 struct perf_session *session) 311 struct machine *machine)
277{ 312{
278 union perf_event *comm_event, *mmap_event; 313 union perf_event *comm_event, *mmap_event;
279 int err = -1, thread; 314 int err = -1, thread, j;
280 315
281 comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size); 316 comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
282 if (comm_event == NULL) 317 if (comm_event == NULL)
283 goto out; 318 goto out;
284 319
285 mmap_event = malloc(sizeof(mmap_event->mmap) + session->id_hdr_size); 320 mmap_event = malloc(sizeof(mmap_event->mmap) + machine->id_hdr_size);
286 if (mmap_event == NULL) 321 if (mmap_event == NULL)
287 goto out_free_comm; 322 goto out_free_comm;
288 323
289 err = 0; 324 err = 0;
290 for (thread = 0; thread < threads->nr; ++thread) { 325 for (thread = 0; thread < threads->nr; ++thread) {
291 if (__event__synthesize_thread(comm_event, mmap_event, 326 if (__event__synthesize_thread(comm_event, mmap_event,
292 threads->map[thread], 327 threads->map[thread], 0,
293 process, session)) { 328 process, tool, machine)) {
294 err = -1; 329 err = -1;
295 break; 330 break;
296 } 331 }
332
333 /*
334 * comm.pid is set to thread group id by
335 * perf_event__synthesize_comm
336 */
337 if ((int) comm_event->comm.pid != threads->map[thread]) {
338 bool need_leader = true;
339
340 /* is thread group leader in thread_map? */
341 for (j = 0; j < threads->nr; ++j) {
342 if ((int) comm_event->comm.pid == threads->map[j]) {
343 need_leader = false;
344 break;
345 }
346 }
347
348 /* if not, generate events for it */
349 if (need_leader &&
350 __event__synthesize_thread(comm_event,
351 mmap_event,
352 comm_event->comm.pid, 0,
353 process, tool, machine)) {
354 err = -1;
355 break;
356 }
357 }
297 } 358 }
298 free(mmap_event); 359 free(mmap_event);
299out_free_comm: 360out_free_comm:
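
The new loop handles per-thread monitoring (e.g. attaching to a single worker thread): perf_event__synthesize_comm() reports the thread group id in comm.pid, and if that leader is not itself in the thread_map, COMM/MMAP events are synthesized for it too, since the worker's address space is described by the leader's /proc/<tgid>/maps. The membership test at its core, as a hedged sketch:

    #include <stdbool.h>
    #include <sys/types.h>

    static bool leader_missing(const pid_t *map, int nr, pid_t tgid)
    {
        int j;

        for (j = 0; j < nr; ++j)
            if (map[j] == tgid)
                return false;   /* leader already being monitored */
        return true;            /* synthesize events for it as well */
    }
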
@@ -302,19 +363,20 @@ out:
302 return err; 363 return err;
303} 364}
304 365
305int perf_event__synthesize_threads(perf_event__handler_t process, 366int perf_event__synthesize_threads(struct perf_tool *tool,
306 struct perf_session *session) 367 perf_event__handler_t process,
368 struct machine *machine)
307{ 369{
308 DIR *proc; 370 DIR *proc;
309 struct dirent dirent, *next; 371 struct dirent dirent, *next;
310 union perf_event *comm_event, *mmap_event; 372 union perf_event *comm_event, *mmap_event;
311 int err = -1; 373 int err = -1;
312 374
313 comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size); 375 comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
314 if (comm_event == NULL) 376 if (comm_event == NULL)
315 goto out; 377 goto out;
316 378
317 mmap_event = malloc(sizeof(mmap_event->mmap) + session->id_hdr_size); 379 mmap_event = malloc(sizeof(mmap_event->mmap) + machine->id_hdr_size);
318 if (mmap_event == NULL) 380 if (mmap_event == NULL)
319 goto out_free_comm; 381 goto out_free_comm;
320 382
@@ -329,8 +391,8 @@ int perf_event__synthesize_threads(perf_event__handler_t process,
329 if (*end) /* only interested in proper numerical dirents */ 391 if (*end) /* only interested in proper numerical dirents */
330 continue; 392 continue;
331 393
332 __event__synthesize_thread(comm_event, mmap_event, pid, 394 __event__synthesize_thread(comm_event, mmap_event, pid, 1,
333 process, session); 395 process, tool, machine);
334 } 396 }
335 397
336 closedir(proc); 398 closedir(proc);
@@ -365,8 +427,8 @@ static int find_symbol_cb(void *arg, const char *name, char type,
365 return 1; 427 return 1;
366} 428}
367 429
368int perf_event__synthesize_kernel_mmap(perf_event__handler_t process, 430int perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
369 struct perf_session *session, 431 perf_event__handler_t process,
370 struct machine *machine, 432 struct machine *machine,
371 const char *symbol_name) 433 const char *symbol_name)
372{ 434{
@@ -383,7 +445,7 @@ int perf_event__synthesize_kernel_mmap(perf_event__handler_t process,
383 */ 445 */
384 struct process_symbol_args args = { .name = symbol_name, }; 446 struct process_symbol_args args = { .name = symbol_name, };
385 union perf_event *event = zalloc((sizeof(event->mmap) + 447 union perf_event *event = zalloc((sizeof(event->mmap) +
386 session->id_hdr_size)); 448 machine->id_hdr_size));
387 if (event == NULL) { 449 if (event == NULL) {
388 pr_debug("Not enough memory synthesizing mmap event " 450 pr_debug("Not enough memory synthesizing mmap event "
389 "for kernel modules\n"); 451 "for kernel modules\n");
@@ -417,25 +479,32 @@ int perf_event__synthesize_kernel_mmap(perf_event__handler_t process,
417 size = ALIGN(size, sizeof(u64)); 479 size = ALIGN(size, sizeof(u64));
418 event->mmap.header.type = PERF_RECORD_MMAP; 480 event->mmap.header.type = PERF_RECORD_MMAP;
419 event->mmap.header.size = (sizeof(event->mmap) - 481 event->mmap.header.size = (sizeof(event->mmap) -
420 (sizeof(event->mmap.filename) - size) + session->id_hdr_size); 482 (sizeof(event->mmap.filename) - size) + machine->id_hdr_size);
421 event->mmap.pgoff = args.start; 483 event->mmap.pgoff = args.start;
422 event->mmap.start = map->start; 484 event->mmap.start = map->start;
423 event->mmap.len = map->end - event->mmap.start; 485 event->mmap.len = map->end - event->mmap.start;
424 event->mmap.pid = machine->pid; 486 event->mmap.pid = machine->pid;
425 487
426 err = process(event, &synth_sample, session); 488 err = process(tool, event, &synth_sample, machine);
427 free(event); 489 free(event);
428 490
429 return err; 491 return err;
430} 492}
431 493
432int perf_event__process_comm(union perf_event *event, 494size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp)
495{
496 return fprintf(fp, ": %s:%d\n", event->comm.comm, event->comm.tid);
497}
498
499int perf_event__process_comm(struct perf_tool *tool __used,
500 union perf_event *event,
433 struct perf_sample *sample __used, 501 struct perf_sample *sample __used,
434 struct perf_session *session) 502 struct machine *machine)
435{ 503{
436 struct thread *thread = perf_session__findnew(session, event->comm.tid); 504 struct thread *thread = machine__findnew_thread(machine, event->comm.tid);
437 505
438 dump_printf(": %s:%d\n", event->comm.comm, event->comm.tid); 506 if (dump_trace)
507 perf_event__fprintf_comm(event, stdout);
439 508
440 if (thread == NULL || thread__set_comm(thread, event->comm.comm)) { 509 if (thread == NULL || thread__set_comm(thread, event->comm.comm)) {
441 dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n"); 510 dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
@@ -445,13 +514,13 @@ int perf_event__process_comm(union perf_event *event,
445 return 0; 514 return 0;
446} 515}
447 516
448int perf_event__process_lost(union perf_event *event, 517int perf_event__process_lost(struct perf_tool *tool __used,
518 union perf_event *event,
449 struct perf_sample *sample __used, 519 struct perf_sample *sample __used,
450 struct perf_session *session) 520 struct machine *machine __used)
451{ 521{
452 dump_printf(": id:%" PRIu64 ": lost:%" PRIu64 "\n", 522 dump_printf(": id:%" PRIu64 ": lost:%" PRIu64 "\n",
453 event->lost.id, event->lost.lost); 523 event->lost.id, event->lost.lost);
454 session->hists.stats.total_lost += event->lost.lost;
455 return 0; 524 return 0;
456} 525}
457 526
@@ -468,21 +537,15 @@ static void perf_event__set_kernel_mmap_len(union perf_event *event,
468 maps[MAP__FUNCTION]->end = ~0ULL; 537 maps[MAP__FUNCTION]->end = ~0ULL;
469} 538}
470 539
471static int perf_event__process_kernel_mmap(union perf_event *event, 540static int perf_event__process_kernel_mmap(struct perf_tool *tool __used,
472 struct perf_session *session) 541 union perf_event *event,
542 struct machine *machine)
473{ 543{
474 struct map *map; 544 struct map *map;
475 char kmmap_prefix[PATH_MAX]; 545 char kmmap_prefix[PATH_MAX];
476 struct machine *machine;
477 enum dso_kernel_type kernel_type; 546 enum dso_kernel_type kernel_type;
478 bool is_kernel_mmap; 547 bool is_kernel_mmap;
479 548
480 machine = perf_session__findnew_machine(session, event->mmap.pid);
481 if (!machine) {
482 pr_err("Can't find id %d's machine\n", event->mmap.pid);
483 goto out_problem;
484 }
485
486 machine__mmap_name(machine, kmmap_prefix, sizeof(kmmap_prefix)); 549 machine__mmap_name(machine, kmmap_prefix, sizeof(kmmap_prefix));
487 if (machine__is_host(machine)) 550 if (machine__is_host(machine))
488 kernel_type = DSO_TYPE_KERNEL; 551 kernel_type = DSO_TYPE_KERNEL;
@@ -549,9 +612,9 @@ static int perf_event__process_kernel_mmap(union perf_event *event,
549 * time /proc/sys/kernel/kptr_restrict was non zero. 612 * time /proc/sys/kernel/kptr_restrict was non zero.
550 */ 613 */
551 if (event->mmap.pgoff != 0) { 614 if (event->mmap.pgoff != 0) {
552 perf_session__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps, 615 maps__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps,
553 symbol_name, 616 symbol_name,
554 event->mmap.pgoff); 617 event->mmap.pgoff);
555 } 618 }
556 619
557 if (machine__is_default_guest(machine)) { 620 if (machine__is_default_guest(machine)) {
@@ -567,32 +630,35 @@ out_problem:
567 return -1; 630 return -1;
568} 631}
569 632
570int perf_event__process_mmap(union perf_event *event, 633size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp)
634{
635 return fprintf(fp, " %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %s\n",
636 event->mmap.pid, event->mmap.tid, event->mmap.start,
637 event->mmap.len, event->mmap.pgoff, event->mmap.filename);
638}
639
640int perf_event__process_mmap(struct perf_tool *tool,
641 union perf_event *event,
571 struct perf_sample *sample __used, 642 struct perf_sample *sample __used,
572 struct perf_session *session) 643 struct machine *machine)
573{ 644{
574 struct machine *machine;
575 struct thread *thread; 645 struct thread *thread;
576 struct map *map; 646 struct map *map;
577 u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; 647 u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
578 int ret = 0; 648 int ret = 0;
579 649
580 dump_printf(" %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %s\n", 650 if (dump_trace)
581 event->mmap.pid, event->mmap.tid, event->mmap.start, 651 perf_event__fprintf_mmap(event, stdout);
582 event->mmap.len, event->mmap.pgoff, event->mmap.filename);
583 652
584 if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL || 653 if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL ||
585 cpumode == PERF_RECORD_MISC_KERNEL) { 654 cpumode == PERF_RECORD_MISC_KERNEL) {
586 ret = perf_event__process_kernel_mmap(event, session); 655 ret = perf_event__process_kernel_mmap(tool, event, machine);
587 if (ret < 0) 656 if (ret < 0)
588 goto out_problem; 657 goto out_problem;
589 return 0; 658 return 0;
590 } 659 }
591 660
592 machine = perf_session__find_host_machine(session); 661 thread = machine__findnew_thread(machine, event->mmap.pid);
593 if (machine == NULL)
594 goto out_problem;
595 thread = perf_session__findnew(session, event->mmap.pid);
596 if (thread == NULL) 662 if (thread == NULL)
597 goto out_problem; 663 goto out_problem;
598 map = map__new(&machine->user_dsos, event->mmap.start, 664 map = map__new(&machine->user_dsos, event->mmap.start,
@@ -610,18 +676,26 @@ out_problem:
610 return 0; 676 return 0;
611} 677}
612 678
613int perf_event__process_task(union perf_event *event, 679size_t perf_event__fprintf_task(union perf_event *event, FILE *fp)
680{
681 return fprintf(fp, "(%d:%d):(%d:%d)\n",
682 event->fork.pid, event->fork.tid,
683 event->fork.ppid, event->fork.ptid);
684}
685
686int perf_event__process_task(struct perf_tool *tool __used,
687 union perf_event *event,
614 struct perf_sample *sample __used, 688 struct perf_sample *sample __used,
615 struct perf_session *session) 689 struct machine *machine)
616{ 690{
617 struct thread *thread = perf_session__findnew(session, event->fork.tid); 691 struct thread *thread = machine__findnew_thread(machine, event->fork.tid);
618 struct thread *parent = perf_session__findnew(session, event->fork.ptid); 692 struct thread *parent = machine__findnew_thread(machine, event->fork.ptid);
619 693
620 dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid, 694 if (dump_trace)
621 event->fork.ppid, event->fork.ptid); 695 perf_event__fprintf_task(event, stdout);
622 696
623 if (event->header.type == PERF_RECORD_EXIT) { 697 if (event->header.type == PERF_RECORD_EXIT) {
624 perf_session__remove_thread(session, thread); 698 machine__remove_thread(machine, thread);
625 return 0; 699 return 0;
626 } 700 }
627 701
@@ -634,22 +708,45 @@ int perf_event__process_task(union perf_event *event,
634 return 0; 708 return 0;
635} 709}
636 710
637int perf_event__process(union perf_event *event, struct perf_sample *sample, 711size_t perf_event__fprintf(union perf_event *event, FILE *fp)
638 struct perf_session *session) 712{
713 size_t ret = fprintf(fp, "PERF_RECORD_%s",
714 perf_event__name(event->header.type));
715
716 switch (event->header.type) {
717 case PERF_RECORD_COMM:
718 ret += perf_event__fprintf_comm(event, fp);
719 break;
720 case PERF_RECORD_FORK:
721 case PERF_RECORD_EXIT:
722 ret += perf_event__fprintf_task(event, fp);
723 break;
724 case PERF_RECORD_MMAP:
725 ret += perf_event__fprintf_mmap(event, fp);
726 break;
727 default:
728 ret += fprintf(fp, "\n");
729 }
730
731 return ret;
732}
733
734int perf_event__process(struct perf_tool *tool, union perf_event *event,
735 struct perf_sample *sample, struct machine *machine)
639{ 736{
640 switch (event->header.type) { 737 switch (event->header.type) {
641 case PERF_RECORD_COMM: 738 case PERF_RECORD_COMM:
642 perf_event__process_comm(event, sample, session); 739 perf_event__process_comm(tool, event, sample, machine);
643 break; 740 break;
644 case PERF_RECORD_MMAP: 741 case PERF_RECORD_MMAP:
645 perf_event__process_mmap(event, sample, session); 742 perf_event__process_mmap(tool, event, sample, machine);
646 break; 743 break;
647 case PERF_RECORD_FORK: 744 case PERF_RECORD_FORK:
648 case PERF_RECORD_EXIT: 745 case PERF_RECORD_EXIT:
649 perf_event__process_task(event, sample, session); 746 perf_event__process_task(tool, event, sample, machine);
650 break; 747 break;
651 case PERF_RECORD_LOST: 748 case PERF_RECORD_LOST:
652 perf_event__process_lost(event, sample, session); 749 perf_event__process_lost(tool, event, sample, machine);
653 default: 750 default:
654 break; 751 break;
655 } 752 }
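
The dump_printf() calls scattered through the handlers become perf_event__fprintf_{comm,mmap,task}() plus a perf_event__fprintf() dispatcher keyed on header.type, each guarded by dump_trace at the call site. Besides deduplicating the format strings, this lets any caller print an event to an arbitrary stream, as in this hedged sketch ('verbose' is an assumed debugging flag):

    #include <stdio.h>

    static void maybe_dump(union perf_event *event, int verbose)
    {
        if (verbose)
            perf_event__fprintf(event, stderr);   /* any FILE * works */
    }
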
@@ -658,36 +755,29 @@ int perf_event__process(union perf_event *event, struct perf_sample *sample,
658} 755}
659 756
660void thread__find_addr_map(struct thread *self, 757void thread__find_addr_map(struct thread *self,
661 struct perf_session *session, u8 cpumode, 758 struct machine *machine, u8 cpumode,
662 enum map_type type, pid_t pid, u64 addr, 759 enum map_type type, u64 addr,
663 struct addr_location *al) 760 struct addr_location *al)
664{ 761{
665 struct map_groups *mg = &self->mg; 762 struct map_groups *mg = &self->mg;
666 struct machine *machine = NULL;
667 763
668 al->thread = self; 764 al->thread = self;
669 al->addr = addr; 765 al->addr = addr;
670 al->cpumode = cpumode; 766 al->cpumode = cpumode;
671 al->filtered = false; 767 al->filtered = false;
672 768
769 if (machine == NULL) {
770 al->map = NULL;
771 return;
772 }
773
673 if (cpumode == PERF_RECORD_MISC_KERNEL && perf_host) { 774 if (cpumode == PERF_RECORD_MISC_KERNEL && perf_host) {
674 al->level = 'k'; 775 al->level = 'k';
675 machine = perf_session__find_host_machine(session);
676 if (machine == NULL) {
677 al->map = NULL;
678 return;
679 }
680 mg = &machine->kmaps; 776 mg = &machine->kmaps;
681 } else if (cpumode == PERF_RECORD_MISC_USER && perf_host) { 777 } else if (cpumode == PERF_RECORD_MISC_USER && perf_host) {
682 al->level = '.'; 778 al->level = '.';
683 machine = perf_session__find_host_machine(session);
684 } else if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL && perf_guest) { 779 } else if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL && perf_guest) {
685 al->level = 'g'; 780 al->level = 'g';
686 machine = perf_session__find_machine(session, pid);
687 if (machine == NULL) {
688 al->map = NULL;
689 return;
690 }
691 mg = &machine->kmaps; 781 mg = &machine->kmaps;
692 } else { 782 } else {
693 /* 783 /*
@@ -733,13 +823,12 @@ try_again:
733 al->addr = al->map->map_ip(al->map, al->addr); 823 al->addr = al->map->map_ip(al->map, al->addr);
734} 824}
735 825
736void thread__find_addr_location(struct thread *self, 826void thread__find_addr_location(struct thread *thread, struct machine *machine,
737 struct perf_session *session, u8 cpumode, 827 u8 cpumode, enum map_type type, u64 addr,
738 enum map_type type, pid_t pid, u64 addr,
739 struct addr_location *al, 828 struct addr_location *al,
740 symbol_filter_t filter) 829 symbol_filter_t filter)
741{ 830{
742 thread__find_addr_map(self, session, cpumode, type, pid, addr, al); 831 thread__find_addr_map(thread, machine, cpumode, type, addr, al);
743 if (al->map != NULL) 832 if (al->map != NULL)
744 al->sym = map__find_symbol(al->map, al->addr, filter); 833 al->sym = map__find_symbol(al->map, al->addr, filter);
745 else 834 else
@@ -747,13 +836,13 @@ void thread__find_addr_location(struct thread *self,
747} 836}
748 837
749int perf_event__preprocess_sample(const union perf_event *event, 838int perf_event__preprocess_sample(const union perf_event *event,
750 struct perf_session *session, 839 struct machine *machine,
751 struct addr_location *al, 840 struct addr_location *al,
752 struct perf_sample *sample, 841 struct perf_sample *sample,
753 symbol_filter_t filter) 842 symbol_filter_t filter)
754{ 843{
755 u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; 844 u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
756 struct thread *thread = perf_session__findnew(session, event->ip.pid); 845 struct thread *thread = machine__findnew_thread(machine, event->ip.pid);
757 846
758 if (thread == NULL) 847 if (thread == NULL)
759 return -1; 848 return -1;
@@ -764,18 +853,18 @@ int perf_event__preprocess_sample(const union perf_event *event,
764 853
765 dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid); 854 dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
766 /* 855 /*
767 * Have we already created the kernel maps for the host machine? 856 * Have we already created the kernel maps for this machine?
768 * 857 *
769 * This should have happened earlier, when we processed the kernel MMAP 858 * This should have happened earlier, when we processed the kernel MMAP
770 * events, but for older perf.data files there was no such thing, so do 859 * events, but for older perf.data files there was no such thing, so do
771 * it now. 860 * it now.
772 */ 861 */
773 if (cpumode == PERF_RECORD_MISC_KERNEL && 862 if (cpumode == PERF_RECORD_MISC_KERNEL &&
774 session->host_machine.vmlinux_maps[MAP__FUNCTION] == NULL) 863 machine->vmlinux_maps[MAP__FUNCTION] == NULL)
775 machine__create_kernel_maps(&session->host_machine); 864 machine__create_kernel_maps(machine);
776 865
777 thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION, 866 thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION,
778 event->ip.pid, event->ip.ip, al); 867 event->ip.ip, al);
779 dump_printf(" ...... dso: %s\n", 868 dump_printf(" ...... dso: %s\n",
780 al->map ? al->map->dso->long_name : 869 al->map ? al->map->dso->long_name :
781 al->level == 'H' ? "[hypervisor]" : "<not found>"); 870 al->level == 'H' ? "[hypervisor]" : "<not found>");
@@ -783,13 +872,14 @@ int perf_event__preprocess_sample(const union perf_event *event,
783 al->cpu = sample->cpu; 872 al->cpu = sample->cpu;
784 873
785 if (al->map) { 874 if (al->map) {
875 struct dso *dso = al->map->dso;
876
786 if (symbol_conf.dso_list && 877 if (symbol_conf.dso_list &&
787 (!al->map || !al->map->dso || 878 (!dso || !(strlist__has_entry(symbol_conf.dso_list,
788 !(strlist__has_entry(symbol_conf.dso_list, 879 dso->short_name) ||
789 al->map->dso->short_name) || 880 (dso->short_name != dso->long_name &&
790 (al->map->dso->short_name != al->map->dso->long_name && 881 strlist__has_entry(symbol_conf.dso_list,
791 strlist__has_entry(symbol_conf.dso_list, 882 dso->long_name)))))
792 al->map->dso->long_name)))))
793 goto out_filtered; 883 goto out_filtered;
794 884
795 al->sym = map__find_symbol(al->map, al->addr, filter); 885 al->sym = map__find_symbol(al->map, al->addr, filter);
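
The --dsos filter in perf_event__preprocess_sample() is rewritten around a local dso pointer, dropping the redundant !al->map re-check inside an if (al->map) block. Note that short_name != long_name compares the pointers: when both names share one string, the second strlist lookup is skipped. The predicate, distilled into a hedged sketch (strlist__has_entry and the dso fields are as used above):

    #include <stdbool.h>

    static bool dso_on_list(struct strlist *list, const struct dso *dso)
    {
        return strlist__has_entry(list, dso->short_name) ||
               (dso->short_name != dso->long_name &&
                strlist__has_entry(list, dso->long_name));
    }
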
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 357a85b85248..cbdeaad9c5e5 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -2,6 +2,7 @@
2#define __PERF_RECORD_H 2#define __PERF_RECORD_H
3 3
4#include <limits.h> 4#include <limits.h>
5#include <stdio.h>
5 6
6#include "../perf.h" 7#include "../perf.h"
7#include "map.h" 8#include "map.h"
@@ -141,43 +142,54 @@ union perf_event {
141 142
142void perf_event__print_totals(void); 143void perf_event__print_totals(void);
143 144
144struct perf_session; 145struct perf_tool;
145struct thread_map; 146struct thread_map;
146 147
147typedef int (*perf_event__handler_synth_t)(union perf_event *event, 148typedef int (*perf_event__handler_t)(struct perf_tool *tool,
148 struct perf_session *session); 149 union perf_event *event,
149typedef int (*perf_event__handler_t)(union perf_event *event,
150 struct perf_sample *sample, 150 struct perf_sample *sample,
151 struct perf_session *session); 151 struct machine *machine);
152 152
153int perf_event__synthesize_thread_map(struct thread_map *threads, 153int perf_event__synthesize_thread_map(struct perf_tool *tool,
154 struct thread_map *threads,
154 perf_event__handler_t process, 155 perf_event__handler_t process,
155 struct perf_session *session); 156 struct machine *machine);
156int perf_event__synthesize_threads(perf_event__handler_t process, 157int perf_event__synthesize_threads(struct perf_tool *tool,
157 struct perf_session *session); 158 perf_event__handler_t process,
158int perf_event__synthesize_kernel_mmap(perf_event__handler_t process, 159 struct machine *machine);
159 struct perf_session *session, 160int perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
161 perf_event__handler_t process,
160 struct machine *machine, 162 struct machine *machine,
161 const char *symbol_name); 163 const char *symbol_name);
162 164
163int perf_event__synthesize_modules(perf_event__handler_t process, 165int perf_event__synthesize_modules(struct perf_tool *tool,
164 struct perf_session *session, 166 perf_event__handler_t process,
165 struct machine *machine); 167 struct machine *machine);
166 168
167int perf_event__process_comm(union perf_event *event, struct perf_sample *sample, 169int perf_event__process_comm(struct perf_tool *tool,
168 struct perf_session *session); 170 union perf_event *event,
169int perf_event__process_lost(union perf_event *event, struct perf_sample *sample, 171 struct perf_sample *sample,
170 struct perf_session *session); 172 struct machine *machine);
171int perf_event__process_mmap(union perf_event *event, struct perf_sample *sample, 173int perf_event__process_lost(struct perf_tool *tool,
172 struct perf_session *session); 174 union perf_event *event,
173int perf_event__process_task(union perf_event *event, struct perf_sample *sample, 175 struct perf_sample *sample,
174 struct perf_session *session); 176 struct machine *machine);
175int perf_event__process(union perf_event *event, struct perf_sample *sample, 177int perf_event__process_mmap(struct perf_tool *tool,
176 struct perf_session *session); 178 union perf_event *event,
179 struct perf_sample *sample,
180 struct machine *machine);
181int perf_event__process_task(struct perf_tool *tool,
182 union perf_event *event,
183 struct perf_sample *sample,
184 struct machine *machine);
185int perf_event__process(struct perf_tool *tool,
186 union perf_event *event,
187 struct perf_sample *sample,
188 struct machine *machine);
177 189
178struct addr_location; 190struct addr_location;
179int perf_event__preprocess_sample(const union perf_event *self, 191int perf_event__preprocess_sample(const union perf_event *self,
180 struct perf_session *session, 192 struct machine *machine,
181 struct addr_location *al, 193 struct addr_location *al,
182 struct perf_sample *sample, 194 struct perf_sample *sample,
183 symbol_filter_t filter); 195 symbol_filter_t filter);
@@ -187,5 +199,13 @@ const char *perf_event__name(unsigned int id);
187int perf_event__parse_sample(const union perf_event *event, u64 type, 199int perf_event__parse_sample(const union perf_event *event, u64 type,
188 int sample_size, bool sample_id_all, 200 int sample_size, bool sample_id_all,
189 struct perf_sample *sample, bool swapped); 201 struct perf_sample *sample, bool swapped);
202int perf_event__synthesize_sample(union perf_event *event, u64 type,
203 const struct perf_sample *sample,
204 bool swapped);
205
206size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp);
207size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp);
208size_t perf_event__fprintf_task(union perf_event *event, FILE *fp);
209size_t perf_event__fprintf(union perf_event *event, FILE *fp);
190 210
191#endif /* __PERF_RECORD_H */ 211#endif /* __PERF_RECORD_H */
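
event.h now declares a single perf_event__handler_t (the separate _synth_t variant is gone) that threads the perf_tool through every callback, so per-run state lives in the tool rather than in globals or the session. A hedged sketch of a conforming handler:

    static int my_handler(struct perf_tool *tool,
                          union perf_event *event,
                          struct perf_sample *sample,
                          struct machine *machine)
    {
        (void)tool; (void)event; (void)sample; (void)machine;
        return 0;               /* in-tree handlers return 0 on success */
    }

    perf_event__handler_t handler = my_handler;
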
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index fbb4b4ab9cc6..fa1837088ca8 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -6,12 +6,16 @@
6 * 6 *
7 * Released under the GPL v2. (and only v2, not any later version) 7 * Released under the GPL v2. (and only v2, not any later version)
8 */ 8 */
9#include "util.h"
10#include "debugfs.h"
9#include <poll.h> 11#include <poll.h>
10#include "cpumap.h" 12#include "cpumap.h"
11#include "thread_map.h" 13#include "thread_map.h"
12#include "evlist.h" 14#include "evlist.h"
13#include "evsel.h" 15#include "evsel.h"
14#include "util.h" 16#include <unistd.h>
17
18#include "parse-events.h"
15 19
16#include <sys/mman.h> 20#include <sys/mman.h>
17 21
@@ -30,6 +34,7 @@ void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus,
30 INIT_HLIST_HEAD(&evlist->heads[i]); 34 INIT_HLIST_HEAD(&evlist->heads[i]);
31 INIT_LIST_HEAD(&evlist->entries); 35 INIT_LIST_HEAD(&evlist->entries);
32 perf_evlist__set_maps(evlist, cpus, threads); 36 perf_evlist__set_maps(evlist, cpus, threads);
37 evlist->workload.pid = -1;
33} 38}
34 39
35struct perf_evlist *perf_evlist__new(struct cpu_map *cpus, 40struct perf_evlist *perf_evlist__new(struct cpu_map *cpus,
@@ -43,6 +48,22 @@ struct perf_evlist *perf_evlist__new(struct cpu_map *cpus,
43 return evlist; 48 return evlist;
44} 49}
45 50
51void perf_evlist__config_attrs(struct perf_evlist *evlist,
52 struct perf_record_opts *opts)
53{
54 struct perf_evsel *evsel;
55
56 if (evlist->cpus->map[0] < 0)
57 opts->no_inherit = true;
58
59 list_for_each_entry(evsel, &evlist->entries, node) {
60 perf_evsel__config(evsel, opts);
61
62 if (evlist->nr_entries > 1)
63 evsel->attr.sample_type |= PERF_SAMPLE_ID;
64 }
65}
66
46static void perf_evlist__purge(struct perf_evlist *evlist) 67static void perf_evlist__purge(struct perf_evlist *evlist)
47{ 68{
48 struct perf_evsel *pos, *n; 69 struct perf_evsel *pos, *n;
@@ -76,6 +97,14 @@ void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry)
76 ++evlist->nr_entries; 97 ++evlist->nr_entries;
77} 98}
78 99
100static void perf_evlist__splice_list_tail(struct perf_evlist *evlist,
101 struct list_head *list,
102 int nr_entries)
103{
104 list_splice_tail(list, &evlist->entries);
105 evlist->nr_entries += nr_entries;
106}
107
79int perf_evlist__add_default(struct perf_evlist *evlist) 108int perf_evlist__add_default(struct perf_evlist *evlist)
80{ 109{
81 struct perf_event_attr attr = { 110 struct perf_event_attr attr = {
@@ -100,6 +129,126 @@ error:
100 return -ENOMEM; 129 return -ENOMEM;
101} 130}
102 131
132int perf_evlist__add_attrs(struct perf_evlist *evlist,
133 struct perf_event_attr *attrs, size_t nr_attrs)
134{
135 struct perf_evsel *evsel, *n;
136 LIST_HEAD(head);
137 size_t i;
138
139 for (i = 0; i < nr_attrs; i++) {
140 evsel = perf_evsel__new(attrs + i, evlist->nr_entries + i);
141 if (evsel == NULL)
142 goto out_delete_partial_list;
143 list_add_tail(&evsel->node, &head);
144 }
145
146 perf_evlist__splice_list_tail(evlist, &head, nr_attrs);
147
148 return 0;
149
150out_delete_partial_list:
151 list_for_each_entry_safe(evsel, n, &head, node)
152 perf_evsel__delete(evsel);
153 return -1;
154}
155
156static int trace_event__id(const char *evname)
157{
158 char *filename, *colon;
159 int err = -1, fd;
160
161 if (asprintf(&filename, "%s/%s/id", tracing_events_path, evname) < 0)
162 return -1;
163
164 colon = strrchr(filename, ':');
165 if (colon != NULL)
166 *colon = '/';
167
168 fd = open(filename, O_RDONLY);
169 if (fd >= 0) {
170 char id[16];
171 if (read(fd, id, sizeof(id)) > 0)
172 err = atoi(id);
173 close(fd);
174 }
175
176 free(filename);
177 return err;
178}
179
180int perf_evlist__add_tracepoints(struct perf_evlist *evlist,
181 const char *tracepoints[],
182 size_t nr_tracepoints)
183{
184 int err;
185 size_t i;
186 struct perf_event_attr *attrs = zalloc(nr_tracepoints * sizeof(*attrs));
187
188 if (attrs == NULL)
189 return -1;
190
191 for (i = 0; i < nr_tracepoints; i++) {
192 err = trace_event__id(tracepoints[i]);
193
194 if (err < 0)
195 goto out_free_attrs;
196
197 attrs[i].type = PERF_TYPE_TRACEPOINT;
198 attrs[i].config = err;
199 attrs[i].sample_type = (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME |
200 PERF_SAMPLE_CPU);
201 attrs[i].sample_period = 1;
202 }
203
204 err = perf_evlist__add_attrs(evlist, attrs, nr_tracepoints);
205out_free_attrs:
206 free(attrs);
207 return err;
208}
209
210static struct perf_evsel *
211 perf_evlist__find_tracepoint_by_id(struct perf_evlist *evlist, int id)
212{
213 struct perf_evsel *evsel;
214
215 list_for_each_entry(evsel, &evlist->entries, node) {
216 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
217 (int)evsel->attr.config == id)
218 return evsel;
219 }
220
221 return NULL;
222}
223
224int perf_evlist__set_tracepoints_handlers(struct perf_evlist *evlist,
225 const struct perf_evsel_str_handler *assocs,
226 size_t nr_assocs)
227{
228 struct perf_evsel *evsel;
229 int err;
230 size_t i;
231
232 for (i = 0; i < nr_assocs; i++) {
233 err = trace_event__id(assocs[i].name);
234 if (err < 0)
235 goto out;
236
237 evsel = perf_evlist__find_tracepoint_by_id(evlist, err);
238 if (evsel == NULL)
239 continue;
240
241 err = -EEXIST;
242 if (evsel->handler.func != NULL)
243 goto out;
244 evsel->handler.func = assocs[i].handler;
245 }
246
247 err = 0;
248out:
249 return err;
250}
251
103void perf_evlist__disable(struct perf_evlist *evlist) 252void perf_evlist__disable(struct perf_evlist *evlist)
104{ 253{
105 int cpu, thread; 254 int cpu, thread;
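
perf_evlist__add_tracepoints() resolves each "subsys:name" string to its
tracing id by reading <tracing_events_path>/subsys/name/id, then builds a
PERF_TYPE_TRACEPOINT attr with RAW|TIME|CPU sampling and a period of 1;
perf_evlist__set_tracepoints_handlers() repeats the id lookup to bind a
callback to each matching evsel, refusing (-EEXIST) to rebind one. A hedged
usage sketch; the event names and the process_* callbacks are hypothetical:

        static const char *tracepoints[] = {
                "sched:sched_switch",
                "sched:sched_wakeup",
        };

        static const struct perf_evsel_str_handler handlers[] = {
                { "sched:sched_switch", process_sched_switch },
                { "sched:sched_wakeup", process_sched_wakeup },
        };

        if (perf_evlist__add_tracepoints(evlist, tracepoints,
                                         ARRAY_SIZE(tracepoints)) < 0)
                return -1;
        if (perf_evlist__set_tracepoints_handlers(evlist, handlers,
                                                  ARRAY_SIZE(handlers)) < 0)
                return -1;      /* e.g. a handler was already installed */
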
@@ -126,7 +275,7 @@ void perf_evlist__enable(struct perf_evlist *evlist)
126 } 275 }
127} 276}
128 277
129int perf_evlist__alloc_pollfd(struct perf_evlist *evlist) 278static int perf_evlist__alloc_pollfd(struct perf_evlist *evlist)
130{ 279{
131 int nfds = evlist->cpus->nr * evlist->threads->nr * evlist->nr_entries; 280 int nfds = evlist->cpus->nr * evlist->threads->nr * evlist->nr_entries;
132 evlist->pollfd = malloc(sizeof(struct pollfd) * nfds); 281 evlist->pollfd = malloc(sizeof(struct pollfd) * nfds);
@@ -282,7 +431,7 @@ void perf_evlist__munmap(struct perf_evlist *evlist)
282 evlist->mmap = NULL; 431 evlist->mmap = NULL;
283} 432}
284 433
285int perf_evlist__alloc_mmap(struct perf_evlist *evlist) 434static int perf_evlist__alloc_mmap(struct perf_evlist *evlist)
286{ 435{
287 evlist->nr_mmaps = evlist->cpus->nr; 436 evlist->nr_mmaps = evlist->cpus->nr;
288 if (evlist->cpus->map[0] == -1) 437 if (evlist->cpus->map[0] == -1)
@@ -298,8 +447,10 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist,
298 evlist->mmap[idx].mask = mask; 447 evlist->mmap[idx].mask = mask;
299 evlist->mmap[idx].base = mmap(NULL, evlist->mmap_len, prot, 448 evlist->mmap[idx].base = mmap(NULL, evlist->mmap_len, prot,
300 MAP_SHARED, fd, 0); 449 MAP_SHARED, fd, 0);
301 if (evlist->mmap[idx].base == MAP_FAILED) 450 if (evlist->mmap[idx].base == MAP_FAILED) {
451 evlist->mmap[idx].base = NULL;
302 return -1; 452 return -1;
453 }
303 454
304 perf_evlist__add_pollfd(evlist, fd); 455 perf_evlist__add_pollfd(evlist, fd);
305 return 0; 456 return 0;
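
The hunk above matters because mmap(2) reports failure with MAP_FAILED,
i.e. (void *)-1, never NULL; without the reset, a later cleanup pass that
tests "if (base != NULL) munmap(base, len)" would try to unmap -1. The
guard pattern in isolation:

        void *base = mmap(NULL, len, prot, MAP_SHARED, fd, 0);
        if (base == MAP_FAILED)         /* (void *)-1, not NULL */
                base = NULL;            /* keep NULL-based cleanup safe */
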
@@ -400,14 +551,22 @@ out_unmap:
400 * 551 *
401 * Using perf_evlist__read_on_cpu does this automatically. 552 * Using perf_evlist__read_on_cpu does this automatically.
402 */ 553 */
403int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite) 554int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
555 bool overwrite)
404{ 556{
405 unsigned int page_size = sysconf(_SC_PAGE_SIZE); 557 unsigned int page_size = sysconf(_SC_PAGE_SIZE);
406 int mask = pages * page_size - 1;
407 struct perf_evsel *evsel; 558 struct perf_evsel *evsel;
408 const struct cpu_map *cpus = evlist->cpus; 559 const struct cpu_map *cpus = evlist->cpus;
409 const struct thread_map *threads = evlist->threads; 560 const struct thread_map *threads = evlist->threads;
410 int prot = PROT_READ | (overwrite ? 0 : PROT_WRITE); 561 int prot = PROT_READ | (overwrite ? 0 : PROT_WRITE), mask;
562
563 /* 512 kiB: default amount of unprivileged mlocked memory */
564 if (pages == UINT_MAX)
565 pages = (512 * 1024) / page_size;
566 else if (!is_power_of_2(pages))
567 return -EINVAL;
568
569 mask = pages * page_size - 1;
411 570
412 if (evlist->mmap == NULL && perf_evlist__alloc_mmap(evlist) < 0) 571 if (evlist->mmap == NULL && perf_evlist__alloc_mmap(evlist) < 0)
413 return -ENOMEM; 572 return -ENOMEM;
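
perf_evlist__mmap() now insists that the ring buffer span a power-of-2
number of pages, because "mask = pages * page_size - 1" is used as a
wrap-around mask and only works for power-of-2 sizes; pages == UINT_MAX
selects the 512 kiB unprivileged-mlock default. Worked through for the
common 4 kiB page size:

        unsigned int page_size = 4096;                  /* sysconf(_SC_PAGE_SIZE) */
        unsigned int pages = (512 * 1024) / page_size;  /* = 128 pages */
        int mask = pages * page_size - 1;               /* = 0x7ffff */

        /* ring-buffer offsets then wrap with: offset & mask */
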
@@ -512,6 +671,38 @@ u64 perf_evlist__sample_type(const struct perf_evlist *evlist)
512 return first->attr.sample_type; 671 return first->attr.sample_type;
513} 672}
514 673
674u16 perf_evlist__id_hdr_size(const struct perf_evlist *evlist)
675{
676 struct perf_evsel *first;
677 struct perf_sample *data;
678 u64 sample_type;
679 u16 size = 0;
680
681 first = list_entry(evlist->entries.next, struct perf_evsel, node);
682
683 if (!first->attr.sample_id_all)
684 goto out;
685
686 sample_type = first->attr.sample_type;
687
688 if (sample_type & PERF_SAMPLE_TID)
689 size += sizeof(data->tid) * 2;
690
691 if (sample_type & PERF_SAMPLE_TIME)
692 size += sizeof(data->time);
693
694 if (sample_type & PERF_SAMPLE_ID)
695 size += sizeof(data->id);
696
697 if (sample_type & PERF_SAMPLE_STREAM_ID)
698 size += sizeof(data->stream_id);
699
700 if (sample_type & PERF_SAMPLE_CPU)
701 size += sizeof(data->cpu) * 2;
702out:
703 return size;
704}
705
515bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist) 706bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist)
516{ 707{
517 struct perf_evsel *pos, *first; 708 struct perf_evsel *pos, *first;
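
perf_evlist__id_hdr_size() computes how many bytes the sample_id_all
machinery appends to each non-sample record. Worked example for
sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_ID:

        size = sizeof(u32) * 2  /* tid + pid ->  8 bytes */
             + sizeof(u64)      /* time      ->  8 bytes */
             + sizeof(u64);     /* id        ->  8 bytes */
        /* id_hdr_size = 24 trailing bytes per non-sample event */
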
@@ -569,3 +760,97 @@ out_err:
569 760
570 return err; 761 return err;
571} 762}
763
764int perf_evlist__prepare_workload(struct perf_evlist *evlist,
765 struct perf_record_opts *opts,
766 const char *argv[])
767{
768 int child_ready_pipe[2], go_pipe[2];
769 char bf;
770
771 if (pipe(child_ready_pipe) < 0) {
772 perror("failed to create 'ready' pipe");
773 return -1;
774 }
775
776 if (pipe(go_pipe) < 0) {
777 perror("failed to create 'go' pipe");
778 goto out_close_ready_pipe;
779 }
780
781 evlist->workload.pid = fork();
782 if (evlist->workload.pid < 0) {
783 perror("failed to fork");
784 goto out_close_pipes;
785 }
786
787 if (!evlist->workload.pid) {
788 if (opts->pipe_output)
789 dup2(2, 1);
790
791 close(child_ready_pipe[0]);
792 close(go_pipe[1]);
793 fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
794
795 /*
796 * Do a dummy execvp to get the PLT entry resolved,
797 * so we avoid the resolver overhead on the real
798 * execvp call.
799 */
800 execvp("", (char **)argv);
801
802 /*
803 * Tell the parent we're ready to go
804 */
805 close(child_ready_pipe[1]);
806
807 /*
808 * Wait until the parent tells us to go.
809 */
810 if (read(go_pipe[0], &bf, 1) == -1)
811 perror("unable to read pipe");
812
813 execvp(argv[0], (char **)argv);
814
815 perror(argv[0]);
816 kill(getppid(), SIGUSR1);
817 exit(-1);
818 }
819
820 if (!opts->system_wide && opts->target_tid == -1 && opts->target_pid == -1)
821 evlist->threads->map[0] = evlist->workload.pid;
822
823 close(child_ready_pipe[1]);
824 close(go_pipe[0]);
825 /*
826 * wait for child to settle
827 */
828 if (read(child_ready_pipe[0], &bf, 1) == -1) {
829 perror("unable to read pipe");
830 goto out_close_pipes;
831 }
832
833 evlist->workload.cork_fd = go_pipe[1];
834 close(child_ready_pipe[0]);
835 return 0;
836
837out_close_pipes:
838 close(go_pipe[0]);
839 close(go_pipe[1]);
840out_close_ready_pipe:
841 close(child_ready_pipe[0]);
842 close(child_ready_pipe[1]);
843 return -1;
844}
845
846int perf_evlist__start_workload(struct perf_evlist *evlist)
847{
848 if (evlist->workload.cork_fd > 0) {
849 /*
850 * Remove the cork, let it rip!
851 */
852 return close(evlist->workload.cork_fd);
853 }
854
855 return 0;
856}
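
prepare_workload()/start_workload() implement a corked exec: the child is
forked before any counter exists, announces itself over child_ready_pipe,
then parks in read(2) on go_pipe until the parent drops the cork by closing
cork_fd, at which point the real execvp() runs with counters already armed
(the empty execvp("") beforehand only pre-resolves the execvp PLT entry).
A hedged sketch of the call order a record-style tool would use, with error
handling elided:

        if (perf_evlist__prepare_workload(evlist, &opts, argv) < 0)
                return -1;                      /* forks the corked child */

        perf_evlist__open(evlist, false);       /* attaches to the child pid */
        perf_evlist__mmap(evlist, UINT_MAX, false);
        perf_evlist__enable(evlist);

        perf_evlist__start_workload(evlist);    /* close cork_fd: child execs */
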
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 1779ffef7828..8922aeed0467 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -2,12 +2,16 @@
2#define __PERF_EVLIST_H 1 2#define __PERF_EVLIST_H 1
3 3
4#include <linux/list.h> 4#include <linux/list.h>
5#include <stdio.h>
5#include "../perf.h" 6#include "../perf.h"
6#include "event.h" 7#include "event.h"
8#include "util.h"
9#include <unistd.h>
7 10
8struct pollfd; 11struct pollfd;
9struct thread_map; 12struct thread_map;
10struct cpu_map; 13struct cpu_map;
14struct perf_record_opts;
11 15
12#define PERF_EVLIST__HLIST_BITS 8 16#define PERF_EVLIST__HLIST_BITS 8
13#define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS) 17#define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS)
@@ -19,6 +23,10 @@ struct perf_evlist {
19 int nr_fds; 23 int nr_fds;
20 int nr_mmaps; 24 int nr_mmaps;
21 int mmap_len; 25 int mmap_len;
26 struct {
27 int cork_fd;
28 pid_t pid;
29 } workload;
22 bool overwrite; 30 bool overwrite;
23 union perf_event event_copy; 31 union perf_event event_copy;
24 struct perf_mmap *mmap; 32 struct perf_mmap *mmap;
@@ -28,6 +36,11 @@ struct perf_evlist {
28 struct perf_evsel *selected; 36 struct perf_evsel *selected;
29}; 37};
30 38
39struct perf_evsel_str_handler {
40 const char *name;
41 void *handler;
42};
43
31struct perf_evsel; 44struct perf_evsel;
32 45
33struct perf_evlist *perf_evlist__new(struct cpu_map *cpus, 46struct perf_evlist *perf_evlist__new(struct cpu_map *cpus,
@@ -39,11 +52,26 @@ void perf_evlist__delete(struct perf_evlist *evlist);
39 52
40void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry); 53void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
41int perf_evlist__add_default(struct perf_evlist *evlist); 54int perf_evlist__add_default(struct perf_evlist *evlist);
55int perf_evlist__add_attrs(struct perf_evlist *evlist,
56 struct perf_event_attr *attrs, size_t nr_attrs);
57int perf_evlist__add_tracepoints(struct perf_evlist *evlist,
58 const char *tracepoints[], size_t nr_tracepoints);
59int perf_evlist__set_tracepoints_handlers(struct perf_evlist *evlist,
60 const struct perf_evsel_str_handler *assocs,
61 size_t nr_assocs);
62
63#define perf_evlist__add_attrs_array(evlist, array) \
64 perf_evlist__add_attrs(evlist, array, ARRAY_SIZE(array))
65
66#define perf_evlist__add_tracepoints_array(evlist, array) \
67 perf_evlist__add_tracepoints(evlist, array, ARRAY_SIZE(array))
68
69#define perf_evlist__set_tracepoints_handlers_array(evlist, array) \
70 perf_evlist__set_tracepoints_handlers(evlist, array, ARRAY_SIZE(array))
42 71
43void perf_evlist__id_add(struct perf_evlist *evlist, struct perf_evsel *evsel, 72void perf_evlist__id_add(struct perf_evlist *evlist, struct perf_evsel *evsel,
44 int cpu, int thread, u64 id); 73 int cpu, int thread, u64 id);
45 74
46int perf_evlist__alloc_pollfd(struct perf_evlist *evlist);
47void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd); 75void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd);
48 76
49struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id); 77struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id);
@@ -52,8 +80,16 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *self, int idx);
52 80
53int perf_evlist__open(struct perf_evlist *evlist, bool group); 81int perf_evlist__open(struct perf_evlist *evlist, bool group);
54 82
55int perf_evlist__alloc_mmap(struct perf_evlist *evlist); 83void perf_evlist__config_attrs(struct perf_evlist *evlist,
56int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite); 84 struct perf_record_opts *opts);
85
86int perf_evlist__prepare_workload(struct perf_evlist *evlist,
87 struct perf_record_opts *opts,
88 const char *argv[]);
89int perf_evlist__start_workload(struct perf_evlist *evlist);
90
91int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
92 bool overwrite);
57void perf_evlist__munmap(struct perf_evlist *evlist); 93void perf_evlist__munmap(struct perf_evlist *evlist);
58 94
59void perf_evlist__disable(struct perf_evlist *evlist); 95void perf_evlist__disable(struct perf_evlist *evlist);
@@ -77,6 +113,7 @@ int perf_evlist__set_filters(struct perf_evlist *evlist);
77 113
78u64 perf_evlist__sample_type(const struct perf_evlist *evlist); 114u64 perf_evlist__sample_type(const struct perf_evlist *evlist);
79bool perf_evlist__sample_id_all(const struct perf_evlist *evlist); 115
116u16 perf_evlist__id_hdr_size(const struct perf_evlist *evlist);
80 117
81bool perf_evlist__valid_sample_type(const struct perf_evlist *evlist); 118bool perf_evlist__valid_sample_type(const struct perf_evlist *evlist);
82bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist); 119bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist);
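
The *_array wrappers declared above simply pair a static array with
ARRAY_SIZE() so the element count can never drift from the array itself.
For instance, with an illustrative pair of attrs:

        static struct perf_event_attr default_attrs[] = {
                { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
                { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
        };

        /* expands to perf_evlist__add_attrs(evlist, default_attrs, 2) */
        if (perf_evlist__add_attrs_array(evlist, default_attrs) < 0)
                pr_debug("failed to add default attrs\n");
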
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index d7915d4e77cb..667f3b78bb2c 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -63,6 +63,79 @@ struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx)
63 return evsel; 63 return evsel;
64} 64}
65 65
66void perf_evsel__config(struct perf_evsel *evsel, struct perf_record_opts *opts)
67{
68 struct perf_event_attr *attr = &evsel->attr;
69 int track = !evsel->idx; /* only the first counter needs these */
70
71 attr->sample_id_all = opts->sample_id_all_avail ? 1 : 0;
72 attr->inherit = !opts->no_inherit;
73 attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
74 PERF_FORMAT_TOTAL_TIME_RUNNING |
75 PERF_FORMAT_ID;
76
77 attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
78
79 /*
80	 * We default some events to a sample period of 1. But keep
81	 * it a weak assumption, overridable by the user.
82 */
83 if (!attr->sample_period || (opts->user_freq != UINT_MAX &&
84 opts->user_interval != ULLONG_MAX)) {
85 if (opts->freq) {
86 attr->sample_type |= PERF_SAMPLE_PERIOD;
87 attr->freq = 1;
88 attr->sample_freq = opts->freq;
89 } else {
90 attr->sample_period = opts->default_interval;
91 }
92 }
93
94 if (opts->no_samples)
95 attr->sample_freq = 0;
96
97 if (opts->inherit_stat)
98 attr->inherit_stat = 1;
99
100 if (opts->sample_address) {
101 attr->sample_type |= PERF_SAMPLE_ADDR;
102 attr->mmap_data = track;
103 }
104
105 if (opts->call_graph)
106 attr->sample_type |= PERF_SAMPLE_CALLCHAIN;
107
108 if (opts->system_wide)
109 attr->sample_type |= PERF_SAMPLE_CPU;
110
111 if (opts->period)
112 attr->sample_type |= PERF_SAMPLE_PERIOD;
113
114 if (opts->sample_id_all_avail &&
115 (opts->sample_time || opts->system_wide ||
116 !opts->no_inherit || opts->cpu_list))
117 attr->sample_type |= PERF_SAMPLE_TIME;
118
119 if (opts->raw_samples) {
120 attr->sample_type |= PERF_SAMPLE_TIME;
121 attr->sample_type |= PERF_SAMPLE_RAW;
122 attr->sample_type |= PERF_SAMPLE_CPU;
123 }
124
125 if (opts->no_delay) {
126 attr->watermark = 0;
127 attr->wakeup_events = 1;
128 }
129
130 attr->mmap = track;
131 attr->comm = track;
132
133 if (opts->target_pid == -1 && opts->target_tid == -1 && !opts->system_wide) {
134 attr->disabled = 1;
135 attr->enable_on_exec = 1;
136 }
137}
138
66int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads) 139int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
67{ 140{
68 int cpu, thread; 141 int cpu, thread;
@@ -387,7 +460,7 @@ int perf_event__parse_sample(const union perf_event *event, u64 type,
387 u32 val32[2]; 460 u32 val32[2];
388 } u; 461 } u;
389 462
390 463 memset(data, 0, sizeof(*data));
391 data->cpu = data->pid = data->tid = -1; 464 data->cpu = data->pid = data->tid = -1;
392 data->stream_id = data->id = data->time = -1ULL; 465 data->stream_id = data->id = data->time = -1ULL;
393 466
@@ -504,3 +577,82 @@ int perf_event__parse_sample(const union perf_event *event, u64 type,
504 577
505 return 0; 578 return 0;
506} 579}
580
581int perf_event__synthesize_sample(union perf_event *event, u64 type,
582 const struct perf_sample *sample,
583 bool swapped)
584{
585 u64 *array;
586
587 /*
588 * used for cross-endian analysis. See git commit 65014ab3
589 * for why this goofiness is needed.
590 */
591 union {
592 u64 val64;
593 u32 val32[2];
594 } u;
595
596 array = event->sample.array;
597
598 if (type & PERF_SAMPLE_IP) {
599 event->ip.ip = sample->ip;
600 array++;
601 }
602
603 if (type & PERF_SAMPLE_TID) {
604 u.val32[0] = sample->pid;
605 u.val32[1] = sample->tid;
606 if (swapped) {
607 /*
608 * Inverse of what is done in perf_event__parse_sample
609 */
610 u.val32[0] = bswap_32(u.val32[0]);
611 u.val32[1] = bswap_32(u.val32[1]);
612 u.val64 = bswap_64(u.val64);
613 }
614
615 *array = u.val64;
616 array++;
617 }
618
619 if (type & PERF_SAMPLE_TIME) {
620 *array = sample->time;
621 array++;
622 }
623
624 if (type & PERF_SAMPLE_ADDR) {
625 *array = sample->addr;
626 array++;
627 }
628
629 if (type & PERF_SAMPLE_ID) {
630 *array = sample->id;
631 array++;
632 }
633
634 if (type & PERF_SAMPLE_STREAM_ID) {
635 *array = sample->stream_id;
636 array++;
637 }
638
639 if (type & PERF_SAMPLE_CPU) {
640 u.val32[0] = sample->cpu;
641 if (swapped) {
642 /*
643 * Inverse of what is done in perf_event__parse_sample
644 */
645 u.val32[0] = bswap_32(u.val32[0]);
646 u.val64 = bswap_64(u.val64);
647 }
648 *array = u.val64;
649 array++;
650 }
651
652 if (type & PERF_SAMPLE_PERIOD) {
653 *array = sample->period;
654 array++;
655 }
656
657 return 0;
658}
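
perf_event__synthesize_sample() is the exact inverse of
perf_event__parse_sample(): for cross-endian data each u32 half is swapped
first and the containing u64 swapped afterwards, so that a byte-swapping
reader undoing those steps in reverse recovers the original values. The
TID/PID slot in isolation (bswap_32/bswap_64 come from <byteswap.h>):

        union { u64 val64; u32 val32[2]; } u;

        u.val32[0] = sample->pid;
        u.val32[1] = sample->tid;
        if (swapped) {
                /* pre-apply the inverse of what the parser will do */
                u.val32[0] = bswap_32(u.val32[0]);
                u.val32[1] = bswap_32(u.val32[1]);
                u.val64    = bswap_64(u.val64);
        }
        *array++ = u.val64;
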
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index b1d15e6f7ae3..326b8e4d5035 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -61,12 +61,17 @@ struct perf_evsel {
61 off_t id_offset; 61 off_t id_offset;
62 }; 62 };
63 struct cgroup_sel *cgrp; 63 struct cgroup_sel *cgrp;
64 struct {
65 void *func;
66 void *data;
67 } handler;
64 bool supported; 68 bool supported;
65}; 69};
66 70
67struct cpu_map; 71struct cpu_map;
68struct thread_map; 72struct thread_map;
69struct perf_evlist; 73struct perf_evlist;
74struct perf_record_opts;
70 75
71struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx); 76struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx);
72void perf_evsel__init(struct perf_evsel *evsel, 77void perf_evsel__init(struct perf_evsel *evsel,
@@ -74,6 +79,9 @@ void perf_evsel__init(struct perf_evsel *evsel,
74void perf_evsel__exit(struct perf_evsel *evsel); 79void perf_evsel__exit(struct perf_evsel *evsel);
75void perf_evsel__delete(struct perf_evsel *evsel); 80void perf_evsel__delete(struct perf_evsel *evsel);
76 81
82void perf_evsel__config(struct perf_evsel *evsel,
83 struct perf_record_opts *opts);
84
77int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads); 85int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
78int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads); 86int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads);
79int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus); 87int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus);
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 33c17a2b2a81..3e7e0b09c12c 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -8,6 +8,7 @@
8#include <stdlib.h> 8#include <stdlib.h>
9#include <linux/list.h> 9#include <linux/list.h>
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/bitops.h>
11#include <sys/utsname.h> 12#include <sys/utsname.h>
12 13
13#include "evlist.h" 14#include "evlist.h"
@@ -28,9 +29,6 @@ static struct perf_trace_event_type *events;
28static u32 header_argc; 29static u32 header_argc;
29static const char **header_argv; 30static const char **header_argv;
30 31
31static int dsos__write_buildid_table(struct perf_header *header, int fd);
32static int perf_session__cache_build_ids(struct perf_session *session);
33
34int perf_header__push_event(u64 id, const char *name) 32int perf_header__push_event(u64 id, const char *name)
35{ 33{
36 if (strlen(name) > MAX_EVENT_NAME) 34 if (strlen(name) > MAX_EVENT_NAME)
@@ -187,6 +185,252 @@ perf_header__set_cmdline(int argc, const char **argv)
187 return 0; 185 return 0;
188} 186}
189 187
188#define dsos__for_each_with_build_id(pos, head) \
189 list_for_each_entry(pos, head, node) \
190 if (!pos->has_build_id) \
191 continue; \
192 else
193
194static int __dsos__write_buildid_table(struct list_head *head, pid_t pid,
195 u16 misc, int fd)
196{
197 struct dso *pos;
198
199 dsos__for_each_with_build_id(pos, head) {
200 int err;
201 struct build_id_event b;
202 size_t len;
203
204 if (!pos->hit)
205 continue;
206 len = pos->long_name_len + 1;
207 len = ALIGN(len, NAME_ALIGN);
208 memset(&b, 0, sizeof(b));
209 memcpy(&b.build_id, pos->build_id, sizeof(pos->build_id));
210 b.pid = pid;
211 b.header.misc = misc;
212 b.header.size = sizeof(b) + len;
213 err = do_write(fd, &b, sizeof(b));
214 if (err < 0)
215 return err;
216 err = write_padded(fd, pos->long_name,
217 pos->long_name_len + 1, len);
218 if (err < 0)
219 return err;
220 }
221
222 return 0;
223}
224
225static int machine__write_buildid_table(struct machine *machine, int fd)
226{
227 int err;
228 u16 kmisc = PERF_RECORD_MISC_KERNEL,
229 umisc = PERF_RECORD_MISC_USER;
230
231 if (!machine__is_host(machine)) {
232 kmisc = PERF_RECORD_MISC_GUEST_KERNEL;
233 umisc = PERF_RECORD_MISC_GUEST_USER;
234 }
235
236 err = __dsos__write_buildid_table(&machine->kernel_dsos, machine->pid,
237 kmisc, fd);
238 if (err == 0)
239 err = __dsos__write_buildid_table(&machine->user_dsos,
240 machine->pid, umisc, fd);
241 return err;
242}
243
244static int dsos__write_buildid_table(struct perf_header *header, int fd)
245{
246 struct perf_session *session = container_of(header,
247 struct perf_session, header);
248 struct rb_node *nd;
249 int err = machine__write_buildid_table(&session->host_machine, fd);
250
251 if (err)
252 return err;
253
254 for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
255 struct machine *pos = rb_entry(nd, struct machine, rb_node);
256 err = machine__write_buildid_table(pos, fd);
257 if (err)
258 break;
259 }
260 return err;
261}
262
263int build_id_cache__add_s(const char *sbuild_id, const char *debugdir,
264 const char *name, bool is_kallsyms)
265{
266 const size_t size = PATH_MAX;
267 char *realname, *filename = zalloc(size),
268 *linkname = zalloc(size), *targetname;
269 int len, err = -1;
270
271 if (is_kallsyms) {
272 if (symbol_conf.kptr_restrict) {
273 pr_debug("Not caching a kptr_restrict'ed /proc/kallsyms\n");
274 return 0;
275 }
276 realname = (char *)name;
277 } else
278 realname = realpath(name, NULL);
279
280 if (realname == NULL || filename == NULL || linkname == NULL)
281 goto out_free;
282
283 len = snprintf(filename, size, "%s%s%s",
284 debugdir, is_kallsyms ? "/" : "", realname);
285 if (mkdir_p(filename, 0755))
286 goto out_free;
287
288	snprintf(filename + len, size - len, "/%s", sbuild_id);
289
290 if (access(filename, F_OK)) {
291 if (is_kallsyms) {
292 if (copyfile("/proc/kallsyms", filename))
293 goto out_free;
294 } else if (link(realname, filename) && copyfile(name, filename))
295 goto out_free;
296 }
297
298 len = snprintf(linkname, size, "%s/.build-id/%.2s",
299 debugdir, sbuild_id);
300
301 if (access(linkname, X_OK) && mkdir_p(linkname, 0755))
302 goto out_free;
303
304 snprintf(linkname + len, size - len, "/%s", sbuild_id + 2);
305 targetname = filename + strlen(debugdir) - 5;
306 memcpy(targetname, "../..", 5);
307
308 if (symlink(targetname, linkname) == 0)
309 err = 0;
310out_free:
311 if (!is_kallsyms)
312 free(realname);
313 free(filename);
314 free(linkname);
315 return err;
316}
317
318static int build_id_cache__add_b(const u8 *build_id, size_t build_id_size,
319 const char *name, const char *debugdir,
320 bool is_kallsyms)
321{
322 char sbuild_id[BUILD_ID_SIZE * 2 + 1];
323
324 build_id__sprintf(build_id, build_id_size, sbuild_id);
325
326 return build_id_cache__add_s(sbuild_id, debugdir, name, is_kallsyms);
327}
328
329int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir)
330{
331 const size_t size = PATH_MAX;
332 char *filename = zalloc(size),
333 *linkname = zalloc(size);
334 int err = -1;
335
336 if (filename == NULL || linkname == NULL)
337 goto out_free;
338
339 snprintf(linkname, size, "%s/.build-id/%.2s/%s",
340 debugdir, sbuild_id, sbuild_id + 2);
341
342 if (access(linkname, F_OK))
343 goto out_free;
344
345 if (readlink(linkname, filename, size - 1) < 0)
346 goto out_free;
347
348 if (unlink(linkname))
349 goto out_free;
350
351 /*
352 * Since the link is relative, we must make it absolute:
353 */
354 snprintf(linkname, size, "%s/.build-id/%.2s/%s",
355 debugdir, sbuild_id, filename);
356
357 if (unlink(linkname))
358 goto out_free;
359
360 err = 0;
361out_free:
362 free(filename);
363 free(linkname);
364 return err;
365}
366
367static int dso__cache_build_id(struct dso *dso, const char *debugdir)
368{
369 bool is_kallsyms = dso->kernel && dso->long_name[0] != '/';
370
371 return build_id_cache__add_b(dso->build_id, sizeof(dso->build_id),
372 dso->long_name, debugdir, is_kallsyms);
373}
374
375static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir)
376{
377 struct dso *pos;
378 int err = 0;
379
380 dsos__for_each_with_build_id(pos, head)
381 if (dso__cache_build_id(pos, debugdir))
382 err = -1;
383
384 return err;
385}
386
387static int machine__cache_build_ids(struct machine *machine, const char *debugdir)
388{
389 int ret = __dsos__cache_build_ids(&machine->kernel_dsos, debugdir);
390 ret |= __dsos__cache_build_ids(&machine->user_dsos, debugdir);
391 return ret;
392}
393
394static int perf_session__cache_build_ids(struct perf_session *session)
395{
396 struct rb_node *nd;
397 int ret;
398 char debugdir[PATH_MAX];
399
400 snprintf(debugdir, sizeof(debugdir), "%s", buildid_dir);
401
402 if (mkdir(debugdir, 0755) != 0 && errno != EEXIST)
403 return -1;
404
405 ret = machine__cache_build_ids(&session->host_machine, debugdir);
406
407 for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
408 struct machine *pos = rb_entry(nd, struct machine, rb_node);
409 ret |= machine__cache_build_ids(pos, debugdir);
410 }
411 return ret ? -1 : 0;
412}
413
414static bool machine__read_build_ids(struct machine *machine, bool with_hits)
415{
416 bool ret = __dsos__read_build_ids(&machine->kernel_dsos, with_hits);
417 ret |= __dsos__read_build_ids(&machine->user_dsos, with_hits);
418 return ret;
419}
420
421static bool perf_session__read_build_ids(struct perf_session *session, bool with_hits)
422{
423 struct rb_node *nd;
424 bool ret = machine__read_build_ids(&session->host_machine, with_hits);
425
426 for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
427 struct machine *pos = rb_entry(nd, struct machine, rb_node);
428 ret |= machine__read_build_ids(pos, with_hits);
429 }
430
431 return ret;
432}
433
190static int write_trace_info(int fd, struct perf_header *h __used, 434static int write_trace_info(int fd, struct perf_header *h __used,
191 struct perf_evlist *evlist) 435 struct perf_evlist *evlist)
192{ 436{
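
The build-id cache these routines maintain is content addressed: the DSO is
copied (or hard linked) under its own path inside debugdir, named by the
40-hex-digit build-id, and a relative symlink fans out from .build-id/ using
the first two hex digits as a bucket. For a hypothetical libc with build-id
185504f...4a3c37, the layout ends up as:

        <debugdir>/usr/lib64/libc-2.14.so/185504f...4a3c37
                (copy, or hard link, of the DSO)
        <debugdir>/.build-id/18/5504f...4a3c37
                -> ../../usr/lib64/libc-2.14.so/185504f...4a3c37
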
@@ -202,6 +446,9 @@ static int write_build_id(int fd, struct perf_header *h,
202 446
203 session = container_of(h, struct perf_session, header); 447 session = container_of(h, struct perf_session, header);
204 448
449 if (!perf_session__read_build_ids(session, true))
450 return -1;
451
205 err = dsos__write_buildid_table(h, fd); 452 err = dsos__write_buildid_table(h, fd);
206 if (err < 0) { 453 if (err < 0) {
207 pr_debug("failed to write buildid table\n"); 454 pr_debug("failed to write buildid table\n");
@@ -1065,26 +1312,30 @@ struct feature_ops {
1065 bool full_only; 1312 bool full_only;
1066}; 1313};
1067 1314
1068#define FEAT_OPA(n, w, p) \ 1315#define FEAT_OPA(n, func) \
1069 [n] = { .name = #n, .write = w, .print = p } 1316 [n] = { .name = #n, .write = write_##func, .print = print_##func }
1070#define FEAT_OPF(n, w, p) \ 1317#define FEAT_OPF(n, func) \
1071 [n] = { .name = #n, .write = w, .print = p, .full_only = true } 1318 [n] = { .name = #n, .write = write_##func, .print = print_##func, .full_only = true }
1319
1320/* feature_ops not implemented: */
1321#define print_trace_info NULL
1322#define print_build_id NULL
1072 1323
1073static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = { 1324static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = {
1074 FEAT_OPA(HEADER_TRACE_INFO, write_trace_info, NULL), 1325 FEAT_OPA(HEADER_TRACE_INFO, trace_info),
1075 FEAT_OPA(HEADER_BUILD_ID, write_build_id, NULL), 1326 FEAT_OPA(HEADER_BUILD_ID, build_id),
1076 FEAT_OPA(HEADER_HOSTNAME, write_hostname, print_hostname), 1327 FEAT_OPA(HEADER_HOSTNAME, hostname),
1077 FEAT_OPA(HEADER_OSRELEASE, write_osrelease, print_osrelease), 1328 FEAT_OPA(HEADER_OSRELEASE, osrelease),
1078 FEAT_OPA(HEADER_VERSION, write_version, print_version), 1329 FEAT_OPA(HEADER_VERSION, version),
1079 FEAT_OPA(HEADER_ARCH, write_arch, print_arch), 1330 FEAT_OPA(HEADER_ARCH, arch),
1080 FEAT_OPA(HEADER_NRCPUS, write_nrcpus, print_nrcpus), 1331 FEAT_OPA(HEADER_NRCPUS, nrcpus),
1081 FEAT_OPA(HEADER_CPUDESC, write_cpudesc, print_cpudesc), 1332 FEAT_OPA(HEADER_CPUDESC, cpudesc),
1082 FEAT_OPA(HEADER_CPUID, write_cpuid, print_cpuid), 1333 FEAT_OPA(HEADER_CPUID, cpuid),
1083 FEAT_OPA(HEADER_TOTAL_MEM, write_total_mem, print_total_mem), 1334 FEAT_OPA(HEADER_TOTAL_MEM, total_mem),
1084 FEAT_OPA(HEADER_EVENT_DESC, write_event_desc, print_event_desc), 1335 FEAT_OPA(HEADER_EVENT_DESC, event_desc),
1085 FEAT_OPA(HEADER_CMDLINE, write_cmdline, print_cmdline), 1336 FEAT_OPA(HEADER_CMDLINE, cmdline),
1086 FEAT_OPF(HEADER_CPU_TOPOLOGY, write_cpu_topology, print_cpu_topology), 1337 FEAT_OPF(HEADER_CPU_TOPOLOGY, cpu_topology),
1087 FEAT_OPF(HEADER_NUMA_TOPOLOGY, write_numa_topology, print_numa_topology), 1338 FEAT_OPF(HEADER_NUMA_TOPOLOGY, numa_topology),
1088}; 1339};
1089 1340
1090struct header_print_data { 1341struct header_print_data {
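
FEAT_OPA()/FEAT_OPF() now derive both function pointers from the feature
token, keeping the table to one entry per feature; a feature without a
printer just gets a "#define print_foo NULL" stub, as done above for
trace_info and build_id. One entry expanded, for illustration:

        /* FEAT_OPA(HEADER_BUILD_ID, build_id) expands to: */
        [HEADER_BUILD_ID] = {
                .name  = "HEADER_BUILD_ID",
                .write = write_build_id,
                .print = print_build_id,        /* NULL via the stub above */
        },
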
@@ -1103,9 +1354,9 @@ static int perf_file_section__fprintf_info(struct perf_file_section *section,
1103 "%d, continuing...\n", section->offset, feat); 1354 "%d, continuing...\n", section->offset, feat);
1104 return 0; 1355 return 0;
1105 } 1356 }
1106 if (feat < HEADER_TRACE_INFO || feat >= HEADER_LAST_FEATURE) { 1357 if (feat >= HEADER_LAST_FEATURE) {
1107 pr_warning("unknown feature %d\n", feat); 1358 pr_warning("unknown feature %d\n", feat);
1108 return -1; 1359 return 0;
1109 } 1360 }
1110 if (!feat_ops[feat].print) 1361 if (!feat_ops[feat].print)
1111 return 0; 1362 return 0;
@@ -1132,252 +1383,6 @@ int perf_header__fprintf_info(struct perf_session *session, FILE *fp, bool full)
1132 return 0; 1383 return 0;
1133} 1384}
1134 1385
1135#define dsos__for_each_with_build_id(pos, head) \
1136 list_for_each_entry(pos, head, node) \
1137 if (!pos->has_build_id) \
1138 continue; \
1139 else
1140
1141static int __dsos__write_buildid_table(struct list_head *head, pid_t pid,
1142 u16 misc, int fd)
1143{
1144 struct dso *pos;
1145
1146 dsos__for_each_with_build_id(pos, head) {
1147 int err;
1148 struct build_id_event b;
1149 size_t len;
1150
1151 if (!pos->hit)
1152 continue;
1153 len = pos->long_name_len + 1;
1154 len = ALIGN(len, NAME_ALIGN);
1155 memset(&b, 0, sizeof(b));
1156 memcpy(&b.build_id, pos->build_id, sizeof(pos->build_id));
1157 b.pid = pid;
1158 b.header.misc = misc;
1159 b.header.size = sizeof(b) + len;
1160 err = do_write(fd, &b, sizeof(b));
1161 if (err < 0)
1162 return err;
1163 err = write_padded(fd, pos->long_name,
1164 pos->long_name_len + 1, len);
1165 if (err < 0)
1166 return err;
1167 }
1168
1169 return 0;
1170}
1171
1172static int machine__write_buildid_table(struct machine *machine, int fd)
1173{
1174 int err;
1175 u16 kmisc = PERF_RECORD_MISC_KERNEL,
1176 umisc = PERF_RECORD_MISC_USER;
1177
1178 if (!machine__is_host(machine)) {
1179 kmisc = PERF_RECORD_MISC_GUEST_KERNEL;
1180 umisc = PERF_RECORD_MISC_GUEST_USER;
1181 }
1182
1183 err = __dsos__write_buildid_table(&machine->kernel_dsos, machine->pid,
1184 kmisc, fd);
1185 if (err == 0)
1186 err = __dsos__write_buildid_table(&machine->user_dsos,
1187 machine->pid, umisc, fd);
1188 return err;
1189}
1190
1191static int dsos__write_buildid_table(struct perf_header *header, int fd)
1192{
1193 struct perf_session *session = container_of(header,
1194 struct perf_session, header);
1195 struct rb_node *nd;
1196 int err = machine__write_buildid_table(&session->host_machine, fd);
1197
1198 if (err)
1199 return err;
1200
1201 for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
1202 struct machine *pos = rb_entry(nd, struct machine, rb_node);
1203 err = machine__write_buildid_table(pos, fd);
1204 if (err)
1205 break;
1206 }
1207 return err;
1208}
1209
1210int build_id_cache__add_s(const char *sbuild_id, const char *debugdir,
1211 const char *name, bool is_kallsyms)
1212{
1213 const size_t size = PATH_MAX;
1214 char *realname, *filename = zalloc(size),
1215 *linkname = zalloc(size), *targetname;
1216 int len, err = -1;
1217
1218 if (is_kallsyms) {
1219 if (symbol_conf.kptr_restrict) {
1220 pr_debug("Not caching a kptr_restrict'ed /proc/kallsyms\n");
1221 return 0;
1222 }
1223 realname = (char *)name;
1224 } else
1225 realname = realpath(name, NULL);
1226
1227 if (realname == NULL || filename == NULL || linkname == NULL)
1228 goto out_free;
1229
1230 len = snprintf(filename, size, "%s%s%s",
1231 debugdir, is_kallsyms ? "/" : "", realname);
1232 if (mkdir_p(filename, 0755))
1233 goto out_free;
1234
1235	snprintf(filename + len, size - len, "/%s", sbuild_id);
1236
1237 if (access(filename, F_OK)) {
1238 if (is_kallsyms) {
1239 if (copyfile("/proc/kallsyms", filename))
1240 goto out_free;
1241 } else if (link(realname, filename) && copyfile(name, filename))
1242 goto out_free;
1243 }
1244
1245 len = snprintf(linkname, size, "%s/.build-id/%.2s",
1246 debugdir, sbuild_id);
1247
1248 if (access(linkname, X_OK) && mkdir_p(linkname, 0755))
1249 goto out_free;
1250
1251 snprintf(linkname + len, size - len, "/%s", sbuild_id + 2);
1252 targetname = filename + strlen(debugdir) - 5;
1253 memcpy(targetname, "../..", 5);
1254
1255 if (symlink(targetname, linkname) == 0)
1256 err = 0;
1257out_free:
1258 if (!is_kallsyms)
1259 free(realname);
1260 free(filename);
1261 free(linkname);
1262 return err;
1263}
1264
1265static int build_id_cache__add_b(const u8 *build_id, size_t build_id_size,
1266 const char *name, const char *debugdir,
1267 bool is_kallsyms)
1268{
1269 char sbuild_id[BUILD_ID_SIZE * 2 + 1];
1270
1271 build_id__sprintf(build_id, build_id_size, sbuild_id);
1272
1273 return build_id_cache__add_s(sbuild_id, debugdir, name, is_kallsyms);
1274}
1275
1276int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir)
1277{
1278 const size_t size = PATH_MAX;
1279 char *filename = zalloc(size),
1280 *linkname = zalloc(size);
1281 int err = -1;
1282
1283 if (filename == NULL || linkname == NULL)
1284 goto out_free;
1285
1286 snprintf(linkname, size, "%s/.build-id/%.2s/%s",
1287 debugdir, sbuild_id, sbuild_id + 2);
1288
1289 if (access(linkname, F_OK))
1290 goto out_free;
1291
1292 if (readlink(linkname, filename, size - 1) < 0)
1293 goto out_free;
1294
1295 if (unlink(linkname))
1296 goto out_free;
1297
1298 /*
1299 * Since the link is relative, we must make it absolute:
1300 */
1301 snprintf(linkname, size, "%s/.build-id/%.2s/%s",
1302 debugdir, sbuild_id, filename);
1303
1304 if (unlink(linkname))
1305 goto out_free;
1306
1307 err = 0;
1308out_free:
1309 free(filename);
1310 free(linkname);
1311 return err;
1312}
1313
1314static int dso__cache_build_id(struct dso *dso, const char *debugdir)
1315{
1316 bool is_kallsyms = dso->kernel && dso->long_name[0] != '/';
1317
1318 return build_id_cache__add_b(dso->build_id, sizeof(dso->build_id),
1319 dso->long_name, debugdir, is_kallsyms);
1320}
1321
1322static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir)
1323{
1324 struct dso *pos;
1325 int err = 0;
1326
1327 dsos__for_each_with_build_id(pos, head)
1328 if (dso__cache_build_id(pos, debugdir))
1329 err = -1;
1330
1331 return err;
1332}
1333
1334static int machine__cache_build_ids(struct machine *machine, const char *debugdir)
1335{
1336 int ret = __dsos__cache_build_ids(&machine->kernel_dsos, debugdir);
1337 ret |= __dsos__cache_build_ids(&machine->user_dsos, debugdir);
1338 return ret;
1339}
1340
1341static int perf_session__cache_build_ids(struct perf_session *session)
1342{
1343 struct rb_node *nd;
1344 int ret;
1345 char debugdir[PATH_MAX];
1346
1347 snprintf(debugdir, sizeof(debugdir), "%s", buildid_dir);
1348
1349 if (mkdir(debugdir, 0755) != 0 && errno != EEXIST)
1350 return -1;
1351
1352 ret = machine__cache_build_ids(&session->host_machine, debugdir);
1353
1354 for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
1355 struct machine *pos = rb_entry(nd, struct machine, rb_node);
1356 ret |= machine__cache_build_ids(pos, debugdir);
1357 }
1358 return ret ? -1 : 0;
1359}
1360
1361static bool machine__read_build_ids(struct machine *machine, bool with_hits)
1362{
1363 bool ret = __dsos__read_build_ids(&machine->kernel_dsos, with_hits);
1364 ret |= __dsos__read_build_ids(&machine->user_dsos, with_hits);
1365 return ret;
1366}
1367
1368static bool perf_session__read_build_ids(struct perf_session *session, bool with_hits)
1369{
1370 struct rb_node *nd;
1371 bool ret = machine__read_build_ids(&session->host_machine, with_hits);
1372
1373 for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
1374 struct machine *pos = rb_entry(nd, struct machine, rb_node);
1375 ret |= machine__read_build_ids(pos, with_hits);
1376 }
1377
1378 return ret;
1379}
1380
1381static int do_write_feat(int fd, struct perf_header *h, int type, 1386static int do_write_feat(int fd, struct perf_header *h, int type,
1382 struct perf_file_section **p, 1387 struct perf_file_section **p,
1383 struct perf_evlist *evlist) 1388 struct perf_evlist *evlist)
@@ -1386,6 +1391,8 @@ static int do_write_feat(int fd, struct perf_header *h, int type,
1386 int ret = 0; 1391 int ret = 0;
1387 1392
1388 if (perf_header__has_feat(h, type)) { 1393 if (perf_header__has_feat(h, type)) {
1394 if (!feat_ops[type].write)
1395 return -1;
1389 1396
1390 (*p)->offset = lseek(fd, 0, SEEK_CUR); 1397 (*p)->offset = lseek(fd, 0, SEEK_CUR);
1391 1398
@@ -1408,18 +1415,12 @@ static int perf_header__adds_write(struct perf_header *header,
1408 struct perf_evlist *evlist, int fd) 1415 struct perf_evlist *evlist, int fd)
1409{ 1416{
1410 int nr_sections; 1417 int nr_sections;
1411 struct perf_session *session;
1412 struct perf_file_section *feat_sec, *p; 1418 struct perf_file_section *feat_sec, *p;
1413 int sec_size; 1419 int sec_size;
1414 u64 sec_start; 1420 u64 sec_start;
1421 int feat;
1415 int err; 1422 int err;
1416 1423
1417 session = container_of(header, struct perf_session, header);
1418
1419 if (perf_header__has_feat(header, HEADER_BUILD_ID &&
1420 !perf_session__read_build_ids(session, true)))
1421 perf_header__clear_feat(header, HEADER_BUILD_ID);
1422
1423 nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS); 1424 nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS);
1424 if (!nr_sections) 1425 if (!nr_sections)
1425 return 0; 1426 return 0;
@@ -1433,64 +1434,11 @@ static int perf_header__adds_write(struct perf_header *header,
1433 sec_start = header->data_offset + header->data_size; 1434 sec_start = header->data_offset + header->data_size;
1434 lseek(fd, sec_start + sec_size, SEEK_SET); 1435 lseek(fd, sec_start + sec_size, SEEK_SET);
1435 1436
1436 err = do_write_feat(fd, header, HEADER_TRACE_INFO, &p, evlist); 1437 for_each_set_bit(feat, header->adds_features, HEADER_FEAT_BITS) {
1437 if (err) 1438 if (do_write_feat(fd, header, feat, &p, evlist))
1438 goto out_free; 1439 perf_header__clear_feat(header, feat);
1439
1440 err = do_write_feat(fd, header, HEADER_BUILD_ID, &p, evlist);
1441 if (err) {
1442 perf_header__clear_feat(header, HEADER_BUILD_ID);
1443 goto out_free;
1444 } 1440 }
1445 1441
1446 err = do_write_feat(fd, header, HEADER_HOSTNAME, &p, evlist);
1447 if (err)
1448 perf_header__clear_feat(header, HEADER_HOSTNAME);
1449
1450 err = do_write_feat(fd, header, HEADER_OSRELEASE, &p, evlist);
1451 if (err)
1452 perf_header__clear_feat(header, HEADER_OSRELEASE);
1453
1454 err = do_write_feat(fd, header, HEADER_VERSION, &p, evlist);
1455 if (err)
1456 perf_header__clear_feat(header, HEADER_VERSION);
1457
1458 err = do_write_feat(fd, header, HEADER_ARCH, &p, evlist);
1459 if (err)
1460 perf_header__clear_feat(header, HEADER_ARCH);
1461
1462 err = do_write_feat(fd, header, HEADER_NRCPUS, &p, evlist);
1463 if (err)
1464 perf_header__clear_feat(header, HEADER_NRCPUS);
1465
1466 err = do_write_feat(fd, header, HEADER_CPUDESC, &p, evlist);
1467 if (err)
1468 perf_header__clear_feat(header, HEADER_CPUDESC);
1469
1470 err = do_write_feat(fd, header, HEADER_CPUID, &p, evlist);
1471 if (err)
1472 perf_header__clear_feat(header, HEADER_CPUID);
1473
1474 err = do_write_feat(fd, header, HEADER_TOTAL_MEM, &p, evlist);
1475 if (err)
1476 perf_header__clear_feat(header, HEADER_TOTAL_MEM);
1477
1478 err = do_write_feat(fd, header, HEADER_CMDLINE, &p, evlist);
1479 if (err)
1480 perf_header__clear_feat(header, HEADER_CMDLINE);
1481
1482 err = do_write_feat(fd, header, HEADER_EVENT_DESC, &p, evlist);
1483 if (err)
1484 perf_header__clear_feat(header, HEADER_EVENT_DESC);
1485
1486 err = do_write_feat(fd, header, HEADER_CPU_TOPOLOGY, &p, evlist);
1487 if (err)
1488 perf_header__clear_feat(header, HEADER_CPU_TOPOLOGY);
1489
1490 err = do_write_feat(fd, header, HEADER_NUMA_TOPOLOGY, &p, evlist);
1491 if (err)
1492 perf_header__clear_feat(header, HEADER_NUMA_TOPOLOGY);
1493
1494 lseek(fd, sec_start, SEEK_SET); 1442 lseek(fd, sec_start, SEEK_SET);
1495 /* 1443 /*
1496 * may write more than needed due to dropped feature, but 1444 * may write more than needed due to dropped feature, but
@@ -1499,7 +1447,6 @@ static int perf_header__adds_write(struct perf_header *header,
1499 err = do_write(fd, feat_sec, sec_size); 1447 err = do_write(fd, feat_sec, sec_size);
1500 if (err < 0) 1448 if (err < 0)
1501 pr_debug("failed to write feature section\n"); 1449 pr_debug("failed to write feature section\n");
1502out_free:
1503 free(feat_sec); 1450 free(feat_sec);
1504 return err; 1451 return err;
1505} 1452}
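
The dozen hand-unrolled do_write_feat() calls collapse into a single
for_each_set_bit() walk over the feature bitmap; a feature whose ->write op
fails, or is missing entirely (the new NULL check in do_write_feat()), is
simply cleared and omitted from the file, so future features only need a
FEAT_OPA() table entry to be written out. The loop reduced to its
essentials:

        int feat;

        for_each_set_bit(feat, header->adds_features, HEADER_FEAT_BITS) {
                if (do_write_feat(fd, header, feat, &p, evlist))
                        perf_header__clear_feat(header, feat);
        }
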
@@ -1637,20 +1584,20 @@ static int perf_header__getbuffer64(struct perf_header *header,
1637int perf_header__process_sections(struct perf_header *header, int fd, 1584int perf_header__process_sections(struct perf_header *header, int fd,
1638 void *data, 1585 void *data,
1639 int (*process)(struct perf_file_section *section, 1586 int (*process)(struct perf_file_section *section,
1640 struct perf_header *ph, 1587 struct perf_header *ph,
1641 int feat, int fd, void *data)) 1588 int feat, int fd, void *data))
1642{ 1589{
1643 struct perf_file_section *feat_sec; 1590 struct perf_file_section *feat_sec, *sec;
1644 int nr_sections; 1591 int nr_sections;
1645 int sec_size; 1592 int sec_size;
1646 int idx = 0; 1593 int feat;
1647 int err = -1, feat = 1; 1594 int err;
1648 1595
1649 nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS); 1596 nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS);
1650 if (!nr_sections) 1597 if (!nr_sections)
1651 return 0; 1598 return 0;
1652 1599
1653 feat_sec = calloc(sizeof(*feat_sec), nr_sections); 1600 feat_sec = sec = calloc(sizeof(*feat_sec), nr_sections);
1654 if (!feat_sec) 1601 if (!feat_sec)
1655 return -1; 1602 return -1;
1656 1603
@@ -1658,20 +1605,16 @@ int perf_header__process_sections(struct perf_header *header, int fd,
1658 1605
1659 lseek(fd, header->data_offset + header->data_size, SEEK_SET); 1606 lseek(fd, header->data_offset + header->data_size, SEEK_SET);
1660 1607
1661 if (perf_header__getbuffer64(header, fd, feat_sec, sec_size)) 1608 err = perf_header__getbuffer64(header, fd, feat_sec, sec_size);
1609 if (err < 0)
1662 goto out_free; 1610 goto out_free;
1663 1611
1664 err = 0; 1612 for_each_set_bit(feat, header->adds_features, HEADER_LAST_FEATURE) {
1665 while (idx < nr_sections && feat < HEADER_LAST_FEATURE) { 1613 err = process(sec++, header, feat, fd, data);
1666 if (perf_header__has_feat(header, feat)) { 1614 if (err < 0)
1667 struct perf_file_section *sec = &feat_sec[idx++]; 1615 goto out_free;
1668
1669 err = process(sec, header, feat, fd, data);
1670 if (err < 0)
1671 break;
1672 }
1673 ++feat;
1674 } 1616 }
1617 err = 0;
1675out_free: 1618out_free:
1676 free(feat_sec); 1619 free(feat_sec);
1677 return err; 1620 return err;
@@ -1906,32 +1849,21 @@ static int perf_file_section__process(struct perf_file_section *section,
1906 return 0; 1849 return 0;
1907 } 1850 }
1908 1851
1852 if (feat >= HEADER_LAST_FEATURE) {
1853 pr_debug("unknown feature %d, continuing...\n", feat);
1854 return 0;
1855 }
1856
1909 switch (feat) { 1857 switch (feat) {
1910 case HEADER_TRACE_INFO: 1858 case HEADER_TRACE_INFO:
1911 trace_report(fd, false); 1859 trace_report(fd, false);
1912 break; 1860 break;
1913
1914 case HEADER_BUILD_ID: 1861 case HEADER_BUILD_ID:
1915 if (perf_header__read_build_ids(ph, fd, section->offset, section->size)) 1862 if (perf_header__read_build_ids(ph, fd, section->offset, section->size))
1916 pr_debug("Failed to read buildids, continuing...\n"); 1863 pr_debug("Failed to read buildids, continuing...\n");
1917 break; 1864 break;
1918
1919 case HEADER_HOSTNAME:
1920 case HEADER_OSRELEASE:
1921 case HEADER_VERSION:
1922 case HEADER_ARCH:
1923 case HEADER_NRCPUS:
1924 case HEADER_CPUDESC:
1925 case HEADER_CPUID:
1926 case HEADER_TOTAL_MEM:
1927 case HEADER_CMDLINE:
1928 case HEADER_EVENT_DESC:
1929 case HEADER_CPU_TOPOLOGY:
1930 case HEADER_NUMA_TOPOLOGY:
1931 break;
1932
1933 default: 1865 default:
1934 pr_debug("unknown feature %d, continuing...\n", feat); 1866 break;
1935 } 1867 }
1936 1868
1937 return 0; 1869 return 0;
@@ -2041,6 +1973,8 @@ int perf_session__read_header(struct perf_session *session, int fd)
2041 lseek(fd, tmp, SEEK_SET); 1973 lseek(fd, tmp, SEEK_SET);
2042 } 1974 }
2043 1975
1976 symbol_conf.nr_events = nr_attrs;
1977
2044 if (f_header.event_types.size) { 1978 if (f_header.event_types.size) {
2045 lseek(fd, f_header.event_types.offset, SEEK_SET); 1979 lseek(fd, f_header.event_types.offset, SEEK_SET);
2046 events = malloc(f_header.event_types.size); 1980 events = malloc(f_header.event_types.size);
@@ -2068,9 +2002,9 @@ out_delete_evlist:
2068 return -ENOMEM; 2002 return -ENOMEM;
2069} 2003}
2070 2004
2071int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, 2005int perf_event__synthesize_attr(struct perf_tool *tool,
2072 perf_event__handler_t process, 2006 struct perf_event_attr *attr, u16 ids, u64 *id,
2073 struct perf_session *session) 2007 perf_event__handler_t process)
2074{ 2008{
2075 union perf_event *ev; 2009 union perf_event *ev;
2076 size_t size; 2010 size_t size;
@@ -2092,22 +2026,23 @@ int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
2092 ev->attr.header.type = PERF_RECORD_HEADER_ATTR; 2026 ev->attr.header.type = PERF_RECORD_HEADER_ATTR;
2093 ev->attr.header.size = size; 2027 ev->attr.header.size = size;
2094 2028
2095 err = process(ev, NULL, session); 2029 err = process(tool, ev, NULL, NULL);
2096 2030
2097 free(ev); 2031 free(ev);
2098 2032
2099 return err; 2033 return err;
2100} 2034}
2101 2035
2102int perf_session__synthesize_attrs(struct perf_session *session, 2036int perf_event__synthesize_attrs(struct perf_tool *tool,
2037 struct perf_session *session,
2103 perf_event__handler_t process) 2038 perf_event__handler_t process)
2104{ 2039{
2105 struct perf_evsel *attr; 2040 struct perf_evsel *attr;
2106 int err = 0; 2041 int err = 0;
2107 2042
2108 list_for_each_entry(attr, &session->evlist->entries, node) { 2043 list_for_each_entry(attr, &session->evlist->entries, node) {
2109 err = perf_event__synthesize_attr(&attr->attr, attr->ids, 2044 err = perf_event__synthesize_attr(tool, &attr->attr, attr->ids,
2110 attr->id, process, session); 2045 attr->id, process);
2111 if (err) { 2046 if (err) {
2112 pr_debug("failed to create perf header attribute\n"); 2047 pr_debug("failed to create perf header attribute\n");
2113 return err; 2048 return err;
@@ -2118,23 +2053,23 @@ int perf_session__synthesize_attrs(struct perf_session *session,
2118} 2053}
2119 2054
2120int perf_event__process_attr(union perf_event *event, 2055int perf_event__process_attr(union perf_event *event,
2121 struct perf_session *session) 2056 struct perf_evlist **pevlist)
2122{ 2057{
2123 unsigned int i, ids, n_ids; 2058 unsigned int i, ids, n_ids;
2124 struct perf_evsel *evsel; 2059 struct perf_evsel *evsel;
2060 struct perf_evlist *evlist = *pevlist;
2125 2061
2126 if (session->evlist == NULL) { 2062 if (evlist == NULL) {
2127 session->evlist = perf_evlist__new(NULL, NULL); 2063 *pevlist = evlist = perf_evlist__new(NULL, NULL);
2128 if (session->evlist == NULL) 2064 if (evlist == NULL)
2129 return -ENOMEM; 2065 return -ENOMEM;
2130 } 2066 }
2131 2067
2132 evsel = perf_evsel__new(&event->attr.attr, 2068 evsel = perf_evsel__new(&event->attr.attr, evlist->nr_entries);
2133 session->evlist->nr_entries);
2134 if (evsel == NULL) 2069 if (evsel == NULL)
2135 return -ENOMEM; 2070 return -ENOMEM;
2136 2071
2137 perf_evlist__add(session->evlist, evsel); 2072 perf_evlist__add(evlist, evsel);
2138 2073
2139 ids = event->header.size; 2074 ids = event->header.size;
2140 ids -= (void *)&event->attr.id - (void *)event; 2075 ids -= (void *)&event->attr.id - (void *)event;
@@ -2148,18 +2083,16 @@ int perf_event__process_attr(union perf_event *event,
2148 return -ENOMEM; 2083 return -ENOMEM;
2149 2084
2150 for (i = 0; i < n_ids; i++) { 2085 for (i = 0; i < n_ids; i++) {
2151 perf_evlist__id_add(session->evlist, evsel, 0, i, 2086 perf_evlist__id_add(evlist, evsel, 0, i, event->attr.id[i]);
2152 event->attr.id[i]);
2153 } 2087 }
2154 2088
2155 perf_session__update_sample_type(session);
2156
2157 return 0; 2089 return 0;
2158} 2090}
2159 2091
2160int perf_event__synthesize_event_type(u64 event_id, char *name, 2092int perf_event__synthesize_event_type(struct perf_tool *tool,
2093 u64 event_id, char *name,
2161 perf_event__handler_t process, 2094 perf_event__handler_t process,
2162 struct perf_session *session) 2095 struct machine *machine)
2163{ 2096{
2164 union perf_event ev; 2097 union perf_event ev;
2165 size_t size = 0; 2098 size_t size = 0;
@@ -2177,13 +2110,14 @@ int perf_event__synthesize_event_type(u64 event_id, char *name,
2177 ev.event_type.header.size = sizeof(ev.event_type) - 2110 ev.event_type.header.size = sizeof(ev.event_type) -
2178 (sizeof(ev.event_type.event_type.name) - size); 2111 (sizeof(ev.event_type.event_type.name) - size);
2179 2112
2180 err = process(&ev, NULL, session); 2113 err = process(tool, &ev, NULL, machine);
2181 2114
2182 return err; 2115 return err;
2183} 2116}
2184 2117
2185int perf_event__synthesize_event_types(perf_event__handler_t process, 2118int perf_event__synthesize_event_types(struct perf_tool *tool,
2186 struct perf_session *session) 2119 perf_event__handler_t process,
2120 struct machine *machine)
2187{ 2121{
2188 struct perf_trace_event_type *type; 2122 struct perf_trace_event_type *type;
2189 int i, err = 0; 2123 int i, err = 0;
@@ -2191,9 +2125,9 @@ int perf_event__synthesize_event_types(perf_event__handler_t process,
2191 for (i = 0; i < event_count; i++) { 2125 for (i = 0; i < event_count; i++) {
2192 type = &events[i]; 2126 type = &events[i];
2193 2127
2194 err = perf_event__synthesize_event_type(type->event_id, 2128 err = perf_event__synthesize_event_type(tool, type->event_id,
2195 type->name, process, 2129 type->name, process,
2196 session); 2130 machine);
2197 if (err) { 2131 if (err) {
2198 pr_debug("failed to create perf header event type\n"); 2132 pr_debug("failed to create perf header event type\n");
2199 return err; 2133 return err;
@@ -2203,8 +2137,8 @@ int perf_event__synthesize_event_types(perf_event__handler_t process,
2203 return err; 2137 return err;
2204} 2138}
2205 2139
2206int perf_event__process_event_type(union perf_event *event, 2140int perf_event__process_event_type(struct perf_tool *tool __unused,
2207 struct perf_session *session __unused) 2141 union perf_event *event)
2208{ 2142{
2209 if (perf_header__push_event(event->event_type.event_type.event_id, 2143 if (perf_header__push_event(event->event_type.event_type.event_id,
2210 event->event_type.event_type.name) < 0) 2144 event->event_type.event_type.name) < 0)
@@ -2213,9 +2147,9 @@ int perf_event__process_event_type(union perf_event *event,
2213 return 0; 2147 return 0;
2214} 2148}
2215 2149
2216int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist, 2150int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd,
2217 perf_event__handler_t process, 2151 struct perf_evlist *evlist,
2218 struct perf_session *session __unused) 2152 perf_event__handler_t process)
2219{ 2153{
2220 union perf_event ev; 2154 union perf_event ev;
2221 struct tracing_data *tdata; 2155 struct tracing_data *tdata;
@@ -2246,7 +2180,7 @@ int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
2246 ev.tracing_data.header.size = sizeof(ev.tracing_data); 2180 ev.tracing_data.header.size = sizeof(ev.tracing_data);
2247 ev.tracing_data.size = aligned_size; 2181 ev.tracing_data.size = aligned_size;
2248 2182
2249 process(&ev, NULL, session); 2183 process(tool, &ev, NULL, NULL);
2250 2184
2251 /* 2185 /*
2252 * The put function will copy all the tracing data 2186 * The put function will copy all the tracing data
@@ -2288,10 +2222,10 @@ int perf_event__process_tracing_data(union perf_event *event,
2288 return size_read + padding; 2222 return size_read + padding;
2289} 2223}
2290 2224
2291int perf_event__synthesize_build_id(struct dso *pos, u16 misc, 2225int perf_event__synthesize_build_id(struct perf_tool *tool,
2226 struct dso *pos, u16 misc,
2292 perf_event__handler_t process, 2227 perf_event__handler_t process,
2293 struct machine *machine, 2228 struct machine *machine)
2294 struct perf_session *session)
2295{ 2229{
2296 union perf_event ev; 2230 union perf_event ev;
2297 size_t len; 2231 size_t len;
@@ -2311,12 +2245,13 @@ int perf_event__synthesize_build_id(struct dso *pos, u16 misc,
2311 ev.build_id.header.size = sizeof(ev.build_id) + len; 2245 ev.build_id.header.size = sizeof(ev.build_id) + len;
2312 memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len); 2246 memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len);
2313 2247
2314 err = process(&ev, NULL, session); 2248 err = process(tool, &ev, NULL, machine);
2315 2249
2316 return err; 2250 return err;
2317} 2251}
2318 2252
2319int perf_event__process_build_id(union perf_event *event, 2253int perf_event__process_build_id(struct perf_tool *tool __used,
2254 union perf_event *event,
2320 struct perf_session *session) 2255 struct perf_session *session)
2321{ 2256{
2322 __event_process_build_id(&event->build_id, 2257 __event_process_build_id(&event->build_id,
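
Across this file the synthesize/process entry points gain a leading
struct perf_tool argument and shed their perf_session one, so per-tool
state travels with the tool rather than hanging off the session. Judging
from the call sites above (process(tool, &ev, NULL, machine) and friends),
the reworked handler shape is roughly:

        /* inferred from the call sites in this patch, not quoted verbatim */
        typedef int (*perf_event__handler_t)(struct perf_tool *tool,
                                             union perf_event *event,
                                             struct perf_sample *sample,
                                             struct machine *machine);
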
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 3d5a742f4a2a..ac4ec956024e 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -10,7 +10,8 @@
 #include <linux/bitmap.h>
 
 enum {
-	HEADER_TRACE_INFO = 1,
+	HEADER_RESERVED = 0,	/* always cleared */
+	HEADER_TRACE_INFO = 1,
 	HEADER_BUILD_ID,
 
 	HEADER_HOSTNAME,
@@ -27,10 +28,9 @@ enum {
 	HEADER_NUMA_TOPOLOGY,
 
 	HEADER_LAST_FEATURE,
+	HEADER_FEAT_BITS = 256,
 };
 
-#define HEADER_FEAT_BITS 256
-
 struct perf_file_section {
 	u64 offset;
 	u64 size;
@@ -68,6 +68,7 @@ struct perf_header {
 };
 
 struct perf_evlist;
+struct perf_session;
 
 int perf_session__read_header(struct perf_session *session, int fd);
 int perf_session__write_header(struct perf_session *session,
@@ -96,32 +97,36 @@ int build_id_cache__add_s(const char *sbuild_id, const char *debugdir,
 			  const char *name, bool is_kallsyms);
 int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir);
 
-int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
-				perf_event__handler_t process,
-				struct perf_session *session);
-int perf_session__synthesize_attrs(struct perf_session *session,
-				   perf_event__handler_t process);
-int perf_event__process_attr(union perf_event *event, struct perf_session *session);
+int perf_event__synthesize_attr(struct perf_tool *tool,
+				struct perf_event_attr *attr, u16 ids, u64 *id,
+				perf_event__handler_t process);
+int perf_event__synthesize_attrs(struct perf_tool *tool,
+				 struct perf_session *session,
+				 perf_event__handler_t process);
+int perf_event__process_attr(union perf_event *event, struct perf_evlist **pevlist);
 
-int perf_event__synthesize_event_type(u64 event_id, char *name,
+int perf_event__synthesize_event_type(struct perf_tool *tool,
+				      u64 event_id, char *name,
 				      perf_event__handler_t process,
-				      struct perf_session *session);
-int perf_event__synthesize_event_types(perf_event__handler_t process,
-				       struct perf_session *session);
-int perf_event__process_event_type(union perf_event *event,
-				   struct perf_session *session);
+				      struct machine *machine);
+int perf_event__synthesize_event_types(struct perf_tool *tool,
+				       perf_event__handler_t process,
+				       struct machine *machine);
+int perf_event__process_event_type(struct perf_tool *tool,
+				   union perf_event *event);
 
-int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
-					perf_event__handler_t process,
-					struct perf_session *session);
+int perf_event__synthesize_tracing_data(struct perf_tool *tool,
+					int fd, struct perf_evlist *evlist,
+					perf_event__handler_t process);
 int perf_event__process_tracing_data(union perf_event *event,
 				     struct perf_session *session);
 
-int perf_event__synthesize_build_id(struct dso *pos, u16 misc,
+int perf_event__synthesize_build_id(struct perf_tool *tool,
+				    struct dso *pos, u16 misc,
 				    perf_event__handler_t process,
-				    struct machine *machine,
-				    struct perf_session *session);
-int perf_event__process_build_id(union perf_event *event,
-				 struct perf_session *session);
+				    struct machine *machine);
+int perf_event__process_build_id(struct perf_tool *tool,
+				 union perf_event *event,
+				 struct perf_session *session);
 
 /*
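A quick illustration, not part of the patch: folding HEADER_FEAT_BITS into the enum keeps the feature indices and the bitmap size in one place, so a reader can walk the set feature bits with the for_each_set_bit() iterator added below in bitops.h. A hedged sketch; the adds_features bitmap member of struct perf_header is assumed here, it is not shown in this hunk:

	/* Sketch: walk the set bits of an assumed adds_features bitmap. */
	static void print_features(struct perf_header *header)
	{
		int feat;

		/* visits HEADER_TRACE_INFO (1), HEADER_BUILD_ID (2), ... in order */
		for_each_set_bit(feat, header->adds_features, HEADER_FEAT_BITS)
			printf("feature bit %d set\n", feat);
	}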
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 89289c8e935e..ff6f9d56ea41 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -117,7 +117,6 @@ int perf_evlist__tui_browse_hists(struct perf_evlist *evlist __used,
 
 static inline int hist_entry__tui_annotate(struct hist_entry *self __used,
 					   int evidx __used,
-					   int nr_events __used,
 					   void(*timer)(void *arg) __used,
 					   void *arg __used,
 					   int delay_secs __used)
@@ -128,7 +127,7 @@ static inline int hist_entry__tui_annotate(struct hist_entry *self __used,
 #define K_RIGHT -2
 #else
 #include "ui/keysyms.h"
-int hist_entry__tui_annotate(struct hist_entry *he, int evidx, int nr_events,
+int hist_entry__tui_annotate(struct hist_entry *he, int evidx,
 			     void(*timer)(void *arg), void *arg, int delay_secs);
 
 int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help,
diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h
index 305c8484f200..62cdee78db7b 100644
--- a/tools/perf/util/include/linux/bitops.h
+++ b/tools/perf/util/include/linux/bitops.h
@@ -9,6 +9,17 @@
 #define BITS_PER_BYTE		8
 #define BITS_TO_LONGS(nr)	DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
 
+#define for_each_set_bit(bit, addr, size) \
+	for ((bit) = find_first_bit((addr), (size));		\
+	     (bit) < (size);					\
+	     (bit) = find_next_bit((addr), (size), (bit) + 1))
+
+/* same as for_each_set_bit() but use bit as value to start with */
+#define for_each_set_bit_cont(bit, addr, size) \
+	for ((bit) = find_next_bit((addr), (size), (bit));	\
+	     (bit) < (size);					\
+	     (bit) = find_next_bit((addr), (size), (bit) + 1))
+
 static inline void set_bit(int nr, unsigned long *addr)
 {
 	addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
@@ -30,4 +41,111 @@ static inline unsigned long hweight_long(unsigned long w)
 	return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
 }
 
+#define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
+
+/**
+ * __ffs - find first bit in word.
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static __always_inline unsigned long __ffs(unsigned long word)
+{
+	int num = 0;
+
+#if BITS_PER_LONG == 64
+	if ((word & 0xffffffff) == 0) {
+		num += 32;
+		word >>= 32;
+	}
+#endif
+	if ((word & 0xffff) == 0) {
+		num += 16;
+		word >>= 16;
+	}
+	if ((word & 0xff) == 0) {
+		num += 8;
+		word >>= 8;
+	}
+	if ((word & 0xf) == 0) {
+		num += 4;
+		word >>= 4;
+	}
+	if ((word & 0x3) == 0) {
+		num += 2;
+		word >>= 2;
+	}
+	if ((word & 0x1) == 0)
+		num += 1;
+	return num;
+}
+
+/*
+ * Find the first set bit in a memory region.
+ */
+static inline unsigned long
+find_first_bit(const unsigned long *addr, unsigned long size)
+{
+	const unsigned long *p = addr;
+	unsigned long result = 0;
+	unsigned long tmp;
+
+	while (size & ~(BITS_PER_LONG-1)) {
+		if ((tmp = *(p++)))
+			goto found;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+
+	tmp = (*p) & (~0UL >> (BITS_PER_LONG - size));
+	if (tmp == 0UL)		/* Are any bits set? */
+		return result + size;	/* Nope. */
+found:
+	return result + __ffs(tmp);
+}
+
+/*
+ * Find the next set bit in a memory region.
+ */
+static inline unsigned long
+find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset)
+{
+	const unsigned long *p = addr + BITOP_WORD(offset);
+	unsigned long result = offset & ~(BITS_PER_LONG-1);
+	unsigned long tmp;
+
+	if (offset >= size)
+		return size;
+	size -= result;
+	offset %= BITS_PER_LONG;
+	if (offset) {
+		tmp = *(p++);
+		tmp &= (~0UL << offset);
+		if (size < BITS_PER_LONG)
+			goto found_first;
+		if (tmp)
+			goto found_middle;
+		size -= BITS_PER_LONG;
+		result += BITS_PER_LONG;
+	}
+	while (size & ~(BITS_PER_LONG-1)) {
+		if ((tmp = *(p++)))
+			goto found_middle;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+	tmp = *p;
+
+found_first:
+	tmp &= (~0UL >> (BITS_PER_LONG - size));
+	if (tmp == 0UL)		/* Are any bits set? */
+		return result + size;	/* Nope. */
+found_middle:
+	return result + __ffs(tmp);
+}
+
 #endif
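A minimal, self-contained usage sketch (not from the patch) of the iterators this header now provides, assuming only what the header itself defines plus stdio:

	#include <stdio.h>
	/* plus the bitops.h shown above */

	int main(void)
	{
		unsigned long bitmap[BITS_TO_LONGS(64)] = { 0 };
		int bit;

		set_bit(3, bitmap);
		set_bit(42, bitmap);

		/* visits each set bit in ascending order: 3, then 42 */
		for_each_set_bit(bit, bitmap, 64)
			printf("bit %d is set\n", bit);

		return 0;
	}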
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 78284b13e808..316aa0ab7122 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -562,6 +562,10 @@ int machine__init(struct machine *self, const char *root_dir, pid_t pid)
 	INIT_LIST_HEAD(&self->user_dsos);
 	INIT_LIST_HEAD(&self->kernel_dsos);
 
+	self->threads = RB_ROOT;
+	INIT_LIST_HEAD(&self->dead_threads);
+	self->last_match = NULL;
+
 	self->kmaps.machine = self;
 	self->pid = pid;
 	self->root_dir = strdup(root_dir);
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index 890d85545d0f..2b8017f8a930 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -18,9 +18,11 @@ enum map_type {
 extern const char *map_type__name[MAP__NR_TYPES];
 
 struct dso;
+struct ip_callchain;
 struct ref_reloc_sym;
 struct map_groups;
 struct machine;
+struct perf_evsel;
 
 struct map {
 	union {
@@ -61,7 +63,11 @@ struct map_groups {
 struct machine {
 	struct rb_node rb_node;
 	pid_t pid;
+	u16 id_hdr_size;
 	char *root_dir;
+	struct rb_root threads;
+	struct list_head dead_threads;
+	struct thread *last_match;
 	struct list_head user_dsos;
 	struct list_head kernel_dsos;
 	struct map_groups kmaps;
@@ -148,6 +154,13 @@ int machine__init(struct machine *self, const char *root_dir, pid_t pid);
 void machine__exit(struct machine *self);
 void machine__delete(struct machine *self);
 
+int machine__resolve_callchain(struct machine *machine,
+			       struct perf_evsel *evsel, struct thread *thread,
+			       struct ip_callchain *chain,
+			       struct symbol **parent);
+int maps__set_kallsyms_ref_reloc_sym(struct map **maps, const char *symbol_name,
+				     u64 addr);
+
 /*
  * Default guest kernel is defined by parameter --guestkallsyms
  * and --guestmodules
@@ -190,6 +203,12 @@ struct symbol *map_groups__find_symbol_by_name(struct map_groups *mg,
 					       struct map **mapp,
 					       symbol_filter_t filter);
 
+
+struct thread *machine__findnew_thread(struct machine *machine, pid_t pid);
+void machine__remove_thread(struct machine *machine, struct thread *th);
+
+size_t machine__fprintf(struct machine *machine, FILE *fp);
+
 static inline
 struct symbol *machine__find_kernel_symbol(struct machine *self,
 					   enum map_type type, u64 addr,
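With the thread tree moved from perf_session into struct machine, per-event thread lookup now goes through the machine that owns the event. A hedged sketch of the intended call pattern; the handler shape and the pid field of struct perf_sample are assumptions based on the call sites later in this diff:

	/* Sketch: resolve the thread for a sample via its machine. */
	static int handle_sample(struct machine *machine, struct perf_sample *sample)
	{
		struct thread *thread = machine__findnew_thread(machine, sample->pid);

		if (thread == NULL)
			return -1;
		/* ... resolve maps and symbols against this thread ... */
		return 0;
	}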
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 928918b796b2..531c283fc0c5 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -25,8 +25,6 @@ enum event_result {
 	EVT_HANDLED_ALL
 };
 
-char debugfs_path[MAXPATHLEN];
-
 #define CHW(x) .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_##x
 #define CSW(x) .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_##x
 
@@ -40,6 +38,7 @@ static struct event_symbol event_symbols[] = {
 	{ CHW(BRANCH_INSTRUCTIONS), "branch-instructions", "branches" },
 	{ CHW(BRANCH_MISSES), "branch-misses", "" },
 	{ CHW(BUS_CYCLES), "bus-cycles", "" },
+	{ CHW(REF_CPU_CYCLES), "ref-cycles", "" },
 
 	{ CSW(CPU_CLOCK), "cpu-clock", "" },
 	{ CSW(TASK_CLOCK), "task-clock", "" },
@@ -70,6 +69,7 @@ static const char *hw_event_names[PERF_COUNT_HW_MAX] = {
 	"bus-cycles",
 	"stalled-cycles-frontend",
 	"stalled-cycles-backend",
+	"ref-cycles",
 };
 
 static const char *sw_event_names[PERF_COUNT_SW_MAX] = {
@@ -140,7 +140,7 @@ static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir)
 	char evt_path[MAXPATHLEN];
 	int fd;
 
-	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path,
+	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", tracing_events_path,
 		 sys_dir->d_name, evt_dir->d_name);
 	fd = open(evt_path, O_RDONLY);
 	if (fd < 0)
@@ -171,16 +171,16 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config)
 	char evt_path[MAXPATHLEN];
 	char dir_path[MAXPATHLEN];
 
-	if (debugfs_valid_mountpoint(debugfs_path))
+	if (debugfs_valid_mountpoint(tracing_events_path))
 		return NULL;
 
-	sys_dir = opendir(debugfs_path);
+	sys_dir = opendir(tracing_events_path);
 	if (!sys_dir)
 		return NULL;
 
 	for_each_subsystem(sys_dir, sys_dirent, sys_next) {
 
-		snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path,
+		snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path,
 			 sys_dirent.d_name);
 		evt_dir = opendir(dir_path);
 		if (!evt_dir)
@@ -447,7 +447,7 @@ parse_single_tracepoint_event(char *sys_name,
 	u64 id;
 	int fd;
 
-	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path,
+	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", tracing_events_path,
 		 sys_name, evt_name);
 
 	fd = open(evt_path, O_RDONLY);
@@ -485,7 +485,7 @@ parse_multiple_tracepoint_event(struct perf_evlist *evlist, char *sys_name,
 	struct dirent *evt_ent;
 	DIR *evt_dir;
 
-	snprintf(evt_path, MAXPATHLEN, "%s/%s", debugfs_path, sys_name);
+	snprintf(evt_path, MAXPATHLEN, "%s/%s", tracing_events_path, sys_name);
 	evt_dir = opendir(evt_path);
 
 	if (!evt_dir) {
@@ -528,7 +528,7 @@ parse_tracepoint_event(struct perf_evlist *evlist, const char **strp,
 	char sys_name[MAX_EVENT_LENGTH];
 	unsigned int sys_length, evt_length;
 
-	if (debugfs_valid_mountpoint(debugfs_path))
+	if (debugfs_valid_mountpoint(tracing_events_path))
 		return 0;
 
 	evt_name = strchr(*strp, ':');
@@ -920,10 +920,10 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob)
 	char evt_path[MAXPATHLEN];
 	char dir_path[MAXPATHLEN];
 
-	if (debugfs_valid_mountpoint(debugfs_path))
+	if (debugfs_valid_mountpoint(tracing_events_path))
 		return;
 
-	sys_dir = opendir(debugfs_path);
+	sys_dir = opendir(tracing_events_path);
 	if (!sys_dir)
 		return;
 
@@ -932,7 +932,7 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob)
 		    !strglobmatch(sys_dirent.d_name, subsys_glob))
 			continue;
 
-		snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path,
+		snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path,
 			 sys_dirent.d_name);
 		evt_dir = opendir(dir_path);
 		if (!evt_dir)
@@ -964,16 +964,16 @@ int is_valid_tracepoint(const char *event_string)
 	char evt_path[MAXPATHLEN];
 	char dir_path[MAXPATHLEN];
 
-	if (debugfs_valid_mountpoint(debugfs_path))
+	if (debugfs_valid_mountpoint(tracing_events_path))
 		return 0;
 
-	sys_dir = opendir(debugfs_path);
+	sys_dir = opendir(tracing_events_path);
 	if (!sys_dir)
 		return 0;
 
 	for_each_subsystem(sys_dir, sys_dirent, sys_next) {
 
-		snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path,
+		snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path,
 			 sys_dirent.d_name);
 		evt_dir = opendir(dir_path);
 		if (!evt_dir)
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 2f8e375e038d..7e0cbe75d5f1 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -39,7 +39,6 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob);
 int print_hwcache_events(const char *event_glob);
 extern int is_valid_tracepoint(const char *event_string);
 
-extern char debugfs_path[];
 extern int valid_debugfs_mount(const char *debugfs);
 
 #endif /* __PERF_PARSE_EVENTS_H */
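The global debugfs_path is gone; the call sites above now build paths from tracing_events_path, which (an assumption, it is defined elsewhere in util, not in this diff) points at the tracing events directory itself, typically /sys/kernel/debug/tracing/events. A short sketch of the path shape the snprintf() calls above produce:

	/* Sketch: same path composition as the call sites in parse-events.c. */
	char evt_path[MAXPATHLEN];

	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id",
		 tracing_events_path, "sched", "sched_switch");
	/* -> /sys/kernel/debug/tracing/events/sched/sched_switch/id */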
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
index 1132c8f0ce89..17e94d0c36f9 100644
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -5,7 +5,6 @@
 #include "util.h"
 #include "probe-event.h"
 
-#define MAX_PATH_LEN 256
 #define MAX_PROBE_BUFFER	1024
 #define MAX_PROBES		 128
 
diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c
index 74350ffb57fe..e30749e38a9b 100644
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -27,7 +27,10 @@
 
 #include "../../perf.h"
 #include "../util.h"
+#include "../thread.h"
+#include "../event.h"
 #include "../trace-event.h"
+#include "../evsel.h"
 
 #include <EXTERN.h>
 #include <perl.h>
@@ -245,11 +248,11 @@ static inline struct event *find_cache_event(int type)
 	return event;
 }
 
-static void perl_process_event(union perf_event *pevent __unused,
+static void perl_process_tracepoint(union perf_event *pevent __unused,
 			       struct perf_sample *sample,
 			       struct perf_evsel *evsel,
-			       struct perf_session *session __unused,
+			       struct machine *machine __unused,
 			       struct thread *thread)
 {
 	struct format_field *field;
 	static char handler[256];
@@ -265,6 +268,9 @@ static void perl_process_event(union perf_event *pevent __unused,
 
 	dSP;
 
+	if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
+		return;
+
 	type = trace_parse_common_type(data);
 
 	event = find_cache_event(type);
@@ -332,6 +338,42 @@ static void perl_process_event(union perf_event *pevent __unused,
 	LEAVE;
 }
 
+static void perl_process_event_generic(union perf_event *pevent __unused,
+				       struct perf_sample *sample,
+				       struct perf_evsel *evsel __unused,
+				       struct machine *machine __unused,
+				       struct thread *thread __unused)
+{
+	dSP;
+
+	if (!get_cv("process_event", 0))
+		return;
+
+	ENTER;
+	SAVETMPS;
+	PUSHMARK(SP);
+	XPUSHs(sv_2mortal(newSVpvn((const char *)pevent, pevent->header.size)));
+	XPUSHs(sv_2mortal(newSVpvn((const char *)&evsel->attr, sizeof(evsel->attr))));
+	XPUSHs(sv_2mortal(newSVpvn((const char *)sample, sizeof(*sample))));
+	XPUSHs(sv_2mortal(newSVpvn((const char *)sample->raw_data, sample->raw_size)));
+	PUTBACK;
+	call_pv("process_event", G_SCALAR);
+	SPAGAIN;
+	PUTBACK;
+	FREETMPS;
+	LEAVE;
+}
+
+static void perl_process_event(union perf_event *pevent,
+			       struct perf_sample *sample,
+			       struct perf_evsel *evsel,
+			       struct machine *machine,
+			       struct thread *thread)
+{
+	perl_process_tracepoint(pevent, sample, evsel, machine, thread);
+	perl_process_event_generic(pevent, sample, evsel, machine, thread);
+}
+
 static void run_start_sub(void)
 {
 	dSP; /* access to Perl stack */
@@ -553,7 +595,28 @@ static int perl_generate_script(const char *outfile)
 	fprintf(ofp, "sub print_header\n{\n"
 		"\tmy ($event_name, $cpu, $secs, $nsecs, $pid, $comm) = @_;\n\n"
 		"\tprintf(\"%%-20s %%5u %%05u.%%09u %%8u %%-20s \",\n\t "
-		"$event_name, $cpu, $secs, $nsecs, $pid, $comm);\n}");
+		"$event_name, $cpu, $secs, $nsecs, $pid, $comm);\n}\n");
+
+	fprintf(ofp,
+		"\n# Packed byte string args of process_event():\n"
+		"#\n"
+		"# $event:\tunion perf_event\tutil/event.h\n"
+		"# $attr:\tstruct perf_event_attr\tlinux/perf_event.h\n"
+		"# $sample:\tstruct perf_sample\tutil/event.h\n"
+		"# $raw_data:\tperf_sample->raw_data\tutil/event.h\n"
+		"\n"
+		"sub process_event\n"
+		"{\n"
+		"\tmy ($event, $attr, $sample, $raw_data) = @_;\n"
+		"\n"
+		"\tmy @event\t= unpack(\"LSS\", $event);\n"
+		"\tmy @attr\t= unpack(\"LLQQQQQLLQQ\", $attr);\n"
+		"\tmy @sample\t= unpack(\"QLLQQQQQLL\", $sample);\n"
+		"\tmy @raw_data\t= unpack(\"C*\", $raw_data);\n"
+		"\n"
+		"\tuse Data::Dumper;\n"
+		"\tprint Dumper \\@event, \\@attr, \\@sample, \\@raw_data;\n"
+		"}\n");
 
 	fclose(ofp);
 
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 6ccf70e8d8f2..0b2a48783172 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -29,6 +29,8 @@
 
 #include "../../perf.h"
 #include "../util.h"
+#include "../event.h"
+#include "../thread.h"
 #include "../trace-event.h"
 
 PyMODINIT_FUNC initperf_trace_context(void);
@@ -207,7 +209,7 @@ static inline struct event *find_cache_event(int type)
 static void python_process_event(union perf_event *pevent __unused,
 				 struct perf_sample *sample,
 				 struct perf_evsel *evsel __unused,
-				 struct perf_session *session __unused,
+				 struct machine *machine __unused,
 				 struct thread *thread)
 {
 	PyObject *handler, *retval, *context, *t, *obj, *dict = NULL;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 0f4555ce9063..b5ca2558c7bb 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -10,6 +10,7 @@
 #include "evlist.h"
 #include "evsel.h"
 #include "session.h"
+#include "tool.h"
 #include "sort.h"
 #include "util.h"
 #include "cpumap.h"
@@ -78,39 +79,13 @@ out_close:
 	return -1;
 }
 
-static void perf_session__id_header_size(struct perf_session *session)
-{
-	struct perf_sample *data;
-	u64 sample_type = session->sample_type;
-	u16 size = 0;
-
-	if (!session->sample_id_all)
-		goto out;
-
-	if (sample_type & PERF_SAMPLE_TID)
-		size += sizeof(data->tid) * 2;
-
-	if (sample_type & PERF_SAMPLE_TIME)
-		size += sizeof(data->time);
-
-	if (sample_type & PERF_SAMPLE_ID)
-		size += sizeof(data->id);
-
-	if (sample_type & PERF_SAMPLE_STREAM_ID)
-		size += sizeof(data->stream_id);
-
-	if (sample_type & PERF_SAMPLE_CPU)
-		size += sizeof(data->cpu) * 2;
-out:
-	session->id_hdr_size = size;
-}
-
 void perf_session__update_sample_type(struct perf_session *self)
 {
 	self->sample_type = perf_evlist__sample_type(self->evlist);
 	self->sample_size = __perf_evsel__sample_size(self->sample_type);
 	self->sample_id_all = perf_evlist__sample_id_all(self->evlist);
-	perf_session__id_header_size(self);
+	self->id_hdr_size = perf_evlist__id_hdr_size(self->evlist);
+	self->host_machine.id_hdr_size = self->id_hdr_size;
 }
 
 int perf_session__create_kernel_maps(struct perf_session *self)
@@ -130,18 +105,26 @@ static void perf_session__destroy_kernel_maps(struct perf_session *self)
 
 struct perf_session *perf_session__new(const char *filename, int mode,
 				       bool force, bool repipe,
-				       struct perf_event_ops *ops)
+				       struct perf_tool *tool)
 {
-	size_t len = filename ? strlen(filename) + 1 : 0;
-	struct perf_session *self = zalloc(sizeof(*self) + len);
+	struct perf_session *self;
+	struct stat st;
+	size_t len;
+
+	if (!filename || !strlen(filename)) {
+		if (!fstat(STDIN_FILENO, &st) && S_ISFIFO(st.st_mode))
+			filename = "-";
+		else
+			filename = "perf.data";
+	}
+
+	len = strlen(filename);
+	self = zalloc(sizeof(*self) + len);
 
 	if (self == NULL)
 		goto out;
 
 	memcpy(self->filename, filename, len);
-	self->threads = RB_ROOT;
-	INIT_LIST_HEAD(&self->dead_threads);
-	self->last_match = NULL;
 	/*
 	 * On 64bit we can mmap the data file in one go. No need for tiny mmap
 	 * slices. On 32bit we use 32MB.
@@ -171,10 +154,10 @@ struct perf_session *perf_session__new(const char *filename, int mode,
 		goto out_delete;
 	}
 
-	if (ops && ops->ordering_requires_timestamps &&
-	    ops->ordered_samples && !self->sample_id_all) {
+	if (tool && tool->ordering_requires_timestamps &&
+	    tool->ordered_samples && !self->sample_id_all) {
 		dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n");
-		ops->ordered_samples = false;
+		tool->ordered_samples = false;
 	}
 
 out:
@@ -184,17 +167,22 @@ out_delete:
 	return NULL;
 }
 
-static void perf_session__delete_dead_threads(struct perf_session *self)
+static void machine__delete_dead_threads(struct machine *machine)
 {
 	struct thread *n, *t;
 
-	list_for_each_entry_safe(t, n, &self->dead_threads, node) {
+	list_for_each_entry_safe(t, n, &machine->dead_threads, node) {
 		list_del(&t->node);
 		thread__delete(t);
 	}
 }
 
-static void perf_session__delete_threads(struct perf_session *self)
+static void perf_session__delete_dead_threads(struct perf_session *session)
+{
+	machine__delete_dead_threads(&session->host_machine);
+}
+
+static void machine__delete_threads(struct machine *self)
 {
 	struct rb_node *nd = rb_first(&self->threads);
 
@@ -207,6 +195,11 @@ static void perf_session__delete_threads(struct perf_session *self)
 	}
 }
 
+static void perf_session__delete_threads(struct perf_session *session)
+{
+	machine__delete_threads(&session->host_machine);
+}
+
 void perf_session__delete(struct perf_session *self)
 {
 	perf_session__destroy_kernel_maps(self);
@@ -217,7 +210,7 @@ void perf_session__delete(struct perf_session *self)
 	free(self);
 }
 
-void perf_session__remove_thread(struct perf_session *self, struct thread *th)
+void machine__remove_thread(struct machine *self, struct thread *th)
 {
 	self->last_match = NULL;
 	rb_erase(&th->rb_node, &self->threads);
@@ -236,16 +229,16 @@ static bool symbol__match_parent_regex(struct symbol *sym)
 	return 0;
 }
 
-int perf_session__resolve_callchain(struct perf_session *self,
-				    struct thread *thread,
-				    struct ip_callchain *chain,
-				    struct symbol **parent)
+int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel,
+			       struct thread *thread,
+			       struct ip_callchain *chain,
+			       struct symbol **parent)
 {
 	u8 cpumode = PERF_RECORD_MISC_USER;
 	unsigned int i;
 	int err;
 
-	callchain_cursor_reset(&self->callchain_cursor);
+	callchain_cursor_reset(&evsel->hists.callchain_cursor);
 
 	for (i = 0; i < chain->nr; i++) {
 		u64 ip;
@@ -272,7 +265,7 @@ int perf_session__resolve_callchain(struct perf_session *self,
 
 		al.filtered = false;
 		thread__find_addr_location(thread, self, cpumode,
-				MAP__FUNCTION, thread->pid, ip, &al, NULL);
+				MAP__FUNCTION, ip, &al, NULL);
 		if (al.sym != NULL) {
 			if (sort__has_parent && !*parent &&
 			    symbol__match_parent_regex(al.sym))
@@ -281,7 +274,7 @@ int perf_session__resolve_callchain(struct perf_session *self,
 			break;
 		}
 
-		err = callchain_cursor_append(&self->callchain_cursor,
+		err = callchain_cursor_append(&evsel->hists.callchain_cursor,
 					      ip, al.map, al.sym);
 		if (err)
 			return err;
@@ -290,75 +283,91 @@ int perf_session__resolve_callchain(struct perf_session *self,
 	return 0;
 }
 
-static int process_event_synth_stub(union perf_event *event __used,
+static int process_event_synth_tracing_data_stub(union perf_event *event __used,
 				    struct perf_session *session __used)
+{
+	dump_printf(": unhandled!\n");
+	return 0;
+}
+
+static int process_event_synth_attr_stub(union perf_event *event __used,
+					 struct perf_evlist **pevlist __used)
 {
 	dump_printf(": unhandled!\n");
 	return 0;
 }
 
-static int process_event_sample_stub(union perf_event *event __used,
+static int process_event_sample_stub(struct perf_tool *tool __used,
+				     union perf_event *event __used,
 				     struct perf_sample *sample __used,
 				     struct perf_evsel *evsel __used,
-				     struct perf_session *session __used)
+				     struct machine *machine __used)
 {
 	dump_printf(": unhandled!\n");
 	return 0;
 }
 
-static int process_event_stub(union perf_event *event __used,
+static int process_event_stub(struct perf_tool *tool __used,
+			      union perf_event *event __used,
 			      struct perf_sample *sample __used,
-			      struct perf_session *session __used)
+			      struct machine *machine __used)
 {
 	dump_printf(": unhandled!\n");
 	return 0;
 }
 
-static int process_finished_round_stub(union perf_event *event __used,
-				       struct perf_session *session __used,
-				       struct perf_event_ops *ops __used)
+static int process_finished_round_stub(struct perf_tool *tool __used,
+				       union perf_event *event __used,
+				       struct perf_session *perf_session __used)
 {
 	dump_printf(": unhandled!\n");
 	return 0;
 }
 
-static int process_finished_round(union perf_event *event,
-				  struct perf_session *session,
-				  struct perf_event_ops *ops);
+static int process_event_type_stub(struct perf_tool *tool __used,
+				   union perf_event *event __used)
+{
+	dump_printf(": unhandled!\n");
+	return 0;
+}
 
-static void perf_event_ops__fill_defaults(struct perf_event_ops *handler)
+static int process_finished_round(struct perf_tool *tool,
+				  union perf_event *event,
+				  struct perf_session *session);
+
+static void perf_tool__fill_defaults(struct perf_tool *tool)
 {
-	if (handler->sample == NULL)
-		handler->sample = process_event_sample_stub;
-	if (handler->mmap == NULL)
-		handler->mmap = process_event_stub;
-	if (handler->comm == NULL)
-		handler->comm = process_event_stub;
-	if (handler->fork == NULL)
-		handler->fork = process_event_stub;
-	if (handler->exit == NULL)
-		handler->exit = process_event_stub;
-	if (handler->lost == NULL)
-		handler->lost = perf_event__process_lost;
-	if (handler->read == NULL)
-		handler->read = process_event_stub;
-	if (handler->throttle == NULL)
-		handler->throttle = process_event_stub;
-	if (handler->unthrottle == NULL)
-		handler->unthrottle = process_event_stub;
-	if (handler->attr == NULL)
-		handler->attr = process_event_synth_stub;
-	if (handler->event_type == NULL)
-		handler->event_type = process_event_synth_stub;
-	if (handler->tracing_data == NULL)
-		handler->tracing_data = process_event_synth_stub;
-	if (handler->build_id == NULL)
-		handler->build_id = process_event_synth_stub;
-	if (handler->finished_round == NULL) {
-		if (handler->ordered_samples)
-			handler->finished_round = process_finished_round;
+	if (tool->sample == NULL)
+		tool->sample = process_event_sample_stub;
+	if (tool->mmap == NULL)
+		tool->mmap = process_event_stub;
+	if (tool->comm == NULL)
+		tool->comm = process_event_stub;
+	if (tool->fork == NULL)
+		tool->fork = process_event_stub;
+	if (tool->exit == NULL)
+		tool->exit = process_event_stub;
+	if (tool->lost == NULL)
+		tool->lost = perf_event__process_lost;
+	if (tool->read == NULL)
+		tool->read = process_event_sample_stub;
+	if (tool->throttle == NULL)
+		tool->throttle = process_event_stub;
+	if (tool->unthrottle == NULL)
+		tool->unthrottle = process_event_stub;
+	if (tool->attr == NULL)
+		tool->attr = process_event_synth_attr_stub;
+	if (tool->event_type == NULL)
+		tool->event_type = process_event_type_stub;
+	if (tool->tracing_data == NULL)
+		tool->tracing_data = process_event_synth_tracing_data_stub;
+	if (tool->build_id == NULL)
+		tool->build_id = process_finished_round_stub;
+	if (tool->finished_round == NULL) {
+		if (tool->ordered_samples)
+			tool->finished_round = process_finished_round;
 		else
-			handler->finished_round = process_finished_round_stub;
+			tool->finished_round = process_finished_round_stub;
 	}
 }
 
@@ -490,11 +499,11 @@ static void perf_session_free_sample_buffers(struct perf_session *session)
 static int perf_session_deliver_event(struct perf_session *session,
 				      union perf_event *event,
 				      struct perf_sample *sample,
-				      struct perf_event_ops *ops,
+				      struct perf_tool *tool,
 				      u64 file_offset);
 
 static void flush_sample_queue(struct perf_session *s,
-			       struct perf_event_ops *ops)
+			       struct perf_tool *tool)
 {
 	struct ordered_samples *os = &s->ordered_samples;
 	struct list_head *head = &os->samples;
@@ -505,7 +514,7 @@ static void flush_sample_queue(struct perf_session *s,
 	unsigned idx = 0, progress_next = os->nr_samples / 16;
 	int ret;
 
-	if (!ops->ordered_samples || !limit)
+	if (!tool->ordered_samples || !limit)
 		return;
 
 	list_for_each_entry_safe(iter, tmp, head, list) {
@@ -516,7 +525,7 @@ static void flush_sample_queue(struct perf_session *s,
 		if (ret)
 			pr_err("Can't parse sample, err = %d\n", ret);
 		else
-			perf_session_deliver_event(s, iter->event, &sample, ops,
+			perf_session_deliver_event(s, iter->event, &sample, tool,
 						   iter->file_offset);
 
 		os->last_flush = iter->timestamp;
@@ -578,11 +587,11 @@ static void flush_sample_queue(struct perf_session *s,
  *      Flush every events below timestamp 7
  *      etc...
  */
-static int process_finished_round(union perf_event *event __used,
-				  struct perf_session *session,
-				  struct perf_event_ops *ops)
+static int process_finished_round(struct perf_tool *tool,
+				  union perf_event *event __used,
+				  struct perf_session *session)
 {
-	flush_sample_queue(session, ops);
+	flush_sample_queue(session, tool);
 	session->ordered_samples.next_flush = session->ordered_samples.max_timestamp;
 
 	return 0;
@@ -737,13 +746,26 @@ static void dump_sample(struct perf_session *session, union perf_event *event,
 	callchain__printf(sample);
 }
 
+static struct machine *
+	perf_session__find_machine_for_cpumode(struct perf_session *session,
+					       union perf_event *event)
+{
+	const u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+
+	if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL && perf_guest)
+		return perf_session__find_machine(session, event->ip.pid);
+
+	return perf_session__find_host_machine(session);
+}
+
 static int perf_session_deliver_event(struct perf_session *session,
 				      union perf_event *event,
 				      struct perf_sample *sample,
-				      struct perf_event_ops *ops,
+				      struct perf_tool *tool,
 				      u64 file_offset)
 {
 	struct perf_evsel *evsel;
+	struct machine *machine;
 
 	dump_event(session, event, file_offset, sample);
 
@@ -765,6 +787,8 @@ static int perf_session_deliver_event(struct perf_session *session,
 		hists__inc_nr_events(&evsel->hists, event->header.type);
 	}
 
+	machine = perf_session__find_machine_for_cpumode(session, event);
+
 	switch (event->header.type) {
 	case PERF_RECORD_SAMPLE:
 		dump_sample(session, event, sample);
@@ -772,23 +796,25 @@ static int perf_session_deliver_event(struct perf_session *session,
 			++session->hists.stats.nr_unknown_id;
 			return -1;
 		}
-		return ops->sample(event, sample, evsel, session);
+		return tool->sample(tool, event, sample, evsel, machine);
 	case PERF_RECORD_MMAP:
-		return ops->mmap(event, sample, session);
+		return tool->mmap(tool, event, sample, machine);
 	case PERF_RECORD_COMM:
-		return ops->comm(event, sample, session);
+		return tool->comm(tool, event, sample, machine);
 	case PERF_RECORD_FORK:
-		return ops->fork(event, sample, session);
+		return tool->fork(tool, event, sample, machine);
 	case PERF_RECORD_EXIT:
-		return ops->exit(event, sample, session);
+		return tool->exit(tool, event, sample, machine);
 	case PERF_RECORD_LOST:
-		return ops->lost(event, sample, session);
+		if (tool->lost == perf_event__process_lost)
+			session->hists.stats.total_lost += event->lost.lost;
+		return tool->lost(tool, event, sample, machine);
 	case PERF_RECORD_READ:
-		return ops->read(event, sample, session);
+		return tool->read(tool, event, sample, evsel, machine);
 	case PERF_RECORD_THROTTLE:
-		return ops->throttle(event, sample, session);
+		return tool->throttle(tool, event, sample, machine);
 	case PERF_RECORD_UNTHROTTLE:
-		return ops->unthrottle(event, sample, session);
+		return tool->unthrottle(tool, event, sample, machine);
 	default:
 		++session->hists.stats.nr_unknown_events;
 		return -1;
@@ -812,24 +838,29 @@ static int perf_session__preprocess_sample(struct perf_session *session,
 }
 
 static int perf_session__process_user_event(struct perf_session *session, union perf_event *event,
-					    struct perf_event_ops *ops, u64 file_offset)
+					    struct perf_tool *tool, u64 file_offset)
 {
+	int err;
+
 	dump_event(session, event, file_offset, NULL);
 
 	/* These events are processed right away */
 	switch (event->header.type) {
 	case PERF_RECORD_HEADER_ATTR:
-		return ops->attr(event, session);
+		err = tool->attr(event, &session->evlist);
+		if (err == 0)
+			perf_session__update_sample_type(session);
+		return err;
 	case PERF_RECORD_HEADER_EVENT_TYPE:
-		return ops->event_type(event, session);
+		return tool->event_type(tool, event);
 	case PERF_RECORD_HEADER_TRACING_DATA:
 		/* setup for reading amidst mmap */
 		lseek(session->fd, file_offset, SEEK_SET);
-		return ops->tracing_data(event, session);
+		return tool->tracing_data(event, session);
 	case PERF_RECORD_HEADER_BUILD_ID:
-		return ops->build_id(event, session);
+		return tool->build_id(tool, event, session);
 	case PERF_RECORD_FINISHED_ROUND:
-		return ops->finished_round(event, session, ops);
+		return tool->finished_round(tool, event, session);
 	default:
 		return -EINVAL;
 	}
@@ -837,7 +868,7 @@ static int perf_session__process_user_event(struct perf_session *session, union
 
 static int perf_session__process_event(struct perf_session *session,
 				       union perf_event *event,
-				       struct perf_event_ops *ops,
+				       struct perf_tool *tool,
 				       u64 file_offset)
 {
 	struct perf_sample sample;
@@ -853,7 +884,7 @@ static int perf_session__process_event(struct perf_session *session,
 	hists__inc_nr_events(&session->hists, event->header.type);
 
 	if (event->header.type >= PERF_RECORD_USER_TYPE_START)
-		return perf_session__process_user_event(session, event, ops, file_offset);
+		return perf_session__process_user_event(session, event, tool, file_offset);
 
 	/*
 	 * For all kernel events we get the sample data
@@ -866,14 +897,14 @@ static int perf_session__process_event(struct perf_session *session,
 	if (perf_session__preprocess_sample(session, event, &sample))
 		return 0;
 
-	if (ops->ordered_samples) {
+	if (tool->ordered_samples) {
 		ret = perf_session_queue_event(session, event, &sample,
 					       file_offset);
 		if (ret != -ETIME)
 			return ret;
 	}
 
-	return perf_session_deliver_event(session, event, &sample, ops,
+	return perf_session_deliver_event(session, event, &sample, tool,
 					  file_offset);
 }
 
@@ -884,6 +915,11 @@ void perf_event_header__bswap(struct perf_event_header *self)
 	self->size = bswap_16(self->size);
 }
 
+struct thread *perf_session__findnew(struct perf_session *session, pid_t pid)
+{
+	return machine__findnew_thread(&session->host_machine, pid);
+}
+
 static struct thread *perf_session__register_idle_thread(struct perf_session *self)
 {
 	struct thread *thread = perf_session__findnew(self, 0);
@@ -897,9 +933,9 @@ static struct thread *perf_session__register_idle_thread(struct perf_session *se
 }
 
 static void perf_session__warn_about_errors(const struct perf_session *session,
-					    const struct perf_event_ops *ops)
+					    const struct perf_tool *tool)
 {
-	if (ops->lost == perf_event__process_lost &&
+	if (tool->lost == perf_event__process_lost &&
 	    session->hists.stats.nr_events[PERF_RECORD_LOST] != 0) {
 		ui__warning("Processed %d events and lost %d chunks!\n\n"
 			    "Check IO/CPU overload!\n\n",
@@ -934,7 +970,7 @@ static void perf_session__warn_about_errors(const struct perf_session *session,
 volatile int session_done;
 
 static int __perf_session__process_pipe_events(struct perf_session *self,
-					       struct perf_event_ops *ops)
+					       struct perf_tool *tool)
 {
 	union perf_event event;
 	uint32_t size;
@@ -943,7 +979,7 @@ static int __perf_session__process_pipe_events(struct perf_session *self,
 	int err;
 	void *p;
 
-	perf_event_ops__fill_defaults(ops);
+	perf_tool__fill_defaults(tool);
 
 	head = 0;
 more:
@@ -979,8 +1015,7 @@ more:
 		}
 	}
 
-	if (size == 0 ||
-	    (skip = perf_session__process_event(self, &event, ops, head)) < 0) {
+	if ((skip = perf_session__process_event(self, &event, tool, head)) < 0) {
 		dump_printf("%#" PRIx64 " [%#x]: skipping unknown header type: %d\n",
 			    head, event.header.size, event.header.type);
 		/*
@@ -1003,7 +1038,7 @@ more:
 done:
 	err = 0;
 out_err:
-	perf_session__warn_about_errors(self, ops);
+	perf_session__warn_about_errors(self, tool);
 	perf_session_free_sample_buffers(self);
 	return err;
 }
@@ -1034,7 +1069,7 @@ fetch_mmaped_event(struct perf_session *session,
 
 int __perf_session__process_events(struct perf_session *session,
 				   u64 data_offset, u64 data_size,
-				   u64 file_size, struct perf_event_ops *ops)
+				   u64 file_size, struct perf_tool *tool)
 {
 	u64 head, page_offset, file_offset, file_pos, progress_next;
 	int err, mmap_prot, mmap_flags, map_idx = 0;
@@ -1043,7 +1078,7 @@ int __perf_session__process_events(struct perf_session *session,
 	union perf_event *event;
 	uint32_t size;
 
-	perf_event_ops__fill_defaults(ops);
+	perf_tool__fill_defaults(tool);
 
 	page_size = sysconf(_SC_PAGESIZE);
 
@@ -1098,7 +1133,7 @@ more:
 	size = event->header.size;
 
 	if (size == 0 ||
-	    perf_session__process_event(session, event, ops, file_pos) < 0) {
+	    perf_session__process_event(session, event, tool, file_pos) < 0) {
 		dump_printf("%#" PRIx64 " [%#x]: skipping unknown header type: %d\n",
 			    file_offset + head, event->header.size,
 			    event->header.type);
@@ -1127,15 +1162,15 @@ more:
 	err = 0;
 	/* do the final flush for ordered samples */
 	session->ordered_samples.next_flush = ULLONG_MAX;
-	flush_sample_queue(session, ops);
+	flush_sample_queue(session, tool);
 out_err:
-	perf_session__warn_about_errors(session, ops);
+	perf_session__warn_about_errors(session, tool);
 	perf_session_free_sample_buffers(session);
 	return err;
 }
 
 int perf_session__process_events(struct perf_session *self,
-				 struct perf_event_ops *ops)
+				 struct perf_tool *tool)
 {
 	int err;
 
@@ -1146,9 +1181,9 @@ int perf_session__process_events(struct perf_session *self,
 		err = __perf_session__process_events(self,
 						     self->header.data_offset,
 						     self->header.data_size,
-						     self->size, ops);
+						     self->size, tool);
 	else
-		err = __perf_session__process_pipe_events(self, ops);
+		err = __perf_session__process_pipe_events(self, tool);
 
 	return err;
 }
@@ -1163,9 +1198,8 @@ bool perf_session__has_traces(struct perf_session *self, const char *msg)
 	return true;
 }
 
-int perf_session__set_kallsyms_ref_reloc_sym(struct map **maps,
-					     const char *symbol_name,
-					     u64 addr)
+int maps__set_kallsyms_ref_reloc_sym(struct map **maps,
+				     const char *symbol_name, u64 addr)
 {
 	char *bracket;
 	enum map_type i;
@@ -1224,6 +1258,27 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp)
 	return ret;
 }
 
+size_t perf_session__fprintf(struct perf_session *session, FILE *fp)
+{
+	/*
+	 * FIXME: Here we have to actually print all the machines in this
+	 * session, not just the host...
+	 */
+	return machine__fprintf(&session->host_machine, fp);
+}
+
+void perf_session__remove_thread(struct perf_session *session,
+				 struct thread *th)
+{
+	/*
+	 * FIXME: This one makes no sense, we need to remove the thread from
+	 * the machine it belongs to, perf_session can have many machines, so
+	 * doing it always on ->host_machine is wrong. Fix when auditing all
+	 * the 'perf kvm' code.
+	 */
+	machine__remove_thread(&session->host_machine, th);
+}
+
 struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session,
 						   unsigned int type)
 {
@@ -1236,17 +1291,16 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session,
 	return NULL;
 }
 
-void perf_session__print_ip(union perf_event *event,
-			    struct perf_sample *sample,
-			    struct perf_session *session,
-			    int print_sym, int print_dso)
+void perf_event__print_ip(union perf_event *event, struct perf_sample *sample,
+			  struct machine *machine, struct perf_evsel *evsel,
+			  int print_sym, int print_dso)
 {
 	struct addr_location al;
 	const char *symname, *dsoname;
-	struct callchain_cursor *cursor = &session->callchain_cursor;
+	struct callchain_cursor *cursor = &evsel->hists.callchain_cursor;
 	struct callchain_cursor_node *node;
 
-	if (perf_event__preprocess_sample(event, session, &al, sample,
+	if (perf_event__preprocess_sample(event, machine, &al, sample,
 					  NULL) < 0) {
 		error("problem processing %d event, skipping it.\n",
 		      event->header.type);
@@ -1255,7 +1309,7 @@ void perf_session__print_ip(union perf_event *event,
 
 	if (symbol_conf.use_callchain && sample->callchain) {
 
-		if (perf_session__resolve_callchain(session, al.thread,
+		if (machine__resolve_callchain(machine, evsel, al.thread,
 						    sample->callchain, NULL) != 0) {
 			if (verbose)
 				error("Failed to resolve callchain. Skipping\n");
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 6e393c98eb34..37bc38381fb6 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -30,9 +30,6 @@ struct perf_session {
 	struct perf_header header;
 	unsigned long size;
 	unsigned long mmap_window;
-	struct rb_root threads;
-	struct list_head dead_threads;
-	struct thread *last_match;
 	struct machine host_machine;
 	struct rb_root machines;
 	struct perf_evlist *evlist;
@@ -53,65 +50,31 @@ struct perf_session {
 	int cwdlen;
 	char *cwd;
 	struct ordered_samples ordered_samples;
-	struct callchain_cursor callchain_cursor;
-	char filename[0];
+	char filename[1];
 };
 
-struct perf_evsel;
-struct perf_event_ops;
-
-typedef int (*event_sample)(union perf_event *event, struct perf_sample *sample,
-			    struct perf_evsel *evsel, struct perf_session *session);
-typedef int (*event_op)(union perf_event *self, struct perf_sample *sample,
-			struct perf_session *session);
-typedef int (*event_synth_op)(union perf_event *self,
-			      struct perf_session *session);
-typedef int (*event_op2)(union perf_event *self, struct perf_session *session,
-			 struct perf_event_ops *ops);
-
-struct perf_event_ops {
-	event_sample	sample;
-	event_op	mmap,
-			comm,
-			fork,
-			exit,
-			lost,
-			read,
-			throttle,
-			unthrottle;
-	event_synth_op	attr,
-			event_type,
-			tracing_data,
-			build_id;
-	event_op2	finished_round;
-	bool		ordered_samples;
-	bool		ordering_requires_timestamps;
-};
+struct perf_tool;
 
 struct perf_session *perf_session__new(const char *filename, int mode,
 				       bool force, bool repipe,
-				       struct perf_event_ops *ops);
+				       struct perf_tool *tool);
 void perf_session__delete(struct perf_session *self);
 
 void perf_event_header__bswap(struct perf_event_header *self);
 
 int __perf_session__process_events(struct perf_session *self,
 				   u64 data_offset, u64 data_size, u64 size,
-				   struct perf_event_ops *ops);
+				   struct perf_tool *tool);
 int perf_session__process_events(struct perf_session *self,
-				 struct perf_event_ops *event_ops);
+				 struct perf_tool *tool);
 
-int perf_session__resolve_callchain(struct perf_session *self,
+int perf_session__resolve_callchain(struct perf_session *self, struct perf_evsel *evsel,
 				    struct thread *thread,
 				    struct ip_callchain *chain,
 				    struct symbol **parent);
 
 bool perf_session__has_traces(struct perf_session *self, const char *msg);
 
-int perf_session__set_kallsyms_ref_reloc_sym(struct map **maps,
-					     const char *symbol_name,
-					     u64 addr);
-
 void mem_bswap_64(void *src, int byte_size);
 void perf_event__attr_swap(struct perf_event_attr *attr);
 
@@ -144,12 +107,16 @@ struct machine *perf_session__findnew_machine(struct perf_session *self, pid_t p
144 107
145static inline 108static inline
146void perf_session__process_machines(struct perf_session *self, 109void perf_session__process_machines(struct perf_session *self,
110 struct perf_tool *tool,
147 machine__process_t process) 111 machine__process_t process)
148{ 112{
149 process(&self->host_machine, self); 113 process(&self->host_machine, tool);
150 return machines__process(&self->machines, process, self); 114 return machines__process(&self->machines, process, tool);
151} 115}
152 116
117struct thread *perf_session__findnew(struct perf_session *self, pid_t pid);
118size_t perf_session__fprintf(struct perf_session *self, FILE *fp);
119
153size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp); 120size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp);
154 121
155size_t perf_session__fprintf_dsos_buildid(struct perf_session *self, 122size_t perf_session__fprintf_dsos_buildid(struct perf_session *self,
@@ -167,13 +134,20 @@ static inline int perf_session__parse_sample(struct perf_session *session,
167 session->header.needs_swap); 134 session->header.needs_swap);
168} 135}
169 136
137static inline int perf_session__synthesize_sample(struct perf_session *session,
138 union perf_event *event,
139 const struct perf_sample *sample)
140{
141 return perf_event__synthesize_sample(event, session->sample_type,
142 sample, session->header.needs_swap);
143}
144
170struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, 145struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session,
171 unsigned int type); 146 unsigned int type);
172 147
173void perf_session__print_ip(union perf_event *event, 148void perf_event__print_ip(union perf_event *event, struct perf_sample *sample,
174 struct perf_sample *sample, 149 struct machine *machine, struct perf_evsel *evsel,
175 struct perf_session *session, 150 int print_sym, int print_dso);
176 int print_sym, int print_dso);
177 151
178int perf_session__cpu_bitmap(struct perf_session *session, 152int perf_session__cpu_bitmap(struct perf_session *session,
179 const char *cpu_list, unsigned long *cpu_bitmap); 153 const char *cpu_list, unsigned long *cpu_bitmap);
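
The perf_session__synthesize_sample() inline added above is the write-side twin of perf_session__parse_sample() a few lines earlier: one decodes a raw event into a struct perf_sample, the other packs a perf_sample back into the event buffer, both keyed off the session's sample_type and byte-swap flag. A minimal sketch of the intended round trip; the surrounding error handling is illustrative, not lifted from the perf sources:

	struct perf_sample sample;

	/* decode the on-file event ... */
	if (perf_session__parse_sample(session, event, &sample) < 0)
		return -1;

	/* ... optionally rewrite fields of 'sample' ... */

	/* ... and re-encode it in place, honoring sample_type */
	if (perf_session__synthesize_sample(session, event, &sample) < 0)
		return -1;
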
diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py
index 95d370074928..36d4c5619575 100644
--- a/tools/perf/util/setup.py
+++ b/tools/perf/util/setup.py
@@ -27,7 +27,8 @@ build_tmp = getenv('PYTHON_EXTBUILD_TMP')
 perf = Extension('perf',
 		  sources = ['util/python.c', 'util/ctype.c', 'util/evlist.c',
 			     'util/evsel.c', 'util/cpumap.c', 'util/thread_map.c',
-			     'util/util.c', 'util/xyarray.c', 'util/cgroup.c'],
+			     'util/util.c', 'util/xyarray.c', 'util/cgroup.c',
+			     'util/debugfs.c'],
 		  include_dirs = ['util/include'],
 		  extra_compile_args = cflags,
 		 )
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 632b50c7bc26..215d50f2042e 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -1757,7 +1757,7 @@ static int map_groups__set_modules_path_dir(struct map_groups *mg,
 		struct stat st;
 
 		/*sshfs might return bad dent->d_type, so we have to stat*/
-		sprintf(path, "%s/%s", dir_name, dent->d_name);
+		snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name);
 		if (stat(path, &st))
 			continue;
 
@@ -1766,8 +1766,6 @@ static int map_groups__set_modules_path_dir(struct map_groups *mg,
 			    !strcmp(dent->d_name, ".."))
 				continue;
 
-			snprintf(path, sizeof(path), "%s/%s",
-				 dir_name, dent->d_name);
 			ret = map_groups__set_modules_path_dir(mg, path);
 			if (ret < 0)
 				goto out;
@@ -1788,9 +1786,6 @@ static int map_groups__set_modules_path_dir(struct map_groups *mg,
 			if (map == NULL)
 				continue;
 
-			snprintf(path, sizeof(path), "%s/%s",
-				 dir_name, dent->d_name);
-
 			long_name = strdup(path);
 			if (long_name == NULL) {
 				ret = -1;
@@ -2609,10 +2604,10 @@ int symbol__init(void)
 	symbol_conf.initialized = true;
 	return 0;
 
-out_free_dso_list:
-	strlist__delete(symbol_conf.dso_list);
 out_free_comm_list:
 	strlist__delete(symbol_conf.comm_list);
+out_free_dso_list:
+	strlist__delete(symbol_conf.dso_list);
 	return -1;
 }
 
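
The sprintf() to snprintf() switch above closes a potential buffer overrun: dir_name plus a readdir()-supplied d_name can exceed the destination array, and snprintf() truncates at the buffer size instead of writing past it. It also lets the two later, now-redundant snprintf() calls go away, since path already holds the joined name by the time they ran. The contrast, sketched against the patched code's own variables (PATH_MAX stands in for whatever size the real buffer has):

	char path[PATH_MAX];

	/* unbounded: overruns 'path' if the joined name is too long */
	sprintf(path, "%s/%s", dir_name, dent->d_name);

	/* bounded: writes at most sizeof(path) - 1 bytes plus the NUL */
	snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name);
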
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 29f8d742e92f..123c2e14353e 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -68,6 +68,7 @@ struct strlist;
 
 struct symbol_conf {
 	unsigned short priv_size;
+	unsigned short nr_events;
 	bool try_vmlinux_path,
 	     use_modules,
 	     sort_by_name,
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index d5d3b22250f3..fb4b7ea6752f 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -61,7 +61,7 @@ static size_t thread__fprintf(struct thread *self, FILE *fp)
 	       map_groups__fprintf(&self->mg, verbose, fp);
 }
 
-struct thread *perf_session__findnew(struct perf_session *self, pid_t pid)
+struct thread *machine__findnew_thread(struct machine *self, pid_t pid)
 {
 	struct rb_node **p = &self->threads.rb_node;
 	struct rb_node *parent = NULL;
@@ -125,12 +125,12 @@ int thread__fork(struct thread *self, struct thread *parent)
 	return 0;
 }
 
-size_t perf_session__fprintf(struct perf_session *self, FILE *fp)
+size_t machine__fprintf(struct machine *machine, FILE *fp)
 {
 	size_t ret = 0;
 	struct rb_node *nd;
 
-	for (nd = rb_first(&self->threads); nd; nd = rb_next(nd)) {
+	for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) {
 		struct thread *pos = rb_entry(nd, struct thread, rb_node);
 
 		ret += thread__fprintf(pos, fp);
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index e5f2401c1b5e..70c2c13ff679 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -18,16 +18,14 @@ struct thread {
 	int comm_len;
 };
 
-struct perf_session;
+struct machine;
 
 void thread__delete(struct thread *self);
 
 int thread__set_comm(struct thread *self, const char *comm);
 int thread__comm_len(struct thread *self);
-struct thread *perf_session__findnew(struct perf_session *self, pid_t pid);
 void thread__insert_map(struct thread *self, struct map *map);
 int thread__fork(struct thread *self, struct thread *parent);
-size_t perf_session__fprintf(struct perf_session *self, FILE *fp);
 
 static inline struct map *thread__find_map(struct thread *self,
 					   enum map_type type, u64 addr)
@@ -35,14 +33,12 @@ static inline struct map *thread__find_map(struct thread *self,
 	return self ? map_groups__find(&self->mg, type, addr) : NULL;
 }
 
-void thread__find_addr_map(struct thread *self,
-			   struct perf_session *session, u8 cpumode,
-			   enum map_type type, pid_t pid, u64 addr,
+void thread__find_addr_map(struct thread *thread, struct machine *machine,
+			   u8 cpumode, enum map_type type, u64 addr,
 			   struct addr_location *al);
 
-void thread__find_addr_location(struct thread *self,
-				struct perf_session *session, u8 cpumode,
-				enum map_type type, pid_t pid, u64 addr,
+void thread__find_addr_location(struct thread *thread, struct machine *machine,
+				u8 cpumode, enum map_type type, u64 addr,
 				struct addr_location *al,
 				symbol_filter_t filter);
 #endif /* __PERF_THREAD_H */
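
With struct perf_session out of the picture, thread lookup and printing hang off struct machine, and machine__findnew_thread() keeps the usual "find or insert" walk over the pid-sorted rb-tree shown in thread.c above. The same findnew idiom over a plain binary tree, as a self-contained sketch (the node layout and helper name are stand-ins, not the perf structs):

	#include <stdlib.h>

	struct node { int pid; struct node *left, *right; };

	/* return the node for 'pid', inserting a fresh one if absent */
	static struct node *findnew(struct node **root, int pid)
	{
		struct node **p = root;

		while (*p) {
			if (pid == (*p)->pid)
				return *p;
			p = pid < (*p)->pid ? &(*p)->left : &(*p)->right;
		}
		*p = calloc(1, sizeof(**p));
		if (*p)
			(*p)->pid = pid;
		return *p;
	}
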
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
new file mode 100644
index 000000000000..b0e1aadba8d5
--- /dev/null
+++ b/tools/perf/util/tool.h
@@ -0,0 +1,50 @@
+#ifndef __PERF_TOOL_H
+#define __PERF_TOOL_H
+
+#include <stdbool.h>
+
+struct perf_session;
+union perf_event;
+struct perf_evlist;
+struct perf_evsel;
+struct perf_sample;
+struct perf_tool;
+struct machine;
+
+typedef int (*event_sample)(struct perf_tool *tool, union perf_event *event,
+			    struct perf_sample *sample,
+			    struct perf_evsel *evsel, struct machine *machine);
+
+typedef int (*event_op)(struct perf_tool *tool, union perf_event *event,
+			struct perf_sample *sample, struct machine *machine);
+
+typedef int (*event_attr_op)(union perf_event *event,
+			     struct perf_evlist **pevlist);
+typedef int (*event_simple_op)(struct perf_tool *tool, union perf_event *event);
+
+typedef int (*event_synth_op)(union perf_event *event,
+			      struct perf_session *session);
+
+typedef int (*event_op2)(struct perf_tool *tool, union perf_event *event,
+			 struct perf_session *session);
+
+struct perf_tool {
+	event_sample	sample,
+			read;
+	event_op	mmap,
+			comm,
+			fork,
+			exit,
+			lost,
+			throttle,
+			unthrottle;
+	event_attr_op	attr;
+	event_synth_op	tracing_data;
+	event_simple_op	event_type;
+	event_op2	finished_round,
+			build_id;
+	bool		ordered_samples;
+	bool		ordering_requires_timestamps;
+};
+
+#endif /* __PERF_TOOL_H */
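
struct perf_tool replaces the old perf_event_ops as the callback table a perf command hands to the session layer: the builtin embeds one, fills in only the handlers it needs, and the event loop dispatches through it, now passing the resolved struct machine rather than the whole session. A sketch of how a command might wire one up with designated initializers; the sample handler body is illustrative, and the stock perf_event__process_* handlers are assumed from the surrounding perf tree:

	static int sketch__process_sample(struct perf_tool *tool,
					  union perf_event *event,
					  struct perf_sample *sample,
					  struct perf_evsel *evsel,
					  struct machine *machine)
	{
		/* count, aggregate or print the decoded sample */
		return 0;
	}

	static struct perf_tool sketch_tool = {
		.sample          = sketch__process_sample,
		.mmap            = perf_event__process_mmap,
		.comm            = perf_event__process_comm,
		.ordered_samples = true,
	};
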
diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
index 399650967958..a248f3c2c60d 100644
--- a/tools/perf/util/top.h
+++ b/tools/perf/util/top.h
@@ -1,15 +1,17 @@
 #ifndef __PERF_TOP_H
 #define __PERF_TOP_H 1
 
+#include "tool.h"
 #include "types.h"
-#include "../perf.h"
 #include <stddef.h>
+#include <stdbool.h>
 
 struct perf_evlist;
 struct perf_evsel;
 struct perf_session;
 
 struct perf_top {
+	struct perf_tool tool;
 	struct perf_evlist *evlist;
 	/*
 	 * Symbols will be added here in perf_event__process_sample and will
@@ -23,10 +25,26 @@ struct perf_top {
 	int freq;
 	pid_t target_pid, target_tid;
 	bool hide_kernel_symbols, hide_user_symbols, zero;
+	bool system_wide;
+	bool use_tui, use_stdio;
+	bool sort_has_symbols;
+	bool dont_use_callchains;
+	bool kptr_restrict_warned;
+	bool vmlinux_warned;
+	bool inherit;
+	bool group;
+	bool sample_id_all_avail;
+	bool dump_symtab;
 	const char *cpu_list;
 	struct hist_entry *sym_filter_entry;
 	struct perf_evsel *sym_evsel;
 	struct perf_session *session;
+	struct winsize winsize;
+	unsigned int mmap_pages;
+	int default_interval;
+	int realtime_prio;
+	int sym_pcnt_filter;
+	const char *sym_filter;
 };
 
 size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size);
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index d2655f08bcc0..ac6830d8292b 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -18,7 +18,8 @@
  *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
-#define _GNU_SOURCE
+#include <ctype.h>
+#include "util.h"
 #include <dirent.h>
 #include <mntent.h>
 #include <stdio.h>
@@ -31,7 +32,6 @@
 #include <pthread.h>
 #include <fcntl.h>
 #include <unistd.h>
-#include <ctype.h>
 #include <errno.h>
 #include <stdbool.h>
 #include <linux/list.h>
@@ -44,10 +44,6 @@
 
 #define VERSION "0.5"
 
-#define _STR(x) #x
-#define STR(x) _STR(x)
-#define MAX_PATH 256
-
 #define TRACE_CTRL	"tracing_on"
 #define TRACE		"trace"
 #define AVAILABLE	"available_tracers"
@@ -73,26 +69,6 @@ struct events {
 };
 
 
-
-static void die(const char *fmt, ...)
-{
-	va_list ap;
-	int ret = errno;
-
-	if (errno)
-		perror("perf");
-	else
-		ret = -1;
-
-	va_start(ap, fmt);
-	fprintf(stderr, "  ");
-	vfprintf(stderr, fmt, ap);
-	va_end(ap);
-
-	fprintf(stderr, "\n");
-	exit(ret);
-}
-
 void *malloc_or_die(unsigned int size)
 {
 	void *data;
diff --git a/tools/perf/util/trace-event-scripting.c b/tools/perf/util/trace-event-scripting.c
index c9dcbec7d800..a3fdf55f317b 100644
--- a/tools/perf/util/trace-event-scripting.c
+++ b/tools/perf/util/trace-event-scripting.c
@@ -39,7 +39,7 @@ static int stop_script_unsupported(void)
 static void process_event_unsupported(union perf_event *event __unused,
 				      struct perf_sample *sample __unused,
 				      struct perf_evsel *evsel __unused,
-				      struct perf_session *session __unused,
+				      struct machine *machine __unused,
 				      struct thread *thread __unused)
 {
 }
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index a84100817649..58ae14c5baac 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -3,7 +3,11 @@
 
 #include <stdbool.h>
 #include "parse-events.h"
-#include "session.h"
+
+struct machine;
+struct perf_sample;
+union perf_event;
+struct thread;
 
 #define __unused __attribute__((unused))
 
@@ -292,7 +296,7 @@ struct scripting_ops {
 	void (*process_event) (union perf_event *event,
 			       struct perf_sample *sample,
 			       struct perf_evsel *evsel,
-			       struct perf_session *session,
+			       struct machine *machine,
 			       struct thread *thread);
 	int (*generate_script) (const char *outfile);
 };
diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c
index 0575905d1205..295a9c93f945 100644
--- a/tools/perf/util/ui/browsers/annotate.c
+++ b/tools/perf/util/ui/browsers/annotate.c
@@ -224,7 +224,7 @@ static bool annotate_browser__toggle_source(struct annotate_browser *browser)
 }
 
 static int annotate_browser__run(struct annotate_browser *self, int evidx,
-				 int nr_events, void(*timer)(void *arg),
+				 void(*timer)(void *arg),
 				 void *arg, int delay_secs)
 {
 	struct rb_node *nd = NULL;
@@ -328,8 +328,7 @@ static int annotate_browser__run(struct annotate_browser *self, int evidx,
 			notes = symbol__annotation(target);
 			pthread_mutex_lock(&notes->lock);
 
-			if (notes->src == NULL &&
-			    symbol__alloc_hist(target, nr_events) < 0) {
+			if (notes->src == NULL && symbol__alloc_hist(target) < 0) {
 				pthread_mutex_unlock(&notes->lock);
 				ui__warning("Not enough memory for annotating '%s' symbol!\n",
 					    target->name);
@@ -337,7 +336,7 @@ static int annotate_browser__run(struct annotate_browser *self, int evidx,
 			}
 
 			pthread_mutex_unlock(&notes->lock);
-			symbol__tui_annotate(target, ms->map, evidx, nr_events,
+			symbol__tui_annotate(target, ms->map, evidx,
 					     timer, arg, delay_secs);
 		}
 		continue;
@@ -358,15 +357,15 @@ out:
 	return key;
 }
 
-int hist_entry__tui_annotate(struct hist_entry *he, int evidx, int nr_events,
+int hist_entry__tui_annotate(struct hist_entry *he, int evidx,
 			     void(*timer)(void *arg), void *arg, int delay_secs)
 {
-	return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx, nr_events,
+	return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx,
 				    timer, arg, delay_secs);
 }
 
 int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx,
-			 int nr_events, void(*timer)(void *arg), void *arg,
+			 void(*timer)(void *arg), void *arg,
 			 int delay_secs)
 {
 	struct objdump_line *pos, *n;
@@ -419,8 +418,7 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx,
 	browser.b.nr_entries = browser.nr_entries;
 	browser.b.entries = &notes->src->source,
 	browser.b.width += 18; /* Percentage */
-	ret = annotate_browser__run(&browser, evidx, nr_events,
-				    timer, arg, delay_secs);
+	ret = annotate_browser__run(&browser, evidx, timer, arg, delay_secs);
 	list_for_each_entry_safe(pos, n, &notes->src->source, node) {
 		list_del(&pos->node);
 		objdump_line__free(pos);
diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c
index d0c94b459685..1212a386a033 100644
--- a/tools/perf/util/ui/browsers/hists.c
+++ b/tools/perf/util/ui/browsers/hists.c
@@ -1020,7 +1020,7 @@ do_annotate:
 		 * Don't let this be freed, say, by hists__decay_entry.
 		 */
 		he->used = true;
-		err = hist_entry__tui_annotate(he, evsel->idx, nr_events,
+		err = hist_entry__tui_annotate(he, evsel->idx,
 					       timer, arg, delay_secs);
 		he->used = false;
 		ui_browser__update_nr_entries(&browser->b, browser->hists->nr_entries);
diff --git a/tools/perf/util/ui/progress.c b/tools/perf/util/ui/progress.c
index 295e366b6311..13aa64e50e11 100644
--- a/tools/perf/util/ui/progress.c
+++ b/tools/perf/util/ui/progress.c
@@ -14,6 +14,9 @@ void ui_progress__update(u64 curr, u64 total, const char *title)
 	if (use_browser <= 0)
 		return;
 
+	if (total == 0)
+		return;
+
 	ui__refresh_dimensions(true);
 	pthread_mutex_lock(&ui__lock);
 	y = SLtt_Screen_Rows / 2 - 2;
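
The early return added to ui_progress__update() guards the bar arithmetic that follows: the filled width is derived by dividing curr by total, so a zero total (an empty input, or one whose size is not yet known) would otherwise divide by zero. Roughly, assuming the bar math looks like the sketch below (the exact expression in progress.c may differ):

	#include <stdint.h>

	static void progress_update(uint64_t curr, uint64_t total, int width)
	{
		if (total == 0)		/* nothing to report; avoids div-by-zero */
			return;
		int filled = (int)(curr * width / total);
		/* ... draw 'filled' of 'width' columns ... */
		(void)filled;
	}
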
diff --git a/tools/perf/util/usage.c b/tools/perf/util/usage.c
index e16bf9a707e8..d76d1c0ff98f 100644
--- a/tools/perf/util/usage.c
+++ b/tools/perf/util/usage.c
@@ -1,5 +1,8 @@
 /*
- * GIT - The information manager from hell
+ * usage.c
+ *
+ * Various reporting routines.
+ * Originally copied from GIT source.
  *
  * Copyright (C) Linus Torvalds, 2005
  */
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 0128906bac88..37be34dff798 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -245,4 +245,15 @@ int readn(int fd, void *buf, size_t size);
 #define _STR(x) #x
 #define STR(x) _STR(x)
 
+/*
+ * Determine whether some value is a power of two, where zero is
+ * *not* considered a power of two.
+ */
+
+static inline __attribute__((const))
+bool is_power_of_2(unsigned long n)
+{
+	return (n != 0 && ((n & (n - 1)) == 0));
+}
+
 #endif
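
is_power_of_2() uses the classic bit trick: n & (n - 1) clears the lowest set bit of n, so the result is zero exactly when n has a single bit set; the n != 0 term rules out zero, which would otherwise slip through. A standalone check (the helper is duplicated so the snippet compiles on its own):

	#include <assert.h>
	#include <stdbool.h>

	static inline bool is_power_of_2(unsigned long n)
	{
		return n != 0 && ((n & (n - 1)) == 0);
	}

	int main(void)
	{
		assert(!is_power_of_2(0));	/* excluded explicitly */
		assert(is_power_of_2(1));	/* 1 & 0 == 0 */
		assert(!is_power_of_2(6));	/* 6 & 5 == 4 */
		assert(is_power_of_2(64));	/* 64 & 63 == 0 */
		return 0;
	}
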
diff --git a/tools/perf/util/values.c b/tools/perf/util/values.c
index bdd33470b235..697c8b4e59cc 100644
--- a/tools/perf/util/values.c
+++ b/tools/perf/util/values.c
@@ -32,6 +32,7 @@ void perf_read_values_destroy(struct perf_read_values *values)
 
 	for (i = 0; i < values->threads; i++)
 		free(values->value[i]);
+	free(values->value);
 	free(values->pid);
 	free(values->tid);
 	free(values->counterrawid);
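
The added free(values->value) plugs a leak: value is an array of per-thread row pointers, so teardown has to free each row and then the pointer array itself; before this fix only the rows were released. The general pattern, as a standalone sketch (names are illustrative):

	#include <stdlib.h>

	static void matrix_destroy(int **rows, int nrows)
	{
		int i;

		for (i = 0; i < nrows; i++)
			free(rows[i]);	/* each row buffer */
		free(rows);		/* the array of row pointers */
	}
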
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 3ad0925d23a9..758e3b36d4cf 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -17,6 +17,8 @@
 #include <linux/pci.h>
 #include <linux/interrupt.h>
 #include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
 #include "irq.h"
 
 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
@@ -480,12 +482,76 @@ out:
 	return r;
 }
 
+/*
+ * We want to test whether the caller has been granted permissions to
+ * use this device.  To be able to configure and control the device,
+ * the user needs access to PCI configuration space and BAR resources.
+ * These are accessed through PCI sysfs.  PCI config space is often
+ * passed to the process calling this ioctl via file descriptor, so we
+ * can't rely on access to that file.  We can check for permissions
+ * on each of the BAR resource files, which is a pretty clear
+ * indicator that the user has been granted access to the device.
+ */
+static int probe_sysfs_permissions(struct pci_dev *dev)
+{
+#ifdef CONFIG_SYSFS
+	int i;
+	bool bar_found = false;
+
+	for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) {
+		char *kpath, *syspath;
+		struct path path;
+		struct inode *inode;
+		int r;
+
+		if (!pci_resource_len(dev, i))
+			continue;
+
+		kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
+		if (!kpath)
+			return -ENOMEM;
+
+		/* Per sysfs-rules, sysfs is always at /sys */
+		syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i);
+		kfree(kpath);
+		if (!syspath)
+			return -ENOMEM;
+
+		r = kern_path(syspath, LOOKUP_FOLLOW, &path);
+		kfree(syspath);
+		if (r)
+			return r;
+
+		inode = path.dentry->d_inode;
+
+		r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
+		path_put(&path);
+		if (r)
+			return r;
+
+		bar_found = true;
+	}
+
+	/* If no resources, probably something special */
+	if (!bar_found)
+		return -EPERM;
+
+	return 0;
+#else
+	return -EINVAL; /* No way to control the device without sysfs */
+#endif
+}
+
 static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 				      struct kvm_assigned_pci_dev *assigned_dev)
 {
 	int r = 0, idx;
 	struct kvm_assigned_dev_kernel *match;
 	struct pci_dev *dev;
+	u8 header_type;
+
+	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
+		return -EINVAL;
 
 	mutex_lock(&kvm->lock);
 	idx = srcu_read_lock(&kvm->srcu);
@@ -513,6 +579,18 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 		r = -EINVAL;
 		goto out_free;
 	}
+
+	/* Don't allow bridges to be assigned */
+	pci_read_config_byte(dev, PCI_HEADER_TYPE, &header_type);
+	if ((header_type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL) {
+		r = -EPERM;
+		goto out_put;
+	}
+
+	r = probe_sysfs_permissions(dev);
+	if (r)
+		goto out_put;
+
 	if (pci_enable_device(dev)) {
 		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
 		r = -EBUSY;
@@ -544,16 +622,14 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 
 	list_add(&match->list, &kvm->arch.assigned_dev_head);
 
-	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
-		if (!kvm->arch.iommu_domain) {
-			r = kvm_iommu_map_guest(kvm);
-			if (r)
-				goto out_list_del;
-		}
-		r = kvm_assign_device(kvm, match);
+	if (!kvm->arch.iommu_domain) {
+		r = kvm_iommu_map_guest(kvm);
 		if (r)
 			goto out_list_del;
 	}
+	r = kvm_assign_device(kvm, match);
+	if (r)
+		goto out_list_del;
 
 out:
 	srcu_read_unlock(&kvm->srcu, idx);
@@ -593,8 +669,7 @@ static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
 		goto out;
 	}
 
-	if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
-		kvm_deassign_device(kvm, match);
+	kvm_deassign_device(kvm, match);
 
 	kvm_free_assigned_device(kvm, match);
 
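
Taken together, the assignment path now insists on KVM_DEV_ASSIGN_ENABLE_IOMMU, rejects PCI bridges by header type, and requires the caller to hold read/write permission on the device's sysfs BAR resource files, which probe_sysfs_permissions() verifies in-kernel via kern_path() and inode_permission(). Userspace can pre-flight the same condition with access(2); a hedged sketch against the standard sysfs layout (the helper name is made up):

	#include <stdio.h>
	#include <unistd.h>

	/* 0 if we appear to have full access to the device's BARs */
	static int check_bar_access(const char *sbdf)	/* e.g. "0000:01:00.0" */
	{
		char path[256];
		int i, found = 0;

		for (i = 0; i <= 5; i++) {	/* standard PCI BARs 0..5 */
			snprintf(path, sizeof(path),
				 "/sys/bus/pci/devices/%s/resource%d", sbdf, i);
			if (access(path, F_OK))
				continue;	/* BAR not implemented */
			if (access(path, R_OK | W_OK))
				return -1;	/* present but not accessible */
			found = 1;
		}
		return found ? 0 : -1;
	}
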