-rw-r--r--  Documentation/RCU/whatisRCU.txt | 31
-rw-r--r--  Documentation/devicetree/bindings/i2c/ce4100-i2c.txt | 93
-rw-r--r--  Documentation/devicetree/bindings/rtc/rtc-cmos.txt | 28
-rw-r--r--  Documentation/devicetree/bindings/x86/ce4100.txt | 38
-rw-r--r--  Documentation/devicetree/bindings/x86/interrupt.txt | 26
-rw-r--r--  Documentation/devicetree/bindings/x86/timer.txt | 6
-rw-r--r--  Documentation/devicetree/booting-without-of.txt | 20
-rw-r--r--  Documentation/kernel-parameters.txt | 4
-rw-r--r--  Documentation/memory-barriers.txt | 58
-rw-r--r--  Documentation/rtc.txt | 29
-rw-r--r--  Documentation/spinlocks.txt | 24
-rw-r--r--  Documentation/trace/ftrace-design.txt | 7
-rw-r--r--  Documentation/trace/ftrace.txt | 151
-rw-r--r--  Documentation/trace/kprobetrace.txt | 16
-rw-r--r--  arch/alpha/include/asm/fcntl.h | 2
-rw-r--r--  arch/alpha/include/asm/futex.h | 29
-rw-r--r--  arch/alpha/include/asm/rwsem.h | 36
-rw-r--r--  arch/alpha/kernel/osf_sys.c | 36
-rw-r--r--  arch/alpha/kernel/time.c | 8
-rw-r--r--  arch/arm/include/asm/futex.h | 29
-rw-r--r--  arch/arm/kernel/time.c | 4
-rw-r--r--  arch/arm/mach-clps711x/include/mach/time.h | 2
-rw-r--r--  arch/blackfin/kernel/time.c | 6
-rw-r--r--  arch/cris/arch-v10/kernel/time.c | 4
-rw-r--r--  arch/cris/arch-v32/kernel/smp.c | 4
-rw-r--r--  arch/cris/arch-v32/kernel/time.c | 6
-rw-r--r--  arch/frv/include/asm/futex.h | 5
-rw-r--r--  arch/frv/kernel/futex.c | 14
-rw-r--r--  arch/frv/kernel/time.c | 14
-rw-r--r--  arch/h8300/kernel/time.c | 4
-rw-r--r--  arch/h8300/kernel/timer/timer8.c | 2
-rw-r--r--  arch/ia64/include/asm/futex.h | 15
-rw-r--r--  arch/ia64/include/asm/rwsem.h | 37
-rw-r--r--  arch/ia64/include/asm/xen/hypercall.h | 2
-rw-r--r--  arch/ia64/kernel/time.c | 19
-rw-r--r--  arch/ia64/xen/suspend.c | 9
-rw-r--r--  arch/ia64/xen/time.c | 13
-rw-r--r--  arch/m32r/kernel/time.c | 5
-rw-r--r--  arch/m68k/bvme6000/config.c | 4
-rw-r--r--  arch/m68k/kernel/time.c | 4
-rw-r--r--  arch/m68k/mvme147/config.c | 4
-rw-r--r--  arch/m68k/mvme16x/config.c | 4
-rw-r--r--  arch/m68k/sun3/sun3ints.c | 2
-rw-r--r--  arch/m68knommu/kernel/time.c | 8
-rw-r--r--  arch/microblaze/include/asm/futex.h | 31
-rw-r--r--  arch/microblaze/include/asm/pci-bridge.h | 12
-rw-r--r--  arch/microblaze/include/asm/prom.h | 15
-rw-r--r--  arch/microblaze/kernel/prom_parse.c | 77
-rw-r--r--  arch/microblaze/pci/pci-common.c | 1
-rw-r--r--  arch/mips/include/asm/futex.h | 39
-rw-r--r--  arch/mn10300/kernel/time.c | 6
-rw-r--r--  arch/parisc/hpux/sys_hpux.c | 65
-rw-r--r--  arch/parisc/include/asm/fcntl.h | 2
-rw-r--r--  arch/parisc/include/asm/futex.h | 24
-rw-r--r--  arch/parisc/kernel/time.c | 7
-rw-r--r--  arch/powerpc/include/asm/futex.h | 27
-rw-r--r--  arch/powerpc/include/asm/pci-bridge.h | 10
-rw-r--r--  arch/powerpc/include/asm/prom.h | 15
-rw-r--r--  arch/powerpc/include/asm/rwsem.h | 51
-rw-r--r--  arch/powerpc/kernel/pci-common.c | 1
-rw-r--r--  arch/powerpc/kernel/prom_parse.c | 84
-rw-r--r--  arch/powerpc/platforms/cell/spufs/syscalls.c | 2
-rw-r--r--  arch/s390/include/asm/futex.h | 12
-rw-r--r--  arch/s390/include/asm/rwsem.h | 63
-rw-r--r--  arch/s390/include/asm/uaccess.h | 4
-rw-r--r--  arch/s390/lib/uaccess.h | 8
-rw-r--r--  arch/s390/lib/uaccess_pt.c | 17
-rw-r--r--  arch/s390/lib/uaccess_std.c | 8
-rw-r--r--  arch/sh/include/asm/futex-irq.h | 24
-rw-r--r--  arch/sh/include/asm/futex.h | 11
-rw-r--r--  arch/sh/include/asm/rwsem.h | 56
-rw-r--r--  arch/sparc/include/asm/fcntl.h | 2
-rw-r--r--  arch/sparc/include/asm/futex_64.h | 20
-rw-r--r--  arch/sparc/include/asm/rwsem.h | 46
-rw-r--r--  arch/sparc/kernel/pcic.c | 4
-rw-r--r--  arch/sparc/kernel/time_32.c | 9
-rw-r--r--  arch/sparc/lib/atomic32.c | 2
-rw-r--r--  arch/tile/include/asm/futex.h | 27
-rw-r--r--  arch/um/Kconfig.common | 1
-rw-r--r--  arch/um/Kconfig.x86 | 5
-rw-r--r--  arch/um/drivers/mconsole_kern.c | 21
-rw-r--r--  arch/um/drivers/ubd_kern.c | 2
-rw-r--r--  arch/um/kernel/irq.c | 31
-rw-r--r--  arch/x86/Kconfig | 30
-rw-r--r--  arch/x86/Kconfig.cpu | 5
-rw-r--r--  arch/x86/ia32/ia32entry.S | 32
-rw-r--r--  arch/x86/include/asm/acpi.h | 10
-rw-r--r--  arch/x86/include/asm/amd_nb.h | 8
-rw-r--r--  arch/x86/include/asm/apic.h | 42
-rw-r--r--  arch/x86/include/asm/apicdef.h | 12
-rw-r--r--  arch/x86/include/asm/bootparam.h | 1
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 2
-rw-r--r--  arch/x86/include/asm/e820.h | 2
-rw-r--r--  arch/x86/include/asm/entry_arch.h | 5
-rw-r--r--  arch/x86/include/asm/frame.h | 6
-rw-r--r--  arch/x86/include/asm/futex.h | 22
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 24
-rw-r--r--  arch/x86/include/asm/init.h | 6
-rw-r--r--  arch/x86/include/asm/io_apic.h | 44
-rw-r--r--  arch/x86/include/asm/ipi.h | 8
-rw-r--r--  arch/x86/include/asm/irq.h | 3
-rw-r--r--  arch/x86/include/asm/irq_controller.h | 12
-rw-r--r--  arch/x86/include/asm/irq_vectors.h | 45
-rw-r--r--  arch/x86/include/asm/kdebug.h | 1
-rw-r--r--  arch/x86/include/asm/mpspec.h | 3
-rw-r--r--  arch/x86/include/asm/msr-index.h | 3
-rw-r--r--  arch/x86/include/asm/nmi.h | 1
-rw-r--r--  arch/x86/include/asm/numa.h | 52
-rw-r--r--  arch/x86/include/asm/numa_32.h | 7
-rw-r--r--  arch/x86/include/asm/numa_64.h | 23
-rw-r--r--  arch/x86/include/asm/olpc_ofw.h | 14
-rw-r--r--  arch/x86/include/asm/page_types.h | 9
-rw-r--r--  arch/x86/include/asm/processor.h | 4
-rw-r--r--  arch/x86/include/asm/prom.h | 70
-rw-r--r--  arch/x86/include/asm/rwsem.h | 80
-rw-r--r--  arch/x86/include/asm/smp.h | 20
-rw-r--r--  arch/x86/include/asm/system.h | 2
-rw-r--r--  arch/x86/include/asm/topology.h | 19
-rw-r--r--  arch/x86/include/asm/unistd_32.h | 5
-rw-r--r--  arch/x86/include/asm/unistd_64.h | 6
-rw-r--r--  arch/x86/include/asm/x86_init.h | 2
-rw-r--r--  arch/x86/include/asm/xen/hypercall.h | 15
-rw-r--r--  arch/x86/include/asm/xen/page.h | 47
-rw-r--r--  arch/x86/include/asm/xen/pci.h | 8
-rw-r--r--  arch/x86/kernel/Makefile | 5
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 8
-rw-r--r--  arch/x86/kernel/apb_timer.c | 60
-rw-r--r--  arch/x86/kernel/aperture_64.c | 33
-rw-r--r--  arch/x86/kernel/apic/apic.c | 150
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c | 4
-rw-r--r--  arch/x86/kernel/apic/apic_noop.c | 26
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 34
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 35
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c | 1
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 388
-rw-r--r--  arch/x86/kernel/apic/ipi.c | 12
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 21
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 10
-rw-r--r--  arch/x86/kernel/apic/summit_32.c | 47
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 2
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c | 2
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 2
-rw-r--r--  arch/x86/kernel/asm-offsets.c | 65
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 69
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 90
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 51
-rw-r--r--  arch/x86/kernel/cpu/common.c | 6
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 5
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 7
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 170
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 175
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 417
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 97
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 8
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c | 4
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 4
-rw-r--r--  arch/x86/kernel/devicetree.c | 441
-rw-r--r--  arch/x86/kernel/dumpstack.c | 25
-rw-r--r--  arch/x86/kernel/e820.c | 18
-rw-r--r--  arch/x86/kernel/entry_32.S | 11
-rw-r--r--  arch/x86/kernel/entry_64.S | 13
-rw-r--r--  arch/x86/kernel/ftrace.c | 15
-rw-r--r--  arch/x86/kernel/head_32.S | 10
-rw-r--r--  arch/x86/kernel/hpet.c | 2
-rw-r--r--  arch/x86/kernel/i8259.c | 2
-rw-r--r--  arch/x86/kernel/ioport.c | 20
-rw-r--r--  arch/x86/kernel/irq.c | 91
-rw-r--r--  arch/x86/kernel/irqinit.c | 92
-rw-r--r--  arch/x86/kernel/kgdb.c | 9
-rw-r--r--  arch/x86/kernel/kprobes.c | 8
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 188
-rw-r--r--  arch/x86/kernel/microcode_core.c | 6
-rw-r--r--  arch/x86/kernel/process.c | 9
-rw-r--r--  arch/x86/kernel/rtc.c | 3
-rw-r--r--  arch/x86/kernel/setup.c | 76
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 11
-rw-r--r--  arch/x86/kernel/smpboot.c | 123
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 3
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 3
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 1
-rw-r--r--  arch/x86/kernel/x86_init.c | 1
-rw-r--r--  arch/x86/kvm/trace.h | 8
-rw-r--r--  arch/x86/lguest/boot.c | 4
-rw-r--r--  arch/x86/lib/atomic64_386_32.S | 6
-rw-r--r--  arch/x86/lib/atomic64_cx8_32.S | 6
-rw-r--r--  arch/x86/lib/checksum_32.S | 63
-rw-r--r--  arch/x86/lib/memmove_64.S | 197
-rw-r--r--  arch/x86/lib/memmove_64.c | 192
-rw-r--r--  arch/x86/lib/rwsem_64.S | 56
-rw-r--r--  arch/x86/lib/semaphore_32.S | 38
-rw-r--r--  arch/x86/lib/thunk_32.S | 18
-rw-r--r--  arch/x86/lib/thunk_64.S | 27
-rw-r--r--  arch/x86/mm/Makefile | 1
-rw-r--r--  arch/x86/mm/amdtopology_64.c | 142
-rw-r--r--  arch/x86/mm/init.c | 56
-rw-r--r--  arch/x86/mm/init_32.c | 11
-rw-r--r--  arch/x86/mm/init_64.c | 72
-rw-r--r--  arch/x86/mm/numa.c | 212
-rw-r--r--  arch/x86/mm/numa_32.c | 10
-rw-r--r--  arch/x86/mm/numa_64.c | 984
-rw-r--r--  arch/x86/mm/numa_emulation.c | 494
-rw-r--r--  arch/x86/mm/numa_internal.h | 31
-rw-r--r--  arch/x86/mm/srat_32.c | 6
-rw-r--r--  arch/x86/mm/srat_64.c | 367
-rw-r--r--  arch/x86/mm/tlb.c | 14
-rw-r--r--  arch/x86/pci/ce4100.c | 2
-rw-r--r--  arch/x86/pci/xen.c | 159
-rw-r--r--  arch/x86/platform/ce4100/ce4100.c | 24
-rw-r--r--  arch/x86/platform/ce4100/falconfalls.dts | 428
-rw-r--r--  arch/x86/platform/mrst/mrst.c | 2
-rw-r--r--  arch/x86/platform/mrst/vrtc.c | 16
-rw-r--r--  arch/x86/platform/olpc/Makefile | 4
-rw-r--r--  arch/x86/platform/uv/uv_irq.c | 4
-rw-r--r--  arch/x86/platform/visws/visws_quirks.c | 4
-rw-r--r--  arch/x86/xen/Kconfig | 8
-rw-r--r--  arch/x86/xen/enlighten.c | 8
-rw-r--r--  arch/x86/xen/mmu.c | 74
-rw-r--r--  arch/x86/xen/p2m.c | 330
-rw-r--r--  arch/x86/xen/setup.c | 68
-rw-r--r--  arch/x86/xen/smp.c | 38
-rw-r--r--  arch/x86/xen/suspend.c | 8
-rw-r--r--  arch/x86/xen/time.c | 4
-rw-r--r--  arch/x86/xen/xen-head.S | 4
-rw-r--r--  arch/x86/xen/xen-ops.h | 2
-rw-r--r--  arch/xtensa/include/asm/rwsem.h | 37
-rw-r--r--  arch/xtensa/kernel/time.c | 6
-rw-r--r--  drivers/acpi/numa.c | 9
-rw-r--r--  drivers/block/xen-blkfront.c | 87
-rw-r--r--  drivers/char/mmtimer.c | 30
-rw-r--r--  drivers/i2c/busses/i2c-ocores.c | 14
-rw-r--r--  drivers/i2c/i2c-core.c | 2
-rw-r--r--  drivers/mmc/host/mmc_spi.c | 4
-rw-r--r--  drivers/net/ethoc.c | 8
-rw-r--r--  drivers/of/Kconfig | 6
-rw-r--r--  drivers/of/Makefile | 1
-rw-r--r--  drivers/of/of_pci.c | 92
-rw-r--r--  drivers/pci/xen-pcifront.c | 31
-rw-r--r--  drivers/rtc/class.c | 7
-rw-r--r--  drivers/rtc/interface.c | 180
-rw-r--r--  drivers/rtc/rtc-at91rm9200.c | 28
-rw-r--r--  drivers/rtc/rtc-at91sam9.c | 28
-rw-r--r--  drivers/rtc/rtc-bfin.c | 27
-rw-r--r--  drivers/rtc/rtc-cmos.c | 111
-rw-r--r--  drivers/rtc/rtc-davinci.c | 55
-rw-r--r--  drivers/rtc/rtc-ds1511.c | 17
-rw-r--r--  drivers/rtc/rtc-ds1553.c | 17
-rw-r--r--  drivers/rtc/rtc-ds3232.c | 18
-rw-r--r--  drivers/rtc/rtc-jz4740.c | 7
-rw-r--r--  drivers/rtc/rtc-mc13xxx.c | 7
-rw-r--r--  drivers/rtc/rtc-mpc5121.c | 20
-rw-r--r--  drivers/rtc/rtc-mrst.c | 33
-rw-r--r--  drivers/rtc/rtc-mxc.c | 7
-rw-r--r--  drivers/rtc/rtc-nuc900.c | 15
-rw-r--r--  drivers/rtc/rtc-omap.c | 39
-rw-r--r--  drivers/rtc/rtc-pcap.c | 6
-rw-r--r--  drivers/rtc/rtc-pcf50633.c | 22
-rw-r--r--  drivers/rtc/rtc-pl030.c | 6
-rw-r--r--  drivers/rtc/rtc-pl031.c | 55
-rw-r--r--  drivers/rtc/rtc-proc.c | 8
-rw-r--r--  drivers/rtc/rtc-pxa.c | 44
-rw-r--r--  drivers/rtc/rtc-rs5c372.c | 52
-rw-r--r--  drivers/rtc/rtc-rx8025.c | 25
-rw-r--r--  drivers/rtc/rtc-s3c.c | 33
-rw-r--r--  drivers/rtc/rtc-sa1100.c | 160
-rw-r--r--  drivers/rtc/rtc-sh.c | 24
-rw-r--r--  drivers/rtc/rtc-stmp3xxx.c | 15
-rw-r--r--  drivers/rtc/rtc-test.c | 13
-rw-r--r--  drivers/rtc/rtc-twl.c | 13
-rw-r--r--  drivers/rtc/rtc-vr41xx.c | 32
-rw-r--r--  drivers/rtc/rtc-wm831x.c | 16
-rw-r--r--  drivers/rtc/rtc-wm8350.c | 21
-rw-r--r--  drivers/spi/pxa2xx_spi.c | 2
-rw-r--r--  drivers/spi/pxa2xx_spi_pci.c | 2
-rw-r--r--  drivers/spi/xilinx_spi.c | 6
-rw-r--r--  drivers/xen/balloon.c | 16
-rw-r--r--  drivers/xen/events.c | 342
-rw-r--r--  drivers/xen/manage.c | 153
-rw-r--r--  drivers/xen/platform-pci.c | 3
-rw-r--r--  fs/Kconfig | 2
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/btrfs/export.c | 8
-rw-r--r--  fs/btrfs/inode.c | 3
-rw-r--r--  fs/compat.c | 61
-rw-r--r--  fs/dcache.c | 95
-rw-r--r--  fs/exec.c | 18
-rw-r--r--  fs/exportfs/expfs.c | 11
-rw-r--r--  fs/ext3/namei.c | 7
-rw-r--r--  fs/ext3/super.c | 1
-rw-r--r--  fs/ext4/namei.c | 7
-rw-r--r--  fs/ext4/super.c | 2
-rw-r--r--  fs/fat/inode.c | 4
-rw-r--r--  fs/fcntl.c | 37
-rw-r--r--  fs/fhandle.c | 265
-rw-r--r--  fs/file_table.c | 55
-rw-r--r--  fs/fuse/inode.c | 4
-rw-r--r--  fs/gfs2/export.c | 8
-rw-r--r--  fs/inode.c | 32
-rw-r--r--  fs/internal.h | 13
-rw-r--r--  fs/isofs/export.c | 8
-rw-r--r--  fs/jfs/namei.c | 3
-rw-r--r--  fs/namei.c | 1501
-rw-r--r--  fs/namespace.c | 16
-rw-r--r--  fs/nfsctl.c | 21
-rw-r--r--  fs/ocfs2/export.c | 8
-rw-r--r--  fs/ocfs2/refcounttree.c | 2
-rw-r--r--  fs/open.c | 126
-rw-r--r--  fs/partitions/osf.c | 2
-rw-r--r--  fs/reiserfs/inode.c | 7
-rw-r--r--  fs/reiserfs/namei.c | 4
-rw-r--r--  fs/stat.c | 7
-rw-r--r--  fs/statfs.c | 176
-rw-r--r--  fs/ubifs/dir.c | 18
-rw-r--r--  fs/udf/namei.c | 7
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.c | 4
-rw-r--r--  include/asm-generic/cputime.h | 3
-rw-r--r--  include/asm-generic/fcntl.h | 4
-rw-r--r--  include/asm-generic/futex.h | 7
-rw-r--r--  include/asm-generic/sections.h | 1
-rw-r--r--  include/asm-generic/unistd.h | 6
-rw-r--r--  include/asm-generic/vmlinux.lds.h | 6
-rw-r--r--  include/linux/cgroup.h | 4
-rw-r--r--  include/linux/cgroup_subsys.h | 4
-rw-r--r--  include/linux/debugobjects.h | 5
-rw-r--r--  include/linux/device.h | 7
-rw-r--r--  include/linux/exportfs.h | 9
-rw-r--r--  include/linux/fcntl.h | 1
-rw-r--r--  include/linux/file.h | 2
-rw-r--r--  include/linux/fs.h | 19
-rw-r--r--  include/linux/ftrace.h | 2
-rw-r--r--  include/linux/ftrace_event.h | 2
-rw-r--r--  include/linux/hrtimer.h | 24
-rw-r--r--  include/linux/i2c.h | 2
-rw-r--r--  include/linux/interrupt.h | 85
-rw-r--r--  include/linux/irq.h | 368
-rw-r--r--  include/linux/irqdesc.h | 78
-rw-r--r--  include/linux/jiffies.h | 1
-rw-r--r--  include/linux/kthread.h | 2
-rw-r--r--  include/linux/mm.h | 2
-rw-r--r--  include/linux/namei.h | 7
-rw-r--r--  include/linux/of.h | 16
-rw-r--r--  include/linux/of_pci.h | 9
-rw-r--r--  include/linux/perf_event.h | 50
-rw-r--r--  include/linux/plist.h | 47
-rw-r--r--  include/linux/posix-clock.h | 150
-rw-r--r--  include/linux/posix-timers.h | 44
-rw-r--r--  include/linux/ring_buffer.h | 2
-rw-r--r--  include/linux/rtc.h | 5
-rw-r--r--  include/linux/rwlock_types.h | 8
-rw-r--r--  include/linux/rwsem-spinlock.h | 31
-rw-r--r--  include/linux/rwsem.h | 53
-rw-r--r--  include/linux/sched.h | 22
-rw-r--r--  include/linux/security.h | 9
-rw-r--r--  include/linux/spinlock_types.h | 8
-rw-r--r--  include/linux/syscalls.h | 20
-rw-r--r--  include/linux/thread_info.h | 3
-rw-r--r--  include/linux/time.h | 14
-rw-r--r--  include/linux/timex.h | 3
-rw-r--r--  include/trace/events/mce.h | 8
-rw-r--r--  include/trace/events/module.h | 5
-rw-r--r--  include/trace/events/skb.h | 4
-rw-r--r--  include/xen/events.h | 8
-rw-r--r--  include/xen/interface/io/blkif.h | 37
-rw-r--r--  include/xen/interface/xen.h | 4
-rw-r--r--  include/xen/xen-ops.h | 6
-rw-r--r--  init/Kconfig | 22
-rw-r--r--  kernel/audit_watch.c | 85
-rw-r--r--  kernel/cgroup.c | 54
-rw-r--r--  kernel/compat.c | 136
-rw-r--r--  kernel/cred.c | 2
-rw-r--r--  kernel/futex.c | 147
-rw-r--r--  kernel/hrtimer.c | 90
-rw-r--r--  kernel/irq/Kconfig | 39
-rw-r--r--  kernel/irq/autoprobe.c | 54
-rw-r--r--  kernel/irq/chip.c | 483
-rw-r--r--  kernel/irq/compat.h | 72
-rw-r--r--  kernel/irq/debug.h | 40
-rw-r--r--  kernel/irq/handle.c | 144
-rw-r--r--  kernel/irq/internals.h | 167
-rw-r--r--  kernel/irq/irqdesc.c | 68
-rw-r--r--  kernel/irq/manage.c | 602
-rw-r--r--  kernel/irq/migration.c | 38
-rw-r--r--  kernel/irq/pm.c | 30
-rw-r--r--  kernel/irq/proc.c | 70
-rw-r--r--  kernel/irq/resend.c | 17
-rw-r--r--  kernel/irq/settings.h | 138
-rw-r--r--  kernel/irq/spurious.c | 163
-rw-r--r--  kernel/perf_event.c | 1004
-rw-r--r--  kernel/posix-cpu-timers.c | 110
-rw-r--r--  kernel/posix-timers.c | 342
-rw-r--r--  kernel/rcupdate.c | 10
-rw-r--r--  kernel/rcutiny_plugin.h | 2
-rw-r--r--  kernel/rcutorture.c | 1
-rw-r--r--  kernel/rtmutex-debug.c | 1
-rw-r--r--  kernel/rtmutex-tester.c | 40
-rw-r--r--  kernel/rtmutex.c | 318
-rw-r--r--  kernel/rtmutex_common.h | 16
-rw-r--r--  kernel/sched.c | 338
-rw-r--r--  kernel/sched_autogroup.c | 15
-rw-r--r--  kernel/sched_autogroup.h | 5
-rw-r--r--  kernel/sched_debug.c | 2
-rw-r--r--  kernel/sched_fair.c | 397
-rw-r--r--  kernel/sched_idletask.c | 26
-rw-r--r--  kernel/sched_rt.c | 19
-rw-r--r--  kernel/sched_stoptask.c | 7
-rw-r--r--  kernel/softirq.c | 24
-rw-r--r--  kernel/sys_ni.c | 5
-rw-r--r--  kernel/sysctl.c | 11
-rw-r--r--  kernel/sysctl_binary.c | 19
-rw-r--r--  kernel/time.c | 35
-rw-r--r--  kernel/time/Makefile | 3
-rw-r--r--  kernel/time/clockevents.c | 1
-rw-r--r--  kernel/time/jiffies.c | 20
-rw-r--r--  kernel/time/ntp.c | 13
-rw-r--r--  kernel/time/posix-clock.c | 451
-rw-r--r--  kernel/time/tick-broadcast.c | 1
-rw-r--r--  kernel/time/tick-common.c | 1
-rw-r--r--  kernel/time/tick-internal.h | 9
-rw-r--r--  kernel/time/tick-oneshot.c | 1
-rw-r--r--  kernel/time/tick-sched.c | 1
-rw-r--r--  kernel/time/timekeeping.c | 141
-rw-r--r--  kernel/timer.c | 42
-rw-r--r--  kernel/trace/ftrace.c | 52
-rw-r--r--  kernel/trace/ring_buffer.c | 24
-rw-r--r--  kernel/trace/trace.c | 38
-rw-r--r--  kernel/trace/trace.h | 41
-rw-r--r--  kernel/trace/trace_entries.h | 6
-rw-r--r--  kernel/trace/trace_events.c | 2
-rw-r--r--  kernel/trace/trace_events_filter.c | 885
-rw-r--r--  kernel/trace/trace_kprobe.c | 111
-rw-r--r--  kernel/trace/trace_output.c | 36
-rw-r--r--  kernel/trace/trace_sched_switch.c | 48
-rw-r--r--  kernel/trace/trace_syscalls.c | 42
-rw-r--r--  kernel/workqueue.c | 6
-rw-r--r--  lib/debugobjects.c | 9
-rw-r--r--  lib/plist.c | 135
-rw-r--r--  lib/rwsem.c | 10
-rw-r--r--  mm/Makefile | 8
-rw-r--r--  mm/bootmem.c | 180
-rw-r--r--  mm/nobootmem.c | 435
-rw-r--r--  mm/page_alloc.c | 71
-rw-r--r--  mm/shmem.c | 4
-rw-r--r--  net/core/scm.c | 2
-rw-r--r--  net/unix/af_unix.c | 2
-rw-r--r--  net/unix/garbage.c | 2
-rwxr-xr-x  scripts/checkpatch.pl | 5
-rw-r--r--  scripts/kconfig/streamline_config.pl | 2
-rw-r--r--  scripts/recordmcount.c | 3
-rwxr-xr-x  scripts/recordmcount.pl | 1
-rw-r--r--  scripts/rt-tester/rt-tester.py | 2
-rw-r--r--  scripts/rt-tester/t2-l1-2rt-sameprio.tst | 5
-rw-r--r--  scripts/rt-tester/t2-l1-pi.tst | 5
-rw-r--r--  scripts/rt-tester/t2-l1-signal.tst | 5
-rw-r--r--  scripts/rt-tester/t2-l2-2rt-deadlock.tst | 5
-rw-r--r--  scripts/rt-tester/t3-l1-pi-1rt.tst | 5
-rw-r--r--  scripts/rt-tester/t3-l1-pi-2rt.tst | 5
-rw-r--r--  scripts/rt-tester/t3-l1-pi-3rt.tst | 5
-rw-r--r--  scripts/rt-tester/t3-l1-pi-signal.tst | 5
-rw-r--r--  scripts/rt-tester/t3-l1-pi-steal.tst | 5
-rw-r--r--  scripts/rt-tester/t3-l2-pi.tst | 5
-rw-r--r--  scripts/rt-tester/t4-l2-pi-deboost.tst | 5
-rw-r--r--  scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst | 5
-rw-r--r--  scripts/rt-tester/t5-l4-pi-boost-deboost.tst | 5
-rw-r--r--  security/commoncap.c | 2
-rw-r--r--  security/security.c | 2
-rw-r--r--  tools/perf/.gitignore | 1
-rw-r--r--  tools/perf/Documentation/Makefile | 19
-rw-r--r--  tools/perf/Documentation/perf-list.txt | 23
-rw-r--r--  tools/perf/Documentation/perf-lock.txt | 12
-rw-r--r--  tools/perf/Documentation/perf-probe.txt | 26
-rw-r--r--  tools/perf/Documentation/perf-record.txt | 11
-rw-r--r--  tools/perf/Documentation/perf-stat.txt | 11
-rw-r--r--  tools/perf/Makefile | 649
-rw-r--r--  tools/perf/bench/sched-pipe.c | 2
-rw-r--r--  tools/perf/builtin-annotate.c | 351
-rw-r--r--  tools/perf/builtin-diff.c | 16
-rw-r--r--  tools/perf/builtin-inject.c | 82
-rw-r--r--  tools/perf/builtin-kmem.c | 10
-rw-r--r--  tools/perf/builtin-list.c | 43
-rw-r--r--  tools/perf/builtin-lock.c | 8
-rw-r--r--  tools/perf/builtin-probe.c | 70
-rw-r--r--  tools/perf/builtin-record.c | 450
-rw-r--r--  tools/perf/builtin-report.c | 225
-rw-r--r--  tools/perf/builtin-sched.c | 27
-rw-r--r--  tools/perf/builtin-script.c | 17
-rw-r--r--  tools/perf/builtin-stat.c | 118
-rw-r--r--  tools/perf/builtin-test.c | 184
-rw-r--r--  tools/perf/builtin-timechart.c | 13
-rw-r--r--  tools/perf/builtin-top.c | 1029
-rw-r--r--  tools/perf/perf.h | 26
-rwxr-xr-x  tools/perf/python/twatch.py | 41
-rw-r--r--  tools/perf/util/annotate.c | 605
-rw-r--r--  tools/perf/util/annotate.h | 103
-rw-r--r--  tools/perf/util/build-id.c | 21
-rw-r--r--  tools/perf/util/cache.h | 7
-rw-r--r--  tools/perf/util/callchain.c | 227
-rw-r--r--  tools/perf/util/callchain.h | 76
-rw-r--r--  tools/perf/util/cgroup.c | 178
-rw-r--r--  tools/perf/util/cgroup.h | 17
-rw-r--r--  tools/perf/util/cpumap.c | 5
-rw-r--r--  tools/perf/util/cpumap.h | 2
-rw-r--r--  tools/perf/util/debug.c | 2
-rw-r--r--  tools/perf/util/debug.h | 2
-rw-r--r--  tools/perf/util/event.c | 374
-rw-r--r--  tools/perf/util/event.h | 78
-rw-r--r--  tools/perf/util/evlist.c | 394
-rw-r--r--  tools/perf/util/evlist.h | 68
-rw-r--r--  tools/perf/util/evsel.c | 234
-rw-r--r--  tools/perf/util/evsel.h | 47
-rw-r--r--  tools/perf/util/exec_cmd.c | 19
-rw-r--r--  tools/perf/util/header.c | 535
-rw-r--r--  tools/perf/util/header.h | 95
-rw-r--r--  tools/perf/util/hist.c | 250
-rw-r--r--  tools/perf/util/hist.h | 60
-rw-r--r--  tools/perf/util/include/linux/list.h | 1
-rw-r--r--  tools/perf/util/parse-events.c | 175
-rw-r--r--  tools/perf/util/parse-events.h | 12
-rw-r--r--  tools/perf/util/probe-event.c | 159
-rw-r--r--  tools/perf/util/probe-event.h | 4
-rw-r--r--  tools/perf/util/probe-finder.c | 533
-rw-r--r--  tools/perf/util/python.c | 896
-rw-r--r--  tools/perf/util/scripting-engines/trace-event-python.c | 3
-rw-r--r--  tools/perf/util/session.c | 266
-rw-r--r--  tools/perf/util/session.h | 41
-rw-r--r--  tools/perf/util/setup.py | 19
-rw-r--r--  tools/perf/util/strfilter.c | 199
-rw-r--r--  tools/perf/util/strfilter.h | 48
-rw-r--r--  tools/perf/util/symbol.c | 5
-rw-r--r--  tools/perf/util/symbol.h | 1
-rw-r--r--  tools/perf/util/thread.c | 55
-rw-r--r--  tools/perf/util/thread.h | 14
-rw-r--r--  tools/perf/util/thread_map.c | 64
-rw-r--r--  tools/perf/util/thread_map.h | 15
-rw-r--r--  tools/perf/util/top.c | 238
-rw-r--r--  tools/perf/util/top.h | 66
-rw-r--r--  tools/perf/util/trace-event-parse.c | 2
-rw-r--r--  tools/perf/util/ui/browser.c | 25
-rw-r--r--  tools/perf/util/ui/browser.h | 3
-rw-r--r--  tools/perf/util/ui/browsers/annotate.c | 178
-rw-r--r--  tools/perf/util/ui/browsers/hists.c | 197
-rw-r--r--  tools/perf/util/ui/browsers/map.c | 2
-rw-r--r--  tools/perf/util/ui/browsers/top.c | 213
-rw-r--r--  tools/perf/util/ui/helpline.c | 5
-rw-r--r--  tools/perf/util/ui/libslang.h | 6
-rw-r--r--  tools/perf/util/ui/setup.c | 8
-rw-r--r--  tools/perf/util/ui/ui.h | 8
-rw-r--r--  tools/perf/util/ui/util.c | 7
-rw-r--r--  tools/perf/util/util.h | 26
-rwxr-xr-x  tools/testing/ktest/ktest.pl | 2
549 files changed, 20984 insertions, 13281 deletions
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index cfaac34c455..6ef692667e2 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -849,6 +849,37 @@ All: lockdep-checked RCU-protected pointer access
 See the comment headers in the source code (or the docbook generated
 from them) for more information.
 
+However, given that there are no fewer than four families of RCU APIs
+in the Linux kernel, how do you choose which one to use?  The following
+list can be helpful:
+
+a.	Will readers need to block?  If so, you need SRCU.
+
+b.	What about the -rt patchset?  If readers would need to block
+	in a non-rt kernel, you need SRCU.  If readers would block
+	in a -rt kernel, but not in a non-rt kernel, SRCU is not
+	necessary.
+
+c.	Do you need to treat NMI handlers, hardirq handlers,
+	and code segments with preemption disabled (whether
+	via preempt_disable(), local_irq_save(), local_bh_disable(),
+	or some other mechanism) as if they were explicit RCU readers?
+	If so, you need RCU-sched.
+
+d.	Do you need RCU grace periods to complete even in the face
+	of softirq monopolization of one or more of the CPUs?  For
+	example, is your code subject to network-based denial-of-service
+	attacks?  If so, you need RCU-bh.
+
+e.	Is your workload too update-intensive for normal use of
+	RCU, but inappropriate for other synchronization mechanisms?
+	If so, consider SLAB_DESTROY_BY_RCU.  But please be careful!
+
+f.	Otherwise, use RCU.
+
+Of course, this all assumes that you have determined that RCU is in fact
+the right tool for your job.
+
 
 8.  ANSWERS TO QUICK QUIZZES
 
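The four families map onto distinct reader-side primitives. A minimal
sketch of the reader side for choices (a), (c), (d) and (f) above (the
function and srcu_struct names here are illustrative, not part of this
patch; an srcu_struct must also be set up with init_srcu_struct()):

	#include <linux/rcupdate.h>
	#include <linux/srcu.h>

	static struct srcu_struct my_srcu;	/* (a) readers may block */

	static void srcu_reader(void)
	{
		int idx = srcu_read_lock(&my_srcu);
		/* read-side critical section; blocking is allowed */
		srcu_read_unlock(&my_srcu, idx);
	}

	static void sched_reader(void)		/* (c) RCU-sched */
	{
		rcu_read_lock_sched();
		/* treated like a preempt-disabled region */
		rcu_read_unlock_sched();
	}

	static void bh_reader(void)		/* (d) RCU-bh */
	{
		rcu_read_lock_bh();
		/* grace periods advance despite softirq load */
		rcu_read_unlock_bh();
	}

	static void plain_reader(void)		/* (f) plain RCU */
	{
		rcu_read_lock();
		/* no blocking allowed here */
		rcu_read_unlock();
	}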
diff --git a/Documentation/devicetree/bindings/i2c/ce4100-i2c.txt b/Documentation/devicetree/bindings/i2c/ce4100-i2c.txt
new file mode 100644
index 00000000000..569b1624851
--- /dev/null
+++ b/Documentation/devicetree/bindings/i2c/ce4100-i2c.txt
@@ -0,0 +1,93 @@
+CE4100 I2C
+----------
+
+CE4100 has one PCI device which is described as the I2C controller. This
+PCI device has three PCI BARs; each BAR contains a complete I2C
+controller. So we have a total of three independent I2C controllers
+which share only an interrupt line.
+The driver is probed via the PCI ID and gathers the information about
+attached devices from the device tree.
+Grant Likely recommended using the ranges property to map the PCI BAR
+number to its physical address and using this to find the child nodes
+of the specific I2C controller. These were his exact words:
+
+   Here's where the magic happens. Each entry in
+   ranges describes how the parent pci address space
+   (middle group of 3) is translated to the local
+   address space (first group of 2) and the size of
+   each range (last cell). In this particular case,
+   the first cell of the local address is chosen to be
+   1:1 mapped to the BARs, and the second is the
+   offset from the base of the BAR (which would be
+   non-zero if you had 2 or more devices mapped off
+   the same BAR)
+
+   ranges allows the address mapping to be described
+   in a way that the OS can interpret without
+   requiring custom device driver code.
+
+This is an example which is used on FalconFalls:
+------------------------------------------------
+	i2c-controller@b,2 {
+		#address-cells = <2>;
+		#size-cells = <1>;
+		compatible = "pci8086,2e68.2",
+				"pci8086,2e68",
+				"pciclass,ff0000",
+				"pciclass,ff00";
+
+		reg = <0x15a00 0x0 0x0 0x0 0x0>;
+		interrupts = <16 1>;
+
+		/* as described by Grant, the first number in the group of
+		 * three is the BAR number followed by the 64bit BAR address
+		 * followed by size of the mapping. The BAR address
+		 * also requires a valid translation in the parent's ranges
+		 * property.
+		 */
+		ranges = <0 0   0x02000000 0 0xdffe0500 0x100
+			  1 0   0x02000000 0 0xdffe0600 0x100
+			  2 0   0x02000000 0 0xdffe0700 0x100>;
+
+		i2c@0 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "intel,ce4100-i2c-controller";
+
+			/* The first number in the reg property is the
+			 * number of the BAR
+			 */
+			reg = <0 0 0x100>;
+
+			/* This I2C controller has no devices */
+		};
+
+		i2c@1 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "intel,ce4100-i2c-controller";
+			reg = <1 0 0x100>;
+
+			/* This I2C controller has one gpio controller */
+			gpio@26 {
+				#gpio-cells = <2>;
+				compatible = "ti,pcf8575";
+				reg = <0x26>;
+				gpio-controller;
+			};
+		};
+
+		i2c@2 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "intel,ce4100-i2c-controller";
+			reg = <2 0 0x100>;
+
+			gpio@26 {
+				#gpio-cells = <2>;
+				compatible = "ti,pcf8575";
+				reg = <0x26>;
+				gpio-controller;
+			};
+		};
+	};
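The lookup the text describes, matching a child node to a BAR via the
first reg cell, might look roughly like this in the platform glue (a
hedged sketch; the function name and error handling are ours, not from
this patch):

	#include <linux/of.h>
	#include <linux/pci.h>

	/* Find the child node of this PCI function whose first "reg"
	 * cell equals the BAR the I2C controller lives in. */
	static struct device_node *find_i2c_of_node(struct pci_dev *pdev, u32 bar)
	{
		struct device_node *child;

		for_each_child_of_node(pdev->dev.of_node, child) {
			const __be32 *reg = of_get_property(child, "reg", NULL);

			if (reg && be32_to_cpup(reg) == bar)
				return child;	/* ref held; of_node_put() when done */
		}
		return NULL;
	}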
diff --git a/Documentation/devicetree/bindings/rtc/rtc-cmos.txt b/Documentation/devicetree/bindings/rtc/rtc-cmos.txt
new file mode 100644
index 00000000000..7382989b305
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/rtc-cmos.txt
@@ -0,0 +1,28 @@
+* Motorola mc146818 compatible RTC
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Required properties:
+  - compatible : "motorola,mc146818"
+  - reg : should contain registers location and length.
+
+Optional properties:
+  - interrupts : should contain interrupt.
+  - interrupt-parent : interrupt source phandle.
+  - ctrl-reg : Contains the initial value of the control register also
+    called "Register B".
+  - freq-reg : Contains the initial value of the frequency register also
+    called "Register A".
+
+"Register A" and "B" are usually initialized by the firmware (BIOS for
+instance). If this is not done, it can be performed by the driver.
+
+ISA Example:
+
+	rtc@70 {
+		compatible = "motorola,mc146818";
+		interrupts = <8 3>;
+		interrupt-parent = <&ioapic1>;
+		ctrl-reg = <2>;
+		freq-reg = <0x26>;
+		reg = <1 0x70 2>;
+	};
diff --git a/Documentation/devicetree/bindings/x86/ce4100.txt b/Documentation/devicetree/bindings/x86/ce4100.txt
new file mode 100644
index 00000000000..b49ae593a60
--- /dev/null
+++ b/Documentation/devicetree/bindings/x86/ce4100.txt
@@ -0,0 +1,38 @@
1CE4100 Device Tree Bindings
2---------------------------
3
4The CE4100 SoC uses for in core peripherals the following compatible
5format: <vendor>,<chip>-<device>.
6Many of the "generic" devices like HPET or IO APIC have the ce4100
7name in their compatible property because they first appeared in this
8SoC.
9
10The CPU node
11------------
12 cpu@0 {
13 device_type = "cpu";
14 compatible = "intel,ce4100";
15 reg = <0>;
16 lapic = <&lapic0>;
17 };
18
19The reg property describes the CPU number. The lapic property points to
20the local APIC timer.
21
22The SoC node
23------------
24
25This node describes the in-core peripherals. Required property:
26 compatible = "intel,ce4100-cp";
27
28The PCI node
29------------
30This node describes the PCI bus on the SoC. Its property should be
31 compatible = "intel,ce4100-pci", "pci";
32
33If the OS is using the IO-APIC for interrupt routing then the reported
34interrupt numbers for devices is no longer true. In order to obtain the
35correct interrupt number, the child node which represents the device has
36to contain the interrupt property. Besides the interrupt property it has
37to contain at least the reg property containing the PCI bus address and
38compatible property according to "PCI Bus Binding Revision 2.1".
diff --git a/Documentation/devicetree/bindings/x86/interrupt.txt b/Documentation/devicetree/bindings/x86/interrupt.txt
new file mode 100644
index 00000000000..7d19f494f19
--- /dev/null
+++ b/Documentation/devicetree/bindings/x86/interrupt.txt
@@ -0,0 +1,26 @@
1Interrupt chips
2---------------
3
4* Intel I/O Advanced Programmable Interrupt Controller (IO APIC)
5
6 Required properties:
7 --------------------
8 compatible = "intel,ce4100-ioapic";
9 #interrupt-cells = <2>;
10
11 Device's interrupt property:
12
13 interrupts = <P S>;
14
15 The first number (P) represents the interrupt pin which is wired to the
16 IO APIC. The second number (S) represents the sense of interrupt which
17 should be configured and can be one of:
18 0 - Edge Rising
19 1 - Level Low
20 2 - Level High
21 3 - Edge Falling
22
23* Local APIC
24 Required property:
25
26 compatible = "intel,ce4100-lapic";
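A hedged sketch of how an interrupt host might map the sense cell (S)
onto the kernel's trigger types (the helper name is illustrative; the
IRQ_TYPE_* constants are the standard ones from linux/irq.h):

	#include <linux/irq.h>

	static unsigned int ce4100_sense_to_type(u32 sense)
	{
		switch (sense) {
		case 0: return IRQ_TYPE_EDGE_RISING;
		case 1: return IRQ_TYPE_LEVEL_LOW;
		case 2: return IRQ_TYPE_LEVEL_HIGH;
		case 3: return IRQ_TYPE_EDGE_FALLING;
		default: return IRQ_TYPE_NONE;
		}
	}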
diff --git a/Documentation/devicetree/bindings/x86/timer.txt b/Documentation/devicetree/bindings/x86/timer.txt
new file mode 100644
index 00000000000..c688af58e3b
--- /dev/null
+++ b/Documentation/devicetree/bindings/x86/timer.txt
@@ -0,0 +1,6 @@
+Timers
+------
+
+* High Precision Event Timer (HPET)
+  Required property:
+	compatible = "intel,ce4100-hpet";
diff --git a/Documentation/devicetree/booting-without-of.txt b/Documentation/devicetree/booting-without-of.txt
index 28b1c9d3d35..55fd2623445 100644
--- a/Documentation/devicetree/booting-without-of.txt
+++ b/Documentation/devicetree/booting-without-of.txt
@@ -13,6 +13,7 @@ Table of Contents
 
   I - Introduction
     1) Entry point for arch/powerpc
+    2) Entry point for arch/x86
 
   II - The DT block format
     1) Header
@@ -225,6 +226,25 @@ it with special cases.
   cannot support both configurations with Book E and configurations
   with classic Powerpc architectures.
 
+2) Entry point for arch/x86
+-------------------------------
+
+  There is one single 32bit entry point to the kernel at code32_start,
+  the decompressor (the real mode entry point goes to the same 32bit
+  entry point once it has switched into protected mode). That entry
+  point supports one calling convention which is documented in
+  Documentation/x86/boot.txt
+  The physical pointer to the device-tree block (defined in chapter II)
+  is passed via setup_data which requires at least boot protocol 2.09.
+  The type field is defined as
+
+  #define SETUP_DTB                      2
+
+  This device-tree is used as an extension to the "boot page". As such it
+  does not parse / consider data which is already covered by the boot
+  page. This includes memory size, reserved ranges, command line arguments
+  or initrd address. It simply holds information which cannot be retrieved
+  otherwise, like interrupt routing or a list of devices behind an I2C bus.
 
 II - The DT block format
 ========================
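From the boot loader's side, chaining the DTB in via setup_data amounts
to the following (a sketch under stated assumptions: struct setup_data
is the layout from arch/x86/include/asm/bootparam.h; the helper name and
buffer handling are ours):

	#include <stdint.h>
	#include <string.h>

	struct setup_data {
		uint64_t next;	/* phys addr of next setup_data, 0 ends list */
		uint32_t type;	/* SETUP_DTB == 2 */
		uint32_t len;	/* length of data[] */
		uint8_t  data[];
	};

	/* setup_data_head points at boot_params.hdr.setup_data;
	 * sd/sd_phys is a scratch buffer the kernel can reach. */
	static void attach_dtb(uint64_t *setup_data_head, struct setup_data *sd,
			       uint64_t sd_phys, const void *dtb, uint32_t dtb_len)
	{
		sd->next = *setup_data_head;	/* chain in front of the list */
		sd->type = 2;			/* SETUP_DTB */
		sd->len  = dtb_len;
		memcpy(sd->data, dtb, dtb_len);
		*setup_data_head = sd_phys;
	}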
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index f4a04c0c7ed..738c6fda3fb 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2444,6 +2444,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			<deci-seconds>: poll at this frequency
 			0: no polling (default)
 
+	threadirqs	[KNL]
+			Force threading of all interrupt handlers except those
+			explicitly marked IRQF_NO_THREAD.
+
 	topology=	[S390]
 			Format: {off | on}
 			Specify if the kernel should make use of the cpu
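For driver authors, opting a handler out of forced threading looks like
this (a minimal sketch; the device and handler names are illustrative):

	#include <linux/interrupt.h>

	static irqreturn_t my_clock_irq(int irq, void *dev_id)
	{
		/* runs in hard-irq context even when threadirqs is set */
		return IRQ_HANDLED;
	}

	static int my_clock_setup(int irq)
	{
		return request_irq(irq, my_clock_irq, IRQF_NO_THREAD,
				   "my-clockevent", NULL);
	}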
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 631ad2f1b22..f0d3a8026a5 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -21,6 +21,7 @@ Contents:
      - SMP barrier pairing.
      - Examples of memory barrier sequences.
      - Read memory barriers vs load speculation.
+     - Transitivity
 
  (*) Explicit kernel barriers.
 
@@ -959,6 +960,63 @@ the speculation will be cancelled and the value reloaded:
 	retrieved				:	:	+-------+
 
 
+TRANSITIVITY
+------------
+
+Transitivity is a deeply intuitive notion about ordering that is not
+always provided by real computer systems.  The following example
+demonstrates transitivity (also called "cumulativity"):
+
+	CPU 1			CPU 2			CPU 3
+	=======================	=======================	=======================
+		{ X = 0, Y = 0 }
+	STORE X=1		LOAD X			STORE Y=1
+				<general barrier>	<general barrier>
+				LOAD Y			LOAD X
+
+Suppose that CPU 2's load from X returns 1 and its load from Y returns 0.
+This indicates that CPU 2's load from X in some sense follows CPU 1's
+store to X and that CPU 2's load from Y in some sense preceded CPU 3's
+store to Y.  The question is then "Can CPU 3's load from X return 0?"
+
+Because CPU 2's load from X in some sense came after CPU 1's store, it
+is natural to expect that CPU 3's load from X must therefore return 1.
+This expectation is an example of transitivity: if a load executing on
+CPU A follows a load from the same variable executing on CPU B, then
+CPU A's load must either return the same value that CPU B's load did,
+or must return some later value.
+
+In the Linux kernel, use of general memory barriers guarantees
+transitivity.  Therefore, in the above example, if CPU 2's load from X
+returns 1 and its load from Y returns 0, then CPU 3's load from X must
+also return 1.
+
+However, transitivity is -not- guaranteed for read or write barriers.
+For example, suppose that CPU 2's general barrier in the above example
+is changed to a read barrier as shown below:
+
+	CPU 1			CPU 2			CPU 3
+	=======================	=======================	=======================
+		{ X = 0, Y = 0 }
+	STORE X=1		LOAD X			STORE Y=1
+				<read barrier>		<general barrier>
+				LOAD Y			LOAD X
+
+This substitution destroys transitivity: in this example, it is perfectly
+legal for CPU 2's load from X to return 1, its load from Y to return 0,
+and CPU 3's load from X to return 0.
+
+The key point is that although CPU 2's read barrier orders its pair
+of loads, it does not guarantee to order CPU 1's store.  Therefore, if
+this example runs on a system where CPUs 1 and 2 share a store buffer
+or a level of cache, CPU 2 might have early access to CPU 1's writes.
+General barriers are therefore required to ensure that all CPUs agree
+on the combined order of CPU 1's and CPU 2's accesses.
+
+To reiterate, if your code requires transitivity, use general barriers
+throughout.
+
+
 ========================
 EXPLICIT KERNEL BARRIERS
 ========================
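The same litmus test can be reproduced in user space with C11 atomics,
where atomic_thread_fence(memory_order_seq_cst) plays the role of the
general barrier (a hedged analogue, not kernel code; a single run rarely
exhibits the interesting interleaving, so such tests are normally looped
or model-checked):

	/* cc -std=c11 -pthread transitivity.c */
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int X, Y;
	static int r2x, r2y, r3x;

	static void *cpu1(void *a) {
		atomic_store_explicit(&X, 1, memory_order_relaxed);  /* STORE X=1 */
		return NULL;
	}
	static void *cpu2(void *a) {
		r2x = atomic_load_explicit(&X, memory_order_relaxed); /* LOAD X */
		atomic_thread_fence(memory_order_seq_cst);   /* <general barrier> */
		r2y = atomic_load_explicit(&Y, memory_order_relaxed); /* LOAD Y */
		return NULL;
	}
	static void *cpu3(void *a) {
		atomic_store_explicit(&Y, 1, memory_order_relaxed);  /* STORE Y=1 */
		atomic_thread_fence(memory_order_seq_cst);   /* <general barrier> */
		r3x = atomic_load_explicit(&X, memory_order_relaxed); /* LOAD X */
		return NULL;
	}

	int main(void) {
		pthread_t t1, t2, t3;
		pthread_create(&t1, NULL, cpu1, NULL);
		pthread_create(&t2, NULL, cpu2, NULL);
		pthread_create(&t3, NULL, cpu3, NULL);
		pthread_join(t1, NULL);
		pthread_join(t2, NULL);
		pthread_join(t3, NULL);
		/* with seq_cst fences, r2x==1 && r2y==0 implies r3x==1 */
		printf("cpu2: X=%d Y=%d   cpu3: X=%d\n", r2x, r2y, r3x);
		return 0;
	}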
diff --git a/Documentation/rtc.txt b/Documentation/rtc.txt
index 9104c106208..250160469d8 100644
--- a/Documentation/rtc.txt
+++ b/Documentation/rtc.txt
@@ -178,38 +178,29 @@ RTC class framework, but can't be supported by the older driver.
    setting the longer alarm time and enabling its IRQ using a single
    request (using the same model as EFI firmware).
 
-*	RTC_UIE_ON, RTC_UIE_OFF ... if the RTC offers IRQs, it probably
-	also offers update IRQs whenever the "seconds" counter changes.
-	If needed, the RTC framework can emulate this mechanism.
+*	RTC_UIE_ON, RTC_UIE_OFF ... if the RTC offers IRQs, the RTC framework
+	will emulate this mechanism.
 
-*	RTC_PIE_ON, RTC_PIE_OFF, RTC_IRQP_SET, RTC_IRQP_READ ... another
-	feature often accessible with an IRQ line is a periodic IRQ, issued
-	at settable frequencies (usually 2^N Hz).
+*	RTC_PIE_ON, RTC_PIE_OFF, RTC_IRQP_SET, RTC_IRQP_READ ... these ioctls
+	are emulated via a kernel hrtimer.
 
 In many cases, the RTC alarm can be a system wake event, used to force
 Linux out of a low power sleep state (or hibernation) back to a fully
 operational state. For example, a system could enter a deep power saving
 state until it's time to execute some scheduled tasks.
 
-Note that many of these ioctls need not actually be implemented by your
-driver.  The common rtc-dev interface handles many of these nicely if your
-driver returns ENOIOCTLCMD.  Some common examples:
+Note that many of these ioctls are handled by the common rtc-dev interface.
+Some common examples:
 
 *	RTC_RD_TIME, RTC_SET_TIME: the read_time/set_time functions will be
 	called with appropriate values.
 
-*	RTC_ALM_SET, RTC_ALM_READ, RTC_WKALM_SET, RTC_WKALM_RD: the
-	set_alarm/read_alarm functions will be called.
+*	RTC_ALM_SET, RTC_ALM_READ, RTC_WKALM_SET, RTC_WKALM_RD: gets or sets
+	the alarm rtc_timer. May call the set_alarm driver function.
 
-*	RTC_IRQP_SET, RTC_IRQP_READ: the irq_set_freq function will be called
-	to set the frequency while the framework will handle the read for you
-	since the frequency is stored in the irq_freq member of the rtc_device
-	structure.  Your driver needs to initialize the irq_freq member during
-	init.  Make sure you check the requested frequency is in range of your
-	hardware in the irq_set_freq function.  If it isn't, return -EINVAL.  If
-	you cannot actually change the frequency, do not define irq_set_freq.
+*	RTC_IRQP_SET, RTC_IRQP_READ: These are emulated by the generic code.
 
-*	RTC_PIE_ON, RTC_PIE_OFF: the irq_set_state function will be called.
+*	RTC_PIE_ON, RTC_PIE_OFF: These are also emulated by the generic code.
 
 If all else fails, check out the rtc-test.c driver!
 
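From user space the emulated and native paths look identical. A minimal
sketch exercising two of the ioctls above through /dev/rtc0 (error
handling trimmed for brevity):

	#include <fcntl.h>
	#include <linux/rtc.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	int main(void)
	{
		struct rtc_time tm;
		unsigned long data;
		int fd = open("/dev/rtc0", O_RDONLY);

		ioctl(fd, RTC_RD_TIME, &tm);	/* handled by the rtc-dev core */
		printf("%04d-%02d-%02d %02d:%02d:%02d\n",
		       tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
		       tm.tm_hour, tm.tm_min, tm.tm_sec);

		ioctl(fd, RTC_UIE_ON, 0);	/* update IRQs, emulated if need be */
		read(fd, &data, sizeof(data));	/* blocks until the next 1s tick */
		ioctl(fd, RTC_UIE_OFF, 0);
		close(fd);
		return 0;
	}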
diff --git a/Documentation/spinlocks.txt b/Documentation/spinlocks.txt
index 178c831b907..2e3c64b1a6a 100644
--- a/Documentation/spinlocks.txt
+++ b/Documentation/spinlocks.txt
@@ -86,7 +86,7 @@ to change the variables it has to get an exclusive write lock.
 
 The routines look the same as above:
 
-	rwlock_t xxx_lock = RW_LOCK_UNLOCKED;
+	rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock);
 
 	unsigned long flags;
 
@@ -196,25 +196,3 @@ appropriate:
 
 For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or
 __SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate.
-
-SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED are deprecated.  These interfere
-with lockdep state tracking.
-
-Most of the time, you can simply turn:
-	static spinlock_t xxx_lock = SPIN_LOCK_UNLOCKED;
-into:
-	static DEFINE_SPINLOCK(xxx_lock);
-
-Static structure member variables go from:
-
-	struct foo bar {
-		.lock	=	SPIN_LOCK_UNLOCKED;
-	};
-
-to:
-
-	struct foo bar {
-		.lock	=	__SPIN_LOCK_UNLOCKED(bar.lock);
-	};
-
-Declaration of static rw_locks undergo a similar transformation.
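For reference, the lockdep-friendly initializers the remaining text
points to look like this (a short sketch; foo and bar are placeholder
names):

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(xxx_lock);	/* file-scope spinlock */
	static DEFINE_RWLOCK(yyy_lock);		/* file-scope rwlock */

	struct foo {
		spinlock_t lock;
	};

	static struct foo bar = {
		.lock = __SPIN_LOCK_UNLOCKED(bar.lock),
	};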
diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt
index dc52bd442c9..79fcafc7fd6 100644
--- a/Documentation/trace/ftrace-design.txt
+++ b/Documentation/trace/ftrace-design.txt
@@ -247,6 +247,13 @@ You need very few things to get the syscalls tracing in an arch.
 - Support the TIF_SYSCALL_TRACEPOINT thread flags.
 - Put the trace_sys_enter() and trace_sys_exit() tracepoints calls from ptrace
   in the ptrace syscalls tracing path.
+- If the system call table on this arch is more complicated than a simple array
+  of addresses of the system calls, implement an arch_syscall_addr to return
+  the address of a given system call.
+- If the symbol names of the system calls do not match the function names on
+  this arch, define ARCH_HAS_SYSCALL_MATCH_SYM_NAME in asm/ftrace.h and
+  implement arch_syscall_match_sym_name with the appropriate logic to return
+  true if the function name corresponds with the symbol name.
 - Tag this arch as HAVE_SYSCALL_TRACEPOINTS.
 
 
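As an illustration of the last two items, an arch whose text symbols
carry a leading "." (as 64-bit powerpc function descriptors did) might
provide something like the following in asm/ftrace.h (a sketch, not this
patch's code):

	#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME

	static inline bool arch_syscall_match_sym_name(const char *sym,
						       const char *name)
	{
		/* skip the leading "." before comparing against the
		 * sys_* function name */
		return !strcmp(sym + 1, name);
	}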
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 557c1edecca..1ebc24cf9a5 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -80,11 +80,11 @@ of ftrace. Here is a list of some of the key files:
 	tracers listed here can be configured by
 	echoing their name into current_tracer.
 
-  tracing_enabled:
+  tracing_on:
 
-	This sets or displays whether the current_tracer
-	is activated and tracing or not. Echo 0 into this
-	file to disable the tracer or 1 to enable it.
+	This sets or displays whether writing to the trace
+	ring buffer is enabled. Echo 0 into this file to disable
+	the tracer or 1 to enable it.
 
   trace:
 
@@ -202,10 +202,6 @@ Here is the list of current tracers that may be configured.
 	to draw a graph of function calls similar to C code
 	source.
 
-  "sched_switch"
-
-	Traces the context switches and wakeups between tasks.
-
   "irqsoff"
 
 	Traces the areas that disable interrupts and saves
@@ -273,39 +269,6 @@ format, the function name that was traced "path_put" and the
 parent function that called this function "path_walk". The
 timestamp is the time at which the function was entered.
 
-The sched_switch tracer also includes tracing of task wakeups
-and context switches.
-
-     ksoftirqd/1-7     [01]  1453.070013:      7:115:R   +  2916:115:S
-     ksoftirqd/1-7     [01]  1453.070013:      7:115:R   +    10:115:S
-     ksoftirqd/1-7     [01]  1453.070013:      7:115:R ==>    10:115:R
-        events/1-10    [01]  1453.070013:     10:115:S ==>  2916:115:R
-     kondemand/1-2916  [01]  1453.070013:   2916:115:S ==>     7:115:R
-     ksoftirqd/1-7     [01]  1453.070013:      7:115:S ==>     0:140:R
-
-Wake ups are represented by a "+" and the context switches are
-shown as "==>". The format is:
-
- Context switches:
-
-       Previous task              Next Task
-
-  <pid>:<prio>:<state>  ==>  <pid>:<prio>:<state>
-
- Wake ups:
-
-       Current task               Task waking up
-
-  <pid>:<prio>:<state>    +  <pid>:<prio>:<state>
-
-The prio is the internal kernel priority, which is the inverse
-of the priority that is usually displayed by user-space tools.
-Zero represents the highest priority (99). Prio 100 starts the
-"nice" priorities with 100 being equal to nice -20 and 139 being
-nice 19. The prio "140" is reserved for the idle task which is
-the lowest priority thread (pid 0).
-
-
 
 Latency trace format
 --------------------
 
@@ -491,78 +454,10 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
 		latencies, as described in "Latency
 		trace format".
 
-sched_switch
-------------
-
-This tracer simply records schedule switches. Here is an example
-of how to use it.
-
- # echo sched_switch > current_tracer
- # echo 1 > tracing_enabled
- # sleep 1
- # echo 0 > tracing_enabled
- # cat trace
-
-# tracer: sched_switch
-#
-#           TASK-PID   CPU#    TIMESTAMP  FUNCTION
-#              | |      |          |         |
-            bash-3997  [01]   240.132281:   3997:120:R   +  4055:120:R
-            bash-3997  [01]   240.132284:   3997:120:R ==>  4055:120:R
-           sleep-4055  [01]   240.132371:   4055:120:S ==>  3997:120:R
-            bash-3997  [01]   240.132454:   3997:120:R   +  4055:120:S
-            bash-3997  [01]   240.132457:   3997:120:R ==>  4055:120:R
-           sleep-4055  [01]   240.132460:   4055:120:D ==>  3997:120:R
-            bash-3997  [01]   240.132463:   3997:120:R   +  4055:120:D
-            bash-3997  [01]   240.132465:   3997:120:R ==>  4055:120:R
-          <idle>-0     [00]   240.132589:      0:140:R   +     4:115:S
-          <idle>-0     [00]   240.132591:      0:140:R ==>     4:115:R
-     ksoftirqd/0-4     [00]   240.132595:      4:115:S ==>     0:140:R
-          <idle>-0     [00]   240.132598:      0:140:R   +     4:115:S
-          <idle>-0     [00]   240.132599:      0:140:R ==>     4:115:R
-     ksoftirqd/0-4     [00]   240.132603:      4:115:S ==>     0:140:R
-           sleep-4055  [01]   240.133058:   4055:120:S ==>  3997:120:R
-  [...]
-
-
-As we have discussed previously about this format, the header
-shows the name of the trace and points to the options. The
-"FUNCTION" is a misnomer since here it represents the wake ups
-and context switches.
-
-The sched_switch file only lists the wake ups (represented with
-'+') and context switches ('==>') with the previous task or
-current task first followed by the next task or task waking up.
-The format for both of these is PID:KERNEL-PRIO:TASK-STATE.
-Remember that the KERNEL-PRIO is the inverse of the actual
-priority with zero (0) being the highest priority and the nice
-values starting at 100 (nice -20). Below is a quick chart to map
-the kernel priority to user land priorities.
-
-   Kernel Space                     User Space
- ===============================================================
-   0(high) to  98(low)     user RT priority 99(high) to 1(low)
-                           with SCHED_RR or SCHED_FIFO
- ---------------------------------------------------------------
-  99                       sched_priority is not used in scheduling
-                           decisions(it must be specified as 0)
- ---------------------------------------------------------------
- 100(high) to 139(low)     user nice -20(high) to 19(low)
- ---------------------------------------------------------------
- 140                       idle task priority
- ---------------------------------------------------------------
-
-The task states are:
-
- R - running : wants to run, may not actually be running
- S - sleep : process is waiting to be woken up (handles signals)
- D - disk sleep (uninterruptible sleep) : process must be woken up
-	(ignores signals)
- T - stopped : process suspended
- t - traced : process is being traced (with something like gdb)
- Z - zombie : process waiting to be cleaned up
- X - unknown
-
+ overwrite - This controls what happens when the trace buffer is
+             full. If "1" (default), the oldest events are
+             discarded and overwritten. If "0", then the newest
+             events are discarded.
 
 ftrace_enabled
 --------------
@@ -607,10 +502,10 @@ an example:
  # echo irqsoff > current_tracer
  # echo latency-format > trace_options
  # echo 0 > tracing_max_latency
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
  # ls -ltr
  [...]
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
 # cat trace
 # tracer: irqsoff
 #
@@ -715,10 +610,10 @@ is much like the irqsoff tracer.
  # echo preemptoff > current_tracer
  # echo latency-format > trace_options
  # echo 0 > tracing_max_latency
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
  # ls -ltr
  [...]
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
 # cat trace
 # tracer: preemptoff
 #
@@ -863,10 +758,10 @@ tracers.
  # echo preemptirqsoff > current_tracer
  # echo latency-format > trace_options
  # echo 0 > tracing_max_latency
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
  # ls -ltr
  [...]
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
 # cat trace
 # tracer: preemptirqsoff
 #
@@ -1026,9 +921,9 @@ Instead of performing an 'ls', we will run 'sleep 1' under
  # echo wakeup > current_tracer
  # echo latency-format > trace_options
  # echo 0 > tracing_max_latency
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
  # chrt -f 5 sleep 1
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
 # cat trace
 # tracer: wakeup
 #
@@ -1140,9 +1035,9 @@ ftrace_enabled is set; otherwise this tracer is a nop.
 
  # sysctl kernel.ftrace_enabled=1
  # echo function > current_tracer
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
  # usleep 1
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
 # cat trace
 # tracer: function
 #
@@ -1180,7 +1075,7 @@ int trace_fd;
 [...]
 int main(int argc, char *argv[]) {
 	[...]
-	trace_fd = open(tracing_file("tracing_enabled"), O_WRONLY);
+	trace_fd = open(tracing_file("tracing_on"), O_WRONLY);
 	[...]
 	if (condition_hit()) {
 		write(trace_fd, "0", 1);
@@ -1631,9 +1526,9 @@ If I am only interested in sys_nanosleep and hrtimer_interrupt:
  # echo sys_nanosleep hrtimer_interrupt \
 		> set_ftrace_filter
  # echo function > current_tracer
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
  # usleep 1
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
 # cat trace
 # tracer: ftrace
 #
@@ -1879,9 +1774,9 @@ different. The trace is live.
  # echo function > current_tracer
  # cat trace_pipe > /tmp/trace.out &
 [1] 4153
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
  # usleep 1
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
 # cat trace
 # tracer: function
 #
diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index 5f77d94598d..6d27ab8d6e9 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -42,11 +42,25 @@ Synopsis of kprobe_events
   +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)
   NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
   FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
-		  (u8/u16/u32/u64/s8/s16/s32/s64) and string are supported.
+		  (u8/u16/u32/u64/s8/s16/s32/s64), "string" and bitfield
+		  are supported.
 
   (*) only for return probe.
   (**) this is useful for fetching a field of data structures.
 
+Types
+-----
+Several types are supported for fetch-args. The kprobe tracer will access
+memory by the given type. Prefixes 's' and 'u' mean those types are signed
+and unsigned, respectively. Traced arguments are shown in decimal (signed)
+or hex (unsigned). String type is a special type, which fetches a
+"null-terminated" string from kernel space. This means it will fail and
+store NULL if the string container has been paged out.
+Bitfield is another special type, which takes 3 parameters: bit-width,
+bit-offset, and container-size (usually 32). The syntax is:
+
+ b<bit-width>@<bit-offset>/<container-size>
+
 
 Per-Probe Event Filtering
 -------------------------
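A worked example of the bitfield syntax: the probe below fetches a 4-bit
field at bit offset 4 of a 32-bit container. Written as a tiny C helper
for concreteness (the probed symbol, register, and offsets are
illustrative; the debugfs mount point is assumed to be the usual one):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/kernel/debug/tracing/kprobe_events", "a");

		if (!f)
			return 1;
		/* b4@4/32: 4-bit field, bit offset 4, 32-bit container */
		fprintf(f, "p:myprobe do_sys_open mode=%%cx:b4@4/32\n");
		return fclose(f) ? 1 : 0;
	}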
diff --git a/arch/alpha/include/asm/fcntl.h b/arch/alpha/include/asm/fcntl.h
index 70145cbb21c..1b71ca70c9f 100644
--- a/arch/alpha/include/asm/fcntl.h
+++ b/arch/alpha/include/asm/fcntl.h
@@ -31,6 +31,8 @@
 #define __O_SYNC	020000000
 #define O_SYNC		(__O_SYNC|O_DSYNC)
 
+#define O_PATH		040000000
+
 #define F_GETLK		7
 #define F_SETLK		8
 #define F_SETLKW	9
diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h
index 945de222ab9..e8a761aee08 100644
--- a/arch/alpha/include/asm/futex.h
+++ b/arch/alpha/include/asm/futex.h
@@ -29,7 +29,7 @@
29 : "r" (uaddr), "r"(oparg) \ 29 : "r" (uaddr), "r"(oparg) \
30 : "memory") 30 : "memory")
31 31
32static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) 32static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
33{ 33{
34 int op = (encoded_op >> 28) & 7; 34 int op = (encoded_op >> 28) & 7;
35 int cmp = (encoded_op >> 24) & 15; 35 int cmp = (encoded_op >> 24) & 15;
@@ -39,7 +39,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
39 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 39 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
40 oparg = 1 << oparg; 40 oparg = 1 << oparg;
41 41
42 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 42 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
43 return -EFAULT; 43 return -EFAULT;
44 44
45 pagefault_disable(); 45 pagefault_disable();
@@ -81,21 +81,23 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
81} 81}
82 82
83static inline int 83static inline int
84futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) 84futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
85 u32 oldval, u32 newval)
85{ 86{
86 int prev, cmp; 87 int ret = 0, cmp;
88 u32 prev;
87 89
88 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 90 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
89 return -EFAULT; 91 return -EFAULT;
90 92
91 __asm__ __volatile__ ( 93 __asm__ __volatile__ (
92 __ASM_SMP_MB 94 __ASM_SMP_MB
93 "1: ldl_l %0,0(%2)\n" 95 "1: ldl_l %1,0(%3)\n"
94 " cmpeq %0,%3,%1\n" 96 " cmpeq %1,%4,%2\n"
95 " beq %1,3f\n" 97 " beq %2,3f\n"
96 " mov %4,%1\n" 98 " mov %5,%2\n"
97 "2: stl_c %1,0(%2)\n" 99 "2: stl_c %2,0(%3)\n"
98 " beq %1,4f\n" 100 " beq %2,4f\n"
99 "3: .subsection 2\n" 101 "3: .subsection 2\n"
100 "4: br 1b\n" 102 "4: br 1b\n"
101 " .previous\n" 103 " .previous\n"
@@ -105,11 +107,12 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
105 " .long 2b-.\n" 107 " .long 2b-.\n"
106 " lda $31,3b-2b(%0)\n" 108 " lda $31,3b-2b(%0)\n"
107 " .previous\n" 109 " .previous\n"
108 : "=&r"(prev), "=&r"(cmp) 110 : "+r"(ret), "=&r"(prev), "=&r"(cmp)
109 : "r"(uaddr), "r"((long)oldval), "r"(newval) 111 : "r"(uaddr), "r"((long)oldval), "r"(newval)
110 : "memory"); 112 : "memory");
111 113
112 return prev; 114 *uval = prev;
115 return ret;
113} 116}
114 117
115#endif /* __KERNEL__ */ 118#endif /* __KERNEL__ */
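
This signature change is the theme of all the futex hunks in this series: the
old interface returned the previous value, which made a real -EFAULT
indistinguishable from a stored value that happened to equal -EFAULT. The new
interface returns 0 or -EFAULT and passes the old value out through *uval. A
caller-side sketch (illustrative, not the exact futex core code; uaddr,
oldval, newval come from surrounding context):

	u32 curval;

	if (futex_atomic_cmpxchg_inatomic(&curval, uaddr, oldval, newval))
		return -EFAULT;		/* fault while touching *uaddr */
	if (curval != oldval)
		return -EAGAIN;		/* raced; the value changed under us */
	/* curval == oldval: newval was stored */
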
diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
index 1570c0b5433..a83bbea62c6 100644
--- a/arch/alpha/include/asm/rwsem.h
+++ b/arch/alpha/include/asm/rwsem.h
@@ -13,44 +13,13 @@
13#ifdef __KERNEL__ 13#ifdef __KERNEL__
14 14
15#include <linux/compiler.h> 15#include <linux/compiler.h>
16#include <linux/list.h>
17#include <linux/spinlock.h>
18 16
19struct rwsem_waiter;
20
21extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
22extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
23extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
24extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
25
26/*
27 * the semaphore definition
28 */
29struct rw_semaphore {
30 long count;
31#define RWSEM_UNLOCKED_VALUE 0x0000000000000000L 17#define RWSEM_UNLOCKED_VALUE 0x0000000000000000L
32#define RWSEM_ACTIVE_BIAS 0x0000000000000001L 18#define RWSEM_ACTIVE_BIAS 0x0000000000000001L
33#define RWSEM_ACTIVE_MASK 0x00000000ffffffffL 19#define RWSEM_ACTIVE_MASK 0x00000000ffffffffL
34#define RWSEM_WAITING_BIAS (-0x0000000100000000L) 20#define RWSEM_WAITING_BIAS (-0x0000000100000000L)
35#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS 21#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
36#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) 22#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
37 spinlock_t wait_lock;
38 struct list_head wait_list;
39};
40
41#define __RWSEM_INITIALIZER(name) \
42 { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \
43 LIST_HEAD_INIT((name).wait_list) }
44
45#define DECLARE_RWSEM(name) \
46 struct rw_semaphore name = __RWSEM_INITIALIZER(name)
47
48static inline void init_rwsem(struct rw_semaphore *sem)
49{
50 sem->count = RWSEM_UNLOCKED_VALUE;
51 spin_lock_init(&sem->wait_lock);
52 INIT_LIST_HEAD(&sem->wait_list);
53}
54 23
55static inline void __down_read(struct rw_semaphore *sem) 24static inline void __down_read(struct rw_semaphore *sem)
56{ 25{
@@ -250,10 +219,5 @@ static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem)
250#endif 219#endif
251} 220}
252 221
253static inline int rwsem_is_locked(struct rw_semaphore *sem)
254{
255 return (sem->count != 0);
256}
257
258#endif /* __KERNEL__ */ 222#endif /* __KERNEL__ */
259#endif /* _ALPHA_RWSEM_H */ 223#endif /* _ALPHA_RWSEM_H */
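
With the structure definition, initializers, and rwsem_is_locked() moved to
the generic <linux/rwsem.h>, only the count-bias constants and the fast paths
stay per-architecture. How the 64-bit count encodes state under the biases
kept above (illustrative arithmetic; the function name is made up):

	/* sketch: rwsem state encoding in the count word */
	static long rwsem_count_demo(void)
	{
		long c = RWSEM_UNLOCKED_VALUE;	/* 0x0000000000000000: unlocked */

		c += RWSEM_ACTIVE_READ_BIAS;	/* 0x0000000000000001: one reader */
		c += RWSEM_ACTIVE_READ_BIAS;	/* 0x0000000000000002: two readers */
		c  = RWSEM_ACTIVE_WRITE_BIAS;	/* 0xffffffff00000001: writer holds it */
		return c & RWSEM_ACTIVE_MASK;	/* active holders; negative c => waiters */
	}
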
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index fe698b5045e..376f2213079 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -230,44 +230,24 @@ linux_to_osf_statfs(struct kstatfs *linux_stat, struct osf_statfs __user *osf_st
230 return copy_to_user(osf_stat, &tmp_stat, bufsiz) ? -EFAULT : 0; 230 return copy_to_user(osf_stat, &tmp_stat, bufsiz) ? -EFAULT : 0;
231} 231}
232 232
233static int 233SYSCALL_DEFINE3(osf_statfs, const char __user *, pathname,
234do_osf_statfs(struct path *path, struct osf_statfs __user *buffer, 234 struct osf_statfs __user *, buffer, unsigned long, bufsiz)
235 unsigned long bufsiz)
236{ 235{
237 struct kstatfs linux_stat; 236 struct kstatfs linux_stat;
238 int error = vfs_statfs(path, &linux_stat); 237 int error = user_statfs(pathname, &linux_stat);
239 if (!error) 238 if (!error)
240 error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz); 239 error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz);
241 return error; 240 return error;
242} 241}
243 242
244SYSCALL_DEFINE3(osf_statfs, const char __user *, pathname,
245 struct osf_statfs __user *, buffer, unsigned long, bufsiz)
246{
247 struct path path;
248 int retval;
249
250 retval = user_path(pathname, &path);
251 if (!retval) {
252 retval = do_osf_statfs(&path, buffer, bufsiz);
253 path_put(&path);
254 }
255 return retval;
256}
257
258SYSCALL_DEFINE3(osf_fstatfs, unsigned long, fd, 243SYSCALL_DEFINE3(osf_fstatfs, unsigned long, fd,
259 struct osf_statfs __user *, buffer, unsigned long, bufsiz) 244 struct osf_statfs __user *, buffer, unsigned long, bufsiz)
260{ 245{
261 struct file *file; 246 struct kstatfs linux_stat;
262 int retval; 247 int error = fd_statfs(fd, &linux_stat);
263 248 if (!error)
264 retval = -EBADF; 249 error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz);
265 file = fget(fd); 250 return error;
266 if (file) {
267 retval = do_osf_statfs(&file->f_path, buffer, bufsiz);
268 fput(file);
269 }
270 return retval;
271} 251}
272 252
273/* 253/*
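
The boilerplate each caller used to carry (user_path()/path_put() or
fget()/fput()) is now folded into the VFS helpers user_statfs() and
fd_statfs(). Roughly what the path-based helper does internally (a sketch,
not the exact fs/statfs.c code):

	int user_statfs(const char __user *pathname, struct kstatfs *st)
	{
		struct path path;
		int error = user_path(pathname, &path);

		if (!error) {
			error = vfs_statfs(&path, st);	/* fill *st from the mount */
			path_put(&path);
		}
		return error;
	}
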
diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c
index c1f3e7cb82a..a58e84f1a63 100644
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -159,7 +159,7 @@ void read_persistent_clock(struct timespec *ts)
159 159
160/* 160/*
161 * timer_interrupt() needs to keep up the real-time clock, 161 * timer_interrupt() needs to keep up the real-time clock,
162 * as well as call the "do_timer()" routine every clocktick 162 * as well as call the "xtime_update()" routine every clocktick
163 */ 163 */
164irqreturn_t timer_interrupt(int irq, void *dev) 164irqreturn_t timer_interrupt(int irq, void *dev)
165{ 165{
@@ -172,8 +172,6 @@ irqreturn_t timer_interrupt(int irq, void *dev)
172 profile_tick(CPU_PROFILING); 172 profile_tick(CPU_PROFILING);
173#endif 173#endif
174 174
175 write_seqlock(&xtime_lock);
176
177 /* 175 /*
178 * Calculate how many ticks have passed since the last update, 176 * Calculate how many ticks have passed since the last update,
179 * including any previous partial leftover. Save any resulting 177 * including any previous partial leftover. Save any resulting
@@ -187,9 +185,7 @@ irqreturn_t timer_interrupt(int irq, void *dev)
187 nticks = delta >> FIX_SHIFT; 185 nticks = delta >> FIX_SHIFT;
188 186
189 if (nticks) 187 if (nticks)
190 do_timer(nticks); 188 xtime_update(nticks);
191
192 write_sequnlock(&xtime_lock);
193 189
194 if (test_irq_work_pending()) { 190 if (test_irq_work_pending()) {
195 clear_irq_work_pending(); 191 clear_irq_work_pending();
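
xtime_update() is the generic replacement for the open-coded sequence removed
above; in the timekeeping core it is essentially:

	void xtime_update(unsigned long ticks)
	{
		write_seqlock(&xtime_lock);
		do_timer(ticks);
		write_sequnlock(&xtime_lock);
	}

This takes xtime_lock out of every architecture's timer interrupt, which is
what the remaining time.c hunks in this series do file by file.
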
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index b33fe7065b3..199a6b6de7f 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -35,7 +35,7 @@
35 : "cc", "memory") 35 : "cc", "memory")
36 36
37static inline int 37static inline int
38futex_atomic_op_inuser (int encoded_op, int __user *uaddr) 38futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
39{ 39{
40 int op = (encoded_op >> 28) & 7; 40 int op = (encoded_op >> 28) & 7;
41 int cmp = (encoded_op >> 24) & 15; 41 int cmp = (encoded_op >> 24) & 15;
@@ -46,7 +46,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
46 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 46 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
47 oparg = 1 << oparg; 47 oparg = 1 << oparg;
48 48
49 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 49 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
50 return -EFAULT; 50 return -EFAULT;
51 51
52 pagefault_disable(); /* implies preempt_disable() */ 52 pagefault_disable(); /* implies preempt_disable() */
@@ -88,36 +88,35 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
88} 88}
89 89
90static inline int 90static inline int
91futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) 91futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
92 u32 oldval, u32 newval)
92{ 93{
93 int val; 94 int ret = 0;
95 u32 val;
94 96
95 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 97 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
96 return -EFAULT; 98 return -EFAULT;
97 99
98 pagefault_disable(); /* implies preempt_disable() */
99
100 __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n" 100 __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
101 "1: " T(ldr) " %0, [%3]\n" 101 "1: " T(ldr) " %1, [%4]\n"
102 " teq %0, %1\n" 102 " teq %1, %2\n"
103 " it eq @ explicit IT needed for the 2b label\n" 103 " it eq @ explicit IT needed for the 2b label\n"
104 "2: " T(streq) " %2, [%3]\n" 104 "2: " T(streq) " %3, [%4]\n"
105 "3:\n" 105 "3:\n"
106 " .pushsection __ex_table,\"a\"\n" 106 " .pushsection __ex_table,\"a\"\n"
107 " .align 3\n" 107 " .align 3\n"
108 " .long 1b, 4f, 2b, 4f\n" 108 " .long 1b, 4f, 2b, 4f\n"
109 " .popsection\n" 109 " .popsection\n"
110 " .pushsection .fixup,\"ax\"\n" 110 " .pushsection .fixup,\"ax\"\n"
111 "4: mov %0, %4\n" 111 "4: mov %0, %5\n"
112 " b 3b\n" 112 " b 3b\n"
113 " .popsection" 113 " .popsection"
114 : "=&r" (val) 114 : "+r" (ret), "=&r" (val)
115 : "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT) 115 : "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT)
116 : "cc", "memory"); 116 : "cc", "memory");
117 117
118 pagefault_enable(); /* subsumes preempt_enable() */ 118 *uval = val;
119 119 return ret;
120 return val;
121} 120}
122 121
123#endif /* !SMP */ 122#endif /* !SMP */
diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c
index 3d76bf23373..1ff46cabc7e 100644
--- a/arch/arm/kernel/time.c
+++ b/arch/arm/kernel/time.c
@@ -107,9 +107,7 @@ void timer_tick(void)
107{ 107{
108 profile_tick(CPU_PROFILING); 108 profile_tick(CPU_PROFILING);
109 do_leds(); 109 do_leds();
110 write_seqlock(&xtime_lock); 110 xtime_update(1);
111 do_timer(1);
112 write_sequnlock(&xtime_lock);
113#ifndef CONFIG_SMP 111#ifndef CONFIG_SMP
114 update_process_times(user_mode(get_irq_regs())); 112 update_process_times(user_mode(get_irq_regs()));
115#endif 113#endif
diff --git a/arch/arm/mach-clps711x/include/mach/time.h b/arch/arm/mach-clps711x/include/mach/time.h
index 8fe283ccd1f..61fef9129c6 100644
--- a/arch/arm/mach-clps711x/include/mach/time.h
+++ b/arch/arm/mach-clps711x/include/mach/time.h
@@ -30,7 +30,7 @@ p720t_timer_interrupt(int irq, void *dev_id)
30{ 30{
31 struct pt_regs *regs = get_irq_regs(); 31 struct pt_regs *regs = get_irq_regs();
32 do_leds(); 32 do_leds();
33 do_timer(1); 33 xtime_update(1);
34#ifndef CONFIG_SMP 34#ifndef CONFIG_SMP
35 update_process_times(user_mode(regs)); 35 update_process_times(user_mode(regs));
36#endif 36#endif
diff --git a/arch/blackfin/kernel/time.c b/arch/blackfin/kernel/time.c
index c9113619029..8d73724c009 100644
--- a/arch/blackfin/kernel/time.c
+++ b/arch/blackfin/kernel/time.c
@@ -114,16 +114,14 @@ u32 arch_gettimeoffset(void)
114 114
115/* 115/*
116 * timer_interrupt() needs to keep up the real-time clock, 116 * timer_interrupt() needs to keep up the real-time clock,
117 * as well as call the "do_timer()" routine every clocktick 117 * as well as call the "xtime_update()" routine every clocktick
118 */ 118 */
119#ifdef CONFIG_CORE_TIMER_IRQ_L1 119#ifdef CONFIG_CORE_TIMER_IRQ_L1
120__attribute__((l1_text)) 120__attribute__((l1_text))
121#endif 121#endif
122irqreturn_t timer_interrupt(int irq, void *dummy) 122irqreturn_t timer_interrupt(int irq, void *dummy)
123{ 123{
124 write_seqlock(&xtime_lock); 124 xtime_update(1);
125 do_timer(1);
126 write_sequnlock(&xtime_lock);
127 125
128#ifdef CONFIG_IPIPE 126#ifdef CONFIG_IPIPE
129 update_root_process_times(get_irq_regs()); 127 update_root_process_times(get_irq_regs());
diff --git a/arch/cris/arch-v10/kernel/time.c b/arch/cris/arch-v10/kernel/time.c
index 00eb36f8deb..20c85b5dc7d 100644
--- a/arch/cris/arch-v10/kernel/time.c
+++ b/arch/cris/arch-v10/kernel/time.c
@@ -140,7 +140,7 @@ stop_watchdog(void)
140 140
141/* 141/*
142 * timer_interrupt() needs to keep up the real-time clock, 142 * timer_interrupt() needs to keep up the real-time clock,
143 * as well as call the "do_timer()" routine every clocktick 143 * as well as call the "xtime_update()" routine every clocktick
144 */ 144 */
145 145
146//static unsigned short myjiff; /* used by our debug routine print_timestamp */ 146//static unsigned short myjiff; /* used by our debug routine print_timestamp */
@@ -176,7 +176,7 @@ timer_interrupt(int irq, void *dev_id)
176 176
177 /* call the real timer interrupt handler */ 177 /* call the real timer interrupt handler */
178 178
179 do_timer(1); 179 xtime_update(1);
180 180
181 cris_do_profile(regs); /* Save profiling information */ 181 cris_do_profile(regs); /* Save profiling information */
182 return IRQ_HANDLED; 182 return IRQ_HANDLED;
diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c
index 84fed3b4b07..4c9e3e1ba5d 100644
--- a/arch/cris/arch-v32/kernel/smp.c
+++ b/arch/cris/arch-v32/kernel/smp.c
@@ -26,7 +26,9 @@
26#define FLUSH_ALL (void*)0xffffffff 26#define FLUSH_ALL (void*)0xffffffff
27 27
28/* Vector of locks used for various atomic operations */ 28/* Vector of locks used for various atomic operations */
29spinlock_t cris_atomic_locks[] = { [0 ... LOCK_COUNT - 1] = SPIN_LOCK_UNLOCKED}; 29spinlock_t cris_atomic_locks[] = {
30 [0 ... LOCK_COUNT - 1] = __SPIN_LOCK_UNLOCKED(cris_atomic_locks)
31};
30 32
31/* CPU masks */ 33/* CPU masks */
32cpumask_t phys_cpu_present_map = CPU_MASK_NONE; 34cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
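
SPIN_LOCK_UNLOCKED is being retired because it cannot hand lockdep a distinct
class key; __SPIN_LOCK_UNLOCKED(name) takes the lock (or, as here, the array)
name to generate one. The same pattern for any statically initialized lock
array (sketch; my_locks is a made-up name):

	static spinlock_t my_locks[4] = {
		[0 ... 3] = __SPIN_LOCK_UNLOCKED(my_locks)
	};
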
diff --git a/arch/cris/arch-v32/kernel/time.c b/arch/cris/arch-v32/kernel/time.c
index a545211e999..bb978ede898 100644
--- a/arch/cris/arch-v32/kernel/time.c
+++ b/arch/cris/arch-v32/kernel/time.c
@@ -183,7 +183,7 @@ void handle_watchdog_bite(struct pt_regs *regs)
183 183
184/* 184/*
185 * timer_interrupt() needs to keep up the real-time clock, 185 * timer_interrupt() needs to keep up the real-time clock,
186 * as well as call the "do_timer()" routine every clocktick. 186 * as well as call the "xtime_update()" routine every clocktick.
187 */ 187 */
188extern void cris_do_profile(struct pt_regs *regs); 188extern void cris_do_profile(struct pt_regs *regs);
189 189
@@ -216,9 +216,7 @@ static inline irqreturn_t timer_interrupt(int irq, void *dev_id)
216 return IRQ_HANDLED; 216 return IRQ_HANDLED;
217 217
218 /* Call the real timer interrupt handler */ 218 /* Call the real timer interrupt handler */
219 write_seqlock(&xtime_lock); 219 xtime_update(1);
220 do_timer(1);
221 write_sequnlock(&xtime_lock);
222 return IRQ_HANDLED; 220 return IRQ_HANDLED;
223} 221}
224 222
diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h
index 08b3d1da358..4bea27f50a7 100644
--- a/arch/frv/include/asm/futex.h
+++ b/arch/frv/include/asm/futex.h
@@ -7,10 +7,11 @@
7#include <asm/errno.h> 7#include <asm/errno.h>
8#include <asm/uaccess.h> 8#include <asm/uaccess.h>
9 9
10extern int futex_atomic_op_inuser(int encoded_op, int __user *uaddr); 10extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr);
11 11
12static inline int 12static inline int
13futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) 13futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
14 u32 oldval, u32 newval)
14{ 15{
15 return -ENOSYS; 16 return -ENOSYS;
16} 17}
diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c
index 14f64b054c7..d155ca9e509 100644
--- a/arch/frv/kernel/futex.c
+++ b/arch/frv/kernel/futex.c
@@ -18,7 +18,7 @@
18 * the various futex operations; MMU fault checking is ignored under no-MMU 18 * the various futex operations; MMU fault checking is ignored under no-MMU
19 * conditions 19 * conditions
20 */ 20 */
21static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr, int *_oldval) 21static inline int atomic_futex_op_xchg_set(int oparg, u32 __user *uaddr, int *_oldval)
22{ 22{
23 int oldval, ret; 23 int oldval, ret;
24 24
@@ -50,7 +50,7 @@ static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr, int *_o
50 return ret; 50 return ret;
51} 51}
52 52
53static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr, int *_oldval) 53static inline int atomic_futex_op_xchg_add(int oparg, u32 __user *uaddr, int *_oldval)
54{ 54{
55 int oldval, ret; 55 int oldval, ret;
56 56
@@ -83,7 +83,7 @@ static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr, int *_o
83 return ret; 83 return ret;
84} 84}
85 85
86static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr, int *_oldval) 86static inline int atomic_futex_op_xchg_or(int oparg, u32 __user *uaddr, int *_oldval)
87{ 87{
88 int oldval, ret; 88 int oldval, ret;
89 89
@@ -116,7 +116,7 @@ static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr, int *_ol
116 return ret; 116 return ret;
117} 117}
118 118
119static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr, int *_oldval) 119static inline int atomic_futex_op_xchg_and(int oparg, u32 __user *uaddr, int *_oldval)
120{ 120{
121 int oldval, ret; 121 int oldval, ret;
122 122
@@ -149,7 +149,7 @@ static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr, int *_o
149 return ret; 149 return ret;
150} 150}
151 151
152static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr, int *_oldval) 152static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr, int *_oldval)
153{ 153{
154 int oldval, ret; 154 int oldval, ret;
155 155
@@ -186,7 +186,7 @@ static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr, int *_o
186/* 186/*
187 * do the futex operations 187 * do the futex operations
188 */ 188 */
189int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) 189int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
190{ 190{
191 int op = (encoded_op >> 28) & 7; 191 int op = (encoded_op >> 28) & 7;
192 int cmp = (encoded_op >> 24) & 15; 192 int cmp = (encoded_op >> 24) & 15;
@@ -197,7 +197,7 @@ int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
197 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 197 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
198 oparg = 1 << oparg; 198 oparg = 1 << oparg;
199 199
200 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 200 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
201 return -EFAULT; 201 return -EFAULT;
202 202
203 pagefault_disable(); 203 pagefault_disable();
diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c
index 0ddbbae83cb..b457de496b7 100644
--- a/arch/frv/kernel/time.c
+++ b/arch/frv/kernel/time.c
@@ -50,21 +50,13 @@ static struct irqaction timer_irq = {
50 50
51/* 51/*
52 * timer_interrupt() needs to keep up the real-time clock, 52 * timer_interrupt() needs to keep up the real-time clock,
53 * as well as call the "do_timer()" routine every clocktick 53 * as well as call the "xtime_update()" routine every clocktick
54 */ 54 */
55static irqreturn_t timer_interrupt(int irq, void *dummy) 55static irqreturn_t timer_interrupt(int irq, void *dummy)
56{ 56{
57 profile_tick(CPU_PROFILING); 57 profile_tick(CPU_PROFILING);
58 /*
59 * Here we are in the timer irq handler. We just have irqs locally
60 * disabled but we don't know if the timer_bh is running on the other
61 * CPU. We need to avoid to SMP race with it. NOTE: we don't need
62 * the irq version of write_lock because as just said we have irq
63 * locally disabled. -arca
64 */
65 write_seqlock(&xtime_lock);
66 58
67 do_timer(1); 59 xtime_update(1);
68 60
69#ifdef CONFIG_HEARTBEAT 61#ifdef CONFIG_HEARTBEAT
70 static unsigned short n; 62 static unsigned short n;
@@ -72,8 +64,6 @@ static irqreturn_t timer_interrupt(int irq, void *dummy)
72 __set_LEDS(n); 64 __set_LEDS(n);
73#endif /* CONFIG_HEARTBEAT */ 65#endif /* CONFIG_HEARTBEAT */
74 66
75 write_sequnlock(&xtime_lock);
76
77 update_process_times(user_mode(get_irq_regs())); 67 update_process_times(user_mode(get_irq_regs()));
78 68
79 return IRQ_HANDLED; 69 return IRQ_HANDLED;
diff --git a/arch/h8300/kernel/time.c b/arch/h8300/kernel/time.c
index 165005aff9d..32263a138aa 100644
--- a/arch/h8300/kernel/time.c
+++ b/arch/h8300/kernel/time.c
@@ -35,9 +35,7 @@ void h8300_timer_tick(void)
35{ 35{
36 if (current->pid) 36 if (current->pid)
37 profile_tick(CPU_PROFILING); 37 profile_tick(CPU_PROFILING);
38 write_seqlock(&xtime_lock); 38 xtime_update(1);
39 do_timer(1);
40 write_sequnlock(&xtime_lock);
41 update_process_times(user_mode(get_irq_regs())); 39 update_process_times(user_mode(get_irq_regs()));
42} 40}
43 41
diff --git a/arch/h8300/kernel/timer/timer8.c b/arch/h8300/kernel/timer/timer8.c
index 3946c0fa837..7a1533fad47 100644
--- a/arch/h8300/kernel/timer/timer8.c
+++ b/arch/h8300/kernel/timer/timer8.c
@@ -61,7 +61,7 @@
61 61
62/* 62/*
63 * timer_interrupt() needs to keep up the real-time clock, 63 * timer_interrupt() needs to keep up the real-time clock,
64 * as well as call the "do_timer()" routine every clocktick 64 * as well as call the "xtime_update()" routine every clocktick
65 */ 65 */
66 66
67static irqreturn_t timer_interrupt(int irq, void *dev_id) 67static irqreturn_t timer_interrupt(int irq, void *dev_id)
diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h
index c7f0f062239..8428525ddb2 100644
--- a/arch/ia64/include/asm/futex.h
+++ b/arch/ia64/include/asm/futex.h
@@ -46,7 +46,7 @@ do { \
46} while (0) 46} while (0)
47 47
48static inline int 48static inline int
49futex_atomic_op_inuser (int encoded_op, int __user *uaddr) 49futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
50{ 50{
51 int op = (encoded_op >> 28) & 7; 51 int op = (encoded_op >> 28) & 7;
52 int cmp = (encoded_op >> 24) & 15; 52 int cmp = (encoded_op >> 24) & 15;
@@ -56,7 +56,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
56 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 56 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
57 oparg = 1 << oparg; 57 oparg = 1 << oparg;
58 58
59 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) 59 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
60 return -EFAULT; 60 return -EFAULT;
61 61
62 pagefault_disable(); 62 pagefault_disable();
@@ -100,23 +100,26 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
100} 100}
101 101
102static inline int 102static inline int
103futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) 103futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
104 u32 oldval, u32 newval)
104{ 105{
105 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 106 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
106 return -EFAULT; 107 return -EFAULT;
107 108
108 { 109 {
109 register unsigned long r8 __asm ("r8"); 110 register unsigned long r8 __asm ("r8") = 0;
111 unsigned long prev;
110 __asm__ __volatile__( 112 __asm__ __volatile__(
111 " mf;; \n" 113 " mf;; \n"
112 " mov ar.ccv=%3;; \n" 114 " mov ar.ccv=%3;; \n"
113 "[1:] cmpxchg4.acq %0=[%1],%2,ar.ccv \n" 115 "[1:] cmpxchg4.acq %0=[%1],%2,ar.ccv \n"
114 " .xdata4 \"__ex_table\", 1b-., 2f-. \n" 116 " .xdata4 \"__ex_table\", 1b-., 2f-. \n"
115 "[2:]" 117 "[2:]"
116 : "=r" (r8) 118 : "=r" (prev)
117 : "r" (uaddr), "r" (newval), 119 : "r" (uaddr), "r" (newval),
118 "rO" ((long) (unsigned) oldval) 120 "rO" ((long) (unsigned) oldval)
119 : "memory"); 121 : "memory");
122 *uval = prev;
120 return r8; 123 return r8;
121 } 124 }
122} 125}
diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h
index 215d5454c7d..3027e7516d8 100644
--- a/arch/ia64/include/asm/rwsem.h
+++ b/arch/ia64/include/asm/rwsem.h
@@ -25,20 +25,8 @@
25#error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead." 25#error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead."
26#endif 26#endif
27 27
28#include <linux/list.h>
29#include <linux/spinlock.h>
30
31#include <asm/intrinsics.h> 28#include <asm/intrinsics.h>
32 29
33/*
34 * the semaphore definition
35 */
36struct rw_semaphore {
37 signed long count;
38 spinlock_t wait_lock;
39 struct list_head wait_list;
40};
41
42#define RWSEM_UNLOCKED_VALUE __IA64_UL_CONST(0x0000000000000000) 30#define RWSEM_UNLOCKED_VALUE __IA64_UL_CONST(0x0000000000000000)
43#define RWSEM_ACTIVE_BIAS (1L) 31#define RWSEM_ACTIVE_BIAS (1L)
44#define RWSEM_ACTIVE_MASK (0xffffffffL) 32#define RWSEM_ACTIVE_MASK (0xffffffffL)
@@ -46,26 +34,6 @@ struct rw_semaphore {
46#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS 34#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
47#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) 35#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
48 36
49#define __RWSEM_INITIALIZER(name) \
50 { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
51 LIST_HEAD_INIT((name).wait_list) }
52
53#define DECLARE_RWSEM(name) \
54 struct rw_semaphore name = __RWSEM_INITIALIZER(name)
55
56extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
57extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
58extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
59extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
60
61static inline void
62init_rwsem (struct rw_semaphore *sem)
63{
64 sem->count = RWSEM_UNLOCKED_VALUE;
65 spin_lock_init(&sem->wait_lock);
66 INIT_LIST_HEAD(&sem->wait_list);
67}
68
69/* 37/*
70 * lock for reading 38 * lock for reading
71 */ 39 */
@@ -174,9 +142,4 @@ __downgrade_write (struct rw_semaphore *sem)
174#define rwsem_atomic_add(delta, sem) atomic64_add(delta, (atomic64_t *)(&(sem)->count)) 142#define rwsem_atomic_add(delta, sem) atomic64_add(delta, (atomic64_t *)(&(sem)->count))
175#define rwsem_atomic_update(delta, sem) atomic64_add_return(delta, (atomic64_t *)(&(sem)->count)) 143#define rwsem_atomic_update(delta, sem) atomic64_add_return(delta, (atomic64_t *)(&(sem)->count))
176 144
177static inline int rwsem_is_locked(struct rw_semaphore *sem)
178{
179 return (sem->count != 0);
180}
181
182#endif /* _ASM_IA64_RWSEM_H */ 145#endif /* _ASM_IA64_RWSEM_H */
diff --git a/arch/ia64/include/asm/xen/hypercall.h b/arch/ia64/include/asm/xen/hypercall.h
index 96fc62366aa..ed28bcd5bb8 100644
--- a/arch/ia64/include/asm/xen/hypercall.h
+++ b/arch/ia64/include/asm/xen/hypercall.h
@@ -107,7 +107,7 @@ extern unsigned long __hypercall(unsigned long a1, unsigned long a2,
107static inline int 107static inline int
108xencomm_arch_hypercall_sched_op(int cmd, struct xencomm_handle *arg) 108xencomm_arch_hypercall_sched_op(int cmd, struct xencomm_handle *arg)
109{ 109{
110 return _hypercall2(int, sched_op_new, cmd, arg); 110 return _hypercall2(int, sched_op, cmd, arg);
111} 111}
112 112
113static inline long 113static inline long
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 9702fa92489..156ad803d5b 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -190,19 +190,10 @@ timer_interrupt (int irq, void *dev_id)
190 190
191 new_itm += local_cpu_data->itm_delta; 191 new_itm += local_cpu_data->itm_delta;
192 192
193 if (smp_processor_id() == time_keeper_id) { 193 if (smp_processor_id() == time_keeper_id)
194 /* 194 xtime_update(1);
195 * Here we are in the timer irq handler. We have irqs locally 195
196 * disabled, but we don't know if the timer_bh is running on 196 local_cpu_data->itm_next = new_itm;
197 * another CPU. We need to avoid to SMP race by acquiring the
198 * xtime_lock.
199 */
200 write_seqlock(&xtime_lock);
201 do_timer(1);
202 local_cpu_data->itm_next = new_itm;
203 write_sequnlock(&xtime_lock);
204 } else
205 local_cpu_data->itm_next = new_itm;
206 197
207 if (time_after(new_itm, ia64_get_itc())) 198 if (time_after(new_itm, ia64_get_itc()))
208 break; 199 break;
@@ -222,7 +213,7 @@ skip_process_time_accounting:
222 * comfort, we increase the safety margin by 213 * comfort, we increase the safety margin by
223 * intentionally dropping the next tick(s). We do NOT 214 * intentionally dropping the next tick(s). We do NOT
224 * update itm.next because that would force us to call 215 * update itm.next because that would force us to call
225 * do_timer() which in turn would let our clock run 216 * xtime_update() which in turn would let our clock run
226 * too fast (with the potentially devastating effect 217 * too fast (with the potentially devastating effect
227 * of losing monotony of time). 218 * of losing monotony of time).
228 */ 219 */
diff --git a/arch/ia64/xen/suspend.c b/arch/ia64/xen/suspend.c
index fd66b048c6f..419c8620945 100644
--- a/arch/ia64/xen/suspend.c
+++ b/arch/ia64/xen/suspend.c
@@ -37,19 +37,14 @@ xen_mm_unpin_all(void)
37 /* nothing */ 37 /* nothing */
38} 38}
39 39
40void xen_pre_device_suspend(void)
41{
42 /* nothing */
43}
44
45void 40void
46xen_pre_suspend() 41xen_arch_pre_suspend()
47{ 42{
48 /* nothing */ 43 /* nothing */
49} 44}
50 45
51void 46void
52xen_post_suspend(int suspend_cancelled) 47xen_arch_post_suspend(int suspend_cancelled)
53{ 48{
54 if (suspend_cancelled) 49 if (suspend_cancelled)
55 return; 50 return;
diff --git a/arch/ia64/xen/time.c b/arch/ia64/xen/time.c
index c1c544513e8..1f8244a78be 100644
--- a/arch/ia64/xen/time.c
+++ b/arch/ia64/xen/time.c
@@ -139,14 +139,11 @@ consider_steal_time(unsigned long new_itm)
139 run_posix_cpu_timers(p); 139 run_posix_cpu_timers(p);
140 delta_itm += local_cpu_data->itm_delta * (stolen + blocked); 140 delta_itm += local_cpu_data->itm_delta * (stolen + blocked);
141 141
142 if (cpu == time_keeper_id) { 142 if (cpu == time_keeper_id)
143 write_seqlock(&xtime_lock); 143 xtime_update(stolen + blocked);
144 do_timer(stolen + blocked); 144
145 local_cpu_data->itm_next = delta_itm + new_itm; 145 local_cpu_data->itm_next = delta_itm + new_itm;
146 write_sequnlock(&xtime_lock); 146
147 } else {
148 local_cpu_data->itm_next = delta_itm + new_itm;
149 }
150 per_cpu(xen_stolen_time, cpu) += NS_PER_TICK * stolen; 147 per_cpu(xen_stolen_time, cpu) += NS_PER_TICK * stolen;
151 per_cpu(xen_blocked_time, cpu) += NS_PER_TICK * blocked; 148 per_cpu(xen_blocked_time, cpu) += NS_PER_TICK * blocked;
152 } 149 }
diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c
index bda86820bff..84dd04048db 100644
--- a/arch/m32r/kernel/time.c
+++ b/arch/m32r/kernel/time.c
@@ -107,15 +107,14 @@ u32 arch_gettimeoffset(void)
107 107
108/* 108/*
109 * timer_interrupt() needs to keep up the real-time clock, 109 * timer_interrupt() needs to keep up the real-time clock,
110 * as well as call the "do_timer()" routine every clocktick 110 * as well as call the "xtime_update()" routine every clocktick
111 */ 111 */
112static irqreturn_t timer_interrupt(int irq, void *dev_id) 112static irqreturn_t timer_interrupt(int irq, void *dev_id)
113{ 113{
114#ifndef CONFIG_SMP 114#ifndef CONFIG_SMP
115 profile_tick(CPU_PROFILING); 115 profile_tick(CPU_PROFILING);
116#endif 116#endif
117 /* XXX FIXME. Uh, the xtime_lock should be held here, no? */ 117 xtime_update(1);
118 do_timer(1);
119 118
120#ifndef CONFIG_SMP 119#ifndef CONFIG_SMP
121 update_process_times(user_mode(get_irq_regs())); 120 update_process_times(user_mode(get_irq_regs()));
diff --git a/arch/m68k/bvme6000/config.c b/arch/m68k/bvme6000/config.c
index 9fe6fefb5e1..1edd95095cb 100644
--- a/arch/m68k/bvme6000/config.c
+++ b/arch/m68k/bvme6000/config.c
@@ -45,8 +45,8 @@ extern int bvme6000_set_clock_mmss (unsigned long);
45extern void bvme6000_reset (void); 45extern void bvme6000_reset (void);
46void bvme6000_set_vectors (void); 46void bvme6000_set_vectors (void);
47 47
48/* Save tick handler routine pointer, will point to do_timer() in 48/* Save tick handler routine pointer, will point to xtime_update() in
 49 * kernel/sched.c, called via bvme6000_process_int() */ 49 * kernel/time/timekeeping.c, called via bvme6000_process_int() */
50 50
51static irq_handler_t tick_handler; 51static irq_handler_t tick_handler;
52 52
diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c
index 06438dac08f..18b34ee5db3 100644
--- a/arch/m68k/kernel/time.c
+++ b/arch/m68k/kernel/time.c
@@ -37,11 +37,11 @@ static inline int set_rtc_mmss(unsigned long nowtime)
37 37
38/* 38/*
39 * timer_interrupt() needs to keep up the real-time clock, 39 * timer_interrupt() needs to keep up the real-time clock,
40 * as well as call the "do_timer()" routine every clocktick 40 * as well as call the "xtime_update()" routine every clocktick
41 */ 41 */
42static irqreturn_t timer_interrupt(int irq, void *dummy) 42static irqreturn_t timer_interrupt(int irq, void *dummy)
43{ 43{
44 do_timer(1); 44 xtime_update(1);
45 update_process_times(user_mode(get_irq_regs())); 45 update_process_times(user_mode(get_irq_regs()));
46 profile_tick(CPU_PROFILING); 46 profile_tick(CPU_PROFILING);
47 47
diff --git a/arch/m68k/mvme147/config.c b/arch/m68k/mvme147/config.c
index 100baaa692a..6cb9c3a9b6c 100644
--- a/arch/m68k/mvme147/config.c
+++ b/arch/m68k/mvme147/config.c
@@ -46,8 +46,8 @@ extern void mvme147_reset (void);
46 46
47static int bcd2int (unsigned char b); 47static int bcd2int (unsigned char b);
48 48
49/* Save tick handler routine pointer, will point to do_timer() in 49/* Save tick handler routine pointer, will point to xtime_update() in
50 * kernel/sched.c, called via mvme147_process_int() */ 50 * kernel/time/timekeeping.c, called via mvme147_process_int() */
51 51
52irq_handler_t tick_handler; 52irq_handler_t tick_handler;
53 53
diff --git a/arch/m68k/mvme16x/config.c b/arch/m68k/mvme16x/config.c
index 11edf61cc2c..0b28e262165 100644
--- a/arch/m68k/mvme16x/config.c
+++ b/arch/m68k/mvme16x/config.c
@@ -51,8 +51,8 @@ extern void mvme16x_reset (void);
51 51
52int bcd2int (unsigned char b); 52int bcd2int (unsigned char b);
53 53
54/* Save tick handler routine pointer, will point to do_timer() in 54/* Save tick handler routine pointer, will point to xtime_update() in
55 * kernel/sched.c, called via mvme16x_process_int() */ 55 * kernel/time/timekeeping.c, called via mvme16x_process_int() */
56 56
57static irq_handler_t tick_handler; 57static irq_handler_t tick_handler;
58 58
diff --git a/arch/m68k/sun3/sun3ints.c b/arch/m68k/sun3/sun3ints.c
index 2d9e21bd313..6464ad3ae3e 100644
--- a/arch/m68k/sun3/sun3ints.c
+++ b/arch/m68k/sun3/sun3ints.c
@@ -66,7 +66,7 @@ static irqreturn_t sun3_int5(int irq, void *dev_id)
66#ifdef CONFIG_SUN3 66#ifdef CONFIG_SUN3
67 intersil_clear(); 67 intersil_clear();
68#endif 68#endif
69 do_timer(1); 69 xtime_update(1);
70 update_process_times(user_mode(get_irq_regs())); 70 update_process_times(user_mode(get_irq_regs()));
71 if (!(kstat_cpu(0).irqs[irq] % 20)) 71 if (!(kstat_cpu(0).irqs[irq] % 20))
72 sun3_leds(led_pattern[(kstat_cpu(0).irqs[irq] % 160) / 20]); 72 sun3_leds(led_pattern[(kstat_cpu(0).irqs[irq] % 160) / 20]);
diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c
index d6ac2a43453..6623909f70e 100644
--- a/arch/m68knommu/kernel/time.c
+++ b/arch/m68knommu/kernel/time.c
@@ -36,7 +36,7 @@ static inline int set_rtc_mmss(unsigned long nowtime)
36#ifndef CONFIG_GENERIC_CLOCKEVENTS 36#ifndef CONFIG_GENERIC_CLOCKEVENTS
37/* 37/*
38 * timer_interrupt() needs to keep up the real-time clock, 38 * timer_interrupt() needs to keep up the real-time clock,
39 * as well as call the "do_timer()" routine every clocktick 39 * as well as call the "xtime_update()" routine every clocktick
40 */ 40 */
41irqreturn_t arch_timer_interrupt(int irq, void *dummy) 41irqreturn_t arch_timer_interrupt(int irq, void *dummy)
42{ 42{
@@ -44,11 +44,7 @@ irqreturn_t arch_timer_interrupt(int irq, void *dummy)
44 if (current->pid) 44 if (current->pid)
45 profile_tick(CPU_PROFILING); 45 profile_tick(CPU_PROFILING);
46 46
47 write_seqlock(&xtime_lock); 47 xtime_update(1);
48
49 do_timer(1);
50
51 write_sequnlock(&xtime_lock);
52 48
53 update_process_times(user_mode(get_irq_regs())); 49 update_process_times(user_mode(get_irq_regs()));
54 50
diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h
index ad3fd61b2fe..b0526d2716f 100644
--- a/arch/microblaze/include/asm/futex.h
+++ b/arch/microblaze/include/asm/futex.h
@@ -29,7 +29,7 @@
29}) 29})
30 30
31static inline int 31static inline int
32futex_atomic_op_inuser(int encoded_op, int __user *uaddr) 32futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
33{ 33{
34 int op = (encoded_op >> 28) & 7; 34 int op = (encoded_op >> 28) & 7;
35 int cmp = (encoded_op >> 24) & 15; 35 int cmp = (encoded_op >> 24) & 15;
@@ -39,7 +39,7 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
39 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 39 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
40 oparg = 1 << oparg; 40 oparg = 1 << oparg;
41 41
42 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 42 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
43 return -EFAULT; 43 return -EFAULT;
44 44
45 pagefault_disable(); 45 pagefault_disable();
@@ -94,31 +94,34 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
94} 94}
95 95
96static inline int 96static inline int
97futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) 97futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
98 u32 oldval, u32 newval)
98{ 99{
99 int prev, cmp; 100 int ret = 0, cmp;
101 u32 prev;
100 102
101 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 103 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
102 return -EFAULT; 104 return -EFAULT;
103 105
104 __asm__ __volatile__ ("1: lwx %0, %2, r0; \ 106 __asm__ __volatile__ ("1: lwx %1, %3, r0; \
105 cmp %1, %0, %3; \ 107 cmp %2, %1, %4; \
106 beqi %1, 3f; \ 108 beqi %2, 3f; \
107 2: swx %4, %2, r0; \ 109 2: swx %5, %3, r0; \
108 addic %1, r0, 0; \ 110 addic %2, r0, 0; \
109 bnei %1, 1b; \ 111 bnei %2, 1b; \
110 3: \ 112 3: \
111 .section .fixup,\"ax\"; \ 113 .section .fixup,\"ax\"; \
112 4: brid 3b; \ 114 4: brid 3b; \
113 addik %0, r0, %5; \ 115 addik %0, r0, %6; \
114 .previous; \ 116 .previous; \
115 .section __ex_table,\"a\"; \ 117 .section __ex_table,\"a\"; \
116 .word 1b,4b,2b,4b; \ 118 .word 1b,4b,2b,4b; \
117 .previous;" \ 119 .previous;" \
118 : "=&r" (prev), "=&r"(cmp) \ 120 : "+r" (ret), "=&r" (prev), "=&r"(cmp) \
119 : "r" (uaddr), "r" (oldval), "r" (newval), "i" (-EFAULT)); 121 : "r" (uaddr), "r" (oldval), "r" (newval), "i" (-EFAULT));
120 122
121 return prev; 123 *uval = prev;
124 return ret;
122} 125}
123 126
124#endif /* __KERNEL__ */ 127#endif /* __KERNEL__ */
diff --git a/arch/microblaze/include/asm/pci-bridge.h b/arch/microblaze/include/asm/pci-bridge.h
index 0c68764ab54..10717669e0c 100644
--- a/arch/microblaze/include/asm/pci-bridge.h
+++ b/arch/microblaze/include/asm/pci-bridge.h
@@ -104,11 +104,22 @@ struct pci_controller {
104 int global_number; /* PCI domain number */ 104 int global_number; /* PCI domain number */
105}; 105};
106 106
107#ifdef CONFIG_PCI
107static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus) 108static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus)
108{ 109{
109 return bus->sysdata; 110 return bus->sysdata;
110} 111}
111 112
113static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
114{
115 struct pci_controller *host;
116
117 if (bus->self)
118 return pci_device_to_OF_node(bus->self);
119 host = pci_bus_to_host(bus);
120 return host ? host->dn : NULL;
121}
122
112static inline int isa_vaddr_is_ioport(void __iomem *address) 123static inline int isa_vaddr_is_ioport(void __iomem *address)
113{ 124{
114 /* No specific ISA handling on ppc32 at this stage, it 125 /* No specific ISA handling on ppc32 at this stage, it
@@ -116,6 +127,7 @@ static inline int isa_vaddr_is_ioport(void __iomem *address)
116 */ 127 */
117 return 0; 128 return 0;
118} 129}
130#endif /* CONFIG_PCI */
119 131
120/* These are used for config access before all the PCI probing 132/* These are used for config access before all the PCI probing
121 has been done. */ 133 has been done. */
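
pci_bus_to_OF_node() gives callers one way to find the device-tree node
behind a bus: a bridged bus resolves through its bridge device, while a root
bus falls back to the host controller's node. A usage sketch (pdev is assumed
from surrounding driver code):

	struct device_node *np = pci_bus_to_OF_node(pdev->bus);

	if (np)
		pr_info("bus served by OF node %s\n", np->full_name);
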
diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index 2e72af078b0..d0890d36ef6 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -64,21 +64,6 @@ extern void kdump_move_device_tree(void);
64/* CPU OF node matching */ 64/* CPU OF node matching */
65struct device_node *of_get_cpu_node(int cpu, unsigned int *thread); 65struct device_node *of_get_cpu_node(int cpu, unsigned int *thread);
66 66
67/**
68 * of_irq_map_pci - Resolve the interrupt for a PCI device
69 * @pdev: the device whose interrupt is to be resolved
70 * @out_irq: structure of_irq filled by this function
71 *
72 * This function resolves the PCI interrupt for a given PCI device. If a
73 * device-node exists for a given pci_dev, it will use normal OF tree
74 * walking. If not, it will implement standard swizzling and walk up the
75 * PCI tree until an device-node is found, at which point it will finish
76 * resolving using the OF tree walking.
77 */
78struct pci_dev;
79struct of_irq;
80extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq);
81
82#endif /* __ASSEMBLY__ */ 67#endif /* __ASSEMBLY__ */
83#endif /* __KERNEL__ */ 68#endif /* __KERNEL__ */
84 69
diff --git a/arch/microblaze/kernel/prom_parse.c b/arch/microblaze/kernel/prom_parse.c
index 9ae24f4b882..47187cc2cf0 100644
--- a/arch/microblaze/kernel/prom_parse.c
+++ b/arch/microblaze/kernel/prom_parse.c
@@ -2,88 +2,11 @@
2 2
3#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/pci_regs.h>
6#include <linux/module.h> 5#include <linux/module.h>
7#include <linux/ioport.h> 6#include <linux/ioport.h>
8#include <linux/etherdevice.h> 7#include <linux/etherdevice.h>
9#include <linux/of_address.h> 8#include <linux/of_address.h>
10#include <asm/prom.h> 9#include <asm/prom.h>
11#include <asm/pci-bridge.h>
12
13#ifdef CONFIG_PCI
14int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq)
15{
16 struct device_node *dn, *ppnode;
17 struct pci_dev *ppdev;
18 u32 lspec;
19 u32 laddr[3];
20 u8 pin;
21 int rc;
22
23 /* Check if we have a device node, if yes, fallback to standard OF
24 * parsing
25 */
26 dn = pci_device_to_OF_node(pdev);
27 if (dn)
28 return of_irq_map_one(dn, 0, out_irq);
29
30 /* Ok, we don't, time to have fun. Let's start by building up an
31 * interrupt spec. we assume #interrupt-cells is 1, which is standard
32 * for PCI. If you do different, then don't use that routine.
33 */
34 rc = pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
35 if (rc != 0)
36 return rc;
37 /* No pin, exit */
38 if (pin == 0)
39 return -ENODEV;
40
41 /* Now we walk up the PCI tree */
42 lspec = pin;
43 for (;;) {
44 /* Get the pci_dev of our parent */
45 ppdev = pdev->bus->self;
46
47 /* Ouch, it's a host bridge... */
48 if (ppdev == NULL) {
49 struct pci_controller *host;
50 host = pci_bus_to_host(pdev->bus);
51 ppnode = host ? host->dn : NULL;
52 /* No node for host bridge ? give up */
53 if (ppnode == NULL)
54 return -EINVAL;
55 } else
56 /* We found a P2P bridge, check if it has a node */
57 ppnode = pci_device_to_OF_node(ppdev);
58
59 /* Ok, we have found a parent with a device-node, hand over to
60 * the OF parsing code.
61 * We build a unit address from the linux device to be used for
62 * resolution. Note that we use the linux bus number which may
63 * not match your firmware bus numbering.
64 * Fortunately, in most cases, interrupt-map-mask doesn't
65 * include the bus number as part of the matching.
66 * You should still be careful about that though if you intend
67 * to rely on this function (you ship a firmware that doesn't
68 * create device nodes for all PCI devices).
69 */
70 if (ppnode)
71 break;
72
73 /* We can only get here if we hit a P2P bridge with no node,
74 * let's do standard swizzling and try again
75 */
76 lspec = pci_swizzle_interrupt_pin(pdev, lspec);
77 pdev = ppdev;
78 }
79
80 laddr[0] = (pdev->bus->number << 16)
81 | (pdev->devfn << 8);
82 laddr[1] = laddr[2] = 0;
83 return of_irq_map_raw(ppnode, &lspec, 1, laddr, out_irq);
84}
85EXPORT_SYMBOL_GPL(of_irq_map_pci);
86#endif /* CONFIG_PCI */
87 10
88void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop, 11void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop,
89 unsigned long *busno, unsigned long *phys, unsigned long *size) 12 unsigned long *busno, unsigned long *phys, unsigned long *size)
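
The of_irq_map_pci() implementation removed here is not lost; the series
consolidates it into the shared OF code, and callers pick it up through
<linux/of_pci.h> (see the pci-common.c hunk below). Typical use is unchanged
(sketch; pdev and virq are assumed from the surrounding driver):

	#include <linux/of_pci.h>

	struct of_irq oirq;

	if (of_irq_map_pci(pdev, &oirq) == 0)
		virq = irq_create_of_mapping(oirq.controller,
					     oirq.specifier, oirq.size);
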
diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
index e363615d679..1e01a125363 100644
--- a/arch/microblaze/pci/pci-common.c
+++ b/arch/microblaze/pci/pci-common.c
@@ -29,6 +29,7 @@
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/of.h> 30#include <linux/of.h>
31#include <linux/of_address.h> 31#include <linux/of_address.h>
32#include <linux/of_pci.h>
32 33
33#include <asm/processor.h> 34#include <asm/processor.h>
34#include <asm/io.h> 35#include <asm/io.h>
diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h
index b9cce90346c..6ebf1734b41 100644
--- a/arch/mips/include/asm/futex.h
+++ b/arch/mips/include/asm/futex.h
@@ -75,7 +75,7 @@
75} 75}
76 76
77static inline int 77static inline int
78futex_atomic_op_inuser(int encoded_op, int __user *uaddr) 78futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
79{ 79{
80 int op = (encoded_op >> 28) & 7; 80 int op = (encoded_op >> 28) & 7;
81 int cmp = (encoded_op >> 24) & 15; 81 int cmp = (encoded_op >> 24) & 15;
@@ -85,7 +85,7 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
85 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 85 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
86 oparg = 1 << oparg; 86 oparg = 1 << oparg;
87 87
88 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) 88 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
89 return -EFAULT; 89 return -EFAULT;
90 90
91 pagefault_disable(); 91 pagefault_disable();
@@ -132,11 +132,13 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
132} 132}
133 133
134static inline int 134static inline int
135futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) 135futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
136 u32 oldval, u32 newval)
136{ 137{
137 int retval; 138 int ret = 0;
139 u32 val;
138 140
139 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 141 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
140 return -EFAULT; 142 return -EFAULT;
141 143
142 if (cpu_has_llsc && R10000_LLSC_WAR) { 144 if (cpu_has_llsc && R10000_LLSC_WAR) {
@@ -145,25 +147,25 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
145 " .set push \n" 147 " .set push \n"
146 " .set noat \n" 148 " .set noat \n"
147 " .set mips3 \n" 149 " .set mips3 \n"
148 "1: ll %0, %2 \n" 150 "1: ll %1, %3 \n"
149 " bne %0, %z3, 3f \n" 151 " bne %1, %z4, 3f \n"
150 " .set mips0 \n" 152 " .set mips0 \n"
151 " move $1, %z4 \n" 153 " move $1, %z5 \n"
152 " .set mips3 \n" 154 " .set mips3 \n"
153 "2: sc $1, %1 \n" 155 "2: sc $1, %2 \n"
154 " beqzl $1, 1b \n" 156 " beqzl $1, 1b \n"
155 __WEAK_LLSC_MB 157 __WEAK_LLSC_MB
156 "3: \n" 158 "3: \n"
157 " .set pop \n" 159 " .set pop \n"
158 " .section .fixup,\"ax\" \n" 160 " .section .fixup,\"ax\" \n"
159 "4: li %0, %5 \n" 161 "4: li %0, %6 \n"
160 " j 3b \n" 162 " j 3b \n"
161 " .previous \n" 163 " .previous \n"
162 " .section __ex_table,\"a\" \n" 164 " .section __ex_table,\"a\" \n"
163 " "__UA_ADDR "\t1b, 4b \n" 165 " "__UA_ADDR "\t1b, 4b \n"
164 " "__UA_ADDR "\t2b, 4b \n" 166 " "__UA_ADDR "\t2b, 4b \n"
165 " .previous \n" 167 " .previous \n"
166 : "=&r" (retval), "=R" (*uaddr) 168 : "+r" (ret), "=&r" (val), "=R" (*uaddr)
167 : "R" (*uaddr), "Jr" (oldval), "Jr" (newval), "i" (-EFAULT) 169 : "R" (*uaddr), "Jr" (oldval), "Jr" (newval), "i" (-EFAULT)
168 : "memory"); 170 : "memory");
169 } else if (cpu_has_llsc) { 171 } else if (cpu_has_llsc) {
@@ -172,31 +174,32 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
172 " .set push \n" 174 " .set push \n"
173 " .set noat \n" 175 " .set noat \n"
174 " .set mips3 \n" 176 " .set mips3 \n"
175 "1: ll %0, %2 \n" 177 "1: ll %1, %3 \n"
176 " bne %0, %z3, 3f \n" 178 " bne %1, %z4, 3f \n"
177 " .set mips0 \n" 179 " .set mips0 \n"
178 " move $1, %z4 \n" 180 " move $1, %z5 \n"
179 " .set mips3 \n" 181 " .set mips3 \n"
180 "2: sc $1, %1 \n" 182 "2: sc $1, %2 \n"
181 " beqz $1, 1b \n" 183 " beqz $1, 1b \n"
182 __WEAK_LLSC_MB 184 __WEAK_LLSC_MB
183 "3: \n" 185 "3: \n"
184 " .set pop \n" 186 " .set pop \n"
185 " .section .fixup,\"ax\" \n" 187 " .section .fixup,\"ax\" \n"
186 "4: li %0, %5 \n" 188 "4: li %0, %6 \n"
187 " j 3b \n" 189 " j 3b \n"
188 " .previous \n" 190 " .previous \n"
189 " .section __ex_table,\"a\" \n" 191 " .section __ex_table,\"a\" \n"
190 " "__UA_ADDR "\t1b, 4b \n" 192 " "__UA_ADDR "\t1b, 4b \n"
191 " "__UA_ADDR "\t2b, 4b \n" 193 " "__UA_ADDR "\t2b, 4b \n"
192 " .previous \n" 194 " .previous \n"
193 : "=&r" (retval), "=R" (*uaddr) 195 : "+r" (ret), "=&r" (val), "=R" (*uaddr)
194 : "R" (*uaddr), "Jr" (oldval), "Jr" (newval), "i" (-EFAULT) 196 : "R" (*uaddr), "Jr" (oldval), "Jr" (newval), "i" (-EFAULT)
195 : "memory"); 197 : "memory");
196 } else 198 } else
197 return -ENOSYS; 199 return -ENOSYS;
198 200
199 return retval; 201 *uval = val;
202 return ret;
200} 203}
201 204
202#endif 205#endif
diff --git a/arch/mn10300/kernel/time.c b/arch/mn10300/kernel/time.c
index 75da468090b..5b955000626 100644
--- a/arch/mn10300/kernel/time.c
+++ b/arch/mn10300/kernel/time.c
@@ -104,8 +104,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
104 unsigned tsc, elapse; 104 unsigned tsc, elapse;
105 irqreturn_t ret; 105 irqreturn_t ret;
106 106
107 write_seqlock(&xtime_lock);
108
109 while (tsc = get_cycles(), 107 while (tsc = get_cycles(),
110 elapse = tsc - mn10300_last_tsc, /* time elapsed since last 108 elapse = tsc - mn10300_last_tsc, /* time elapsed since last
111 * tick */ 109 * tick */
@@ -114,11 +112,9 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
114 mn10300_last_tsc += MN10300_TSC_PER_HZ; 112 mn10300_last_tsc += MN10300_TSC_PER_HZ;
115 113
116 /* advance the kernel's time tracking system */ 114 /* advance the kernel's time tracking system */
117 do_timer(1); 115 xtime_update(1);
118 } 116 }
119 117
120 write_sequnlock(&xtime_lock);
121
122 ret = local_timer_interrupt(); 118 ret = local_timer_interrupt();
123#ifdef CONFIG_SMP 119#ifdef CONFIG_SMP
124 send_IPI_allbutself(LOCAL_TIMER_IPI); 120 send_IPI_allbutself(LOCAL_TIMER_IPI);
diff --git a/arch/parisc/hpux/sys_hpux.c b/arch/parisc/hpux/sys_hpux.c
index 30394081d9b..6ab9580b0b0 100644
--- a/arch/parisc/hpux/sys_hpux.c
+++ b/arch/parisc/hpux/sys_hpux.c
@@ -185,26 +185,21 @@ struct hpux_statfs {
185 int16_t f_pad; 185 int16_t f_pad;
186}; 186};
187 187
188static int do_statfs_hpux(struct path *path, struct hpux_statfs *buf) 188static int do_statfs_hpux(struct kstatfs *st, struct hpux_statfs __user *p)
189{ 189{
190 struct kstatfs st; 190 struct hpux_statfs buf;
191 int retval; 191 memset(&buf, 0, sizeof(buf));
192 192 buf.f_type = st->f_type;
193 retval = vfs_statfs(path, &st); 193 buf.f_bsize = st->f_bsize;
194 if (retval) 194 buf.f_blocks = st->f_blocks;
195 return retval; 195 buf.f_bfree = st->f_bfree;
196 196 buf.f_bavail = st->f_bavail;
197 memset(buf, 0, sizeof(*buf)); 197 buf.f_files = st->f_files;
198 buf->f_type = st.f_type; 198 buf.f_ffree = st->f_ffree;
199 buf->f_bsize = st.f_bsize; 199 buf.f_fsid[0] = st->f_fsid.val[0];
200 buf->f_blocks = st.f_blocks; 200 buf.f_fsid[1] = st->f_fsid.val[1];
201 buf->f_bfree = st.f_bfree; 201 if (copy_to_user(p, &buf, sizeof(buf)))
202 buf->f_bavail = st.f_bavail; 202 return -EFAULT;
203 buf->f_files = st.f_files;
204 buf->f_ffree = st.f_ffree;
205 buf->f_fsid[0] = st.f_fsid.val[0];
206 buf->f_fsid[1] = st.f_fsid.val[1];
207
208 return 0; 203 return 0;
209} 204}
210 205
@@ -212,35 +207,19 @@ static int do_statfs_hpux(struct path *path, struct hpux_statfs *buf)
212asmlinkage long hpux_statfs(const char __user *pathname, 207asmlinkage long hpux_statfs(const char __user *pathname,
213 struct hpux_statfs __user *buf) 208 struct hpux_statfs __user *buf)
214{ 209{
215 struct path path; 210 struct kstatfs st;
216 int error; 211 int error = user_statfs(pathname, &st);
217 212 if (!error)
218 error = user_path(pathname, &path); 213 error = do_statfs_hpux(&st, buf);
219 if (!error) {
220 struct hpux_statfs tmp;
221 error = do_statfs_hpux(&path, &tmp);
222 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
223 error = -EFAULT;
224 path_put(&path);
225 }
226 return error; 214 return error;
227} 215}
228 216
229asmlinkage long hpux_fstatfs(unsigned int fd, struct hpux_statfs __user * buf) 217asmlinkage long hpux_fstatfs(unsigned int fd, struct hpux_statfs __user * buf)
230{ 218{
231 struct file *file; 219 struct kstatfs st;
232 struct hpux_statfs tmp; 220 int error = fd_statfs(fd, &st);
233 int error; 221 if (!error)
234 222 error = do_statfs_hpux(&st, buf);
235 error = -EBADF;
236 file = fget(fd);
237 if (!file)
238 goto out;
239 error = do_statfs_hpux(&file->f_path, &tmp);
240 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
241 error = -EFAULT;
242 fput(file);
243 out:
244 return error; 223 return error;
245} 224}
246 225
diff --git a/arch/parisc/include/asm/fcntl.h b/arch/parisc/include/asm/fcntl.h
index f357fc693c8..0304b92ccfe 100644
--- a/arch/parisc/include/asm/fcntl.h
+++ b/arch/parisc/include/asm/fcntl.h
@@ -19,6 +19,8 @@
19#define O_NOFOLLOW 000000200 /* don't follow links */ 19#define O_NOFOLLOW 000000200 /* don't follow links */
20#define O_INVISIBLE 004000000 /* invisible I/O, for DMAPI/XDSM */ 20#define O_INVISIBLE 004000000 /* invisible I/O, for DMAPI/XDSM */
21 21
22#define O_PATH 020000000
23
22#define F_GETLK64 8 24#define F_GETLK64 8
23#define F_SETLK64 9 25#define F_SETLK64 9
24#define F_SETLKW64 10 26#define F_SETLKW64 10
diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h
index 0c705c3a55e..67a33cc27ef 100644
--- a/arch/parisc/include/asm/futex.h
+++ b/arch/parisc/include/asm/futex.h
@@ -8,7 +8,7 @@
8#include <asm/errno.h> 8#include <asm/errno.h>
9 9
10static inline int 10static inline int
11futex_atomic_op_inuser (int encoded_op, int __user *uaddr) 11futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
12{ 12{
13 int op = (encoded_op >> 28) & 7; 13 int op = (encoded_op >> 28) & 7;
14 int cmp = (encoded_op >> 24) & 15; 14 int cmp = (encoded_op >> 24) & 15;
@@ -18,7 +18,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
18 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 18 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
19 oparg = 1 << oparg; 19 oparg = 1 << oparg;
20 20
21 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) 21 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
22 return -EFAULT; 22 return -EFAULT;
23 23
24 pagefault_disable(); 24 pagefault_disable();
@@ -51,10 +51,10 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
51 51
52/* Non-atomic version */ 52/* Non-atomic version */
53static inline int 53static inline int
54futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) 54futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
55 u32 oldval, u32 newval)
55{ 56{
56 int err = 0; 57 u32 val;
57 int uval;
58 58
59 /* futex.c wants to do a cmpxchg_inatomic on kernel NULL, which is 59 /* futex.c wants to do a cmpxchg_inatomic on kernel NULL, which is
60 * our gateway page, and causes no end of trouble... 60 * our gateway page, and causes no end of trouble...
@@ -62,15 +62,15 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
62 if (segment_eq(KERNEL_DS, get_fs()) && !uaddr) 62 if (segment_eq(KERNEL_DS, get_fs()) && !uaddr)
63 return -EFAULT; 63 return -EFAULT;
64 64
65 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 65 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
66 return -EFAULT; 66 return -EFAULT;
67 67
68 err = get_user(uval, uaddr); 68 if (get_user(val, uaddr))
69 if (err) return -EFAULT; 69 return -EFAULT;
70 if (uval == oldval) 70 if (val == oldval && put_user(newval, uaddr))
71 err = put_user(newval, uaddr); 71 return -EFAULT;
72 if (err) return -EFAULT; 72 *uval = val;
73 return uval; 73 return 0;
74} 74}
75 75
76#endif /*__KERNEL__*/ 76#endif /*__KERNEL__*/
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 05511ccb61d..45b7389d77a 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -162,11 +162,8 @@ irqreturn_t __irq_entry timer_interrupt(int irq, void *dev_id)
162 update_process_times(user_mode(get_irq_regs())); 162 update_process_times(user_mode(get_irq_regs()));
163 } 163 }
164 164
165 if (cpu == 0) { 165 if (cpu == 0)
166 write_seqlock(&xtime_lock); 166 xtime_update(ticks_elapsed);
167 do_timer(ticks_elapsed);
168 write_sequnlock(&xtime_lock);
169 }
170 167
171 return IRQ_HANDLED; 168 return IRQ_HANDLED;
172} 169}
diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
index 7c589ef81fb..c94e4a3fe2e 100644
--- a/arch/powerpc/include/asm/futex.h
+++ b/arch/powerpc/include/asm/futex.h
@@ -30,7 +30,7 @@
30 : "b" (uaddr), "i" (-EFAULT), "r" (oparg) \ 30 : "b" (uaddr), "i" (-EFAULT), "r" (oparg) \
31 : "cr0", "memory") 31 : "cr0", "memory")
32 32
33static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) 33static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
34{ 34{
35 int op = (encoded_op >> 28) & 7; 35 int op = (encoded_op >> 28) & 7;
36 int cmp = (encoded_op >> 24) & 15; 36 int cmp = (encoded_op >> 24) & 15;
@@ -40,7 +40,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
40 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 40 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
41 oparg = 1 << oparg; 41 oparg = 1 << oparg;
42 42
43 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) 43 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
44 return -EFAULT; 44 return -EFAULT;
45 45
46 pagefault_disable(); 46 pagefault_disable();
@@ -82,35 +82,38 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
82} 82}
83 83
84static inline int 84static inline int
85futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) 85futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
86 u32 oldval, u32 newval)
86{ 87{
87 int prev; 88 int ret = 0;
89 u32 prev;
88 90
89 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 91 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
90 return -EFAULT; 92 return -EFAULT;
91 93
92 __asm__ __volatile__ ( 94 __asm__ __volatile__ (
93 PPC_RELEASE_BARRIER 95 PPC_RELEASE_BARRIER
94"1: lwarx %0,0,%2 # futex_atomic_cmpxchg_inatomic\n\ 96"1: lwarx %1,0,%3 # futex_atomic_cmpxchg_inatomic\n\
95 cmpw 0,%0,%3\n\ 97 cmpw 0,%1,%4\n\
96 bne- 3f\n" 98 bne- 3f\n"
97 PPC405_ERR77(0,%2) 99 PPC405_ERR77(0,%3)
98"2: stwcx. %4,0,%2\n\ 100"2: stwcx. %5,0,%3\n\
99 bne- 1b\n" 101 bne- 1b\n"
100 PPC_ACQUIRE_BARRIER 102 PPC_ACQUIRE_BARRIER
101"3: .section .fixup,\"ax\"\n\ 103"3: .section .fixup,\"ax\"\n\
1024: li %0,%5\n\ 1044: li %0,%6\n\
103 b 3b\n\ 105 b 3b\n\
104 .previous\n\ 106 .previous\n\
105 .section __ex_table,\"a\"\n\ 107 .section __ex_table,\"a\"\n\
106 .align 3\n\ 108 .align 3\n\
107 " PPC_LONG "1b,4b,2b,4b\n\ 109 " PPC_LONG "1b,4b,2b,4b\n\
108 .previous" \ 110 .previous" \
109 : "=&r" (prev), "+m" (*uaddr) 111 : "+r" (ret), "=&r" (prev), "+m" (*uaddr)
110 : "r" (uaddr), "r" (oldval), "r" (newval), "i" (-EFAULT) 112 : "r" (uaddr), "r" (oldval), "r" (newval), "i" (-EFAULT)
111 : "cc", "memory"); 113 : "cc", "memory");
112 114
113 return prev; 115 *uval = prev;
116 return ret;
114} 117}
115 118
116#endif /* __KERNEL__ */ 119#endif /* __KERNEL__ */
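In the powerpc hunk above, the asm gains a "+r" (ret) operand so the fixup path can load -EFAULT into %0 while the loaded value travels separately in %1 (prev). A hypothetical caller in the style of kernel/futex.c shows why the split matters — a fault and a compare mismatch are now distinguishable:

/* Hypothetical caller; uaddr, uval and newval are assumed to be
 * in scope as in the futex core. */
u32 curval;

if (futex_atomic_cmpxchg_inatomic(&curval, uaddr, uval, newval))
        return -EFAULT;         /* fault on the user word */
if (curval != uval)
        return -EAGAIN;         /* lost a race; caller retries */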
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 51e9e6f90d1..edeb80fdd2c 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -171,6 +171,16 @@ static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus)
171 return bus->sysdata; 171 return bus->sysdata;
172} 172}
173 173
174static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
175{
176 struct pci_controller *host;
177
178 if (bus->self)
179 return pci_device_to_OF_node(bus->self);
180 host = pci_bus_to_host(bus);
181 return host ? host->dn : NULL;
182}
183
174static inline int isa_vaddr_is_ioport(void __iomem *address) 184static inline int isa_vaddr_is_ioport(void __iomem *address)
175{ 185{
176 /* No specific ISA handling on ppc32 at this stage, it 186 /* No specific ISA handling on ppc32 at this stage, it
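The new pci_bus_to_OF_node() helper above centralizes a lookup that of_irq_map_pci() used to open-code (see the deletion in prom_parse.c below): a bus behind a P2P bridge resolves via its bridge device, the root bus via the host controller. A hypothetical caller:

/* 'pdev' is assumed to be a struct pci_dev the caller already holds. */
struct device_node *np = pci_bus_to_OF_node(pdev->bus);

if (!np)
        return -ENODEV;         /* firmware gave us no node for this bus */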
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index d7275758559..c189aa5fe1f 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -70,21 +70,6 @@ static inline int of_node_to_nid(struct device_node *device) { return 0; }
70#endif 70#endif
71#define of_node_to_nid of_node_to_nid 71#define of_node_to_nid of_node_to_nid
72 72
73/**
74 * of_irq_map_pci - Resolve the interrupt for a PCI device
75 * @pdev: the device whose interrupt is to be resolved
76 * @out_irq: structure of_irq filled by this function
77 *
78 * This function resolves the PCI interrupt for a given PCI device. If a
79 * device-node exists for a given pci_dev, it will use normal OF tree
80 * walking. If not, it will implement standard swizzling and walk up the
 81 * PCI tree until a device-node is found, at which point it will finish
82 * resolving using the OF tree walking.
83 */
84struct pci_dev;
85struct of_irq;
86extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq);
87
88extern void of_instantiate_rtc(void); 73extern void of_instantiate_rtc(void);
89 74
90/* These includes are put at the bottom because they may contain things 75/* These includes are put at the bottom because they may contain things
diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h
index 8447d89fbe7..bb1e2cdeb9b 100644
--- a/arch/powerpc/include/asm/rwsem.h
+++ b/arch/powerpc/include/asm/rwsem.h
@@ -13,11 +13,6 @@
13 * by Paul Mackerras <paulus@samba.org>. 13 * by Paul Mackerras <paulus@samba.org>.
14 */ 14 */
15 15
16#include <linux/list.h>
17#include <linux/spinlock.h>
18#include <asm/atomic.h>
19#include <asm/system.h>
20
21/* 16/*
22 * the semaphore definition 17 * the semaphore definition
23 */ 18 */
@@ -33,47 +28,6 @@
33#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS 28#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
34#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) 29#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
35 30
36struct rw_semaphore {
37 long count;
38 spinlock_t wait_lock;
39 struct list_head wait_list;
40#ifdef CONFIG_DEBUG_LOCK_ALLOC
41 struct lockdep_map dep_map;
42#endif
43};
44
45#ifdef CONFIG_DEBUG_LOCK_ALLOC
46# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
47#else
48# define __RWSEM_DEP_MAP_INIT(lockname)
49#endif
50
51#define __RWSEM_INITIALIZER(name) \
52{ \
53 RWSEM_UNLOCKED_VALUE, \
54 __SPIN_LOCK_UNLOCKED((name).wait_lock), \
55 LIST_HEAD_INIT((name).wait_list) \
56 __RWSEM_DEP_MAP_INIT(name) \
57}
58
59#define DECLARE_RWSEM(name) \
60 struct rw_semaphore name = __RWSEM_INITIALIZER(name)
61
62extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
63extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
64extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
65extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
66
67extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
68 struct lock_class_key *key);
69
70#define init_rwsem(sem) \
71 do { \
72 static struct lock_class_key __key; \
73 \
74 __init_rwsem((sem), #sem, &__key); \
75 } while (0)
76
77/* 31/*
78 * lock for reading 32 * lock for reading
79 */ 33 */
@@ -174,10 +128,5 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
174 return atomic_long_add_return(delta, (atomic_long_t *)&sem->count); 128 return atomic_long_add_return(delta, (atomic_long_t *)&sem->count);
175} 129}
176 130
177static inline int rwsem_is_locked(struct rw_semaphore *sem)
178{
179 return sem->count != 0;
180}
181
182#endif /* __KERNEL__ */ 131#endif /* __KERNEL__ */
183#endif /* _ASM_POWERPC_RWSEM_H */ 132#endif /* _ASM_POWERPC_RWSEM_H */
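With the struct definition, __RWSEM_INITIALIZER, DECLARE_RWSEM and init_rwsem() deleted here — and likewise in the s390, sh and sparc headers below — the arch header keeps only the count-bias constants and the atomic primitives; declaration and initialization now come from the generic rwsem header. Caller-side usage is unchanged, e.g.:

/* 'example_sem' is a made-up name; the macro now expands from the
 * generic header rather than the per-arch one. */
static DECLARE_RWSEM(example_sem);

down_read(&example_sem);
/* ... read-side critical section ... */
up_read(&example_sem);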
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 10a44e68ef1..eb341be9a4d 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -22,6 +22,7 @@
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/of_address.h> 24#include <linux/of_address.h>
25#include <linux/of_pci.h>
25#include <linux/mm.h> 26#include <linux/mm.h>
26#include <linux/list.h> 27#include <linux/list.h>
27#include <linux/syscalls.h> 28#include <linux/syscalls.h>
diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c
index c2b7a07cc3d..47187cc2cf0 100644
--- a/arch/powerpc/kernel/prom_parse.c
+++ b/arch/powerpc/kernel/prom_parse.c
@@ -2,95 +2,11 @@
2 2
3#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/pci_regs.h>
6#include <linux/module.h> 5#include <linux/module.h>
7#include <linux/ioport.h> 6#include <linux/ioport.h>
8#include <linux/etherdevice.h> 7#include <linux/etherdevice.h>
9#include <linux/of_address.h> 8#include <linux/of_address.h>
10#include <asm/prom.h> 9#include <asm/prom.h>
11#include <asm/pci-bridge.h>
12
13#ifdef CONFIG_PCI
14int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq)
15{
16 struct device_node *dn, *ppnode;
17 struct pci_dev *ppdev;
18 u32 lspec;
19 u32 laddr[3];
20 u8 pin;
21 int rc;
22
 23 /* Check if we have a device node; if yes, fall back to standard OF
24 * parsing
25 */
26 dn = pci_device_to_OF_node(pdev);
27 if (dn) {
28 rc = of_irq_map_one(dn, 0, out_irq);
29 if (!rc)
30 return rc;
31 }
32
33 /* Ok, we don't, time to have fun. Let's start by building up an
 34 * interrupt spec. We assume #interrupt-cells is 1, which is standard
 35 * for PCI. If yours differs, don't use this routine.
36 */
37 rc = pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
38 if (rc != 0)
39 return rc;
40 /* No pin, exit */
41 if (pin == 0)
42 return -ENODEV;
43
44 /* Now we walk up the PCI tree */
45 lspec = pin;
46 for (;;) {
47 /* Get the pci_dev of our parent */
48 ppdev = pdev->bus->self;
49
50 /* Ouch, it's a host bridge... */
51 if (ppdev == NULL) {
52#ifdef CONFIG_PPC64
53 ppnode = pci_bus_to_OF_node(pdev->bus);
54#else
55 struct pci_controller *host;
56 host = pci_bus_to_host(pdev->bus);
57 ppnode = host ? host->dn : NULL;
58#endif
 59 /* No node for the host bridge? Give up. */
60 if (ppnode == NULL)
61 return -EINVAL;
62 } else
63 /* We found a P2P bridge, check if it has a node */
64 ppnode = pci_device_to_OF_node(ppdev);
65
 66 /* Ok, we have found a parent with a device-node; hand over to
 67 * the OF parsing code.
 68 * We build a unit address from the linux device to be used for
 69 * resolution. Note that we use the linux bus number, which may
 70 * not match your firmware bus numbering.
 71 * Fortunately, in most cases, interrupt-map-mask doesn't include
 72 * the bus number as part of the matching.
 73 * You should still be careful about that, though, if you intend
 74 * to rely on this function (i.e. if you ship firmware that doesn't
 75 * create device nodes for all PCI devices).
76 */
77 if (ppnode)
78 break;
79
 80 /* We can only get here if we hit a P2P bridge with no node;
81 * let's do standard swizzling and try again
82 */
83 lspec = pci_swizzle_interrupt_pin(pdev, lspec);
84 pdev = ppdev;
85 }
86
87 laddr[0] = (pdev->bus->number << 16)
88 | (pdev->devfn << 8);
89 laddr[1] = laddr[2] = 0;
90 return of_irq_map_raw(ppnode, &lspec, 1, laddr, out_irq);
91}
92EXPORT_SYMBOL_GPL(of_irq_map_pci);
93#endif /* CONFIG_PCI */
94 10
95void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop, 11void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop,
96 unsigned long *busno, unsigned long *phys, unsigned long *size) 12 unsigned long *busno, unsigned long *phys, unsigned long *size)
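The of_irq_map_pci() body removed above (it moves to common OF code; note the new <linux/of_pci.h> include in pci-common.c earlier) walks up node-less P2P bridges with pci_swizzle_interrupt_pin(). For reference, the standard bridge swizzle it relies on amounts to the following sketch, which is believed to match the generic helper:

/* INTA..INTD are encoded 1..4; each slot on the child bus rotates
 * the pin by its device number before the bridge forwards it. */
static u8 bridge_swizzle(unsigned int devfn, u8 pin)
{
        return (((pin - 1) + PCI_SLOT(devfn)) % 4) + 1;
}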
diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c
index 187a7d32f86..a3d2ce54ea2 100644
--- a/arch/powerpc/platforms/cell/spufs/syscalls.c
+++ b/arch/powerpc/platforms/cell/spufs/syscalls.c
@@ -70,7 +70,7 @@ static long do_spu_create(const char __user *pathname, unsigned int flags,
70 if (!IS_ERR(tmp)) { 70 if (!IS_ERR(tmp)) {
71 struct nameidata nd; 71 struct nameidata nd;
72 72
73 ret = path_lookup(tmp, LOOKUP_PARENT, &nd); 73 ret = kern_path_parent(tmp, &nd);
74 if (!ret) { 74 if (!ret) {
75 nd.flags |= LOOKUP_OPEN | LOOKUP_CREATE; 75 nd.flags |= LOOKUP_OPEN | LOOKUP_CREATE;
76 ret = spufs_create(&nd, flags, mode, neighbor); 76 ret = spufs_create(&nd, flags, mode, neighbor);
diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h
index 5c5d02de49e..81cf36b691f 100644
--- a/arch/s390/include/asm/futex.h
+++ b/arch/s390/include/asm/futex.h
@@ -7,7 +7,7 @@
7#include <linux/uaccess.h> 7#include <linux/uaccess.h>
8#include <asm/errno.h> 8#include <asm/errno.h>
9 9
10static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) 10static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
11{ 11{
12 int op = (encoded_op >> 28) & 7; 12 int op = (encoded_op >> 28) & 7;
13 int cmp = (encoded_op >> 24) & 15; 13 int cmp = (encoded_op >> 24) & 15;
@@ -18,7 +18,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
18 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 18 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
19 oparg = 1 << oparg; 19 oparg = 1 << oparg;
20 20
21 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) 21 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
22 return -EFAULT; 22 return -EFAULT;
23 23
24 pagefault_disable(); 24 pagefault_disable();
@@ -39,13 +39,13 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
39 return ret; 39 return ret;
40} 40}
41 41
42static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, 42static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
43 int oldval, int newval) 43 u32 oldval, u32 newval)
44{ 44{
45 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) 45 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
46 return -EFAULT; 46 return -EFAULT;
47 47
48 return uaccess.futex_atomic_cmpxchg(uaddr, oldval, newval); 48 return uaccess.futex_atomic_cmpxchg(uval, uaddr, oldval, newval);
49} 49}
50 50
51#endif /* __KERNEL__ */ 51#endif /* __KERNEL__ */
diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h
index 423fdda2322..d0eb4653ceb 100644
--- a/arch/s390/include/asm/rwsem.h
+++ b/arch/s390/include/asm/rwsem.h
@@ -43,29 +43,6 @@
43 43
44#ifdef __KERNEL__ 44#ifdef __KERNEL__
45 45
46#include <linux/list.h>
47#include <linux/spinlock.h>
48
49struct rwsem_waiter;
50
51extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *);
52extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *);
53extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
54extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *);
55extern struct rw_semaphore *rwsem_downgrade_write(struct rw_semaphore *);
56
57/*
58 * the semaphore definition
59 */
60struct rw_semaphore {
61 signed long count;
62 spinlock_t wait_lock;
63 struct list_head wait_list;
64#ifdef CONFIG_DEBUG_LOCK_ALLOC
65 struct lockdep_map dep_map;
66#endif
67};
68
69#ifndef __s390x__ 46#ifndef __s390x__
70#define RWSEM_UNLOCKED_VALUE 0x00000000 47#define RWSEM_UNLOCKED_VALUE 0x00000000
71#define RWSEM_ACTIVE_BIAS 0x00000001 48#define RWSEM_ACTIVE_BIAS 0x00000001
@@ -81,41 +58,6 @@ struct rw_semaphore {
81#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) 58#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
82 59
83/* 60/*
84 * initialisation
85 */
86
87#ifdef CONFIG_DEBUG_LOCK_ALLOC
88# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
89#else
90# define __RWSEM_DEP_MAP_INIT(lockname)
91#endif
92
93#define __RWSEM_INITIALIZER(name) \
94 { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait.lock), \
95 LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) }
96
97#define DECLARE_RWSEM(name) \
98 struct rw_semaphore name = __RWSEM_INITIALIZER(name)
99
100static inline void init_rwsem(struct rw_semaphore *sem)
101{
102 sem->count = RWSEM_UNLOCKED_VALUE;
103 spin_lock_init(&sem->wait_lock);
104 INIT_LIST_HEAD(&sem->wait_list);
105}
106
107extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
108 struct lock_class_key *key);
109
110#define init_rwsem(sem) \
111do { \
112 static struct lock_class_key __key; \
113 \
114 __init_rwsem((sem), #sem, &__key); \
115} while (0)
116
117
118/*
119 * lock for reading 61 * lock for reading
120 */ 62 */
121static inline void __down_read(struct rw_semaphore *sem) 63static inline void __down_read(struct rw_semaphore *sem)
@@ -377,10 +319,5 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
377 return new; 319 return new;
378} 320}
379 321
380static inline int rwsem_is_locked(struct rw_semaphore *sem)
381{
382 return (sem->count != 0);
383}
384
385#endif /* __KERNEL__ */ 322#endif /* __KERNEL__ */
386#endif /* _S390_RWSEM_H */ 323#endif /* _S390_RWSEM_H */
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index d6b1ed0ec52..2d9ea11f919 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -83,8 +83,8 @@ struct uaccess_ops {
83 size_t (*clear_user)(size_t, void __user *); 83 size_t (*clear_user)(size_t, void __user *);
84 size_t (*strnlen_user)(size_t, const char __user *); 84 size_t (*strnlen_user)(size_t, const char __user *);
85 size_t (*strncpy_from_user)(size_t, const char __user *, char *); 85 size_t (*strncpy_from_user)(size_t, const char __user *, char *);
86 int (*futex_atomic_op)(int op, int __user *, int oparg, int *old); 86 int (*futex_atomic_op)(int op, u32 __user *, int oparg, int *old);
87 int (*futex_atomic_cmpxchg)(int __user *, int old, int new); 87 int (*futex_atomic_cmpxchg)(u32 *, u32 __user *, u32 old, u32 new);
88}; 88};
89 89
90extern struct uaccess_ops uaccess; 90extern struct uaccess_ops uaccess;
diff --git a/arch/s390/lib/uaccess.h b/arch/s390/lib/uaccess.h
index 126011df14f..1d2536cb630 100644
--- a/arch/s390/lib/uaccess.h
+++ b/arch/s390/lib/uaccess.h
@@ -12,12 +12,12 @@ extern size_t copy_from_user_std(size_t, const void __user *, void *);
12extern size_t copy_to_user_std(size_t, void __user *, const void *); 12extern size_t copy_to_user_std(size_t, void __user *, const void *);
13extern size_t strnlen_user_std(size_t, const char __user *); 13extern size_t strnlen_user_std(size_t, const char __user *);
14extern size_t strncpy_from_user_std(size_t, const char __user *, char *); 14extern size_t strncpy_from_user_std(size_t, const char __user *, char *);
15extern int futex_atomic_cmpxchg_std(int __user *, int, int); 15extern int futex_atomic_cmpxchg_std(u32 *, u32 __user *, u32, u32);
16extern int futex_atomic_op_std(int, int __user *, int, int *); 16extern int futex_atomic_op_std(int, u32 __user *, int, int *);
17 17
18extern size_t copy_from_user_pt(size_t, const void __user *, void *); 18extern size_t copy_from_user_pt(size_t, const void __user *, void *);
19extern size_t copy_to_user_pt(size_t, void __user *, const void *); 19extern size_t copy_to_user_pt(size_t, void __user *, const void *);
20extern int futex_atomic_op_pt(int, int __user *, int, int *); 20extern int futex_atomic_op_pt(int, u32 __user *, int, int *);
21extern int futex_atomic_cmpxchg_pt(int __user *, int, int); 21extern int futex_atomic_cmpxchg_pt(u32 *, u32 __user *, u32, u32);
22 22
23#endif /* __ARCH_S390_LIB_UACCESS_H */ 23#endif /* __ARCH_S390_LIB_UACCESS_H */
diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c
index 404f2de296d..74833831417 100644
--- a/arch/s390/lib/uaccess_pt.c
+++ b/arch/s390/lib/uaccess_pt.c
@@ -302,7 +302,7 @@ fault:
302 : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ 302 : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \
303 "m" (*uaddr) : "cc" ); 303 "m" (*uaddr) : "cc" );
304 304
305static int __futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old) 305static int __futex_atomic_op_pt(int op, u32 __user *uaddr, int oparg, int *old)
306{ 306{
307 int oldval = 0, newval, ret; 307 int oldval = 0, newval, ret;
308 308
@@ -335,7 +335,7 @@ static int __futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old)
335 return ret; 335 return ret;
336} 336}
337 337
338int futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old) 338int futex_atomic_op_pt(int op, u32 __user *uaddr, int oparg, int *old)
339{ 339{
340 int ret; 340 int ret;
341 341
@@ -354,26 +354,29 @@ int futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old)
354 return ret; 354 return ret;
355} 355}
356 356
357static int __futex_atomic_cmpxchg_pt(int __user *uaddr, int oldval, int newval) 357static int __futex_atomic_cmpxchg_pt(u32 *uval, u32 __user *uaddr,
358 u32 oldval, u32 newval)
358{ 359{
359 int ret; 360 int ret;
360 361
361 asm volatile("0: cs %1,%4,0(%5)\n" 362 asm volatile("0: cs %1,%4,0(%5)\n"
362 "1: lr %0,%1\n" 363 "1: la %0,0\n"
363 "2:\n" 364 "2:\n"
364 EX_TABLE(0b,2b) EX_TABLE(1b,2b) 365 EX_TABLE(0b,2b) EX_TABLE(1b,2b)
365 : "=d" (ret), "+d" (oldval), "=m" (*uaddr) 366 : "=d" (ret), "+d" (oldval), "=m" (*uaddr)
366 : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr) 367 : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr)
367 : "cc", "memory" ); 368 : "cc", "memory" );
369 *uval = oldval;
368 return ret; 370 return ret;
369} 371}
370 372
371int futex_atomic_cmpxchg_pt(int __user *uaddr, int oldval, int newval) 373int futex_atomic_cmpxchg_pt(u32 *uval, u32 __user *uaddr,
374 u32 oldval, u32 newval)
372{ 375{
373 int ret; 376 int ret;
374 377
375 if (segment_eq(get_fs(), KERNEL_DS)) 378 if (segment_eq(get_fs(), KERNEL_DS))
376 return __futex_atomic_cmpxchg_pt(uaddr, oldval, newval); 379 return __futex_atomic_cmpxchg_pt(uval, uaddr, oldval, newval);
377 spin_lock(&current->mm->page_table_lock); 380 spin_lock(&current->mm->page_table_lock);
378 uaddr = (int __user *) __dat_user_addr((unsigned long) uaddr); 381 uaddr = (int __user *) __dat_user_addr((unsigned long) uaddr);
379 if (!uaddr) { 382 if (!uaddr) {
@@ -382,7 +385,7 @@ int futex_atomic_cmpxchg_pt(int __user *uaddr, int oldval, int newval)
382 } 385 }
383 get_page(virt_to_page(uaddr)); 386 get_page(virt_to_page(uaddr));
384 spin_unlock(&current->mm->page_table_lock); 387 spin_unlock(&current->mm->page_table_lock);
385 ret = __futex_atomic_cmpxchg_pt(uaddr, oldval, newval); 388 ret = __futex_atomic_cmpxchg_pt(uval, uaddr, oldval, newval);
386 put_page(virt_to_page(uaddr)); 389 put_page(virt_to_page(uaddr));
387 return ret; 390 return ret;
388} 391}
diff --git a/arch/s390/lib/uaccess_std.c b/arch/s390/lib/uaccess_std.c
index a6c4f7ed24a..bb1a7eed42c 100644
--- a/arch/s390/lib/uaccess_std.c
+++ b/arch/s390/lib/uaccess_std.c
@@ -255,7 +255,7 @@ size_t strncpy_from_user_std(size_t size, const char __user *src, char *dst)
255 : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ 255 : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \
256 "m" (*uaddr) : "cc"); 256 "m" (*uaddr) : "cc");
257 257
258int futex_atomic_op_std(int op, int __user *uaddr, int oparg, int *old) 258int futex_atomic_op_std(int op, u32 __user *uaddr, int oparg, int *old)
259{ 259{
260 int oldval = 0, newval, ret; 260 int oldval = 0, newval, ret;
261 261
@@ -287,19 +287,21 @@ int futex_atomic_op_std(int op, int __user *uaddr, int oparg, int *old)
287 return ret; 287 return ret;
288} 288}
289 289
290int futex_atomic_cmpxchg_std(int __user *uaddr, int oldval, int newval) 290int futex_atomic_cmpxchg_std(u32 *uval, u32 __user *uaddr,
291 u32 oldval, u32 newval)
291{ 292{
292 int ret; 293 int ret;
293 294
294 asm volatile( 295 asm volatile(
295 " sacf 256\n" 296 " sacf 256\n"
296 "0: cs %1,%4,0(%5)\n" 297 "0: cs %1,%4,0(%5)\n"
297 "1: lr %0,%1\n" 298 "1: la %0,0\n"
298 "2: sacf 0\n" 299 "2: sacf 0\n"
299 EX_TABLE(0b,2b) EX_TABLE(1b,2b) 300 EX_TABLE(0b,2b) EX_TABLE(1b,2b)
300 : "=d" (ret), "+d" (oldval), "=m" (*uaddr) 301 : "=d" (ret), "+d" (oldval), "=m" (*uaddr)
301 : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr) 302 : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr)
302 : "cc", "memory" ); 303 : "cc", "memory" );
304 *uval = oldval;
303 return ret; 305 return ret;
304} 306}
305 307
diff --git a/arch/sh/include/asm/futex-irq.h b/arch/sh/include/asm/futex-irq.h
index a9f16a7f9ae..6cb9f193a95 100644
--- a/arch/sh/include/asm/futex-irq.h
+++ b/arch/sh/include/asm/futex-irq.h
@@ -3,7 +3,7 @@
3 3
4#include <asm/system.h> 4#include <asm/system.h>
5 5
6static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr, 6static inline int atomic_futex_op_xchg_set(int oparg, u32 __user *uaddr,
7 int *oldval) 7 int *oldval)
8{ 8{
9 unsigned long flags; 9 unsigned long flags;
@@ -20,7 +20,7 @@ static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr,
20 return ret; 20 return ret;
21} 21}
22 22
23static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr, 23static inline int atomic_futex_op_xchg_add(int oparg, u32 __user *uaddr,
24 int *oldval) 24 int *oldval)
25{ 25{
26 unsigned long flags; 26 unsigned long flags;
@@ -37,7 +37,7 @@ static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr,
37 return ret; 37 return ret;
38} 38}
39 39
40static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr, 40static inline int atomic_futex_op_xchg_or(int oparg, u32 __user *uaddr,
41 int *oldval) 41 int *oldval)
42{ 42{
43 unsigned long flags; 43 unsigned long flags;
@@ -54,7 +54,7 @@ static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr,
54 return ret; 54 return ret;
55} 55}
56 56
57static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr, 57static inline int atomic_futex_op_xchg_and(int oparg, u32 __user *uaddr,
58 int *oldval) 58 int *oldval)
59{ 59{
60 unsigned long flags; 60 unsigned long flags;
@@ -71,7 +71,7 @@ static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr,
71 return ret; 71 return ret;
72} 72}
73 73
74static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr, 74static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr,
75 int *oldval) 75 int *oldval)
76{ 76{
77 unsigned long flags; 77 unsigned long flags;
@@ -88,11 +88,13 @@ static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr,
88 return ret; 88 return ret;
89} 89}
90 90
91static inline int atomic_futex_op_cmpxchg_inatomic(int __user *uaddr, 91static inline int atomic_futex_op_cmpxchg_inatomic(u32 *uval,
92 int oldval, int newval) 92 u32 __user *uaddr,
93 u32 oldval, u32 newval)
93{ 94{
94 unsigned long flags; 95 unsigned long flags;
95 int ret, prev = 0; 96 int ret;
97 u32 prev = 0;
96 98
97 local_irq_save(flags); 99 local_irq_save(flags);
98 100
@@ -102,10 +104,8 @@ static inline int atomic_futex_op_cmpxchg_inatomic(int __user *uaddr,
102 104
103 local_irq_restore(flags); 105 local_irq_restore(flags);
104 106
105 if (ret) 107 *uval = prev;
106 return ret; 108 return ret;
107
108 return prev;
109} 109}
110 110
111#endif /* __ASM_SH_FUTEX_IRQ_H */ 111#endif /* __ASM_SH_FUTEX_IRQ_H */
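All five atomic_futex_op_xchg_*() variants in this header follow one UP-only pattern, now taking u32 __user pointers: with interrupts masked, a plain get_user/put_user pair is atomic with respect to other futex operations on the same CPU. The common shape, sketched ('futex_op_template' is a made-up name):

static inline int futex_op_template(int oparg, u32 __user *uaddr, int *oldval)
{
        unsigned long flags;
        int ret, newval;

        local_irq_save(flags);
        ret = get_user(*oldval, uaddr);         /* may fail with -EFAULT */
        if (!ret) {
                newval = oparg;                 /* 'set'; the other ops combine with *oldval */
                ret = put_user(newval, uaddr);
        }
        local_irq_restore(flags);
        return ret;
}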
diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h
index 68256ec5fa3..7be39a646fb 100644
--- a/arch/sh/include/asm/futex.h
+++ b/arch/sh/include/asm/futex.h
@@ -10,7 +10,7 @@
10/* XXX: UP variants, fix for SH-4A and SMP.. */ 10/* XXX: UP variants, fix for SH-4A and SMP.. */
11#include <asm/futex-irq.h> 11#include <asm/futex-irq.h>
12 12
13static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) 13static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
14{ 14{
15 int op = (encoded_op >> 28) & 7; 15 int op = (encoded_op >> 28) & 7;
16 int cmp = (encoded_op >> 24) & 15; 16 int cmp = (encoded_op >> 24) & 15;
@@ -21,7 +21,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
21 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 21 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
22 oparg = 1 << oparg; 22 oparg = 1 << oparg;
23 23
24 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 24 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
25 return -EFAULT; 25 return -EFAULT;
26 26
27 pagefault_disable(); 27 pagefault_disable();
@@ -65,12 +65,13 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
65} 65}
66 66
67static inline int 67static inline int
68futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) 68futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
69 u32 oldval, u32 newval)
69{ 70{
70 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 71 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
71 return -EFAULT; 72 return -EFAULT;
72 73
73 return atomic_futex_op_cmpxchg_inatomic(uaddr, oldval, newval); 74 return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval);
74} 75}
75 76
76#endif /* __KERNEL__ */ 77#endif /* __KERNEL__ */
diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h
index 06e2251a5e4..edab5726529 100644
--- a/arch/sh/include/asm/rwsem.h
+++ b/arch/sh/include/asm/rwsem.h
@@ -11,64 +11,13 @@
11#endif 11#endif
12 12
13#ifdef __KERNEL__ 13#ifdef __KERNEL__
14#include <linux/list.h>
15#include <linux/spinlock.h>
16#include <asm/atomic.h>
17#include <asm/system.h>
18 14
19/*
20 * the semaphore definition
21 */
22struct rw_semaphore {
23 long count;
24#define RWSEM_UNLOCKED_VALUE 0x00000000 15#define RWSEM_UNLOCKED_VALUE 0x00000000
25#define RWSEM_ACTIVE_BIAS 0x00000001 16#define RWSEM_ACTIVE_BIAS 0x00000001
26#define RWSEM_ACTIVE_MASK 0x0000ffff 17#define RWSEM_ACTIVE_MASK 0x0000ffff
27#define RWSEM_WAITING_BIAS (-0x00010000) 18#define RWSEM_WAITING_BIAS (-0x00010000)
28#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS 19#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
29#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) 20#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
30 spinlock_t wait_lock;
31 struct list_head wait_list;
32#ifdef CONFIG_DEBUG_LOCK_ALLOC
33 struct lockdep_map dep_map;
34#endif
35};
36
37#ifdef CONFIG_DEBUG_LOCK_ALLOC
38# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
39#else
40# define __RWSEM_DEP_MAP_INIT(lockname)
41#endif
42
43#define __RWSEM_INITIALIZER(name) \
44 { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
45 LIST_HEAD_INIT((name).wait_list) \
46 __RWSEM_DEP_MAP_INIT(name) }
47
48#define DECLARE_RWSEM(name) \
49 struct rw_semaphore name = __RWSEM_INITIALIZER(name)
50
51extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
52extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
53extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
54extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
55
56extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
57 struct lock_class_key *key);
58
59#define init_rwsem(sem) \
60do { \
61 static struct lock_class_key __key; \
62 \
63 __init_rwsem((sem), #sem, &__key); \
64} while (0)
65
66static inline void init_rwsem(struct rw_semaphore *sem)
67{
68 sem->count = RWSEM_UNLOCKED_VALUE;
69 spin_lock_init(&sem->wait_lock);
70 INIT_LIST_HEAD(&sem->wait_list);
71}
72 21
73/* 22/*
74 * lock for reading 23 * lock for reading
@@ -179,10 +128,5 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
179 return atomic_add_return(delta, (atomic_t *)(&sem->count)); 128 return atomic_add_return(delta, (atomic_t *)(&sem->count));
180} 129}
181 130
182static inline int rwsem_is_locked(struct rw_semaphore *sem)
183{
184 return (sem->count != 0);
185}
186
187#endif /* __KERNEL__ */ 131#endif /* __KERNEL__ */
188#endif /* _ASM_SH_RWSEM_H */ 132#endif /* _ASM_SH_RWSEM_H */
diff --git a/arch/sparc/include/asm/fcntl.h b/arch/sparc/include/asm/fcntl.h
index 38f37b333cc..d0b83f66f35 100644
--- a/arch/sparc/include/asm/fcntl.h
+++ b/arch/sparc/include/asm/fcntl.h
@@ -34,6 +34,8 @@
34#define __O_SYNC 0x800000 34#define __O_SYNC 0x800000
35#define O_SYNC (__O_SYNC|O_DSYNC) 35#define O_SYNC (__O_SYNC|O_DSYNC)
36 36
37#define O_PATH 0x1000000
38
37#define F_GETOWN 5 /* for sockets. */ 39#define F_GETOWN 5 /* for sockets. */
38#define F_SETOWN 6 /* for sockets. */ 40#define F_SETOWN 6 /* for sockets. */
39#define F_GETLK 7 41#define F_GETLK 7
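O_PATH gets its own sparc value above because sparc's O_* bit layout differs from asm-generic. A hypothetical user-space use of the new flag — the descriptor identifies a location without granting read or write access, but still anchors *at() calls:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int dir = open("/etc", O_PATH);         /* no read permission required */
        int fd = openat(dir, "hostname", O_RDONLY);

        printf("dir=%d fd=%d\n", dir, fd);
        if (fd >= 0)
                close(fd);
        if (dir >= 0)
                close(dir);
        return 0;
}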
diff --git a/arch/sparc/include/asm/futex_64.h b/arch/sparc/include/asm/futex_64.h
index 47f95839dc6..444e7bea23b 100644
--- a/arch/sparc/include/asm/futex_64.h
+++ b/arch/sparc/include/asm/futex_64.h
@@ -30,7 +30,7 @@
30 : "r" (uaddr), "r" (oparg), "i" (-EFAULT) \ 30 : "r" (uaddr), "r" (oparg), "i" (-EFAULT) \
31 : "memory") 31 : "memory")
32 32
33static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) 33static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
34{ 34{
35 int op = (encoded_op >> 28) & 7; 35 int op = (encoded_op >> 28) & 7;
36 int cmp = (encoded_op >> 24) & 15; 36 int cmp = (encoded_op >> 24) & 15;
@@ -38,7 +38,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
38 int cmparg = (encoded_op << 20) >> 20; 38 int cmparg = (encoded_op << 20) >> 20;
39 int oldval = 0, ret, tem; 39 int oldval = 0, ret, tem;
40 40
41 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))) 41 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
42 return -EFAULT; 42 return -EFAULT;
43 if (unlikely((((unsigned long) uaddr) & 0x3UL))) 43 if (unlikely((((unsigned long) uaddr) & 0x3UL)))
44 return -EINVAL; 44 return -EINVAL;
@@ -85,26 +85,30 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
85} 85}
86 86
87static inline int 87static inline int
88futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) 88futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
89 u32 oldval, u32 newval)
89{ 90{
91 int ret = 0;
92
90 __asm__ __volatile__( 93 __asm__ __volatile__(
91 "\n1: casa [%3] %%asi, %2, %0\n" 94 "\n1: casa [%4] %%asi, %3, %1\n"
92 "2:\n" 95 "2:\n"
93 " .section .fixup,#alloc,#execinstr\n" 96 " .section .fixup,#alloc,#execinstr\n"
94 " .align 4\n" 97 " .align 4\n"
95 "3: sethi %%hi(2b), %0\n" 98 "3: sethi %%hi(2b), %0\n"
96 " jmpl %0 + %%lo(2b), %%g0\n" 99 " jmpl %0 + %%lo(2b), %%g0\n"
97 " mov %4, %0\n" 100 " mov %5, %0\n"
98 " .previous\n" 101 " .previous\n"
99 " .section __ex_table,\"a\"\n" 102 " .section __ex_table,\"a\"\n"
100 " .align 4\n" 103 " .align 4\n"
101 " .word 1b, 3b\n" 104 " .word 1b, 3b\n"
102 " .previous\n" 105 " .previous\n"
103 : "=r" (newval) 106 : "+r" (ret), "=r" (newval)
104 : "0" (newval), "r" (oldval), "r" (uaddr), "i" (-EFAULT) 107 : "1" (newval), "r" (oldval), "r" (uaddr), "i" (-EFAULT)
105 : "memory"); 108 : "memory");
106 109
107 return newval; 110 *uval = newval;
111 return ret;
108} 112}
109 113
110#endif /* !(_SPARC64_FUTEX_H) */ 114#endif /* !(_SPARC64_FUTEX_H) */
diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h
index a2b4302869b..069bf4d663a 100644
--- a/arch/sparc/include/asm/rwsem.h
+++ b/arch/sparc/include/asm/rwsem.h
@@ -13,53 +13,12 @@
13 13
14#ifdef __KERNEL__ 14#ifdef __KERNEL__
15 15
16#include <linux/list.h>
17#include <linux/spinlock.h>
18
19struct rwsem_waiter;
20
21struct rw_semaphore {
22 signed long count;
23#define RWSEM_UNLOCKED_VALUE 0x00000000L 16#define RWSEM_UNLOCKED_VALUE 0x00000000L
24#define RWSEM_ACTIVE_BIAS 0x00000001L 17#define RWSEM_ACTIVE_BIAS 0x00000001L
25#define RWSEM_ACTIVE_MASK 0xffffffffL 18#define RWSEM_ACTIVE_MASK 0xffffffffL
26#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) 19#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1)
27#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS 20#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
28#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) 21#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
29 spinlock_t wait_lock;
30 struct list_head wait_list;
31#ifdef CONFIG_DEBUG_LOCK_ALLOC
32 struct lockdep_map dep_map;
33#endif
34};
35
36#ifdef CONFIG_DEBUG_LOCK_ALLOC
37# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
38#else
39# define __RWSEM_DEP_MAP_INIT(lockname)
40#endif
41
42#define __RWSEM_INITIALIZER(name) \
43{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
44 LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) }
45
46#define DECLARE_RWSEM(name) \
47 struct rw_semaphore name = __RWSEM_INITIALIZER(name)
48
49extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
50extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
51extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
52extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
53
54extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
55 struct lock_class_key *key);
56
57#define init_rwsem(sem) \
58do { \
59 static struct lock_class_key __key; \
60 \
61 __init_rwsem((sem), #sem, &__key); \
62} while (0)
63 22
64/* 23/*
65 * lock for reading 24 * lock for reading
@@ -160,11 +119,6 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
160 return atomic64_add_return(delta, (atomic64_t *)(&sem->count)); 119 return atomic64_add_return(delta, (atomic64_t *)(&sem->count));
161} 120}
162 121
163static inline int rwsem_is_locked(struct rw_semaphore *sem)
164{
165 return (sem->count != 0);
166}
167
168#endif /* __KERNEL__ */ 122#endif /* __KERNEL__ */
169 123
170#endif /* _SPARC64_RWSEM_H */ 124#endif /* _SPARC64_RWSEM_H */
diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c
index aeaa09a3c65..2cdc131b50a 100644
--- a/arch/sparc/kernel/pcic.c
+++ b/arch/sparc/kernel/pcic.c
@@ -700,10 +700,8 @@ static void pcic_clear_clock_irq(void)
700 700
701static irqreturn_t pcic_timer_handler (int irq, void *h) 701static irqreturn_t pcic_timer_handler (int irq, void *h)
702{ 702{
703 write_seqlock(&xtime_lock); /* Dummy, to show that we remember */
704 pcic_clear_clock_irq(); 703 pcic_clear_clock_irq();
705 do_timer(1); 704 xtime_update(1);
706 write_sequnlock(&xtime_lock);
707#ifndef CONFIG_SMP 705#ifndef CONFIG_SMP
708 update_process_times(user_mode(get_irq_regs())); 706 update_process_times(user_mode(get_irq_regs()));
709#endif 707#endif
diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c
index 9c743b1886f..4211bfc9bca 100644
--- a/arch/sparc/kernel/time_32.c
+++ b/arch/sparc/kernel/time_32.c
@@ -85,7 +85,7 @@ int update_persistent_clock(struct timespec now)
85 85
86/* 86/*
87 * timer_interrupt() needs to keep up the real-time clock, 87 * timer_interrupt() needs to keep up the real-time clock,
88 * as well as call the "do_timer()" routine every clocktick 88 * as well as call the "xtime_update()" routine every clocktick
89 */ 89 */
90 90
91#define TICK_SIZE (tick_nsec / 1000) 91#define TICK_SIZE (tick_nsec / 1000)
@@ -96,14 +96,9 @@ static irqreturn_t timer_interrupt(int dummy, void *dev_id)
96 profile_tick(CPU_PROFILING); 96 profile_tick(CPU_PROFILING);
97#endif 97#endif
98 98
99 /* Protect counter clear so that do_gettimeoffset works */
100 write_seqlock(&xtime_lock);
101
102 clear_clock_irq(); 99 clear_clock_irq();
103 100
104 do_timer(1); 101 xtime_update(1);
105
106 write_sequnlock(&xtime_lock);
107 102
108#ifndef CONFIG_SMP 103#ifndef CONFIG_SMP
109 update_process_times(user_mode(get_irq_regs())); 104 update_process_times(user_mode(get_irq_regs()));
diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c
index cbddeb38ffd..d3c7a12ad87 100644
--- a/arch/sparc/lib/atomic32.c
+++ b/arch/sparc/lib/atomic32.c
@@ -16,7 +16,7 @@
16#define ATOMIC_HASH(a) (&__atomic_hash[(((unsigned long)a)>>8) & (ATOMIC_HASH_SIZE-1)]) 16#define ATOMIC_HASH(a) (&__atomic_hash[(((unsigned long)a)>>8) & (ATOMIC_HASH_SIZE-1)])
17 17
18spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] = { 18spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] = {
19 [0 ... (ATOMIC_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED 19 [0 ... (ATOMIC_HASH_SIZE-1)] = __SPIN_LOCK_UNLOCKED(__atomic_hash)
20}; 20};
21 21
22#else /* SMP */ 22#else /* SMP */
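The change from SPIN_LOCK_UNLOCKED to __SPIN_LOCK_UNLOCKED(name) here (and in ubd_kern.c below) is about lockdep: a single shared initializer puts every such lock into one lock class, while the named form gives each static lock its own. The two idiomatic forms, sketched ('my_lock' and 'other_lock' are made-up names):

static DEFINE_SPINLOCK(my_lock);        /* preferred for standalone locks */

static spinlock_t other_lock =
        __SPIN_LOCK_UNLOCKED(other_lock);       /* for array/struct initializers */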
diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h
index fe0d10dcae5..d03ec124a59 100644
--- a/arch/tile/include/asm/futex.h
+++ b/arch/tile/include/asm/futex.h
@@ -29,16 +29,16 @@
29#include <linux/uaccess.h> 29#include <linux/uaccess.h>
30#include <linux/errno.h> 30#include <linux/errno.h>
31 31
32extern struct __get_user futex_set(int __user *v, int i); 32extern struct __get_user futex_set(u32 __user *v, int i);
33extern struct __get_user futex_add(int __user *v, int n); 33extern struct __get_user futex_add(u32 __user *v, int n);
34extern struct __get_user futex_or(int __user *v, int n); 34extern struct __get_user futex_or(u32 __user *v, int n);
35extern struct __get_user futex_andn(int __user *v, int n); 35extern struct __get_user futex_andn(u32 __user *v, int n);
36extern struct __get_user futex_cmpxchg(int __user *v, int o, int n); 36extern struct __get_user futex_cmpxchg(u32 __user *v, int o, int n);
37 37
38#ifndef __tilegx__ 38#ifndef __tilegx__
39extern struct __get_user futex_xor(int __user *v, int n); 39extern struct __get_user futex_xor(u32 __user *v, int n);
40#else 40#else
41static inline struct __get_user futex_xor(int __user *uaddr, int n) 41static inline struct __get_user futex_xor(u32 __user *uaddr, int n)
42{ 42{
43 struct __get_user asm_ret = __get_user_4(uaddr); 43 struct __get_user asm_ret = __get_user_4(uaddr);
44 if (!asm_ret.err) { 44 if (!asm_ret.err) {
@@ -53,7 +53,7 @@ static inline struct __get_user futex_xor(int __user *uaddr, int n)
53} 53}
54#endif 54#endif
55 55
56static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) 56static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
57{ 57{
58 int op = (encoded_op >> 28) & 7; 58 int op = (encoded_op >> 28) & 7;
59 int cmp = (encoded_op >> 24) & 15; 59 int cmp = (encoded_op >> 24) & 15;
@@ -65,7 +65,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
65 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 65 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
66 oparg = 1 << oparg; 66 oparg = 1 << oparg;
67 67
68 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 68 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
69 return -EFAULT; 69 return -EFAULT;
70 70
71 pagefault_disable(); 71 pagefault_disable();
@@ -119,16 +119,17 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
119 return ret; 119 return ret;
120} 120}
121 121
122static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, 122static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
123 int newval) 123 u32 oldval, u32 newval)
124{ 124{
125 struct __get_user asm_ret; 125 struct __get_user asm_ret;
126 126
127 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 127 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
128 return -EFAULT; 128 return -EFAULT;
129 129
130 asm_ret = futex_cmpxchg(uaddr, oldval, newval); 130 asm_ret = futex_cmpxchg(uaddr, oldval, newval);
131 return asm_ret.err ? asm_ret.err : asm_ret.val; 131 *uval = asm_ret.val;
132 return asm_ret.err;
132} 133}
133 134
134#ifndef __tilegx__ 135#ifndef __tilegx__
diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common
index e351e14b433..1e78940218c 100644
--- a/arch/um/Kconfig.common
+++ b/arch/um/Kconfig.common
@@ -7,6 +7,7 @@ config UML
7 bool 7 bool
8 default y 8 default y
9 select HAVE_GENERIC_HARDIRQS 9 select HAVE_GENERIC_HARDIRQS
10 select GENERIC_HARDIRQS_NO_DEPRECATED
10 11
11config MMU 12config MMU
12 bool 13 bool
diff --git a/arch/um/Kconfig.x86 b/arch/um/Kconfig.x86
index 5ee328099c6..02fb017fed4 100644
--- a/arch/um/Kconfig.x86
+++ b/arch/um/Kconfig.x86
@@ -10,6 +10,8 @@ endmenu
10 10
11config UML_X86 11config UML_X86
12 def_bool y 12 def_bool y
13 select GENERIC_FIND_FIRST_BIT
14 select GENERIC_FIND_NEXT_BIT
13 15
14config 64BIT 16config 64BIT
15 bool 17 bool
@@ -19,6 +21,9 @@ config X86_32
19 def_bool !64BIT 21 def_bool !64BIT
20 select HAVE_AOUT 22 select HAVE_AOUT
21 23
24config X86_64
25 def_bool 64BIT
26
22config RWSEM_XCHGADD_ALGORITHM 27config RWSEM_XCHGADD_ALGORITHM
23 def_bool X86_XADD 28 def_bool X86_XADD
24 29
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 975613b23dc..c70e047eed7 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -124,35 +124,18 @@ void mconsole_log(struct mc_request *req)
124#if 0 124#if 0
125void mconsole_proc(struct mc_request *req) 125void mconsole_proc(struct mc_request *req)
126{ 126{
127 struct nameidata nd;
128 struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt; 127 struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt;
129 struct file *file; 128 struct file *file;
130 int n, err; 129 int n;
131 char *ptr = req->request.data, *buf; 130 char *ptr = req->request.data, *buf;
132 mm_segment_t old_fs = get_fs(); 131 mm_segment_t old_fs = get_fs();
133 132
134 ptr += strlen("proc"); 133 ptr += strlen("proc");
135 ptr = skip_spaces(ptr); 134 ptr = skip_spaces(ptr);
136 135
137 err = vfs_path_lookup(mnt->mnt_root, mnt, ptr, LOOKUP_FOLLOW, &nd); 136 file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY);
138 if (err) {
139 mconsole_reply(req, "Failed to look up file", 1, 0);
140 goto out;
141 }
142
143 err = may_open(&nd.path, MAY_READ, O_RDONLY);
144 if (result) {
145 mconsole_reply(req, "Failed to open file", 1, 0);
146 path_put(&nd.path);
147 goto out;
148 }
149
150 file = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY,
151 current_cred());
152 err = PTR_ERR(file);
153 if (IS_ERR(file)) { 137 if (IS_ERR(file)) {
154 mconsole_reply(req, "Failed to open file", 1, 0); 138 mconsole_reply(req, "Failed to open file", 1, 0);
155 path_put(&nd.path);
156 goto out; 139 goto out;
157 } 140 }
158 141
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index ba4a98ba39c..620f5b70957 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -185,7 +185,7 @@ struct ubd {
185 .no_cow = 0, \ 185 .no_cow = 0, \
186 .shared = 0, \ 186 .shared = 0, \
187 .cow = DEFAULT_COW, \ 187 .cow = DEFAULT_COW, \
188 .lock = SPIN_LOCK_UNLOCKED, \ 188 .lock = __SPIN_LOCK_UNLOCKED(ubd_devs.lock), \
189 .request = NULL, \ 189 .request = NULL, \
190 .start_sg = 0, \ 190 .start_sg = 0, \
191 .end_sg = 0, \ 191 .end_sg = 0, \
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 3f0ac9e0c96..64cfea80cfe 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -35,8 +35,10 @@ int show_interrupts(struct seq_file *p, void *v)
35 } 35 }
36 36
37 if (i < NR_IRQS) { 37 if (i < NR_IRQS) {
38 raw_spin_lock_irqsave(&irq_desc[i].lock, flags); 38 struct irq_desc *desc = irq_to_desc(i);
39 action = irq_desc[i].action; 39
40 raw_spin_lock_irqsave(&desc->lock, flags);
41 action = desc->action;
40 if (!action) 42 if (!action)
41 goto skip; 43 goto skip;
42 seq_printf(p, "%3d: ",i); 44 seq_printf(p, "%3d: ",i);
@@ -46,7 +48,7 @@ int show_interrupts(struct seq_file *p, void *v)
46 for_each_online_cpu(j) 48 for_each_online_cpu(j)
47 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); 49 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
48#endif 50#endif
49 seq_printf(p, " %14s", irq_desc[i].chip->name); 51 seq_printf(p, " %14s", get_irq_desc_chip(desc)->name);
50 seq_printf(p, " %s", action->name); 52 seq_printf(p, " %s", action->name);
51 53
52 for (action=action->next; action; action = action->next) 54 for (action=action->next; action; action = action->next)
@@ -54,7 +56,7 @@ int show_interrupts(struct seq_file *p, void *v)
54 56
55 seq_putc(p, '\n'); 57 seq_putc(p, '\n');
56skip: 58skip:
57 raw_spin_unlock_irqrestore(&irq_desc[i].lock, flags); 59 raw_spin_unlock_irqrestore(&desc->lock, flags);
58 } else if (i == NR_IRQS) 60 } else if (i == NR_IRQS)
59 seq_putc(p, '\n'); 61 seq_putc(p, '\n');
60 62
@@ -360,10 +362,10 @@ EXPORT_SYMBOL(um_request_irq);
360EXPORT_SYMBOL(reactivate_fd); 362EXPORT_SYMBOL(reactivate_fd);
361 363
362/* 364/*
363 * irq_chip must define (startup || enable) && 365 * irq_chip must define at least enable/disable and ack when
364 * (shutdown || disable) && end 366 * the edge handler is used.
365 */ 367 */
366static void dummy(unsigned int irq) 368static void dummy(struct irq_data *d)
367{ 369{
368} 370}
369 371
@@ -371,20 +373,17 @@ static void dummy(unsigned int irq)
371static struct irq_chip normal_irq_type = { 373static struct irq_chip normal_irq_type = {
372 .name = "SIGIO", 374 .name = "SIGIO",
373 .release = free_irq_by_irq_and_dev, 375 .release = free_irq_by_irq_and_dev,
374 .disable = dummy, 376 .irq_disable = dummy,
375 .enable = dummy, 377 .irq_enable = dummy,
376 .ack = dummy, 378 .irq_ack = dummy,
377 .end = dummy
378}; 379};
379 380
380static struct irq_chip SIGVTALRM_irq_type = { 381static struct irq_chip SIGVTALRM_irq_type = {
381 .name = "SIGVTALRM", 382 .name = "SIGVTALRM",
382 .release = free_irq_by_irq_and_dev, 383 .release = free_irq_by_irq_and_dev,
383 .shutdown = dummy, /* never called */ 384 .irq_disable = dummy,
384 .disable = dummy, 385 .irq_enable = dummy,
385 .enable = dummy, 386 .irq_ack = dummy,
386 .ack = dummy,
387 .end = dummy
388}; 387};
389 388
390void __init init_IRQ(void) 389void __init init_IRQ(void)
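The UML irq_chip conversion above matches the GENERIC_HARDIRQS_NO_DEPRECATED select added to Kconfig.common earlier in this patch: the methods gain an irq_ prefix and take struct irq_data instead of a bare irq number, and the unused .end/.shutdown hooks go away. A minimal chip in the new style might look like this sketch ('example_*' names are made up):

static void example_noop(struct irq_data *d)
{
        /* d->irq still holds the Linux irq number when needed */
}

static struct irq_chip example_chip = {
        .name        = "example",
        .irq_enable  = example_noop,
        .irq_disable = example_noop,
        .irq_ack     = example_noop,
};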
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d5ed94d30aa..f8958b01b97 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -64,8 +64,12 @@ config X86
64 select HAVE_TEXT_POKE_SMP 64 select HAVE_TEXT_POKE_SMP
65 select HAVE_GENERIC_HARDIRQS 65 select HAVE_GENERIC_HARDIRQS
66 select HAVE_SPARSE_IRQ 66 select HAVE_SPARSE_IRQ
67 select GENERIC_FIND_FIRST_BIT
68 select GENERIC_FIND_NEXT_BIT
67 select GENERIC_IRQ_PROBE 69 select GENERIC_IRQ_PROBE
68 select GENERIC_PENDING_IRQ if SMP 70 select GENERIC_PENDING_IRQ if SMP
71 select GENERIC_IRQ_SHOW
72 select IRQ_FORCED_THREADING
69 select USE_GENERIC_SMP_HELPERS if SMP 73 select USE_GENERIC_SMP_HELPERS if SMP
70 74
71config INSTRUCTION_DECODER 75config INSTRUCTION_DECODER
@@ -382,6 +386,8 @@ config X86_INTEL_CE
382 depends on X86_32 386 depends on X86_32
383 depends on X86_EXTENDED_PLATFORM 387 depends on X86_EXTENDED_PLATFORM
384 select X86_REBOOTFIXUPS 388 select X86_REBOOTFIXUPS
389 select OF
390 select OF_EARLY_FLATTREE
385 ---help--- 391 ---help---
386 Select for the Intel CE media processor (CE4100) SOC. 392 Select for the Intel CE media processor (CE4100) SOC.
387 This option compiles in support for the CE4100 SOC for settop 393 This option compiles in support for the CE4100 SOC for settop
@@ -811,7 +817,7 @@ config X86_LOCAL_APIC
811 817
812config X86_IO_APIC 818config X86_IO_APIC
813 def_bool y 819 def_bool y
814 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC 820 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC
815 821
816config X86_VISWS_APIC 822config X86_VISWS_APIC
817 def_bool y 823 def_bool y
@@ -1705,7 +1711,7 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
1705 depends on NUMA 1711 depends on NUMA
1706 1712
1707config USE_PERCPU_NUMA_NODE_ID 1713config USE_PERCPU_NUMA_NODE_ID
1708 def_bool X86_64 1714 def_bool y
1709 depends on NUMA 1715 depends on NUMA
1710 1716
1711menu "Power management and ACPI options" 1717menu "Power management and ACPI options"
@@ -2066,9 +2072,10 @@ config SCx200HR_TIMER
2066 2072
2067config OLPC 2073config OLPC
2068 bool "One Laptop Per Child support" 2074 bool "One Laptop Per Child support"
2075 depends on !X86_PAE
2069 select GPIOLIB 2076 select GPIOLIB
2070 select OLPC_OPENFIRMWARE 2077 select OF
2071 depends on !X86_64 && !X86_PAE 2078 select OF_PROMTREE if PROC_DEVICETREE
2072 ---help--- 2079 ---help---
2073 Add support for detecting the unique features of the OLPC 2080 Add support for detecting the unique features of the OLPC
2074 XO hardware. 2081 XO hardware.
@@ -2079,21 +2086,6 @@ config OLPC_XO1
2079 ---help--- 2086 ---help---
2080 Add support for non-essential features of the OLPC XO-1 laptop. 2087 Add support for non-essential features of the OLPC XO-1 laptop.
2081 2088
2082config OLPC_OPENFIRMWARE
2083 bool "Support for OLPC's Open Firmware"
2084 depends on !X86_64 && !X86_PAE
2085 default n
2086 select OF
2087 help
2088 This option adds support for the implementation of Open Firmware
2089 that is used on the OLPC XO-1 Children's Machine.
2090 If unsure, say N here.
2091
2092config OLPC_OPENFIRMWARE_DT
2093 bool
2094 default y if OLPC_OPENFIRMWARE && PROC_DEVICETREE
2095 select OF_PROMTREE
2096
2097endif # X86_32 2089endif # X86_32
2098 2090
2099config AMD_NB 2091config AMD_NB
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index e6d3013f7ec..75d89ac58d2 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -294,11 +294,6 @@ config X86_GENERIC
294 294
295endif 295endif
296 296
297config X86_CPU
298 def_bool y
299 select GENERIC_FIND_FIRST_BIT
300 select GENERIC_FIND_NEXT_BIT
301
302# 297#
303# Define implied options from the CPU selection here 298# Define implied options from the CPU selection here
304config X86_INTERNODE_CACHE_SHIFT 299config X86_INTERNODE_CACHE_SHIFT
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 518bb99c339..430312ba6e3 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -25,6 +25,8 @@
25#define sysretl_audit ia32_ret_from_sys_call 25#define sysretl_audit ia32_ret_from_sys_call
26#endif 26#endif
27 27
28 .section .entry.text, "ax"
29
28#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) 30#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
29 31
30 .macro IA32_ARG_FIXUP noebp=0 32 .macro IA32_ARG_FIXUP noebp=0
@@ -126,26 +128,20 @@ ENTRY(ia32_sysenter_target)
126 */ 128 */
127 ENABLE_INTERRUPTS(CLBR_NONE) 129 ENABLE_INTERRUPTS(CLBR_NONE)
128 movl %ebp,%ebp /* zero extension */ 130 movl %ebp,%ebp /* zero extension */
129 pushq $__USER32_DS 131 pushq_cfi $__USER32_DS
130 CFI_ADJUST_CFA_OFFSET 8
131 /*CFI_REL_OFFSET ss,0*/ 132 /*CFI_REL_OFFSET ss,0*/
132 pushq %rbp 133 pushq_cfi %rbp
133 CFI_ADJUST_CFA_OFFSET 8
134 CFI_REL_OFFSET rsp,0 134 CFI_REL_OFFSET rsp,0
135 pushfq 135 pushfq_cfi
136 CFI_ADJUST_CFA_OFFSET 8
137 /*CFI_REL_OFFSET rflags,0*/ 136 /*CFI_REL_OFFSET rflags,0*/
138 movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d 137 movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d
139 CFI_REGISTER rip,r10 138 CFI_REGISTER rip,r10
140 pushq $__USER32_CS 139 pushq_cfi $__USER32_CS
141 CFI_ADJUST_CFA_OFFSET 8
142 /*CFI_REL_OFFSET cs,0*/ 140 /*CFI_REL_OFFSET cs,0*/
143 movl %eax, %eax 141 movl %eax, %eax
144 pushq %r10 142 pushq_cfi %r10
145 CFI_ADJUST_CFA_OFFSET 8
146 CFI_REL_OFFSET rip,0 143 CFI_REL_OFFSET rip,0
147 pushq %rax 144 pushq_cfi %rax
148 CFI_ADJUST_CFA_OFFSET 8
149 cld 145 cld
150 SAVE_ARGS 0,0,1 146 SAVE_ARGS 0,0,1
151 /* no need to do an access_ok check here because rbp has been 147 /* no need to do an access_ok check here because rbp has been
@@ -182,11 +178,9 @@ sysexit_from_sys_call:
182 xorq %r9,%r9 178 xorq %r9,%r9
183 xorq %r10,%r10 179 xorq %r10,%r10
184 xorq %r11,%r11 180 xorq %r11,%r11
185 popfq 181 popfq_cfi
186 CFI_ADJUST_CFA_OFFSET -8
187 /*CFI_RESTORE rflags*/ 182 /*CFI_RESTORE rflags*/
188 popq %rcx /* User %esp */ 183 popq_cfi %rcx /* User %esp */
189 CFI_ADJUST_CFA_OFFSET -8
190 CFI_REGISTER rsp,rcx 184 CFI_REGISTER rsp,rcx
191 TRACE_IRQS_ON 185 TRACE_IRQS_ON
192 ENABLE_INTERRUPTS_SYSEXIT32 186 ENABLE_INTERRUPTS_SYSEXIT32
@@ -421,8 +415,7 @@ ENTRY(ia32_syscall)
421 */ 415 */
422 ENABLE_INTERRUPTS(CLBR_NONE) 416 ENABLE_INTERRUPTS(CLBR_NONE)
423 movl %eax,%eax 417 movl %eax,%eax
424 pushq %rax 418 pushq_cfi %rax
425 CFI_ADJUST_CFA_OFFSET 8
426 cld 419 cld
427 /* note the registers are not zero extended to the sf. 420 /* note the registers are not zero extended to the sf.
428 this could be a problem. */ 421 this could be a problem. */
@@ -851,4 +844,7 @@ ia32_sys_call_table:
851 .quad sys_fanotify_init 844 .quad sys_fanotify_init
852 .quad sys32_fanotify_mark 845 .quad sys32_fanotify_mark
853 .quad sys_prlimit64 /* 340 */ 846 .quad sys_prlimit64 /* 340 */
847 .quad sys_name_to_handle_at
848 .quad compat_sys_open_by_handle_at
849 .quad compat_sys_clock_adjtime
854ia32_syscall_end: 850ia32_syscall_end:
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 4ea15ca89b2..b964ec45754 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -186,15 +186,7 @@ struct bootnode;
186 186
187#ifdef CONFIG_ACPI_NUMA 187#ifdef CONFIG_ACPI_NUMA
188extern int acpi_numa; 188extern int acpi_numa;
189extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start, 189extern int x86_acpi_numa_init(void);
190 unsigned long end);
191extern int acpi_scan_nodes(unsigned long start, unsigned long end);
192#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
193
194#ifdef CONFIG_NUMA_EMU
195extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
196 int num_nodes);
197#endif
198#endif /* CONFIG_ACPI_NUMA */ 190#endif /* CONFIG_ACPI_NUMA */
199 191
200#define acpi_unlazy_tlb(x) leave_mm(x) 192#define acpi_unlazy_tlb(x) leave_mm(x)
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 527fb966ab5..331682231bb 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -16,16 +16,10 @@ struct bootnode;
16extern bool early_is_amd_nb(u32 value); 16extern bool early_is_amd_nb(u32 value);
17extern int amd_cache_northbridges(void); 17extern int amd_cache_northbridges(void);
18extern void amd_flush_garts(void); 18extern void amd_flush_garts(void);
19extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn); 19extern int amd_numa_init(void);
20extern int amd_scan_nodes(void);
21extern int amd_get_subcaches(int); 20extern int amd_get_subcaches(int);
22extern int amd_set_subcaches(int, int); 21extern int amd_set_subcaches(int, int);
23 22
24#ifdef CONFIG_NUMA_EMU
25extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
26extern void amd_get_nodes(struct bootnode *nodes);
27#endif
28
29struct amd_northbridge { 23struct amd_northbridge {
30 struct pci_dev *misc; 24 struct pci_dev *misc;
31 struct pci_dev *link; 25 struct pci_dev *link;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 3c896946f4c..a279d98ea95 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -220,7 +220,6 @@ extern void enable_IR_x2apic(void);
220 220
221extern int get_physical_broadcast(void); 221extern int get_physical_broadcast(void);
222 222
223extern void apic_disable(void);
224extern int lapic_get_maxlvt(void); 223extern int lapic_get_maxlvt(void);
225extern void clear_local_APIC(void); 224extern void clear_local_APIC(void);
226extern void connect_bsp_APIC(void); 225extern void connect_bsp_APIC(void);
@@ -228,7 +227,6 @@ extern void disconnect_bsp_APIC(int virt_wire_setup);
228extern void disable_local_APIC(void); 227extern void disable_local_APIC(void);
229extern void lapic_shutdown(void); 228extern void lapic_shutdown(void);
230extern int verify_local_APIC(void); 229extern int verify_local_APIC(void);
231extern void cache_APIC_registers(void);
232extern void sync_Arb_IDs(void); 230extern void sync_Arb_IDs(void);
233extern void init_bsp_APIC(void); 231extern void init_bsp_APIC(void);
234extern void setup_local_APIC(void); 232extern void setup_local_APIC(void);
@@ -239,8 +237,7 @@ void register_lapic_address(unsigned long address);
239extern void setup_boot_APIC_clock(void); 237extern void setup_boot_APIC_clock(void);
240extern void setup_secondary_APIC_clock(void); 238extern void setup_secondary_APIC_clock(void);
241extern int APIC_init_uniprocessor(void); 239extern int APIC_init_uniprocessor(void);
242extern void enable_NMI_through_LVT0(void); 240extern int apic_force_enable(unsigned long addr);
243extern int apic_force_enable(void);
244 241
245/* 242/*
246 * On 32bit this is mach-xxx local 243 * On 32bit this is mach-xxx local
@@ -261,7 +258,6 @@ static inline void lapic_shutdown(void) { }
261#define local_apic_timer_c2_ok 1 258#define local_apic_timer_c2_ok 1
262static inline void init_apic_mappings(void) { } 259static inline void init_apic_mappings(void) { }
263static inline void disable_local_APIC(void) { } 260static inline void disable_local_APIC(void) { }
264static inline void apic_disable(void) { }
265# define setup_boot_APIC_clock x86_init_noop 261# define setup_boot_APIC_clock x86_init_noop
266# define setup_secondary_APIC_clock x86_init_noop 262# define setup_secondary_APIC_clock x86_init_noop
267#endif /* !CONFIG_X86_LOCAL_APIC */ 263#endif /* !CONFIG_X86_LOCAL_APIC */
@@ -307,8 +303,6 @@ struct apic {
307 303
308 void (*setup_apic_routing)(void); 304 void (*setup_apic_routing)(void);
309 int (*multi_timer_check)(int apic, int irq); 305 int (*multi_timer_check)(int apic, int irq);
310 int (*apicid_to_node)(int logical_apicid);
311 int (*cpu_to_logical_apicid)(int cpu);
312 int (*cpu_present_to_apicid)(int mps_cpu); 306 int (*cpu_present_to_apicid)(int mps_cpu);
313 void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); 307 void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
314 void (*setup_portio_remap)(void); 308 void (*setup_portio_remap)(void);
@@ -356,6 +350,23 @@ struct apic {
356 void (*icr_write)(u32 low, u32 high); 350 void (*icr_write)(u32 low, u32 high);
357 void (*wait_icr_idle)(void); 351 void (*wait_icr_idle)(void);
358 u32 (*safe_wait_icr_idle)(void); 352 u32 (*safe_wait_icr_idle)(void);
353
354#ifdef CONFIG_X86_32
355 /*
356 * Called very early during boot from get_smp_config(). It should
357 * return the logical apicid. x86_[bios]_cpu_to_apicid is
358 * initialized before this function is called.
359 *
360 * If logical apicid can't be determined that early, the function
361 * may return BAD_APICID. Logical apicid will be configured after
362 * init_apic_ldr() while bringing up CPUs. Note that NUMA affinity
363 * won't be applied properly during early boot in this case.
364 */
365 int (*x86_32_early_logical_apicid)(int cpu);
366
367 /* determine CPU -> NUMA node mapping */
368 int (*x86_32_numa_cpu_node)(int cpu);
369#endif
359}; 370};
360 371
361/* 372/*
@@ -503,6 +514,11 @@ extern struct apic apic_noop;
503 514
504extern struct apic apic_default; 515extern struct apic apic_default;
505 516
517static inline int noop_x86_32_early_logical_apicid(int cpu)
518{
519 return BAD_APICID;
520}
521
506/* 522/*
507 * Set up the logical destination ID. 523 * Set up the logical destination ID.
508 * 524 *
@@ -522,7 +538,7 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
522 return cpuid_apic >> index_msb; 538 return cpuid_apic >> index_msb;
523} 539}
524 540
525extern int default_apicid_to_node(int logical_apicid); 541extern int default_x86_32_numa_cpu_node(int cpu);
526 542
527#endif 543#endif
528 544
@@ -558,12 +574,6 @@ static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_ma
558 *retmap = *phys_map; 574 *retmap = *phys_map;
559} 575}
560 576
561/* Mapping from cpu number to logical apicid */
562static inline int default_cpu_to_logical_apicid(int cpu)
563{
564 return 1 << cpu;
565}
566
567static inline int __default_cpu_present_to_apicid(int mps_cpu) 577static inline int __default_cpu_present_to_apicid(int mps_cpu)
568{ 578{
569 if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) 579 if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
@@ -596,8 +606,4 @@ extern int default_check_phys_apicid_present(int phys_apicid);
596 606
597#endif /* CONFIG_X86_LOCAL_APIC */ 607#endif /* CONFIG_X86_LOCAL_APIC */
598 608
599#ifdef CONFIG_X86_32
600extern u8 cpu_2_logical_apicid[NR_CPUS];
601#endif
602
603#endif /* _ASM_X86_APIC_H */ 609#endif /* _ASM_X86_APIC_H */
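
[Annotation] For orientation, a sketch of how an apic driver might fill in the two new 32-bit callbacks added to struct apic above. The function name and the flat-mode mapping are illustrative, not lifted from the patch (the mapping mirrors the removed default_cpu_to_logical_apicid() helper):

	static int flat_early_logical_apicid(int cpu)
	{
		/* flat mode: one logical-destination bit per cpu */
		return 1 << cpu;
	}

	/* 32-bit-only fields in the driver's struct apic initializer: */
	#ifdef CONFIG_X86_32
		.x86_32_early_logical_apicid	= flat_early_logical_apicid,
		.x86_32_numa_cpu_node		= default_x86_32_numa_cpu_node,
	#endif
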
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 47a30ff8e51..d87988bacf3 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -426,4 +426,16 @@ struct local_apic {
426#else 426#else
427 #define BAD_APICID 0xFFFFu 427 #define BAD_APICID 0xFFFFu
428#endif 428#endif
429
430enum ioapic_irq_destination_types {
431 dest_Fixed = 0,
432 dest_LowestPrio = 1,
433 dest_SMI = 2,
434 dest__reserved_1 = 3,
435 dest_NMI = 4,
436 dest_INIT = 5,
437 dest__reserved_2 = 6,
438 dest_ExtINT = 7
439};
440
429#endif /* _ASM_X86_APICDEF_H */ 441#endif /* _ASM_X86_APICDEF_H */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index c8bfe63a06d..e020d88ec02 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -12,6 +12,7 @@
12/* setup data types */ 12/* setup data types */
13#define SETUP_NONE 0 13#define SETUP_NONE 0
14#define SETUP_E820_EXT 1 14#define SETUP_E820_EXT 1
15#define SETUP_DTB 2
15 16
16/* extensible setup data list node */ 17/* extensible setup data list node */
17struct setup_data { 18struct setup_data {
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 220e2ea08e8..91f3e087cf2 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -160,6 +160,7 @@
160#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ 160#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
161#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ 161#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
162#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ 162#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */
163#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter extensions */
163 164
164/* 165/*
165 * Auxiliary flags: Linux defined - For features scattered in various 166 * Auxiliary flags: Linux defined - For features scattered in various
@@ -279,6 +280,7 @@ extern const char * const x86_power_flags[32];
279#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) 280#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
280#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) 281#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
281#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) 282#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
283#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
282 284
283#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) 285#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
284# define cpu_has_invlpg 1 286# define cpu_has_invlpg 1
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index e99d55d74df..908b96957d8 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -96,7 +96,7 @@ extern void e820_setup_gap(void);
96extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, 96extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
97 unsigned long start_addr, unsigned long long end_addr); 97 unsigned long start_addr, unsigned long long end_addr);
98struct setup_data; 98struct setup_data;
99extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data); 99extern void parse_e820_ext(struct setup_data *data);
100 100
101#if defined(CONFIG_X86_64) || \ 101#if defined(CONFIG_X86_64) || \
102 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) 102 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 57650ab4a5f..1cd6d26a0a8 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -16,10 +16,13 @@ BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) 16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) 17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
18 18
19.irpc idx, "01234567" 19.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
20 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
21.if NUM_INVALIDATE_TLB_VECTORS > \idx
20BUILD_INTERRUPT3(invalidate_interrupt\idx, 22BUILD_INTERRUPT3(invalidate_interrupt\idx,
21 (INVALIDATE_TLB_VECTOR_START)+\idx, 23 (INVALIDATE_TLB_VECTOR_START)+\idx,
22 smp_invalidate_interrupt) 24 smp_invalidate_interrupt)
25.endif
23.endr 26.endr
24#endif 27#endif
25 28
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 06850a7194e..2c6fc9e6281 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -7,14 +7,12 @@
7 frame pointer later */ 7 frame pointer later */
8#ifdef CONFIG_FRAME_POINTER 8#ifdef CONFIG_FRAME_POINTER
9 .macro FRAME 9 .macro FRAME
10 pushl %ebp 10 pushl_cfi %ebp
11 CFI_ADJUST_CFA_OFFSET 4
12 CFI_REL_OFFSET ebp,0 11 CFI_REL_OFFSET ebp,0
13 movl %esp,%ebp 12 movl %esp,%ebp
14 .endm 13 .endm
15 .macro ENDFRAME 14 .macro ENDFRAME
16 popl %ebp 15 popl_cfi %ebp
17 CFI_ADJUST_CFA_OFFSET -4
18 CFI_RESTORE ebp 16 CFI_RESTORE ebp
19 .endm 17 .endm
20#else 18#else
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index 1f11ce44e95..d09bb03653f 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -37,7 +37,7 @@
37 "+m" (*uaddr), "=&r" (tem) \ 37 "+m" (*uaddr), "=&r" (tem) \
38 : "r" (oparg), "i" (-EFAULT), "1" (0)) 38 : "r" (oparg), "i" (-EFAULT), "1" (0))
39 39
40static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) 40static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
41{ 41{
42 int op = (encoded_op >> 28) & 7; 42 int op = (encoded_op >> 28) & 7;
43 int cmp = (encoded_op >> 24) & 15; 43 int cmp = (encoded_op >> 24) & 15;
@@ -48,7 +48,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
48 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 48 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
49 oparg = 1 << oparg; 49 oparg = 1 << oparg;
50 50
51 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 51 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
52 return -EFAULT; 52 return -EFAULT;
53 53
54#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) 54#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
@@ -109,9 +109,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
109 return ret; 109 return ret;
110} 110}
111 111
112static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, 112static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
113 int newval) 113 u32 oldval, u32 newval)
114{ 114{
115 int ret = 0;
115 116
116#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) 117#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
117 /* Real i386 machines have no cmpxchg instruction */ 118 /* Real i386 machines have no cmpxchg instruction */
@@ -119,21 +120,22 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
119 return -ENOSYS; 120 return -ENOSYS;
120#endif 121#endif
121 122
122 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 123 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
123 return -EFAULT; 124 return -EFAULT;
124 125
125 asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n" 126 asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
126 "2:\t.section .fixup, \"ax\"\n" 127 "2:\t.section .fixup, \"ax\"\n"
127 "3:\tmov %2, %0\n" 128 "3:\tmov %3, %0\n"
128 "\tjmp 2b\n" 129 "\tjmp 2b\n"
129 "\t.previous\n" 130 "\t.previous\n"
130 _ASM_EXTABLE(1b, 3b) 131 _ASM_EXTABLE(1b, 3b)
131 : "=a" (oldval), "+m" (*uaddr) 132 : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
132 : "i" (-EFAULT), "r" (newval), "0" (oldval) 133 : "i" (-EFAULT), "r" (newval), "1" (oldval)
133 : "memory" 134 : "memory"
134 ); 135 );
135 136
136 return oldval; 137 *uval = oldval;
138 return ret;
137} 139}
138 140
139#endif 141#endif
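
[Annotation] The cmpxchg change above is an interface fix, not just a cleanup: the old helper returned the previous value in the same slot it used for -EFAULT, so a fault was indistinguishable from a legitimate old value. A minimal caller-side sketch of the new convention (the surrounding futex-core code is elided; only the error/value split is the point):

	u32 curval;
	int err;

	err = futex_atomic_cmpxchg_inatomic(&curval, uaddr, expected, newval);
	if (err)
		return err;		/* -EFAULT: user page not accessible */
	if (curval != expected)
		return -EAGAIN;		/* lost a race; caller retries */
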
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 0274ec5a7e6..bb9efe8706e 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -45,6 +45,30 @@ extern void invalidate_interrupt4(void);
45extern void invalidate_interrupt5(void); 45extern void invalidate_interrupt5(void);
46extern void invalidate_interrupt6(void); 46extern void invalidate_interrupt6(void);
47extern void invalidate_interrupt7(void); 47extern void invalidate_interrupt7(void);
48extern void invalidate_interrupt8(void);
49extern void invalidate_interrupt9(void);
50extern void invalidate_interrupt10(void);
51extern void invalidate_interrupt11(void);
52extern void invalidate_interrupt12(void);
53extern void invalidate_interrupt13(void);
54extern void invalidate_interrupt14(void);
55extern void invalidate_interrupt15(void);
56extern void invalidate_interrupt16(void);
57extern void invalidate_interrupt17(void);
58extern void invalidate_interrupt18(void);
59extern void invalidate_interrupt19(void);
60extern void invalidate_interrupt20(void);
61extern void invalidate_interrupt21(void);
62extern void invalidate_interrupt22(void);
63extern void invalidate_interrupt23(void);
64extern void invalidate_interrupt24(void);
65extern void invalidate_interrupt25(void);
66extern void invalidate_interrupt26(void);
67extern void invalidate_interrupt27(void);
68extern void invalidate_interrupt28(void);
69extern void invalidate_interrupt29(void);
70extern void invalidate_interrupt30(void);
71extern void invalidate_interrupt31(void);
48 72
49extern void irq_move_cleanup_interrupt(void); 73extern void irq_move_cleanup_interrupt(void);
50extern void reboot_interrupt(void); 74extern void reboot_interrupt(void);
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 36fb1a6a510..8dbe353e41e 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -11,8 +11,8 @@ kernel_physical_mapping_init(unsigned long start,
11 unsigned long page_size_mask); 11 unsigned long page_size_mask);
12 12
13 13
14extern unsigned long __initdata e820_table_start; 14extern unsigned long __initdata pgt_buf_start;
15extern unsigned long __meminitdata e820_table_end; 15extern unsigned long __meminitdata pgt_buf_end;
16extern unsigned long __meminitdata e820_table_top; 16extern unsigned long __meminitdata pgt_buf_top;
17 17
18#endif /* _ASM_X86_INIT_32_H */ 18#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index f327d386d6c..c4bd267dfc5 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -63,17 +63,6 @@ union IO_APIC_reg_03 {
63 } __attribute__ ((packed)) bits; 63 } __attribute__ ((packed)) bits;
64}; 64};
65 65
66enum ioapic_irq_destination_types {
67 dest_Fixed = 0,
68 dest_LowestPrio = 1,
69 dest_SMI = 2,
70 dest__reserved_1 = 3,
71 dest_NMI = 4,
72 dest_INIT = 5,
73 dest__reserved_2 = 6,
74 dest_ExtINT = 7
75};
76
77struct IO_APIC_route_entry { 66struct IO_APIC_route_entry {
78 __u32 vector : 8, 67 __u32 vector : 8,
79 delivery_mode : 3, /* 000: FIXED 68 delivery_mode : 3, /* 000: FIXED
@@ -106,6 +95,10 @@ struct IR_IO_APIC_route_entry {
106 index : 15; 95 index : 15;
107} __attribute__ ((packed)); 96} __attribute__ ((packed));
108 97
98#define IOAPIC_AUTO -1
99#define IOAPIC_EDGE 0
100#define IOAPIC_LEVEL 1
101
109#ifdef CONFIG_X86_IO_APIC 102#ifdef CONFIG_X86_IO_APIC
110 103
111/* 104/*
@@ -150,11 +143,6 @@ extern int timer_through_8259;
150#define io_apic_assign_pci_irqs \ 143#define io_apic_assign_pci_irqs \
151 (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) 144 (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
152 145
153extern u8 io_apic_unique_id(u8 id);
154extern int io_apic_get_unique_id(int ioapic, int apic_id);
155extern int io_apic_get_version(int ioapic);
156extern int io_apic_get_redir_entries(int ioapic);
157
158struct io_apic_irq_attr; 146struct io_apic_irq_attr;
159extern int io_apic_set_pci_routing(struct device *dev, int irq, 147extern int io_apic_set_pci_routing(struct device *dev, int irq,
160 struct io_apic_irq_attr *irq_attr); 148 struct io_apic_irq_attr *irq_attr);
@@ -162,6 +150,8 @@ void setup_IO_APIC_irq_extra(u32 gsi);
162extern void ioapic_and_gsi_init(void); 150extern void ioapic_and_gsi_init(void);
163extern void ioapic_insert_resources(void); 151extern void ioapic_insert_resources(void);
164 152
153int io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr);
154
165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); 155extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
166extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); 156extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
167extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 157extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
@@ -186,6 +176,8 @@ extern void __init pre_init_apic_IRQ0(void);
186 176
187extern void mp_save_irq(struct mpc_intsrc *m); 177extern void mp_save_irq(struct mpc_intsrc *m);
188 178
179extern void disable_ioapic_support(void);
180
189#else /* !CONFIG_X86_IO_APIC */ 181#else /* !CONFIG_X86_IO_APIC */
190 182
191#define io_apic_assign_pci_irqs 0 183#define io_apic_assign_pci_irqs 0
@@ -199,6 +191,26 @@ static inline int mp_find_ioapic(u32 gsi) { return 0; }
199struct io_apic_irq_attr; 191struct io_apic_irq_attr;
200static inline int io_apic_set_pci_routing(struct device *dev, int irq, 192static inline int io_apic_set_pci_routing(struct device *dev, int irq,
201 struct io_apic_irq_attr *irq_attr) { return 0; } 193 struct io_apic_irq_attr *irq_attr) { return 0; }
194
195static inline struct IO_APIC_route_entry **alloc_ioapic_entries(void)
196{
197 return NULL;
198}
199
200static inline void free_ioapic_entries(struct IO_APIC_route_entry **ent) { }
201static inline int save_IO_APIC_setup(struct IO_APIC_route_entry **ent)
202{
203 return -ENOMEM;
204}
205
206static inline void mask_IO_APIC_setup(struct IO_APIC_route_entry **ent) { }
207static inline int restore_IO_APIC_setup(struct IO_APIC_route_entry **ent)
208{
209 return -ENOMEM;
210}
211
212static inline void mp_save_irq(struct mpc_intsrc *m) { };
213static inline void disable_ioapic_support(void) { }
202#endif 214#endif
203 215
204#endif /* _ASM_X86_IO_APIC_H */ 216#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index 0b7228268a6..615fa9061b5 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -123,10 +123,6 @@ extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask,
123 int vector); 123 int vector);
124extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, 124extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
125 int vector); 125 int vector);
126extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
127 int vector);
128extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
129 int vector);
130 126
131/* Avoid include hell */ 127/* Avoid include hell */
132#define NMI_VECTOR 0x02 128#define NMI_VECTOR 0x02
@@ -150,6 +146,10 @@ static inline void __default_local_send_IPI_all(int vector)
150} 146}
151 147
152#ifdef CONFIG_X86_32 148#ifdef CONFIG_X86_32
149extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
150 int vector);
151extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
152 int vector);
153extern void default_send_IPI_mask_logical(const struct cpumask *mask, 153extern void default_send_IPI_mask_logical(const struct cpumask *mask,
154 int vector); 154 int vector);
155extern void default_send_IPI_allbutself(int vector); 155extern void default_send_IPI_allbutself(int vector);
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index c704b38c57a..ba870bb6dd8 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -10,9 +10,6 @@
10#include <asm/apicdef.h> 10#include <asm/apicdef.h>
11#include <asm/irq_vectors.h> 11#include <asm/irq_vectors.h>
12 12
13/* Even though we don't support this, supply it to appease OF */
14static inline void irq_dispose_mapping(unsigned int virq) { }
15
16static inline int irq_canonicalize(int irq) 13static inline int irq_canonicalize(int irq)
17{ 14{
18 return ((irq == 2) ? 9 : irq); 15 return ((irq == 2) ? 9 : irq);
diff --git a/arch/x86/include/asm/irq_controller.h b/arch/x86/include/asm/irq_controller.h
new file mode 100644
index 00000000000..423bbbddf36
--- /dev/null
+++ b/arch/x86/include/asm/irq_controller.h
@@ -0,0 +1,12 @@
1#ifndef __IRQ_CONTROLLER__
2#define __IRQ_CONTROLLER__
3
4struct irq_domain {
5 int (*xlate)(struct irq_domain *h, const u32 *intspec, u32 intsize,
6 u32 *out_hwirq, u32 *out_type);
7 void *priv;
8 struct device_node *controller;
9 struct list_head l;
10};
11
12#endif
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6af0894dafb..6e976ee3b3e 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -1,6 +1,7 @@
1#ifndef _ASM_X86_IRQ_VECTORS_H 1#ifndef _ASM_X86_IRQ_VECTORS_H
2#define _ASM_X86_IRQ_VECTORS_H 2#define _ASM_X86_IRQ_VECTORS_H
3 3
4#include <linux/threads.h>
4/* 5/*
5 * Linux IRQ vector layout. 6 * Linux IRQ vector layout.
6 * 7 *
@@ -16,8 +17,8 @@
16 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events 17 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events
17 * Vectors 32 ... 127 : device interrupts 18 * Vectors 32 ... 127 : device interrupts
18 * Vector 128 : legacy int80 syscall interface 19 * Vector 128 : legacy int80 syscall interface
19 * Vectors 129 ... 237 : device interrupts 20 * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts
20 * Vectors 238 ... 255 : special interrupts 21 * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
21 * 22 *
22 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. 23 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
23 * 24 *
@@ -96,37 +97,43 @@
96#define THRESHOLD_APIC_VECTOR 0xf9 97#define THRESHOLD_APIC_VECTOR 0xf9
97#define REBOOT_VECTOR 0xf8 98#define REBOOT_VECTOR 0xf8
98 99
99/* f0-f7 used for spreading out TLB flushes: */
100#define INVALIDATE_TLB_VECTOR_END 0xf7
101#define INVALIDATE_TLB_VECTOR_START 0xf0
102#define NUM_INVALIDATE_TLB_VECTORS 8
103
104/*
105 * Local APIC timer IRQ vector is on a different priority level,
106 * to work around the 'lost local interrupt if more than 2 IRQ
107 * sources per level' errata.
108 */
109#define LOCAL_TIMER_VECTOR 0xef
110
111/* 100/*
112 * Generic system vector for platform specific use 101 * Generic system vector for platform specific use
113 */ 102 */
114#define X86_PLATFORM_IPI_VECTOR 0xed 103#define X86_PLATFORM_IPI_VECTOR 0xf7
115 104
116/* 105/*
117 * IRQ work vector: 106 * IRQ work vector:
118 */ 107 */
119#define IRQ_WORK_VECTOR 0xec 108#define IRQ_WORK_VECTOR 0xf6
120 109
121#define UV_BAU_MESSAGE 0xea 110#define UV_BAU_MESSAGE 0xf5
122 111
123/* 112/*
124 * Self IPI vector for machine checks 113 * Self IPI vector for machine checks
125 */ 114 */
126#define MCE_SELF_VECTOR 0xeb 115#define MCE_SELF_VECTOR 0xf4
127 116
128/* Xen vector callback to receive events in a HVM domain */ 117/* Xen vector callback to receive events in a HVM domain */
129#define XEN_HVM_EVTCHN_CALLBACK 0xe9 118#define XEN_HVM_EVTCHN_CALLBACK 0xf3
119
120/*
121 * Local APIC timer IRQ vector is on a different priority level,
122 * to work around the 'lost local interrupt if more than 2 IRQ
123 * sources per level' errata.
124 */
125#define LOCAL_TIMER_VECTOR 0xef
126
127/* up to 32 vectors used for spreading out TLB flushes: */
128#if NR_CPUS <= 32
129# define NUM_INVALIDATE_TLB_VECTORS (NR_CPUS)
130#else
131# define NUM_INVALIDATE_TLB_VECTORS (32)
132#endif
133
134#define INVALIDATE_TLB_VECTOR_END (0xee)
135#define INVALIDATE_TLB_VECTOR_START \
136 (INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1)
130 137
131#define NR_VECTORS 256 138#define NR_VECTORS 256
132 139
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index ca242d35e87..518bbbb9ee5 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -13,7 +13,6 @@ enum die_val {
13 DIE_PANIC, 13 DIE_PANIC,
14 DIE_NMI, 14 DIE_NMI,
15 DIE_DIE, 15 DIE_DIE,
16 DIE_NMIWATCHDOG,
17 DIE_KERNELDEBUG, 16 DIE_KERNELDEBUG,
18 DIE_TRAP, 17 DIE_TRAP,
19 DIE_GPF, 18 DIE_GPF,
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 0c90dd9f050..9c7d95f6174 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -25,7 +25,6 @@ extern int pic_mode;
25#define MAX_IRQ_SOURCES 256 25#define MAX_IRQ_SOURCES 256
26 26
27extern unsigned int def_to_bigsmp; 27extern unsigned int def_to_bigsmp;
28extern u8 apicid_2_node[];
29 28
30#ifdef CONFIG_X86_NUMAQ 29#ifdef CONFIG_X86_NUMAQ
31extern int mp_bus_id_to_node[MAX_MP_BUSSES]; 30extern int mp_bus_id_to_node[MAX_MP_BUSSES];
@@ -33,8 +32,6 @@ extern int mp_bus_id_to_local[MAX_MP_BUSSES];
33extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; 32extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
34#endif 33#endif
35 34
36#define MAX_APICID 256
37
38#else /* CONFIG_X86_64: */ 35#else /* CONFIG_X86_64: */
39 36
40#define MAX_MP_BUSSES 256 37#define MAX_MP_BUSSES 256
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 43a18c77676..823d4822340 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -52,6 +52,9 @@
52#define MSR_IA32_MCG_STATUS 0x0000017a 52#define MSR_IA32_MCG_STATUS 0x0000017a
53#define MSR_IA32_MCG_CTL 0x0000017b 53#define MSR_IA32_MCG_CTL 0x0000017b
54 54
55#define MSR_OFFCORE_RSP_0 0x000001a6
56#define MSR_OFFCORE_RSP_1 0x000001a7
57
55#define MSR_IA32_PEBS_ENABLE 0x000003f1 58#define MSR_IA32_PEBS_ENABLE 0x000003f1
56#define MSR_IA32_DS_AREA 0x00000600 59#define MSR_IA32_DS_AREA 0x00000600
57#define MSR_IA32_PERF_CAPABILITIES 0x00000345 60#define MSR_IA32_PERF_CAPABILITIES 0x00000345
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c76f5b92b84..07f46016d3f 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -7,7 +7,6 @@
7 7
8#ifdef CONFIG_X86_LOCAL_APIC 8#ifdef CONFIG_X86_LOCAL_APIC
9 9
10extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
11extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); 10extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
12extern int reserve_perfctr_nmi(unsigned int); 11extern int reserve_perfctr_nmi(unsigned int);
13extern void release_perfctr_nmi(unsigned int); 12extern void release_perfctr_nmi(unsigned int);
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 27da400d313..3d4dab43c99 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -1,5 +1,57 @@
1#ifndef _ASM_X86_NUMA_H
2#define _ASM_X86_NUMA_H
3
4#include <asm/topology.h>
5#include <asm/apicdef.h>
6
7#ifdef CONFIG_NUMA
8
9#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
10
11/*
12 * __apicid_to_node[] stores the raw mapping between physical apicid and
13 * node and is used to initialize cpu_to_node mapping.
14 *
15 * The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus
16 * should be accessed by the accessors - set_apicid_to_node() and
17 * numa_cpu_node().
18 */
19extern s16 __apicid_to_node[MAX_LOCAL_APIC];
20
21static inline void set_apicid_to_node(int apicid, s16 node)
22{
23 __apicid_to_node[apicid] = node;
24}
25#else /* CONFIG_NUMA */
26static inline void set_apicid_to_node(int apicid, s16 node)
27{
28}
29#endif /* CONFIG_NUMA */
30
1#ifdef CONFIG_X86_32 31#ifdef CONFIG_X86_32
2# include "numa_32.h" 32# include "numa_32.h"
3#else 33#else
4# include "numa_64.h" 34# include "numa_64.h"
5#endif 35#endif
36
37#ifdef CONFIG_NUMA
38extern void __cpuinit numa_set_node(int cpu, int node);
39extern void __cpuinit numa_clear_node(int cpu);
40extern void __init numa_init_array(void);
41extern void __init init_cpu_to_node(void);
42extern void __cpuinit numa_add_cpu(int cpu);
43extern void __cpuinit numa_remove_cpu(int cpu);
44#else /* CONFIG_NUMA */
45static inline void numa_set_node(int cpu, int node) { }
46static inline void numa_clear_node(int cpu) { }
47static inline void numa_init_array(void) { }
48static inline void init_cpu_to_node(void) { }
49static inline void numa_add_cpu(int cpu) { }
50static inline void numa_remove_cpu(int cpu) { }
51#endif /* CONFIG_NUMA */
52
53#ifdef CONFIG_DEBUG_PER_CPU_MAPS
54struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable);
55#endif
56
57#endif /* _ASM_X86_NUMA_H */
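
[Annotation] A sketch of the two-phase usage the comment above describes; the helpers are from the patch, the skeleton around them is illustrative:

	/* phase 1, while parsing SRAT/firmware tables: record the raw
	 * physical-apicid -> node mapping */
	set_apicid_to_node(apicid, node);

	/* phase 2, at CPU bring-up: resolve the node for a logical cpu
	 * (on 32-bit this may be overridden via apic->x86_32_numa_cpu_node) */
	node = numa_cpu_node(cpu);
	if (node != NUMA_NO_NODE)
		numa_set_node(cpu, node);
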
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index b0ef2b449a9..c6beed1ef10 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -4,7 +4,12 @@
4extern int numa_off; 4extern int numa_off;
5 5
6extern int pxm_to_nid(int pxm); 6extern int pxm_to_nid(int pxm);
7extern void numa_remove_cpu(int cpu); 7
8#ifdef CONFIG_NUMA
9extern int __cpuinit numa_cpu_node(int cpu);
10#else /* CONFIG_NUMA */
11static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
12#endif /* CONFIG_NUMA */
8 13
9#ifdef CONFIG_HIGHMEM 14#ifdef CONFIG_HIGHMEM
10extern void set_highmem_pages_init(void); 15extern void set_highmem_pages_init(void);
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 0493be39607..344eb1790b4 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -2,23 +2,16 @@
2#define _ASM_X86_NUMA_64_H 2#define _ASM_X86_NUMA_64_H
3 3
4#include <linux/nodemask.h> 4#include <linux/nodemask.h>
5#include <asm/apicdef.h>
6 5
7struct bootnode { 6struct bootnode {
8 u64 start; 7 u64 start;
9 u64 end; 8 u64 end;
10}; 9};
11 10
12extern int compute_hash_shift(struct bootnode *nodes, int numblks,
13 int *nodeids);
14
15#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) 11#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
16 12
17extern void numa_init_array(void);
18extern int numa_off; 13extern int numa_off;
19 14
20extern s16 apicid_to_node[MAX_LOCAL_APIC];
21
22extern unsigned long numa_free_all_bootmem(void); 15extern unsigned long numa_free_all_bootmem(void);
23extern void setup_node_bootmem(int nodeid, unsigned long start, 16extern void setup_node_bootmem(int nodeid, unsigned long start,
24 unsigned long end); 17 unsigned long end);
@@ -31,11 +24,11 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
31 */ 24 */
32#define NODE_MIN_SIZE (4*1024*1024) 25#define NODE_MIN_SIZE (4*1024*1024)
33 26
34extern void __init init_cpu_to_node(void); 27extern nodemask_t numa_nodes_parsed __initdata;
35extern void __cpuinit numa_set_node(int cpu, int node); 28
36extern void __cpuinit numa_clear_node(int cpu); 29extern int __cpuinit numa_cpu_node(int cpu);
37extern void __cpuinit numa_add_cpu(int cpu); 30extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
38extern void __cpuinit numa_remove_cpu(int cpu); 31extern void __init numa_set_distance(int from, int to, int distance);
39 32
40#ifdef CONFIG_NUMA_EMU 33#ifdef CONFIG_NUMA_EMU
41#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) 34#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
@@ -43,11 +36,7 @@ extern void __cpuinit numa_remove_cpu(int cpu);
43void numa_emu_cmdline(char *); 36void numa_emu_cmdline(char *);
44#endif /* CONFIG_NUMA_EMU */ 37#endif /* CONFIG_NUMA_EMU */
45#else 38#else
46static inline void init_cpu_to_node(void) { } 39static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
47static inline void numa_set_node(int cpu, int node) { }
48static inline void numa_clear_node(int cpu) { }
49static inline void numa_add_cpu(int cpu, int node) { }
50static inline void numa_remove_cpu(int cpu) { }
51#endif 40#endif
52 41
53#endif /* _ASM_X86_NUMA_64_H */ 42#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 641988efe06..c5d3a5abbb9 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -6,7 +6,7 @@
6 6
7#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */ 7#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */
8 8
9#ifdef CONFIG_OLPC_OPENFIRMWARE 9#ifdef CONFIG_OLPC
10 10
11extern bool olpc_ofw_is_installed(void); 11extern bool olpc_ofw_is_installed(void);
12 12
@@ -26,19 +26,15 @@ extern void setup_olpc_ofw_pgd(void);
26/* check if OFW was detected during boot */ 26/* check if OFW was detected during boot */
27extern bool olpc_ofw_present(void); 27extern bool olpc_ofw_present(void);
28 28
29#else /* !CONFIG_OLPC_OPENFIRMWARE */ 29#else /* !CONFIG_OLPC */
30
31static inline bool olpc_ofw_is_installed(void) { return false; }
32static inline void olpc_ofw_detect(void) { } 30static inline void olpc_ofw_detect(void) { }
33static inline void setup_olpc_ofw_pgd(void) { } 31static inline void setup_olpc_ofw_pgd(void) { }
34static inline bool olpc_ofw_present(void) { return false; } 32#endif /* !CONFIG_OLPC */
35
36#endif /* !CONFIG_OLPC_OPENFIRMWARE */
37 33
38#ifdef CONFIG_OLPC_OPENFIRMWARE_DT 34#ifdef CONFIG_OF_PROMTREE
39extern void olpc_dt_build_devicetree(void); 35extern void olpc_dt_build_devicetree(void);
40#else 36#else
41static inline void olpc_dt_build_devicetree(void) { } 37static inline void olpc_dt_build_devicetree(void) { }
42#endif /* CONFIG_OLPC_OPENFIRMWARE_DT */ 38#endif
43 39
44#endif /* _ASM_X86_OLPC_OFW_H */ 40#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 1df66211fd1..bce688d54c1 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_PAGE_DEFS_H 2#define _ASM_X86_PAGE_DEFS_H
3 3
4#include <linux/const.h> 4#include <linux/const.h>
5#include <linux/types.h>
5 6
6/* PAGE_SHIFT determines the page size */ 7/* PAGE_SHIFT determines the page size */
7#define PAGE_SHIFT 12 8#define PAGE_SHIFT 12
@@ -45,11 +46,15 @@ extern int devmem_is_allowed(unsigned long pagenr);
45extern unsigned long max_low_pfn_mapped; 46extern unsigned long max_low_pfn_mapped;
46extern unsigned long max_pfn_mapped; 47extern unsigned long max_pfn_mapped;
47 48
49static inline phys_addr_t get_max_mapped(void)
50{
51 return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
52}
53
48extern unsigned long init_memory_mapping(unsigned long start, 54extern unsigned long init_memory_mapping(unsigned long start,
49 unsigned long end); 55 unsigned long end);
50 56
51extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn, 57extern void initmem_init(void);
52 int acpi, int k8);
53extern void free_initmem(void); 58extern void free_initmem(void);
54 59
55#endif /* !__ASSEMBLY__ */ 60#endif /* !__ASSEMBLY__ */
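
[Annotation] get_max_mapped() is a thin pfn-to-byte-address conversion: with max_pfn_mapped = 0x40000 (1 GiB worth of 4 KiB pages) it returns (phys_addr_t)0x40000 << PAGE_SHIFT == 0x40000000. Callers that previously computed this by hand can use the helper; note that initmem_init() simultaneously drops its pfn-range and acpi/k8 parameters.
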
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 45636cefa18..4c25ab48257 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -94,10 +94,6 @@ struct cpuinfo_x86 {
94 int x86_cache_alignment; /* In bytes */ 94 int x86_cache_alignment; /* In bytes */
95 int x86_power; 95 int x86_power;
96 unsigned long loops_per_jiffy; 96 unsigned long loops_per_jiffy;
97#ifdef CONFIG_SMP
98 /* cpus sharing the last level cache: */
99 cpumask_var_t llc_shared_map;
100#endif
101 /* cpuid returned max cores value: */ 97 /* cpuid returned max cores value: */
102 u16 x86_max_cores; 98 u16 x86_max_cores;
103 u16 apicid; 99 u16 apicid;
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index b4ec95f0751..971e0b46446 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -1 +1,69 @@
1/* dummy prom.h; here to make linux/of.h's #includes happy */ 1/*
2 * Definitions for Device tree / OpenFirmware handling on X86
3 *
4 * based on arch/powerpc/include/asm/prom.h which is
5 * Copyright (C) 1996-2005 Paul Mackerras.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#ifndef _ASM_X86_PROM_H
14#define _ASM_X86_PROM_H
15#ifndef __ASSEMBLY__
16
17#include <linux/of.h>
18#include <linux/types.h>
19#include <linux/pci.h>
20
21#include <asm/irq.h>
22#include <asm/atomic.h>
23#include <asm/setup.h>
24#include <asm/irq_controller.h>
25
26#ifdef CONFIG_OF
27extern int of_ioapic;
28extern u64 initial_dtb;
29extern void add_dtb(u64 data);
30extern void x86_add_irq_domains(void);
31void __cpuinit x86_of_pci_init(void);
32void x86_dtb_init(void);
33
34static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
35{
36 return pdev ? pdev->dev.of_node : NULL;
37}
38
39static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
40{
41 return pci_device_to_OF_node(bus->self);
42}
43
44#else
45static inline void add_dtb(u64 data) { }
46static inline void x86_add_irq_domains(void) { }
47static inline void x86_of_pci_init(void) { }
48static inline void x86_dtb_init(void) { }
49#define of_ioapic 0
50#endif
51
52extern char cmd_line[COMMAND_LINE_SIZE];
53
54#define pci_address_to_pio pci_address_to_pio
55unsigned long pci_address_to_pio(phys_addr_t addr);
56
57/**
58 * irq_dispose_mapping - Unmap an interrupt
59 * @virq: linux virq number of the interrupt to unmap
60 *
61 * FIXME: We really should implement proper virq handling like power,
62 * but that's going to be major surgery.
63 */
64static inline void irq_dispose_mapping(unsigned int virq) { }
65
66#define HAVE_ARCH_DEVTREE_FIXUPS
67
68#endif /* __ASSEMBLY__ */
69#endif
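
[Annotation] A hedged driver-side sketch of the new PCI/OF glue introduced above; the property walk in the body is illustrative:

	struct device_node *dn = pci_device_to_OF_node(pdev);

	if (dn) {
		/* the PCI device has a matching device-tree node; e.g.
		 * fetch board-specific configuration via of_get_property() */
	}
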
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index d1e41b0f9b6..df4cd32b4cc 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -37,26 +37,9 @@
37#endif 37#endif
38 38
39#ifdef __KERNEL__ 39#ifdef __KERNEL__
40
41#include <linux/list.h>
42#include <linux/spinlock.h>
43#include <linux/lockdep.h>
44#include <asm/asm.h> 40#include <asm/asm.h>
45 41
46struct rwsem_waiter;
47
48extern asmregparm struct rw_semaphore *
49 rwsem_down_read_failed(struct rw_semaphore *sem);
50extern asmregparm struct rw_semaphore *
51 rwsem_down_write_failed(struct rw_semaphore *sem);
52extern asmregparm struct rw_semaphore *
53 rwsem_wake(struct rw_semaphore *);
54extern asmregparm struct rw_semaphore *
55 rwsem_downgrade_wake(struct rw_semaphore *sem);
56
57/* 42/*
58 * the semaphore definition
59 *
60 * The bias values and the counter type limits the number of 43 * The bias values and the counter type limits the number of
61 * potential readers/writers to 32767 for 32 bits and 2147483647 44 * potential readers/writers to 32767 for 32 bits and 2147483647
62 * for 64 bits. 45 * for 64 bits.
@@ -74,43 +57,6 @@ extern asmregparm struct rw_semaphore *
74#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS 57#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
75#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) 58#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
76 59
77typedef signed long rwsem_count_t;
78
79struct rw_semaphore {
80 rwsem_count_t count;
81 spinlock_t wait_lock;
82 struct list_head wait_list;
83#ifdef CONFIG_DEBUG_LOCK_ALLOC
84 struct lockdep_map dep_map;
85#endif
86};
87
88#ifdef CONFIG_DEBUG_LOCK_ALLOC
89# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
90#else
91# define __RWSEM_DEP_MAP_INIT(lockname)
92#endif
93
94
95#define __RWSEM_INITIALIZER(name) \
96{ \
97 RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
98 LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \
99}
100
101#define DECLARE_RWSEM(name) \
102 struct rw_semaphore name = __RWSEM_INITIALIZER(name)
103
104extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
105 struct lock_class_key *key);
106
107#define init_rwsem(sem) \
108do { \
109 static struct lock_class_key __key; \
110 \
111 __init_rwsem((sem), #sem, &__key); \
112} while (0)
113
114/* 60/*
115 * lock for reading 61 * lock for reading
116 */ 62 */
@@ -133,7 +79,7 @@ static inline void __down_read(struct rw_semaphore *sem)
133 */ 79 */
134static inline int __down_read_trylock(struct rw_semaphore *sem) 80static inline int __down_read_trylock(struct rw_semaphore *sem)
135{ 81{
136 rwsem_count_t result, tmp; 82 long result, tmp;
137 asm volatile("# beginning __down_read_trylock\n\t" 83 asm volatile("# beginning __down_read_trylock\n\t"
138 " mov %0,%1\n\t" 84 " mov %0,%1\n\t"
139 "1:\n\t" 85 "1:\n\t"
@@ -155,7 +101,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
155 */ 101 */
156static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) 102static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
157{ 103{
158 rwsem_count_t tmp; 104 long tmp;
159 asm volatile("# beginning down_write\n\t" 105 asm volatile("# beginning down_write\n\t"
160 LOCK_PREFIX " xadd %1,(%2)\n\t" 106 LOCK_PREFIX " xadd %1,(%2)\n\t"
161 /* adds 0xffff0001, returns the old value */ 107 /* adds 0xffff0001, returns the old value */
@@ -180,9 +126,8 @@ static inline void __down_write(struct rw_semaphore *sem)
180 */ 126 */
181static inline int __down_write_trylock(struct rw_semaphore *sem) 127static inline int __down_write_trylock(struct rw_semaphore *sem)
182{ 128{
183 rwsem_count_t ret = cmpxchg(&sem->count, 129 long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
184 RWSEM_UNLOCKED_VALUE, 130 RWSEM_ACTIVE_WRITE_BIAS);
185 RWSEM_ACTIVE_WRITE_BIAS);
186 if (ret == RWSEM_UNLOCKED_VALUE) 131 if (ret == RWSEM_UNLOCKED_VALUE)
187 return 1; 132 return 1;
188 return 0; 133 return 0;
@@ -193,7 +138,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
193 */ 138 */
194static inline void __up_read(struct rw_semaphore *sem) 139static inline void __up_read(struct rw_semaphore *sem)
195{ 140{
196 rwsem_count_t tmp; 141 long tmp;
197 asm volatile("# beginning __up_read\n\t" 142 asm volatile("# beginning __up_read\n\t"
198 LOCK_PREFIX " xadd %1,(%2)\n\t" 143 LOCK_PREFIX " xadd %1,(%2)\n\t"
199 /* subtracts 1, returns the old value */ 144 /* subtracts 1, returns the old value */
@@ -211,7 +156,7 @@ static inline void __up_read(struct rw_semaphore *sem)
211 */ 156 */
212static inline void __up_write(struct rw_semaphore *sem) 157static inline void __up_write(struct rw_semaphore *sem)
213{ 158{
214 rwsem_count_t tmp; 159 long tmp;
215 asm volatile("# beginning __up_write\n\t" 160 asm volatile("# beginning __up_write\n\t"
216 LOCK_PREFIX " xadd %1,(%2)\n\t" 161 LOCK_PREFIX " xadd %1,(%2)\n\t"
217 /* subtracts 0xffff0001, returns the old value */ 162 /* subtracts 0xffff0001, returns the old value */
@@ -247,8 +192,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
247/* 192/*
248 * implement atomic add functionality 193 * implement atomic add functionality
249 */ 194 */
250static inline void rwsem_atomic_add(rwsem_count_t delta, 195static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem)
251 struct rw_semaphore *sem)
252{ 196{
253 asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0" 197 asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0"
254 : "+m" (sem->count) 198 : "+m" (sem->count)
@@ -258,10 +202,9 @@ static inline void rwsem_atomic_add(rwsem_count_t delta,
258/* 202/*
259 * implement exchange and add functionality 203 * implement exchange and add functionality
260 */ 204 */
261static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta, 205static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
262 struct rw_semaphore *sem)
263{ 206{
264 rwsem_count_t tmp = delta; 207 long tmp = delta;
265 208
266 asm volatile(LOCK_PREFIX "xadd %0,%1" 209 asm volatile(LOCK_PREFIX "xadd %0,%1"
267 : "+r" (tmp), "+m" (sem->count) 210 : "+r" (tmp), "+m" (sem->count)
@@ -270,10 +213,5 @@ static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta,
270 return tmp + delta; 213 return tmp + delta;
271} 214}
272 215
273static inline int rwsem_is_locked(struct rw_semaphore *sem)
274{
275 return (sem->count != 0);
276}
277
278#endif /* __KERNEL__ */ 216#endif /* __KERNEL__ */
279#endif /* _ASM_X86_RWSEM_H */ 217#endif /* _ASM_X86_RWSEM_H */
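
[Annotation] Everything deleted above (the structure, DECLARE_RWSEM, init_rwsem, the slow-path declarations, rwsem_is_locked) moves to the generic <linux/rwsem.h>; the arch header keeps only the xadd fast paths, now operating on a plain long. For orientation, the assumed shape of the consolidated generic structure, paraphrased rather than quoted from this patch:

	struct rw_semaphore {
		long			count;		/* was rwsem_count_t */
		spinlock_t		wait_lock;
		struct list_head	wait_list;
	#ifdef CONFIG_DEBUG_LOCK_ALLOC
		struct lockdep_map	dep_map;
	#endif
	};
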
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 1f469513677..73b11bc0ae6 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -17,12 +17,24 @@
17#endif 17#endif
18#include <asm/thread_info.h> 18#include <asm/thread_info.h>
19#include <asm/cpumask.h> 19#include <asm/cpumask.h>
20#include <asm/cpufeature.h>
20 21
21extern int smp_num_siblings; 22extern int smp_num_siblings;
22extern unsigned int num_processors; 23extern unsigned int num_processors;
23 24
25static inline bool cpu_has_ht_siblings(void)
26{
27 bool has_siblings = false;
28#ifdef CONFIG_SMP
29 has_siblings = cpu_has_ht && smp_num_siblings > 1;
30#endif
31 return has_siblings;
32}
33
24DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); 34DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
25DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); 35DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
36/* cpus sharing the last level cache: */
37DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
26DECLARE_PER_CPU(u16, cpu_llc_id); 38DECLARE_PER_CPU(u16, cpu_llc_id);
27DECLARE_PER_CPU(int, cpu_number); 39DECLARE_PER_CPU(int, cpu_number);
28 40
@@ -36,8 +48,16 @@ static inline struct cpumask *cpu_core_mask(int cpu)
36 return per_cpu(cpu_core_map, cpu); 48 return per_cpu(cpu_core_map, cpu);
37} 49}
38 50
51static inline struct cpumask *cpu_llc_shared_mask(int cpu)
52{
53 return per_cpu(cpu_llc_shared_map, cpu);
54}
55
39DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); 56DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
40DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); 57DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
58#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
59DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid);
60#endif
41 61
42/* Static state in head.S used to set up a CPU */ 62/* Static state in head.S used to set up a CPU */
43extern unsigned long stack_start; /* Initial stack pointer address */ 63extern unsigned long stack_start; /* Initial stack pointer address */
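
[Annotation] The last-level-cache mask moves from a per-cpuinfo field to a standalone per-cpu cpumask reached through an accessor. A hedged usage sketch; the loop body is illustrative:

	int sibling;

	for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
		/* every cpu in this mask shares the LLC with 'cpu' */
	}
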
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 33ecc3ea878..12569e691ce 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -98,8 +98,6 @@ do { \
98 */ 98 */
99#define HAVE_DISABLE_HLT 99#define HAVE_DISABLE_HLT
100#else 100#else
101#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
102#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
103 101
104/* frame pointer must be last for get_wchan */ 102/* frame pointer must be last for get_wchan */
105#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" 103#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 21899cc31e5..910a7084f7f 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -47,21 +47,6 @@
47 47
48#include <asm/mpspec.h> 48#include <asm/mpspec.h>
49 49
50#ifdef CONFIG_X86_32
51
52/* Mappings between logical cpu number and node number */
53extern int cpu_to_node_map[];
54
55/* Returns the number of the node containing CPU 'cpu' */
56static inline int __cpu_to_node(int cpu)
57{
58 return cpu_to_node_map[cpu];
59}
60#define early_cpu_to_node __cpu_to_node
61#define cpu_to_node __cpu_to_node
62
63#else /* CONFIG_X86_64 */
64
65/* Mappings between logical cpu number and node number */ 50/* Mappings between logical cpu number and node number */
66DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); 51DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
67 52
@@ -84,8 +69,6 @@ static inline int early_cpu_to_node(int cpu)
84 69
85#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 70#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
86 71
87#endif /* CONFIG_X86_64 */
88
89/* Mappings between node number and cpus on that node. */ 72/* Mappings between node number and cpus on that node. */
90extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 73extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
91 74
@@ -155,7 +138,7 @@ extern unsigned long node_remap_size[];
155 .balance_interval = 1, \ 138 .balance_interval = 1, \
156} 139}
157 140
158#ifdef CONFIG_X86_64_ACPI_NUMA 141#ifdef CONFIG_X86_64
159extern int __node_distance(int, int); 142extern int __node_distance(int, int);
160#define node_distance(a, b) __node_distance(a, b) 143#define node_distance(a, b) __node_distance(a, b)
161#endif 144#endif
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index b766a5e8ba0..ffaf183c619 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -346,10 +346,13 @@
346#define __NR_fanotify_init 338 346#define __NR_fanotify_init 338
347#define __NR_fanotify_mark 339 347#define __NR_fanotify_mark 339
348#define __NR_prlimit64 340 348#define __NR_prlimit64 340
349#define __NR_name_to_handle_at 341
350#define __NR_open_by_handle_at 342
351#define __NR_clock_adjtime 343
349 352
350#ifdef __KERNEL__ 353#ifdef __KERNEL__
351 354
352#define NR_syscalls 341 355#define NR_syscalls 344
353 356
354#define __ARCH_WANT_IPC_PARSE_VERSION 357#define __ARCH_WANT_IPC_PARSE_VERSION
355#define __ARCH_WANT_OLD_READDIR 358#define __ARCH_WANT_OLD_READDIR
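
[Annotation] The count is consistent because syscall numbers are zero-based: the highest assigned number is now 343 (__NR_clock_adjtime), so NR_syscalls = 343 + 1 = 344.
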
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 363e9b8a715..5466bea670e 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -669,6 +669,12 @@ __SYSCALL(__NR_fanotify_init, sys_fanotify_init)
669__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) 669__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
670#define __NR_prlimit64 302 670#define __NR_prlimit64 302
671__SYSCALL(__NR_prlimit64, sys_prlimit64) 671__SYSCALL(__NR_prlimit64, sys_prlimit64)
672#define __NR_name_to_handle_at 303
673__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
674#define __NR_open_by_handle_at 304
675__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
676#define __NR_clock_adjtime 305
677__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
672 678
673#ifndef __NO_STUBS 679#ifndef __NO_STUBS
674#define __ARCH_WANT_OLD_READDIR 680#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 64642ad019f..643ebf2e2ad 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -83,11 +83,13 @@ struct x86_init_paging {
83 * boot cpu 83 * boot cpu
84 * @tsc_pre_init: platform function called before TSC init 84 * @tsc_pre_init: platform function called before TSC init
85 * @timer_init: initialize the platform timer (default PIT/HPET) 85 * @timer_init: initialize the platform timer (default PIT/HPET)
86 * @wallclock_init: init the wallclock device
86 */ 87 */
87struct x86_init_timers { 88struct x86_init_timers {
88 void (*setup_percpu_clockev)(void); 89 void (*setup_percpu_clockev)(void);
89 void (*tsc_pre_init)(void); 90 void (*tsc_pre_init)(void);
90 void (*timer_init)(void); 91 void (*timer_init)(void);
92 void (*wallclock_init)(void);
91}; 93};
92 94
93/** 95/**
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index a3c28ae4025..8508bfe5229 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -287,7 +287,7 @@ HYPERVISOR_fpu_taskswitch(int set)
287static inline int 287static inline int
288HYPERVISOR_sched_op(int cmd, void *arg) 288HYPERVISOR_sched_op(int cmd, void *arg)
289{ 289{
290 return _hypercall2(int, sched_op_new, cmd, arg); 290 return _hypercall2(int, sched_op, cmd, arg);
291} 291}
292 292
293static inline long 293static inline long
@@ -422,10 +422,17 @@ HYPERVISOR_set_segment_base(int reg, unsigned long value)
422#endif 422#endif
423 423
424static inline int 424static inline int
425HYPERVISOR_suspend(unsigned long srec) 425HYPERVISOR_suspend(unsigned long start_info_mfn)
426{ 426{
427 return _hypercall3(int, sched_op, SCHEDOP_shutdown, 427 struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
428 SHUTDOWN_suspend, srec); 428
429 /*
430 * For a PV guest the tools require that the start_info mfn be
431 * present in rdx/edx when the hypercall is made. Per the
432 * hypercall calling convention this is the third hypercall
433 * argument, which is start_info_mfn here.
434 */
435 return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn);
429} 436}
430 437
431static inline int 438static inline int
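
[Annotation] For context, the argument struct used by the reworked HYPERVISOR_suspend() has roughly this shape (it comes from the Xen public interface headers, not from this patch):

	struct sched_shutdown {
		unsigned int reason;	/* e.g. SHUTDOWN_suspend */
	};

The old ABI passed the shutdown reason as the raw third hypercall argument; the new one wraps it in the struct, which frees the third register slot so start_info_mfn can be handed to the tools in rdx/edx, as the added comment explains.
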
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index f25bdf238a3..c61934fbf22 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -29,8 +29,10 @@ typedef struct xpaddr {
 
 /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
 #define INVALID_P2M_ENTRY       (~0UL)
-#define FOREIGN_FRAME_BIT       (1UL<<31)
+#define FOREIGN_FRAME_BIT       (1UL<<(BITS_PER_LONG-1))
+#define IDENTITY_FRAME_BIT      (1UL<<(BITS_PER_LONG-2))
 #define FOREIGN_FRAME(m)        ((m) | FOREIGN_FRAME_BIT)
+#define IDENTITY_FRAME(m)       ((m) | IDENTITY_FRAME_BIT)
 
 /* Maximum amount of memory we can handle in a domain in pages */
 #define MAX_DOMAIN_PAGES \
@@ -41,12 +43,18 @@ extern unsigned int machine_to_phys_order;
 
 extern unsigned long get_phys_to_machine(unsigned long pfn);
 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern unsigned long set_phys_range_identity(unsigned long pfn_s,
+                                             unsigned long pfn_e);
 
 extern int m2p_add_override(unsigned long mfn, struct page *page);
 extern int m2p_remove_override(struct page *page);
 extern struct page *m2p_find_override(unsigned long mfn);
 extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
 
+#ifdef CONFIG_XEN_DEBUG_FS
+extern int p2m_dump_show(struct seq_file *m, void *v);
+#endif
 static inline unsigned long pfn_to_mfn(unsigned long pfn)
 {
         unsigned long mfn;
@@ -57,7 +65,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
         mfn = get_phys_to_machine(pfn);
 
         if (mfn != INVALID_P2M_ENTRY)
-                mfn &= ~FOREIGN_FRAME_BIT;
+                mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
 
         return mfn;
 }
@@ -73,25 +81,44 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn)
 static inline unsigned long mfn_to_pfn(unsigned long mfn)
 {
         unsigned long pfn;
+        int ret = 0;
 
         if (xen_feature(XENFEAT_auto_translated_physmap))
                 return mfn;
 
+        if (unlikely((mfn >> machine_to_phys_order) != 0)) {
+                pfn = ~0;
+                goto try_override;
+        }
         pfn = 0;
         /*
          * The array access can fail (e.g., device space beyond end of RAM).
          * In such cases it doesn't matter what we return (we return garbage),
          * but we must handle the fault without crashing!
          */
-        __get_user(pfn, &machine_to_phys_mapping[mfn]);
-
-        /*
-         * If this appears to be a foreign mfn (because the pfn
-         * doesn't map back to the mfn), then check the local override
-         * table to see if there's a better pfn to use.
+        ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
+try_override:
+        /* ret might be < 0 if there are no entries in the m2p for mfn */
+        if (ret < 0)
+                pfn = ~0;
+        else if (get_phys_to_machine(pfn) != mfn)
+                /*
+                 * If this appears to be a foreign mfn (because the pfn
+                 * doesn't map back to the mfn), then check the local override
+                 * table to see if there's a better pfn to use.
+                 *
+                 * m2p_find_override_pfn returns ~0 if it doesn't find anything.
+                 */
+                pfn = m2p_find_override_pfn(mfn, ~0);
+
+        /*
+         * pfn is ~0 if there are no entries in the m2p for mfn or if the
+         * entry doesn't map back to the mfn and m2p_override doesn't have a
+         * valid entry for it.
          */
-        if (get_phys_to_machine(pfn) != mfn)
-                pfn = m2p_find_override_pfn(mfn, pfn);
+        if (pfn == ~0 &&
+            get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn))
+                pfn = mfn;
 
         return pfn;
 }
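The reworked mfn_to_pfn() above is easiest to read as a three-stage lookup: consult the m2p table, fall back to the local m2p override, and finally accept an identity-mapped frame. A condensed sketch of that decision chain, with the machine_to_phys_order bounds check elided (all names as in the hunk):

    /* sketch of the new lookup chain in mfn_to_pfn() */
    static unsigned long mfn_to_pfn_sketch(unsigned long mfn)
    {
            unsigned long pfn = 0;
            int ret;

            ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
            if (ret < 0)
                    pfn = ~0;                       /* no m2p entry at all */
            else if (get_phys_to_machine(pfn) != mfn)
                    pfn = m2p_find_override_pfn(mfn, ~0);   /* foreign page? */

            if (pfn == ~0 && get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn))
                    pfn = mfn;                      /* identity-mapped frame */

            return pfn;
    }
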
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index 2329b3eaf8d..aa862098916 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -27,16 +27,16 @@ static inline void __init xen_setup_pirqs(void)
  * its own functions.
  */
 struct xen_pci_frontend_ops {
-        int (*enable_msi)(struct pci_dev *dev, int **vectors);
+        int (*enable_msi)(struct pci_dev *dev, int vectors[]);
         void (*disable_msi)(struct pci_dev *dev);
-        int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec);
+        int (*enable_msix)(struct pci_dev *dev, int vectors[], int nvec);
         void (*disable_msix)(struct pci_dev *dev);
 };
 
 extern struct xen_pci_frontend_ops *xen_pci_frontend;
 
 static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
-                                              int **vectors)
+                                              int vectors[])
 {
         if (xen_pci_frontend && xen_pci_frontend->enable_msi)
                 return xen_pci_frontend->enable_msi(dev, vectors);
@@ -48,7 +48,7 @@ static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
                 xen_pci_frontend->disable_msi(dev);
 }
 static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
-                                               int **vectors, int nvec)
+                                               int vectors[], int nvec)
 {
         if (xen_pci_frontend && xen_pci_frontend->enable_msix)
                 return xen_pci_frontend->enable_msix(dev, vectors, nvec);
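The signature change from int **vectors to int vectors[] means the frontend now fills a caller-allocated array in place rather than handing back a pointer. A hedged sketch of the calling pattern this enables (the caller and the nvec value here are hypothetical, not from the patch):

    int vectors[2];     /* one slot per requested vector, owned by the caller */

    if (xen_pci_frontend_enable_msix(dev, vectors, 2) == 0) {
            /* vectors[0] and vectors[1] now hold the allocated entries */
    }
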
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 34244b2cd88..62445ba2f8a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -66,9 +66,9 @@ obj-$(CONFIG_PCI)               += early-quirks.o
 apm-y                           := apm_32.o
 obj-$(CONFIG_APM)               += apm.o
 obj-$(CONFIG_SMP)               += smp.o
-obj-$(CONFIG_SMP)               += smpboot.o tsc_sync.o
+obj-$(CONFIG_SMP)               += smpboot.o
+obj-$(CONFIG_SMP)               += tsc_sync.o
 obj-$(CONFIG_SMP)               += setup_percpu.o
-obj-$(CONFIG_X86_64_SMP)        += tsc_sync.o
 obj-$(CONFIG_X86_TRAMPOLINE)    += trampoline_$(BITS).o
 obj-$(CONFIG_X86_MPPARSE)       += mpparse.o
 obj-y                           += apic/
@@ -109,6 +109,7 @@ obj-$(CONFIG_MICROCODE)                 += microcode.o
 obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
 
 obj-$(CONFIG_SWIOTLB)           += pci-swiotlb.o
+obj-$(CONFIG_OF)                += devicetree.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 3e6e2d68f76..9a966c579af 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -595,14 +595,8 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
         nid = acpi_get_node(handle);
         if (nid == -1 || !node_online(nid))
                 return;
-#ifdef CONFIG_X86_64
-        apicid_to_node[physid] = nid;
+        set_apicid_to_node(physid, nid);
         numa_set_node(cpu, nid);
-#else /* CONFIG_X86_32 */
-        apicid_2_node[physid] = nid;
-        cpu_to_node_map[cpu] = nid;
-#endif
-
 #endif
 }
 
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 51d4e166306..1293c709ee8 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -508,64 +508,12 @@ static int apbt_next_event(unsigned long delta,
         return 0;
 }
 
-/*
- * APB timer clock is not in sync with pclk on Langwell, which translates to
- * unreliable read value caused by sampling error. the error does not add up
- * overtime and only happens when sampling a 0 as a 1 by mistake. so the time
- * would go backwards. the following code is trying to prevent time traveling
- * backwards. little bit paranoid.
- */
 static cycle_t apbt_read_clocksource(struct clocksource *cs)
 {
-        unsigned long t0, t1, t2;
-        static unsigned long last_read;
-
-bad_count:
-        t1 = apbt_readl(phy_cs_timer_id,
-                        APBTMR_N_CURRENT_VALUE);
-        t2 = apbt_readl(phy_cs_timer_id,
-                        APBTMR_N_CURRENT_VALUE);
-        if (unlikely(t1 < t2)) {
-                pr_debug("APBT: read current count error %lx:%lx:%lx\n",
-                         t1, t2, t2 - t1);
-                goto bad_count;
-        }
-        /*
-         * check against cached last read, makes sure time does not go back.
-         * it could be a normal rollover but we will do tripple check anyway
-         */
-        if (unlikely(t2 > last_read)) {
-                /* check if we have a normal rollover */
-                unsigned long raw_intr_status =
-                        apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
-                /*
-                 * cs timer interrupt is masked but raw intr bit is set if
-                 * rollover occurs. then we read EOI reg to clear it.
-                 */
-                if (raw_intr_status & (1 << phy_cs_timer_id)) {
-                        apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
-                        goto out;
-                }
-                pr_debug("APB CS going back %lx:%lx:%lx ",
-                         t2, last_read, t2 - last_read);
-bad_count_x3:
-                pr_debug("triple check enforced\n");
-                t0 = apbt_readl(phy_cs_timer_id,
-                                APBTMR_N_CURRENT_VALUE);
-                udelay(1);
-                t1 = apbt_readl(phy_cs_timer_id,
-                                APBTMR_N_CURRENT_VALUE);
-                udelay(1);
-                t2 = apbt_readl(phy_cs_timer_id,
-                                APBTMR_N_CURRENT_VALUE);
-                if ((t2 > t1) || (t1 > t0)) {
-                        printk(KERN_ERR "Error: APB CS tripple check failed\n");
-                        goto bad_count_x3;
-                }
-        }
-out:
-        last_read = t2;
-        return (cycle_t)~t2;
+        unsigned long current_count;
+
+        current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
+        return (cycle_t)~current_count;
 }
 
 static int apbt_clocksource_register(void)
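With the paranoid triple-read dance gone, apbt_read_clocksource() reduces to the standard idiom for a down-counting timer: the clocksource core wants a value that increases with time, so the raw count is bitwise-inverted. A generic sketch of the idiom (read_hw_counter() is a stand-in, not a real kernel API):

    /* sketch: turning a down-counting hardware timer into a monotonic value */
    static cycle_t downcounter_read(struct clocksource *cs)
    {
            unsigned long count = read_hw_counter();  /* hypothetical register read */

            return (cycle_t)~count;   /* hardware counts down; invert so time counts up */
    }
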
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 5955a7800a9..7b1e8e10b89 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -13,7 +13,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/mmzone.h>
 #include <linux/pci_ids.h>
 #include <linux/pci.h>
@@ -57,7 +57,7 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
 static u32 __init allocate_aperture(void)
 {
         u32 aper_size;
-        void *p;
+        unsigned long addr;
 
         /* aper_size should <= 1G */
         if (fallback_aper_order > 5)
@@ -83,27 +83,26 @@ static u32 __init allocate_aperture(void)
          * so don't use 512M below as gart iommu, leave the space for kernel
          * code for safe
          */
-        p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
+        addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20);
+        if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) {
+                printk(KERN_ERR
+                        "Cannot allocate aperture memory hole (%lx,%uK)\n",
+                                addr, aper_size>>10);
+                return 0;
+        }
+        memblock_x86_reserve_range(addr, addr + aper_size, "aperture64");
         /*
          * Kmemleak should not scan this block as it may not be mapped via the
          * kernel direct mapping.
          */
-        kmemleak_ignore(p);
-        if (!p || __pa(p)+aper_size > 0xffffffff) {
-                printk(KERN_ERR
-                        "Cannot allocate aperture memory hole (%p,%uK)\n",
-                        p, aper_size>>10);
-                if (p)
-                        free_bootmem(__pa(p), aper_size);
-                return 0;
-        }
+        kmemleak_ignore(phys_to_virt(addr));
         printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
-                        aper_size >> 10, __pa(p));
-        insert_aperture_resource((u32)__pa(p), aper_size);
-        register_nosave_region((u32)__pa(p) >> PAGE_SHIFT,
-                                (u32)__pa(p+aper_size) >> PAGE_SHIFT);
+                        aper_size >> 10, addr);
+        insert_aperture_resource((u32)addr, aper_size);
+        register_nosave_region(addr >> PAGE_SHIFT,
+                                (addr+aper_size) >> PAGE_SHIFT);
 
-        return (u32)__pa(p);
+        return (u32)addr;
 }
 
 
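The aperture hunk is a straight bootmem-to-memblock conversion and shows the two-step pattern used throughout this series: find a candidate physical range, then reserve it explicitly. Unlike __alloc_bootmem_nopanic(), which allocated and returned a virtual address in one call, memblock hands back a physical address, hence the new phys_to_virt() at the kmemleak call. A condensed sketch of the pattern, assuming the era's memblock API as it appears in the hunk:

    unsigned long addr = memblock_find_in_range(0, 1ULL << 32, size, align);
    if (addr == MEMBLOCK_ERROR)
            return 0;       /* no suitable hole below 4G */
    memblock_x86_reserve_range(addr, addr + size, "label");
    /* addr is a physical address from here on; map it before touching it */
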
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 76b96d74978..966673f4414 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -43,6 +43,7 @@
 #include <asm/i8259.h>
 #include <asm/proto.h>
 #include <asm/apic.h>
+#include <asm/io_apic.h>
 #include <asm/desc.h>
 #include <asm/hpet.h>
 #include <asm/idle.h>
@@ -78,12 +79,21 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
 EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
 
 #ifdef CONFIG_X86_32
+
+/*
+ * On x86_32, the mapping between cpu and logical apicid may vary
+ * depending on apic in use. The following early percpu variable is
+ * used for the mapping. This is where the behaviors of x86_64 and 32
+ * actually diverge. Let's keep it ugly for now.
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
+
 /*
  * Knob to control our willingness to enable the local APIC.
  *
  * +1=force-enable
  */
-static int force_enable_local_apic;
+static int force_enable_local_apic __initdata;
 /*
  * APIC command line parameters
  */
@@ -153,7 +163,7 @@ early_param("nox2apic", setup_nox2apic);
 unsigned long mp_lapic_addr;
 int disable_apic;
 /* Disable local APIC timer from the kernel commandline or via dmi quirk */
-static int disable_apic_timer __cpuinitdata;
+static int disable_apic_timer __initdata;
 /* Local APIC timer works in C2 */
 int local_apic_timer_c2_ok;
 EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -177,29 +187,8 @@ static struct resource lapic_resource = {
 
 static unsigned int calibration_result;
 
-static int lapic_next_event(unsigned long delta,
-                            struct clock_event_device *evt);
-static void lapic_timer_setup(enum clock_event_mode mode,
-                              struct clock_event_device *evt);
-static void lapic_timer_broadcast(const struct cpumask *mask);
 static void apic_pm_activate(void);
 
-/*
- * The local apic timer can be used for any function which is CPU local.
- */
-static struct clock_event_device lapic_clockevent = {
-        .name           = "lapic",
-        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
-                        | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
-        .shift          = 32,
-        .set_mode       = lapic_timer_setup,
-        .set_next_event = lapic_next_event,
-        .broadcast      = lapic_timer_broadcast,
-        .rating         = 100,
-        .irq            = -1,
-};
-static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
-
 static unsigned long apic_phys;
 
 /*
@@ -238,7 +227,7 @@ static int modern_apic(void)
  * right after this call apic become NOOP driven
  * so apic->write/read doesn't do anything
  */
-void apic_disable(void)
+static void __init apic_disable(void)
 {
         pr_info("APIC: switched to apic NOOP\n");
         apic = &apic_noop;
@@ -282,23 +271,6 @@ u64 native_apic_icr_read(void)
         return icr1 | ((u64)icr2 << 32);
 }
 
-/**
- * enable_NMI_through_LVT0 - enable NMI through local vector table 0
- */
-void __cpuinit enable_NMI_through_LVT0(void)
-{
-        unsigned int v;
-
-        /* unmask and set to NMI */
-        v = APIC_DM_NMI;
-
-        /* Level triggered for 82489DX (32bit mode) */
-        if (!lapic_is_integrated())
-                v |= APIC_LVT_LEVEL_TRIGGER;
-
-        apic_write(APIC_LVT0, v);
-}
-
 #ifdef CONFIG_X86_32
 /**
  * get_physical_broadcast - Get number of physical broadcast IDs
@@ -508,6 +480,23 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
 #endif
 }
 
+
+/*
+ * The local apic timer can be used for any function which is CPU local.
+ */
+static struct clock_event_device lapic_clockevent = {
+        .name           = "lapic",
+        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
+                        | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
+        .shift          = 32,
+        .set_mode       = lapic_timer_setup,
+        .set_next_event = lapic_next_event,
+        .broadcast      = lapic_timer_broadcast,
+        .rating         = 100,
+        .irq            = -1,
+};
+static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
+
 /*
  * Setup the local APIC timer for this CPU. Copy the initialized values
  * of the boot CPU and register the clock event in the framework.
@@ -1209,7 +1198,7 @@ void __cpuinit setup_local_APIC(void)
         rdtscll(tsc);
 
         if (disable_apic) {
-                arch_disable_smp_support();
+                disable_ioapic_support();
                 return;
         }
 
@@ -1237,6 +1226,19 @@ void __cpuinit setup_local_APIC(void)
          */
         apic->init_apic_ldr();
 
+#ifdef CONFIG_X86_32
+        /*
+         * APIC LDR is initialized. If logical_apicid mapping was
+         * initialized during get_smp_config(), make sure it matches the
+         * actual value.
+         */
+        i = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+        WARN_ON(i != BAD_APICID && i != logical_smp_processor_id());
+        /* always use the value from LDR */
+        early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
+                logical_smp_processor_id();
+#endif
+
         /*
          * Set Task Priority to 'accept all'. We never change this
          * later on.
@@ -1448,7 +1450,7 @@ int __init enable_IR(void)
 void __init enable_IR_x2apic(void)
 {
         unsigned long flags;
-        struct IO_APIC_route_entry **ioapic_entries = NULL;
+        struct IO_APIC_route_entry **ioapic_entries;
         int ret, x2apic_enabled = 0;
         int dmar_table_init_ret;
 
@@ -1537,7 +1539,7 @@ static int __init detect_init_APIC(void)
 }
 #else
 
-static int apic_verify(void)
+static int __init apic_verify(void)
 {
         u32 features, h, l;
 
@@ -1562,7 +1564,7 @@ static int apic_verify(void)
         return 0;
 }
 
-int apic_force_enable(void)
+int __init apic_force_enable(unsigned long addr)
 {
         u32 h, l;
 
@@ -1578,7 +1580,7 @@ int apic_force_enable(void)
         if (!(l & MSR_IA32_APICBASE_ENABLE)) {
                 pr_info("Local APIC disabled by BIOS -- reenabling.\n");
                 l &= ~MSR_IA32_APICBASE_BASE;
-                l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
+                l |= MSR_IA32_APICBASE_ENABLE | addr;
                 wrmsr(MSR_IA32_APICBASE, l, h);
                 enabled_via_apicbase = 1;
         }
@@ -1619,7 +1621,7 @@ static int __init detect_init_APIC(void)
                                 "you can enable it with \"lapic\"\n");
                         return -1;
                 }
-                if (apic_force_enable())
+                if (apic_force_enable(APIC_DEFAULT_PHYS_BASE))
                         return -1;
         } else {
                 if (apic_verify())
@@ -1930,17 +1932,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
 {
         int cpu;
 
-        /*
-         * Validate version
-         */
-        if (version == 0x0) {
-                pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
-                           "fixing up to 0x10. (tell your hw vendor)\n",
-                                version);
-                version = 0x10;
-        }
-        apic_version[apicid] = version;
-
         if (num_processors >= nr_cpu_ids) {
                 int max = nr_cpu_ids;
                 int thiscpu = max + disabled_cpus;
@@ -1954,22 +1945,34 @@ void __cpuinit generic_processor_info(int apicid, int version)
         }
 
         num_processors++;
-        cpu = cpumask_next_zero(-1, cpu_present_mask);
-
-        if (version != apic_version[boot_cpu_physical_apicid])
-                WARN_ONCE(1,
-                        "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n",
-                        apic_version[boot_cpu_physical_apicid], cpu, version);
-
-        physid_set(apicid, phys_cpu_present_map);
         if (apicid == boot_cpu_physical_apicid) {
                 /*
                  * x86_bios_cpu_apicid is required to have processors listed
                  * in same order as logical cpu numbers. Hence the first
                  * entry is BSP, and so on.
+                 * boot_cpu_init() already hold bit 0 in cpu_present_mask
+                 * for BSP.
                  */
                 cpu = 0;
+        } else
+                cpu = cpumask_next_zero(-1, cpu_present_mask);
+
+        /*
+         * Validate version
+         */
+        if (version == 0x0) {
+                pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
+                           cpu, apicid);
+                version = 0x10;
         }
+        apic_version[apicid] = version;
+
+        if (version != apic_version[boot_cpu_physical_apicid]) {
+                pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
+                           apic_version[boot_cpu_physical_apicid], cpu, version);
+        }
+
+        physid_set(apicid, phys_cpu_present_map);
         if (apicid > max_physical_apicid)
                 max_physical_apicid = apicid;
 
@@ -1977,7 +1980,10 @@ void __cpuinit generic_processor_info(int apicid, int version)
         early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
         early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
 #endif
-
+#ifdef CONFIG_X86_32
+        early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
+                apic->x86_32_early_logical_apicid(cpu);
+#endif
         set_cpu_possible(cpu, true);
         set_cpu_present(cpu, true);
 }
@@ -1998,10 +2004,14 @@ void default_init_apic_ldr(void)
 }
 
 #ifdef CONFIG_X86_32
-int default_apicid_to_node(int logical_apicid)
+int default_x86_32_numa_cpu_node(int cpu)
 {
-#ifdef CONFIG_SMP
-        return apicid_2_node[hard_smp_processor_id()];
+#ifdef CONFIG_NUMA
+        int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+
+        if (apicid != BAD_APICID)
+                return __apicid_to_node[apicid];
+        return NUMA_NO_NODE;
 #else
         return 0;
 #endif
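A recurring element in the apic.c changes is the new early percpu variable x86_cpu_to_logical_apicid: it is seeded in generic_processor_info() with the apic driver's early guess and overwritten with the authoritative LDR-derived value in setup_local_APIC(). A sketch of the consumer side, assuming the early_per_cpu() accessor behaves as in the other uses in this file:

    int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);

    if (logical_apicid == BAD_APICID) {
            /* mapping not established yet for this cpu */
    }
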
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 09d3b17ce0c..5652d31fe10 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -185,8 +185,6 @@ struct apic apic_flat = {
         .ioapic_phys_id_map             = NULL,
         .setup_apic_routing             = NULL,
         .multi_timer_check              = NULL,
-        .apicid_to_node                 = NULL,
-        .cpu_to_logical_apicid          = NULL,
         .cpu_present_to_apicid          = default_cpu_present_to_apicid,
         .apicid_to_cpu_present          = NULL,
         .setup_portio_remap             = NULL,
@@ -337,8 +335,6 @@ struct apic apic_physflat = {
         .ioapic_phys_id_map             = NULL,
         .setup_apic_routing             = NULL,
         .multi_timer_check              = NULL,
-        .apicid_to_node                 = NULL,
-        .cpu_to_logical_apicid          = NULL,
         .cpu_present_to_apicid          = default_cpu_present_to_apicid,
         .apicid_to_cpu_present          = NULL,
         .setup_portio_remap             = NULL,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e31b9ffe25f..f1baa2dc087 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -54,11 +54,6 @@ static u64 noop_apic_icr_read(void)
         return 0;
 }
 
-static int noop_cpu_to_logical_apicid(int cpu)
-{
-        return 0;
-}
-
 static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
 {
         return 0;
@@ -113,12 +108,6 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
         cpumask_set_cpu(cpu, retmask);
 }
 
-int noop_apicid_to_node(int logical_apicid)
-{
-        /* we're always on node 0 */
-        return 0;
-}
-
 static u32 noop_apic_read(u32 reg)
 {
         WARN_ON_ONCE((cpu_has_apic && !disable_apic));
@@ -130,6 +119,14 @@ static void noop_apic_write(u32 reg, u32 v)
         WARN_ON_ONCE(cpu_has_apic && !disable_apic);
 }
 
+#ifdef CONFIG_X86_32
+static int noop_x86_32_numa_cpu_node(int cpu)
+{
+        /* we're always on node 0 */
+        return 0;
+}
+#endif
+
 struct apic apic_noop = {
         .name           = "noop",
         .probe          = noop_probe,
@@ -153,9 +150,7 @@ struct apic apic_noop = {
         .ioapic_phys_id_map             = default_ioapic_phys_id_map,
         .setup_apic_routing             = NULL,
         .multi_timer_check              = NULL,
-        .apicid_to_node                 = noop_apicid_to_node,
 
-        .cpu_to_logical_apicid          = noop_cpu_to_logical_apicid,
         .cpu_present_to_apicid          = default_cpu_present_to_apicid,
         .apicid_to_cpu_present          = physid_set_mask_of_physid,
 
@@ -197,4 +192,9 @@ struct apic apic_noop = {
         .icr_write                      = noop_apic_icr_write,
         .wait_icr_idle                  = noop_apic_wait_icr_idle,
         .safe_wait_icr_idle             = noop_safe_apic_wait_icr_idle,
+
+#ifdef CONFIG_X86_32
+        .x86_32_early_logical_apicid    = noop_x86_32_early_logical_apicid,
+        .x86_32_numa_cpu_node           = noop_x86_32_numa_cpu_node,
+#endif
 };
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index cb804c5091b..541a2e43165 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -45,6 +45,12 @@ static unsigned long bigsmp_check_apicid_present(int bit)
         return 1;
 }
 
+static int bigsmp_early_logical_apicid(int cpu)
+{
+        /* on bigsmp, logical apicid is the same as physical */
+        return early_per_cpu(x86_cpu_to_apicid, cpu);
+}
+
 static inline unsigned long calculate_ldr(int cpu)
 {
         unsigned long val, id;
@@ -80,11 +86,6 @@ static void bigsmp_setup_apic_routing(void)
                 nr_ioapics);
 }
 
-static int bigsmp_apicid_to_node(int logical_apicid)
-{
-        return apicid_2_node[hard_smp_processor_id()];
-}
-
 static int bigsmp_cpu_present_to_apicid(int mps_cpu)
 {
         if (mps_cpu < nr_cpu_ids)
@@ -93,14 +94,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
         return BAD_APICID;
 }
 
-/* Mapping from cpu number to logical apicid */
-static inline int bigsmp_cpu_to_logical_apicid(int cpu)
-{
-        if (cpu >= nr_cpu_ids)
-                return BAD_APICID;
-        return cpu_physical_id(cpu);
-}
-
 static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
 {
         /* For clustered we don't have a good way to do this yet - hack */
@@ -115,7 +108,11 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid)
 /* As we are using single CPU as destination, pick only one CPU here */
 static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
-        return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask));
+        int cpu = cpumask_first(cpumask);
+
+        if (cpu < nr_cpu_ids)
+                return cpu_physical_id(cpu);
+        return BAD_APICID;
 }
 
 static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
@@ -129,9 +126,9 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
          */
         for_each_cpu_and(cpu, cpumask, andmask) {
                 if (cpumask_test_cpu(cpu, cpu_online_mask))
-                        break;
+                        return cpu_physical_id(cpu);
         }
-        return bigsmp_cpu_to_logical_apicid(cpu);
+        return BAD_APICID;
 }
 
 static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -219,8 +216,6 @@ struct apic apic_bigsmp = {
         .ioapic_phys_id_map             = bigsmp_ioapic_phys_id_map,
         .setup_apic_routing             = bigsmp_setup_apic_routing,
         .multi_timer_check              = NULL,
-        .apicid_to_node                 = bigsmp_apicid_to_node,
-        .cpu_to_logical_apicid          = bigsmp_cpu_to_logical_apicid,
         .cpu_present_to_apicid          = bigsmp_cpu_present_to_apicid,
         .apicid_to_cpu_present          = physid_set_mask_of_physid,
         .setup_portio_remap             = NULL,
@@ -256,4 +251,7 @@ struct apic apic_bigsmp = {
         .icr_write                      = native_apic_icr_write,
         .wait_icr_idle                  = native_apic_wait_icr_idle,
         .safe_wait_icr_idle             = native_safe_apic_wait_icr_idle,
+
+        .x86_32_early_logical_apicid    = bigsmp_early_logical_apicid,
+        .x86_32_numa_cpu_node           = default_x86_32_numa_cpu_node,
 };
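The bigsmp_cpu_mask_to_apicid_and() rewrite above returns from inside the loop while the iterator is still valid, rather than breaking and relying on the removed helper's range check afterwards: with for_each_cpu_and(), the iterator is past the last valid cpu when the loop falls through. The general shape of the pattern:

    /* act on the cpu while the iterator is valid; never reuse it after the loop */
    for_each_cpu_and(cpu, cpumask, andmask) {
            if (cpumask_test_cpu(cpu, cpu_online_mask))
                    return cpu_physical_id(cpu);
    }
    return BAD_APICID;      /* loop exhausted without finding an online cpu */
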
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 8593582d802..3e9de4854c5 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -460,6 +460,12 @@ static unsigned long es7000_check_apicid_present(int bit)
         return physid_isset(bit, phys_cpu_present_map);
 }
 
+static int es7000_early_logical_apicid(int cpu)
+{
+        /* on es7000, logical apicid is the same as physical */
+        return early_per_cpu(x86_bios_cpu_apicid, cpu);
+}
+
 static unsigned long calculate_ldr(int cpu)
 {
         unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
@@ -504,12 +510,11 @@ static void es7000_setup_apic_routing(void)
                 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
 }
 
-static int es7000_apicid_to_node(int logical_apicid)
+static int es7000_numa_cpu_node(int cpu)
 {
         return 0;
 }
 
-
 static int es7000_cpu_present_to_apicid(int mps_cpu)
 {
         if (!mps_cpu)
@@ -528,18 +533,6 @@ static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
         ++cpu_id;
 }
 
-/* Mapping from cpu number to logical apicid */
-static int es7000_cpu_to_logical_apicid(int cpu)
-{
-#ifdef CONFIG_SMP
-        if (cpu >= nr_cpu_ids)
-                return BAD_APICID;
-        return cpu_2_logical_apicid[cpu];
-#else
-        return logical_smp_processor_id();
-#endif
-}
-
 static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
 {
         /* For clustered we don't have a good way to do this yet - hack */
@@ -561,7 +554,7 @@ static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
          * The cpus in the mask must all be on the apic cluster.
          */
         for_each_cpu(cpu, cpumask) {
-                int new_apicid = es7000_cpu_to_logical_apicid(cpu);
+                int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
 
                 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
                         WARN(1, "Not a valid mask!");
@@ -578,7 +571,7 @@ static unsigned int
 es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
                               const struct cpumask *andmask)
 {
-        int apicid = es7000_cpu_to_logical_apicid(0);
+        int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
         cpumask_var_t cpumask;
 
         if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
@@ -655,8 +648,6 @@ struct apic __refdata apic_es7000_cluster = {
         .ioapic_phys_id_map             = es7000_ioapic_phys_id_map,
         .setup_apic_routing             = es7000_setup_apic_routing,
         .multi_timer_check              = NULL,
-        .apicid_to_node                 = es7000_apicid_to_node,
-        .cpu_to_logical_apicid          = es7000_cpu_to_logical_apicid,
         .cpu_present_to_apicid          = es7000_cpu_present_to_apicid,
         .apicid_to_cpu_present          = es7000_apicid_to_cpu_present,
         .setup_portio_remap             = NULL,
@@ -695,6 +686,9 @@ struct apic __refdata apic_es7000_cluster = {
         .icr_write                      = native_apic_icr_write,
         .wait_icr_idle                  = native_apic_wait_icr_idle,
         .safe_wait_icr_idle             = native_safe_apic_wait_icr_idle,
+
+        .x86_32_early_logical_apicid    = es7000_early_logical_apicid,
+        .x86_32_numa_cpu_node           = es7000_numa_cpu_node,
 };
 
 struct apic __refdata apic_es7000 = {
@@ -720,8 +714,6 @@ struct apic __refdata apic_es7000 = {
         .ioapic_phys_id_map             = es7000_ioapic_phys_id_map,
         .setup_apic_routing             = es7000_setup_apic_routing,
         .multi_timer_check              = NULL,
-        .apicid_to_node                 = es7000_apicid_to_node,
-        .cpu_to_logical_apicid          = es7000_cpu_to_logical_apicid,
         .cpu_present_to_apicid          = es7000_cpu_present_to_apicid,
         .apicid_to_cpu_present          = es7000_apicid_to_cpu_present,
         .setup_portio_remap             = NULL,
@@ -758,4 +750,7 @@ struct apic __refdata apic_es7000 = {
         .icr_write                      = native_apic_icr_write,
         .wait_icr_idle                  = native_apic_wait_icr_idle,
         .safe_wait_icr_idle             = native_safe_apic_wait_icr_idle,
+
+        .x86_32_early_logical_apicid    = es7000_early_logical_apicid,
+        .x86_32_numa_cpu_node           = es7000_numa_cpu_node,
 };
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 79fd43ca6f9..c4e557a1ebb 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -83,7 +83,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
                 arch_spin_lock(&lock);
                 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
                 show_regs(regs);
-                dump_stack();
                 arch_spin_unlock(&lock);
                 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
                 return NOTIFY_STOP;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ca9e2a3545a..4b5ebd26f56 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -108,7 +108,10 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
 
 int skip_ioapic_setup;
 
-void arch_disable_smp_support(void)
+/**
+ * disable_ioapic_support() - disables ioapic support at runtime
+ */
+void disable_ioapic_support(void)
 {
 #ifdef CONFIG_PCI
         noioapicquirk = 1;
@@ -120,11 +123,14 @@ void arch_disable_smp_support(void)
 static int __init parse_noapic(char *str)
 {
         /* disable IO-APIC */
-        arch_disable_smp_support();
+        disable_ioapic_support();
         return 0;
 }
 early_param("noapic", parse_noapic);
 
+static int io_apic_setup_irq_pin_once(unsigned int irq, int node,
+                                      struct io_apic_irq_attr *attr);
+
 /* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
 void mp_save_irq(struct mpc_intsrc *m)
 {
@@ -181,7 +187,7 @@ int __init arch_early_irq_init(void)
         irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
 
         for (i = 0; i < count; i++) {
-                set_irq_chip_data(i, &cfg[i]);
+                irq_set_chip_data(i, &cfg[i]);
                 zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
                 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
                 /*
@@ -200,7 +206,7 @@ int __init arch_early_irq_init(void)
 #ifdef CONFIG_SPARSE_IRQ
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
-        return get_irq_chip_data(irq);
+        return irq_get_chip_data(irq);
 }
 
 static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
@@ -226,7 +232,7 @@ static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
 {
         if (!cfg)
                 return;
-        set_irq_chip_data(at, NULL);
+        irq_set_chip_data(at, NULL);
         free_cpumask_var(cfg->domain);
         free_cpumask_var(cfg->old_domain);
         kfree(cfg);
@@ -256,14 +262,14 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
         if (res < 0) {
                 if (res != -EEXIST)
                         return NULL;
-                cfg = get_irq_chip_data(at);
+                cfg = irq_get_chip_data(at);
                 if (cfg)
                         return cfg;
         }
 
         cfg = alloc_irq_cfg(at, node);
         if (cfg)
-                set_irq_chip_data(at, cfg);
+                irq_set_chip_data(at, cfg);
         else
                 irq_free_desc(at);
         return cfg;
@@ -818,7 +824,7 @@ static int EISA_ELCR(unsigned int irq)
 #define default_MCA_trigger(idx)        (1)
 #define default_MCA_polarity(idx)       default_ISA_polarity(idx)
 
-static int MPBIOS_polarity(int idx)
+static int irq_polarity(int idx)
 {
         int bus = mp_irqs[idx].srcbus;
         int polarity;
@@ -860,7 +866,7 @@ static int MPBIOS_polarity(int idx)
         return polarity;
 }
 
-static int MPBIOS_trigger(int idx)
+static int irq_trigger(int idx)
 {
         int bus = mp_irqs[idx].srcbus;
         int trigger;
@@ -932,16 +938,6 @@ static int MPBIOS_trigger(int idx)
         return trigger;
 }
 
-static inline int irq_polarity(int idx)
-{
-        return MPBIOS_polarity(idx);
-}
-
-static inline int irq_trigger(int idx)
-{
-        return MPBIOS_trigger(idx);
-}
-
 static int pin_2_irq(int idx, int apic, int pin)
 {
         int irq;
@@ -1189,7 +1185,7 @@ void __setup_vector_irq(int cpu)
         raw_spin_lock(&vector_lock);
         /* Mark the inuse vectors */
         for_each_active_irq(irq) {
-                cfg = get_irq_chip_data(irq);
+                cfg = irq_get_chip_data(irq);
                 if (!cfg)
                         continue;
                 /*
@@ -1220,10 +1216,6 @@ void __setup_vector_irq(int cpu)
 static struct irq_chip ioapic_chip;
 static struct irq_chip ir_ioapic_chip;
 
-#define IOAPIC_AUTO     -1
-#define IOAPIC_EDGE     0
-#define IOAPIC_LEVEL    1
-
 #ifdef CONFIG_X86_32
 static inline int IO_APIC_irq_trigger(int irq)
 {
@@ -1248,35 +1240,31 @@ static inline int IO_APIC_irq_trigger(int irq)
 }
 #endif
 
-static void ioapic_register_intr(unsigned int irq, unsigned long trigger)
+static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
+                                 unsigned long trigger)
 {
+        struct irq_chip *chip = &ioapic_chip;
+        irq_flow_handler_t hdl;
+        bool fasteoi;
 
         if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
-            trigger == IOAPIC_LEVEL)
+            trigger == IOAPIC_LEVEL) {
                 irq_set_status_flags(irq, IRQ_LEVEL);
-        else
+                fasteoi = true;
+        } else {
                 irq_clear_status_flags(irq, IRQ_LEVEL);
+                fasteoi = false;
+        }
 
-        if (irq_remapped(get_irq_chip_data(irq))) {
+        if (irq_remapped(cfg)) {
                 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-                if (trigger)
-                        set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
-                                                      handle_fasteoi_irq,
-                                                      "fasteoi");
-                else
-                        set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
-                                                      handle_edge_irq, "edge");
-                return;
+                chip = &ir_ioapic_chip;
+                fasteoi = trigger != 0;
         }
 
-        if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
-            trigger == IOAPIC_LEVEL)
-                set_irq_chip_and_handler_name(irq, &ioapic_chip,
-                                              handle_fasteoi_irq,
-                                              "fasteoi");
-        else
-                set_irq_chip_and_handler_name(irq, &ioapic_chip,
-                                              handle_edge_irq, "edge");
+        hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
+        irq_set_chip_and_handler_name(irq, chip, hdl,
+                                      fasteoi ? "fasteoi" : "edge");
 }
 
 static int setup_ioapic_entry(int apic_id, int irq,
@@ -1374,7 +1362,7 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
                 return;
         }
 
-        ioapic_register_intr(irq, trigger);
+        ioapic_register_intr(irq, cfg, trigger);
         if (irq < legacy_pic->nr_legacy_irqs)
                 legacy_pic->mask(irq);
 
@@ -1385,33 +1373,26 @@ static struct {
         DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
 } mp_ioapic_routing[MAX_IO_APICS];
 
-static void __init setup_IO_APIC_irqs(void)
+static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
 {
-        int apic_id, pin, idx, irq, notcon = 0;
-        int node = cpu_to_node(0);
-        struct irq_cfg *cfg;
+        if (idx != -1)
+                return false;
 
-        apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+        apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
+                    mp_ioapics[apic_id].apicid, pin);
+        return true;
+}
+
+static void __init __io_apic_setup_irqs(unsigned int apic_id)
+{
+        int idx, node = cpu_to_node(0);
+        struct io_apic_irq_attr attr;
+        unsigned int pin, irq;
 
-        for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
         for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
                 idx = find_irq_entry(apic_id, pin, mp_INT);
-                if (idx == -1) {
-                        if (!notcon) {
-                                notcon = 1;
-                                apic_printk(APIC_VERBOSE,
-                                        KERN_DEBUG " %d-%d",
-                                        mp_ioapics[apic_id].apicid, pin);
-                        } else
-                                apic_printk(APIC_VERBOSE, " %d-%d",
-                                        mp_ioapics[apic_id].apicid, pin);
+                if (io_apic_pin_not_connected(idx, apic_id, pin))
                         continue;
-                }
-                if (notcon) {
-                        apic_printk(APIC_VERBOSE,
-                                " (apicid-pin) not connected\n");
-                        notcon = 0;
-                }
 
                 irq = pin_2_irq(idx, apic_id, pin);
 
@@ -1423,25 +1404,24 @@ static void __init setup_IO_APIC_irqs(void)
                  * installed and if it returns 1:
                  */
                 if (apic->multi_timer_check &&
                    apic->multi_timer_check(apic_id, irq))
                         continue;
 
-                cfg = alloc_irq_and_cfg_at(irq, node);
-                if (!cfg)
-                        continue;
+                set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
+                                     irq_polarity(idx));
 
-                add_pin_to_irq_node(cfg, node, apic_id, pin);
-                /*
-                 * don't mark it in pin_programmed, so later acpi could
-                 * set it correctly when irq < 16
-                 */
-                setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx),
-                                 irq_polarity(idx));
+                io_apic_setup_irq_pin(irq, node, &attr);
         }
+}
 
-        if (notcon)
-                apic_printk(APIC_VERBOSE,
-                        " (apicid-pin) not connected\n");
+static void __init setup_IO_APIC_irqs(void)
+{
+        unsigned int apic_id;
+
+        apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+        for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
+                __io_apic_setup_irqs(apic_id);
 }
 
 /*
@@ -1452,7 +1432,7 @@ static void __init setup_IO_APIC_irqs(void)
 void setup_IO_APIC_irq_extra(u32 gsi)
 {
         int apic_id = 0, pin, idx, irq, node = cpu_to_node(0);
-        struct irq_cfg *cfg;
+        struct io_apic_irq_attr attr;
 
         /*
          * Convert 'gsi' to 'ioapic.pin'.
@@ -1472,21 +1452,10 @@ void setup_IO_APIC_irq_extra(u32 gsi)
         if (apic_id == 0 || irq < NR_IRQS_LEGACY)
                 return;
 
-        cfg = alloc_irq_and_cfg_at(irq, node);
-        if (!cfg)
-                return;
-
-        add_pin_to_irq_node(cfg, node, apic_id, pin);
-
-        if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
-                pr_debug("Pin %d-%d already programmed\n",
-                         mp_ioapics[apic_id].apicid, pin);
-                return;
-        }
-        set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
+        set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
+                             irq_polarity(idx));
 
-        setup_ioapic_irq(apic_id, pin, irq, cfg,
-                         irq_trigger(idx), irq_polarity(idx));
+        io_apic_setup_irq_pin_once(irq, node, &attr);
 }
 
 /*
@@ -1518,7 +1487,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
          * The timer IRQ doesn't have to know that behind the
          * scene we may have a 8259A-master in AEOI mode ...
          */
-        set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
+        irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
+                                      "edge");
 
         /*
          * Add it to the IO-APIC irq-routing table:
@@ -1625,7 +1595,7 @@ __apicdebuginit(void) print_IO_APIC(void)
         for_each_active_irq(irq) {
                 struct irq_pin_list *entry;
 
-                cfg = get_irq_chip_data(irq);
+                cfg = irq_get_chip_data(irq);
                 if (!cfg)
                         continue;
                 entry = cfg->irq_2_pin;
@@ -2391,7 +2361,7 @@ static void irq_complete_move(struct irq_cfg *cfg)
 
 void irq_force_complete_move(int irq)
 {
-        struct irq_cfg *cfg = get_irq_chip_data(irq);
+        struct irq_cfg *cfg = irq_get_chip_data(irq);
 
         if (!cfg)
                 return;
@@ -2405,7 +2375,7 @@ static inline void irq_complete_move(struct irq_cfg *cfg) { }
 static void ack_apic_edge(struct irq_data *data)
 {
         irq_complete_move(data->chip_data);
-        move_native_irq(data->irq);
+        irq_move_irq(data);
         ack_APIC_irq();
 }
 
@@ -2462,7 +2432,7 @@ static void ack_apic_level(struct irq_data *data)
         irq_complete_move(cfg);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
         /* If we are moving the irq we need to mask it */
-        if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+        if (unlikely(irqd_is_setaffinity_pending(data))) {
                 do_unmask_irq = 1;
                 mask_ioapic(cfg);
         }
@@ -2551,7 +2521,7 @@ static void ack_apic_level(struct irq_data *data)
                  * and you can go talk to the chipset vendor about it.
                  */
                 if (!io_apic_level_ack_pending(cfg))
-                        move_masked_irq(irq);
+                        irq_move_masked_irq(data);
                 unmask_ioapic(cfg);
         }
 }
@@ -2614,7 +2584,7 @@ static inline void init_IO_APIC_traps(void)
          * 0x80, because int 0x80 is hm, kind of importantish. ;)
          */
         for_each_active_irq(irq) {
-                cfg = get_irq_chip_data(irq);
+                cfg = irq_get_chip_data(irq);
                 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
                         /*
                          * Hmm.. We don't have an entry for this,
@@ -2625,7 +2595,7 @@ static inline void init_IO_APIC_traps(void)
                                 legacy_pic->make_irq(irq);
                         else
                                 /* Strange. Oh, well.. */
-                                set_irq_chip(irq, &no_irq_chip);
+                                irq_set_chip(irq, &no_irq_chip);
                 }
         }
 }
@@ -2665,7 +2635,7 @@ static struct irq_chip lapic_chip __read_mostly = {
 static void lapic_register_intr(int irq)
 {
         irq_clear_status_flags(irq, IRQ_LEVEL);
-        set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+        irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
                                       "edge");
 }
 
@@ -2749,7 +2719,7 @@ int timer_through_8259 __initdata;
  */
 static inline void __init check_timer(void)
 {
-        struct irq_cfg *cfg = get_irq_chip_data(0);
+        struct irq_cfg *cfg = irq_get_chip_data(0);
         int node = cpu_to_node(0);
         int apic1, pin1, apic2, pin2;
         unsigned long flags;
@@ -3060,7 +3030,7 @@ unsigned int create_irq_nr(unsigned int from, int node)
         raw_spin_unlock_irqrestore(&vector_lock, flags);
 
         if (ret) {
-                set_irq_chip_data(irq, cfg);
+                irq_set_chip_data(irq, cfg);
                 irq_clear_status_flags(irq, IRQ_NOREQUEST);
         } else {
                 free_irq_at(irq, cfg);
@@ -3085,7 +3055,7 @@ int create_irq(void)
 
 void destroy_irq(unsigned int irq)
 {
-        struct irq_cfg *cfg = get_irq_chip_data(irq);
+        struct irq_cfg *cfg = irq_get_chip_data(irq);
         unsigned long flags;
 
         irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
@@ -3119,7 +3089,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
 
         dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
 
-        if (irq_remapped(get_irq_chip_data(irq))) {
+        if (irq_remapped(cfg)) {
                 struct irte irte;
                 int ir_index;
                 u16 sub_handle;
@@ -3291,6 +3261,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
 
 static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 {
+        struct irq_chip *chip = &msi_chip;
         struct msi_msg msg;
         int ret;
 
@@ -3298,14 +3269,15 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
         if (ret < 0)
                 return ret;
 
-        set_irq_msi(irq, msidesc);
+        irq_set_msi_desc(irq, msidesc);
         write_msi_msg(irq, &msg);
 
-        if (irq_remapped(get_irq_chip_data(irq))) {
+        if (irq_remapped(irq_get_chip_data(irq))) {
                 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-                set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
-        } else
-                set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+                chip = &msi_ir_chip;
+        }
+
+        irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
 
         dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
 
@@ -3423,8 +3395,8 @@ int arch_setup_dmar_msi(unsigned int irq)
         if (ret < 0)
                 return ret;
         dmar_msi_write(irq, &msg);
-        set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
-                "edge");
+        irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
+                                      "edge");
         return 0;
 }
 #endif
@@ -3482,6 +3454,7 @@ static struct irq_chip hpet_msi_type = {
 
 int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
 {
+        struct irq_chip *chip = &hpet_msi_type;
         struct msi_msg msg;
         int ret;
 
@@ -3501,15 +3474,12 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
         if (ret < 0)
                 return ret;
 
-        hpet_msi_write(get_irq_data(irq), &msg);
+        hpet_msi_write(irq_get_handler_data(irq), &msg);
         irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-        if (irq_remapped(get_irq_chip_data(irq)))
-                set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
-                                              handle_edge_irq, "edge");
-        else
-                set_irq_chip_and_handler_name(irq, &hpet_msi_type,
-                                              handle_edge_irq, "edge");
+        if (irq_remapped(irq_get_chip_data(irq)))
+                chip = &ir_hpet_msi_type;
 
+        irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
         return 0;
 }
 #endif
@@ -3596,7 +3566,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 
                 write_ht_irq_msg(irq, &msg);
 
-                set_irq_chip_and_handler_name(irq, &ht_irq_chip,
+                irq_set_chip_and_handler_name(irq, &ht_irq_chip,
                                               handle_edge_irq, "edge");
 
                 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
@@ -3605,7 +3575,40 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 }
 #endif /* CONFIG_HT_IRQ */
 
-int __init io_apic_get_redir_entries (int ioapic)
+int
+io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
+{
+        struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
+        int ret;
+
+        if (!cfg)
+                return -EINVAL;
+        ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
+        if (!ret)
+                setup_ioapic_irq(attr->ioapic, attr->ioapic_pin, irq, cfg,
+                                 attr->trigger, attr->polarity);
+        return ret;
+}
+
+static int io_apic_setup_irq_pin_once(unsigned int irq, int node,
+                                      struct io_apic_irq_attr *attr)
+{
+        unsigned int id = attr->ioapic, pin = attr->ioapic_pin;
+        int ret;
+
+        /* Avoid redundant programming */
+        if (test_bit(pin, mp_ioapic_routing[id].pin_programmed)) {
+                pr_debug("Pin %d-%d already programmed\n",
+                         mp_ioapics[id].apicid, pin);
+                return 0;
3604 }
3605 ret = io_apic_setup_irq_pin(irq, node, attr);
3606 if (!ret)
3607 set_bit(pin, mp_ioapic_routing[id].pin_programmed);
3608 return ret;
3609}
3610
3611static int __init io_apic_get_redir_entries(int ioapic)
3609{ 3612{
3610 union IO_APIC_reg_01 reg_01; 3613 union IO_APIC_reg_01 reg_01;
3611 unsigned long flags; 3614 unsigned long flags;
@@ -3659,96 +3662,24 @@ int __init arch_probe_nr_irqs(void)
3659} 3662}
3660#endif 3663#endif
3661 3664
3662static int __io_apic_set_pci_routing(struct device *dev, int irq, 3665int io_apic_set_pci_routing(struct device *dev, int irq,
3663 struct io_apic_irq_attr *irq_attr) 3666 struct io_apic_irq_attr *irq_attr)
3664{ 3667{
3665 struct irq_cfg *cfg;
3666 int node; 3668 int node;
3667 int ioapic, pin;
3668 int trigger, polarity;
3669 3669
3670 ioapic = irq_attr->ioapic;
3671 if (!IO_APIC_IRQ(irq)) { 3670 if (!IO_APIC_IRQ(irq)) {
3672 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", 3671 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
3673 ioapic); 3672 irq_attr->ioapic);
3674 return -EINVAL; 3673 return -EINVAL;
3675 } 3674 }
3676 3675
3677 if (dev) 3676 node = dev ? dev_to_node(dev) : cpu_to_node(0);
3678 node = dev_to_node(dev);
3679 else
3680 node = cpu_to_node(0);
3681
3682 cfg = alloc_irq_and_cfg_at(irq, node);
3683 if (!cfg)
3684 return 0;
3685
3686 pin = irq_attr->ioapic_pin;
3687 trigger = irq_attr->trigger;
3688 polarity = irq_attr->polarity;
3689 3677
3690 /* 3678 return io_apic_setup_irq_pin_once(irq, node, irq_attr);
3691 * IRQs < 16 are already in the irq_2_pin[] map
3692 */
3693 if (irq >= legacy_pic->nr_legacy_irqs) {
3694 if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) {
3695 printk(KERN_INFO "can not add pin %d for irq %d\n",
3696 pin, irq);
3697 return 0;
3698 }
3699 }
3700
3701 setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity);
3702
3703 return 0;
3704} 3679}
3705 3680
3706int io_apic_set_pci_routing(struct device *dev, int irq,
3707 struct io_apic_irq_attr *irq_attr)
3708{
3709 int ioapic, pin;
3710 /*
3711 * Avoid pin reprogramming. PRTs typically include entries
3712 * with redundant pin->gsi mappings (but unique PCI devices);
3713 * we only program the IOAPIC on the first.
3714 */
3715 ioapic = irq_attr->ioapic;
3716 pin = irq_attr->ioapic_pin;
3717 if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
3718 pr_debug("Pin %d-%d already programmed\n",
3719 mp_ioapics[ioapic].apicid, pin);
3720 return 0;
3721 }
3722 set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
3723
3724 return __io_apic_set_pci_routing(dev, irq, irq_attr);
3725}
3726
3727u8 __init io_apic_unique_id(u8 id)
3728{
3729#ifdef CONFIG_X86_32 3681#ifdef CONFIG_X86_32
3730 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && 3682static int __init io_apic_get_unique_id(int ioapic, int apic_id)
3731 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3732 return io_apic_get_unique_id(nr_ioapics, id);
3733 else
3734 return id;
3735#else
3736 int i;
3737 DECLARE_BITMAP(used, 256);
3738
3739 bitmap_zero(used, 256);
3740 for (i = 0; i < nr_ioapics; i++) {
3741 struct mpc_ioapic *ia = &mp_ioapics[i];
3742 __set_bit(ia->apicid, used);
3743 }
3744 if (!test_bit(id, used))
3745 return id;
3746 return find_first_zero_bit(used, 256);
3747#endif
3748}
3749
3750#ifdef CONFIG_X86_32
3751int __init io_apic_get_unique_id(int ioapic, int apic_id)
3752{ 3683{
3753 union IO_APIC_reg_00 reg_00; 3684 union IO_APIC_reg_00 reg_00;
3754 static physid_mask_t apic_id_map = PHYSID_MASK_NONE; 3685 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -3821,9 +3752,33 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3821 3752
3822 return apic_id; 3753 return apic_id;
3823} 3754}
3755
3756static u8 __init io_apic_unique_id(u8 id)
3757{
3758 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
3759 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3760 return io_apic_get_unique_id(nr_ioapics, id);
3761 else
3762 return id;
3763}
3764#else
3765static u8 __init io_apic_unique_id(u8 id)
3766{
3767 int i;
3768 DECLARE_BITMAP(used, 256);
3769
3770 bitmap_zero(used, 256);
3771 for (i = 0; i < nr_ioapics; i++) {
3772 struct mpc_ioapic *ia = &mp_ioapics[i];
3773 __set_bit(ia->apicid, used);
3774 }
3775 if (!test_bit(id, used))
3776 return id;
3777 return find_first_zero_bit(used, 256);
3778}
3824#endif 3779#endif
3825 3780
3826int __init io_apic_get_version(int ioapic) 3781static int __init io_apic_get_version(int ioapic)
3827{ 3782{
3828 union IO_APIC_reg_01 reg_01; 3783 union IO_APIC_reg_01 reg_01;
3829 unsigned long flags; 3784 unsigned long flags;
@@ -3868,8 +3823,8 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
3868void __init setup_ioapic_dest(void) 3823void __init setup_ioapic_dest(void)
3869{ 3824{
3870 int pin, ioapic, irq, irq_entry; 3825 int pin, ioapic, irq, irq_entry;
3871 struct irq_desc *desc;
3872 const struct cpumask *mask; 3826 const struct cpumask *mask;
3827 struct irq_data *idata;
3873 3828
3874 if (skip_ioapic_setup == 1) 3829 if (skip_ioapic_setup == 1)
3875 return; 3830 return;
@@ -3884,21 +3839,20 @@ void __init setup_ioapic_dest(void)
3884 if ((ioapic > 0) && (irq > 16)) 3839 if ((ioapic > 0) && (irq > 16))
3885 continue; 3840 continue;
3886 3841
3887 desc = irq_to_desc(irq); 3842 idata = irq_get_irq_data(irq);
3888 3843
3889 /* 3844 /*
3890 * Honour affinities which have been set in early boot 3845 * Honour affinities which have been set in early boot
3891 */ 3846 */
3892 if (desc->status & 3847 if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
3893 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 3848 mask = idata->affinity;
3894 mask = desc->irq_data.affinity;
3895 else 3849 else
3896 mask = apic->target_cpus(); 3850 mask = apic->target_cpus();
3897 3851
3898 if (intr_remapping_enabled) 3852 if (intr_remapping_enabled)
3899 ir_ioapic_set_affinity(&desc->irq_data, mask, false); 3853 ir_ioapic_set_affinity(idata, mask, false);
3900 else 3854 else
3901 ioapic_set_affinity(&desc->irq_data, mask, false); 3855 ioapic_set_affinity(idata, mask, false);
3902 } 3856 }
3903 3857
3904} 3858}
@@ -4026,7 +3980,7 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
4026 return gsi - mp_gsi_routing[ioapic].gsi_base; 3980 return gsi - mp_gsi_routing[ioapic].gsi_base;
4027} 3981}
4028 3982
4029static int bad_ioapic(unsigned long address) 3983static __init int bad_ioapic(unsigned long address)
4030{ 3984{
4031 if (nr_ioapics >= MAX_IO_APICS) { 3985 if (nr_ioapics >= MAX_IO_APICS) {
4032 printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded " 3986 printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
@@ -4086,20 +4040,16 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4086/* Enable IOAPIC early just for system timer */ 4040/* Enable IOAPIC early just for system timer */
4087void __init pre_init_apic_IRQ0(void) 4041void __init pre_init_apic_IRQ0(void)
4088{ 4042{
4089 struct irq_cfg *cfg; 4043 struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
4090 4044
4091 printk(KERN_INFO "Early APIC setup for system timer0\n"); 4045 printk(KERN_INFO "Early APIC setup for system timer0\n");
4092#ifndef CONFIG_SMP 4046#ifndef CONFIG_SMP
4093 physid_set_mask_of_physid(boot_cpu_physical_apicid, 4047 physid_set_mask_of_physid(boot_cpu_physical_apicid,
4094 &phys_cpu_present_map); 4048 &phys_cpu_present_map);
4095#endif 4049#endif
4096 /* Make sure the irq descriptor is set up */
4097 cfg = alloc_irq_and_cfg_at(0, 0);
4098
4099 setup_local_APIC(); 4050 setup_local_APIC();
4100 4051
4101 add_pin_to_irq_node(cfg, 0, 0, 0); 4052 io_apic_setup_irq_pin(0, 0, &attr);
4102 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); 4053 irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
4103 4054 "edge");
4104 setup_ioapic_irq(0, 0, 0, cfg, 0, 0);
4105} 4055}
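The io_apic.c hunks above are part of the tree-wide genirq rename (set_irq_chip_and_handler_name() and friends become irq_set_*()), plus a small refactor in setup_msi_irq() and arch_setup_hpet_msi(): pick the irq_chip first, then make a single setter call. A minimal userspace sketch of that pattern follows; the types and helpers are stand-ins, not the kernel's API.

/*
 * Select the chip first, call the setter once, instead of duplicating
 * the setter call in both branches.
 */
#include <stdio.h>

struct irq_chip { const char *name; };

static struct irq_chip msi_chip    = { "PCI-MSI" };
static struct irq_chip msi_ir_chip = { "IR-PCI-MSI" };

/* stand-in for irq_set_chip_and_handler_name() */
static void set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
				      const char *name)
{
	printf("irq %u -> chip %s, handler \"%s\"\n", irq, chip->name, name);
}

static void setup_msi_like_irq(unsigned int irq, int remapped)
{
	struct irq_chip *chip = &msi_chip;	/* the common case */

	if (remapped)
		chip = &msi_ir_chip;		/* override when IRQ remapping is on */

	set_chip_and_handler_name(irq, chip, "edge");	/* single call site */
}

int main(void)
{
	setup_msi_like_irq(16, 0);
	setup_msi_like_irq(17, 1);
	return 0;
}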
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 08385e090a6..cce91bf2667 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -56,6 +56,8 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
56 local_irq_restore(flags); 56 local_irq_restore(flags);
57} 57}
58 58
59#ifdef CONFIG_X86_32
60
59void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, 61void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
60 int vector) 62 int vector)
61{ 63{
@@ -71,8 +73,8 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
71 local_irq_save(flags); 73 local_irq_save(flags);
72 for_each_cpu(query_cpu, mask) 74 for_each_cpu(query_cpu, mask)
73 __default_send_IPI_dest_field( 75 __default_send_IPI_dest_field(
74 apic->cpu_to_logical_apicid(query_cpu), vector, 76 early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
75 apic->dest_logical); 77 vector, apic->dest_logical);
76 local_irq_restore(flags); 78 local_irq_restore(flags);
77} 79}
78 80
@@ -90,14 +92,12 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
90 if (query_cpu == this_cpu) 92 if (query_cpu == this_cpu)
91 continue; 93 continue;
92 __default_send_IPI_dest_field( 94 __default_send_IPI_dest_field(
93 apic->cpu_to_logical_apicid(query_cpu), vector, 95 early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
94 apic->dest_logical); 96 vector, apic->dest_logical);
95 } 97 }
96 local_irq_restore(flags); 98 local_irq_restore(flags);
97} 99}
98 100
99#ifdef CONFIG_X86_32
100
101/* 101/*
102 * This is only used on smaller machines. 102 * This is only used on smaller machines.
103 */ 103 */
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 960f26ab5c9..6273eee5134 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -373,13 +373,6 @@ static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask
373 return physids_promote(0xFUL, retmap); 373 return physids_promote(0xFUL, retmap);
374} 374}
375 375
376static inline int numaq_cpu_to_logical_apicid(int cpu)
377{
378 if (cpu >= nr_cpu_ids)
379 return BAD_APICID;
380 return cpu_2_logical_apicid[cpu];
381}
382
383/* 376/*
384 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent 377 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
385 * cpu to APIC ID relation to properly interact with the intelligent 378 * cpu to APIC ID relation to properly interact with the intelligent
@@ -398,6 +391,15 @@ static inline int numaq_apicid_to_node(int logical_apicid)
398 return logical_apicid >> 4; 391 return logical_apicid >> 4;
399} 392}
400 393
394static int numaq_numa_cpu_node(int cpu)
395{
396 int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
397
398 if (logical_apicid != BAD_APICID)
399 return numaq_apicid_to_node(logical_apicid);
400 return NUMA_NO_NODE;
401}
402
401static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) 403static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
402{ 404{
403 int node = numaq_apicid_to_node(logical_apicid); 405 int node = numaq_apicid_to_node(logical_apicid);
@@ -508,8 +510,6 @@ struct apic __refdata apic_numaq = {
508 .ioapic_phys_id_map = numaq_ioapic_phys_id_map, 510 .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
509 .setup_apic_routing = numaq_setup_apic_routing, 511 .setup_apic_routing = numaq_setup_apic_routing,
510 .multi_timer_check = numaq_multi_timer_check, 512 .multi_timer_check = numaq_multi_timer_check,
511 .apicid_to_node = numaq_apicid_to_node,
512 .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid,
513 .cpu_present_to_apicid = numaq_cpu_present_to_apicid, 513 .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
514 .apicid_to_cpu_present = numaq_apicid_to_cpu_present, 514 .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
515 .setup_portio_remap = numaq_setup_portio_remap, 515 .setup_portio_remap = numaq_setup_portio_remap,
@@ -547,4 +547,7 @@ struct apic __refdata apic_numaq = {
547 .icr_write = native_apic_icr_write, 547 .icr_write = native_apic_icr_write,
548 .wait_icr_idle = native_apic_wait_icr_idle, 548 .wait_icr_idle = native_apic_wait_icr_idle,
549 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 549 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
550
551 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
552 .x86_32_numa_cpu_node = numaq_numa_cpu_node,
550}; 553};
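The new numaq_numa_cpu_node() resolves a CPU's node from its early logical APIC ID, falling back to NUMA_NO_NODE when none was recorded; on NUMA-Q the node is the high nibble of the logical ID. A standalone sketch with illustrative table contents (BAD_APICID and NUMA_NO_NODE mirror the kernel's sentinels):

#include <stdio.h>

#define BAD_APICID   0xff
#define NUMA_NO_NODE (-1)
#define NR_CPUS      4

static const int x86_cpu_to_logical_apicid[NR_CPUS] = { 0x10, 0x11, 0x21, BAD_APICID };

static int apicid_to_node(int logical_apicid)
{
	return logical_apicid >> 4;	/* NUMA-Q: high nibble selects the quad */
}

static int numa_cpu_node(int cpu)
{
	int logical_apicid = x86_cpu_to_logical_apicid[cpu];

	if (logical_apicid != BAD_APICID)
		return apicid_to_node(logical_apicid);
	return NUMA_NO_NODE;	/* nothing recorded for this CPU yet */
}

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu %d -> node %d\n", cpu, numa_cpu_node(cpu));
	return 0;
}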
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 99d2fe01608..fc84c7b6110 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -77,6 +77,11 @@ void __init default_setup_apic_routing(void)
77 apic->setup_apic_routing(); 77 apic->setup_apic_routing();
78} 78}
79 79
80static int default_x86_32_early_logical_apicid(int cpu)
81{
82 return 1 << cpu;
83}
84
80static void setup_apic_flat_routing(void) 85static void setup_apic_flat_routing(void)
81{ 86{
82#ifdef CONFIG_X86_IO_APIC 87#ifdef CONFIG_X86_IO_APIC
@@ -130,8 +135,6 @@ struct apic apic_default = {
130 .ioapic_phys_id_map = default_ioapic_phys_id_map, 135 .ioapic_phys_id_map = default_ioapic_phys_id_map,
131 .setup_apic_routing = setup_apic_flat_routing, 136 .setup_apic_routing = setup_apic_flat_routing,
132 .multi_timer_check = NULL, 137 .multi_timer_check = NULL,
133 .apicid_to_node = default_apicid_to_node,
134 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
135 .cpu_present_to_apicid = default_cpu_present_to_apicid, 138 .cpu_present_to_apicid = default_cpu_present_to_apicid,
136 .apicid_to_cpu_present = physid_set_mask_of_physid, 139 .apicid_to_cpu_present = physid_set_mask_of_physid,
137 .setup_portio_remap = NULL, 140 .setup_portio_remap = NULL,
@@ -167,6 +170,9 @@ struct apic apic_default = {
167 .icr_write = native_apic_icr_write, 170 .icr_write = native_apic_icr_write,
168 .wait_icr_idle = native_apic_wait_icr_idle, 171 .wait_icr_idle = native_apic_wait_icr_idle,
169 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 172 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
173
174 .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
175 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
170}; 176};
171 177
172extern struct apic apic_numaq; 178extern struct apic apic_numaq;
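default_x86_32_early_logical_apicid() encodes flat logical destination mode: CPU N owns bit N of the logical destination register, which is why flat mode tops out at eight CPUs. A quick illustration:

#include <stdio.h>

static int early_logical_apicid(int cpu)
{
	return 1 << cpu;	/* flat mode: CPU N -> LDR bit N */
}

int main(void)
{
	for (int cpu = 0; cpu < 8; cpu++)	/* flat mode supports at most 8 CPUs */
		printf("cpu %d -> logical apicid 0x%02x\n",
		       cpu, early_logical_apicid(cpu));
	return 0;
}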
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9b419263d90..e4b8059b414 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -194,11 +194,10 @@ static unsigned long summit_check_apicid_present(int bit)
194 return 1; 194 return 1;
195} 195}
196 196
197static void summit_init_apic_ldr(void) 197static int summit_early_logical_apicid(int cpu)
198{ 198{
199 unsigned long val, id;
200 int count = 0; 199 int count = 0;
201 u8 my_id = (u8)hard_smp_processor_id(); 200 u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu);
202 u8 my_cluster = APIC_CLUSTER(my_id); 201 u8 my_cluster = APIC_CLUSTER(my_id);
203#ifdef CONFIG_SMP 202#ifdef CONFIG_SMP
204 u8 lid; 203 u8 lid;
@@ -206,7 +205,7 @@ static void summit_init_apic_ldr(void)
206 205
207 /* Create logical APIC IDs by counting CPUs already in cluster. */ 206 /* Create logical APIC IDs by counting CPUs already in cluster. */
208 for (count = 0, i = nr_cpu_ids; --i >= 0; ) { 207 for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
209 lid = cpu_2_logical_apicid[i]; 208 lid = early_per_cpu(x86_cpu_to_logical_apicid, i);
210 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) 209 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
211 ++count; 210 ++count;
212 } 211 }
@@ -214,7 +213,15 @@ static void summit_init_apic_ldr(void)
214 /* We only have a 4 wide bitmap in cluster mode. If a deranged 213 /* We only have a 4 wide bitmap in cluster mode. If a deranged
215 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ 214 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
216 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); 215 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
217 id = my_cluster | (1UL << count); 216 return my_cluster | (1UL << count);
217}
218
219static void summit_init_apic_ldr(void)
220{
221 int cpu = smp_processor_id();
222 unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
223 unsigned long val;
224
218 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); 225 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
219 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; 226 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
220 val |= SET_APIC_LOGICAL_ID(id); 227 val |= SET_APIC_LOGICAL_ID(id);
@@ -232,27 +239,6 @@ static void summit_setup_apic_routing(void)
232 nr_ioapics); 239 nr_ioapics);
233} 240}
234 241
235static int summit_apicid_to_node(int logical_apicid)
236{
237#ifdef CONFIG_SMP
238 return apicid_2_node[hard_smp_processor_id()];
239#else
240 return 0;
241#endif
242}
243
244/* Mapping from cpu number to logical apicid */
245static inline int summit_cpu_to_logical_apicid(int cpu)
246{
247#ifdef CONFIG_SMP
248 if (cpu >= nr_cpu_ids)
249 return BAD_APICID;
250 return cpu_2_logical_apicid[cpu];
251#else
252 return logical_smp_processor_id();
253#endif
254}
255
256static int summit_cpu_present_to_apicid(int mps_cpu) 242static int summit_cpu_present_to_apicid(int mps_cpu)
257{ 243{
258 if (mps_cpu < nr_cpu_ids) 244 if (mps_cpu < nr_cpu_ids)
@@ -286,7 +272,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
286 * The cpus in the mask must all be on the apic cluster. 272 * The cpus in the mask must all be on the apic cluster.
287 */ 273 */
288 for_each_cpu(cpu, cpumask) { 274 for_each_cpu(cpu, cpumask) {
289 int new_apicid = summit_cpu_to_logical_apicid(cpu); 275 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
290 276
291 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 277 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
292 printk("%s: Not a valid mask!\n", __func__); 278 printk("%s: Not a valid mask!\n", __func__);
@@ -301,7 +287,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
301static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, 287static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
302 const struct cpumask *andmask) 288 const struct cpumask *andmask)
303{ 289{
304 int apicid = summit_cpu_to_logical_apicid(0); 290 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
305 cpumask_var_t cpumask; 291 cpumask_var_t cpumask;
306 292
307 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 293 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
@@ -528,8 +514,6 @@ struct apic apic_summit = {
528 .ioapic_phys_id_map = summit_ioapic_phys_id_map, 514 .ioapic_phys_id_map = summit_ioapic_phys_id_map,
529 .setup_apic_routing = summit_setup_apic_routing, 515 .setup_apic_routing = summit_setup_apic_routing,
530 .multi_timer_check = NULL, 516 .multi_timer_check = NULL,
531 .apicid_to_node = summit_apicid_to_node,
532 .cpu_to_logical_apicid = summit_cpu_to_logical_apicid,
533 .cpu_present_to_apicid = summit_cpu_present_to_apicid, 517 .cpu_present_to_apicid = summit_cpu_present_to_apicid,
534 .apicid_to_cpu_present = summit_apicid_to_cpu_present, 518 .apicid_to_cpu_present = summit_apicid_to_cpu_present,
535 .setup_portio_remap = NULL, 519 .setup_portio_remap = NULL,
@@ -565,4 +549,7 @@ struct apic apic_summit = {
565 .icr_write = native_apic_icr_write, 549 .icr_write = native_apic_icr_write,
566 .wait_icr_idle = native_apic_wait_icr_idle, 550 .wait_icr_idle = native_apic_wait_icr_idle,
567 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 551 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
552
553 .x86_32_early_logical_apicid = summit_early_logical_apicid,
554 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
568}; 555};
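summit_early_logical_apicid() assigns each CPU the next free bit within its 4-bit APIC cluster by counting the CPUs already placed in that cluster. A standalone sketch of the same logic, with a made-up table and a stand-in APIC_CLUSTER() that mimics the "upper nibble selects the cluster" layout:

#include <assert.h>
#include <stdio.h>

#define BAD_APICID       0xffu
#define APIC_CLUSTER(id) ((id) & 0xf0u)
#define NR_CPUS          4

static const unsigned int logical_apicid[NR_CPUS] = {
	0x11, 0x12, BAD_APICID, BAD_APICID	/* CPUs 0 and 1 already in cluster 0x10 */
};

static unsigned int early_logical_apicid(unsigned int my_cluster)
{
	int count = 0;

	for (int i = 0; i < NR_CPUS; i++) {
		unsigned int lid = logical_apicid[i];
		if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
			count++;
	}
	assert(count < 4);	/* only a 4-wide bitmap per cluster */
	return my_cluster | (1u << count);
}

int main(void)
{
	/* two CPUs taken, so the next ID in cluster 0x10 is 0x14 */
	printf("next id in cluster 0x10: 0x%x\n", early_logical_apicid(0x10));
	return 0;
}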
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index cf69c59f491..90949bbd566 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -206,8 +206,6 @@ struct apic apic_x2apic_cluster = {
206 .ioapic_phys_id_map = NULL, 206 .ioapic_phys_id_map = NULL,
207 .setup_apic_routing = NULL, 207 .setup_apic_routing = NULL,
208 .multi_timer_check = NULL, 208 .multi_timer_check = NULL,
209 .apicid_to_node = NULL,
210 .cpu_to_logical_apicid = NULL,
211 .cpu_present_to_apicid = default_cpu_present_to_apicid, 209 .cpu_present_to_apicid = default_cpu_present_to_apicid,
212 .apicid_to_cpu_present = NULL, 210 .apicid_to_cpu_present = NULL,
213 .setup_portio_remap = NULL, 211 .setup_portio_remap = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 8972f38c5ce..c7e6d6645bf 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -195,8 +195,6 @@ struct apic apic_x2apic_phys = {
195 .ioapic_phys_id_map = NULL, 195 .ioapic_phys_id_map = NULL,
196 .setup_apic_routing = NULL, 196 .setup_apic_routing = NULL,
197 .multi_timer_check = NULL, 197 .multi_timer_check = NULL,
198 .apicid_to_node = NULL,
199 .cpu_to_logical_apicid = NULL,
200 .cpu_present_to_apicid = default_cpu_present_to_apicid, 198 .cpu_present_to_apicid = default_cpu_present_to_apicid,
201 .apicid_to_cpu_present = NULL, 199 .apicid_to_cpu_present = NULL,
202 .setup_portio_remap = NULL, 200 .setup_portio_remap = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index bd16b58b885..3c289281394 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -338,8 +338,6 @@ struct apic __refdata apic_x2apic_uv_x = {
338 .ioapic_phys_id_map = NULL, 338 .ioapic_phys_id_map = NULL,
339 .setup_apic_routing = NULL, 339 .setup_apic_routing = NULL,
340 .multi_timer_check = NULL, 340 .multi_timer_check = NULL,
341 .apicid_to_node = NULL,
342 .cpu_to_logical_apicid = NULL,
343 .cpu_present_to_apicid = default_cpu_present_to_apicid, 341 .cpu_present_to_apicid = default_cpu_present_to_apicid,
344 .apicid_to_cpu_present = NULL, 342 .apicid_to_cpu_present = NULL,
345 .setup_portio_remap = NULL, 343 .setup_portio_remap = NULL,
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index cfa82c899f4..4f13fafc526 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -1,5 +1,70 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6#define COMPILE_OFFSETS
7
8#include <linux/crypto.h>
9#include <linux/sched.h>
10#include <linux/stddef.h>
11#include <linux/hardirq.h>
12#include <linux/suspend.h>
13#include <linux/kbuild.h>
14#include <asm/processor.h>
15#include <asm/thread_info.h>
16#include <asm/sigframe.h>
17#include <asm/bootparam.h>
18#include <asm/suspend.h>
19
20#ifdef CONFIG_XEN
21#include <xen/interface/xen.h>
22#endif
23
1#ifdef CONFIG_X86_32 24#ifdef CONFIG_X86_32
2# include "asm-offsets_32.c" 25# include "asm-offsets_32.c"
3#else 26#else
4# include "asm-offsets_64.c" 27# include "asm-offsets_64.c"
5#endif 28#endif
29
30void common(void) {
31 BLANK();
32 OFFSET(TI_flags, thread_info, flags);
33 OFFSET(TI_status, thread_info, status);
34 OFFSET(TI_addr_limit, thread_info, addr_limit);
35 OFFSET(TI_preempt_count, thread_info, preempt_count);
36
37 BLANK();
38 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
39
40 BLANK();
41 OFFSET(pbe_address, pbe, address);
42 OFFSET(pbe_orig_address, pbe, orig_address);
43 OFFSET(pbe_next, pbe, next);
44
45#ifdef CONFIG_PARAVIRT
46 BLANK();
47 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
48 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
49 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
50 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
51 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
52 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
53 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
54 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
55 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
56#endif
57
58#ifdef CONFIG_XEN
59 BLANK();
60 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
61 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
62#endif
63
64 BLANK();
65 OFFSET(BP_scratch, boot_params, scratch);
66 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
67 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
68 OFFSET(BP_version, boot_params, hdr.version);
69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
70}
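The new common() above relies on the kbuild OFFSET()/DEFINE()/BLANK() machinery: each invocation emits an "->NAME value" marker into the compiler's assembly output, which a kbuild script later turns into #defines usable from asm code. A self-contained sketch of the mechanism, mirroring include/linux/kbuild.h in spirit; build with -S and read the generated .s file rather than linking it:

#include <stddef.h>

/* emit "->SYM value" as a marker line in the asm output */
#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
#define BLANK() asm volatile("\n->" : : )
#define OFFSET(sym, str, mem) \
	DEFINE(sym, offsetof(struct str, mem))

/* a toy structure standing in for the kernel's thread_info */
struct thread_info {
	unsigned long flags;
	int preempt_count;
};

void common(void)
{
	BLANK();
	OFFSET(TI_flags, thread_info, flags);
	OFFSET(TI_preempt_count, thread_info, preempt_count);
}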
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 1a4088dda37..c29d631af6f 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -1,26 +1,4 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed
4 * to extract and format the required data.
5 */
6
7#include <linux/crypto.h>
8#include <linux/sched.h>
9#include <linux/signal.h>
10#include <linux/personality.h>
11#include <linux/suspend.h>
12#include <linux/kbuild.h>
13#include <asm/ucontext.h> 1#include <asm/ucontext.h>
14#include <asm/sigframe.h>
15#include <asm/pgtable.h>
16#include <asm/fixmap.h>
17#include <asm/processor.h>
18#include <asm/thread_info.h>
19#include <asm/bootparam.h>
20#include <asm/elf.h>
21#include <asm/suspend.h>
22
23#include <xen/interface/xen.h>
24 2
25#include <linux/lguest.h> 3#include <linux/lguest.h>
26#include "../../../drivers/lguest/lg.h" 4#include "../../../drivers/lguest/lg.h"
@@ -51,21 +29,10 @@ void foo(void)
51 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); 29 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
52 BLANK(); 30 BLANK();
53 31
54 OFFSET(TI_task, thread_info, task);
55 OFFSET(TI_exec_domain, thread_info, exec_domain);
56 OFFSET(TI_flags, thread_info, flags);
57 OFFSET(TI_status, thread_info, status);
58 OFFSET(TI_preempt_count, thread_info, preempt_count);
59 OFFSET(TI_addr_limit, thread_info, addr_limit);
60 OFFSET(TI_restart_block, thread_info, restart_block);
61 OFFSET(TI_sysenter_return, thread_info, sysenter_return); 32 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
62 OFFSET(TI_cpu, thread_info, cpu); 33 OFFSET(TI_cpu, thread_info, cpu);
63 BLANK(); 34 BLANK();
64 35
65 OFFSET(GDS_size, desc_ptr, size);
66 OFFSET(GDS_address, desc_ptr, address);
67 BLANK();
68
69 OFFSET(PT_EBX, pt_regs, bx); 36 OFFSET(PT_EBX, pt_regs, bx);
70 OFFSET(PT_ECX, pt_regs, cx); 37 OFFSET(PT_ECX, pt_regs, cx);
71 OFFSET(PT_EDX, pt_regs, dx); 38 OFFSET(PT_EDX, pt_regs, dx);
@@ -85,42 +52,13 @@ void foo(void)
85 OFFSET(PT_OLDSS, pt_regs, ss); 52 OFFSET(PT_OLDSS, pt_regs, ss);
86 BLANK(); 53 BLANK();
87 54
88 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
89 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); 55 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
90 BLANK(); 56 BLANK();
91 57
92 OFFSET(pbe_address, pbe, address);
93 OFFSET(pbe_orig_address, pbe, orig_address);
94 OFFSET(pbe_next, pbe, next);
95
96 /* Offset from the sysenter stack to tss.sp0 */ 58 /* Offset from the sysenter stack to tss.sp0 */
97 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - 59 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
98 sizeof(struct tss_struct)); 60 sizeof(struct tss_struct));
99 61
100 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
101 DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
102 DEFINE(THREAD_SIZE_asm, THREAD_SIZE);
103
104 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
105
106#ifdef CONFIG_PARAVIRT
107 BLANK();
108 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
109 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
110 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
111 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
112 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
113 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
114 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
115 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
116#endif
117
118#ifdef CONFIG_XEN
119 BLANK();
120 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
121 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
122#endif
123
124#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 62#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
125 BLANK(); 63 BLANK();
126 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 64 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
@@ -139,11 +77,4 @@ void foo(void)
139 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); 77 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
140 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); 78 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
141#endif 79#endif
142
143 BLANK();
144 OFFSET(BP_scratch, boot_params, scratch);
145 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
146 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
147 OFFSET(BP_version, boot_params, hdr.version);
148 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
149} 80}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 4a6aeedcd96..e72a1194af2 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,27 +1,4 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6#define COMPILE_OFFSETS
7
8#include <linux/crypto.h>
9#include <linux/sched.h>
10#include <linux/stddef.h>
11#include <linux/errno.h>
12#include <linux/hardirq.h>
13#include <linux/suspend.h>
14#include <linux/kbuild.h>
15#include <asm/processor.h>
16#include <asm/segment.h>
17#include <asm/thread_info.h>
18#include <asm/ia32.h> 1#include <asm/ia32.h>
19#include <asm/bootparam.h>
20#include <asm/suspend.h>
21
22#include <xen/interface/xen.h>
23
24#include <asm/sigframe.h>
25 2
26#define __NO_STUBS 1 3#define __NO_STUBS 1
27#undef __SYSCALL 4#undef __SYSCALL
@@ -33,41 +10,19 @@ static char syscalls[] = {
33 10
34int main(void) 11int main(void)
35{ 12{
36#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
37 ENTRY(state);
38 ENTRY(flags);
39 ENTRY(pid);
40 BLANK();
41#undef ENTRY
42#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry))
43 ENTRY(flags);
44 ENTRY(addr_limit);
45 ENTRY(preempt_count);
46 ENTRY(status);
47#ifdef CONFIG_IA32_EMULATION
48 ENTRY(sysenter_return);
49#endif
50 BLANK();
51#undef ENTRY
52#ifdef CONFIG_PARAVIRT 13#ifdef CONFIG_PARAVIRT
53 BLANK();
54 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
55 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
56 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
57 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
58 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
59 OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); 14 OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
60 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
61 OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); 15 OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
62 OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); 16 OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
63 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
64 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); 17 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
65 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); 18 BLANK();
66#endif 19#endif
67 20
68
69#ifdef CONFIG_IA32_EMULATION 21#ifdef CONFIG_IA32_EMULATION
70#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) 22 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
23 BLANK();
24
25#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry)
71 ENTRY(ax); 26 ENTRY(ax);
72 ENTRY(bx); 27 ENTRY(bx);
73 ENTRY(cx); 28 ENTRY(cx);
@@ -79,15 +34,12 @@ int main(void)
79 ENTRY(ip); 34 ENTRY(ip);
80 BLANK(); 35 BLANK();
81#undef ENTRY 36#undef ENTRY
82 DEFINE(IA32_RT_SIGFRAME_sigcontext, 37
83 offsetof (struct rt_sigframe_ia32, uc.uc_mcontext)); 38 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
84 BLANK(); 39 BLANK();
85#endif 40#endif
86 DEFINE(pbe_address, offsetof(struct pbe, address)); 41
87 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); 42#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
88 DEFINE(pbe_next, offsetof(struct pbe, next));
89 BLANK();
90#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry))
91 ENTRY(bx); 43 ENTRY(bx);
92 ENTRY(bx); 44 ENTRY(bx);
93 ENTRY(cx); 45 ENTRY(cx);
@@ -107,7 +59,8 @@ int main(void)
107 ENTRY(flags); 59 ENTRY(flags);
108 BLANK(); 60 BLANK();
109#undef ENTRY 61#undef ENTRY
110#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) 62
63#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry)
111 ENTRY(cr0); 64 ENTRY(cr0);
112 ENTRY(cr2); 65 ENTRY(cr2);
113 ENTRY(cr3); 66 ENTRY(cr3);
@@ -115,26 +68,11 @@ int main(void)
115 ENTRY(cr8); 68 ENTRY(cr8);
116 BLANK(); 69 BLANK();
117#undef ENTRY 70#undef ENTRY
118 DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist));
119 BLANK();
120 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
121 BLANK();
122 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
123 71
72 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
124 BLANK(); 73 BLANK();
125 OFFSET(BP_scratch, boot_params, scratch);
126 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
127 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
128 OFFSET(BP_version, boot_params, hdr.version);
129 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
130 74
131 BLANK(); 75 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
132 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 76
133#ifdef CONFIG_XEN
134 BLANK();
135 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
136 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
137#undef ENTRY
138#endif
139 return 0; 77 return 0;
140} 78}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 58f1b012e1c..3ecece0217e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -233,18 +233,22 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
233} 233}
234#endif 234#endif
235 235
236#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 236#ifdef CONFIG_NUMA
237/*
238 * To workaround broken NUMA config. Read the comment in
239 * srat_detect_node().
240 */
237static int __cpuinit nearby_node(int apicid) 241static int __cpuinit nearby_node(int apicid)
238{ 242{
239 int i, node; 243 int i, node;
240 244
241 for (i = apicid - 1; i >= 0; i--) { 245 for (i = apicid - 1; i >= 0; i--) {
242 node = apicid_to_node[i]; 246 node = __apicid_to_node[i];
243 if (node != NUMA_NO_NODE && node_online(node)) 247 if (node != NUMA_NO_NODE && node_online(node))
244 return node; 248 return node;
245 } 249 }
246 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { 250 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
247 node = apicid_to_node[i]; 251 node = __apicid_to_node[i];
248 if (node != NUMA_NO_NODE && node_online(node)) 252 if (node != NUMA_NO_NODE && node_online(node))
249 return node; 253 return node;
250 } 254 }
@@ -338,31 +342,40 @@ EXPORT_SYMBOL_GPL(amd_get_nb_id);
338 342
339static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 343static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
340{ 344{
341#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 345#ifdef CONFIG_NUMA
342 int cpu = smp_processor_id(); 346 int cpu = smp_processor_id();
343 int node; 347 int node;
344 unsigned apicid = c->apicid; 348 unsigned apicid = c->apicid;
345 349
346 node = per_cpu(cpu_llc_id, cpu); 350 node = numa_cpu_node(cpu);
351 if (node == NUMA_NO_NODE)
352 node = per_cpu(cpu_llc_id, cpu);
347 353
348 if (apicid_to_node[apicid] != NUMA_NO_NODE)
349 node = apicid_to_node[apicid];
350 if (!node_online(node)) { 354 if (!node_online(node)) {
351 /* Two possibilities here: 355 /*
352 - The CPU is missing memory and no node was created. 356 * Two possibilities here:
353 In that case try picking one from a nearby CPU 357 *
354 - The APIC IDs differ from the HyperTransport node IDs 358 * - The CPU is missing memory and no node was created. In
355 which the K8 northbridge parsing fills in. 359 * that case try picking one from a nearby CPU.
356 Assume they are all increased by a constant offset, 360 *
357 but in the same order as the HT nodeids. 361 * - The APIC IDs differ from the HyperTransport node IDs
358 If that doesn't result in a usable node fall back to the 362 * which the K8 northbridge parsing fills in. Assume
359 path for the previous case. */ 363 * they are all increased by a constant offset, but in
360 364 * the same order as the HT nodeids. If that doesn't
365 * result in a usable node fall back to the path for the
366 * previous case.
367 *
368 * This workaround operates directly on the mapping between
369 * APIC ID and NUMA node, assuming certain relationship
370 * between APIC ID, HT node ID and NUMA topology. As going
371 * through CPU mapping may alter the outcome, directly
372 * access __apicid_to_node[].
373 */
361 int ht_nodeid = c->initial_apicid; 374 int ht_nodeid = c->initial_apicid;
362 375
363 if (ht_nodeid >= 0 && 376 if (ht_nodeid >= 0 &&
364 apicid_to_node[ht_nodeid] != NUMA_NO_NODE) 377 __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
365 node = apicid_to_node[ht_nodeid]; 378 node = __apicid_to_node[ht_nodeid];
366 /* Pick a nearby node */ 379 /* Pick a nearby node */
367 if (!node_online(node)) 380 if (!node_online(node))
368 node = nearby_node(apicid); 381 node = nearby_node(apicid);
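nearby_node() scans __apicid_to_node[] downward and then upward from the given APIC ID for the first entry mapping to an online node. A userspace sketch follows; node_online() and the table contents are invented, and the [a ... b] range initializer is a GNU extension, as in kernel sources:

#include <stdio.h>

#define MAX_LOCAL_APIC 16
#define NUMA_NO_NODE   (-1)

static const int apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC - 1] = NUMA_NO_NODE,
	[2] = 0, [10] = 1,
};

static int node_online(int node)
{
	return node == 0 || node == 1;	/* pretend two nodes are up */
}

static int nearby_node(int apicid)
{
	int i, node;

	for (i = apicid - 1; i >= 0; i--) {		/* look below first */
		node = apicid_to_node[i];
		if (node != NUMA_NO_NODE && node_online(node))
			return node;
	}
	for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {	/* then above */
		node = apicid_to_node[i];
		if (node != NUMA_NO_NODE && node_online(node))
			return node;
	}
	return 0;	/* nothing found; the kernel falls back to the first online node */
}

int main(void)
{
	printf("apicid 5 -> node %d\n", nearby_node(5));	/* finds [2] = 0 */
	return 0;
}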
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1d59834396b..e2ced0074a4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -675,7 +675,7 @@ void __init early_cpu_init(void)
675 const struct cpu_dev *const *cdev; 675 const struct cpu_dev *const *cdev;
676 int count = 0; 676 int count = 0;
677 677
678#ifdef PROCESSOR_SELECT 678#ifdef CONFIG_PROCESSOR_SELECT
679 printk(KERN_INFO "KERNEL supported cpus:\n"); 679 printk(KERN_INFO "KERNEL supported cpus:\n");
680#endif 680#endif
681 681
@@ -687,7 +687,7 @@ void __init early_cpu_init(void)
687 cpu_devs[count] = cpudev; 687 cpu_devs[count] = cpudev;
688 count++; 688 count++;
689 689
690#ifdef PROCESSOR_SELECT 690#ifdef CONFIG_PROCESSOR_SELECT
691 { 691 {
692 unsigned int j; 692 unsigned int j;
693 693
@@ -869,7 +869,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
869 869
870 select_idle_routine(c); 870 select_idle_routine(c);
871 871
872#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 872#ifdef CONFIG_NUMA
873 numa_add_cpu(smp_processor_id()); 873 numa_add_cpu(smp_processor_id());
874#endif 874#endif
875} 875}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index d16c2c53d6b..df86bc8c859 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -276,14 +276,13 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
276 276
277static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 277static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
278{ 278{
279#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 279#ifdef CONFIG_NUMA
280 unsigned node; 280 unsigned node;
281 int cpu = smp_processor_id(); 281 int cpu = smp_processor_id();
282 int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
283 282
284 /* Don't do the funky fallback heuristics the AMD version employs 283 /* Don't do the funky fallback heuristics the AMD version employs
285 for now. */ 284 for now. */
286 node = apicid_to_node[apicid]; 285 node = numa_cpu_node(cpu);
287 if (node == NUMA_NO_NODE || !node_online(node)) { 286 if (node == NUMA_NO_NODE || !node_online(node)) {
288 /* reuse the value from init_cpu_to_node() */ 287 /* reuse the value from init_cpu_to_node() */
289 node = cpu_to_node(cpu); 288 node = cpu_to_node(cpu);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 90cc675ac74..1ce1af2899d 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -768,11 +768,11 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
768 struct cpuinfo_x86 *c = &cpu_data(cpu); 768 struct cpuinfo_x86 *c = &cpu_data(cpu);
769 769
770 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { 770 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
771 for_each_cpu(i, c->llc_shared_map) { 771 for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
772 if (!per_cpu(ici_cpuid4_info, i)) 772 if (!per_cpu(ici_cpuid4_info, i))
773 continue; 773 continue;
774 this_leaf = CPUID4_INFO_IDX(i, index); 774 this_leaf = CPUID4_INFO_IDX(i, index);
775 for_each_cpu(sibling, c->llc_shared_map) { 775 for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
776 if (!cpu_online(sibling)) 776 if (!cpu_online(sibling))
777 continue; 777 continue;
778 set_bit(sibling, this_leaf->shared_cpu_map); 778 set_bit(sibling, this_leaf->shared_cpu_map);
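Both this hunk and the mce_amd.c one below switch from the cpuinfo-embedded llc_shared_map to the cpu_llc_shared_mask(cpu) accessor: a per-cpu mask of all CPUs sharing the last-level cache. A sketch of the iteration, with the masks modeled as plain bitfields and invented contents:

#include <stdio.h>

#define NR_CPUS 4

/* pretend CPUs 0/1 and 2/3 each share an L3 */
static const unsigned long llc_shared_mask[NR_CPUS] = { 0x3, 0x3, 0xc, 0xc };

int main(void)
{
	int cpu = 1;

	for (int sibling = 0; sibling < NR_CPUS; sibling++) {
		if (!(llc_shared_mask[cpu] & (1UL << sibling)))
			continue;
		printf("cpu %d shares LLC with cpu %d\n", cpu, sibling);
	}
	return 0;
}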
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 5bf2fac52ac..167f97b5596 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -527,15 +527,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
527 int i, err = 0; 527 int i, err = 0;
528 struct threshold_bank *b = NULL; 528 struct threshold_bank *b = NULL;
529 char name[32]; 529 char name[32];
530#ifdef CONFIG_SMP
531 struct cpuinfo_x86 *c = &cpu_data(cpu);
532#endif
533 530
534 sprintf(name, "threshold_bank%i", bank); 531 sprintf(name, "threshold_bank%i", bank);
535 532
536#ifdef CONFIG_SMP 533#ifdef CONFIG_SMP
537 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 534 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
538 i = cpumask_first(c->llc_shared_map); 535 i = cpumask_first(cpu_llc_shared_mask(cpu));
539 536
540 /* first core not up yet */ 537 /* first core not up yet */
541 if (cpu_data(i).cpu_core_id) 538 if (cpu_data(i).cpu_core_id)
@@ -555,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
555 if (err) 552 if (err)
556 goto out; 553 goto out;
557 554
558 cpumask_copy(b->cpus, c->llc_shared_map); 555 cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu));
559 per_cpu(threshold_banks, cpu)[bank] = b; 556 per_cpu(threshold_banks, cpu)[bank] = b;
560 557
561 goto out; 558 goto out;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9d977a2ea69..26604188aa4 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -30,6 +30,7 @@
30#include <asm/stacktrace.h> 30#include <asm/stacktrace.h>
31#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h> 32#include <asm/compat.h>
33#include <asm/smp.h>
33 34
34#if 0 35#if 0
35#undef wrmsrl 36#undef wrmsrl
@@ -93,6 +94,8 @@ struct amd_nb {
93 struct event_constraint event_constraints[X86_PMC_IDX_MAX]; 94 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
94}; 95};
95 96
97struct intel_percore;
98
96#define MAX_LBR_ENTRIES 16 99#define MAX_LBR_ENTRIES 16
97 100
98struct cpu_hw_events { 101struct cpu_hw_events {
@@ -128,6 +131,13 @@ struct cpu_hw_events {
128 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; 131 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
129 132
130 /* 133 /*
134 * Intel percore register state.
135 * Coordinate shared resources between HT threads.
136 */
137 int percore_used; /* Used by this CPU? */
138 struct intel_percore *per_core;
139
140 /*
131 * AMD specific bits 141 * AMD specific bits
132 */ 142 */
133 struct amd_nb *amd_nb; 143 struct amd_nb *amd_nb;
@@ -166,8 +176,10 @@ struct cpu_hw_events {
166/* 176/*
167 * Constraint on the Event code + UMask 177 * Constraint on the Event code + UMask
168 */ 178 */
169#define PEBS_EVENT_CONSTRAINT(c, n) \ 179#define INTEL_UEVENT_CONSTRAINT(c, n) \
170 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) 180 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
181#define PEBS_EVENT_CONSTRAINT(c, n) \
182 INTEL_UEVENT_CONSTRAINT(c, n)
171 183
172#define EVENT_CONSTRAINT_END \ 184#define EVENT_CONSTRAINT_END \
173 EVENT_CONSTRAINT(0, 0, 0) 185 EVENT_CONSTRAINT(0, 0, 0)
@@ -175,6 +187,28 @@ struct cpu_hw_events {
175#define for_each_event_constraint(e, c) \ 187#define for_each_event_constraint(e, c) \
176 for ((e) = (c); (e)->weight; (e)++) 188 for ((e) = (c); (e)->weight; (e)++)
177 189
190/*
191 * Extra registers for specific events.
192 * Some events need large masks and require external MSRs.
193 * Define a mapping to these extra registers.
194 */
195struct extra_reg {
196 unsigned int event;
197 unsigned int msr;
198 u64 config_mask;
199 u64 valid_mask;
200};
201
202#define EVENT_EXTRA_REG(e, ms, m, vm) { \
203 .event = (e), \
204 .msr = (ms), \
205 .config_mask = (m), \
206 .valid_mask = (vm), \
207 }
208#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \
209 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
210#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
211
178union perf_capabilities { 212union perf_capabilities {
179 struct { 213 struct {
180 u64 lbr_format : 6; 214 u64 lbr_format : 6;
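The extra_reg table introduced here lets an event demand a companion MSR: entries match on the event-select bits of the raw config, and the user-supplied config1 is validated against valid_mask before it may be written. A standalone sketch of the lookup; the event code 0xb7 and MSR 0x1a6 follow the OFFCORE_RESPONSE-style layout, but treat all values here as placeholders:

#include <stdint.h>
#include <stdio.h>

#define EVENTSEL_EVENT_MASK 0xffull	/* stand-in for ARCH_PERFMON_EVENTSEL_EVENT */

struct extra_reg {
	unsigned int event;
	unsigned int msr;
	uint64_t config_mask;
	uint64_t valid_mask;
};

static const struct extra_reg extra_regs[] = {
	{ .event = 0xb7, .msr = 0x1a6,
	  .config_mask = EVENTSEL_EVENT_MASK, .valid_mask = 0xffffull },
	{ 0 },				/* table terminator, like EVENT_EXTRA_END */
};

static int lookup_extra_reg(uint64_t config, uint64_t config1,
			    unsigned int *msr, uint64_t *extra)
{
	for (const struct extra_reg *er = extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (config1 & ~er->valid_mask)
			return -1;	/* bits outside what the MSR accepts */
		*msr = er->msr;
		*extra = config1;
		return 0;
	}
	return 0;			/* no extra register needed */
}

int main(void)
{
	unsigned int msr = 0;
	uint64_t extra = 0;

	if (lookup_extra_reg(0xb7, 0x00ff, &msr, &extra) == 0 && msr)
		printf("program MSR 0x%x with 0x%llx before enabling\n",
		       msr, (unsigned long long)extra);
	return 0;
}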
@@ -219,6 +253,7 @@ struct x86_pmu {
219 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 253 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
220 struct perf_event *event); 254 struct perf_event *event);
221 struct event_constraint *event_constraints; 255 struct event_constraint *event_constraints;
256 struct event_constraint *percore_constraints;
222 void (*quirks)(void); 257 void (*quirks)(void);
223 int perfctr_second_write; 258 int perfctr_second_write;
224 259
@@ -247,6 +282,11 @@ struct x86_pmu {
247 */ 282 */
248 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ 283 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
249 int lbr_nr; /* hardware stack size */ 284 int lbr_nr; /* hardware stack size */
285
286 /*
287 * Extra registers for events
288 */
289 struct extra_reg *extra_regs;
250}; 290};
251 291
252static struct x86_pmu x86_pmu __read_mostly; 292static struct x86_pmu x86_pmu __read_mostly;
@@ -271,6 +311,10 @@ static u64 __read_mostly hw_cache_event_ids
271 [PERF_COUNT_HW_CACHE_MAX] 311 [PERF_COUNT_HW_CACHE_MAX]
272 [PERF_COUNT_HW_CACHE_OP_MAX] 312 [PERF_COUNT_HW_CACHE_OP_MAX]
273 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 313 [PERF_COUNT_HW_CACHE_RESULT_MAX];
314static u64 __read_mostly hw_cache_extra_regs
315 [PERF_COUNT_HW_CACHE_MAX]
316 [PERF_COUNT_HW_CACHE_OP_MAX]
317 [PERF_COUNT_HW_CACHE_RESULT_MAX];
274 318
275/* 319/*
276 * Propagate event elapsed time into the generic event. 320 * Propagate event elapsed time into the generic event.
@@ -298,7 +342,7 @@ x86_perf_event_update(struct perf_event *event)
298 */ 342 */
299again: 343again:
300 prev_raw_count = local64_read(&hwc->prev_count); 344 prev_raw_count = local64_read(&hwc->prev_count);
301 rdmsrl(hwc->event_base + idx, new_raw_count); 345 rdmsrl(hwc->event_base, new_raw_count);
302 346
303 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, 347 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
304 new_raw_count) != prev_raw_count) 348 new_raw_count) != prev_raw_count)
@@ -321,6 +365,49 @@ again:
321 return new_raw_count; 365 return new_raw_count;
322} 366}
323 367
368/* using X86_FEATURE_PERFCTR_CORE to later implement ALTERNATIVE() here */
369static inline int x86_pmu_addr_offset(int index)
370{
371 if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
372 return index << 1;
373 return index;
374}
375
376static inline unsigned int x86_pmu_config_addr(int index)
377{
378 return x86_pmu.eventsel + x86_pmu_addr_offset(index);
379}
380
381static inline unsigned int x86_pmu_event_addr(int index)
382{
383 return x86_pmu.perfctr + x86_pmu_addr_offset(index);
384}
385
386/*
387 * Find and validate any extra registers to set up.
388 */
389static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
390{
391 struct extra_reg *er;
392
393 event->hw.extra_reg = 0;
394 event->hw.extra_config = 0;
395
396 if (!x86_pmu.extra_regs)
397 return 0;
398
399 for (er = x86_pmu.extra_regs; er->msr; er++) {
400 if (er->event != (config & er->config_mask))
401 continue;
402 if (event->attr.config1 & ~er->valid_mask)
403 return -EINVAL;
404 event->hw.extra_reg = er->msr;
405 event->hw.extra_config = event->attr.config1;
406 break;
407 }
408 return 0;
409}
410
324static atomic_t active_events; 411static atomic_t active_events;
325static DEFINE_MUTEX(pmc_reserve_mutex); 412static DEFINE_MUTEX(pmc_reserve_mutex);
326 413
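The x86_pmu_config_addr()/x86_pmu_event_addr() helpers added above hide the MSR layout difference introduced by X86_FEATURE_PERFCTR_CORE: the extended AMD core counters interleave eventsel/counter MSR pairs, so the per-index offset doubles, while the legacy layout (0xc0010000 eventsels, 0xc0010004 counters) uses the index directly. A sketch under that assumption, using the documented family-15h MSR bases:

#include <stdio.h>

static int have_perfctr_core;		/* stand-in for boot_cpu_has(X86_FEATURE_PERFCTR_CORE) */
static unsigned int eventsel_base;
static unsigned int perfctr_base;

static int addr_offset(int index)
{
	/* interleaved layout: sel0, ctr0, sel1, ctr1, ... */
	return have_perfctr_core ? index << 1 : index;
}

static unsigned int config_addr(int index)
{
	return eventsel_base + addr_offset(index);
}

static unsigned int event_addr(int index)
{
	return perfctr_base + addr_offset(index);
}

int main(void)
{
	have_perfctr_core = 1;
	eventsel_base = 0xc0010200;	/* MSR_F15H_PERF_CTL0 */
	perfctr_base  = 0xc0010201;	/* MSR_F15H_PERF_CTR0 */

	for (int i = 0; i < 3; i++)
		printf("counter %d: eventsel 0x%x, perfctr 0x%x\n",
		       i, config_addr(i), event_addr(i));
	return 0;
}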
@@ -331,12 +418,12 @@ static bool reserve_pmc_hardware(void)
331 int i; 418 int i;
332 419
333 for (i = 0; i < x86_pmu.num_counters; i++) { 420 for (i = 0; i < x86_pmu.num_counters; i++) {
334 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 421 if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
335 goto perfctr_fail; 422 goto perfctr_fail;
336 } 423 }
337 424
338 for (i = 0; i < x86_pmu.num_counters; i++) { 425 for (i = 0; i < x86_pmu.num_counters; i++) {
339 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 426 if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
340 goto eventsel_fail; 427 goto eventsel_fail;
341 } 428 }
342 429
@@ -344,13 +431,13 @@ static bool reserve_pmc_hardware(void)
344 431
345eventsel_fail: 432eventsel_fail:
346 for (i--; i >= 0; i--) 433 for (i--; i >= 0; i--)
347 release_evntsel_nmi(x86_pmu.eventsel + i); 434 release_evntsel_nmi(x86_pmu_config_addr(i));
348 435
349 i = x86_pmu.num_counters; 436 i = x86_pmu.num_counters;
350 437
351perfctr_fail: 438perfctr_fail:
352 for (i--; i >= 0; i--) 439 for (i--; i >= 0; i--)
353 release_perfctr_nmi(x86_pmu.perfctr + i); 440 release_perfctr_nmi(x86_pmu_event_addr(i));
354 441
355 return false; 442 return false;
356} 443}
@@ -360,8 +447,8 @@ static void release_pmc_hardware(void)
360 int i; 447 int i;
361 448
362 for (i = 0; i < x86_pmu.num_counters; i++) { 449 for (i = 0; i < x86_pmu.num_counters; i++) {
363 release_perfctr_nmi(x86_pmu.perfctr + i); 450 release_perfctr_nmi(x86_pmu_event_addr(i));
364 release_evntsel_nmi(x86_pmu.eventsel + i); 451 release_evntsel_nmi(x86_pmu_config_addr(i));
365 } 452 }
366} 453}
367 454
@@ -382,7 +469,7 @@ static bool check_hw_exists(void)
382 * complain and bail. 469 * complain and bail.
383 */ 470 */
384 for (i = 0; i < x86_pmu.num_counters; i++) { 471 for (i = 0; i < x86_pmu.num_counters; i++) {
385 reg = x86_pmu.eventsel + i; 472 reg = x86_pmu_config_addr(i);
386 ret = rdmsrl_safe(reg, &val); 473 ret = rdmsrl_safe(reg, &val);
387 if (ret) 474 if (ret)
388 goto msr_fail; 475 goto msr_fail;
@@ -407,8 +494,8 @@ static bool check_hw_exists(void)
407 * that don't trap on the MSR access and always return 0s. 494 * that don't trap on the MSR access and always return 0s.
408 */ 495 */
409 val = 0xabcdUL; 496 val = 0xabcdUL;
410 ret = checking_wrmsrl(x86_pmu.perfctr, val); 497 ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
411 ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new); 498 ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
412 if (ret || val != val_new) 499 if (ret || val != val_new)
413 goto msr_fail; 500 goto msr_fail;
414 501
@@ -442,8 +529,9 @@ static inline int x86_pmu_initialized(void)
442} 529}
443 530
444static inline int 531static inline int
445set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) 532set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
446{ 533{
534 struct perf_event_attr *attr = &event->attr;
447 unsigned int cache_type, cache_op, cache_result; 535 unsigned int cache_type, cache_op, cache_result;
448 u64 config, val; 536 u64 config, val;
449 537
@@ -470,8 +558,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
470 return -EINVAL; 558 return -EINVAL;
471 559
472 hwc->config |= val; 560 hwc->config |= val;
473 561 attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
474 return 0; 562 return x86_pmu_extra_regs(val, event);
475} 563}
476 564
477static int x86_setup_perfctr(struct perf_event *event) 565static int x86_setup_perfctr(struct perf_event *event)
@@ -496,10 +584,10 @@ static int x86_setup_perfctr(struct perf_event *event)
496 } 584 }
497 585
498 if (attr->type == PERF_TYPE_RAW) 586 if (attr->type == PERF_TYPE_RAW)
499 return 0; 587 return x86_pmu_extra_regs(event->attr.config, event);
500 588
501 if (attr->type == PERF_TYPE_HW_CACHE) 589 if (attr->type == PERF_TYPE_HW_CACHE)
502 return set_ext_hw_attr(hwc, attr); 590 return set_ext_hw_attr(hwc, event);
503 591
504 if (attr->config >= x86_pmu.max_events) 592 if (attr->config >= x86_pmu.max_events)
505 return -EINVAL; 593 return -EINVAL;
@@ -617,11 +705,11 @@ static void x86_pmu_disable_all(void)
617 705
618 if (!test_bit(idx, cpuc->active_mask)) 706 if (!test_bit(idx, cpuc->active_mask))
619 continue; 707 continue;
620 rdmsrl(x86_pmu.eventsel + idx, val); 708 rdmsrl(x86_pmu_config_addr(idx), val);
621 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) 709 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
622 continue; 710 continue;
623 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; 711 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
624 wrmsrl(x86_pmu.eventsel + idx, val); 712 wrmsrl(x86_pmu_config_addr(idx), val);
625 } 713 }
626} 714}
627 715
@@ -642,21 +730,26 @@ static void x86_pmu_disable(struct pmu *pmu)
642 x86_pmu.disable_all(); 730 x86_pmu.disable_all();
643} 731}
644 732
733static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
734 u64 enable_mask)
735{
736 if (hwc->extra_reg)
737 wrmsrl(hwc->extra_reg, hwc->extra_config);
738 wrmsrl(hwc->config_base, hwc->config | enable_mask);
739}
740
645static void x86_pmu_enable_all(int added) 741static void x86_pmu_enable_all(int added)
646{ 742{
647 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 743 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
648 int idx; 744 int idx;
649 745
650 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 746 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
651 struct perf_event *event = cpuc->events[idx]; 747 struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
652 u64 val;
653 748
654 if (!test_bit(idx, cpuc->active_mask)) 749 if (!test_bit(idx, cpuc->active_mask))
655 continue; 750 continue;
656 751
657 val = event->hw.config; 752 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
658 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
659 wrmsrl(x86_pmu.eventsel + idx, val);
660 } 753 }
661} 754}
662 755
@@ -821,15 +914,10 @@ static inline void x86_assign_hw_event(struct perf_event *event,
821 hwc->event_base = 0; 914 hwc->event_base = 0;
822 } else if (hwc->idx >= X86_PMC_IDX_FIXED) { 915 } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
823 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 916 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
824 /* 917 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0;
825 * We set it so that event_base + idx in wrmsr/rdmsr maps to
826 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
827 */
828 hwc->event_base =
829 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
830 } else { 918 } else {
831 hwc->config_base = x86_pmu.eventsel; 919 hwc->config_base = x86_pmu_config_addr(hwc->idx);
832 hwc->event_base = x86_pmu.perfctr; 920 hwc->event_base = x86_pmu_event_addr(hwc->idx);
833 } 921 }
834} 922}
835 923
@@ -915,17 +1003,11 @@ static void x86_pmu_enable(struct pmu *pmu)
915 x86_pmu.enable_all(added); 1003 x86_pmu.enable_all(added);
916} 1004}
917 1005
918static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
919 u64 enable_mask)
920{
921 wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
922}
923
924static inline void x86_pmu_disable_event(struct perf_event *event) 1006static inline void x86_pmu_disable_event(struct perf_event *event)
925{ 1007{
926 struct hw_perf_event *hwc = &event->hw; 1008 struct hw_perf_event *hwc = &event->hw;
927 1009
928 wrmsrl(hwc->config_base + hwc->idx, hwc->config); 1010 wrmsrl(hwc->config_base, hwc->config);
929} 1011}
930 1012
931static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 1013static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -978,7 +1060,7 @@ x86_perf_event_set_period(struct perf_event *event)
978 */ 1060 */
979 local64_set(&hwc->prev_count, (u64)-left); 1061 local64_set(&hwc->prev_count, (u64)-left);
980 1062
981 wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); 1063 wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
982 1064
983 /* 1065 /*
984 * Due to an erratum on certain CPUs we need 1066
@@ -986,7 +1068,7 @@ x86_perf_event_set_period(struct perf_event *event)
986 * is updated properly 1068 * is updated properly
987 */ 1069 */
988 if (x86_pmu.perfctr_second_write) { 1070 if (x86_pmu.perfctr_second_write) {
989 wrmsrl(hwc->event_base + idx, 1071 wrmsrl(hwc->event_base,
990 (u64)(-left) & x86_pmu.cntval_mask); 1072 (u64)(-left) & x86_pmu.cntval_mask);
991 } 1073 }
992 1074
@@ -1113,8 +1195,8 @@ void perf_event_print_debug(void)
1113 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); 1195 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1114 1196
1115 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1197 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1116 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1198 rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1117 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1199 rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1118 1200
1119 prev_left = per_cpu(pmc_prev_left[idx], cpu); 1201 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1120 1202
@@ -1389,7 +1471,7 @@ static void __init pmu_check_apic(void)
1389 pr_info("no hardware sampling interrupt available.\n"); 1471 pr_info("no hardware sampling interrupt available.\n");
1390} 1472}
1391 1473
1392int __init init_hw_perf_events(void) 1474static int __init init_hw_perf_events(void)
1393{ 1475{
1394 struct event_constraint *c; 1476 struct event_constraint *c;
1395 int err; 1477 int err;
@@ -1608,7 +1690,7 @@ out:
1608 return ret; 1690 return ret;
1609} 1691}
1610 1692
1611int x86_pmu_event_init(struct perf_event *event) 1693static int x86_pmu_event_init(struct perf_event *event)
1612{ 1694{
1613 struct pmu *tmp; 1695 struct pmu *tmp;
1614 int err; 1696 int err;
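
The mechanical change running through the hunks above is that every open-coded "x86_pmu.eventsel + idx" / "x86_pmu.perfctr + idx" computation now goes through x86_pmu_config_addr() / x86_pmu_event_addr(), so a single place can account for PMUs whose control and counter MSRs are not laid out contiguously. Below is a minimal user-space sketch of that indirection; the function-pointer dispatch is a stand-in for however the real helpers pick the spacing rule (the family 15h comment in the AMD diff refers to an x86_pmu_addr_offset() helper), while the MSR bases are the usual K7 and family 15h values:

    #include <stdio.h>

    struct fake_pmu {
            unsigned int eventsel;                /* first config MSR */
            unsigned int perfctr;                 /* first counter MSR */
            unsigned int (*addr_offset)(int idx); /* per-PMU spacing rule */
    };

    /* legacy layout: MSRs are contiguous, offset == index */
    static unsigned int linear_offset(int idx) { return idx; }

    /* family 15h: CTL/CTR pairs are interleaved, offset == index * 2 */
    static unsigned int f15h_offset(int idx) { return idx << 1; }

    static unsigned int config_addr(const struct fake_pmu *pmu, int idx)
    {
            return pmu->eventsel + pmu->addr_offset(idx);
    }

    static unsigned int event_addr(const struct fake_pmu *pmu, int idx)
    {
            return pmu->perfctr + pmu->addr_offset(idx);
    }

    int main(void)
    {
            /* MSR_K7_EVNTSEL0/PERFCTR0 and MSR_F15H_PERF_CTL/CTR bases */
            struct fake_pmu k7   = { 0xc0010000, 0xc0010004, linear_offset };
            struct fake_pmu f15h = { 0xc0010200, 0xc0010201, f15h_offset };
            int idx;

            for (idx = 0; idx < 3; idx++)
                    printf("idx %d: K7 ctl %#x/ctr %#x, F15h ctl %#x/ctr %#x\n",
                           idx, config_addr(&k7, idx), event_addr(&k7, idx),
                           config_addr(&f15h, idx), event_addr(&f15h, idx));
            return 0;
    }

With the legacy layout the offset is simply the index; family 15h interleaves CTL/CTR pairs, so the offset is index * 2, which is also why the perfctr-watchdog hunk near the end of this series halves the MSR delta.
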
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 67e2202a603..461f62bbd77 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -127,6 +127,11 @@ static int amd_pmu_hw_config(struct perf_event *event)
127/* 127/*
128 * AMD64 events are detected based on their event codes. 128 * AMD64 events are detected based on their event codes.
129 */ 129 */
130static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
131{
132 return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
133}
134
130static inline int amd_is_nb_event(struct hw_perf_event *hwc) 135static inline int amd_is_nb_event(struct hw_perf_event *hwc)
131{ 136{
132 return (hwc->config & 0xe0) == 0xe0; 137 return (hwc->config & 0xe0) == 0xe0;
@@ -385,13 +390,181 @@ static __initconst const struct x86_pmu amd_pmu = {
385 .cpu_dead = amd_pmu_cpu_dead, 390 .cpu_dead = amd_pmu_cpu_dead,
386}; 391};
387 392
393/* AMD Family 15h */
394
395#define AMD_EVENT_TYPE_MASK 0x000000F0ULL
396
397#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL
398#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL
399#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL
400#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL
401#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL
402#define AMD_EVENT_EX_LS 0x000000C0ULL
403#define AMD_EVENT_DE 0x000000D0ULL
404#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL
405
406/*
407 * AMD family 15h event code/PMC mappings:
408 *
409 * type = event_code & 0x0F0:
410 *
411 * 0x000 FP PERF_CTL[5:3]
412 * 0x010 FP PERF_CTL[5:3]
413 * 0x020 LS PERF_CTL[5:0]
414 * 0x030 LS PERF_CTL[5:0]
415 * 0x040 DC PERF_CTL[5:0]
416 * 0x050 DC PERF_CTL[5:0]
417 * 0x060 CU PERF_CTL[2:0]
418 * 0x070 CU PERF_CTL[2:0]
419 * 0x080 IC/DE PERF_CTL[2:0]
420 * 0x090 IC/DE PERF_CTL[2:0]
421 * 0x0A0 ---
422 * 0x0B0 ---
423 * 0x0C0 EX/LS PERF_CTL[5:0]
424 * 0x0D0 DE PERF_CTL[2:0]
425 * 0x0E0 NB NB_PERF_CTL[3:0]
426 * 0x0F0 NB NB_PERF_CTL[3:0]
427 *
428 * Exceptions:
429 *
430 * 0x003 FP PERF_CTL[3]
431 * 0x00B FP PERF_CTL[3]
432 * 0x00D FP PERF_CTL[3]
433 * 0x023 DE PERF_CTL[2:0]
434 * 0x02D LS PERF_CTL[3]
435 * 0x02E LS PERF_CTL[3,0]
436 * 0x043 CU PERF_CTL[2:0]
437 * 0x045 CU PERF_CTL[2:0]
438 * 0x046 CU PERF_CTL[2:0]
439 * 0x054 CU PERF_CTL[2:0]
440 * 0x055 CU PERF_CTL[2:0]
441 * 0x08F IC PERF_CTL[0]
442 * 0x187 DE PERF_CTL[0]
443 * 0x188 DE PERF_CTL[0]
444 * 0x0DB EX PERF_CTL[5:0]
445 * 0x0DC LS PERF_CTL[5:0]
446 * 0x0DD LS PERF_CTL[5:0]
447 * 0x0DE LS PERF_CTL[5:0]
448 * 0x0DF LS PERF_CTL[5:0]
449 * 0x1D6 EX PERF_CTL[5:0]
450 * 0x1D8 EX PERF_CTL[5:0]
451 */
452
453static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
454static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
455static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
456static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0);
457static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
458static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
459
460static struct event_constraint *
461amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
462{
463 unsigned int event_code = amd_get_event_code(&event->hw);
464
465 switch (event_code & AMD_EVENT_TYPE_MASK) {
466 case AMD_EVENT_FP:
467 switch (event_code) {
468 case 0x003:
469 case 0x00B:
470 case 0x00D:
471 return &amd_f15_PMC3;
472 default:
473 return &amd_f15_PMC53;
474 }
475 case AMD_EVENT_LS:
476 case AMD_EVENT_DC:
477 case AMD_EVENT_EX_LS:
478 switch (event_code) {
479 case 0x023:
480 case 0x043:
481 case 0x045:
482 case 0x046:
483 case 0x054:
484 case 0x055:
485 return &amd_f15_PMC20;
486 case 0x02D:
487 return &amd_f15_PMC3;
488 case 0x02E:
489 return &amd_f15_PMC30;
490 default:
491 return &amd_f15_PMC50;
492 }
493 case AMD_EVENT_CU:
494 case AMD_EVENT_IC_DE:
495 case AMD_EVENT_DE:
496 switch (event_code) {
497 case 0x08F:
498 case 0x187:
499 case 0x188:
500 return &amd_f15_PMC0;
501 case 0x0DB ... 0x0DF:
502 case 0x1D6:
503 case 0x1D8:
504 return &amd_f15_PMC50;
505 default:
506 return &amd_f15_PMC20;
507 }
508 case AMD_EVENT_NB:
509 /* not yet implemented */
510 return &emptyconstraint;
511 default:
512 return &emptyconstraint;
513 }
514}
515
516static __initconst const struct x86_pmu amd_pmu_f15h = {
517 .name = "AMD Family 15h",
518 .handle_irq = x86_pmu_handle_irq,
519 .disable_all = x86_pmu_disable_all,
520 .enable_all = x86_pmu_enable_all,
521 .enable = x86_pmu_enable_event,
522 .disable = x86_pmu_disable_event,
523 .hw_config = amd_pmu_hw_config,
524 .schedule_events = x86_schedule_events,
525 .eventsel = MSR_F15H_PERF_CTL,
526 .perfctr = MSR_F15H_PERF_CTR,
527 .event_map = amd_pmu_event_map,
528 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
529 .num_counters = 6,
530 .cntval_bits = 48,
531 .cntval_mask = (1ULL << 48) - 1,
532 .apic = 1,
533 /* use highest bit to detect overflow */
534 .max_period = (1ULL << 47) - 1,
535 .get_event_constraints = amd_get_event_constraints_f15h,
536 /* northbridge counters not yet implemented: */
537#if 0
538 .put_event_constraints = amd_put_event_constraints,
539
540 .cpu_prepare = amd_pmu_cpu_prepare,
541 .cpu_starting = amd_pmu_cpu_starting,
542 .cpu_dead = amd_pmu_cpu_dead,
543#endif
544};
545
388static __init int amd_pmu_init(void) 546static __init int amd_pmu_init(void)
389{ 547{
390 /* Performance-monitoring supported from K7 and later: */ 548 /* Performance-monitoring supported from K7 and later: */
391 if (boot_cpu_data.x86 < 6) 549 if (boot_cpu_data.x86 < 6)
392 return -ENODEV; 550 return -ENODEV;
393 551
394 x86_pmu = amd_pmu; 552 /*
553 * If core performance counter extensions exist, it must be
554 * family 15h; otherwise fail. See x86_pmu_addr_offset().
555 */
556 switch (boot_cpu_data.x86) {
557 case 0x15:
558 if (!cpu_has_perfctr_core)
559 return -ENODEV;
560 x86_pmu = amd_pmu_f15h;
561 break;
562 default:
563 if (cpu_has_perfctr_core)
564 return -ENODEV;
565 x86_pmu = amd_pmu;
566 break;
567 }
395 568
396 /* Events are common for all AMDs */ 569 /* Events are common for all AMDs */
397 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, 570 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
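
Family 15h spreads the 12-bit event select across PERF_CTL bits 0-7 and 32-35, so amd_get_event_code() has to stitch the two halves back together before the type nibble can pick a counter mask. The stand-alone demo below reuses the extraction from the patch verbatim; classify() is a deliberately simplified sketch that mirrors only a few arms of amd_get_event_constraints_f15h() and ignores the exception codes such as 0x003:

    #include <stdio.h>

    /* copied from the patch: event select lives in bits 0-7 and 32-35 */
    static unsigned int amd_get_event_code(unsigned long long config)
    {
            return ((config >> 24) & 0x0f00) | (config & 0x00ff);
    }

    /* simplified router: type nibble only, exception codes omitted */
    static const char *classify(unsigned int code)
    {
            switch (code & 0x0f0) {
            case 0x000: case 0x010: return "FP,  PERF_CTL[5:3]";
            case 0x020: case 0x030: return "LS,  PERF_CTL[5:0]";
            case 0x0e0: case 0x0f0: return "NB,  not implemented";
            default:                return "other";
            }
    }

    int main(void)
    {
            unsigned long long cfgs[] = {
                    0x0003ULL,                /* FP event code 0x003 */
                    0x0025ULL,                /* an LS-type event */
                    (0x1ULL << 32) | 0xe0ULL, /* high nibble set: code 0x1e0, NB */
            };
            int i;

            for (i = 0; i < 3; i++) {
                    unsigned int code = amd_get_event_code(cfgs[i]);
                    printf("config %#llx -> code %#05x: %s\n",
                           cfgs[i], code, classify(code));
            }
            return 0;
    }
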
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 008835c1d79..8fc2b2cee1d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,5 +1,27 @@
1#ifdef CONFIG_CPU_SUP_INTEL 1#ifdef CONFIG_CPU_SUP_INTEL
2 2
3#define MAX_EXTRA_REGS 2
4
5/*
6 * Per register state.
7 */
8struct er_account {
9 int ref; /* reference count */
10 unsigned int extra_reg; /* extra MSR number */
11 u64 extra_config; /* extra MSR config */
12};
13
14/*
15 * Per core state
16 * This is used to coordinate shared registers for HT threads.
17 */
18struct intel_percore {
19 raw_spinlock_t lock; /* protect structure */
20 struct er_account regs[MAX_EXTRA_REGS];
21 int refcnt; /* number of threads */
22 unsigned core_id;
23};
24
3/* 25/*
4 * Intel PerfMon, used on Core and later. 26 * Intel PerfMon, used on Core and later.
5 */ 27 */
@@ -64,6 +86,18 @@ static struct event_constraint intel_nehalem_event_constraints[] =
64 EVENT_CONSTRAINT_END 86 EVENT_CONSTRAINT_END
65}; 87};
66 88
89static struct extra_reg intel_nehalem_extra_regs[] =
90{
91 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
92 EVENT_EXTRA_END
93};
94
95static struct event_constraint intel_nehalem_percore_constraints[] =
96{
97 INTEL_EVENT_CONSTRAINT(0xb7, 0),
98 EVENT_CONSTRAINT_END
99};
100
67static struct event_constraint intel_westmere_event_constraints[] = 101static struct event_constraint intel_westmere_event_constraints[] =
68{ 102{
69 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -76,6 +110,33 @@ static struct event_constraint intel_westmere_event_constraints[] =
76 EVENT_CONSTRAINT_END 110 EVENT_CONSTRAINT_END
77}; 111};
78 112
113static struct event_constraint intel_snb_event_constraints[] =
114{
115 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
116 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
117 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
118 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
119 INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
120 INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
121 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
122 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
123 EVENT_CONSTRAINT_END
124};
125
126static struct extra_reg intel_westmere_extra_regs[] =
127{
128 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
129 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
130 EVENT_EXTRA_END
131};
132
133static struct event_constraint intel_westmere_percore_constraints[] =
134{
135 INTEL_EVENT_CONSTRAINT(0xb7, 0),
136 INTEL_EVENT_CONSTRAINT(0xbb, 0),
137 EVENT_CONSTRAINT_END
138};
139
79static struct event_constraint intel_gen_event_constraints[] = 140static struct event_constraint intel_gen_event_constraints[] =
80{ 141{
81 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 142 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -89,6 +150,106 @@ static u64 intel_pmu_event_map(int hw_event)
89 return intel_perfmon_event_map[hw_event]; 150 return intel_perfmon_event_map[hw_event];
90} 151}
91 152
153static __initconst const u64 snb_hw_cache_event_ids
154 [PERF_COUNT_HW_CACHE_MAX]
155 [PERF_COUNT_HW_CACHE_OP_MAX]
156 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
157{
158 [ C(L1D) ] = {
159 [ C(OP_READ) ] = {
160 [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */
161 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */
162 },
163 [ C(OP_WRITE) ] = {
164 [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */
165 [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */
166 },
167 [ C(OP_PREFETCH) ] = {
168 [ C(RESULT_ACCESS) ] = 0x0,
169 [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */
170 },
171 },
172 [ C(L1I ) ] = {
173 [ C(OP_READ) ] = {
174 [ C(RESULT_ACCESS) ] = 0x0,
175 [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */
176 },
177 [ C(OP_WRITE) ] = {
178 [ C(RESULT_ACCESS) ] = -1,
179 [ C(RESULT_MISS) ] = -1,
180 },
181 [ C(OP_PREFETCH) ] = {
182 [ C(RESULT_ACCESS) ] = 0x0,
183 [ C(RESULT_MISS) ] = 0x0,
184 },
185 },
186 [ C(LL ) ] = {
187 /*
188 * TBD: Need Off-core Response Performance Monitoring support
189 */
190 [ C(OP_READ) ] = {
191 /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */
192 [ C(RESULT_ACCESS) ] = 0x01b7,
193 /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */
194 [ C(RESULT_MISS) ] = 0x01bb,
195 },
196 [ C(OP_WRITE) ] = {
197 /* OFFCORE_RESPONSE_0.ANY_RFO.LOCAL_CACHE */
198 [ C(RESULT_ACCESS) ] = 0x01b7,
199 /* OFFCORE_RESPONSE_1.ANY_RFO.ANY_LLC_MISS */
200 [ C(RESULT_MISS) ] = 0x01bb,
201 },
202 [ C(OP_PREFETCH) ] = {
203 /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */
204 [ C(RESULT_ACCESS) ] = 0x01b7,
205 /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */
206 [ C(RESULT_MISS) ] = 0x01bb,
207 },
208 },
209 [ C(DTLB) ] = {
210 [ C(OP_READ) ] = {
211 [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
212 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
213 },
214 [ C(OP_WRITE) ] = {
215 [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
216 [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
217 },
218 [ C(OP_PREFETCH) ] = {
219 [ C(RESULT_ACCESS) ] = 0x0,
220 [ C(RESULT_MISS) ] = 0x0,
221 },
222 },
223 [ C(ITLB) ] = {
224 [ C(OP_READ) ] = {
225 [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */
226 [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */
227 },
228 [ C(OP_WRITE) ] = {
229 [ C(RESULT_ACCESS) ] = -1,
230 [ C(RESULT_MISS) ] = -1,
231 },
232 [ C(OP_PREFETCH) ] = {
233 [ C(RESULT_ACCESS) ] = -1,
234 [ C(RESULT_MISS) ] = -1,
235 },
236 },
237 [ C(BPU ) ] = {
238 [ C(OP_READ) ] = {
239 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
240 [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
241 },
242 [ C(OP_WRITE) ] = {
243 [ C(RESULT_ACCESS) ] = -1,
244 [ C(RESULT_MISS) ] = -1,
245 },
246 [ C(OP_PREFETCH) ] = {
247 [ C(RESULT_ACCESS) ] = -1,
248 [ C(RESULT_MISS) ] = -1,
249 },
250 },
251};
252
92static __initconst const u64 westmere_hw_cache_event_ids 253static __initconst const u64 westmere_hw_cache_event_ids
93 [PERF_COUNT_HW_CACHE_MAX] 254 [PERF_COUNT_HW_CACHE_MAX]
94 [PERF_COUNT_HW_CACHE_OP_MAX] 255 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -124,16 +285,26 @@ static __initconst const u64 westmere_hw_cache_event_ids
124 }, 285 },
125 [ C(LL ) ] = { 286 [ C(LL ) ] = {
126 [ C(OP_READ) ] = { 287 [ C(OP_READ) ] = {
127 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ 288 /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */
128 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ 289 [ C(RESULT_ACCESS) ] = 0x01b7,
290 /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */
291 [ C(RESULT_MISS) ] = 0x01bb,
129 }, 292 },
293 /*
294 * Use RFO, not WRITEBACK, because a write miss would typically occur
295 * on RFO.
296 */
130 [ C(OP_WRITE) ] = { 297 [ C(OP_WRITE) ] = {
131 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ 298 /* OFFCORE_RESPONSE_1.ANY_RFO.LOCAL_CACHE */
132 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ 299 [ C(RESULT_ACCESS) ] = 0x01bb,
300 /* OFFCORE_RESPONSE_0.ANY_RFO.ANY_LLC_MISS */
301 [ C(RESULT_MISS) ] = 0x01b7,
133 }, 302 },
134 [ C(OP_PREFETCH) ] = { 303 [ C(OP_PREFETCH) ] = {
135 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ 304 /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */
136 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ 305 [ C(RESULT_ACCESS) ] = 0x01b7,
306 /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */
307 [ C(RESULT_MISS) ] = 0x01bb,
137 }, 308 },
138 }, 309 },
139 [ C(DTLB) ] = { 310 [ C(DTLB) ] = {
@@ -180,6 +351,39 @@ static __initconst const u64 westmere_hw_cache_event_ids
180 }, 351 },
181}; 352};
182 353
354/*
355 * OFFCORE_RESPONSE MSR bits (subset); see IA32 SDM Vol 3, Sec. 30.6.1.3
356 */
357
358#define DMND_DATA_RD (1 << 0)
359#define DMND_RFO (1 << 1)
360#define DMND_WB (1 << 3)
361#define PF_DATA_RD (1 << 4)
362#define PF_DATA_RFO (1 << 5)
363#define RESP_UNCORE_HIT (1 << 8)
364#define RESP_MISS (0xf600) /* non uncore hit */
365
366static __initconst const u64 nehalem_hw_cache_extra_regs
367 [PERF_COUNT_HW_CACHE_MAX]
368 [PERF_COUNT_HW_CACHE_OP_MAX]
369 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
370{
371 [ C(LL ) ] = {
372 [ C(OP_READ) ] = {
373 [ C(RESULT_ACCESS) ] = DMND_DATA_RD|RESP_UNCORE_HIT,
374 [ C(RESULT_MISS) ] = DMND_DATA_RD|RESP_MISS,
375 },
376 [ C(OP_WRITE) ] = {
377 [ C(RESULT_ACCESS) ] = DMND_RFO|DMND_WB|RESP_UNCORE_HIT,
378 [ C(RESULT_MISS) ] = DMND_RFO|DMND_WB|RESP_MISS,
379 },
380 [ C(OP_PREFETCH) ] = {
381 [ C(RESULT_ACCESS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_UNCORE_HIT,
382 [ C(RESULT_MISS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_MISS,
383 },
384 }
385};
386
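
The DMND_*/PF_*/RESP_* bits are OR-ed together to form the value programmed into the OFFCORE_RSP MSR; the extra-regs table simply stores one such value per cache op/result, and set_ext_hw_attr() in the first file of this series copies it into attr->config1. A tiny sketch of the composition, reusing the definitions verbatim:

    #include <stdio.h>

    /* bit definitions copied from the patch above */
    #define DMND_DATA_RD    (1 << 0)
    #define DMND_RFO        (1 << 1)
    #define DMND_WB         (1 << 3)
    #define RESP_UNCORE_HIT (1 << 8)
    #define RESP_MISS       (0xf600)  /* non uncore hit */

    int main(void)
    {
            /* the LL/OP_READ pair from nehalem_hw_cache_extra_regs */
            unsigned int ll_read_access = DMND_DATA_RD | RESP_UNCORE_HIT;
            unsigned int ll_read_miss   = DMND_DATA_RD | RESP_MISS;

            printf("OFFCORE_RSP, LL read access: %#06x\n", ll_read_access); /* 0x0101 */
            printf("OFFCORE_RSP, LL read miss:   %#06x\n", ll_read_miss);   /* 0xf601 */
            return 0;
    }
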
183static __initconst const u64 nehalem_hw_cache_event_ids 387static __initconst const u64 nehalem_hw_cache_event_ids
184 [PERF_COUNT_HW_CACHE_MAX] 388 [PERF_COUNT_HW_CACHE_MAX]
185 [PERF_COUNT_HW_CACHE_OP_MAX] 389 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -215,16 +419,26 @@ static __initconst const u64 nehalem_hw_cache_event_ids
215 }, 419 },
216 [ C(LL ) ] = { 420 [ C(LL ) ] = {
217 [ C(OP_READ) ] = { 421 [ C(OP_READ) ] = {
218 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ 422 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
219 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ 423 [ C(RESULT_ACCESS) ] = 0x01b7,
424 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
425 [ C(RESULT_MISS) ] = 0x01b7,
220 }, 426 },
427 /*
428 * Use RFO, not WRITEBACK, because a write miss would typically occur
429 * on RFO.
430 */
221 [ C(OP_WRITE) ] = { 431 [ C(OP_WRITE) ] = {
222 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ 432 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
223 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ 433 [ C(RESULT_ACCESS) ] = 0x01b7,
434 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
435 [ C(RESULT_MISS) ] = 0x01b7,
224 }, 436 },
225 [ C(OP_PREFETCH) ] = { 437 [ C(OP_PREFETCH) ] = {
226 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ 438 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
227 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ 439 [ C(RESULT_ACCESS) ] = 0x01b7,
440 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
441 [ C(RESULT_MISS) ] = 0x01b7,
228 }, 442 },
229 }, 443 },
230 [ C(DTLB) ] = { 444 [ C(DTLB) ] = {
@@ -691,8 +905,8 @@ static void intel_pmu_reset(void)
691 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 905 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
692 906
693 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 907 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
694 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 908 checking_wrmsrl(x86_pmu_config_addr(idx), 0ull);
695 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 909 checking_wrmsrl(x86_pmu_event_addr(idx), 0ull);
696 } 910 }
697 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) 911 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
698 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 912 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
@@ -794,6 +1008,67 @@ intel_bts_constraints(struct perf_event *event)
794} 1008}
795 1009
796static struct event_constraint * 1010static struct event_constraint *
1011intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1012{
1013 struct hw_perf_event *hwc = &event->hw;
1014 unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT;
1015 struct event_constraint *c;
1016 struct intel_percore *pc;
1017 struct er_account *era;
1018 int i;
1019 int free_slot;
1020 int found;
1021
1022 if (!x86_pmu.percore_constraints || hwc->extra_alloc)
1023 return NULL;
1024
1025 for (c = x86_pmu.percore_constraints; c->cmask; c++) {
1026 if (e != c->code)
1027 continue;
1028
1029 /*
1030 * Allocate resource per core.
1031 */
1032 pc = cpuc->per_core;
1033 if (!pc)
1034 break;
1035 c = &emptyconstraint;
1036 raw_spin_lock(&pc->lock);
1037 free_slot = -1;
1038 found = 0;
1039 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1040 era = &pc->regs[i];
1041 if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
1042 /* Allow sharing same config */
1043 if (hwc->extra_config == era->extra_config) {
1044 era->ref++;
1045 cpuc->percore_used = 1;
1046 hwc->extra_alloc = 1;
1047 c = NULL;
1048 }
1049 /* else conflict */
1050 found = 1;
1051 break;
1052 } else if (era->ref == 0 && free_slot == -1)
1053 free_slot = i;
1054 }
1055 if (!found && free_slot != -1) {
1056 era = &pc->regs[free_slot];
1057 era->ref = 1;
1058 era->extra_reg = hwc->extra_reg;
1059 era->extra_config = hwc->extra_config;
1060 cpuc->percore_used = 1;
1061 hwc->extra_alloc = 1;
1062 c = NULL;
1063 }
1064 raw_spin_unlock(&pc->lock);
1065 return c;
1066 }
1067
1068 return NULL;
1069}
1070
1071static struct event_constraint *
797intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 1072intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
798{ 1073{
799 struct event_constraint *c; 1074 struct event_constraint *c;
@@ -806,9 +1081,51 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
806 if (c) 1081 if (c)
807 return c; 1082 return c;
808 1083
1084 c = intel_percore_constraints(cpuc, event);
1085 if (c)
1086 return c;
1087
809 return x86_get_event_constraints(cpuc, event); 1088 return x86_get_event_constraints(cpuc, event);
810} 1089}
811 1090
1091static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
1092 struct perf_event *event)
1093{
1094 struct extra_reg *er;
1095 struct intel_percore *pc;
1096 struct er_account *era;
1097 struct hw_perf_event *hwc = &event->hw;
1098 int i, allref;
1099
1100 if (!cpuc->percore_used)
1101 return;
1102
1103 for (er = x86_pmu.extra_regs; er->msr; er++) {
1104 if (er->event != (hwc->config & er->config_mask))
1105 continue;
1106
1107 pc = cpuc->per_core;
1108 raw_spin_lock(&pc->lock);
1109 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1110 era = &pc->regs[i];
1111 if (era->ref > 0 &&
1112 era->extra_config == hwc->extra_config &&
1113 era->extra_reg == er->msr) {
1114 era->ref--;
1115 hwc->extra_alloc = 0;
1116 break;
1117 }
1118 }
1119 allref = 0;
1120 for (i = 0; i < MAX_EXTRA_REGS; i++)
1121 allref += pc->regs[i].ref;
1122 if (allref == 0)
1123 cpuc->percore_used = 0;
1124 raw_spin_unlock(&pc->lock);
1125 break;
1126 }
1127}
1128
812static int intel_pmu_hw_config(struct perf_event *event) 1129static int intel_pmu_hw_config(struct perf_event *event)
813{ 1130{
814 int ret = x86_pmu_hw_config(event); 1131 int ret = x86_pmu_hw_config(event);
@@ -880,20 +1197,67 @@ static __initconst const struct x86_pmu core_pmu = {
880 */ 1197 */
881 .max_period = (1ULL << 31) - 1, 1198 .max_period = (1ULL << 31) - 1,
882 .get_event_constraints = intel_get_event_constraints, 1199 .get_event_constraints = intel_get_event_constraints,
1200 .put_event_constraints = intel_put_event_constraints,
883 .event_constraints = intel_core_event_constraints, 1201 .event_constraints = intel_core_event_constraints,
884}; 1202};
885 1203
1204static int intel_pmu_cpu_prepare(int cpu)
1205{
1206 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1207
1208 if (!cpu_has_ht_siblings())
1209 return NOTIFY_OK;
1210
1211 cpuc->per_core = kzalloc_node(sizeof(struct intel_percore),
1212 GFP_KERNEL, cpu_to_node(cpu));
1213 if (!cpuc->per_core)
1214 return NOTIFY_BAD;
1215
1216 raw_spin_lock_init(&cpuc->per_core->lock);
1217 cpuc->per_core->core_id = -1;
1218 return NOTIFY_OK;
1219}
1220
886static void intel_pmu_cpu_starting(int cpu) 1221static void intel_pmu_cpu_starting(int cpu)
887{ 1222{
1223 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1224 int core_id = topology_core_id(cpu);
1225 int i;
1226
888 init_debug_store_on_cpu(cpu); 1227 init_debug_store_on_cpu(cpu);
889 /* 1228 /*
890 * Deal with CPUs that don't clear their LBRs on power-up. 1229 * Deal with CPUs that don't clear their LBRs on power-up.
891 */ 1230 */
892 intel_pmu_lbr_reset(); 1231 intel_pmu_lbr_reset();
1232
1233 if (!cpu_has_ht_siblings())
1234 return;
1235
1236 for_each_cpu(i, topology_thread_cpumask(cpu)) {
1237 struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
1238
1239 if (pc && pc->core_id == core_id) {
1240 kfree(cpuc->per_core);
1241 cpuc->per_core = pc;
1242 break;
1243 }
1244 }
1245
1246 cpuc->per_core->core_id = core_id;
1247 cpuc->per_core->refcnt++;
893} 1248}
894 1249
895static void intel_pmu_cpu_dying(int cpu) 1250static void intel_pmu_cpu_dying(int cpu)
896{ 1251{
1252 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1253 struct intel_percore *pc = cpuc->per_core;
1254
1255 if (pc) {
1256 if (pc->core_id == -1 || --pc->refcnt == 0)
1257 kfree(pc);
1258 cpuc->per_core = NULL;
1259 }
1260
897 fini_debug_store_on_cpu(cpu); 1261 fini_debug_store_on_cpu(cpu);
898} 1262}
899 1263
@@ -918,7 +1282,9 @@ static __initconst const struct x86_pmu intel_pmu = {
918 */ 1282 */
919 .max_period = (1ULL << 31) - 1, 1283 .max_period = (1ULL << 31) - 1,
920 .get_event_constraints = intel_get_event_constraints, 1284 .get_event_constraints = intel_get_event_constraints,
1285 .put_event_constraints = intel_put_event_constraints,
921 1286
1287 .cpu_prepare = intel_pmu_cpu_prepare,
922 .cpu_starting = intel_pmu_cpu_starting, 1288 .cpu_starting = intel_pmu_cpu_starting,
923 .cpu_dying = intel_pmu_cpu_dying, 1289 .cpu_dying = intel_pmu_cpu_dying,
924}; 1290};
@@ -1024,6 +1390,7 @@ static __init int intel_pmu_init(void)
1024 intel_pmu_lbr_init_core(); 1390 intel_pmu_lbr_init_core();
1025 1391
1026 x86_pmu.event_constraints = intel_core2_event_constraints; 1392 x86_pmu.event_constraints = intel_core2_event_constraints;
1393 x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
1027 pr_cont("Core2 events, "); 1394 pr_cont("Core2 events, ");
1028 break; 1395 break;
1029 1396
@@ -1032,11 +1399,16 @@ static __init int intel_pmu_init(void)
1032 case 46: /* 45 nm nehalem-ex, "Beckton" */ 1399 case 46: /* 45 nm nehalem-ex, "Beckton" */
1033 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 1400 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
1034 sizeof(hw_cache_event_ids)); 1401 sizeof(hw_cache_event_ids));
1402 memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
1403 sizeof(hw_cache_extra_regs));
1035 1404
1036 intel_pmu_lbr_init_nhm(); 1405 intel_pmu_lbr_init_nhm();
1037 1406
1038 x86_pmu.event_constraints = intel_nehalem_event_constraints; 1407 x86_pmu.event_constraints = intel_nehalem_event_constraints;
1408 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
1409 x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
1039 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1410 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1411 x86_pmu.extra_regs = intel_nehalem_extra_regs;
1040 pr_cont("Nehalem events, "); 1412 pr_cont("Nehalem events, ");
1041 break; 1413 break;
1042 1414
@@ -1047,6 +1419,7 @@ static __init int intel_pmu_init(void)
1047 intel_pmu_lbr_init_atom(); 1419 intel_pmu_lbr_init_atom();
1048 1420
1049 x86_pmu.event_constraints = intel_gen_event_constraints; 1421 x86_pmu.event_constraints = intel_gen_event_constraints;
1422 x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
1050 pr_cont("Atom events, "); 1423 pr_cont("Atom events, ");
1051 break; 1424 break;
1052 1425
@@ -1054,14 +1427,30 @@ static __init int intel_pmu_init(void)
1054 case 44: /* 32 nm nehalem, "Gulftown" */ 1427 case 44: /* 32 nm nehalem, "Gulftown" */
1055 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, 1428 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
1056 sizeof(hw_cache_event_ids)); 1429 sizeof(hw_cache_event_ids));
1430 memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
1431 sizeof(hw_cache_extra_regs));
1057 1432
1058 intel_pmu_lbr_init_nhm(); 1433 intel_pmu_lbr_init_nhm();
1059 1434
1060 x86_pmu.event_constraints = intel_westmere_event_constraints; 1435 x86_pmu.event_constraints = intel_westmere_event_constraints;
1436 x86_pmu.percore_constraints = intel_westmere_percore_constraints;
1061 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1437 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1438 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
1439 x86_pmu.extra_regs = intel_westmere_extra_regs;
1062 pr_cont("Westmere events, "); 1440 pr_cont("Westmere events, ");
1063 break; 1441 break;
1064 1442
1443 case 42: /* SandyBridge */
1444 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1445 sizeof(hw_cache_event_ids));
1446
1447 intel_pmu_lbr_init_nhm();
1448
1449 x86_pmu.event_constraints = intel_snb_event_constraints;
1450 x86_pmu.pebs_constraints = intel_snb_pebs_events;
1451 pr_cont("SandyBridge events, ");
1452 break;
1453
1065 default: 1454 default:
1066 /* 1455 /*
1067 * default constraints for v2 and up 1456 * default constraints for v2 and up
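
The per-core machinery added to this file enforces one rule: both HT siblings may use an OFFCORE_RSP register at the same time only if they program the identical value into it, and the last reference to drop frees the slot. Below is a toy model of the er_account protocol with the locking, the extra-reg table walk and the per-core lookup stripped away; er_get() and er_put() are names invented for this sketch:

    #include <stdio.h>

    struct er_account {
            int ref;                         /* reference count */
            unsigned int extra_reg;          /* extra MSR number */
            unsigned long long extra_config; /* extra MSR config */
    };

    /* 0 on success, -1 when the register is held with another config */
    static int er_get(struct er_account *era, unsigned int reg,
                      unsigned long long config)
    {
            if (era->ref && (era->extra_reg != reg ||
                             era->extra_config != config))
                    return -1;      /* conflict: caller gets emptyconstraint */
            era->extra_reg = reg;
            era->extra_config = config;
            era->ref++;
            return 0;
    }

    static void er_put(struct er_account *era)
    {
            if (era->ref > 0)
                    era->ref--;
    }

    int main(void)
    {
            struct er_account era = { 0, 0, 0 };

            /* 0x1a6 is MSR_OFFCORE_RSP_0 */
            printf("sibling 0, cfg 0xf601: %d\n", er_get(&era, 0x1a6, 0xf601));
            printf("sibling 1, same cfg:   %d\n", er_get(&era, 0x1a6, 0xf601));
            printf("sibling 1, other cfg:  %d\n", er_get(&era, 0x1a6, 0x0101));
            er_put(&era);
            er_put(&era);
            printf("refs after release:    %d\n", era.ref);
            return 0;
    }
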
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index b7dcd9f2b8a..b95c66ae4a2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -361,30 +361,88 @@ static int intel_pmu_drain_bts_buffer(void)
361/* 361/*
362 * PEBS 362 * PEBS
363 */ 363 */
364 364static struct event_constraint intel_core2_pebs_event_constraints[] = {
365static struct event_constraint intel_core_pebs_events[] = { 365 PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
366 PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
367 PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ 366 PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
368 PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ 367 PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
369 PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */ 368 PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
370 PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */ 369 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
371 PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ 370 EVENT_CONSTRAINT_END
372 PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */ 371};
373 PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ 372
374 PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */ 373static struct event_constraint intel_atom_pebs_event_constraints[] = {
374 PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
375 PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
376 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
375 EVENT_CONSTRAINT_END 377 EVENT_CONSTRAINT_END
376}; 378};
377 379
378static struct event_constraint intel_nehalem_pebs_events[] = { 380static struct event_constraint intel_nehalem_pebs_event_constraints[] = {
379 PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */ 381 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
380 PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */ 382 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
381 PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */ 383 PEBS_EVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
382 PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETIRED.ANY */ 384 INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */
383 PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */ 385 INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
384 PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ 386 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
385 PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */ 387 PEBS_EVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
386 PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ 388 INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
387 PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */ 389 PEBS_EVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
390 INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
391 INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
392 EVENT_CONSTRAINT_END
393};
394
395static struct event_constraint intel_westmere_pebs_event_constraints[] = {
396 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
397 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
398 PEBS_EVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
399 INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */
400 INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
401
402 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
403 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
404 INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
405 PEBS_EVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
406 INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
407 INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
408 EVENT_CONSTRAINT_END
409};
410
411static struct event_constraint intel_snb_pebs_events[] = {
412 PEBS_EVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
413 PEBS_EVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
414 PEBS_EVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
415 PEBS_EVENT_CONSTRAINT(0x01c4, 0xf), /* BR_INST_RETIRED.CONDITIONAL */
416 PEBS_EVENT_CONSTRAINT(0x02c4, 0xf), /* BR_INST_RETIRED.NEAR_CALL */
417 PEBS_EVENT_CONSTRAINT(0x04c4, 0xf), /* BR_INST_RETIRED.ALL_BRANCHES */
418 PEBS_EVENT_CONSTRAINT(0x08c4, 0xf), /* BR_INST_RETIRED.NEAR_RETURN */
419 PEBS_EVENT_CONSTRAINT(0x10c4, 0xf), /* BR_INST_RETIRED.NOT_TAKEN */
420 PEBS_EVENT_CONSTRAINT(0x20c4, 0xf), /* BR_INST_RETIRED.NEAR_TAKEN */
421 PEBS_EVENT_CONSTRAINT(0x40c4, 0xf), /* BR_INST_RETIRED.FAR_BRANCH */
422 PEBS_EVENT_CONSTRAINT(0x01c5, 0xf), /* BR_MISP_RETIRED.CONDITIONAL */
423 PEBS_EVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
424 PEBS_EVENT_CONSTRAINT(0x04c5, 0xf), /* BR_MISP_RETIRED.ALL_BRANCHES */
425 PEBS_EVENT_CONSTRAINT(0x10c5, 0xf), /* BR_MISP_RETIRED.NOT_TAKEN */
426 PEBS_EVENT_CONSTRAINT(0x20c5, 0xf), /* BR_MISP_RETIRED.TAKEN */
427 PEBS_EVENT_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
428 PEBS_EVENT_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORE */
429 PEBS_EVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */
430 PEBS_EVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */
431 PEBS_EVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */
432 PEBS_EVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */
433 PEBS_EVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */
434 PEBS_EVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */
435 PEBS_EVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */
436 PEBS_EVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */
437 PEBS_EVENT_CONSTRAINT(0x01d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L1_HIT */
438 PEBS_EVENT_CONSTRAINT(0x02d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L2_HIT */
439 PEBS_EVENT_CONSTRAINT(0x04d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.LLC_HIT */
440 PEBS_EVENT_CONSTRAINT(0x40d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.HIT_LFB */
441 PEBS_EVENT_CONSTRAINT(0x01d2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS */
442 PEBS_EVENT_CONSTRAINT(0x02d2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT */
443 PEBS_EVENT_CONSTRAINT(0x04d2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM */
444 PEBS_EVENT_CONSTRAINT(0x08d2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_NONE */
445 PEBS_EVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
388 EVENT_CONSTRAINT_END 446 EVENT_CONSTRAINT_END
389}; 447};
390 448
@@ -695,20 +753,17 @@ static void intel_ds_init(void)
695 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); 753 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
696 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); 754 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
697 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; 755 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
698 x86_pmu.pebs_constraints = intel_core_pebs_events;
699 break; 756 break;
700 757
701 case 1: 758 case 1:
702 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); 759 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
703 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); 760 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
704 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; 761 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
705 x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
706 break; 762 break;
707 763
708 default: 764 default:
709 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); 765 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
710 x86_pmu.pebs = 0; 766 x86_pmu.pebs = 0;
711 break;
712 } 767 }
713 } 768 }
714} 769}
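
A reading aid for the tables above: the first argument of PEBS_EVENT_CONSTRAINT() is the low 16 bits of the event config, event select in bits 0-7 and unit mask in bits 8-15, and the second argument is the bitmask of general-purpose counters the event may run on (0xf = counters 0-3, 0x8 = counter 3 only, as used for the load-latency events). A quick decoder:

    #include <stdio.h>

    static void decode(unsigned int code, unsigned int cntmask)
    {
            printf("event %#04x, umask %#04x -> counter mask %#x\n",
                   code & 0xff, (code >> 8) & 0xff, cntmask);
    }

    int main(void)
    {
            decode(0x20c8, 0xf); /* ITLB_MISS_RETIRED: event 0xc8, umask 0x20 */
            decode(0x01cd, 0x8); /* MEM_TRANS_RETIRED.LOAD_LATENCY: PMC3 only */
            return 0;
    }
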
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ff751a9f182..3769ac822f9 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -764,9 +764,9 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
764 u64 v; 764 u64 v;
765 765
766 /* an official way for overflow indication */ 766 /* an official way for overflow indication */
767 rdmsrl(hwc->config_base + hwc->idx, v); 767 rdmsrl(hwc->config_base, v);
768 if (v & P4_CCCR_OVF) { 768 if (v & P4_CCCR_OVF) {
769 wrmsrl(hwc->config_base + hwc->idx, v & ~P4_CCCR_OVF); 769 wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
770 return 1; 770 return 1;
771 } 771 }
772 772
@@ -815,7 +815,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
815 * state we need to clear P4_CCCR_OVF, otherwise interrupt get 815 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
816 * asserted again and again 816 * asserted again and again
817 */ 817 */
818 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 818 (void)checking_wrmsrl(hwc->config_base,
819 (u64)(p4_config_unpack_cccr(hwc->config)) & 819 (u64)(p4_config_unpack_cccr(hwc->config)) &
820 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); 820 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
821} 821}
@@ -885,7 +885,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
885 p4_pmu_enable_pebs(hwc->config); 885 p4_pmu_enable_pebs(hwc->config);
886 886
887 (void)checking_wrmsrl(escr_addr, escr_conf); 887 (void)checking_wrmsrl(escr_addr, escr_conf);
888 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 888 (void)checking_wrmsrl(hwc->config_base,
889 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); 889 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
890} 890}
891 891
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 34ba07be2cd..20c097e3386 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -68,7 +68,7 @@ p6_pmu_disable_event(struct perf_event *event)
68 if (cpuc->enabled) 68 if (cpuc->enabled)
69 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 69 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
70 70
71 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 71 (void)checking_wrmsrl(hwc->config_base, val);
72} 72}
73 73
74static void p6_pmu_enable_event(struct perf_event *event) 74static void p6_pmu_enable_event(struct perf_event *event)
@@ -81,7 +81,7 @@ static void p6_pmu_enable_event(struct perf_event *event)
81 if (cpuc->enabled) 81 if (cpuc->enabled)
82 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 82 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
83 83
84 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 84 (void)checking_wrmsrl(hwc->config_base, val);
85} 85}
86 86
87static __initconst const struct x86_pmu p6_pmu = { 87static __initconst const struct x86_pmu p6_pmu = {
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index d5a23661550..966512b2cac 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -46,6 +46,8 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
46 /* returns the bit offset of the performance counter register */ 46 /* returns the bit offset of the performance counter register */
47 switch (boot_cpu_data.x86_vendor) { 47 switch (boot_cpu_data.x86_vendor) {
48 case X86_VENDOR_AMD: 48 case X86_VENDOR_AMD:
49 if (msr >= MSR_F15H_PERF_CTR)
50 return (msr - MSR_F15H_PERF_CTR) >> 1;
49 return msr - MSR_K7_PERFCTR0; 51 return msr - MSR_K7_PERFCTR0;
50 case X86_VENDOR_INTEL: 52 case X86_VENDOR_INTEL:
51 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 53 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
@@ -70,6 +72,8 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
70 /* returns the bit offset of the event selection register */ 72 /* returns the bit offset of the event selection register */
71 switch (boot_cpu_data.x86_vendor) { 73 switch (boot_cpu_data.x86_vendor) {
72 case X86_VENDOR_AMD: 74 case X86_VENDOR_AMD:
75 if (msr >= MSR_F15H_PERF_CTL)
76 return (msr - MSR_F15H_PERF_CTL) >> 1;
73 return msr - MSR_K7_EVNTSEL0; 77 return msr - MSR_K7_EVNTSEL0;
74 case X86_VENDOR_INTEL: 78 case X86_VENDOR_INTEL:
75 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 79 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
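
The new F15H branches in the watchdog fall out of the interleaved MSR layout: CTL0, CTR0, CTL1, CTR1 and so on occupy consecutive addresses from 0xc0010200, so subtracting the base and halving recovers the counter index used as the reservation bit. A sketch of just that arithmetic:

    #include <stdio.h>

    #define MSR_F15H_PERF_CTL 0xc0010200
    #define MSR_F15H_PERF_CTR 0xc0010201

    /* F15H branch only; the legacy K7 fallback is elided */
    static unsigned int perfctr_msr_to_bit(unsigned int msr)
    {
            return (msr - MSR_F15H_PERF_CTR) >> 1;
    }

    int main(void)
    {
            int i;

            for (i = 0; i < 6; i++) {
                    unsigned int msr = MSR_F15H_PERF_CTR + 2 * i;
                    printf("CTR MSR %#x -> reservation bit %u\n",
                           msr, perfctr_msr_to_bit(msr));
            }
            return 0;
    }
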
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
new file mode 100644
index 00000000000..7a8cebc9ff2
--- /dev/null
+++ b/arch/x86/kernel/devicetree.c
@@ -0,0 +1,441 @@
1/*
2 * Architecture specific OF callbacks.
3 */
4#include <linux/bootmem.h>
5#include <linux/io.h>
6#include <linux/interrupt.h>
7#include <linux/list.h>
8#include <linux/of.h>
9#include <linux/of_fdt.h>
10#include <linux/of_address.h>
11#include <linux/of_platform.h>
12#include <linux/of_irq.h>
13#include <linux/slab.h>
14#include <linux/pci.h>
15#include <linux/of_pci.h>
16
17#include <asm/hpet.h>
18#include <asm/irq_controller.h>
19#include <asm/apic.h>
20#include <asm/pci_x86.h>
21
22__initdata u64 initial_dtb;
23char __initdata cmd_line[COMMAND_LINE_SIZE];
24static LIST_HEAD(irq_domains);
25static DEFINE_RAW_SPINLOCK(big_irq_lock);
26
27int __initdata of_ioapic;
28
29#ifdef CONFIG_X86_IO_APIC
30static void add_interrupt_host(struct irq_domain *ih)
31{
32 unsigned long flags;
33
34 raw_spin_lock_irqsave(&big_irq_lock, flags);
35 list_add(&ih->l, &irq_domains);
36 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
37}
38#endif
39
40static struct irq_domain *get_ih_from_node(struct device_node *controller)
41{
42 struct irq_domain *ih, *found = NULL;
43 unsigned long flags;
44
45 raw_spin_lock_irqsave(&big_irq_lock, flags);
46 list_for_each_entry(ih, &irq_domains, l) {
47 if (ih->controller == controller) {
48 found = ih;
49 break;
50 }
51 }
52 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
53 return found;
54}
55
56unsigned int irq_create_of_mapping(struct device_node *controller,
57 const u32 *intspec, unsigned int intsize)
58{
59 struct irq_domain *ih;
60 u32 virq, type;
61 int ret;
62
63 ih = get_ih_from_node(controller);
64 if (!ih)
65 return 0;
66 ret = ih->xlate(ih, intspec, intsize, &virq, &type);
67 if (ret)
68 return ret;
69 if (type == IRQ_TYPE_NONE)
70 return virq;
71 /* set the mask if it is different from current */
72 if (type != (irq_to_desc(virq)->status & IRQF_TRIGGER_MASK))
73 set_irq_type(virq, type);
74 return virq;
75}
76EXPORT_SYMBOL_GPL(irq_create_of_mapping);
77
78unsigned long pci_address_to_pio(phys_addr_t address)
79{
80 /*
81 * The ioport address can be directly used by inX / outX
82 */
83 BUG_ON(address >= (1 << 16));
84 return (unsigned long)address;
85}
86EXPORT_SYMBOL_GPL(pci_address_to_pio);
87
88void __init early_init_dt_scan_chosen_arch(unsigned long node)
89{
90 BUG();
91}
92
93void __init early_init_dt_add_memory_arch(u64 base, u64 size)
94{
95 BUG();
96}
97
98void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
99{
100 return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));
101}
102
103void __init add_dtb(u64 data)
104{
105 initial_dtb = data + offsetof(struct setup_data, data);
106}
107
108/*
109 * CE4100 ids. Will be moved to machine_device_initcall() once we have it.
110 */
111static struct of_device_id __initdata ce4100_ids[] = {
112 { .compatible = "intel,ce4100-cp", },
113 { .compatible = "isa", },
114 { .compatible = "pci", },
115 {},
116};
117
118static int __init add_bus_probe(void)
119{
120 if (!of_have_populated_dt())
121 return 0;
122
123 return of_platform_bus_probe(NULL, ce4100_ids, NULL);
124}
125module_init(add_bus_probe);
126
127#ifdef CONFIG_PCI
128static int x86_of_pci_irq_enable(struct pci_dev *dev)
129{
130 struct of_irq oirq;
131 u32 virq;
132 int ret;
133 u8 pin;
134
135 ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
136 if (ret)
137 return ret;
138 if (!pin)
139 return 0;
140
141 ret = of_irq_map_pci(dev, &oirq);
142 if (ret)
143 return ret;
144
145 virq = irq_create_of_mapping(oirq.controller, oirq.specifier,
146 oirq.size);
147 if (virq == 0)
148 return -EINVAL;
149 dev->irq = virq;
150 return 0;
151}
152
153static void x86_of_pci_irq_disable(struct pci_dev *dev)
154{
155}
156
157void __cpuinit x86_of_pci_init(void)
158{
159 struct device_node *np;
160
161 pcibios_enable_irq = x86_of_pci_irq_enable;
162 pcibios_disable_irq = x86_of_pci_irq_disable;
163
164 for_each_node_by_type(np, "pci") {
165 const void *prop;
166 struct pci_bus *bus;
167 unsigned int bus_min;
168 struct device_node *child;
169
170 prop = of_get_property(np, "bus-range", NULL);
171 if (!prop)
172 continue;
173 bus_min = be32_to_cpup(prop);
174
175 bus = pci_find_bus(0, bus_min);
176 if (!bus) {
177 printk(KERN_ERR "Can't find a node for bus %s.\n",
178 np->full_name);
179 continue;
180 }
181
182 if (bus->self)
183 bus->self->dev.of_node = np;
184 else
185 bus->dev.of_node = np;
186
187 for_each_child_of_node(np, child) {
188 struct pci_dev *dev;
189 u32 devfn;
190
191 prop = of_get_property(child, "reg", NULL);
192 if (!prop)
193 continue;
194
195 devfn = (be32_to_cpup(prop) >> 8) & 0xff;
196 dev = pci_get_slot(bus, devfn);
197 if (!dev)
198 continue;
199 dev->dev.of_node = child;
200 pci_dev_put(dev);
201 }
202 }
203}
204#endif
205
206static void __init dtb_setup_hpet(void)
207{
208#ifdef CONFIG_HPET_TIMER
209 struct device_node *dn;
210 struct resource r;
211 int ret;
212
213 dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet");
214 if (!dn)
215 return;
216 ret = of_address_to_resource(dn, 0, &r);
217 if (ret) {
218 WARN_ON(1);
219 return;
220 }
221 hpet_address = r.start;
222#endif
223}
224
225static void __init dtb_lapic_setup(void)
226{
227#ifdef CONFIG_X86_LOCAL_APIC
228 struct device_node *dn;
229 struct resource r;
230 int ret;
231
232 dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic");
233 if (!dn)
234 return;
235
236 ret = of_address_to_resource(dn, 0, &r);
237 if (WARN_ON(ret))
238 return;
239
240 /* Did the boot loader set up the local APIC? */
241 if (!cpu_has_apic) {
242 if (apic_force_enable(r.start))
243 return;
244 }
245 smp_found_config = 1;
246 pic_mode = 1;
247 register_lapic_address(r.start);
248 generic_processor_info(boot_cpu_physical_apicid,
249 GET_APIC_VERSION(apic_read(APIC_LVR)));
250#endif
251}
252
253#ifdef CONFIG_X86_IO_APIC
254static unsigned int ioapic_id;
255
256static void __init dtb_add_ioapic(struct device_node *dn)
257{
258 struct resource r;
259 int ret;
260
261 ret = of_address_to_resource(dn, 0, &r);
262 if (ret) {
263 printk(KERN_ERR "Can't obtain address from node %s.\n",
264 dn->full_name);
265 return;
266 }
267 mp_register_ioapic(++ioapic_id, r.start, gsi_top);
268}
269
270static void __init dtb_ioapic_setup(void)
271{
272 struct device_node *dn;
273
274 for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
275 dtb_add_ioapic(dn);
276
277 if (nr_ioapics) {
278 of_ioapic = 1;
279 return;
280 }
281 printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
282}
283#else
284static void __init dtb_ioapic_setup(void) {}
285#endif
286
287static void __init dtb_apic_setup(void)
288{
289 dtb_lapic_setup();
290 dtb_ioapic_setup();
291}
292
293#ifdef CONFIG_OF_FLATTREE
294static void __init x86_flattree_get_config(void)
295{
296 u32 size, map_len;
297 void *new_dtb;
298
299 if (!initial_dtb)
300 return;
301
302 map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK),
303 (u64)sizeof(struct boot_param_header));
304
305 initial_boot_params = early_memremap(initial_dtb, map_len);
306 size = be32_to_cpu(initial_boot_params->totalsize);
307 if (map_len < size) {
308 early_iounmap(initial_boot_params, map_len);
309 initial_boot_params = early_memremap(initial_dtb, size);
310 map_len = size;
311 }
312
313 new_dtb = alloc_bootmem(size);
314 memcpy(new_dtb, initial_boot_params, size);
315 early_iounmap(initial_boot_params, map_len);
316
317 initial_boot_params = new_dtb;
318
319 /* root level address cells */
320 of_scan_flat_dt(early_init_dt_scan_root, NULL);
321
322 unflatten_device_tree();
323}
324#else
325static inline void x86_flattree_get_config(void) { }
326#endif
327
328void __init x86_dtb_init(void)
329{
330 x86_flattree_get_config();
331
332 if (!of_have_populated_dt())
333 return;
334
335 dtb_setup_hpet();
336 dtb_apic_setup();
337}
338
339#ifdef CONFIG_X86_IO_APIC
340
341struct of_ioapic_type {
342 u32 out_type;
343 u32 trigger;
344 u32 polarity;
345};
346
347static struct of_ioapic_type of_ioapic_type[] =
348{
349 {
350 .out_type = IRQ_TYPE_EDGE_RISING,
351 .trigger = IOAPIC_EDGE,
352 .polarity = 1,
353 },
354 {
355 .out_type = IRQ_TYPE_LEVEL_LOW,
356 .trigger = IOAPIC_LEVEL,
357 .polarity = 0,
358 },
359 {
360 .out_type = IRQ_TYPE_LEVEL_HIGH,
361 .trigger = IOAPIC_LEVEL,
362 .polarity = 1,
363 },
364 {
365 .out_type = IRQ_TYPE_EDGE_FALLING,
366 .trigger = IOAPIC_EDGE,
367 .polarity = 0,
368 },
369};
370
371static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
372 u32 *out_hwirq, u32 *out_type)
373{
374 struct io_apic_irq_attr attr;
375 struct of_ioapic_type *it;
376 u32 line, idx, type;
377
378 if (intsize < 2)
379 return -EINVAL;
380
381 line = *intspec;
382 idx = (u32) id->priv;
383 *out_hwirq = line + mp_gsi_routing[idx].gsi_base;
384
385 intspec++;
386 type = *intspec;
387
388 if (type >= ARRAY_SIZE(of_ioapic_type))
389 return -EINVAL;
390
391 it = of_ioapic_type + type;
392 *out_type = it->out_type;
393
394 set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
395
396 return io_apic_setup_irq_pin(*out_hwirq, cpu_to_node(0), &attr);
397}
398
399static void __init ioapic_add_ofnode(struct device_node *np)
400{
401 struct resource r;
402 int i, ret;
403
404 ret = of_address_to_resource(np, 0, &r);
405 if (ret) {
406 printk(KERN_ERR "Failed to obtain address for %s\n",
407 np->full_name);
408 return;
409 }
410
411 for (i = 0; i < nr_ioapics; i++) {
412 if (r.start == mp_ioapics[i].apicaddr) {
413 struct irq_domain *id;
414
415 id = kzalloc(sizeof(*id), GFP_KERNEL);
416 BUG_ON(!id);
417 id->controller = np;
418 id->xlate = ioapic_xlate;
419 id->priv = (void *)i;
420 add_interrupt_host(id);
421 return;
422 }
423 }
424 printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name);
425}
426
427void __init x86_add_irq_domains(void)
428{
429 struct device_node *dp;
430
431 if (!of_have_populated_dt())
432 return;
433
434 for_each_node_with_property(dp, "interrupt-controller") {
435 if (of_device_is_compatible(dp, "intel,ce4100-ioapic"))
436 ioapic_add_ofnode(dp);
437 }
438}
439#else
440void __init x86_add_irq_domains(void) { }
441#endif
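
ioapic_xlate() above consumes a two-cell device-tree interrupt specifier: cell 0 is the IO-APIC pin, cell 1 indexes the of_ioapic_type table, and the hardware IRQ is the pin plus the controller's GSI base. A user-space model of the translation; the gsi_base value used in main() is made up for the demo:

    #include <stdio.h>

    static const char * const type_name[] = {
            "edge rising", "level low", "level high", "edge falling",
    };

    /* cell 0: IO-APIC pin, cell 1: index into of_ioapic_type */
    static int xlate(const unsigned int *intspec, unsigned int intsize,
                     unsigned int gsi_base, unsigned int *out_hwirq)
    {
            if (intsize < 2 || intspec[1] >= 4)
                    return -1;
            *out_hwirq = intspec[0] + gsi_base;
            printf("pin %u -> hwirq %u, trigger/polarity: %s\n",
                   intspec[0], *out_hwirq, type_name[intspec[1]]);
            return 0;
    }

    int main(void)
    {
            unsigned int spec[2] = { 3, 2 };   /* pin 3, level high */
            unsigned int hwirq;

            return xlate(spec, 2, 24, &hwirq); /* gsi_base 24 is made up */
    }
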
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index df20723a6a1..220a1c11cfd 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -320,31 +320,6 @@ void die(const char *str, struct pt_regs *regs, long err)
320 oops_end(flags, regs, sig); 320 oops_end(flags, regs, sig);
321} 321}
322 322
323void notrace __kprobes
324die_nmi(char *str, struct pt_regs *regs, int do_panic)
325{
326 unsigned long flags;
327
328 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
329 return;
330
331 /*
332 * We are in trouble anyway, lets at least try
333 * to get a message out.
334 */
335 flags = oops_begin();
336 printk(KERN_EMERG "%s", str);
337 printk(" on CPU%d, ip %08lx, registers:\n",
338 smp_processor_id(), regs->ip);
339 show_registers(regs);
340 oops_end(flags, regs, 0);
341 if (do_panic || panic_on_oops)
342 panic("Non maskable interrupt");
343 nmi_exit();
344 local_irq_enable();
345 do_exit(SIGBUS);
346}
347
348static int __init oops_setup(char *s) 323static int __init oops_setup(char *s)
349{ 324{
350 if (!s) 325 if (!s)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 294f26da0c0..cdf5bfd9d4d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -667,21 +667,15 @@ __init void e820_setup_gap(void)
667 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of 667 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
668 * linked list of struct setup_data, which is parsed here. 668 * linked list of struct setup_data, which is parsed here.
669 */ 669 */
670void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data) 670void __init parse_e820_ext(struct setup_data *sdata)
671{ 671{
672 u32 map_len;
673 int entries; 672 int entries;
674 struct e820entry *extmap; 673 struct e820entry *extmap;
675 674
676 entries = sdata->len / sizeof(struct e820entry); 675 entries = sdata->len / sizeof(struct e820entry);
677 map_len = sdata->len + sizeof(struct setup_data);
678 if (map_len > PAGE_SIZE)
679 sdata = early_ioremap(pa_data, map_len);
680 extmap = (struct e820entry *)(sdata->data); 676 extmap = (struct e820entry *)(sdata->data);
681 __append_e820_map(extmap, entries); 677 __append_e820_map(extmap, entries);
682 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 678 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
683 if (map_len > PAGE_SIZE)
684 early_iounmap(sdata, map_len);
685 printk(KERN_INFO "extended physical RAM map:\n"); 679 printk(KERN_INFO "extended physical RAM map:\n");
686 e820_print_map("extended"); 680 e820_print_map("extended");
687} 681}
@@ -847,15 +841,21 @@ static int __init parse_memopt(char *p)
847 if (!p) 841 if (!p)
848 return -EINVAL; 842 return -EINVAL;
849 843
850#ifdef CONFIG_X86_32
851 if (!strcmp(p, "nopentium")) { 844 if (!strcmp(p, "nopentium")) {
845#ifdef CONFIG_X86_32
852 setup_clear_cpu_cap(X86_FEATURE_PSE); 846 setup_clear_cpu_cap(X86_FEATURE_PSE);
853 return 0; 847 return 0;
854 } 848#else
849 printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n");
850 return -EINVAL;
855#endif 851#endif
852 }
856 853
857 userdef = 1; 854 userdef = 1;
858 mem_size = memparse(p, &p); 855 mem_size = memparse(p, &p);
856 /* don't remove all of memory when handling "mem={invalid}" param */
857 if (mem_size == 0)
858 return -EINVAL;
859 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); 859 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
860 860
861 return 0; 861 return 0;
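
The added mem_size == 0 check matters because memparse() yields 0 both for "mem=0" and for a string with no leading digits, and e820_remove_range(0, ULLONG_MAX, ...) would then trim away every byte of RAM. A rough userspace sketch of the failure mode, with a simplified strtoull-based stand-in for memparse():

#include <stdio.h>
#include <stdlib.h>

/* simplified memparse(): number with optional K/M/G suffix */
static unsigned long long memparse_lite(const char *s)
{
	char *end;
	unsigned long long v = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': v <<= 10; /* fall through */
	case 'M': v <<= 10; /* fall through */
	case 'K': v <<= 10; break;
	}
	return v;
}

int main(void)
{
	const char *args[] = { "512M", "invalid", "0" };

	for (int i = 0; i < 3; i++) {
		unsigned long long sz = memparse_lite(args[i]);

		if (sz == 0)	/* reject, or all of RAM would be removed */
			printf("mem=%s rejected\n", args[i]);
		else
			printf("mem=%s limits RAM to %llu bytes\n",
			       args[i], sz);
	}
	return 0;
}
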
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c8b4efad7eb..fa41f7298c8 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -65,6 +65,8 @@
65#define sysexit_audit syscall_exit_work 65#define sysexit_audit syscall_exit_work
66#endif 66#endif
67 67
68 .section .entry.text, "ax"
69
68/* 70/*
69 * We use macros for low-level operations which need to be overridden 71 * We use macros for low-level operations which need to be overridden
70 * for paravirtualization. The following will never clobber any registers: 72 * for paravirtualization. The following will never clobber any registers:
@@ -395,7 +397,7 @@ sysenter_past_esp:
395 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 397 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
396 * pushed above; +8 corresponds to copy_thread's esp0 setting. 398 * pushed above; +8 corresponds to copy_thread's esp0 setting.
397 */ 399 */
398 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE_asm+8+4*4)(%esp) 400 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
399 CFI_REL_OFFSET eip, 0 401 CFI_REL_OFFSET eip, 0
400 402
401 pushl_cfi %eax 403 pushl_cfi %eax
@@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
788 */ 790 */
789.section .init.rodata,"a" 791.section .init.rodata,"a"
790ENTRY(interrupt) 792ENTRY(interrupt)
791.text 793.section .entry.text, "ax"
792 .p2align 5 794 .p2align 5
793 .p2align CONFIG_X86_L1_CACHE_SHIFT 795 .p2align CONFIG_X86_L1_CACHE_SHIFT
794ENTRY(irq_entries_start) 796ENTRY(irq_entries_start)
@@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
807 .endif 809 .endif
808 .previous 810 .previous
809 .long 1b 811 .long 1b
810 .text 812 .section .entry.text, "ax"
811vector=vector+1 813vector=vector+1
812 .endif 814 .endif
813 .endr 815 .endr
@@ -1409,8 +1411,7 @@ END(general_protection)
1409#ifdef CONFIG_KVM_GUEST 1411#ifdef CONFIG_KVM_GUEST
1410ENTRY(async_page_fault) 1412ENTRY(async_page_fault)
1411 RING0_EC_FRAME 1413 RING0_EC_FRAME
1412 pushl $do_async_page_fault 1414 pushl_cfi $do_async_page_fault
1413 CFI_ADJUST_CFA_OFFSET 4
1414 jmp error_code 1415 jmp error_code
1415 CFI_ENDPROC 1416 CFI_ENDPROC
1416END(apf_page_fault) 1417END(apf_page_fault)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index aed1ffbeb0c..b72b4a6466a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -61,6 +61,8 @@
61#define __AUDIT_ARCH_LE 0x40000000 61#define __AUDIT_ARCH_LE 0x40000000
62 62
63 .code64 63 .code64
64 .section .entry.text, "ax"
65
64#ifdef CONFIG_FUNCTION_TRACER 66#ifdef CONFIG_FUNCTION_TRACER
65#ifdef CONFIG_DYNAMIC_FTRACE 67#ifdef CONFIG_DYNAMIC_FTRACE
66ENTRY(mcount) 68ENTRY(mcount)
@@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
744 */ 746 */
745 .section .init.rodata,"a" 747 .section .init.rodata,"a"
746ENTRY(interrupt) 748ENTRY(interrupt)
747 .text 749 .section .entry.text
748 .p2align 5 750 .p2align 5
749 .p2align CONFIG_X86_L1_CACHE_SHIFT 751 .p2align CONFIG_X86_L1_CACHE_SHIFT
750ENTRY(irq_entries_start) 752ENTRY(irq_entries_start)
@@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
763 .endif 765 .endif
764 .previous 766 .previous
765 .quad 1b 767 .quad 1b
766 .text 768 .section .entry.text
767vector=vector+1 769vector=vector+1
768 .endif 770 .endif
769 .endr 771 .endr
@@ -975,9 +977,12 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
975 x86_platform_ipi smp_x86_platform_ipi 977 x86_platform_ipi smp_x86_platform_ipi
976 978
977#ifdef CONFIG_SMP 979#ifdef CONFIG_SMP
978.irpc idx, "01234567" 980.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
981 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
982.if NUM_INVALIDATE_TLB_VECTORS > \idx
979apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ 983apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
980 invalidate_interrupt\idx smp_invalidate_interrupt 984 invalidate_interrupt\idx smp_invalidate_interrupt
985.endif
981.endr 986.endr
982#endif 987#endif
983 988
@@ -1248,7 +1253,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1248 decl PER_CPU_VAR(irq_count) 1253 decl PER_CPU_VAR(irq_count)
1249 jmp error_exit 1254 jmp error_exit
1250 CFI_ENDPROC 1255 CFI_ENDPROC
1251END(do_hypervisor_callback) 1256END(xen_do_hypervisor_callback)
1252 1257
1253/* 1258/*
1254 * Hypervisor uses this for application faults while it executes. 1259 * Hypervisor uses this for application faults while it executes.
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 382eb2936d4..a93742a5746 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -437,18 +437,19 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
437 return; 437 return;
438 } 438 }
439 439
440 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
441 frame_pointer) == -EBUSY) {
442 *parent = old;
443 return;
444 }
445
446 trace.func = self_addr; 440 trace.func = self_addr;
441 trace.depth = current->curr_ret_stack + 1;
447 442
448 /* Only trace if the calling function expects to */ 443 /* Only trace if the calling function expects to */
449 if (!ftrace_graph_entry(&trace)) { 444 if (!ftrace_graph_entry(&trace)) {
450 current->curr_ret_stack--;
451 *parent = old; 445 *parent = old;
446 return;
447 }
448
449 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
450 frame_pointer) == -EBUSY) {
451 *parent = old;
452 return;
452 } 453 }
453} 454}
454#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 455#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
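
The prepare_ftrace_return() reorder asks the graph tracer's entry filter before pushing a frame on the return stack, so a rejected entry no longer needs the old curr_ret_stack-- rollback. A sketch of the resulting flow, with filter() and push() as stand-ins for ftrace_graph_entry() and ftrace_push_return_trace():

#include <errno.h>
#include <stdio.h>

static int filter(unsigned long a) { return a != 0; }	/* stand-in */
static int push(unsigned long o, unsigned long s)	/* stand-in */
{
	(void)o; (void)s;
	return 0;
}

static int trace_entry(unsigned long *parent, unsigned long old,
		       unsigned long self)
{
	if (!filter(self)) {		/* ask first: nothing to undo */
		*parent = old;
		return -1;
	}
	if (push(old, self) == -EBUSY) {	/* commit only after the filter */
		*parent = old;
		return -1;
	}
	return 0;
}

int main(void)
{
	unsigned long parent = 0;

	printf("traced=%d\n", trace_entry(&parent, 0xc0de, 0xbeef) == 0);
	return 0;
}
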
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 767d6c43de3..ce0be7cd085 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -73,7 +73,7 @@ MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
73 */ 73 */
74KERNEL_PAGES = LOWMEM_PAGES 74KERNEL_PAGES = LOWMEM_PAGES
75 75
76INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm 76INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
77RESERVE_BRK(pagetables, INIT_MAP_SIZE) 77RESERVE_BRK(pagetables, INIT_MAP_SIZE)
78 78
79/* 79/*
@@ -137,7 +137,7 @@ ENTRY(startup_32)
137 movsl 137 movsl
1381: 1381:
139 139
140#ifdef CONFIG_OLPC_OPENFIRMWARE 140#ifdef CONFIG_OLPC
141 /* save OFW's pgdir table for later use when calling into OFW */ 141 /* save OFW's pgdir table for later use when calling into OFW */
142 movl %cr3, %eax 142 movl %cr3, %eax
143 movl %eax, pa(olpc_ofw_pgd) 143 movl %eax, pa(olpc_ofw_pgd)
@@ -623,7 +623,7 @@ ENTRY(initial_code)
623 * BSS section 623 * BSS section
624 */ 624 */
625__PAGE_ALIGNED_BSS 625__PAGE_ALIGNED_BSS
626 .align PAGE_SIZE_asm 626 .align PAGE_SIZE
627#ifdef CONFIG_X86_PAE 627#ifdef CONFIG_X86_PAE
628initial_pg_pmd: 628initial_pg_pmd:
629 .fill 1024*KPMDS,4,0 629 .fill 1024*KPMDS,4,0
@@ -644,7 +644,7 @@ ENTRY(swapper_pg_dir)
644#ifdef CONFIG_X86_PAE 644#ifdef CONFIG_X86_PAE
645__PAGE_ALIGNED_DATA 645__PAGE_ALIGNED_DATA
646 /* Page-aligned for the benefit of paravirt? */ 646 /* Page-aligned for the benefit of paravirt? */
647 .align PAGE_SIZE_asm 647 .align PAGE_SIZE
648ENTRY(initial_page_table) 648ENTRY(initial_page_table)
649 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ 649 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
650# if KPMDS == 3 650# if KPMDS == 3
@@ -662,7 +662,7 @@ ENTRY(initial_page_table)
662# else 662# else
663# error "Kernel PMDs should be 1, 2 or 3" 663# error "Kernel PMDs should be 1, 2 or 3"
664# endif 664# endif
665 .align PAGE_SIZE_asm /* needs to be page-sized too */ 665 .align PAGE_SIZE /* needs to be page-sized too */
666#endif 666#endif
667 667
668.data 668.data
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 4ff5968f12d..bfe8f729e08 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -503,7 +503,7 @@ static int hpet_assign_irq(struct hpet_dev *dev)
503 if (!irq) 503 if (!irq)
504 return -EINVAL; 504 return -EINVAL;
505 505
506 set_irq_data(irq, dev); 506 irq_set_handler_data(irq, dev);
507 507
508 if (hpet_setup_msi_irq(irq)) 508 if (hpet_setup_msi_irq(irq))
509 return -EINVAL; 509 return -EINVAL;
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 20757cb2efa..d9ca749c123 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -112,7 +112,7 @@ static void make_8259A_irq(unsigned int irq)
112{ 112{
113 disable_irq_nosync(irq); 113 disable_irq_nosync(irq);
114 io_apic_irqs &= ~(1<<irq); 114 io_apic_irqs &= ~(1<<irq);
115 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, 115 irq_set_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
116 i8259A_chip.name); 116 i8259A_chip.name);
117 enable_irq(irq); 117 enable_irq(irq);
118} 118}
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 8eec0ec59af..8c968974253 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,22 +14,9 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/thread_info.h> 15#include <linux/thread_info.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <linux/bitmap.h>
17#include <asm/syscalls.h> 18#include <asm/syscalls.h>
18 19
19/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
20static void set_bitmap(unsigned long *bitmap, unsigned int base,
21 unsigned int extent, int new_value)
22{
23 unsigned int i;
24
25 for (i = base; i < base + extent; i++) {
26 if (new_value)
27 __set_bit(i, bitmap);
28 else
29 __clear_bit(i, bitmap);
30 }
31}
32
33/* 20/*
34 * this changes the io permissions bitmap in the current task. 21 * this changes the io permissions bitmap in the current task.
35 */ 22 */
@@ -69,7 +56,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
69 */ 56 */
70 tss = &per_cpu(init_tss, get_cpu()); 57 tss = &per_cpu(init_tss, get_cpu());
71 58
72 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); 59 if (turn_on)
60 bitmap_clear(t->io_bitmap_ptr, from, num);
61 else
62 bitmap_set(t->io_bitmap_ptr, from, num);
73 63
74 /* 64 /*
75 * Search for a (possibly new) maximum. This is simple and stupid, 65 * Search for a (possibly new) maximum. This is simple and stupid,
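
bitmap_set() and bitmap_clear() from <linux/bitmap.h> replace the open-coded loop and can fill whole words at a time; the per-bit stand-in below only mirrors the semantics, including the inversion that turning port access *on* means *clearing* bits (a 0 bit in the I/O bitmap means "allowed"):

#include <stdio.h>
#include <string.h>
#include <limits.h>

#define BITS 1024
#define WORD_BITS (sizeof(unsigned long) * CHAR_BIT)
static unsigned long io_bitmap[BITS / WORD_BITS];

/* minimal per-bit stand-in for bitmap_set()/bitmap_clear() */
static void bitmap_assign(unsigned long *map, unsigned start,
			  unsigned len, int value)
{
	for (unsigned i = start; i < start + len; i++) {
		unsigned long mask = 1UL << (i % WORD_BITS);

		if (value)
			map[i / WORD_BITS] |= mask;
		else
			map[i / WORD_BITS] &= ~mask;
	}
}

int main(void)
{
	memset(io_bitmap, 0xff, sizeof(io_bitmap));	/* all ports denied */
	/* sys_ioperm(from=0x70, num=2, turn_on=1): clear those bits */
	bitmap_assign(io_bitmap, 0x70, 2, 0);
	printf("word covering port 0x70: %#lx\n",
	       io_bitmap[0x70 / WORD_BITS]);
	return 0;
}
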
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 387b6a0c9e8..948a31eae75 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -44,9 +44,9 @@ void ack_bad_irq(unsigned int irq)
44 44
45#define irq_stats(x) (&per_cpu(irq_stat, x)) 45#define irq_stats(x) (&per_cpu(irq_stat, x))
46/* 46/*
47 * /proc/interrupts printing: 47 * /proc/interrupts printing for arch specific interrupts
48 */ 48 */
49static int show_other_interrupts(struct seq_file *p, int prec) 49int arch_show_interrupts(struct seq_file *p, int prec)
50{ 50{
51 int j; 51 int j;
52 52
@@ -122,59 +122,6 @@ static int show_other_interrupts(struct seq_file *p, int prec)
122 return 0; 122 return 0;
123} 123}
124 124
125int show_interrupts(struct seq_file *p, void *v)
126{
127 unsigned long flags, any_count = 0;
128 int i = *(loff_t *) v, j, prec;
129 struct irqaction *action;
130 struct irq_desc *desc;
131
132 if (i > nr_irqs)
133 return 0;
134
135 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
136 j *= 10;
137
138 if (i == nr_irqs)
139 return show_other_interrupts(p, prec);
140
141 /* print header */
142 if (i == 0) {
143 seq_printf(p, "%*s", prec + 8, "");
144 for_each_online_cpu(j)
145 seq_printf(p, "CPU%-8d", j);
146 seq_putc(p, '\n');
147 }
148
149 desc = irq_to_desc(i);
150 if (!desc)
151 return 0;
152
153 raw_spin_lock_irqsave(&desc->lock, flags);
154 for_each_online_cpu(j)
155 any_count |= kstat_irqs_cpu(i, j);
156 action = desc->action;
157 if (!action && !any_count)
158 goto out;
159
160 seq_printf(p, "%*d: ", prec, i);
161 for_each_online_cpu(j)
162 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
163 seq_printf(p, " %8s", desc->irq_data.chip->name);
164 seq_printf(p, "-%-8s", desc->name);
165
166 if (action) {
167 seq_printf(p, " %s", action->name);
168 while ((action = action->next) != NULL)
169 seq_printf(p, ", %s", action->name);
170 }
171
172 seq_putc(p, '\n');
173out:
174 raw_spin_unlock_irqrestore(&desc->lock, flags);
175 return 0;
176}
177
178/* 125/*
179 * /proc/stat helpers 126 * /proc/stat helpers
180 */ 127 */
@@ -276,15 +223,6 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
276 223
277EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 224EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
278 225
279#ifdef CONFIG_OF
280unsigned int irq_create_of_mapping(struct device_node *controller,
281 const u32 *intspec, unsigned int intsize)
282{
283 return intspec[0];
284}
285EXPORT_SYMBOL_GPL(irq_create_of_mapping);
286#endif
287
288#ifdef CONFIG_HOTPLUG_CPU 226#ifdef CONFIG_HOTPLUG_CPU
289/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ 227/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
290void fixup_irqs(void) 228void fixup_irqs(void)
@@ -293,6 +231,7 @@ void fixup_irqs(void)
293 static int warned; 231 static int warned;
294 struct irq_desc *desc; 232 struct irq_desc *desc;
295 struct irq_data *data; 233 struct irq_data *data;
234 struct irq_chip *chip;
296 235
297 for_each_irq_desc(irq, desc) { 236 for_each_irq_desc(irq, desc) {
298 int break_affinity = 0; 237 int break_affinity = 0;
@@ -307,10 +246,10 @@ void fixup_irqs(void)
307 /* interrupt's are disabled at this point */ 246 /* interrupt's are disabled at this point */
308 raw_spin_lock(&desc->lock); 247 raw_spin_lock(&desc->lock);
309 248
310 data = &desc->irq_data; 249 data = irq_desc_get_irq_data(desc);
311 affinity = data->affinity; 250 affinity = data->affinity;
312 if (!irq_has_action(irq) || 251 if (!irq_has_action(irq) ||
313 cpumask_equal(affinity, cpu_online_mask)) { 252 cpumask_subset(affinity, cpu_online_mask)) {
314 raw_spin_unlock(&desc->lock); 253 raw_spin_unlock(&desc->lock);
315 continue; 254 continue;
316 } 255 }
@@ -327,16 +266,17 @@ void fixup_irqs(void)
327 affinity = cpu_all_mask; 266 affinity = cpu_all_mask;
328 } 267 }
329 268
330 if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask) 269 chip = irq_data_get_irq_chip(data);
331 data->chip->irq_mask(data); 270 if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
271 chip->irq_mask(data);
332 272
333 if (data->chip->irq_set_affinity) 273 if (chip->irq_set_affinity)
334 data->chip->irq_set_affinity(data, affinity, true); 274 chip->irq_set_affinity(data, affinity, true);
335 else if (!(warned++)) 275 else if (!(warned++))
336 set_affinity = 0; 276 set_affinity = 0;
337 277
338 if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask) 278 if (!irqd_can_move_in_process_context(data) && chip->irq_unmask)
339 data->chip->irq_unmask(data); 279 chip->irq_unmask(data);
340 280
341 raw_spin_unlock(&desc->lock); 281 raw_spin_unlock(&desc->lock);
342 282
@@ -368,10 +308,11 @@ void fixup_irqs(void)
368 irq = __this_cpu_read(vector_irq[vector]); 308 irq = __this_cpu_read(vector_irq[vector]);
369 309
370 desc = irq_to_desc(irq); 310 desc = irq_to_desc(irq);
371 data = &desc->irq_data; 311 data = irq_desc_get_irq_data(desc);
312 chip = irq_data_get_irq_chip(data);
372 raw_spin_lock(&desc->lock); 313 raw_spin_lock(&desc->lock);
373 if (data->chip->irq_retrigger) 314 if (chip->irq_retrigger)
374 data->chip->irq_retrigger(data); 315 chip->irq_retrigger(data);
375 raw_spin_unlock(&desc->lock); 316 raw_spin_unlock(&desc->lock);
376 } 317 }
377 } 318 }
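
In fixup_irqs(), switching cpumask_equal() to cpumask_subset() leaves an IRQ alone whenever its affinity mask is wholly contained in the surviving online set, not only when the two masks are identical. With single-word masks the distinction is just set inclusion versus equality:

#include <stdio.h>

int main(void)
{
	unsigned online   = 0x0b;	/* CPUs 0, 1, 3 online */
	unsigned affinity = 0x0a;	/* IRQ bound to CPUs 1 and 3 */

	int equal  = (affinity == online);
	int subset = ((affinity & ~online) == 0);

	/* equal=0 would have forced a pointless re-route; subset=1 skips it */
	printf("equal=%d subset=%d\n", equal, subset);
	return 0;
}
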
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index c752e973958..f470e4ef993 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -25,6 +25,7 @@
25#include <asm/setup.h> 25#include <asm/setup.h>
26#include <asm/i8259.h> 26#include <asm/i8259.h>
27#include <asm/traps.h> 27#include <asm/traps.h>
28#include <asm/prom.h>
28 29
29/* 30/*
30 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: 31 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
@@ -71,6 +72,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
71static struct irqaction fpu_irq = { 72static struct irqaction fpu_irq = {
72 .handler = math_error_irq, 73 .handler = math_error_irq,
73 .name = "fpu", 74 .name = "fpu",
75 .flags = IRQF_NO_THREAD,
74}; 76};
75#endif 77#endif
76 78
@@ -80,6 +82,7 @@ static struct irqaction fpu_irq = {
80static struct irqaction irq2 = { 82static struct irqaction irq2 = {
81 .handler = no_action, 83 .handler = no_action,
82 .name = "cascade", 84 .name = "cascade",
85 .flags = IRQF_NO_THREAD,
83}; 86};
84 87
85DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 88DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
@@ -110,7 +113,7 @@ void __init init_ISA_irqs(void)
110 legacy_pic->init(0); 113 legacy_pic->init(0);
111 114
112 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) 115 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
113 set_irq_chip_and_handler_name(i, chip, handle_level_irq, name); 116 irq_set_chip_and_handler_name(i, chip, handle_level_irq, name);
114} 117}
115 118
116void __init init_IRQ(void) 119void __init init_IRQ(void)
@@ -118,6 +121,12 @@ void __init init_IRQ(void)
118 int i; 121 int i;
119 122
120 /* 123 /*
124 * We probably need a better place for this, but it works for
125 * now ...
126 */
127 x86_add_irq_domains();
128
129 /*
121 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. 130 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
122 * If these IRQ's are handled by legacy interrupt-controllers like PIC, 131 * If these IRQ's are handled by legacy interrupt-controllers like PIC,
123 * then this configuration will likely be static after the boot. If 132 * then this configuration will likely be static after the boot. If
@@ -164,14 +173,77 @@ static void __init smp_intr_init(void)
164 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 173 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
165 174
166 /* IPIs for invalidation */ 175 /* IPIs for invalidation */
167 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); 176#define ALLOC_INVTLB_VEC(NR) \
168 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); 177 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \
169 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); 178 invalidate_interrupt##NR)
170 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); 179
171 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); 180 switch (NUM_INVALIDATE_TLB_VECTORS) {
172 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); 181 default:
173 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); 182 ALLOC_INVTLB_VEC(31);
174 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); 183 case 31:
184 ALLOC_INVTLB_VEC(30);
185 case 30:
186 ALLOC_INVTLB_VEC(29);
187 case 29:
188 ALLOC_INVTLB_VEC(28);
189 case 28:
190 ALLOC_INVTLB_VEC(27);
191 case 27:
192 ALLOC_INVTLB_VEC(26);
193 case 26:
194 ALLOC_INVTLB_VEC(25);
195 case 25:
196 ALLOC_INVTLB_VEC(24);
197 case 24:
198 ALLOC_INVTLB_VEC(23);
199 case 23:
200 ALLOC_INVTLB_VEC(22);
201 case 22:
202 ALLOC_INVTLB_VEC(21);
203 case 21:
204 ALLOC_INVTLB_VEC(20);
205 case 20:
206 ALLOC_INVTLB_VEC(19);
207 case 19:
208 ALLOC_INVTLB_VEC(18);
209 case 18:
210 ALLOC_INVTLB_VEC(17);
211 case 17:
212 ALLOC_INVTLB_VEC(16);
213 case 16:
214 ALLOC_INVTLB_VEC(15);
215 case 15:
216 ALLOC_INVTLB_VEC(14);
217 case 14:
218 ALLOC_INVTLB_VEC(13);
219 case 13:
220 ALLOC_INVTLB_VEC(12);
221 case 12:
222 ALLOC_INVTLB_VEC(11);
223 case 11:
224 ALLOC_INVTLB_VEC(10);
225 case 10:
226 ALLOC_INVTLB_VEC(9);
227 case 9:
228 ALLOC_INVTLB_VEC(8);
229 case 8:
230 ALLOC_INVTLB_VEC(7);
231 case 7:
232 ALLOC_INVTLB_VEC(6);
233 case 6:
234 ALLOC_INVTLB_VEC(5);
235 case 5:
236 ALLOC_INVTLB_VEC(4);
237 case 4:
238 ALLOC_INVTLB_VEC(3);
239 case 3:
240 ALLOC_INVTLB_VEC(2);
241 case 2:
242 ALLOC_INVTLB_VEC(1);
243 case 1:
244 ALLOC_INVTLB_VEC(0);
245 break;
246 }
175 247
176 /* IPI for generic function call */ 248 /* IPI for generic function call */
177 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 249 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
@@ -243,7 +315,7 @@ void __init native_init_IRQ(void)
243 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); 315 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
244 } 316 }
245 317
246 if (!acpi_ioapic) 318 if (!acpi_ioapic && !of_ioapic)
247 setup_irq(2, &irq2); 319 setup_irq(2, &irq2);
248 320
249#ifdef CONFIG_X86_32 321#ifdef CONFIG_X86_32
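
The long switch in smp_intr_init() is a fall-through ladder: entering at case NUM_INVALIDATE_TLB_VECTORS allocates gates NUM_INVALIDATE_TLB_VECTORS-1 down to 0 and nothing more. A compact runnable model of that control flow (NUM_VECTORS is a stand-in):

#include <stdio.h>

#define NUM_VECTORS 3	/* stand-in for NUM_INVALIDATE_TLB_VECTORS */

static void alloc_vec(int n) { printf("alloc vector %d\n", n); }

int main(void)
{
	/* deliberate fallthrough: entering at case NUM_VECTORS allocates
	 * vectors NUM_VECTORS-1 .. 0, mirroring the irqinit.c switch */
	switch (NUM_VECTORS) {
	default:
		alloc_vec(3);
	case 3:
		alloc_vec(2);
	case 2:
		alloc_vec(1);
	case 1:
		alloc_vec(0);
		break;
	}
	return 0;
}
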
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index a4130005028..7c64c420a9f 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -533,15 +533,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
533 } 533 }
534 return NOTIFY_DONE; 534 return NOTIFY_DONE;
535 535
536 case DIE_NMIWATCHDOG:
537 if (atomic_read(&kgdb_active) != -1) {
538 /* KGDB CPU roundup: */
539 kgdb_nmicallback(raw_smp_processor_id(), regs);
540 return NOTIFY_STOP;
541 }
542 /* Enter debugger: */
543 break;
544
545 case DIE_DEBUG: 536 case DIE_DEBUG:
546 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { 537 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
547 if (user_mode(regs)) 538 if (user_mode(regs))
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index d91c477b3f6..c969fd9d156 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1276,6 +1276,14 @@ static int __kprobes can_optimize(unsigned long paddr)
1276 if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) 1276 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
1277 return 0; 1277 return 0;
1278 1278
1279 /*
1280 * Do not optimize in the entry code due to the unstable
1281 * stack handling.
1282 */
1283 if ((paddr >= (unsigned long )__entry_text_start) &&
1284 (paddr < (unsigned long )__entry_text_end))
1285 return 0;
1286
1279 /* Check there is enough space for a relative jump. */ 1287 /* Check there is enough space for a relative jump. */
1280 if (size - offset < RELATIVEJUMP_SIZE) 1288 if (size - offset < RELATIVEJUMP_SIZE)
1281 return 0; 1289 return 0;
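
The can_optimize() check relies on the __entry_text_start/__entry_text_end markers that come from the ENTRY_TEXT output section this series adds to vmlinux.lds.S (see the linker-script hunk below). A fragment of the same range test, not standalone since the symbols are linker-provided:

/* fragment: bounds provided by the ENTRY_TEXT region in vmlinux.lds.S */
extern char __entry_text_start[], __entry_text_end[];

static int in_entry_text(unsigned long addr)
{
	/* entry code has an unstable stack layout, so the jump-based
	 * kprobe optimization must leave this range alone */
	return addr >= (unsigned long)__entry_text_start &&
	       addr <  (unsigned long)__entry_text_end;
}
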
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 0fe6d1a66c3..c5610384ab1 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,7 +66,6 @@ struct microcode_amd {
66 unsigned int mpb[0]; 66 unsigned int mpb[0];
67}; 67};
68 68
69#define UCODE_MAX_SIZE 2048
70#define UCODE_CONTAINER_SECTION_HDR 8 69#define UCODE_CONTAINER_SECTION_HDR 8
71#define UCODE_CONTAINER_HEADER_SIZE 12 70#define UCODE_CONTAINER_HEADER_SIZE 12
72 71
@@ -77,20 +76,20 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
77 struct cpuinfo_x86 *c = &cpu_data(cpu); 76 struct cpuinfo_x86 *c = &cpu_data(cpu);
78 u32 dummy; 77 u32 dummy;
79 78
80 memset(csig, 0, sizeof(*csig));
81 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { 79 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
82 pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " 80 pr_warning("CPU%d: family %d not supported\n", cpu, c->x86);
83 "supported\n", cpu, c->x86);
84 return -1; 81 return -1;
85 } 82 }
83
86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 84 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); 85 pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
86
88 return 0; 87 return 0;
89} 88}
90 89
91static int get_matching_microcode(int cpu, void *mc, int rev) 90static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
91 int rev)
92{ 92{
93 struct microcode_header_amd *mc_header = mc;
94 unsigned int current_cpu_id; 93 unsigned int current_cpu_id;
95 u16 equiv_cpu_id = 0; 94 u16 equiv_cpu_id = 0;
96 unsigned int i = 0; 95 unsigned int i = 0;
@@ -109,17 +108,17 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
109 if (!equiv_cpu_id) 108 if (!equiv_cpu_id)
110 return 0; 109 return 0;
111 110
112 if (mc_header->processor_rev_id != equiv_cpu_id) 111 if (mc_hdr->processor_rev_id != equiv_cpu_id)
113 return 0; 112 return 0;
114 113
115 /* ucode might be chipset specific -- currently we don't support this */ 114 /* ucode might be chipset specific -- currently we don't support this */
116 if (mc_header->nb_dev_id || mc_header->sb_dev_id) { 115 if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
117 pr_err("CPU%d: loading of chipset specific code not yet supported\n", 116 pr_err("CPU%d: chipset specific code not yet supported\n",
118 cpu); 117 cpu);
119 return 0; 118 return 0;
120 } 119 }
121 120
122 if (mc_header->patch_id <= rev) 121 if (mc_hdr->patch_id <= rev)
123 return 0; 122 return 0;
124 123
125 return 1; 124 return 1;
@@ -144,71 +143,93 @@ static int apply_microcode_amd(int cpu)
144 143
145 /* check current patch id and patch's id for match */ 144 /* check current patch id and patch's id for match */
146 if (rev != mc_amd->hdr.patch_id) { 145 if (rev != mc_amd->hdr.patch_id) {
147 pr_err("CPU%d: update failed (for patch_level=0x%x)\n", 146 pr_err("CPU%d: update failed for patch_level=0x%08x\n",
148 cpu, mc_amd->hdr.patch_id); 147 cpu, mc_amd->hdr.patch_id);
149 return -1; 148 return -1;
150 } 149 }
151 150
152 pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); 151 pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
153 uci->cpu_sig.rev = rev; 152 uci->cpu_sig.rev = rev;
154 153
155 return 0; 154 return 0;
156} 155}
157 156
158static void * 157static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
159get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
160{ 158{
161 unsigned int total_size; 159 struct cpuinfo_x86 *c = &cpu_data(cpu);
162 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 160 unsigned int max_size, actual_size;
163 void *mc; 161
162#define F1XH_MPB_MAX_SIZE 2048
163#define F14H_MPB_MAX_SIZE 1824
164#define F15H_MPB_MAX_SIZE 4096
165
166 switch (c->x86) {
167 case 0x14:
168 max_size = F14H_MPB_MAX_SIZE;
169 break;
170 case 0x15:
171 max_size = F15H_MPB_MAX_SIZE;
172 break;
173 default:
174 max_size = F1XH_MPB_MAX_SIZE;
175 break;
176 }
164 177
165 get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR); 178 actual_size = buf[4] + (buf[5] << 8);
166 179
167 if (section_hdr[0] != UCODE_UCODE_TYPE) { 180 if (actual_size > size || actual_size > max_size) {
168 pr_err("error: invalid type field in container file section header\n"); 181 pr_err("section size mismatch\n");
169 return NULL; 182 return 0;
170 } 183 }
171 184
172 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 185 return actual_size;
186}
173 187
174 if (total_size > size || total_size > UCODE_MAX_SIZE) { 188static struct microcode_header_amd *
175 pr_err("error: size mismatch\n"); 189get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
176 return NULL; 190{
191 struct microcode_header_amd *mc = NULL;
192 unsigned int actual_size = 0;
193
194 if (buf[0] != UCODE_UCODE_TYPE) {
195 pr_err("invalid type field in container file section header\n");
196 goto out;
177 } 197 }
178 198
179 mc = vzalloc(UCODE_MAX_SIZE); 199 actual_size = verify_ucode_size(cpu, buf, size);
200 if (!actual_size)
201 goto out;
202
203 mc = vzalloc(actual_size);
180 if (!mc) 204 if (!mc)
181 return NULL; 205 goto out;
182 206
183 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size); 207 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size);
184 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; 208 *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR;
185 209
210out:
186 return mc; 211 return mc;
187} 212}
188 213
189static int install_equiv_cpu_table(const u8 *buf) 214static int install_equiv_cpu_table(const u8 *buf)
190{ 215{
191 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; 216 unsigned int *ibuf = (unsigned int *)buf;
192 unsigned int *buf_pos = (unsigned int *)container_hdr; 217 unsigned int type = ibuf[1];
193 unsigned long size; 218 unsigned int size = ibuf[2];
194 219
195 get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE); 220 if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
196 221 pr_err("empty section/"
197 size = buf_pos[2]; 222 "invalid type field in container file section header\n");
198 223 return -EINVAL;
199 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
200 pr_err("error: invalid type field in container file section header\n");
201 return 0;
202 } 224 }
203 225
204 equiv_cpu_table = vmalloc(size); 226 equiv_cpu_table = vmalloc(size);
205 if (!equiv_cpu_table) { 227 if (!equiv_cpu_table) {
206 pr_err("failed to allocate equivalent CPU table\n"); 228 pr_err("failed to allocate equivalent CPU table\n");
207 return 0; 229 return -ENOMEM;
208 } 230 }
209 231
210 buf += UCODE_CONTAINER_HEADER_SIZE; 232 get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size);
211 get_ucode_data(equiv_cpu_table, buf, size);
212 233
213 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 234 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
214} 235}
@@ -223,16 +244,16 @@ static enum ucode_state
223generic_load_microcode(int cpu, const u8 *data, size_t size) 244generic_load_microcode(int cpu, const u8 *data, size_t size)
224{ 245{
225 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 246 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
247 struct microcode_header_amd *mc_hdr = NULL;
248 unsigned int mc_size, leftover;
249 int offset;
226 const u8 *ucode_ptr = data; 250 const u8 *ucode_ptr = data;
227 void *new_mc = NULL; 251 void *new_mc = NULL;
228 void *mc; 252 unsigned int new_rev = uci->cpu_sig.rev;
229 int new_rev = uci->cpu_sig.rev;
230 unsigned int leftover;
231 unsigned long offset;
232 enum ucode_state state = UCODE_OK; 253 enum ucode_state state = UCODE_OK;
233 254
234 offset = install_equiv_cpu_table(ucode_ptr); 255 offset = install_equiv_cpu_table(ucode_ptr);
235 if (!offset) { 256 if (offset < 0) {
236 pr_err("failed to create equivalent cpu table\n"); 257 pr_err("failed to create equivalent cpu table\n");
237 return UCODE_ERROR; 258 return UCODE_ERROR;
238 } 259 }
@@ -241,64 +262,65 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
241 leftover = size - offset; 262 leftover = size - offset;
242 263
243 while (leftover) { 264 while (leftover) {
244 unsigned int uninitialized_var(mc_size); 265 mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size);
245 struct microcode_header_amd *mc_header; 266 if (!mc_hdr)
246
247 mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
248 if (!mc)
249 break; 267 break;
250 268
251 mc_header = (struct microcode_header_amd *)mc; 269 if (get_matching_microcode(cpu, mc_hdr, new_rev)) {
252 if (get_matching_microcode(cpu, mc, new_rev)) {
253 vfree(new_mc); 270 vfree(new_mc);
254 new_rev = mc_header->patch_id; 271 new_rev = mc_hdr->patch_id;
255 new_mc = mc; 272 new_mc = mc_hdr;
256 } else 273 } else
257 vfree(mc); 274 vfree(mc_hdr);
258 275
259 ucode_ptr += mc_size; 276 ucode_ptr += mc_size;
260 leftover -= mc_size; 277 leftover -= mc_size;
261 } 278 }
262 279
263 if (new_mc) { 280 if (!new_mc) {
264 if (!leftover) {
265 vfree(uci->mc);
266 uci->mc = new_mc;
267 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
268 cpu, new_rev, uci->cpu_sig.rev);
269 } else {
270 vfree(new_mc);
271 state = UCODE_ERROR;
272 }
273 } else
274 state = UCODE_NFOUND; 281 state = UCODE_NFOUND;
282 goto free_table;
283 }
275 284
285 if (!leftover) {
286 vfree(uci->mc);
287 uci->mc = new_mc;
288 pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
289 cpu, uci->cpu_sig.rev, new_rev);
290 } else {
291 vfree(new_mc);
292 state = UCODE_ERROR;
293 }
294
295free_table:
276 free_equiv_cpu_table(); 296 free_equiv_cpu_table();
277 297
278 return state; 298 return state;
279} 299}
280 300
281static enum ucode_state request_microcode_fw(int cpu, struct device *device) 301static enum ucode_state request_microcode_amd(int cpu, struct device *device)
282{ 302{
283 const char *fw_name = "amd-ucode/microcode_amd.bin"; 303 const char *fw_name = "amd-ucode/microcode_amd.bin";
284 const struct firmware *firmware; 304 const struct firmware *fw;
285 enum ucode_state ret; 305 enum ucode_state ret = UCODE_NFOUND;
286 306
287 if (request_firmware(&firmware, fw_name, device)) { 307 if (request_firmware(&fw, fw_name, device)) {
288 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); 308 pr_err("failed to load file %s\n", fw_name);
289 return UCODE_NFOUND; 309 goto out;
290 } 310 }
291 311
292 if (*(u32 *)firmware->data != UCODE_MAGIC) { 312 ret = UCODE_ERROR;
293 pr_err("invalid UCODE_MAGIC (0x%08x)\n", 313 if (*(u32 *)fw->data != UCODE_MAGIC) {
294 *(u32 *)firmware->data); 314 pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
295 return UCODE_ERROR; 315 goto fw_release;
296 } 316 }
297 317
298 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 318 ret = generic_load_microcode(cpu, fw->data, fw->size);
299 319
300 release_firmware(firmware); 320fw_release:
321 release_firmware(fw);
301 322
323out:
302 return ret; 324 return ret;
303} 325}
304 326
@@ -319,7 +341,7 @@ static void microcode_fini_cpu_amd(int cpu)
319 341
320static struct microcode_ops microcode_amd_ops = { 342static struct microcode_ops microcode_amd_ops = {
321 .request_microcode_user = request_microcode_user, 343 .request_microcode_user = request_microcode_user,
322 .request_microcode_fw = request_microcode_fw, 344 .request_microcode_fw = request_microcode_amd,
323 .collect_cpu_info = collect_cpu_info_amd, 345 .collect_cpu_info = collect_cpu_info_amd,
324 .apply_microcode = apply_microcode_amd, 346 .apply_microcode = apply_microcode_amd,
325 .microcode_fini_cpu = microcode_fini_cpu_amd, 347 .microcode_fini_cpu = microcode_fini_cpu_amd,
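
verify_ucode_size() reads the patch size from the two little-endian bytes at offsets 4 and 5 of the section header and bounds it by both the remaining buffer and a per-family maximum (the F1XH/F14H/F15H caps from the hunk above). A runnable sketch of the parse:

#include <stdio.h>

/* per-family caps, mirroring F1XH/F14H/F15H_MPB_MAX_SIZE above */
static unsigned max_for_family(unsigned family)
{
	switch (family) {
	case 0x14: return 1824;
	case 0x15: return 4096;
	default:   return 2048;
	}
}

static unsigned verify_size(unsigned family, const unsigned char *buf,
			    unsigned left)
{
	unsigned actual = buf[4] | (buf[5] << 8);	/* little-endian u16 */

	if (actual > left || actual > max_for_family(family))
		return 0;	/* reject: would read past the buffer/cap */
	return actual;
}

int main(void)
{
	unsigned char hdr[8] = { 1, 0, 0, 0, 0x00, 0x08, 0, 0 }; /* 2048 */

	printf("f15h: %u\n", verify_size(0x15, hdr, 4096));
	printf("f14h: %u\n", verify_size(0x14, hdr, 4096)); /* 0: over cap */
	return 0;
}
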
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 1cca374a2ba..87af68e0e1e 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -417,8 +417,10 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
417 if (err) 417 if (err)
418 return err; 418 return err;
419 419
420 if (microcode_init_cpu(cpu) == UCODE_ERROR) 420 if (microcode_init_cpu(cpu) == UCODE_ERROR) {
421 err = -EINVAL; 421 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
422 return -EINVAL;
423 }
422 424
423 return err; 425 return err;
424} 426}
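
The mc_sysdev_add() fix unwinds the sysfs group created a few lines earlier, keeping the usual rule that an error return leaves no half-registered state behind. A toy model of the unwind pattern, with stand-ins for sysfs_create_group()/sysfs_remove_group()/microcode_init_cpu():

#include <errno.h>

static int create_group(void) { return 0; }	/* stand-ins */
static void remove_group(void) { }
static int init_cpu_failed(void) { return 1; }

static int sysdev_add_example(void)
{
	int err = create_group();	/* step one */

	if (err)
		return err;

	if (init_cpu_failed()) {	/* step two failed... */
		remove_group();		/* ...so undo step one */
		return -EINVAL;
	}
	return 0;
}

int main(void) { return sysdev_add_example() ? 1 : 0; }
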
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ff455419898..99fa3adf014 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -110,12 +110,9 @@ void show_regs_common(void)
110 init_utsname()->release, 110 init_utsname()->release,
111 (int)strcspn(init_utsname()->version, " "), 111 (int)strcspn(init_utsname()->version, " "),
112 init_utsname()->version); 112 init_utsname()->version);
113 printk(KERN_CONT " "); 113 printk(KERN_CONT " %s %s", vendor, product);
114 printk(KERN_CONT "%s %s", vendor, product); 114 if (board)
115 if (board) { 115 printk(KERN_CONT "/%s", board);
116 printk(KERN_CONT "/");
117 printk(KERN_CONT "%s", board);
118 }
119 printk(KERN_CONT "\n"); 116 printk(KERN_CONT "\n");
120} 117}
121 118
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 6f39cab052d..3f2ad2640d8 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -6,6 +6,7 @@
6#include <linux/acpi.h> 6#include <linux/acpi.h>
7#include <linux/bcd.h> 7#include <linux/bcd.h>
8#include <linux/pnp.h> 8#include <linux/pnp.h>
9#include <linux/of.h>
9 10
10#include <asm/vsyscall.h> 11#include <asm/vsyscall.h>
11#include <asm/x86_init.h> 12#include <asm/x86_init.h>
@@ -236,6 +237,8 @@ static __init int add_rtc_cmos(void)
236 } 237 }
237 } 238 }
238#endif 239#endif
240 if (of_have_populated_dt())
241 return 0;
239 242
240 platform_device_register(&rtc_device); 243 platform_device_register(&rtc_device);
241 dev_info(&rtc_device.dev, 244 dev_info(&rtc_device.dev,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d3cfe26c025..b176f2b1f45 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -113,6 +113,7 @@
113#endif 113#endif
114#include <asm/mce.h> 114#include <asm/mce.h>
115#include <asm/alternative.h> 115#include <asm/alternative.h>
116#include <asm/prom.h>
116 117
117/* 118/*
118 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 119 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -293,10 +294,32 @@ static void __init init_gbpages(void)
293 else 294 else
294 direct_gbpages = 0; 295 direct_gbpages = 0;
295} 296}
297
298static void __init cleanup_highmap_brk_end(void)
299{
300 pud_t *pud;
301 pmd_t *pmd;
302
303 mmu_cr4_features = read_cr4();
304
305 /*
306 * _brk_end cannot change anymore, but it and _end may be
307 * located on different 2M pages. cleanup_highmap(), however,
308 * can only consider _end when it runs, so destroy any
309 * mappings beyond _brk_end here.
310 */
311 pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
312 pmd = pmd_offset(pud, _brk_end - 1);
313 while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
314 pmd_clear(pmd);
315}
296#else 316#else
297static inline void init_gbpages(void) 317static inline void init_gbpages(void)
298{ 318{
299} 319}
320static inline void cleanup_highmap_brk_end(void)
321{
322}
300#endif 323#endif
301 324
302static void __init reserve_brk(void) 325static void __init reserve_brk(void)
@@ -307,6 +330,8 @@ static void __init reserve_brk(void)
307 /* Mark brk area as locked down and no longer taking any 330 /* Mark brk area as locked down and no longer taking any
308 new allocations */ 331 new allocations */
309 _brk_start = 0; 332 _brk_start = 0;
333
334 cleanup_highmap_brk_end();
310} 335}
311 336
312#ifdef CONFIG_BLK_DEV_INITRD 337#ifdef CONFIG_BLK_DEV_INITRD
@@ -429,16 +454,30 @@ static void __init parse_setup_data(void)
429 return; 454 return;
430 pa_data = boot_params.hdr.setup_data; 455 pa_data = boot_params.hdr.setup_data;
431 while (pa_data) { 456 while (pa_data) {
432 data = early_memremap(pa_data, PAGE_SIZE); 457 u32 data_len, map_len;
458
459 map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
460 (u64)sizeof(struct setup_data));
461 data = early_memremap(pa_data, map_len);
462 data_len = data->len + sizeof(struct setup_data);
463 if (data_len > map_len) {
464 early_iounmap(data, map_len);
465 data = early_memremap(pa_data, data_len);
466 map_len = data_len;
467 }
468
433 switch (data->type) { 469 switch (data->type) {
434 case SETUP_E820_EXT: 470 case SETUP_E820_EXT:
435 parse_e820_ext(data, pa_data); 471 parse_e820_ext(data);
472 break;
473 case SETUP_DTB:
474 add_dtb(pa_data);
436 break; 475 break;
437 default: 476 default:
438 break; 477 break;
439 } 478 }
440 pa_data = data->next; 479 pa_data = data->next;
441 early_iounmap(data, PAGE_SIZE); 480 early_iounmap(data, map_len);
442 } 481 }
443} 482}
444 483
@@ -680,15 +719,6 @@ static int __init parse_reservelow(char *p)
680 719
681early_param("reservelow", parse_reservelow); 720early_param("reservelow", parse_reservelow);
682 721
683static u64 __init get_max_mapped(void)
684{
685 u64 end = max_pfn_mapped;
686
687 end <<= PAGE_SHIFT;
688
689 return end;
690}
691
692/* 722/*
693 * Determine if we were loaded by an EFI loader. If so, then we have also been 723 * Determine if we were loaded by an EFI loader. If so, then we have also been
694 * passed the efi memmap, systab, etc., so we should use these data structures 724 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -704,8 +734,6 @@ static u64 __init get_max_mapped(void)
704 734
705void __init setup_arch(char **cmdline_p) 735void __init setup_arch(char **cmdline_p)
706{ 736{
707 int acpi = 0;
708 int amd = 0;
709 unsigned long flags; 737 unsigned long flags;
710 738
711#ifdef CONFIG_X86_32 739#ifdef CONFIG_X86_32
@@ -984,19 +1012,7 @@ void __init setup_arch(char **cmdline_p)
984 1012
985 early_acpi_boot_init(); 1013 early_acpi_boot_init();
986 1014
987#ifdef CONFIG_ACPI_NUMA 1015 initmem_init();
988 /*
989 * Parse SRAT to discover nodes.
990 */
991 acpi = acpi_numa_init();
992#endif
993
994#ifdef CONFIG_AMD_NUMA
995 if (!acpi)
996 amd = !amd_numa_init(0, max_pfn);
997#endif
998
999 initmem_init(0, max_pfn, acpi, amd);
1000 memblock_find_dma_reserve(); 1016 memblock_find_dma_reserve();
1001 dma32_reserve_bootmem(); 1017 dma32_reserve_bootmem();
1002 1018
@@ -1029,8 +1045,8 @@ void __init setup_arch(char **cmdline_p)
1029 * Read APIC and some other early information from ACPI tables. 1045 * Read APIC and some other early information from ACPI tables.
1030 */ 1046 */
1031 acpi_boot_init(); 1047 acpi_boot_init();
1032
1033 sfi_init(); 1048 sfi_init();
1049 x86_dtb_init();
1034 1050
1035 /* 1051 /*
1036 * get boot-time SMP configuration: 1052 * get boot-time SMP configuration:
@@ -1040,9 +1056,7 @@ void __init setup_arch(char **cmdline_p)
1040 1056
1041 prefill_possible_map(); 1057 prefill_possible_map();
1042 1058
1043#ifdef CONFIG_X86_64
1044 init_cpu_to_node(); 1059 init_cpu_to_node();
1045#endif
1046 1060
1047 init_apic_mappings(); 1061 init_apic_mappings();
1048 ioapic_and_gsi_init(); 1062 ioapic_and_gsi_init();
@@ -1066,6 +1080,8 @@ void __init setup_arch(char **cmdline_p)
1066#endif 1080#endif
1067 x86_init.oem.banner(); 1081 x86_init.oem.banner();
1068 1082
1083 x86_init.timers.wallclock_init();
1084
1069 mcheck_init(); 1085 mcheck_init();
1070 1086
1071 local_irq_save(flags); 1087 local_irq_save(flags);
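
parse_setup_data() now maps conservatively first, just enough to read the setup_data header without assuming more than the remainder of the page pa_data points into, then remaps with the real length once data->len is known. A sketch of the two-pass shape; map()/unmap() stand in for early_memremap()/early_iounmap(), and struct setup_data is reduced to the one field used:

/* fragment: map/unmap are hypothetical stand-ins for early_memremap/
 * early_iounmap */
struct setup_data { unsigned long len; };
extern void *map(unsigned long pa, unsigned long len);
extern void unmap(void *p, unsigned long len);

static void *map_setup_data(unsigned long pa, unsigned long page_size,
			    unsigned long *out_len)
{
	unsigned long map_len, data_len;
	struct setup_data *data;

	/* pass 1: at least the header, otherwise only up to the end of
	 * the page that pa sits in */
	map_len = page_size - (pa & (page_size - 1));
	if (map_len < sizeof(*data))
		map_len = sizeof(*data);
	data = map(pa, map_len);

	/* pass 2: remap larger once the payload length is known */
	data_len = data->len + sizeof(*data);
	if (data_len > map_len) {
		unmap(data, map_len);
		data = map(pa, data_len);
		map_len = data_len;
	}
	*out_len = map_len;
	return data;
}
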
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 002b79685f7..71f4727da37 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -225,10 +225,15 @@ void __init setup_per_cpu_areas(void)
225 per_cpu(x86_bios_cpu_apicid, cpu) = 225 per_cpu(x86_bios_cpu_apicid, cpu) =
226 early_per_cpu_map(x86_bios_cpu_apicid, cpu); 226 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
227#endif 227#endif
228#ifdef CONFIG_X86_32
229 per_cpu(x86_cpu_to_logical_apicid, cpu) =
230 early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
231#endif
228#ifdef CONFIG_X86_64 232#ifdef CONFIG_X86_64
229 per_cpu(irq_stack_ptr, cpu) = 233 per_cpu(irq_stack_ptr, cpu) =
230 per_cpu(irq_stack_union.irq_stack, cpu) + 234 per_cpu(irq_stack_union.irq_stack, cpu) +
231 IRQ_STACK_SIZE - 64; 235 IRQ_STACK_SIZE - 64;
236#endif
232#ifdef CONFIG_NUMA 237#ifdef CONFIG_NUMA
233 per_cpu(x86_cpu_to_node_map, cpu) = 238 per_cpu(x86_cpu_to_node_map, cpu) =
234 early_per_cpu_map(x86_cpu_to_node_map, cpu); 239 early_per_cpu_map(x86_cpu_to_node_map, cpu);
@@ -242,7 +247,6 @@ void __init setup_per_cpu_areas(void)
242 */ 247 */
243 set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); 248 set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
244#endif 249#endif
245#endif
246 /* 250 /*
247 * Up to this point, the boot CPU has been using .init.data 251 * Up to this point, the boot CPU has been using .init.data
248 * area. Reload any changed state for the boot CPU. 252 * area. Reload any changed state for the boot CPU.
@@ -256,7 +260,10 @@ void __init setup_per_cpu_areas(void)
256 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; 260 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
257 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; 261 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
258#endif 262#endif
259#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) 263#ifdef CONFIG_X86_32
264 early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
265#endif
266#ifdef CONFIG_NUMA
260 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 267 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
261#endif 268#endif
262 269
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 1bfb1c615a6..e9efdfd51c8 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -64,6 +64,7 @@
64#include <asm/mtrr.h> 64#include <asm/mtrr.h>
65#include <asm/mwait.h> 65#include <asm/mwait.h>
66#include <asm/apic.h> 66#include <asm/apic.h>
67#include <asm/io_apic.h>
67#include <asm/setup.h> 68#include <asm/setup.h>
68#include <asm/uv/uv.h> 69#include <asm/uv/uv.h>
69#include <linux/mc146818rtc.h> 70#include <linux/mc146818rtc.h>
@@ -71,10 +72,6 @@
71#include <asm/smpboot_hooks.h> 72#include <asm/smpboot_hooks.h>
72#include <asm/i8259.h> 73#include <asm/i8259.h>
73 74
74#ifdef CONFIG_X86_32
75u8 apicid_2_node[MAX_APICID];
76#endif
77
78/* State of each CPU */ 75/* State of each CPU */
79DEFINE_PER_CPU(int, cpu_state) = { 0 }; 76DEFINE_PER_CPU(int, cpu_state) = { 0 };
80 77
@@ -130,68 +127,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
130DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); 127DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
131EXPORT_PER_CPU_SYMBOL(cpu_core_map); 128EXPORT_PER_CPU_SYMBOL(cpu_core_map);
132 129
130DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
131
133/* Per CPU bogomips and other parameters */ 132/* Per CPU bogomips and other parameters */
134DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 133DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
135EXPORT_PER_CPU_SYMBOL(cpu_info); 134EXPORT_PER_CPU_SYMBOL(cpu_info);
136 135
137atomic_t init_deasserted; 136atomic_t init_deasserted;
138 137
139#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
140/* which node each logical CPU is on */
141int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
142EXPORT_SYMBOL(cpu_to_node_map);
143
144/* set up a mapping between cpu and node. */
145static void map_cpu_to_node(int cpu, int node)
146{
147 printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
148 cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
149 cpu_to_node_map[cpu] = node;
150}
151
152/* undo a mapping between cpu and node. */
153static void unmap_cpu_to_node(int cpu)
154{
155 int node;
156
157 printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
158 for (node = 0; node < MAX_NUMNODES; node++)
159 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
160 cpu_to_node_map[cpu] = 0;
161}
162#else /* !(CONFIG_NUMA && CONFIG_X86_32) */
163#define map_cpu_to_node(cpu, node) ({})
164#define unmap_cpu_to_node(cpu) ({})
165#endif
166
167#ifdef CONFIG_X86_32
168static int boot_cpu_logical_apicid;
169
170u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
171 { [0 ... NR_CPUS-1] = BAD_APICID };
172
173static void map_cpu_to_logical_apicid(void)
174{
175 int cpu = smp_processor_id();
176 int apicid = logical_smp_processor_id();
177 int node = apic->apicid_to_node(apicid);
178
179 if (!node_online(node))
180 node = first_online_node;
181
182 cpu_2_logical_apicid[cpu] = apicid;
183 map_cpu_to_node(cpu, node);
184}
185
186void numa_remove_cpu(int cpu)
187{
188 cpu_2_logical_apicid[cpu] = BAD_APICID;
189 unmap_cpu_to_node(cpu);
190}
191#else
192#define map_cpu_to_logical_apicid() do {} while (0)
193#endif
194
195/* 138/*
196 * Report back to the Boot Processor. 139 * Report back to the Boot Processor.
197 * Running on AP. 140 * Running on AP.
@@ -259,7 +202,6 @@ static void __cpuinit smp_callin(void)
259 apic->smp_callin_clear_local_apic(); 202 apic->smp_callin_clear_local_apic();
260 setup_local_APIC(); 203 setup_local_APIC();
261 end_local_APIC_setup(); 204 end_local_APIC_setup();
262 map_cpu_to_logical_apicid();
263 205
264 /* 206 /*
265 * Need to setup vector mappings before we enable interrupts. 207 * Need to setup vector mappings before we enable interrupts.
@@ -355,23 +297,6 @@ notrace static void __cpuinit start_secondary(void *unused)
355 cpu_idle(); 297 cpu_idle();
356} 298}
357 299
358#ifdef CONFIG_CPUMASK_OFFSTACK
359/* In this case, llc_shared_map is a pointer to a cpumask. */
360static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
361 const struct cpuinfo_x86 *src)
362{
363 struct cpumask *llc = dst->llc_shared_map;
364 *dst = *src;
365 dst->llc_shared_map = llc;
366}
367#else
368static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
369 const struct cpuinfo_x86 *src)
370{
371 *dst = *src;
372}
373#endif /* CONFIG_CPUMASK_OFFSTACK */
374
375/* 300/*
376 * The bootstrap kernel entry code has set these up. Save them for 301 * The bootstrap kernel entry code has set these up. Save them for
377 * a given CPU 302 * a given CPU
@@ -381,7 +306,7 @@ void __cpuinit smp_store_cpu_info(int id)
381{ 306{
382 struct cpuinfo_x86 *c = &cpu_data(id); 307 struct cpuinfo_x86 *c = &cpu_data(id);
383 308
384 copy_cpuinfo_x86(c, &boot_cpu_data); 309 *c = boot_cpu_data;
385 c->cpu_index = id; 310 c->cpu_index = id;
386 if (id != 0) 311 if (id != 0)
387 identify_secondary_cpu(c); 312 identify_secondary_cpu(c);
@@ -389,15 +314,12 @@ void __cpuinit smp_store_cpu_info(int id)
389 314
390static void __cpuinit link_thread_siblings(int cpu1, int cpu2) 315static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
391{ 316{
392 struct cpuinfo_x86 *c1 = &cpu_data(cpu1);
393 struct cpuinfo_x86 *c2 = &cpu_data(cpu2);
394
395 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); 317 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
396 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); 318 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
397 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); 319 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
398 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); 320 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
399 cpumask_set_cpu(cpu1, c2->llc_shared_map); 321 cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
400 cpumask_set_cpu(cpu2, c1->llc_shared_map); 322 cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
401} 323}
402 324
403 325
@@ -426,7 +348,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
426 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); 348 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
427 } 349 }
428 350
429 cpumask_set_cpu(cpu, c->llc_shared_map); 351 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
430 352
431 if (__this_cpu_read(cpu_info.x86_max_cores) == 1) { 353 if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
432 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); 354 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
@@ -437,8 +359,8 @@ void __cpuinit set_cpu_sibling_map(int cpu)
437 for_each_cpu(i, cpu_sibling_setup_mask) { 359 for_each_cpu(i, cpu_sibling_setup_mask) {
438 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 360 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
439 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 361 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
440 cpumask_set_cpu(i, c->llc_shared_map); 362 cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
441 cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map); 363 cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
442 } 364 }
443 if (c->phys_proc_id == cpu_data(i).phys_proc_id) { 365 if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
444 cpumask_set_cpu(i, cpu_core_mask(cpu)); 366 cpumask_set_cpu(i, cpu_core_mask(cpu));
@@ -477,7 +399,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
477 !(cpu_has(c, X86_FEATURE_AMD_DCM))) 399 !(cpu_has(c, X86_FEATURE_AMD_DCM)))
478 return cpu_core_mask(cpu); 400 return cpu_core_mask(cpu);
479 else 401 else
480 return c->llc_shared_map; 402 return cpu_llc_shared_mask(cpu);
481} 403}
482 404
483static void impress_friends(void) 405static void impress_friends(void)
@@ -946,6 +868,14 @@ int __cpuinit native_cpu_up(unsigned int cpu)
946 return 0; 868 return 0;
947} 869}
948 870
871/**
872 * arch_disable_smp_support() - disables SMP support for x86 at runtime
873 */
874void arch_disable_smp_support(void)
875{
876 disable_ioapic_support();
877}
878
949/* 879/*
950 * Fall back to non SMP mode after errors. 880 * Fall back to non SMP mode after errors.
951 * 881 *
@@ -961,7 +891,6 @@ static __init void disable_smp(void)
961 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 891 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
962 else 892 else
963 physid_set_mask_of_physid(0, &phys_cpu_present_map); 893 physid_set_mask_of_physid(0, &phys_cpu_present_map);
964 map_cpu_to_logical_apicid();
965 cpumask_set_cpu(0, cpu_sibling_mask(0)); 894 cpumask_set_cpu(0, cpu_sibling_mask(0));
966 cpumask_set_cpu(0, cpu_core_mask(0)); 895 cpumask_set_cpu(0, cpu_core_mask(0));
967} 896}
@@ -1046,7 +975,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1046 "(tell your hw vendor)\n"); 975 "(tell your hw vendor)\n");
1047 } 976 }
1048 smpboot_clear_io_apic(); 977 smpboot_clear_io_apic();
1049 arch_disable_smp_support(); 978 disable_ioapic_support();
1050 return -1; 979 return -1;
1051 } 980 }
1052 981
@@ -1090,21 +1019,19 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1090 1019
1091 preempt_disable(); 1020 preempt_disable();
1092 smp_cpu_index_default(); 1021 smp_cpu_index_default();
1093 memcpy(__this_cpu_ptr(&cpu_info), &boot_cpu_data, sizeof(cpu_info)); 1022
1094 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1095 mb();
1096 /* 1023 /*
1097 * Setup boot CPU information 1024 * Setup boot CPU information
1098 */ 1025 */
1099 smp_store_cpu_info(0); /* Final full version of the data */ 1026 smp_store_cpu_info(0); /* Final full version of the data */
1100#ifdef CONFIG_X86_32 1027 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1101 boot_cpu_logical_apicid = logical_smp_processor_id(); 1028 mb();
1102#endif 1029
1103 current_thread_info()->cpu = 0; /* needed? */ 1030 current_thread_info()->cpu = 0; /* needed? */
1104 for_each_possible_cpu(i) { 1031 for_each_possible_cpu(i) {
1105 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); 1032 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1106 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); 1033 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1107 zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); 1034 zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
1108 } 1035 }
1109 set_cpu_sibling_map(0); 1036 set_cpu_sibling_map(0);
1110 1037
@@ -1140,8 +1067,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1140 1067
1141 bsp_end_local_APIC_setup(); 1068 bsp_end_local_APIC_setup();
1142 1069
1143 map_cpu_to_logical_apicid();
1144
1145 if (apic->setup_portio_remap) 1070 if (apic->setup_portio_remap)
1146 apic->setup_portio_remap(); 1071 apic->setup_portio_remap();
1147 1072
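
cpu_llc_shared_map moves out of struct cpuinfo_x86 into its own per-CPU variable, which is why the pointer-preserving copy_cpuinfo_x86() helper can go away: smp_store_cpu_info() may now do a plain struct assignment. A toy analog of the accessor, with plain mask words instead of cpumask_var_t:

#include <stdio.h>

#define NR_CPUS 8
static unsigned long llc_shared[NR_CPUS];	/* one mask word per CPU */

/* accessor analog of cpu_llc_shared_mask(cpu) */
static unsigned long *cpu_llc_mask(int cpu) { return &llc_shared[cpu]; }

int main(void)
{
	/* linking two LLC siblings, as link_thread_siblings() now does */
	*cpu_llc_mask(2) |= 1UL << 3;
	*cpu_llc_mask(3) |= 1UL << 2;
	printf("cpu2 mask=%#lx cpu3 mask=%#lx\n",
	       *cpu_llc_mask(2), *cpu_llc_mask(3));
	return 0;
}
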
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index b35786dc9b8..5f181742e8f 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -340,3 +340,6 @@ ENTRY(sys_call_table)
340 .long sys_fanotify_init 340 .long sys_fanotify_init
341 .long sys_fanotify_mark 341 .long sys_fanotify_mark
342 .long sys_prlimit64 /* 340 */ 342 .long sys_prlimit64 /* 340 */
343 .long sys_name_to_handle_at
344 .long sys_open_by_handle_at
345 .long sys_clock_adjtime
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index bf470075518..0381e1f3bae 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -105,6 +105,7 @@ SECTIONS
105 SCHED_TEXT 105 SCHED_TEXT
106 LOCK_TEXT 106 LOCK_TEXT
107 KPROBES_TEXT 107 KPROBES_TEXT
108 ENTRY_TEXT
108 IRQENTRY_TEXT 109 IRQENTRY_TEXT
109 *(.fixup) 110 *(.fixup)
110 *(.gnu.warning) 111 *(.gnu.warning)
@@ -305,7 +306,7 @@ SECTIONS
305 } 306 }
306 307
307#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) 308#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
308 PERCPU(THREAD_SIZE) 309 PERCPU(PAGE_SIZE)
309#endif 310#endif
310 311
311 . = ALIGN(PAGE_SIZE); 312 . = ALIGN(PAGE_SIZE);
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 1b950d151e5..9796c2f3d07 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -52,6 +52,7 @@ extern void *__memcpy(void *, const void *, __kernel_size_t);
52EXPORT_SYMBOL(memset); 52EXPORT_SYMBOL(memset);
53EXPORT_SYMBOL(memcpy); 53EXPORT_SYMBOL(memcpy);
54EXPORT_SYMBOL(__memcpy); 54EXPORT_SYMBOL(__memcpy);
55EXPORT_SYMBOL(memmove);
55 56
56EXPORT_SYMBOL(empty_zero_page); 57EXPORT_SYMBOL(empty_zero_page);
57#ifndef CONFIG_PARAVIRT 58#ifndef CONFIG_PARAVIRT
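
The new EXPORT_SYMBOL(memmove) is needed because the C implementation that used to carry the export (arch/x86/lib/memmove_64.c, deleted below) is replaced by an assembly file, which cannot export the symbol itself. Any module calling memmove() links against this export; a trivial hedged module sketch:

    #include <linux/init.h>
    #include <linux/kernel.h>
    #include <linux/module.h>
    #include <linux/string.h>

    static char buf[16] = "abcdefgh";

    static int __init memmove_demo_init(void)
    {
        /* overlapping copy; resolves against the exported memmove */
        memmove(buf + 2, buf, 8);
        pr_info("memmove demo: %s\n", buf);
        return 0;
    }
    module_init(memmove_demo_init);
    MODULE_LICENSE("GPL");
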
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index ceb2911aa43..c11514e9128 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -70,6 +70,7 @@ struct x86_init_ops x86_init __initdata = {
70 .setup_percpu_clockev = setup_boot_APIC_clock, 70 .setup_percpu_clockev = setup_boot_APIC_clock,
71 .tsc_pre_init = x86_init_noop, 71 .tsc_pre_init = x86_init_noop,
72 .timer_init = hpet_time_init, 72 .timer_init = hpet_time_init,
73 .wallclock_init = x86_init_noop,
73 }, 74 },
74 75
75 .iommu = { 76 .iommu = {
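
The new wallclock_init callout defaults to a no-op; a platform that needs to touch the persistent clock early can override it from its setup code. A hedged sketch, assuming the hook sits in the timers group of x86_init_ops as the neighbouring timer_init entry suggests; the platform names are hypothetical:

    #include <linux/init.h>
    #include <asm/x86_init.h>

    static void __init myplat_wallclock_init(void)
    {
        /* e.g. set up a platform RTC before the first persistent-clock read */
    }

    static void __init myplat_setup_arch(void)
    {
        x86_init.timers.wallclock_init = myplat_wallclock_init;
    }
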
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 1357d7cf4ec..db932760ea8 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -62,21 +62,21 @@ TRACE_EVENT(kvm_hv_hypercall,
62 TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), 62 TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
63 63
64 TP_STRUCT__entry( 64 TP_STRUCT__entry(
65 __field( __u16, code )
66 __field( bool, fast )
67 __field( __u16, rep_cnt ) 65 __field( __u16, rep_cnt )
68 __field( __u16, rep_idx ) 66 __field( __u16, rep_idx )
69 __field( __u64, ingpa ) 67 __field( __u64, ingpa )
70 __field( __u64, outgpa ) 68 __field( __u64, outgpa )
69 __field( __u16, code )
70 __field( bool, fast )
71 ), 71 ),
72 72
73 TP_fast_assign( 73 TP_fast_assign(
74 __entry->code = code;
75 __entry->fast = fast;
76 __entry->rep_cnt = rep_cnt; 74 __entry->rep_cnt = rep_cnt;
77 __entry->rep_idx = rep_idx; 75 __entry->rep_idx = rep_idx;
78 __entry->ingpa = ingpa; 76 __entry->ingpa = ingpa;
79 __entry->outgpa = outgpa; 77 __entry->outgpa = outgpa;
78 __entry->code = code;
79 __entry->fast = fast;
80 ), 80 ),
81 81
82 TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", 82 TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
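
Moving the two sub-word fields (u16 code, bool fast) behind the u64s changes where each field lands once the common trace-entry header is prepended: the u16 pair now fills the alignment slack before the first u64 instead of creating it. A userspace sketch of the layout effect, with a 12-byte stand-in header (an assumption about the trace_entry prefix) and stand-in structs for the two orders:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct hdr { uint16_t type; uint8_t flags, preempt; int32_t pid, lock_depth; };

    struct old_order {
        struct hdr h;
        uint16_t code; bool fast; uint16_t rep_cnt, rep_idx;
        uint64_t ingpa, outgpa;
    };
    struct new_order {
        struct hdr h;
        uint16_t rep_cnt, rep_idx;
        uint64_t ingpa, outgpa;
        uint16_t code; bool fast;
    };

    int main(void)
    {
        /* old: ingpa at 24, payload ends at 40 (4 bytes of padding) */
        printf("old: ingpa at %zu, payload ends at %zu\n",
               offsetof(struct old_order, ingpa),
               offsetof(struct old_order, outgpa) + sizeof(uint64_t));
        /* new: ingpa at 16, payload ends at 35 */
        printf("new: ingpa at %zu, payload ends at %zu\n",
               offsetof(struct new_order, ingpa),
               offsetof(struct new_order, fast) + sizeof(bool));
        return 0;
    }
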
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index eba687f0cc0..b9ec1c74943 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -847,7 +847,7 @@ static void __init lguest_init_IRQ(void)
847void lguest_setup_irq(unsigned int irq) 847void lguest_setup_irq(unsigned int irq)
848{ 848{
849 irq_alloc_desc_at(irq, 0); 849 irq_alloc_desc_at(irq, 0);
850 set_irq_chip_and_handler_name(irq, &lguest_irq_controller, 850 irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
851 handle_level_irq, "level"); 851 handle_level_irq, "level");
852} 852}
853 853
@@ -995,7 +995,7 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
995static void lguest_time_init(void) 995static void lguest_time_init(void)
996{ 996{
997 /* Set up the timer interrupt (0) to go to our simple timer routine */ 997 /* Set up the timer interrupt (0) to go to our simple timer routine */
998 set_irq_handler(0, lguest_time_irq); 998 irq_set_handler(0, lguest_time_irq);
999 999
1000 clocksource_register(&lguest_clock); 1000 clocksource_register(&lguest_clock);
1001 1001
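
Both hunks follow the genirq rename in which the set_irq_*() helpers became irq_set_*() with unchanged arguments and semantics. A hedged sketch of the renamed call in a driver-style setup, using the generic dummy_irq_chip and handle_simple_irq flow handler as placeholders:

    #include <linux/irq.h>

    static void example_setup_irq(unsigned int irq)
    {
        /* was: set_irq_chip_and_handler_name(...) */
        irq_set_chip_and_handler_name(irq, &dummy_irq_chip,
                                      handle_simple_irq, "simple");
    }
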
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 2cda60a06e6..e8e7e0d06f4 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -15,14 +15,12 @@
15 15
16/* if you want SMP support, implement these with real spinlocks */ 16/* if you want SMP support, implement these with real spinlocks */
17.macro LOCK reg 17.macro LOCK reg
18 pushfl 18 pushfl_cfi
19 CFI_ADJUST_CFA_OFFSET 4
20 cli 19 cli
21.endm 20.endm
22 21
23.macro UNLOCK reg 22.macro UNLOCK reg
24 popfl 23 popfl_cfi
25 CFI_ADJUST_CFA_OFFSET -4
26.endm 24.endm
27 25
28#define BEGIN(op) \ 26#define BEGIN(op) \
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 71e080de335..391a083674b 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -14,14 +14,12 @@
14#include <asm/dwarf2.h> 14#include <asm/dwarf2.h>
15 15
16.macro SAVE reg 16.macro SAVE reg
17 pushl %\reg 17 pushl_cfi %\reg
18 CFI_ADJUST_CFA_OFFSET 4
19 CFI_REL_OFFSET \reg, 0 18 CFI_REL_OFFSET \reg, 0
20.endm 19.endm
21 20
22.macro RESTORE reg 21.macro RESTORE reg
23 popl %\reg 22 popl_cfi %\reg
24 CFI_ADJUST_CFA_OFFSET -4
25 CFI_RESTORE \reg 23 CFI_RESTORE \reg
26.endm 24.endm
27 25
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index adbccd0bbb7..78d16a554db 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -50,11 +50,9 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
50 */ 50 */
51ENTRY(csum_partial) 51ENTRY(csum_partial)
52 CFI_STARTPROC 52 CFI_STARTPROC
53 pushl %esi 53 pushl_cfi %esi
54 CFI_ADJUST_CFA_OFFSET 4
55 CFI_REL_OFFSET esi, 0 54 CFI_REL_OFFSET esi, 0
56 pushl %ebx 55 pushl_cfi %ebx
57 CFI_ADJUST_CFA_OFFSET 4
58 CFI_REL_OFFSET ebx, 0 56 CFI_REL_OFFSET ebx, 0
59 movl 20(%esp),%eax # Function arg: unsigned int sum 57 movl 20(%esp),%eax # Function arg: unsigned int sum
60 movl 16(%esp),%ecx # Function arg: int len 58 movl 16(%esp),%ecx # Function arg: int len
@@ -132,11 +130,9 @@ ENTRY(csum_partial)
132 jz 8f 130 jz 8f
133 roll $8, %eax 131 roll $8, %eax
1348: 1328:
135 popl %ebx 133 popl_cfi %ebx
136 CFI_ADJUST_CFA_OFFSET -4
137 CFI_RESTORE ebx 134 CFI_RESTORE ebx
138 popl %esi 135 popl_cfi %esi
139 CFI_ADJUST_CFA_OFFSET -4
140 CFI_RESTORE esi 136 CFI_RESTORE esi
141 ret 137 ret
142 CFI_ENDPROC 138 CFI_ENDPROC
@@ -148,11 +144,9 @@ ENDPROC(csum_partial)
148 144
149ENTRY(csum_partial) 145ENTRY(csum_partial)
150 CFI_STARTPROC 146 CFI_STARTPROC
151 pushl %esi 147 pushl_cfi %esi
152 CFI_ADJUST_CFA_OFFSET 4
153 CFI_REL_OFFSET esi, 0 148 CFI_REL_OFFSET esi, 0
154 pushl %ebx 149 pushl_cfi %ebx
155 CFI_ADJUST_CFA_OFFSET 4
156 CFI_REL_OFFSET ebx, 0 150 CFI_REL_OFFSET ebx, 0
157 movl 20(%esp),%eax # Function arg: unsigned int sum 151 movl 20(%esp),%eax # Function arg: unsigned int sum
158 movl 16(%esp),%ecx # Function arg: int len 152 movl 16(%esp),%ecx # Function arg: int len
@@ -260,11 +254,9 @@ ENTRY(csum_partial)
260 jz 90f 254 jz 90f
261 roll $8, %eax 255 roll $8, %eax
26290: 25690:
263 popl %ebx 257 popl_cfi %ebx
264 CFI_ADJUST_CFA_OFFSET -4
265 CFI_RESTORE ebx 258 CFI_RESTORE ebx
266 popl %esi 259 popl_cfi %esi
267 CFI_ADJUST_CFA_OFFSET -4
268 CFI_RESTORE esi 260 CFI_RESTORE esi
269 ret 261 ret
270 CFI_ENDPROC 262 CFI_ENDPROC
@@ -309,14 +301,11 @@ ENTRY(csum_partial_copy_generic)
309 CFI_STARTPROC 301 CFI_STARTPROC
310 subl $4,%esp 302 subl $4,%esp
311 CFI_ADJUST_CFA_OFFSET 4 303 CFI_ADJUST_CFA_OFFSET 4
312 pushl %edi 304 pushl_cfi %edi
313 CFI_ADJUST_CFA_OFFSET 4
314 CFI_REL_OFFSET edi, 0 305 CFI_REL_OFFSET edi, 0
315 pushl %esi 306 pushl_cfi %esi
316 CFI_ADJUST_CFA_OFFSET 4
317 CFI_REL_OFFSET esi, 0 307 CFI_REL_OFFSET esi, 0
318 pushl %ebx 308 pushl_cfi %ebx
319 CFI_ADJUST_CFA_OFFSET 4
320 CFI_REL_OFFSET ebx, 0 309 CFI_REL_OFFSET ebx, 0
321 movl ARGBASE+16(%esp),%eax # sum 310 movl ARGBASE+16(%esp),%eax # sum
322 movl ARGBASE+12(%esp),%ecx # len 311 movl ARGBASE+12(%esp),%ecx # len
@@ -426,17 +415,13 @@ DST( movb %cl, (%edi) )
426 415
427.previous 416.previous
428 417
429 popl %ebx 418 popl_cfi %ebx
430 CFI_ADJUST_CFA_OFFSET -4
431 CFI_RESTORE ebx 419 CFI_RESTORE ebx
432 popl %esi 420 popl_cfi %esi
433 CFI_ADJUST_CFA_OFFSET -4
434 CFI_RESTORE esi 421 CFI_RESTORE esi
435 popl %edi 422 popl_cfi %edi
436 CFI_ADJUST_CFA_OFFSET -4
437 CFI_RESTORE edi 423 CFI_RESTORE edi
438 popl %ecx # equivalent to addl $4,%esp 424 popl_cfi %ecx # equivalent to addl $4,%esp
439 CFI_ADJUST_CFA_OFFSET -4
440 ret 425 ret
441 CFI_ENDPROC 426 CFI_ENDPROC
442ENDPROC(csum_partial_copy_generic) 427ENDPROC(csum_partial_copy_generic)
@@ -459,14 +444,11 @@ ENDPROC(csum_partial_copy_generic)
459 444
460ENTRY(csum_partial_copy_generic) 445ENTRY(csum_partial_copy_generic)
461 CFI_STARTPROC 446 CFI_STARTPROC
462 pushl %ebx 447 pushl_cfi %ebx
463 CFI_ADJUST_CFA_OFFSET 4
464 CFI_REL_OFFSET ebx, 0 448 CFI_REL_OFFSET ebx, 0
465 pushl %edi 449 pushl_cfi %edi
466 CFI_ADJUST_CFA_OFFSET 4
467 CFI_REL_OFFSET edi, 0 450 CFI_REL_OFFSET edi, 0
468 pushl %esi 451 pushl_cfi %esi
469 CFI_ADJUST_CFA_OFFSET 4
470 CFI_REL_OFFSET esi, 0 452 CFI_REL_OFFSET esi, 0
471 movl ARGBASE+4(%esp),%esi #src 453 movl ARGBASE+4(%esp),%esi #src
472 movl ARGBASE+8(%esp),%edi #dst 454 movl ARGBASE+8(%esp),%edi #dst
@@ -527,14 +509,11 @@ DST( movb %dl, (%edi) )
527 jmp 7b 509 jmp 7b
528.previous 510.previous
529 511
530 popl %esi 512 popl_cfi %esi
531 CFI_ADJUST_CFA_OFFSET -4
532 CFI_RESTORE esi 513 CFI_RESTORE esi
533 popl %edi 514 popl_cfi %edi
534 CFI_ADJUST_CFA_OFFSET -4
535 CFI_RESTORE edi 515 CFI_RESTORE edi
536 popl %ebx 516 popl_cfi %ebx
537 CFI_ADJUST_CFA_OFFSET -4
538 CFI_RESTORE ebx 517 CFI_RESTORE ebx
539 ret 518 ret
540 CFI_ENDPROC 519 CFI_ENDPROC
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
new file mode 100644
index 00000000000..0ecb8433e5a
--- /dev/null
+++ b/arch/x86/lib/memmove_64.S
@@ -0,0 +1,197 @@
1/*
 2 * Normally compiler builtins are used, but sometimes the compiler calls
 3 * out-of-line code. Based on asm-i386/string.h.
4 *
 5 * This assembly file is rewritten from the C implementation in memmove_64.c.
6 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
7 */
8#define _STRING_C
9#include <linux/linkage.h>
10#include <asm/dwarf2.h>
11
12#undef memmove
13
14/*
15 * Implement memmove(). This can handle overlap between src and dst.
16 *
17 * Input:
18 * rdi: dest
19 * rsi: src
20 * rdx: count
21 *
22 * Output:
23 * rax: dest
24 */
25ENTRY(memmove)
26 CFI_STARTPROC
 27 /* Handle sizes of 32 bytes or more in a loop */
28 mov %rdi, %rax
29 cmp $0x20, %rdx
30 jb 1f
31
32 /* Decide forward/backward copy mode */
33 cmp %rdi, %rsi
34 jb 2f
35
36 /*
 37 * The movsq instruction has a high startup latency,
 38 * so we handle small sizes with general registers.
39 */
40 cmp $680, %rdx
41 jb 3f
42 /*
 43 * The movsq instruction is only efficient in the aligned case.
44 */
45
46 cmpb %dil, %sil
47 je 4f
483:
49 sub $0x20, %rdx
50 /*
 51 * We copy 32 bytes forward in each loop iteration.
52 */
535:
54 sub $0x20, %rdx
55 movq 0*8(%rsi), %r11
56 movq 1*8(%rsi), %r10
57 movq 2*8(%rsi), %r9
58 movq 3*8(%rsi), %r8
59 leaq 4*8(%rsi), %rsi
60
61 movq %r11, 0*8(%rdi)
62 movq %r10, 1*8(%rdi)
63 movq %r9, 2*8(%rdi)
64 movq %r8, 3*8(%rdi)
65 leaq 4*8(%rdi), %rdi
66 jae 5b
67 addq $0x20, %rdx
68 jmp 1f
69 /*
70 * Handle data forward by movsq.
71 */
72 .p2align 4
734:
74 movq %rdx, %rcx
75 movq -8(%rsi, %rdx), %r11
76 lea -8(%rdi, %rdx), %r10
77 shrq $3, %rcx
78 rep movsq
79 movq %r11, (%r10)
80 jmp 13f
81 /*
82 * Handle data backward by movsq.
83 */
84 .p2align 4
857:
86 movq %rdx, %rcx
87 movq (%rsi), %r11
88 movq %rdi, %r10
89 leaq -8(%rsi, %rdx), %rsi
90 leaq -8(%rdi, %rdx), %rdi
91 shrq $3, %rcx
92 std
93 rep movsq
94 cld
95 movq %r11, (%r10)
96 jmp 13f
97
98 /*
99 * Start to prepare for backward copy.
100 */
101 .p2align 4
1022:
103 cmp $680, %rdx
104 jb 6f
105 cmp %dil, %sil
106 je 7b
1076:
108 /*
109 * Calculate copy position to tail.
110 */
111 addq %rdx, %rsi
112 addq %rdx, %rdi
113 subq $0x20, %rdx
114 /*
 115 * We copy 32 bytes backward in each loop iteration.
116 */
1178:
118 subq $0x20, %rdx
119 movq -1*8(%rsi), %r11
120 movq -2*8(%rsi), %r10
121 movq -3*8(%rsi), %r9
122 movq -4*8(%rsi), %r8
123 leaq -4*8(%rsi), %rsi
124
125 movq %r11, -1*8(%rdi)
126 movq %r10, -2*8(%rdi)
127 movq %r9, -3*8(%rdi)
128 movq %r8, -4*8(%rdi)
129 leaq -4*8(%rdi), %rdi
130 jae 8b
131 /*
132 * Calculate copy position to head.
133 */
134 addq $0x20, %rdx
135 subq %rdx, %rsi
136 subq %rdx, %rdi
1371:
138 cmpq $16, %rdx
139 jb 9f
140 /*
141 * Move data from 16 bytes to 31 bytes.
142 */
143 movq 0*8(%rsi), %r11
144 movq 1*8(%rsi), %r10
145 movq -2*8(%rsi, %rdx), %r9
146 movq -1*8(%rsi, %rdx), %r8
147 movq %r11, 0*8(%rdi)
148 movq %r10, 1*8(%rdi)
149 movq %r9, -2*8(%rdi, %rdx)
150 movq %r8, -1*8(%rdi, %rdx)
151 jmp 13f
152 .p2align 4
1539:
154 cmpq $8, %rdx
155 jb 10f
156 /*
157 * Move data from 8 bytes to 15 bytes.
158 */
159 movq 0*8(%rsi), %r11
160 movq -1*8(%rsi, %rdx), %r10
161 movq %r11, 0*8(%rdi)
162 movq %r10, -1*8(%rdi, %rdx)
163 jmp 13f
16410:
165 cmpq $4, %rdx
166 jb 11f
167 /*
168 * Move data from 4 bytes to 7 bytes.
169 */
170 movl (%rsi), %r11d
171 movl -4(%rsi, %rdx), %r10d
172 movl %r11d, (%rdi)
173 movl %r10d, -4(%rdi, %rdx)
174 jmp 13f
17511:
176 cmp $2, %rdx
177 jb 12f
178 /*
179 * Move data from 2 bytes to 3 bytes.
180 */
181 movw (%rsi), %r11w
182 movw -2(%rsi, %rdx), %r10w
183 movw %r11w, (%rdi)
184 movw %r10w, -2(%rdi, %rdx)
185 jmp 13f
18612:
187 cmp $1, %rdx
188 jb 13f
189 /*
190 * Move data for 1 byte.
191 */
192 movb (%rsi), %r11b
193 movb %r11b, (%rdi)
19413:
195 retq
196 CFI_ENDPROC
197ENDPROC(memmove)
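
The decision structure above boils down to: copy forward when dest sits below src, backward when dest overlaps the tail of src. The 32-byte and 680-byte thresholds and the low-address-byte check (cmpb %dil, %sil) only select between the unrolled loop and rep movsq. A plain C reference of the direction logic alone, offered as a sketch rather than a drop-in for the tuned assembly:

    #include <stddef.h>
    #include <stdint.h>

    void *ref_memmove(void *dest, const void *src, size_t count)
    {
        unsigned char *d = dest;
        const unsigned char *s = src;

        if ((uintptr_t)d < (uintptr_t)s) {
            while (count--)          /* forward: front-to-back is safe */
                *d++ = *s++;
        } else if ((uintptr_t)d > (uintptr_t)s) {
            while (count--)          /* backward: start from the tail */
                d[count] = s[count];
        }
        return dest;
    }
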
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
deleted file mode 100644
index 6d0f0ec41b3..00000000000
--- a/arch/x86/lib/memmove_64.c
+++ /dev/null
@@ -1,192 +0,0 @@
1/* Normally compiler builtins are used, but sometimes the compiler calls out
2 of line code. Based on asm-i386/string.h.
3 */
4#define _STRING_C
5#include <linux/string.h>
6#include <linux/module.h>
7
8#undef memmove
9void *memmove(void *dest, const void *src, size_t count)
10{
11 unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
12 char *ret;
13
14 __asm__ __volatile__(
15 /* Handle more 32bytes in loop */
16 "mov %2, %3\n\t"
17 "cmp $0x20, %0\n\t"
18 "jb 1f\n\t"
19
20 /* Decide forward/backward copy mode */
21 "cmp %2, %1\n\t"
22 "jb 2f\n\t"
23
24 /*
25 * movsq instruction have many startup latency
26 * so we handle small size by general register.
27 */
28 "cmp $680, %0\n\t"
29 "jb 3f\n\t"
30 /*
31 * movsq instruction is only good for aligned case.
32 */
33 "cmpb %%dil, %%sil\n\t"
34 "je 4f\n\t"
35 "3:\n\t"
36 "sub $0x20, %0\n\t"
37 /*
38 * We gobble 32byts forward in each loop.
39 */
40 "5:\n\t"
41 "sub $0x20, %0\n\t"
42 "movq 0*8(%1), %4\n\t"
43 "movq 1*8(%1), %5\n\t"
44 "movq 2*8(%1), %6\n\t"
45 "movq 3*8(%1), %7\n\t"
46 "leaq 4*8(%1), %1\n\t"
47
48 "movq %4, 0*8(%2)\n\t"
49 "movq %5, 1*8(%2)\n\t"
50 "movq %6, 2*8(%2)\n\t"
51 "movq %7, 3*8(%2)\n\t"
52 "leaq 4*8(%2), %2\n\t"
53 "jae 5b\n\t"
54 "addq $0x20, %0\n\t"
55 "jmp 1f\n\t"
56 /*
57 * Handle data forward by movsq.
58 */
59 ".p2align 4\n\t"
60 "4:\n\t"
61 "movq %0, %8\n\t"
62 "movq -8(%1, %0), %4\n\t"
63 "lea -8(%2, %0), %5\n\t"
64 "shrq $3, %8\n\t"
65 "rep movsq\n\t"
66 "movq %4, (%5)\n\t"
67 "jmp 13f\n\t"
68 /*
69 * Handle data backward by movsq.
70 */
71 ".p2align 4\n\t"
72 "7:\n\t"
73 "movq %0, %8\n\t"
74 "movq (%1), %4\n\t"
75 "movq %2, %5\n\t"
76 "leaq -8(%1, %0), %1\n\t"
77 "leaq -8(%2, %0), %2\n\t"
78 "shrq $3, %8\n\t"
79 "std\n\t"
80 "rep movsq\n\t"
81 "cld\n\t"
82 "movq %4, (%5)\n\t"
83 "jmp 13f\n\t"
84
85 /*
86 * Start to prepare for backward copy.
87 */
88 ".p2align 4\n\t"
89 "2:\n\t"
90 "cmp $680, %0\n\t"
91 "jb 6f \n\t"
92 "cmp %%dil, %%sil\n\t"
93 "je 7b \n\t"
94 "6:\n\t"
95 /*
96 * Calculate copy position to tail.
97 */
98 "addq %0, %1\n\t"
99 "addq %0, %2\n\t"
100 "subq $0x20, %0\n\t"
101 /*
102 * We gobble 32byts backward in each loop.
103 */
104 "8:\n\t"
105 "subq $0x20, %0\n\t"
106 "movq -1*8(%1), %4\n\t"
107 "movq -2*8(%1), %5\n\t"
108 "movq -3*8(%1), %6\n\t"
109 "movq -4*8(%1), %7\n\t"
110 "leaq -4*8(%1), %1\n\t"
111
112 "movq %4, -1*8(%2)\n\t"
113 "movq %5, -2*8(%2)\n\t"
114 "movq %6, -3*8(%2)\n\t"
115 "movq %7, -4*8(%2)\n\t"
116 "leaq -4*8(%2), %2\n\t"
117 "jae 8b\n\t"
118 /*
119 * Calculate copy position to head.
120 */
121 "addq $0x20, %0\n\t"
122 "subq %0, %1\n\t"
123 "subq %0, %2\n\t"
124 "1:\n\t"
125 "cmpq $16, %0\n\t"
126 "jb 9f\n\t"
127 /*
128 * Move data from 16 bytes to 31 bytes.
129 */
130 "movq 0*8(%1), %4\n\t"
131 "movq 1*8(%1), %5\n\t"
132 "movq -2*8(%1, %0), %6\n\t"
133 "movq -1*8(%1, %0), %7\n\t"
134 "movq %4, 0*8(%2)\n\t"
135 "movq %5, 1*8(%2)\n\t"
136 "movq %6, -2*8(%2, %0)\n\t"
137 "movq %7, -1*8(%2, %0)\n\t"
138 "jmp 13f\n\t"
139 ".p2align 4\n\t"
140 "9:\n\t"
141 "cmpq $8, %0\n\t"
142 "jb 10f\n\t"
143 /*
144 * Move data from 8 bytes to 15 bytes.
145 */
146 "movq 0*8(%1), %4\n\t"
147 "movq -1*8(%1, %0), %5\n\t"
148 "movq %4, 0*8(%2)\n\t"
149 "movq %5, -1*8(%2, %0)\n\t"
150 "jmp 13f\n\t"
151 "10:\n\t"
152 "cmpq $4, %0\n\t"
153 "jb 11f\n\t"
154 /*
155 * Move data from 4 bytes to 7 bytes.
156 */
157 "movl (%1), %4d\n\t"
158 "movl -4(%1, %0), %5d\n\t"
159 "movl %4d, (%2)\n\t"
160 "movl %5d, -4(%2, %0)\n\t"
161 "jmp 13f\n\t"
162 "11:\n\t"
163 "cmp $2, %0\n\t"
164 "jb 12f\n\t"
165 /*
166 * Move data from 2 bytes to 3 bytes.
167 */
168 "movw (%1), %4w\n\t"
169 "movw -2(%1, %0), %5w\n\t"
170 "movw %4w, (%2)\n\t"
171 "movw %5w, -2(%2, %0)\n\t"
172 "jmp 13f\n\t"
173 "12:\n\t"
174 "cmp $1, %0\n\t"
175 "jb 13f\n\t"
176 /*
177 * Move data for 1 byte.
178 */
179 "movb (%1), %4b\n\t"
180 "movb %4b, (%2)\n\t"
181 "13:\n\t"
182 : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
183 "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
184 :"0" (count),
185 "1" (src),
186 "2" (dest)
187 :"memory");
188
189 return ret;
190
191}
192EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
index 41fcf00e49d..67743977398 100644
--- a/arch/x86/lib/rwsem_64.S
+++ b/arch/x86/lib/rwsem_64.S
@@ -23,43 +23,50 @@
23#include <asm/dwarf2.h> 23#include <asm/dwarf2.h>
24 24
25#define save_common_regs \ 25#define save_common_regs \
26 pushq %rdi; \ 26 pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
27 pushq %rsi; \ 27 pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
28 pushq %rcx; \ 28 pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \
29 pushq %r8; \ 29 pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \
30 pushq %r9; \ 30 pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \
31 pushq %r10; \ 31 pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \
32 pushq %r11 32 pushq_cfi %r11; CFI_REL_OFFSET r11, 0
33 33
34#define restore_common_regs \ 34#define restore_common_regs \
35 popq %r11; \ 35 popq_cfi %r11; CFI_RESTORE r11; \
36 popq %r10; \ 36 popq_cfi %r10; CFI_RESTORE r10; \
37 popq %r9; \ 37 popq_cfi %r9; CFI_RESTORE r9; \
38 popq %r8; \ 38 popq_cfi %r8; CFI_RESTORE r8; \
39 popq %rcx; \ 39 popq_cfi %rcx; CFI_RESTORE rcx; \
40 popq %rsi; \ 40 popq_cfi %rsi; CFI_RESTORE rsi; \
41 popq %rdi 41 popq_cfi %rdi; CFI_RESTORE rdi
42 42
43/* Fix up special calling conventions */ 43/* Fix up special calling conventions */
44ENTRY(call_rwsem_down_read_failed) 44ENTRY(call_rwsem_down_read_failed)
45 CFI_STARTPROC
45 save_common_regs 46 save_common_regs
46 pushq %rdx 47 pushq_cfi %rdx
48 CFI_REL_OFFSET rdx, 0
47 movq %rax,%rdi 49 movq %rax,%rdi
48 call rwsem_down_read_failed 50 call rwsem_down_read_failed
49 popq %rdx 51 popq_cfi %rdx
52 CFI_RESTORE rdx
50 restore_common_regs 53 restore_common_regs
51 ret 54 ret
52 ENDPROC(call_rwsem_down_read_failed) 55 CFI_ENDPROC
56ENDPROC(call_rwsem_down_read_failed)
53 57
54ENTRY(call_rwsem_down_write_failed) 58ENTRY(call_rwsem_down_write_failed)
59 CFI_STARTPROC
55 save_common_regs 60 save_common_regs
56 movq %rax,%rdi 61 movq %rax,%rdi
57 call rwsem_down_write_failed 62 call rwsem_down_write_failed
58 restore_common_regs 63 restore_common_regs
59 ret 64 ret
60 ENDPROC(call_rwsem_down_write_failed) 65 CFI_ENDPROC
66ENDPROC(call_rwsem_down_write_failed)
61 67
62ENTRY(call_rwsem_wake) 68ENTRY(call_rwsem_wake)
69 CFI_STARTPROC
63 decl %edx /* do nothing if still outstanding active readers */ 70 decl %edx /* do nothing if still outstanding active readers */
64 jnz 1f 71 jnz 1f
65 save_common_regs 72 save_common_regs
@@ -67,15 +74,20 @@ ENTRY(call_rwsem_wake)
67 call rwsem_wake 74 call rwsem_wake
68 restore_common_regs 75 restore_common_regs
691: ret 761: ret
70 ENDPROC(call_rwsem_wake) 77 CFI_ENDPROC
78ENDPROC(call_rwsem_wake)
71 79
72/* Fix up special calling conventions */ 80/* Fix up special calling conventions */
73ENTRY(call_rwsem_downgrade_wake) 81ENTRY(call_rwsem_downgrade_wake)
82 CFI_STARTPROC
74 save_common_regs 83 save_common_regs
75 pushq %rdx 84 pushq_cfi %rdx
85 CFI_REL_OFFSET rdx, 0
76 movq %rax,%rdi 86 movq %rax,%rdi
77 call rwsem_downgrade_wake 87 call rwsem_downgrade_wake
78 popq %rdx 88 popq_cfi %rdx
89 CFI_RESTORE rdx
79 restore_common_regs 90 restore_common_regs
80 ret 91 ret
81 ENDPROC(call_rwsem_downgrade_wake) 92 CFI_ENDPROC
93ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
index 648fe474178..06691daa410 100644
--- a/arch/x86/lib/semaphore_32.S
+++ b/arch/x86/lib/semaphore_32.S
@@ -36,7 +36,7 @@
36 */ 36 */
37#ifdef CONFIG_SMP 37#ifdef CONFIG_SMP
38ENTRY(__write_lock_failed) 38ENTRY(__write_lock_failed)
39 CFI_STARTPROC simple 39 CFI_STARTPROC
40 FRAME 40 FRAME
412: LOCK_PREFIX 412: LOCK_PREFIX
42 addl $ RW_LOCK_BIAS,(%eax) 42 addl $ RW_LOCK_BIAS,(%eax)
@@ -74,29 +74,23 @@ ENTRY(__read_lock_failed)
74/* Fix up special calling conventions */ 74/* Fix up special calling conventions */
75ENTRY(call_rwsem_down_read_failed) 75ENTRY(call_rwsem_down_read_failed)
76 CFI_STARTPROC 76 CFI_STARTPROC
77 push %ecx 77 pushl_cfi %ecx
78 CFI_ADJUST_CFA_OFFSET 4
79 CFI_REL_OFFSET ecx,0 78 CFI_REL_OFFSET ecx,0
80 push %edx 79 pushl_cfi %edx
81 CFI_ADJUST_CFA_OFFSET 4
82 CFI_REL_OFFSET edx,0 80 CFI_REL_OFFSET edx,0
83 call rwsem_down_read_failed 81 call rwsem_down_read_failed
84 pop %edx 82 popl_cfi %edx
85 CFI_ADJUST_CFA_OFFSET -4 83 popl_cfi %ecx
86 pop %ecx
87 CFI_ADJUST_CFA_OFFSET -4
88 ret 84 ret
89 CFI_ENDPROC 85 CFI_ENDPROC
90 ENDPROC(call_rwsem_down_read_failed) 86 ENDPROC(call_rwsem_down_read_failed)
91 87
92ENTRY(call_rwsem_down_write_failed) 88ENTRY(call_rwsem_down_write_failed)
93 CFI_STARTPROC 89 CFI_STARTPROC
94 push %ecx 90 pushl_cfi %ecx
95 CFI_ADJUST_CFA_OFFSET 4
96 CFI_REL_OFFSET ecx,0 91 CFI_REL_OFFSET ecx,0
97 calll rwsem_down_write_failed 92 calll rwsem_down_write_failed
98 pop %ecx 93 popl_cfi %ecx
99 CFI_ADJUST_CFA_OFFSET -4
100 ret 94 ret
101 CFI_ENDPROC 95 CFI_ENDPROC
102 ENDPROC(call_rwsem_down_write_failed) 96 ENDPROC(call_rwsem_down_write_failed)
@@ -105,12 +99,10 @@ ENTRY(call_rwsem_wake)
105 CFI_STARTPROC 99 CFI_STARTPROC
106 decw %dx /* do nothing if still outstanding active readers */ 100 decw %dx /* do nothing if still outstanding active readers */
107 jnz 1f 101 jnz 1f
108 push %ecx 102 pushl_cfi %ecx
109 CFI_ADJUST_CFA_OFFSET 4
110 CFI_REL_OFFSET ecx,0 103 CFI_REL_OFFSET ecx,0
111 call rwsem_wake 104 call rwsem_wake
112 pop %ecx 105 popl_cfi %ecx
113 CFI_ADJUST_CFA_OFFSET -4
1141: ret 1061: ret
115 CFI_ENDPROC 107 CFI_ENDPROC
116 ENDPROC(call_rwsem_wake) 108 ENDPROC(call_rwsem_wake)
@@ -118,17 +110,13 @@ ENTRY(call_rwsem_wake)
118/* Fix up special calling conventions */ 110/* Fix up special calling conventions */
119ENTRY(call_rwsem_downgrade_wake) 111ENTRY(call_rwsem_downgrade_wake)
120 CFI_STARTPROC 112 CFI_STARTPROC
121 push %ecx 113 pushl_cfi %ecx
122 CFI_ADJUST_CFA_OFFSET 4
123 CFI_REL_OFFSET ecx,0 114 CFI_REL_OFFSET ecx,0
124 push %edx 115 pushl_cfi %edx
125 CFI_ADJUST_CFA_OFFSET 4
126 CFI_REL_OFFSET edx,0 116 CFI_REL_OFFSET edx,0
127 call rwsem_downgrade_wake 117 call rwsem_downgrade_wake
128 pop %edx 118 popl_cfi %edx
129 CFI_ADJUST_CFA_OFFSET -4 119 popl_cfi %ecx
130 pop %ecx
131 CFI_ADJUST_CFA_OFFSET -4
132 ret 120 ret
133 CFI_ENDPROC 121 CFI_ENDPROC
134 ENDPROC(call_rwsem_downgrade_wake) 122 ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index 650b11e00ec..2930ae05d77 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -7,24 +7,6 @@
7 7
8 #include <linux/linkage.h> 8 #include <linux/linkage.h>
9 9
10#define ARCH_TRACE_IRQS_ON \
11 pushl %eax; \
12 pushl %ecx; \
13 pushl %edx; \
14 call trace_hardirqs_on; \
15 popl %edx; \
16 popl %ecx; \
17 popl %eax;
18
19#define ARCH_TRACE_IRQS_OFF \
20 pushl %eax; \
21 pushl %ecx; \
22 pushl %edx; \
23 call trace_hardirqs_off; \
24 popl %edx; \
25 popl %ecx; \
26 popl %eax;
27
28#ifdef CONFIG_TRACE_IRQFLAGS 10#ifdef CONFIG_TRACE_IRQFLAGS
29 /* put return address in eax (arg1) */ 11 /* put return address in eax (arg1) */
30 .macro thunk_ra name,func 12 .macro thunk_ra name,func
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index bf9a7d5a542..782b082c9ff 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -22,26 +22,6 @@
22 CFI_ENDPROC 22 CFI_ENDPROC
23 .endm 23 .endm
24 24
25 /* rdi: arg1 ... normal C conventions. rax is passed from C. */
26 .macro thunk_retrax name,func
27 .globl \name
28\name:
29 CFI_STARTPROC
30 SAVE_ARGS
31 call \func
32 jmp restore_norax
33 CFI_ENDPROC
34 .endm
35
36
37 .section .sched.text, "ax"
38#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
39 thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
40 thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
41 thunk rwsem_wake_thunk,rwsem_wake
42 thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
43#endif
44
45#ifdef CONFIG_TRACE_IRQFLAGS 25#ifdef CONFIG_TRACE_IRQFLAGS
46 /* put return address in rdi (arg1) */ 26 /* put return address in rdi (arg1) */
47 .macro thunk_ra name,func 27 .macro thunk_ra name,func
@@ -72,10 +52,3 @@ restore:
72 RESTORE_ARGS 52 RESTORE_ARGS
73 ret 53 ret
74 CFI_ENDPROC 54 CFI_ENDPROC
75
76 CFI_STARTPROC
77 SAVE_ARGS
78restore_norax:
79 RESTORE_ARGS 1
80 ret
81 CFI_ENDPROC
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 09df2f9a3d6..3e608edf995 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o 25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o 26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o
27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
28obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
28 29
29obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 30obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
30 31
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index f21962c435e..0919c26820d 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -26,9 +26,7 @@
26#include <asm/apic.h> 26#include <asm/apic.h>
27#include <asm/amd_nb.h> 27#include <asm/amd_nb.h>
28 28
29static struct bootnode __initdata nodes[8];
30static unsigned char __initdata nodeids[8]; 29static unsigned char __initdata nodeids[8];
31static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
32 30
33static __init int find_northbridge(void) 31static __init int find_northbridge(void)
34{ 32{
@@ -51,7 +49,7 @@ static __init int find_northbridge(void)
51 return num; 49 return num;
52 } 50 }
53 51
54 return -1; 52 return -ENOENT;
55} 53}
56 54
57static __init void early_get_boot_cpu_id(void) 55static __init void early_get_boot_cpu_id(void)
@@ -69,17 +67,18 @@ static __init void early_get_boot_cpu_id(void)
69#endif 67#endif
70} 68}
71 69
72int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) 70int __init amd_numa_init(void)
73{ 71{
74 unsigned long start = PFN_PHYS(start_pfn); 72 unsigned long start = PFN_PHYS(0);
75 unsigned long end = PFN_PHYS(end_pfn); 73 unsigned long end = PFN_PHYS(max_pfn);
76 unsigned numnodes; 74 unsigned numnodes;
77 unsigned long prevbase; 75 unsigned long prevbase;
78 int i, nb, found = 0; 76 int i, j, nb;
79 u32 nodeid, reg; 77 u32 nodeid, reg;
78 unsigned int bits, cores, apicid_base;
80 79
81 if (!early_pci_allowed()) 80 if (!early_pci_allowed())
82 return -1; 81 return -EINVAL;
83 82
84 nb = find_northbridge(); 83 nb = find_northbridge();
85 if (nb < 0) 84 if (nb < 0)
@@ -90,7 +89,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
90 reg = read_pci_config(0, nb, 0, 0x60); 89 reg = read_pci_config(0, nb, 0, 0x60);
91 numnodes = ((reg >> 4) & 0xF) + 1; 90 numnodes = ((reg >> 4) & 0xF) + 1;
92 if (numnodes <= 1) 91 if (numnodes <= 1)
93 return -1; 92 return -ENOENT;
94 93
95 pr_info("Number of physical nodes %d\n", numnodes); 94 pr_info("Number of physical nodes %d\n", numnodes);
96 95
@@ -121,9 +120,9 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
121 if ((base >> 8) & 3 || (limit >> 8) & 3) { 120 if ((base >> 8) & 3 || (limit >> 8) & 3) {
122 pr_err("Node %d using interleaving mode %lx/%lx\n", 121 pr_err("Node %d using interleaving mode %lx/%lx\n",
123 nodeid, (base >> 8) & 3, (limit >> 8) & 3); 122 nodeid, (base >> 8) & 3, (limit >> 8) & 3);
124 return -1; 123 return -EINVAL;
125 } 124 }
126 if (node_isset(nodeid, nodes_parsed)) { 125 if (node_isset(nodeid, numa_nodes_parsed)) {
127 pr_info("Node %d already present, skipping\n", 126 pr_info("Node %d already present, skipping\n",
128 nodeid); 127 nodeid);
129 continue; 128 continue;
@@ -160,117 +159,28 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
160 if (prevbase > base) { 159 if (prevbase > base) {
161 pr_err("Node map not sorted %lx,%lx\n", 160 pr_err("Node map not sorted %lx,%lx\n",
162 prevbase, base); 161 prevbase, base);
163 return -1; 162 return -EINVAL;
164 } 163 }
165 164
166 pr_info("Node %d MemBase %016lx Limit %016lx\n", 165 pr_info("Node %d MemBase %016lx Limit %016lx\n",
167 nodeid, base, limit); 166 nodeid, base, limit);
168 167
169 found++;
170
171 nodes[nodeid].start = base;
172 nodes[nodeid].end = limit;
173
174 prevbase = base; 168 prevbase = base;
175 169 numa_add_memblk(nodeid, base, limit);
176 node_set(nodeid, nodes_parsed); 170 node_set(nodeid, numa_nodes_parsed);
177 }
178
179 if (!found)
180 return -1;
181 return 0;
182}
183
184#ifdef CONFIG_NUMA_EMU
185static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
186 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
187};
188
189void __init amd_get_nodes(struct bootnode *physnodes)
190{
191 int i;
192
193 for_each_node_mask(i, nodes_parsed) {
194 physnodes[i].start = nodes[i].start;
195 physnodes[i].end = nodes[i].end;
196 } 171 }
197}
198
199static int __init find_node_by_addr(unsigned long addr)
200{
201 int ret = NUMA_NO_NODE;
202 int i;
203
204 for (i = 0; i < 8; i++)
205 if (addr >= nodes[i].start && addr < nodes[i].end) {
206 ret = i;
207 break;
208 }
209 return ret;
210}
211 172
212/* 173 if (!nodes_weight(numa_nodes_parsed))
213 * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be 174 return -ENOENT;
214 * setup to represent the physical topology but reflect the emulated
215 * environment. For each emulated node, the real node which it appears on is
216 * found and a fake pxm to nid mapping is created which mirrors the actual
217 * locality. node_distance() then represents the correct distances between
218 * emulated nodes by using the fake acpi mappings to pxms.
219 */
220void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
221{
222 unsigned int bits;
223 unsigned int cores;
224 unsigned int apicid_base = 0;
225 int i;
226 175
176 /*
 177 * We seem to have a valid NUMA configuration. Map apicids to nodes
178 * using the coreid bits from early_identify_cpu.
179 */
227 bits = boot_cpu_data.x86_coreid_bits; 180 bits = boot_cpu_data.x86_coreid_bits;
228 cores = 1 << bits; 181 cores = 1 << bits;
229 early_get_boot_cpu_id();
230 if (boot_cpu_physical_apicid > 0)
231 apicid_base = boot_cpu_physical_apicid;
232
233 for (i = 0; i < nr_nodes; i++) {
234 int index;
235 int nid;
236 int j;
237
238 nid = find_node_by_addr(nodes[i].start);
239 if (nid == NUMA_NO_NODE)
240 continue;
241
242 index = nodeids[nid] << bits;
243 if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
244 for (j = apicid_base; j < cores + apicid_base; j++)
245 fake_apicid_to_node[index + j] = i;
246#ifdef CONFIG_ACPI_NUMA
247 __acpi_map_pxm_to_node(nid, i);
248#endif
249 }
250 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
251}
252#endif /* CONFIG_NUMA_EMU */
253
254int __init amd_scan_nodes(void)
255{
256 unsigned int bits;
257 unsigned int cores;
258 unsigned int apicid_base;
259 int i;
260
261 BUG_ON(nodes_empty(nodes_parsed));
262 node_possible_map = nodes_parsed;
263 memnode_shift = compute_hash_shift(nodes, 8, NULL);
264 if (memnode_shift < 0) {
265 pr_err("No NUMA node hash function found. Contact maintainer\n");
266 return -1;
267 }
268 pr_info("Using node hash shift of %d\n", memnode_shift);
269
270 /* use the coreid bits from early_identify_cpu */
271 bits = boot_cpu_data.x86_coreid_bits;
272 cores = (1<<bits);
273 apicid_base = 0; 182 apicid_base = 0;
183
274 /* get the APIC ID of the BSP early for systems with apicid lifting */ 184 /* get the APIC ID of the BSP early for systems with apicid lifting */
275 early_get_boot_cpu_id(); 185 early_get_boot_cpu_id();
276 if (boot_cpu_physical_apicid > 0) { 186 if (boot_cpu_physical_apicid > 0) {
@@ -278,17 +188,9 @@ int __init amd_scan_nodes(void)
278 apicid_base = boot_cpu_physical_apicid; 188 apicid_base = boot_cpu_physical_apicid;
279 } 189 }
280 190
281 for_each_node_mask(i, node_possible_map) { 191 for_each_node_mask(i, numa_nodes_parsed)
282 int j;
283
284 memblock_x86_register_active_regions(i,
285 nodes[i].start >> PAGE_SHIFT,
286 nodes[i].end >> PAGE_SHIFT);
287 for (j = apicid_base; j < cores + apicid_base; j++) 192 for (j = apicid_base; j < cores + apicid_base; j++)
288 apicid_to_node[(i << bits) + j] = i; 193 set_apicid_to_node((i << bits) + j, i);
289 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
290 }
291 194
292 numa_init_array();
293 return 0; 195 return 0;
294} 196}
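
Besides feeding the parsed ranges to numa_add_memblk() and numa_nodes_parsed, the rewrite replaces the bare -1 returns with distinct errno values (-EINVAL for broken configurations, -ENOENT when nothing usable is found). The node count itself comes from a single northbridge config register read, reg = read_pci_config(0, nb, 0, 0x60), decoded as numnodes = ((reg >> 4) & 0xF) + 1. A worked example of that decode with a hypothetical register value:

    #include <stdio.h>

    int main(void)
    {
        unsigned int reg = 0x00000030;   /* hypothetical value of register 0x60 */
        unsigned int field = (reg >> 4) & 0xF;
        unsigned int numnodes = field + 1;

        printf("node-count field %u -> %u nodes\n", field, numnodes); /* 3 -> 4 */
        return 0;
    }
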
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 947f42abe82..286d289b039 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,9 +18,9 @@
18 18
19DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 19DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
20 20
21unsigned long __initdata e820_table_start; 21unsigned long __initdata pgt_buf_start;
22unsigned long __meminitdata e820_table_end; 22unsigned long __meminitdata pgt_buf_end;
23unsigned long __meminitdata e820_table_top; 23unsigned long __meminitdata pgt_buf_top;
24 24
25int after_bootmem; 25int after_bootmem;
26 26
@@ -33,7 +33,7 @@ int direct_gbpages
33static void __init find_early_table_space(unsigned long end, int use_pse, 33static void __init find_early_table_space(unsigned long end, int use_pse,
34 int use_gbpages) 34 int use_gbpages)
35{ 35{
36 unsigned long puds, pmds, ptes, tables, start; 36 unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
37 phys_addr_t base; 37 phys_addr_t base;
38 38
39 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 39 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
@@ -65,29 +65,20 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
65#ifdef CONFIG_X86_32 65#ifdef CONFIG_X86_32
66 /* for fixmap */ 66 /* for fixmap */
67 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); 67 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
68#endif
69 68
70 /* 69 good_end = max_pfn_mapped << PAGE_SHIFT;
71 * RED-PEN putting page tables only on node 0 could
72 * cause a hotspot and fill up ZONE_DMA. The page tables
73 * need roughly 0.5KB per GB.
74 */
75#ifdef CONFIG_X86_32
76 start = 0x7000;
77#else
78 start = 0x8000;
79#endif 70#endif
80 base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT, 71
81 tables, PAGE_SIZE); 72 base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
82 if (base == MEMBLOCK_ERROR) 73 if (base == MEMBLOCK_ERROR)
83 panic("Cannot find space for the kernel page tables"); 74 panic("Cannot find space for the kernel page tables");
84 75
85 e820_table_start = base >> PAGE_SHIFT; 76 pgt_buf_start = base >> PAGE_SHIFT;
86 e820_table_end = e820_table_start; 77 pgt_buf_end = pgt_buf_start;
87 e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); 78 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
88 79
89 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", 80 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
90 end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); 81 end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
91} 82}
92 83
93struct map_range { 84struct map_range {
@@ -279,30 +270,11 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
279 load_cr3(swapper_pg_dir); 270 load_cr3(swapper_pg_dir);
280#endif 271#endif
281 272
282#ifdef CONFIG_X86_64
283 if (!after_bootmem && !start) {
284 pud_t *pud;
285 pmd_t *pmd;
286
287 mmu_cr4_features = read_cr4();
288
289 /*
290 * _brk_end cannot change anymore, but it and _end may be
291 * located on different 2M pages. cleanup_highmap(), however,
292 * can only consider _end when it runs, so destroy any
293 * mappings beyond _brk_end here.
294 */
295 pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
296 pmd = pmd_offset(pud, _brk_end - 1);
297 while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
298 pmd_clear(pmd);
299 }
300#endif
301 __flush_tlb_all(); 273 __flush_tlb_all();
302 274
303 if (!after_bootmem && e820_table_end > e820_table_start) 275 if (!after_bootmem && pgt_buf_end > pgt_buf_start)
304 memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, 276 memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT,
305 e820_table_end << PAGE_SHIFT, "PGTABLE"); 277 pgt_buf_end << PAGE_SHIFT, "PGTABLE");
306 278
307 if (!after_bootmem) 279 if (!after_bootmem)
308 early_memtest(start, end); 280 early_memtest(start, end);
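
find_early_table_space() now searches from 0 up to the already-mapped limit instead of the old fixed 0x7000/0x8000 start, but the size estimate is unchanged: one entry per PUD/PMD/PTE-sized stride, eight bytes each, rounded up to whole 4 KiB table pages. A back-of-the-envelope model of that arithmetic, assuming x86-64 strides and no large pages:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long end  = 4ULL << 30;                 /* map 4 GiB (example) */
        unsigned long long puds = (end + (1ULL << 30) - 1) >> 30;  /* 1 GiB each */
        unsigned long long pmds = (end + (1ULL << 21) - 1) >> 21;  /* 2 MiB each */
        unsigned long long ptes = (end + (1ULL << 12) - 1) >> 12;  /* 4 KiB each */
        /* 512 eight-byte entries per 4 KiB table page, rounded up per level */
        unsigned long long pages = (puds + 511) / 512 + (pmds + 511) / 512
                                 + (ptes + 511) / 512;

        printf("%llu MiB needs %llu table pages (%llu KiB)\n",
               end >> 20, pages, (pages << 12) >> 10);
        return 0;
    }
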
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c821074b7f0..73ad7ebd6e9 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -62,10 +62,10 @@ bool __read_mostly __vmalloc_start_set = false;
62 62
63static __init void *alloc_low_page(void) 63static __init void *alloc_low_page(void)
64{ 64{
65 unsigned long pfn = e820_table_end++; 65 unsigned long pfn = pgt_buf_end++;
66 void *adr; 66 void *adr;
67 67
68 if (pfn >= e820_table_top) 68 if (pfn >= pgt_buf_top)
69 panic("alloc_low_page: ran out of memory"); 69 panic("alloc_low_page: ran out of memory");
70 70
71 adr = __va(pfn * PAGE_SIZE); 71 adr = __va(pfn * PAGE_SIZE);
@@ -163,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
163 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 163 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
164 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 164 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
165 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 165 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
166 && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start 166 && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
167 || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { 167 || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
168 pte_t *newpte; 168 pte_t *newpte;
169 int i; 169 int i;
170 170
@@ -644,8 +644,7 @@ void __init find_low_pfn_range(void)
644} 644}
645 645
646#ifndef CONFIG_NEED_MULTIPLE_NODES 646#ifndef CONFIG_NEED_MULTIPLE_NODES
647void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 647void __init initmem_init(void)
648 int acpi, int k8)
649{ 648{
650#ifdef CONFIG_HIGHMEM 649#ifdef CONFIG_HIGHMEM
651 highstart_pfn = highend_pfn = max_pfn; 650 highstart_pfn = highend_pfn = max_pfn;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index c14a5422e15..a08a62cb136 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -314,7 +314,7 @@ void __init cleanup_highmap(void)
314 314
315static __ref void *alloc_low_page(unsigned long *phys) 315static __ref void *alloc_low_page(unsigned long *phys)
316{ 316{
317 unsigned long pfn = e820_table_end++; 317 unsigned long pfn = pgt_buf_end++;
318 void *adr; 318 void *adr;
319 319
320 if (after_bootmem) { 320 if (after_bootmem) {
@@ -324,7 +324,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
324 return adr; 324 return adr;
325 } 325 }
326 326
327 if (pfn >= e820_table_top) 327 if (pfn >= pgt_buf_top)
328 panic("alloc_low_page: ran out of memory"); 328 panic("alloc_low_page: ran out of memory");
329 329
330 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); 330 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -333,12 +333,28 @@ static __ref void *alloc_low_page(unsigned long *phys)
333 return adr; 333 return adr;
334} 334}
335 335
336static __ref void *map_low_page(void *virt)
337{
338 void *adr;
339 unsigned long phys, left;
340
341 if (after_bootmem)
342 return virt;
343
344 phys = __pa(virt);
345 left = phys & (PAGE_SIZE - 1);
346 adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
347 adr = (void *)(((unsigned long)adr) | left);
348
349 return adr;
350}
351
336static __ref void unmap_low_page(void *adr) 352static __ref void unmap_low_page(void *adr)
337{ 353{
338 if (after_bootmem) 354 if (after_bootmem)
339 return; 355 return;
340 356
341 early_iounmap(adr, PAGE_SIZE); 357 early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
342} 358}
343 359
344static unsigned long __meminit 360static unsigned long __meminit
@@ -386,15 +402,6 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
386} 402}
387 403
388static unsigned long __meminit 404static unsigned long __meminit
389phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
390 pgprot_t prot)
391{
392 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
393
394 return phys_pte_init(pte, address, end, prot);
395}
396
397static unsigned long __meminit
398phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, 405phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
399 unsigned long page_size_mask, pgprot_t prot) 406 unsigned long page_size_mask, pgprot_t prot)
400{ 407{
@@ -420,8 +427,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
420 if (pmd_val(*pmd)) { 427 if (pmd_val(*pmd)) {
421 if (!pmd_large(*pmd)) { 428 if (!pmd_large(*pmd)) {
422 spin_lock(&init_mm.page_table_lock); 429 spin_lock(&init_mm.page_table_lock);
423 last_map_addr = phys_pte_update(pmd, address, 430 pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
431 last_map_addr = phys_pte_init(pte, address,
424 end, prot); 432 end, prot);
433 unmap_low_page(pte);
425 spin_unlock(&init_mm.page_table_lock); 434 spin_unlock(&init_mm.page_table_lock);
426 continue; 435 continue;
427 } 436 }
@@ -468,18 +477,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
468} 477}
469 478
470static unsigned long __meminit 479static unsigned long __meminit
471phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
472 unsigned long page_size_mask, pgprot_t prot)
473{
474 pmd_t *pmd = pmd_offset(pud, 0);
475 unsigned long last_map_addr;
476
477 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
478 __flush_tlb_all();
479 return last_map_addr;
480}
481
482static unsigned long __meminit
483phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, 480phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
484 unsigned long page_size_mask) 481 unsigned long page_size_mask)
485{ 482{
@@ -504,8 +501,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
504 501
505 if (pud_val(*pud)) { 502 if (pud_val(*pud)) {
506 if (!pud_large(*pud)) { 503 if (!pud_large(*pud)) {
507 last_map_addr = phys_pmd_update(pud, addr, end, 504 pmd = map_low_page(pmd_offset(pud, 0));
505 last_map_addr = phys_pmd_init(pmd, addr, end,
508 page_size_mask, prot); 506 page_size_mask, prot);
507 unmap_low_page(pmd);
508 __flush_tlb_all();
509 continue; 509 continue;
510 } 510 }
511 /* 511 /*
@@ -553,17 +553,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
553 return last_map_addr; 553 return last_map_addr;
554} 554}
555 555
556static unsigned long __meminit
557phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
558 unsigned long page_size_mask)
559{
560 pud_t *pud;
561
562 pud = (pud_t *)pgd_page_vaddr(*pgd);
563
564 return phys_pud_init(pud, addr, end, page_size_mask);
565}
566
567unsigned long __meminit 556unsigned long __meminit
568kernel_physical_mapping_init(unsigned long start, 557kernel_physical_mapping_init(unsigned long start,
569 unsigned long end, 558 unsigned long end,
@@ -587,8 +576,10 @@ kernel_physical_mapping_init(unsigned long start,
587 next = end; 576 next = end;
588 577
589 if (pgd_val(*pgd)) { 578 if (pgd_val(*pgd)) {
590 last_map_addr = phys_pud_update(pgd, __pa(start), 579 pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
580 last_map_addr = phys_pud_init(pud, __pa(start),
591 __pa(end), page_size_mask); 581 __pa(end), page_size_mask);
582 unmap_low_page(pud);
592 continue; 583 continue;
593 } 584 }
594 585
@@ -612,10 +603,9 @@ kernel_physical_mapping_init(unsigned long start,
612} 603}
613 604
614#ifndef CONFIG_NUMA 605#ifndef CONFIG_NUMA
615void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 606void __init initmem_init(void)
616 int acpi, int k8)
617{ 607{
618 memblock_x86_register_active_regions(0, start_pfn, end_pfn); 608 memblock_x86_register_active_regions(0, 0, max_pfn);
619} 609}
620#endif 610#endif
621 611
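
map_low_page() smuggles the sub-page offset through the returned pointer: it maps the page-aligned physical address and ORs the low bits back in, which is why unmap_low_page() now masks with PAGE_MASK before calling early_iounmap(). The round-trip in isolation, shown as plain arithmetic on a hypothetical address:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    int main(void)
    {
        unsigned long phys   = 0x12345678UL;           /* hypothetical physical addr */
        unsigned long left   = phys & (PAGE_SIZE - 1); /* offset within the page */
        unsigned long mapped = 0xffe00000UL;           /* pretend early_memremap() result */
        unsigned long adr    = mapped | left;          /* what map_low_page() returns */

        printf("caller sees %#lx, unmap masks back to %#lx\n",
               adr, adr & PAGE_MASK);
        return 0;
    }
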
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index ebf6d7887a3..9559d360fde 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -26,12 +26,50 @@ static __init int numa_setup(char *opt)
26early_param("numa", numa_setup); 26early_param("numa", numa_setup);
27 27
28/* 28/*
29 * Which logical CPUs are on which nodes 29 * apicid, cpu, node mappings
30 */ 30 */
31s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
32 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
33};
34
31cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 35cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
32EXPORT_SYMBOL(node_to_cpumask_map); 36EXPORT_SYMBOL(node_to_cpumask_map);
33 37
34/* 38/*
39 * Map cpu index to node index
40 */
41DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
42EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
43
44void __cpuinit numa_set_node(int cpu, int node)
45{
46 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
47
48 /* early setting, no percpu area yet */
49 if (cpu_to_node_map) {
50 cpu_to_node_map[cpu] = node;
51 return;
52 }
53
54#ifdef CONFIG_DEBUG_PER_CPU_MAPS
55 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
56 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
57 dump_stack();
58 return;
59 }
60#endif
61 per_cpu(x86_cpu_to_node_map, cpu) = node;
62
63 if (node != NUMA_NO_NODE)
64 set_cpu_numa_node(cpu, node);
65}
66
67void __cpuinit numa_clear_node(int cpu)
68{
69 numa_set_node(cpu, NUMA_NO_NODE);
70}
71
72/*
35 * Allocate node_to_cpumask_map based on number of available nodes 73 * Allocate node_to_cpumask_map based on number of available nodes
36 * Requires node_possible_map to be valid. 74 * Requires node_possible_map to be valid.
37 * 75 *
@@ -57,7 +95,174 @@ void __init setup_node_to_cpumask_map(void)
57 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 95 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
58} 96}
59 97
60#ifdef CONFIG_DEBUG_PER_CPU_MAPS 98/*
99 * There are unfortunately some poorly designed mainboards around that
100 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 101 * mapping. To avoid this, fill in the mapping for all possible CPUs,
 102 * since the number of CPUs is not known yet. We round-robin over the
 103 * existing nodes.
104 */
105void __init numa_init_array(void)
106{
107 int rr, i;
108
109 rr = first_node(node_online_map);
110 for (i = 0; i < nr_cpu_ids; i++) {
111 if (early_cpu_to_node(i) != NUMA_NO_NODE)
112 continue;
113 numa_set_node(i, rr);
114 rr = next_node(rr, node_online_map);
115 if (rr == MAX_NUMNODES)
116 rr = first_node(node_online_map);
117 }
118}
119
120static __init int find_near_online_node(int node)
121{
122 int n, val;
123 int min_val = INT_MAX;
124 int best_node = -1;
125
126 for_each_online_node(n) {
127 val = node_distance(node, n);
128
129 if (val < min_val) {
130 min_val = val;
131 best_node = n;
132 }
133 }
134
135 return best_node;
136}
137
138/*
139 * Setup early cpu_to_node.
140 *
 141 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 142 * apicid_to_node[] tables have valid entries for a CPU.
 143 * This means we skip cpu_to_node[] initialisation for NUMA
 144 * emulation and the faked-node case (when running a kernel compiled
 145 * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
 146 * is already initialized in a round-robin manner in numa_init_array,
 147 * prior to this call, and that initialization is good enough
 148 * for the fake NUMA cases.
149 *
150 * Called before the per_cpu areas are setup.
151 */
152void __init init_cpu_to_node(void)
153{
154 int cpu;
155 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
156
157 BUG_ON(cpu_to_apicid == NULL);
158
159 for_each_possible_cpu(cpu) {
160 int node = numa_cpu_node(cpu);
161
162 if (node == NUMA_NO_NODE)
163 continue;
164 if (!node_online(node))
165 node = find_near_online_node(node);
166 numa_set_node(cpu, node);
167 }
168}
169
170#ifndef CONFIG_DEBUG_PER_CPU_MAPS
171
172# ifndef CONFIG_NUMA_EMU
173void __cpuinit numa_add_cpu(int cpu)
174{
175 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
176}
177
178void __cpuinit numa_remove_cpu(int cpu)
179{
180 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
181}
182# endif /* !CONFIG_NUMA_EMU */
183
184#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
185
186int __cpu_to_node(int cpu)
187{
188 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
189 printk(KERN_WARNING
190 "cpu_to_node(%d): usage too early!\n", cpu);
191 dump_stack();
192 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
193 }
194 return per_cpu(x86_cpu_to_node_map, cpu);
195}
196EXPORT_SYMBOL(__cpu_to_node);
197
198/*
199 * Same function as cpu_to_node() but used if called before the
200 * per_cpu areas are setup.
201 */
202int early_cpu_to_node(int cpu)
203{
204 if (early_per_cpu_ptr(x86_cpu_to_node_map))
205 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
206
207 if (!cpu_possible(cpu)) {
208 printk(KERN_WARNING
209 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
210 dump_stack();
211 return NUMA_NO_NODE;
212 }
213 return per_cpu(x86_cpu_to_node_map, cpu);
214}
215
216struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
217{
218 int node = early_cpu_to_node(cpu);
219 struct cpumask *mask;
220 char buf[64];
221
222 if (node == NUMA_NO_NODE) {
223 /* early_cpu_to_node() already emits a warning and trace */
224 return NULL;
225 }
226 mask = node_to_cpumask_map[node];
227 if (!mask) {
228 pr_err("node_to_cpumask_map[%i] NULL\n", node);
229 dump_stack();
230 return NULL;
231 }
232
233 cpulist_scnprintf(buf, sizeof(buf), mask);
234 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
235 enable ? "numa_add_cpu" : "numa_remove_cpu",
236 cpu, node, buf);
237 return mask;
238}
239
240# ifndef CONFIG_NUMA_EMU
241static void __cpuinit numa_set_cpumask(int cpu, int enable)
242{
243 struct cpumask *mask;
244
245 mask = debug_cpumask_set_cpu(cpu, enable);
246 if (!mask)
247 return;
248
249 if (enable)
250 cpumask_set_cpu(cpu, mask);
251 else
252 cpumask_clear_cpu(cpu, mask);
253}
254
255void __cpuinit numa_add_cpu(int cpu)
256{
257 numa_set_cpumask(cpu, 1);
258}
259
260void __cpuinit numa_remove_cpu(int cpu)
261{
262 numa_set_cpumask(cpu, 0);
263}
264# endif /* !CONFIG_NUMA_EMU */
265
61/* 266/*
62 * Returns a pointer to the bitmask of CPUs on Node 'node'. 267 * Returns a pointer to the bitmask of CPUs on Node 'node'.
63 */ 268 */
@@ -80,4 +285,5 @@ const struct cpumask *cpumask_of_node(int node)
80 return node_to_cpumask_map[node]; 285 return node_to_cpumask_map[node];
81} 286}
82EXPORT_SYMBOL(cpumask_of_node); 287EXPORT_SYMBOL(cpumask_of_node);
83#endif 288
289#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
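
numa_init_array() papers over firmware that leaves CPUs without a node by dealing the remaining CPUs across the online nodes in order. A userspace model of that loop, assuming 8 possible CPUs, 2 online nodes and no firmware-provided mapping:

    #include <stdio.h>

    #define NR_CPUS  8
    #define NR_NODES 2

    int main(void)
    {
        int cpu_to_node[NR_CPUS];
        int rr = 0, i;

        for (i = 0; i < NR_CPUS; i++)
            cpu_to_node[i] = -1;              /* stands in for NUMA_NO_NODE */

        for (i = 0; i < NR_CPUS; i++) {
            if (cpu_to_node[i] != -1)
                continue;                     /* firmware already placed it */
            cpu_to_node[i] = rr;
            rr = (rr + 1) % NR_NODES;         /* wrap like next_node() */
        }
        for (i = 0; i < NR_CPUS; i++)
            printf("cpu%d -> node%d\n", i, cpu_to_node[i]);
        return 0;
    }
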
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 84a3e4c9f27..bde3906420d 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -110,6 +110,12 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
110 110
111static unsigned long kva_start_pfn; 111static unsigned long kva_start_pfn;
112static unsigned long kva_pages; 112static unsigned long kva_pages;
113
114int __cpuinit numa_cpu_node(int cpu)
115{
116 return apic->x86_32_numa_cpu_node(cpu);
117}
118
113/* 119/*
114 * FLAT - support for basic PC memory model with discontig enabled, essentially 120 * FLAT - support for basic PC memory model with discontig enabled, essentially
115 * a single node with all available processors in it with a flat 121 * a single node with all available processors in it with a flat
@@ -346,8 +352,7 @@ static void init_remap_allocator(int nid)
346 (ulong) node_remap_end_vaddr[nid]); 352 (ulong) node_remap_end_vaddr[nid]);
347} 353}
348 354
349void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 355void __init initmem_init(void)
350 int acpi, int k8)
351{ 356{
352 int nid; 357 int nid;
353 long kva_target_pfn; 358 long kva_target_pfn;
@@ -361,6 +366,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
361 */ 366 */
362 367
363 get_memcfg_numa(); 368 get_memcfg_numa();
369 numa_init_array();
364 370
365 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); 371 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
366 372
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 1337c51b07d..9ec0f209a6a 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -13,31 +13,30 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/nodemask.h> 14#include <linux/nodemask.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/acpi.h>
16 17
17#include <asm/e820.h> 18#include <asm/e820.h>
18#include <asm/proto.h> 19#include <asm/proto.h>
19#include <asm/dma.h> 20#include <asm/dma.h>
20#include <asm/numa.h>
21#include <asm/acpi.h> 21#include <asm/acpi.h>
22#include <asm/amd_nb.h> 22#include <asm/amd_nb.h>
23 23
24#include "numa_internal.h"
25
24struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
25EXPORT_SYMBOL(node_data); 27EXPORT_SYMBOL(node_data);
26 28
27struct memnode memnode; 29nodemask_t numa_nodes_parsed __initdata;
28 30
29s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 31struct memnode memnode;
30 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
31};
32 32
33static unsigned long __initdata nodemap_addr; 33static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size; 34static unsigned long __initdata nodemap_size;
35 35
36/* 36static struct numa_meminfo numa_meminfo __initdata;
37 * Map cpu index to node index 37
38 */ 38static int numa_distance_cnt;
39DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); 39static u8 *numa_distance;
40EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
41 40
42/* 41/*
43 * Given a shift value, try to populate memnodemap[] 42 * Given a shift value, try to populate memnodemap[]
@@ -46,16 +45,15 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
 46 * 0 if memnodemap[] too small (or shift too small) 45 * 0 if memnodemap[] too small (or shift too small)
47 * -1 if node overlap or lost ram (shift too big) 46 * -1 if node overlap or lost ram (shift too big)
48 */ 47 */
49static int __init populate_memnodemap(const struct bootnode *nodes, 48static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
50 int numnodes, int shift, int *nodeids)
51{ 49{
52 unsigned long addr, end; 50 unsigned long addr, end;
53 int i, res = -1; 51 int i, res = -1;
54 52
55 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); 53 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
56 for (i = 0; i < numnodes; i++) { 54 for (i = 0; i < mi->nr_blks; i++) {
57 addr = nodes[i].start; 55 addr = mi->blk[i].start;
58 end = nodes[i].end; 56 end = mi->blk[i].end;
59 if (addr >= end) 57 if (addr >= end)
60 continue; 58 continue;
61 if ((end >> shift) >= memnodemapsize) 59 if ((end >> shift) >= memnodemapsize)
@@ -63,12 +61,7 @@ static int __init populate_memnodemap(const struct bootnode *nodes,
63 do { 61 do {
64 if (memnodemap[addr >> shift] != NUMA_NO_NODE) 62 if (memnodemap[addr >> shift] != NUMA_NO_NODE)
65 return -1; 63 return -1;
66 64 memnodemap[addr >> shift] = mi->blk[i].nid;
67 if (!nodeids)
68 memnodemap[addr >> shift] = i;
69 else
70 memnodemap[addr >> shift] = nodeids[i];
71
72 addr += (1UL << shift); 65 addr += (1UL << shift);
73 } while (addr < end); 66 } while (addr < end);
74 res = 1; 67 res = 1;
@@ -86,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void)
86 79
87 addr = 0x8000; 80 addr = 0x8000;
88 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 81 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
89 nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT, 82 nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
90 nodemap_size, L1_CACHE_BYTES); 83 nodemap_size, L1_CACHE_BYTES);
91 if (nodemap_addr == MEMBLOCK_ERROR) { 84 if (nodemap_addr == MEMBLOCK_ERROR) {
92 printk(KERN_ERR 85 printk(KERN_ERR
@@ -106,16 +99,15 @@ static int __init allocate_cachealigned_memnodemap(void)
106 * The LSB of all start and end addresses in the node map is the value of the 99 * The LSB of all start and end addresses in the node map is the value of the
107 * maximum possible shift. 100 * maximum possible shift.
108 */ 101 */
109static int __init extract_lsb_from_nodes(const struct bootnode *nodes, 102static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
110 int numnodes)
111{ 103{
112 int i, nodes_used = 0; 104 int i, nodes_used = 0;
113 unsigned long start, end; 105 unsigned long start, end;
114 unsigned long bitfield = 0, memtop = 0; 106 unsigned long bitfield = 0, memtop = 0;
115 107
116 for (i = 0; i < numnodes; i++) { 108 for (i = 0; i < mi->nr_blks; i++) {
117 start = nodes[i].start; 109 start = mi->blk[i].start;
118 end = nodes[i].end; 110 end = mi->blk[i].end;
119 if (start >= end) 111 if (start >= end)
120 continue; 112 continue;
121 bitfield |= start; 113 bitfield |= start;
@@ -131,18 +123,17 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
131 return i; 123 return i;
132} 124}
133 125
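To make the shift derivation above concrete, here is a hedged worked example (the block boundaries are made up):

	/* If every block start/end is a multiple of 256MB, the lowest set
	 * bit across all boundaries is bit 28: */
	unsigned long bitfield = 0x10000000UL | 0x30000000UL | 0x60000000UL;
	int shift = __ffs(bitfield);	/* == 28 */
	/* memnodemap[addr >> 28] then holds one s16 node id per 256MB. */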
134int __init compute_hash_shift(struct bootnode *nodes, int numnodes, 126static int __init compute_hash_shift(const struct numa_meminfo *mi)
135 int *nodeids)
136{ 127{
137 int shift; 128 int shift;
138 129
139 shift = extract_lsb_from_nodes(nodes, numnodes); 130 shift = extract_lsb_from_nodes(mi);
140 if (allocate_cachealigned_memnodemap()) 131 if (allocate_cachealigned_memnodemap())
141 return -1; 132 return -1;
142 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", 133 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
143 shift); 134 shift);
144 135
145 if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { 136 if (populate_memnodemap(mi, shift) != 1) {
146 printk(KERN_INFO "Your memory is not aligned, you need to " 137 printk(KERN_INFO "Your memory is not aligned, you need to "
147 "rebuild your kernel with a bigger NODEMAPSIZE " 138 "rebuild your kernel with a bigger NODEMAPSIZE "
148 "shift=%d\n", shift); 139 "shift=%d\n", shift);
@@ -188,6 +179,63 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
188 return NULL; 179 return NULL;
189} 180}
190 181
182static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
183 struct numa_meminfo *mi)
184{
185 /* ignore zero length blks */
186 if (start == end)
187 return 0;
188
189 /* whine about and ignore invalid blks */
190 if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
191 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
192 nid, start, end);
193 return 0;
194 }
195
196 if (mi->nr_blks >= NR_NODE_MEMBLKS) {
197 pr_err("NUMA: too many memblk ranges\n");
198 return -EINVAL;
199 }
200
201 mi->blk[mi->nr_blks].start = start;
202 mi->blk[mi->nr_blks].end = end;
203 mi->blk[mi->nr_blks].nid = nid;
204 mi->nr_blks++;
205 return 0;
206}
207
208/**
209 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
210 * @idx: Index of memblk to remove
211 * @mi: numa_meminfo to remove memblk from
212 *
213 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
214 * decrementing @mi->nr_blks.
215 */
216void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
217{
218 mi->nr_blks--;
219 memmove(&mi->blk[idx], &mi->blk[idx + 1],
220 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
221}
222
223/**
224 * numa_add_memblk - Add one numa_memblk to numa_meminfo
225 * @nid: NUMA node ID of the new memblk
226 * @start: Start address of the new memblk
227 * @end: End address of the new memblk
228 *
229 * Add a new memblk to the default numa_meminfo.
230 *
231 * RETURNS:
232 * 0 on success, -errno on failure.
233 */
234int __init numa_add_memblk(int nid, u64 start, u64 end)
235{
236 return numa_add_memblk_to(nid, start, end, &numa_meminfo);
237}
238
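As a hedged usage sketch, an init_func handed to numa_init() later in this patch could register its topology through this API roughly as follows (the two-node layout is hypothetical):

	static int __init example_numa_init(void)
	{
		/* Two fake nodes of 2GB each; ranges are [start, end) bytes. */
		if (numa_add_memblk(0, 0x00000000ULL, 0x80000000ULL) < 0)
			return -EINVAL;
		if (numa_add_memblk(1, 0x80000000ULL, 0x100000000ULL) < 0)
			return -EINVAL;
		node_set(0, numa_nodes_parsed);
		node_set(1, numa_nodes_parsed);
		return 0;
	}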
191/* Initialize bootmem allocator for a node */ 239/* Initialize bootmem allocator for a node */
192void __init 240void __init
193setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 241setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
@@ -234,692 +282,386 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
234 node_set_online(nodeid); 282 node_set_online(nodeid);
235} 283}
236 284
237/* 285/**
238 * There are unfortunately some poorly designed mainboards around that 286 * numa_cleanup_meminfo - Cleanup a numa_meminfo
239 * only connect memory to a single CPU. This breaks the 1:1 cpu->node 287 * @mi: numa_meminfo to clean up
240 * mapping. To avoid this fill in the mapping for all possible CPUs, 288 *
241 * as the number of CPUs is not known yet. We round robin the existing 289 * Sanitize @mi by merging and removing unncessary memblks. Also check for
242 * nodes. 290 * conflicts and clear unused memblks.
291 *
292 * RETURNS:
293 * 0 on success, -errno on failure.
243 */ 294 */
244void __init numa_init_array(void) 295int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
245{ 296{
246 int rr, i; 297 const u64 low = 0;
298 const u64 high = (u64)max_pfn << PAGE_SHIFT;
299 int i, j, k;
247 300
248 rr = first_node(node_online_map); 301 for (i = 0; i < mi->nr_blks; i++) {
249 for (i = 0; i < nr_cpu_ids; i++) { 302 struct numa_memblk *bi = &mi->blk[i];
250 if (early_cpu_to_node(i) != NUMA_NO_NODE)
251 continue;
252 numa_set_node(i, rr);
253 rr = next_node(rr, node_online_map);
254 if (rr == MAX_NUMNODES)
255 rr = first_node(node_online_map);
256 }
257}
258
259#ifdef CONFIG_NUMA_EMU
260/* Numa emulation */
261static struct bootnode nodes[MAX_NUMNODES] __initdata;
262static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
263static char *cmdline __initdata;
264 303
265void __init numa_emu_cmdline(char *str) 304 /* make sure all blocks are inside the limits */
266{ 305 bi->start = max(bi->start, low);
267 cmdline = str; 306 bi->end = min(bi->end, high);
268}
269 307
270static int __init setup_physnodes(unsigned long start, unsigned long end, 308 /* and there's no empty block */
271 int acpi, int amd) 309 if (bi->start == bi->end) {
272{ 310 numa_remove_memblk_from(i--, mi);
273 int ret = 0;
274 int i;
275
276 memset(physnodes, 0, sizeof(physnodes));
277#ifdef CONFIG_ACPI_NUMA
278 if (acpi)
279 acpi_get_nodes(physnodes, start, end);
280#endif
281#ifdef CONFIG_AMD_NUMA
282 if (amd)
283 amd_get_nodes(physnodes);
284#endif
285 /*
286 * Basic sanity checking on the physical node map: there may be errors
287 * if the SRAT or AMD code incorrectly reported the topology or the mem=
288 * kernel parameter is used.
289 */
290 for (i = 0; i < MAX_NUMNODES; i++) {
291 if (physnodes[i].start == physnodes[i].end)
292 continue;
293 if (physnodes[i].start > end) {
294 physnodes[i].end = physnodes[i].start;
295 continue;
296 }
297 if (physnodes[i].end < start) {
298 physnodes[i].start = physnodes[i].end;
299 continue; 311 continue;
300 } 312 }
301 if (physnodes[i].start < start)
302 physnodes[i].start = start;
303 if (physnodes[i].end > end)
304 physnodes[i].end = end;
305 ret++;
306 }
307 313
308 /* 314 for (j = i + 1; j < mi->nr_blks; j++) {
309 * If no physical topology was detected, a single node is faked to cover 315 struct numa_memblk *bj = &mi->blk[j];
310 * the entire address space. 316 unsigned long start, end;
311 */
312 if (!ret) {
313 physnodes[ret].start = start;
314 physnodes[ret].end = end;
315 ret = 1;
316 }
317 return ret;
318}
319
320static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
321{
322 int i;
323
324 BUG_ON(acpi && amd);
325#ifdef CONFIG_ACPI_NUMA
326 if (acpi)
327 acpi_fake_nodes(nodes, nr_nodes);
328#endif
329#ifdef CONFIG_AMD_NUMA
330 if (amd)
331 amd_fake_nodes(nodes, nr_nodes);
332#endif
333 if (!acpi && !amd)
334 for (i = 0; i < nr_cpu_ids; i++)
335 numa_set_node(i, 0);
336}
337
338/*
339 * Sets up nid to range from addr to addr + size. If the end
340 * boundary is greater than max_addr, then max_addr is used instead.
341 * The return value is 0 if there is additional memory left for
342 * allocation past addr and -1 otherwise. addr is adjusted to be at
343 * the end of the node.
344 */
345static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
346{
347 int ret = 0;
348 nodes[nid].start = *addr;
349 *addr += size;
350 if (*addr >= max_addr) {
351 *addr = max_addr;
352 ret = -1;
353 }
354 nodes[nid].end = *addr;
355 node_set(nid, node_possible_map);
356 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
357 nodes[nid].start, nodes[nid].end,
358 (nodes[nid].end - nodes[nid].start) >> 20);
359 return ret;
360}
361
362/*
363 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
364 * to max_addr. The return value is the number of nodes allocated.
365 */
366static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
367{
368 nodemask_t physnode_mask = NODE_MASK_NONE;
369 u64 size;
370 int big;
371 int ret = 0;
372 int i;
373
374 if (nr_nodes <= 0)
375 return -1;
376 if (nr_nodes > MAX_NUMNODES) {
377 pr_info("numa=fake=%d too large, reducing to %d\n",
378 nr_nodes, MAX_NUMNODES);
379 nr_nodes = MAX_NUMNODES;
380 }
381
382 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
383 /*
384 * Calculate the number of big nodes that can be allocated as a result
385 * of consolidating the remainder.
386 */
387 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
388 FAKE_NODE_MIN_SIZE;
389
390 size &= FAKE_NODE_MIN_HASH_MASK;
391 if (!size) {
392 pr_err("Not enough memory for each node. "
393 "NUMA emulation disabled.\n");
394 return -1;
395 }
396
397 for (i = 0; i < MAX_NUMNODES; i++)
398 if (physnodes[i].start != physnodes[i].end)
399 node_set(i, physnode_mask);
400
401 /*
402 * Continue to fill physical nodes with fake nodes until there is no
403 * memory left on any of them.
404 */
405 while (nodes_weight(physnode_mask)) {
406 for_each_node_mask(i, physnode_mask) {
407 u64 end = physnodes[i].start + size;
408 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
409
410 if (ret < big)
411 end += FAKE_NODE_MIN_SIZE;
412 317
413 /* 318 /*
414 * Continue to add memory to this fake node if its 319 * See whether there are overlapping blocks. Whine
415 * non-reserved memory is less than the per-node size. 320 * about but allow overlaps of the same nid. They
321 * will be merged below.
416 */ 322 */
417 while (end - physnodes[i].start - 323 if (bi->end > bj->start && bi->start < bj->end) {
418 memblock_x86_hole_size(physnodes[i].start, end) < size) { 324 if (bi->nid != bj->nid) {
419 end += FAKE_NODE_MIN_SIZE; 325 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
420 if (end > physnodes[i].end) { 326 bi->nid, bi->start, bi->end,
421 end = physnodes[i].end; 327 bj->nid, bj->start, bj->end);
422 break; 328 return -EINVAL;
423 } 329 }
330 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
331 bi->nid, bi->start, bi->end,
332 bj->start, bj->end);
424 } 333 }
425 334
426 /* 335 /*
427 * If there won't be at least FAKE_NODE_MIN_SIZE of 336 * Join together blocks on the same node, holes
428 * non-reserved memory in ZONE_DMA32 for the next node, 337 * between which don't overlap with memory on other
429 * this one must extend to the boundary. 338 * nodes.
430 */
431 if (end < dma32_end && dma32_end - end -
432 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
433 end = dma32_end;
434
435 /*
436 * If there won't be enough non-reserved memory for the
437 * next node, this one must extend to the end of the
438 * physical node.
439 */ 339 */
440 if (physnodes[i].end - end - 340 if (bi->nid != bj->nid)
441 memblock_x86_hole_size(end, physnodes[i].end) < size) 341 continue;
442 end = physnodes[i].end; 342 start = max(min(bi->start, bj->start), low);
443 343 end = min(max(bi->end, bj->end), high);
444 /* 344 for (k = 0; k < mi->nr_blks; k++) {
445 * Avoid allocating more nodes than requested, which can 345 struct numa_memblk *bk = &mi->blk[k];
446 * happen as a result of rounding down each node's size 346
447 * to FAKE_NODE_MIN_SIZE. 347 if (bi->nid == bk->nid)
448 */ 348 continue;
449 if (nodes_weight(physnode_mask) + ret >= nr_nodes) 349 if (start < bk->end && end > bk->start)
450 end = physnodes[i].end; 350 break;
451 351 }
452 if (setup_node_range(ret++, &physnodes[i].start, 352 if (k < mi->nr_blks)
453 end - physnodes[i].start, 353 continue;
454 physnodes[i].end) < 0) 354 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
455 node_clear(i, physnode_mask); 355 bi->nid, bi->start, bi->end, bj->start, bj->end,
356 start, end);
357 bi->start = start;
358 bi->end = end;
359 numa_remove_memblk_from(j--, mi);
456 } 360 }
457 } 361 }
458 return ret;
459}
460
461/*
462 * Returns the end address of a node so that there is at least `size' amount of
463 * non-reserved memory or `max_addr' is reached.
464 */
465static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
466{
467 u64 end = start + size;
468 362
469 while (end - start - memblock_x86_hole_size(start, end) < size) { 363 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
470 end += FAKE_NODE_MIN_SIZE; 364 mi->blk[i].start = mi->blk[i].end = 0;
471 if (end > max_addr) { 365 mi->blk[i].nid = NUMA_NO_NODE;
472 end = max_addr;
473 break;
474 }
475 } 366 }
476 return end; 367
368 return 0;
477} 369}
478 370
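A hedged illustration of what numa_cleanup_meminfo() does with a typical input (the layout is invented):

	static struct numa_meminfo mi __initdata = {
		.nr_blks = 2,
		.blk = {
			{ .start = 0x00000000, .end = 0x40000000, .nid = 0 },
			{ .start = 0x40000000, .end = 0x80000000, .nid = 0 },
		},
	};
	/* numa_cleanup_meminfo(&mi) merges the two adjacent nid-0 blocks
	 * into one [0, 0x80000000) block, since no other node's memory
	 * lies between them. Overlapping blocks with different nids would
	 * instead make it return -EINVAL. */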
479/* 371/*
480 * Sets up fake nodes of `size' interleaved over physical nodes ranging from 372 * Set nodes, which have memory in @mi, in *@nodemask.
481 * `addr' to `max_addr'. The return value is the number of nodes allocated.
482 */ 373 */
483static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) 374static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
375 const struct numa_meminfo *mi)
484{ 376{
485 nodemask_t physnode_mask = NODE_MASK_NONE;
486 u64 min_size;
487 int ret = 0;
488 int i; 377 int i;
489 378
490 if (!size) 379 for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
491 return -1; 380 if (mi->blk[i].start != mi->blk[i].end &&
492 /* 381 mi->blk[i].nid != NUMA_NO_NODE)
493 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is 382 node_set(mi->blk[i].nid, *nodemask);
494 * increased accordingly if the requested size is too small. This 383}
495 * creates a uniform distribution of node sizes across the entire
496 * machine (but not necessarily over physical nodes).
497 */
498 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
499 MAX_NUMNODES;
500 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
501 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
502 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
503 FAKE_NODE_MIN_HASH_MASK;
504 if (size < min_size) {
505 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
506 size >> 20, min_size >> 20);
507 size = min_size;
508 }
509 size &= FAKE_NODE_MIN_HASH_MASK;
510
511 for (i = 0; i < MAX_NUMNODES; i++)
512 if (physnodes[i].start != physnodes[i].end)
513 node_set(i, physnode_mask);
514 /*
515 * Fill physical nodes with fake nodes of size until there is no memory
516 * left on any of them.
517 */
518 while (nodes_weight(physnode_mask)) {
519 for_each_node_mask(i, physnode_mask) {
520 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
521 u64 end;
522
523 end = find_end_of_node(physnodes[i].start,
524 physnodes[i].end, size);
525 /*
526 * If there won't be at least FAKE_NODE_MIN_SIZE of
527 * non-reserved memory in ZONE_DMA32 for the next node,
528 * this one must extend to the boundary.
529 */
530 if (end < dma32_end && dma32_end - end -
531 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
532 end = dma32_end;
533 384
534 /* 385/**
535 * If there won't be enough non-reserved memory for the 386 * numa_reset_distance - Reset NUMA distance table
536 * next node, this one must extend to the end of the 387 *
537 * physical node. 388 * The current table is freed. The next numa_set_distance() call will
538 */ 389 * create a new one.
539 if (physnodes[i].end - end - 390 */
540 memblock_x86_hole_size(end, physnodes[i].end) < size) 391void __init numa_reset_distance(void)
541 end = physnodes[i].end; 392{
393 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
542 394
543 /* 395 /* numa_distance could be 1LU marking allocation failure, test cnt */
544 * Setup the fake node that will be allocated as bootmem 396 if (numa_distance_cnt)
545 * later. If setup_node_range() returns non-zero, there 397 memblock_x86_free_range(__pa(numa_distance),
546 * is no more memory available on this physical node. 398 __pa(numa_distance) + size);
547 */ 399 numa_distance_cnt = 0;
548 if (setup_node_range(ret++, &physnodes[i].start, 400 numa_distance = NULL; /* enable table creation */
549 end - physnodes[i].start,
550 physnodes[i].end) < 0)
551 node_clear(i, physnode_mask);
552 }
553 }
554 return ret;
555} 401}
556 402
557/* 403static int __init numa_alloc_distance(void)
558 * Sets up the system RAM area from start_pfn to last_pfn according to the
559 * numa=fake command-line option.
560 */
561static int __init numa_emulation(unsigned long start_pfn,
562 unsigned long last_pfn, int acpi, int amd)
563{ 404{
564 u64 addr = start_pfn << PAGE_SHIFT; 405 nodemask_t nodes_parsed;
565 u64 max_addr = last_pfn << PAGE_SHIFT; 406 size_t size;
566 int num_nodes; 407 int i, j, cnt = 0;
567 int i; 408 u64 phys;
568 409
569 /* 410 /* size the new table and allocate it */
570 * If the numa=fake command-line contains a 'M' or 'G', it represents 411 nodes_parsed = numa_nodes_parsed;
571 * the fixed node size. Otherwise, if it is just a single number N, 412 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
572 * split the system RAM into N fake nodes.
573 */
574 if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
575 u64 size;
576 413
577 size = memparse(cmdline, &cmdline); 414 for_each_node_mask(i, nodes_parsed)
578 num_nodes = split_nodes_size_interleave(addr, max_addr, size); 415 cnt = i;
579 } else { 416 cnt++;
580 unsigned long n; 417 size = cnt * cnt * sizeof(numa_distance[0]);
581 418
582 n = simple_strtoul(cmdline, NULL, 0); 419 phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
583 num_nodes = split_nodes_interleave(addr, max_addr, n); 420 size, PAGE_SIZE);
421 if (phys == MEMBLOCK_ERROR) {
422 pr_warning("NUMA: Warning: can't allocate distance table!\n");
423 /* don't retry until explicitly reset */
424 numa_distance = (void *)1LU;
425 return -ENOMEM;
584 } 426 }
427 memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
585 428
586 if (num_nodes < 0) 429 numa_distance = __va(phys);
587 return num_nodes; 430 numa_distance_cnt = cnt;
588 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); 431
589 if (memnode_shift < 0) { 432 /* fill with the default distances */
590 memnode_shift = 0; 433 for (i = 0; i < cnt; i++)
591 printk(KERN_ERR "No NUMA hash function found. NUMA emulation " 434 for (j = 0; j < cnt; j++)
592 "disabled.\n"); 435 numa_distance[i * cnt + j] = i == j ?
593 return -1; 436 LOCAL_DISTANCE : REMOTE_DISTANCE;
594 } 437 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
595 438
596 /*
597 * We need to vacate all active ranges that may have been registered for
598 * the e820 memory map.
599 */
600 remove_all_active_ranges();
601 for_each_node_mask(i, node_possible_map) {
602 memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
603 nodes[i].end >> PAGE_SHIFT);
604 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
605 }
606 setup_physnodes(addr, max_addr, acpi, amd);
607 fake_physnodes(acpi, amd, num_nodes);
608 numa_init_array();
609 return 0; 439 return 0;
610} 440}
611#endif /* CONFIG_NUMA_EMU */
612 441
613void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, 442/**
614 int acpi, int amd) 443 * numa_set_distance - Set NUMA distance from one NUMA node to another
444 * @from: the 'from' node to set distance
445 * @to: the 'to' node to set distance
446 * @distance: NUMA distance
447 *
448 * Set the distance from node @from to @to to @distance. If the distance table
449 * doesn't exist, one which is large enough to accommodate all the currently
450 * known nodes will be created.
451 *
452 * If such a table cannot be allocated, a warning is printed and further
453 * calls are ignored until the distance table is reset with
454 * numa_reset_distance().
455 *
456 * If @from or @to is higher than the highest known node at the time of
457 * table creation or @distance doesn't make sense, the call is ignored.
458 * This is to allow simplification of specific NUMA config implementations.
459 */
460void __init numa_set_distance(int from, int to, int distance)
615{ 461{
616 int i; 462 if (!numa_distance && numa_alloc_distance() < 0)
617
618 nodes_clear(node_possible_map);
619 nodes_clear(node_online_map);
620
621#ifdef CONFIG_NUMA_EMU
622 setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
623 acpi, amd);
624 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
625 return; 463 return;
626 setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
627 acpi, amd);
628 nodes_clear(node_possible_map);
629 nodes_clear(node_online_map);
630#endif
631 464
632#ifdef CONFIG_ACPI_NUMA 465 if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
633 if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 466 printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
634 last_pfn << PAGE_SHIFT)) 467 from, to, distance);
635 return; 468 return;
636 nodes_clear(node_possible_map); 469 }
637 nodes_clear(node_online_map);
638#endif
639 470
640#ifdef CONFIG_AMD_NUMA 471 if ((u8)distance != distance ||
641 if (!numa_off && amd && !amd_scan_nodes()) 472 (from == to && distance != LOCAL_DISTANCE)) {
473 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
474 from, to, distance);
642 return; 475 return;
643 nodes_clear(node_possible_map); 476 }
644 nodes_clear(node_online_map);
645#endif
646 printk(KERN_INFO "%s\n",
647 numa_off ? "NUMA turned off" : "No NUMA configuration found");
648 477
649 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 478 numa_distance[from * numa_distance_cnt + to] = distance;
650 start_pfn << PAGE_SHIFT,
651 last_pfn << PAGE_SHIFT);
652 /* setup dummy node covering all memory */
653 memnode_shift = 63;
654 memnodemap = memnode.embedded_map;
655 memnodemap[0] = 0;
656 node_set_online(0);
657 node_set(0, node_possible_map);
658 for (i = 0; i < nr_cpu_ids; i++)
659 numa_set_node(i, 0);
660 memblock_x86_register_active_regions(0, start_pfn, last_pfn);
661 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
662} 479}
663 480
664unsigned long __init numa_free_all_bootmem(void) 481int __node_distance(int from, int to)
665{ 482{
666 unsigned long pages = 0; 483 if (from >= numa_distance_cnt || to >= numa_distance_cnt)
667 int i; 484 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
485 return numa_distance[from * numa_distance_cnt + to];
486}
487EXPORT_SYMBOL(__node_distance);
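For illustration, a hedged sketch of how a firmware parser fills the table and how it is read back (distance values invented):

	/* During parsing (e.g. from a SLIT-like table): */
	numa_set_distance(0, 0, LOCAL_DISTANCE);	/* 10 */
	numa_set_distance(0, 1, 21);
	numa_set_distance(1, 0, 21);
	/* Later lookups: */
	int d01 = __node_distance(0, 1);	/* 21 */
	int d09 = __node_distance(0, 9);	/* node unknown at table
						 * creation: REMOTE_DISTANCE */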
668 488
669 for_each_online_node(i) 489/*
670 pages += free_all_bootmem_node(NODE_DATA(i)); 490 * Sanity check to catch more bad NUMA configurations (they are amazingly
491 * common). Make sure the nodes cover all memory.
492 */
493static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
494{
495 unsigned long numaram, e820ram;
496 int i;
671 497
672 pages += free_all_memory_core_early(MAX_NUMNODES); 498 numaram = 0;
499 for (i = 0; i < mi->nr_blks; i++) {
500 unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
501 unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
502 numaram += e - s;
503 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
504 if ((long)numaram < 0)
505 numaram = 0;
506 }
673 507
674 return pages; 508 e820ram = max_pfn - (memblock_x86_hole_size(0,
509 max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
510 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
511 if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
512 printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
513 (numaram << PAGE_SHIFT) >> 20,
514 (e820ram << PAGE_SHIFT) >> 20);
515 return false;
516 }
517 return true;
675} 518}
676 519
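As a quick worked check of the slack threshold above (assuming 4KB pages, i.e. PAGE_SHIFT == 12):

	unsigned long slack_pages = 1UL << (20 - 12);	/* 256 pages */
	unsigned long slack_bytes = slack_pages << 12;	/* 1MB */
	/* so up to 1MB of e820 RAM may be uncovered by the NUMA blocks
	 * before the configuration is rejected. */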
677#ifdef CONFIG_NUMA 520static int __init numa_register_memblks(struct numa_meminfo *mi)
678
679static __init int find_near_online_node(int node)
680{ 521{
681 int n, val; 522 int i, nid;
682 int min_val = INT_MAX;
683 int best_node = -1;
684 523
685 for_each_online_node(n) { 524 /* Account for nodes with cpus and no memory */
686 val = node_distance(node, n); 525 node_possible_map = numa_nodes_parsed;
526 numa_nodemask_from_meminfo(&node_possible_map, mi);
527 if (WARN_ON(nodes_empty(node_possible_map)))
528 return -EINVAL;
687 529
688 if (val < min_val) { 530 memnode_shift = compute_hash_shift(mi);
689 min_val = val; 531 if (memnode_shift < 0) {
690 best_node = n; 532 printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
533 return -EINVAL;
534 }
535
536 for (i = 0; i < mi->nr_blks; i++)
537 memblock_x86_register_active_regions(mi->blk[i].nid,
538 mi->blk[i].start >> PAGE_SHIFT,
539 mi->blk[i].end >> PAGE_SHIFT);
540
541 /* for out of order entries */
542 sort_node_map();
543 if (!numa_meminfo_cover_memory(mi))
544 return -EINVAL;
545
546 /* Finally register nodes. */
547 for_each_node_mask(nid, node_possible_map) {
548 u64 start = (u64)max_pfn << PAGE_SHIFT;
549 u64 end = 0;
550
551 for (i = 0; i < mi->nr_blks; i++) {
552 if (nid != mi->blk[i].nid)
553 continue;
554 start = min(mi->blk[i].start, start);
555 end = max(mi->blk[i].end, end);
691 } 556 }
557
558 if (start < end)
559 setup_node_bootmem(nid, start, end);
692 } 560 }
693 561
694 return best_node; 562 return 0;
695} 563}
696 564
697/* 565/**
698 * Setup early cpu_to_node. 566 * dummy_numma_init - Fallback dummy NUMA init
699 * 567 *
700 * Populate cpu_to_node[] only if x86_cpu_to_apicid[], 568 * Used if there's no underlying NUMA architecture, NUMA initialization
701 * and apicid_to_node[] tables have valid entries for a CPU. 569 * fails, or NUMA is disabled on the command line.
702 * This means we skip cpu_to_node[] initialisation for NUMA
703 * emulation and faking node case (when running a kernel compiled
704 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
705 * is already initialized in a round robin manner at numa_init_array,
706 * prior to this call, and this initialization is good enough
707 * for the fake NUMA cases.
708 * 570 *
709 * Called before the per_cpu areas are setup. 571 * Must online at least one node and add memory blocks that cover all
572 * allowed memory. This function must not fail.
710 */ 573 */
711void __init init_cpu_to_node(void) 574static int __init dummy_numa_init(void)
712{ 575{
713 int cpu; 576 printk(KERN_INFO "%s\n",
714 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 577 numa_off ? "NUMA turned off" : "No NUMA configuration found");
715 578 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
716 BUG_ON(cpu_to_apicid == NULL); 579 0LU, max_pfn << PAGE_SHIFT);
717 580
718 for_each_possible_cpu(cpu) { 581 node_set(0, numa_nodes_parsed);
719 int node; 582 numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
720 u16 apicid = cpu_to_apicid[cpu];
721 583
722 if (apicid == BAD_APICID) 584 return 0;
723 continue;
724 node = apicid_to_node[apicid];
725 if (node == NUMA_NO_NODE)
726 continue;
727 if (!node_online(node))
728 node = find_near_online_node(node);
729 numa_set_node(cpu, node);
730 }
731} 585}
732#endif
733 586
734 587static int __init numa_init(int (*init_func)(void))
735void __cpuinit numa_set_node(int cpu, int node)
736{ 588{
737 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); 589 int i;
738 590 int ret;
739 /* early setting, no percpu area yet */
740 if (cpu_to_node_map) {
741 cpu_to_node_map[cpu] = node;
742 return;
743 }
744
745#ifdef CONFIG_DEBUG_PER_CPU_MAPS
746 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
747 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
748 dump_stack();
749 return;
750 }
751#endif
752 per_cpu(x86_cpu_to_node_map, cpu) = node;
753 591
754 if (node != NUMA_NO_NODE) 592 for (i = 0; i < MAX_LOCAL_APIC; i++)
755 set_cpu_numa_node(cpu, node); 593 set_apicid_to_node(i, NUMA_NO_NODE);
756}
757 594
758void __cpuinit numa_clear_node(int cpu) 595 nodes_clear(numa_nodes_parsed);
759{ 596 nodes_clear(node_possible_map);
760 numa_set_node(cpu, NUMA_NO_NODE); 597 nodes_clear(node_online_map);
761} 598 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
762 599 remove_all_active_ranges();
763#ifndef CONFIG_DEBUG_PER_CPU_MAPS 600 numa_reset_distance();
764 601
765#ifndef CONFIG_NUMA_EMU 602 ret = init_func();
766void __cpuinit numa_add_cpu(int cpu) 603 if (ret < 0)
767{ 604 return ret;
768 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 605 ret = numa_cleanup_meminfo(&numa_meminfo);
769} 606 if (ret < 0)
607 return ret;
770 608
771void __cpuinit numa_remove_cpu(int cpu) 609 numa_emulation(&numa_meminfo, numa_distance_cnt);
772{
773 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
774}
775#else
776void __cpuinit numa_add_cpu(int cpu)
777{
778 unsigned long addr;
779 u16 apicid;
780 int physnid;
781 int nid = NUMA_NO_NODE;
782 610
783 nid = early_cpu_to_node(cpu); 611 ret = numa_register_memblks(&numa_meminfo);
784 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); 612 if (ret < 0)
613 return ret;
785 614
786 /* 615 for (i = 0; i < nr_cpu_ids; i++) {
787 * Use the starting address of the emulated node to find which physical 616 int nid = early_cpu_to_node(i);
788 * node it is allocated on.
789 */
790 addr = node_start_pfn(nid) << PAGE_SHIFT;
791 for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
792 if (addr >= physnodes[physnid].start &&
793 addr < physnodes[physnid].end)
794 break;
795 617
796 /* 618 if (nid == NUMA_NO_NODE)
797 * Map the cpu to each emulated node that is allocated on the physical 619 continue;
798 * node of the cpu's apic id. 620 if (!node_online(nid))
799 */ 621 numa_clear_node(i);
800 for_each_online_node(nid) {
801 addr = node_start_pfn(nid) << PAGE_SHIFT;
802 if (addr >= physnodes[physnid].start &&
803 addr < physnodes[physnid].end)
804 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
805 } 622 }
623 numa_init_array();
624 return 0;
806} 625}
807 626
808void __cpuinit numa_remove_cpu(int cpu) 627void __init initmem_init(void)
809{ 628{
810 int i; 629 int ret;
811 630
812 for_each_online_node(i) 631 if (!numa_off) {
813 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); 632#ifdef CONFIG_ACPI_NUMA
814} 633 ret = numa_init(x86_acpi_numa_init);
815#endif /* !CONFIG_NUMA_EMU */ 634 if (!ret)
816 635 return;
817#else /* CONFIG_DEBUG_PER_CPU_MAPS */ 636#endif
818static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) 637#ifdef CONFIG_AMD_NUMA
819{ 638 ret = numa_init(amd_numa_init);
820 int node = early_cpu_to_node(cpu); 639 if (!ret)
821 struct cpumask *mask; 640 return;
822 char buf[64]; 641#endif
823
824 mask = node_to_cpumask_map[node];
825 if (!mask) {
826 pr_err("node_to_cpumask_map[%i] NULL\n", node);
827 dump_stack();
828 return NULL;
829 } 642 }
830 643
831 cpulist_scnprintf(buf, sizeof(buf), mask); 644 numa_init(dummy_numa_init);
832 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
833 enable ? "numa_add_cpu" : "numa_remove_cpu",
834 cpu, node, buf);
835 return mask;
836} 645}
837 646
838/* 647unsigned long __init numa_free_all_bootmem(void)
839 * --------- debug versions of the numa functions ---------
840 */
841#ifndef CONFIG_NUMA_EMU
842static void __cpuinit numa_set_cpumask(int cpu, int enable)
843{
844 struct cpumask *mask;
845
846 mask = debug_cpumask_set_cpu(cpu, enable);
847 if (!mask)
848 return;
849
850 if (enable)
851 cpumask_set_cpu(cpu, mask);
852 else
853 cpumask_clear_cpu(cpu, mask);
854}
855#else
856static void __cpuinit numa_set_cpumask(int cpu, int enable)
857{ 648{
858 int node = early_cpu_to_node(cpu); 649 unsigned long pages = 0;
859 struct cpumask *mask;
860 int i; 650 int i;
861 651
862 for_each_online_node(i) { 652 for_each_online_node(i)
863 unsigned long addr; 653 pages += free_all_bootmem_node(NODE_DATA(i));
864
865 addr = node_start_pfn(i) << PAGE_SHIFT;
866 if (addr < physnodes[node].start ||
867 addr >= physnodes[node].end)
868 continue;
869 mask = debug_cpumask_set_cpu(cpu, enable);
870 if (!mask)
871 return;
872
873 if (enable)
874 cpumask_set_cpu(cpu, mask);
875 else
876 cpumask_clear_cpu(cpu, mask);
877 }
878}
879#endif /* CONFIG_NUMA_EMU */
880 654
881void __cpuinit numa_add_cpu(int cpu) 655 pages += free_all_memory_core_early(MAX_NUMNODES);
882{
883 numa_set_cpumask(cpu, 1);
884}
885 656
886void __cpuinit numa_remove_cpu(int cpu) 657 return pages;
887{
888 numa_set_cpumask(cpu, 0);
889} 658}
890 659
891int __cpu_to_node(int cpu) 660int __cpuinit numa_cpu_node(int cpu)
892{ 661{
893 if (early_per_cpu_ptr(x86_cpu_to_node_map)) { 662 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
894 printk(KERN_WARNING
895 "cpu_to_node(%d): usage too early!\n", cpu);
896 dump_stack();
897 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
898 }
899 return per_cpu(x86_cpu_to_node_map, cpu);
900}
901EXPORT_SYMBOL(__cpu_to_node);
902 663
903/* 664 if (apicid != BAD_APICID)
904 * Same function as cpu_to_node() but used if called before the 665 return __apicid_to_node[apicid];
905 * per_cpu areas are setup. 666 return NUMA_NO_NODE;
906 */
907int early_cpu_to_node(int cpu)
908{
909 if (early_per_cpu_ptr(x86_cpu_to_node_map))
910 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
911
912 if (!cpu_possible(cpu)) {
913 printk(KERN_WARNING
914 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
915 dump_stack();
916 return NUMA_NO_NODE;
917 }
918 return per_cpu(x86_cpu_to_node_map, cpu);
919} 667}
920
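A hedged sketch of how a caller might use the new numa_cpu_node() helper (the surrounding setup is assumed):

	int nid = numa_cpu_node(cpu);	/* NUMA_NO_NODE if the CPU's
					 * apicid is still BAD_APICID */
	if (nid != NUMA_NO_NODE)
		numa_set_node(cpu, nid);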
921/*
922 * --------- end of debug versions of the numa functions ---------
923 */
924
925#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 00000000000..ad091e4cff1
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,494 @@
1/*
2 * NUMA emulation
3 */
4#include <linux/kernel.h>
5#include <linux/errno.h>
6#include <linux/topology.h>
7#include <linux/memblock.h>
8#include <asm/dma.h>
9
10#include "numa_internal.h"
11
12static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
13static char *emu_cmdline __initdata;
14
15void __init numa_emu_cmdline(char *str)
16{
17 emu_cmdline = str;
18}
19
20static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
21{
22 int i;
23
24 for (i = 0; i < mi->nr_blks; i++)
25 if (mi->blk[i].nid == nid)
26 return i;
27 return -ENOENT;
28}
29
30/*
31 * Sets up nid to range from @start to @end. The return value is -errno if
32 * something went wrong, 0 otherwise.
33 */
34static int __init emu_setup_memblk(struct numa_meminfo *ei,
35 struct numa_meminfo *pi,
36 int nid, int phys_blk, u64 size)
37{
38 struct numa_memblk *eb = &ei->blk[ei->nr_blks];
39 struct numa_memblk *pb = &pi->blk[phys_blk];
40
41 if (ei->nr_blks >= NR_NODE_MEMBLKS) {
42 pr_err("NUMA: Too many emulated memblks, failing emulation\n");
43 return -EINVAL;
44 }
45
46 ei->nr_blks++;
47 eb->start = pb->start;
48 eb->end = pb->start + size;
49 eb->nid = nid;
50
51 if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
52 emu_nid_to_phys[nid] = pb->nid;
53
54 pb->start += size;
55 if (pb->start >= pb->end) {
56 WARN_ON_ONCE(pb->start > pb->end);
57 numa_remove_memblk_from(phys_blk, pi);
58 }
59
60 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
61 eb->start, eb->end, (eb->end - eb->start) >> 20);
62 return 0;
63}
64
65/*
66 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
67 * to max_addr. The return value is the number of nodes allocated.
68 */
69static int __init split_nodes_interleave(struct numa_meminfo *ei,
70 struct numa_meminfo *pi,
71 u64 addr, u64 max_addr, int nr_nodes)
72{
73 nodemask_t physnode_mask = NODE_MASK_NONE;
74 u64 size;
75 int big;
76 int nid = 0;
77 int i, ret;
78
79 if (nr_nodes <= 0)
80 return -1;
81 if (nr_nodes > MAX_NUMNODES) {
82 pr_info("numa=fake=%d too large, reducing to %d\n",
83 nr_nodes, MAX_NUMNODES);
84 nr_nodes = MAX_NUMNODES;
85 }
86
87 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
88 /*
89 * Calculate the number of big nodes that can be allocated as a result
90 * of consolidating the remainder.
91 */
92 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
93 FAKE_NODE_MIN_SIZE;
94
95 size &= FAKE_NODE_MIN_HASH_MASK;
96 if (!size) {
97 pr_err("Not enough memory for each node. "
98 "NUMA emulation disabled.\n");
99 return -1;
100 }
101
102 for (i = 0; i < pi->nr_blks; i++)
103 node_set(pi->blk[i].nid, physnode_mask);
104
105 /*
106 * Continue to fill physical nodes with fake nodes until there is no
107 * memory left on any of them.
108 */
109 while (nodes_weight(physnode_mask)) {
110 for_each_node_mask(i, physnode_mask) {
111 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
112 u64 start, limit, end;
113 int phys_blk;
114
115 phys_blk = emu_find_memblk_by_nid(i, pi);
116 if (phys_blk < 0) {
117 node_clear(i, physnode_mask);
118 continue;
119 }
120 start = pi->blk[phys_blk].start;
121 limit = pi->blk[phys_blk].end;
122 end = start + size;
123
124 if (nid < big)
125 end += FAKE_NODE_MIN_SIZE;
126
127 /*
128 * Continue to add memory to this fake node if its
129 * non-reserved memory is less than the per-node size.
130 */
131 while (end - start -
132 memblock_x86_hole_size(start, end) < size) {
133 end += FAKE_NODE_MIN_SIZE;
134 if (end > limit) {
135 end = limit;
136 break;
137 }
138 }
139
140 /*
141 * If there won't be at least FAKE_NODE_MIN_SIZE of
142 * non-reserved memory in ZONE_DMA32 for the next node,
143 * this one must extend to the boundary.
144 */
145 if (end < dma32_end && dma32_end - end -
146 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
147 end = dma32_end;
148
149 /*
150 * If there won't be enough non-reserved memory for the
151 * next node, this one must extend to the end of the
152 * physical node.
153 */
154 if (limit - end -
155 memblock_x86_hole_size(end, limit) < size)
156 end = limit;
157
158 ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
159 phys_blk,
160 min(end, limit) - start);
161 if (ret < 0)
162 return ret;
163 }
164 }
165 return 0;
166}
167
168/*
169 * Returns the end address of a node so that there is at least `size' amount of
170 * non-reserved memory or `max_addr' is reached.
171 */
172static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
173{
174 u64 end = start + size;
175
176 while (end - start - memblock_x86_hole_size(start, end) < size) {
177 end += FAKE_NODE_MIN_SIZE;
178 if (end > max_addr) {
179 end = max_addr;
180 break;
181 }
182 }
183 return end;
184}
185
186/*
187 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
188 * `addr' to `max_addr'. The return value is the number of nodes allocated.
189 */
190static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
191 struct numa_meminfo *pi,
192 u64 addr, u64 max_addr, u64 size)
193{
194 nodemask_t physnode_mask = NODE_MASK_NONE;
195 u64 min_size;
196 int nid = 0;
197 int i, ret;
198
199 if (!size)
200 return -1;
201 /*
202 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
203 * increased accordingly if the requested size is too small. This
204 * creates a uniform distribution of node sizes across the entire
205 * machine (but not necessarily over physical nodes).
206 */
207 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
208 MAX_NUMNODES;
209 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
210 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
211 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
212 FAKE_NODE_MIN_HASH_MASK;
213 if (size < min_size) {
214 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
215 size >> 20, min_size >> 20);
216 size = min_size;
217 }
218 size &= FAKE_NODE_MIN_HASH_MASK;
219
220 for (i = 0; i < pi->nr_blks; i++)
221 node_set(pi->blk[i].nid, physnode_mask);
222
223 /*
224 * Fill physical nodes with fake nodes of size until there is no memory
225 * left on any of them.
226 */
227 while (nodes_weight(physnode_mask)) {
228 for_each_node_mask(i, physnode_mask) {
229 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
230 u64 start, limit, end;
231 int phys_blk;
232
233 phys_blk = emu_find_memblk_by_nid(i, pi);
234 if (phys_blk < 0) {
235 node_clear(i, physnode_mask);
236 continue;
237 }
238 start = pi->blk[phys_blk].start;
239 limit = pi->blk[phys_blk].end;
240
241 end = find_end_of_node(start, limit, size);
242 /*
243 * If there won't be at least FAKE_NODE_MIN_SIZE of
244 * non-reserved memory in ZONE_DMA32 for the next node,
245 * this one must extend to the boundary.
246 */
247 if (end < dma32_end && dma32_end - end -
248 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
249 end = dma32_end;
250
251 /*
252 * If there won't be enough non-reserved memory for the
253 * next node, this one must extend to the end of the
254 * physical node.
255 */
256 if (limit - end -
257 memblock_x86_hole_size(end, limit) < size)
258 end = limit;
259
260 ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
261 phys_blk,
262 min(end, limit) - start);
263 if (ret < 0)
264 return ret;
265 }
266 }
267 return 0;
268}
269
270/**
271 * numa_emulation - Emulate NUMA nodes
272 * @numa_meminfo: NUMA configuration to massage
273 * @numa_dist_cnt: The size of the physical NUMA distance table
274 *
275 * Emulate NUMA nodes according to the numa=fake kernel parameter.
276 * @numa_meminfo contains the physical memory configuration and is modified
277 * to reflect the emulated configuration on success. @numa_dist_cnt is
278 * used to determine the size of the physical distance table.
279 *
280 * On success, the following modifications are made.
281 *
282 * - @numa_meminfo is updated to reflect the emulated nodes.
283 *
284 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
285 * emulated nodes.
286 *
287 * - NUMA distance table is rebuilt to represent distances between emulated
288 * nodes. The distances are determined considering how emulated nodes
289 * are mapped to physical nodes and match the actual distances.
290 *
291 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
292 * nodes. This is used by numa_add_cpu() and numa_remove_cpu().
293 *
294 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
295 * identity mapping and no other modification is made.
296 */
297void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
298{
299 static struct numa_meminfo ei __initdata;
300 static struct numa_meminfo pi __initdata;
301 const u64 max_addr = max_pfn << PAGE_SHIFT;
302 u8 *phys_dist = NULL;
303 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
304 int max_emu_nid, dfl_phys_nid;
305 int i, j, ret;
306
307 if (!emu_cmdline)
308 goto no_emu;
309
310 memset(&ei, 0, sizeof(ei));
311 pi = *numa_meminfo;
312
313 for (i = 0; i < MAX_NUMNODES; i++)
314 emu_nid_to_phys[i] = NUMA_NO_NODE;
315
316 /*
317 * If the numa=fake command-line contains an 'M' or 'G', it represents
318 * the fixed node size. Otherwise, if it is just a single number N,
319 * split the system RAM into N fake nodes.
320 */
321 if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
322 u64 size;
323
324 size = memparse(emu_cmdline, &emu_cmdline);
325 ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
326 } else {
327 unsigned long n;
328
329 n = simple_strtoul(emu_cmdline, NULL, 0);
330 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
331 }
332
333 if (ret < 0)
334 goto no_emu;
335
336 if (numa_cleanup_meminfo(&ei) < 0) {
337 pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
338 goto no_emu;
339 }
340
341 /* copy the physical distance table */
342 if (numa_dist_cnt) {
343 u64 phys;
344
345 phys = memblock_find_in_range(0,
346 (u64)max_pfn_mapped << PAGE_SHIFT,
347 phys_size, PAGE_SIZE);
348 if (phys == MEMBLOCK_ERROR) {
349 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
350 goto no_emu;
351 }
352 memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
353 phys_dist = __va(phys);
354
355 for (i = 0; i < numa_dist_cnt; i++)
356 for (j = 0; j < numa_dist_cnt; j++)
357 phys_dist[i * numa_dist_cnt + j] =
358 node_distance(i, j);
359 }
360
361 /*
362 * Determine the max emulated nid and the default phys nid to use
363 * for unmapped nodes.
364 */
365 max_emu_nid = 0;
366 dfl_phys_nid = NUMA_NO_NODE;
367 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
368 if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
369 max_emu_nid = i;
370 if (dfl_phys_nid == NUMA_NO_NODE)
371 dfl_phys_nid = emu_nid_to_phys[i];
372 }
373 }
374 if (dfl_phys_nid == NUMA_NO_NODE) {
375 pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
376 goto no_emu;
377 }
378
379 /* commit */
380 *numa_meminfo = ei;
381
382 /*
383 * Transform __apicid_to_node table to use emulated nids by
384 * reverse-mapping phys_nid. The maps should always exist but fall
385 * back to zero just in case.
386 */
387 for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
388 if (__apicid_to_node[i] == NUMA_NO_NODE)
389 continue;
390 for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
391 if (__apicid_to_node[i] == emu_nid_to_phys[j])
392 break;
393 __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
394 }
395
396 /* make sure all emulated nodes are mapped to a physical node */
397 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
398 if (emu_nid_to_phys[i] == NUMA_NO_NODE)
399 emu_nid_to_phys[i] = dfl_phys_nid;
400
401 /* transform distance table */
402 numa_reset_distance();
403 for (i = 0; i < max_emu_nid + 1; i++) {
404 for (j = 0; j < max_emu_nid + 1; j++) {
405 int physi = emu_nid_to_phys[i];
406 int physj = emu_nid_to_phys[j];
407 int dist;
408
409 if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
410 dist = physi == physj ?
411 LOCAL_DISTANCE : REMOTE_DISTANCE;
412 else
413 dist = phys_dist[physi * numa_dist_cnt + physj];
414
415 numa_set_distance(i, j, dist);
416 }
417 }
418
419 /* free the copied physical distance table */
420 if (phys_dist)
421 memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
422 return;
423
424no_emu:
425 /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
426 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
427 emu_nid_to_phys[i] = i;
428}
429
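For context, a hedged example of the two numa=fake forms parsed above (the boot-line values are illustrative):

	/* numa=fake=8    -> split_nodes_interleave(..., nr_nodes = 8)
	 * numa=fake=512M -> split_nodes_size_interleave(..., size below) */
	u64 size = memparse("512M", &emu_cmdline);	/* 512 << 20 bytes */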
430#ifndef CONFIG_DEBUG_PER_CPU_MAPS
431void __cpuinit numa_add_cpu(int cpu)
432{
433 int physnid, nid;
434
435 nid = early_cpu_to_node(cpu);
436 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
437
438 physnid = emu_nid_to_phys[nid];
439
440 /*
441 * Map the cpu to each emulated node that is allocated on the physical
442 * node of the cpu's apic id.
443 */
444 for_each_online_node(nid)
445 if (emu_nid_to_phys[nid] == physnid)
446 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
447}
448
449void __cpuinit numa_remove_cpu(int cpu)
450{
451 int i;
452
453 for_each_online_node(i)
454 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
455}
456#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
457static void __cpuinit numa_set_cpumask(int cpu, int enable)
458{
459 struct cpumask *mask;
460 int nid, physnid, i;
461
462 nid = early_cpu_to_node(cpu);
463 if (nid == NUMA_NO_NODE) {
464 /* early_cpu_to_node() already emits a warning and trace */
465 return;
466 }
467
468 physnid = emu_nid_to_phys[nid];
469
470 for_each_online_node(i) {
471 if (emu_nid_to_phys[nid] != physnid)
472 continue;
473
474 mask = debug_cpumask_set_cpu(cpu, enable);
475 if (!mask)
476 return;
477
478 if (enable)
479 cpumask_set_cpu(cpu, mask);
480 else
481 cpumask_clear_cpu(cpu, mask);
482 }
483}
484
485void __cpuinit numa_add_cpu(int cpu)
486{
487 numa_set_cpumask(cpu, 1);
488}
489
490void __cpuinit numa_remove_cpu(int cpu)
491{
492 numa_set_cpumask(cpu, 0);
493}
494#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
new file mode 100644
index 00000000000..ef2d97377d7
--- /dev/null
+++ b/arch/x86/mm/numa_internal.h
@@ -0,0 +1,31 @@
1#ifndef __X86_MM_NUMA_INTERNAL_H
2#define __X86_MM_NUMA_INTERNAL_H
3
4#include <linux/types.h>
5#include <asm/numa.h>
6
7struct numa_memblk {
8 u64 start;
9 u64 end;
10 int nid;
11};
12
13struct numa_meminfo {
14 int nr_blks;
15 struct numa_memblk blk[NR_NODE_MEMBLKS];
16};
17
18void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
19int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
20void __init numa_reset_distance(void);
21
22#ifdef CONFIG_NUMA_EMU
23void __init numa_emulation(struct numa_meminfo *numa_meminfo,
24 int numa_dist_cnt);
25#else
26static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
27 int numa_dist_cnt)
28{ }
29#endif
30
31#endif /* __X86_MM_NUMA_INTERNAL_H */
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index ae96e7b8051..48651c6f657 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -57,7 +57,7 @@ struct node_memory_chunk_s {
57static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; 57static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
58 58
59static int __initdata num_memory_chunks; /* total number of memory chunks */ 59static int __initdata num_memory_chunks; /* total number of memory chunks */
60static u8 __initdata apicid_to_pxm[MAX_APICID]; 60static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC];
61 61
62int acpi_numa __initdata; 62int acpi_numa __initdata;
63 63
@@ -254,8 +254,8 @@ int __init get_memcfg_from_srat(void)
254 printk(KERN_DEBUG "Number of memory chunks in system = %d\n", 254 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
255 num_memory_chunks); 255 num_memory_chunks);
256 256
257 for (i = 0; i < MAX_APICID; i++) 257 for (i = 0; i < MAX_LOCAL_APIC; i++)
258 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); 258 set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i]));
259 259
260 for (j = 0; j < num_memory_chunks; j++){ 260 for (j = 0; j < num_memory_chunks; j++){
261 struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; 261 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 603d285d1da..8e9d3394f6d 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -26,88 +26,34 @@
26 26
27int acpi_numa __initdata; 27int acpi_numa __initdata;
28 28
29static struct acpi_table_slit *acpi_slit;
30
31static nodemask_t nodes_parsed __initdata;
32static nodemask_t cpu_nodes_parsed __initdata;
33static struct bootnode nodes[MAX_NUMNODES] __initdata;
34static struct bootnode nodes_add[MAX_NUMNODES]; 29static struct bootnode nodes_add[MAX_NUMNODES];
35 30
36static int num_node_memblks __initdata;
37static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
38static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
39
40static __init int setup_node(int pxm) 31static __init int setup_node(int pxm)
41{ 32{
42 return acpi_map_pxm_to_node(pxm); 33 return acpi_map_pxm_to_node(pxm);
43} 34}
44 35
45static __init int conflicting_memblks(unsigned long start, unsigned long end)
46{
47 int i;
48 for (i = 0; i < num_node_memblks; i++) {
49 struct bootnode *nd = &node_memblk_range[i];
50 if (nd->start == nd->end)
51 continue;
52 if (nd->end > start && nd->start < end)
53 return memblk_nodeid[i];
54 if (nd->end == end && nd->start == start)
55 return memblk_nodeid[i];
56 }
57 return -1;
58}
59
60static __init void cutoff_node(int i, unsigned long start, unsigned long end)
61{
62 struct bootnode *nd = &nodes[i];
63
64 if (nd->start < start) {
65 nd->start = start;
66 if (nd->end < nd->start)
67 nd->start = nd->end;
68 }
69 if (nd->end > end) {
70 nd->end = end;
71 if (nd->start > nd->end)
72 nd->start = nd->end;
73 }
74}
75
76static __init void bad_srat(void) 36static __init void bad_srat(void)
77{ 37{
78 int i;
79 printk(KERN_ERR "SRAT: SRAT not used.\n"); 38 printk(KERN_ERR "SRAT: SRAT not used.\n");
80 acpi_numa = -1; 39 acpi_numa = -1;
81 for (i = 0; i < MAX_LOCAL_APIC; i++) 40 memset(nodes_add, 0, sizeof(nodes_add));
82 apicid_to_node[i] = NUMA_NO_NODE;
83 for (i = 0; i < MAX_NUMNODES; i++) {
84 nodes[i].start = nodes[i].end = 0;
85 nodes_add[i].start = nodes_add[i].end = 0;
86 }
87 remove_all_active_ranges();
88} 41}
89 42
90static __init inline int srat_disabled(void) 43static __init inline int srat_disabled(void)
91{ 44{
92 return numa_off || acpi_numa < 0; 45 return acpi_numa < 0;
93} 46}
94 47
95/* Callback for SLIT parsing */ 48/* Callback for SLIT parsing */
96void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 49void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
97{ 50{
98 unsigned length; 51 int i, j;
99 unsigned long phys;
100
101 length = slit->header.length;
102 phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length,
103 PAGE_SIZE);
104
105 if (phys == MEMBLOCK_ERROR)
106 panic(" Can not save slit!\n");
107 52
108 acpi_slit = __va(phys); 53 for (i = 0; i < slit->locality_count; i++)
109 memcpy(acpi_slit, slit, length); 54 for (j = 0; j < slit->locality_count; j++)
110 memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT"); 55 numa_set_distance(pxm_to_node(i), pxm_to_node(j),
56 slit->entry[slit->locality_count * i + j]);
111} 57}
112 58
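As a hedged illustration of the SLIT walk above, the flattened locality matrix is indexed like this (a hypothetical lookup):

	/* slit->entry[] is a row-major locality_count x locality_count
	 * matrix; the distance from PXM i to PXM j is: */
	u8 d = slit->entry[slit->locality_count * i + j];
	numa_set_distance(pxm_to_node(i), pxm_to_node(j), d);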
113/* Callback for Proximity Domain -> x2APIC mapping */ 59/* Callback for Proximity Domain -> x2APIC mapping */
@@ -138,8 +84,8 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
138 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); 84 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
139 return; 85 return;
140 } 86 }
141 apicid_to_node[apic_id] = node; 87 set_apicid_to_node(apic_id, node);
142 node_set(node, cpu_nodes_parsed); 88 node_set(node, numa_nodes_parsed);
143 acpi_numa = 1; 89 acpi_numa = 1;
144 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", 90 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
145 pxm, apic_id, node); 91 pxm, apic_id, node);
@@ -178,8 +124,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
178 return; 124 return;
179 } 125 }
180 126
181 apicid_to_node[apic_id] = node; 127 set_apicid_to_node(apic_id, node);
182 node_set(node, cpu_nodes_parsed); 128 node_set(node, numa_nodes_parsed);
183 acpi_numa = 1; 129 acpi_numa = 1;
184 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", 130 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
185 pxm, apic_id, node); 131 pxm, apic_id, node);
@@ -241,7 +187,7 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
241 } 187 }
242 188
243 if (changed) { 189 if (changed) {
244 node_set(node, cpu_nodes_parsed); 190 node_set(node, numa_nodes_parsed);
245 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", 191 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
246 nd->start, nd->end); 192 nd->start, nd->end);
247 } 193 }
@@ -251,10 +197,8 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
251void __init 197void __init
252acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) 198acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
253{ 199{
254 struct bootnode *nd, oldnode;
255 unsigned long start, end; 200 unsigned long start, end;
256 int node, pxm; 201 int node, pxm;
257 int i;
258 202
259 if (srat_disabled()) 203 if (srat_disabled())
260 return; 204 return;
@@ -276,300 +220,31 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
276 bad_srat(); 220 bad_srat();
277 return; 221 return;
278 } 222 }
279 i = conflicting_memblks(start, end); 223
280 if (i == node) { 224 if (numa_add_memblk(node, start, end) < 0) {
281 printk(KERN_WARNING
282 "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
283 pxm, start, end, nodes[i].start, nodes[i].end);
284 } else if (i >= 0) {
285 printk(KERN_ERR
286 "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
287 pxm, start, end, node_to_pxm(i),
288 nodes[i].start, nodes[i].end);
289 bad_srat(); 225 bad_srat();
290 return; 226 return;
291 } 227 }
292 nd = &nodes[node];
293 oldnode = *nd;
294 if (!node_test_and_set(node, nodes_parsed)) {
295 nd->start = start;
296 nd->end = end;
297 } else {
298 if (start < nd->start)
299 nd->start = start;
300 if (nd->end < end)
301 nd->end = end;
302 }
303 228
304 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, 229 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
305 start, end); 230 start, end);
306 231
307 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { 232 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
308 update_nodes_add(node, start, end); 233 update_nodes_add(node, start, end);
309 /* restore nodes[node] */
310 *nd = oldnode;
311 if ((nd->start | nd->end) == 0)
312 node_clear(node, nodes_parsed);
313 }
314
315 node_memblk_range[num_node_memblks].start = start;
316 node_memblk_range[num_node_memblks].end = end;
317 memblk_nodeid[num_node_memblks] = node;
318 num_node_memblks++;
319}
320
321/* Sanity check to catch more bad SRATs (they are amazingly common).
322 Make sure the PXMs cover all memory. */
323static int __init nodes_cover_memory(const struct bootnode *nodes)
324{
325 int i;
326 unsigned long pxmram, e820ram;
327
328 pxmram = 0;
329 for_each_node_mask(i, nodes_parsed) {
330 unsigned long s = nodes[i].start >> PAGE_SHIFT;
331 unsigned long e = nodes[i].end >> PAGE_SHIFT;
332 pxmram += e - s;
333 pxmram -= __absent_pages_in_range(i, s, e);
334 if ((long)pxmram < 0)
335 pxmram = 0;
336 }
337
338 e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
339 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
340 if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
341 printk(KERN_ERR
342 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
343 (pxmram << PAGE_SHIFT) >> 20,
344 (e820ram << PAGE_SHIFT) >> 20);
345 return 0;
346 }
347 return 1;
348} 234}
349 235
350void __init acpi_numa_arch_fixup(void) {} 236void __init acpi_numa_arch_fixup(void) {}
351 237
352#ifdef CONFIG_NUMA_EMU 238int __init x86_acpi_numa_init(void)
353void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
354 unsigned long end)
355{
356 int i;
357
358 for_each_node_mask(i, nodes_parsed) {
359 cutoff_node(i, start, end);
360 physnodes[i].start = nodes[i].start;
361 physnodes[i].end = nodes[i].end;
362 }
363}
364#endif /* CONFIG_NUMA_EMU */
365
366/* Use the information discovered above to actually set up the nodes. */
367int __init acpi_scan_nodes(unsigned long start, unsigned long end)
368{ 239{
369 int i; 240 int ret;
370
371 if (acpi_numa <= 0)
372 return -1;
373
374 /* First clean up the node list */
375 for (i = 0; i < MAX_NUMNODES; i++)
376 cutoff_node(i, start, end);
377
378 /*
379 * Join together blocks on the same node, holes between
380 * which don't overlap with memory on other nodes.
381 */
382 for (i = 0; i < num_node_memblks; ++i) {
383 int j, k;
384
385 for (j = i + 1; j < num_node_memblks; ++j) {
386 unsigned long start, end;
387
388 if (memblk_nodeid[i] != memblk_nodeid[j])
389 continue;
390 start = min(node_memblk_range[i].end,
391 node_memblk_range[j].end);
392 end = max(node_memblk_range[i].start,
393 node_memblk_range[j].start);
394 for (k = 0; k < num_node_memblks; ++k) {
395 if (memblk_nodeid[i] == memblk_nodeid[k])
396 continue;
397 if (start < node_memblk_range[k].end &&
398 end > node_memblk_range[k].start)
399 break;
400 }
401 if (k < num_node_memblks)
402 continue;
403 start = min(node_memblk_range[i].start,
404 node_memblk_range[j].start);
405 end = max(node_memblk_range[i].end,
406 node_memblk_range[j].end);
407 printk(KERN_INFO "SRAT: Node %d "
408 "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
409 memblk_nodeid[i],
410 node_memblk_range[i].start,
411 node_memblk_range[i].end,
412 node_memblk_range[j].start,
413 node_memblk_range[j].end,
414 start, end);
415 node_memblk_range[i].start = start;
416 node_memblk_range[i].end = end;
417 k = --num_node_memblks - j;
418 memmove(memblk_nodeid + j, memblk_nodeid + j+1,
419 k * sizeof(*memblk_nodeid));
420 memmove(node_memblk_range + j, node_memblk_range + j+1,
421 k * sizeof(*node_memblk_range));
422 --j;
423 }
424 }
425
426 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
427 memblk_nodeid);
428 if (memnode_shift < 0) {
429 printk(KERN_ERR
430 "SRAT: No NUMA node hash function found. Contact maintainer\n");
431 bad_srat();
432 return -1;
433 }
434
435 for (i = 0; i < num_node_memblks; i++)
436 memblock_x86_register_active_regions(memblk_nodeid[i],
437 node_memblk_range[i].start >> PAGE_SHIFT,
438 node_memblk_range[i].end >> PAGE_SHIFT);
439
440 /* for out of order entries in SRAT */
441 sort_node_map();
442 if (!nodes_cover_memory(nodes)) {
443 bad_srat();
444 return -1;
445 }
446 241
447 /* Account for nodes with cpus and no memory */ 242 ret = acpi_numa_init();
448 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); 243 if (ret < 0)
449 244 return ret;
450 /* Finally register nodes */ 245 return srat_disabled() ? -EINVAL : 0;
451 for_each_node_mask(i, node_possible_map)
452 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
453 /* Try again in case setup_node_bootmem missed one due
454 to missing bootmem */
455 for_each_node_mask(i, node_possible_map)
456 if (!node_online(i))
457 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
458
459 for (i = 0; i < nr_cpu_ids; i++) {
460 int node = early_cpu_to_node(i);
461
462 if (node == NUMA_NO_NODE)
463 continue;
464 if (!node_online(node))
465 numa_clear_node(i);
466 }
467 numa_init_array();
468 return 0;
469}
470
471#ifdef CONFIG_NUMA_EMU
472static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
473 [0 ... MAX_NUMNODES-1] = PXM_INVAL
474};
475static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
476 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
477};
478static int __init find_node_by_addr(unsigned long addr)
479{
480 int ret = NUMA_NO_NODE;
481 int i;
482
483 for_each_node_mask(i, nodes_parsed) {
484 /*
485 * Find the real node that this emulated node appears on. For
486 * the sake of simplicity, we only use a real node's starting
487 * address to determine which emulated node it appears on.
488 */
489 if (addr >= nodes[i].start && addr < nodes[i].end) {
490 ret = i;
491 break;
492 }
493 }
494 return ret;
495} 246}
496 247
497/*
498 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
499 * mappings that respect the real ACPI topology but reflect our emulated
500 * environment. For each emulated node, we find which real node it appears on
501 * and create PXM to NID mappings for those fake nodes which mirror that
502 * locality. SLIT will now represent the correct distances between emulated
503 * nodes as a result of the real topology.
504 */
505void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
506{
507 int i, j;
508
509 for (i = 0; i < num_nodes; i++) {
510 int nid, pxm;
511
512 nid = find_node_by_addr(fake_nodes[i].start);
513 if (nid == NUMA_NO_NODE)
514 continue;
515 pxm = node_to_pxm(nid);
516 if (pxm == PXM_INVAL)
517 continue;
518 fake_node_to_pxm_map[i] = pxm;
519 /*
520 * For each apicid_to_node mapping that exists for this real
521 * node, it must now point to the fake node ID.
522 */
523 for (j = 0; j < MAX_LOCAL_APIC; j++)
524 if (apicid_to_node[j] == nid &&
525 fake_apicid_to_node[j] == NUMA_NO_NODE)
526 fake_apicid_to_node[j] = i;
527 }
528
529 /*
530 * If there are apicid-to-node mappings for physical nodes that do not
531 * have a corresponding emulated node, it should default to a guaranteed
532 * value.
533 */
534 for (i = 0; i < MAX_LOCAL_APIC; i++)
535 if (apicid_to_node[i] != NUMA_NO_NODE &&
536 fake_apicid_to_node[i] == NUMA_NO_NODE)
537 fake_apicid_to_node[i] = 0;
538
539 for (i = 0; i < num_nodes; i++)
540 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
541 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
542
543 nodes_clear(nodes_parsed);
544 for (i = 0; i < num_nodes; i++)
545 if (fake_nodes[i].start != fake_nodes[i].end)
546 node_set(i, nodes_parsed);
547}
548
549static int null_slit_node_compare(int a, int b)
550{
551 return node_to_pxm(a) == node_to_pxm(b);
552}
553#else
554static int null_slit_node_compare(int a, int b)
555{
556 return a == b;
557}
558#endif /* CONFIG_NUMA_EMU */
559
560int __node_distance(int a, int b)
561{
562 int index;
563
564 if (!acpi_slit)
565 return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
566 REMOTE_DISTANCE;
567 index = acpi_slit->locality_count * node_to_pxm(a);
568 return acpi_slit->entry[index + node_to_pxm(b)];
569}
570
571EXPORT_SYMBOL(__node_distance);
572
573#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) 248#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
574int memory_add_physaddr_to_nid(u64 start) 249int memory_add_physaddr_to_nid(u64 start)
575{ 250{
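
A linear sketch of the reworked SLIT walk above (the enclosing acpi_numa_slit_init() and the outer loop are assumptions reconstructed from the hunk; error handling is omitted):

	static void __init example_slit_walk(struct acpi_table_slit *slit)
	{
		int i, j;

		/* entry[] is a flat, row-major locality matrix; feed each
		 * PXM-to-PXM distance straight to the core NUMA code. */
		for (i = 0; i < slit->locality_count; i++)
			for (j = 0; j < slit->locality_count; j++)
				numa_set_distance(pxm_to_node(i), pxm_to_node(j),
					slit->entry[slit->locality_count * i + j]);
	}
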
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6acc724d5d8..d6c0418c3e4 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -179,12 +179,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
179 sender = this_cpu_read(tlb_vector_offset); 179 sender = this_cpu_read(tlb_vector_offset);
180 f = &flush_state[sender]; 180 f = &flush_state[sender];
181 181
182 /* 182 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
183 * Could avoid this lock when 183 raw_spin_lock(&f->tlbstate_lock);
184 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
185 * probably not worth checking this for a cache-hot lock.
186 */
187 raw_spin_lock(&f->tlbstate_lock);
188 184
189 f->flush_mm = mm; 185 f->flush_mm = mm;
190 f->flush_va = va; 186 f->flush_va = va;
@@ -202,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
202 198
203 f->flush_mm = NULL; 199 f->flush_mm = NULL;
204 f->flush_va = 0; 200 f->flush_va = 0;
205 raw_spin_unlock(&f->tlbstate_lock); 201 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
202 raw_spin_unlock(&f->tlbstate_lock);
206} 203}
207 204
208void native_flush_tlb_others(const struct cpumask *cpumask, 205void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -211,11 +208,10 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
211 if (is_uv_system()) { 208 if (is_uv_system()) {
212 unsigned int cpu; 209 unsigned int cpu;
213 210
214 cpu = get_cpu(); 211 cpu = smp_processor_id();
215 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); 212 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
216 if (cpumask) 213 if (cpumask)
217 flush_tlb_others_ipi(cpumask, mm, va); 214 flush_tlb_others_ipi(cpumask, mm, va);
218 put_cpu();
219 return; 215 return;
220 } 216 }
221 flush_tlb_others_ipi(cpumask, mm, va); 217 flush_tlb_others_ipi(cpumask, mm, va);
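
The conditional locking above is safe only because nr_cpu_ids is fixed once boot is done, so the lock and unlock sites always evaluate the same condition. A minimal sketch of the pattern (names from the hunks; the IPI send/wait in between is elided):

	/* Take the per-vector lock only when more possible CPUs exist
	 * than invalidate vectors, i.e. when a vector can be shared. */
	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
		raw_spin_lock(&f->tlbstate_lock);

	f->flush_mm = mm;		/* publish the flush request */
	f->flush_va = va;
	/* ... send the invalidate IPI and wait for acknowledgements ... */
	f->flush_mm = NULL;
	f->flush_va = 0;

	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)	/* same condition */
		raw_spin_unlock(&f->tlbstate_lock);
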
diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c
index 9260b3eb18d..67858be4b52 100644
--- a/arch/x86/pci/ce4100.c
+++ b/arch/x86/pci/ce4100.c
@@ -255,7 +255,7 @@ int bridge_read(unsigned int devfn, int reg, int len, u32 *value)
255static int ce4100_conf_read(unsigned int seg, unsigned int bus, 255static int ce4100_conf_read(unsigned int seg, unsigned int bus,
256 unsigned int devfn, int reg, int len, u32 *value) 256 unsigned int devfn, int reg, int len, u32 *value)
257{ 257{
258 int i, retval = 1; 258 int i;
259 259
260 if (bus == 1) { 260 if (bus == 1) {
261 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { 261 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 25cd4a07d09..8c4085a95ef 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -20,7 +20,8 @@
20#include <asm/xen/pci.h> 20#include <asm/xen/pci.h>
21 21
22#ifdef CONFIG_ACPI 22#ifdef CONFIG_ACPI
23static int xen_hvm_register_pirq(u32 gsi, int triggering) 23static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
24 int trigger, int polarity)
24{ 25{
25 int rc, irq; 26 int rc, irq;
26 struct physdev_map_pirq map_irq; 27 struct physdev_map_pirq map_irq;
@@ -41,7 +42,7 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering)
41 return -1; 42 return -1;
42 } 43 }
43 44
44 if (triggering == ACPI_EDGE_SENSITIVE) { 45 if (trigger == ACPI_EDGE_SENSITIVE) {
45 shareable = 0; 46 shareable = 0;
46 name = "ioapic-edge"; 47 name = "ioapic-edge";
47 } else { 48 } else {
@@ -55,12 +56,6 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering)
55 56
56 return irq; 57 return irq;
57} 58}
58
59static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
60 int trigger, int polarity)
61{
62 return xen_hvm_register_pirq(gsi, trigger);
63}
64#endif 59#endif
65 60
66#if defined(CONFIG_PCI_MSI) 61#if defined(CONFIG_PCI_MSI)
@@ -91,7 +86,7 @@ static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
91 86
92static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 87static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
93{ 88{
94 int irq, pirq, ret = 0; 89 int irq, pirq;
95 struct msi_desc *msidesc; 90 struct msi_desc *msidesc;
96 struct msi_msg msg; 91 struct msi_msg msg;
97 92
@@ -99,39 +94,32 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
99 __read_msi_msg(msidesc, &msg); 94 __read_msi_msg(msidesc, &msg);
100 pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | 95 pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
101 ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); 96 ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
102 if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) { 97 if (msg.data != XEN_PIRQ_MSI_DATA ||
103 xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? 98 xen_irq_from_pirq(pirq) < 0) {
104 "msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ); 99 pirq = xen_allocate_pirq_msi(dev, msidesc);
105 if (irq < 0) 100 if (pirq < 0)
106 goto error; 101 goto error;
107 ret = set_irq_msi(irq, msidesc); 102 xen_msi_compose_msg(dev, pirq, &msg);
108 if (ret < 0) 103 __write_msi_msg(msidesc, &msg);
109 goto error_while; 104 dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq);
110 printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d" 105 } else {
111 " pirq=%d\n", irq, pirq); 106 dev_dbg(&dev->dev,
112 return 0; 107 "xen: msi already bound to pirq=%d\n", pirq);
113 } 108 }
114 xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? 109 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0,
115 "msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ)); 110 (type == PCI_CAP_ID_MSIX) ?
116 if (irq < 0 || pirq < 0) 111 "msi-x" : "msi");
112 if (irq < 0)
117 goto error; 113 goto error;
118 printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); 114 dev_dbg(&dev->dev,
119 xen_msi_compose_msg(dev, pirq, &msg); 115 "xen: msi --> pirq=%d --> irq=%d\n", pirq, irq);
120 ret = set_irq_msi(irq, msidesc);
121 if (ret < 0)
122 goto error_while;
123 write_msi_msg(irq, &msg);
124 } 116 }
125 return 0; 117 return 0;
126 118
127error_while:
128 unbind_from_irqhandler(irq, NULL);
129error: 119error:
130 if (ret == -ENODEV) 120 dev_err(&dev->dev,
131 dev_err(&dev->dev, "Xen PCI frontend has not registered" \ 121 "Xen PCI frontend has not registered MSI/MSI-X support!\n");
132 " MSI/MSI-X support!\n"); 122 return -ENODEV;
133
134 return ret;
135} 123}
136 124
137/* 125/*
@@ -150,35 +138,26 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
150 return -ENOMEM; 138 return -ENOMEM;
151 139
152 if (type == PCI_CAP_ID_MSIX) 140 if (type == PCI_CAP_ID_MSIX)
153 ret = xen_pci_frontend_enable_msix(dev, &v, nvec); 141 ret = xen_pci_frontend_enable_msix(dev, v, nvec);
154 else 142 else
155 ret = xen_pci_frontend_enable_msi(dev, &v); 143 ret = xen_pci_frontend_enable_msi(dev, v);
156 if (ret) 144 if (ret)
157 goto error; 145 goto error;
158 i = 0; 146 i = 0;
159 list_for_each_entry(msidesc, &dev->msi_list, list) { 147 list_for_each_entry(msidesc, &dev->msi_list, list) {
160 irq = xen_allocate_pirq(v[i], 0, /* not sharable */ 148 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
161 (type == PCI_CAP_ID_MSIX) ? 149 (type == PCI_CAP_ID_MSIX) ?
162 "pcifront-msi-x" : "pcifront-msi"); 150 "pcifront-msi-x" :
163 if (irq < 0) { 151 "pcifront-msi");
164 ret = -1; 152 if (irq < 0)
165 goto free; 153 goto free;
166 }
167
168 ret = set_irq_msi(irq, msidesc);
169 if (ret)
170 goto error_while;
171 i++; 154 i++;
172 } 155 }
173 kfree(v); 156 kfree(v);
174 return 0; 157 return 0;
175 158
176error_while:
177 unbind_from_irqhandler(irq, NULL);
178error: 159error:
179 if (ret == -ENODEV) 160 dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
180 dev_err(&dev->dev, "Xen PCI frontend has not registered" \
181 " MSI/MSI-X support!\n");
182free: 161free:
183 kfree(v); 162 kfree(v);
184 return ret; 163 return ret;
@@ -193,6 +172,9 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
193 xen_pci_frontend_disable_msix(dev); 172 xen_pci_frontend_disable_msix(dev);
194 else 173 else
195 xen_pci_frontend_disable_msi(dev); 174 xen_pci_frontend_disable_msi(dev);
175
 176	/* Free the IRQs and the msidesc using the generic code. */
177 default_teardown_msi_irqs(dev);
196} 178}
197 179
198static void xen_teardown_msi_irq(unsigned int irq) 180static void xen_teardown_msi_irq(unsigned int irq)
@@ -200,47 +182,82 @@ static void xen_teardown_msi_irq(unsigned int irq)
200 xen_destroy_irq(irq); 182 xen_destroy_irq(irq);
201} 183}
202 184
185#ifdef CONFIG_XEN_DOM0
203static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 186static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
204{ 187{
205 int irq, ret; 188 int ret = 0;
206 struct msi_desc *msidesc; 189 struct msi_desc *msidesc;
207 190
208 list_for_each_entry(msidesc, &dev->msi_list, list) { 191 list_for_each_entry(msidesc, &dev->msi_list, list) {
209 irq = xen_create_msi_irq(dev, msidesc, type); 192 struct physdev_map_pirq map_irq;
210 if (irq < 0)
211 return -1;
212 193
213 ret = set_irq_msi(irq, msidesc); 194 memset(&map_irq, 0, sizeof(map_irq));
214 if (ret) 195 map_irq.domid = DOMID_SELF;
215 goto error; 196 map_irq.type = MAP_PIRQ_TYPE_MSI;
216 } 197 map_irq.index = -1;
217 return 0; 198 map_irq.pirq = -1;
199 map_irq.bus = dev->bus->number;
200 map_irq.devfn = dev->devfn;
218 201
219error: 202 if (type == PCI_CAP_ID_MSIX) {
220 xen_destroy_irq(irq); 203 int pos;
204 u32 table_offset, bir;
205
206 pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
207
208 pci_read_config_dword(dev, pos + PCI_MSIX_TABLE,
209 &table_offset);
210 bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
211
212 map_irq.table_base = pci_resource_start(dev, bir);
213 map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
214 }
215
216 ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
217 if (ret) {
218 dev_warn(&dev->dev, "xen map irq failed %d\n", ret);
219 goto out;
220 }
221
222 ret = xen_bind_pirq_msi_to_irq(dev, msidesc,
223 map_irq.pirq, map_irq.index,
224 (type == PCI_CAP_ID_MSIX) ?
225 "msi-x" : "msi");
226 if (ret < 0)
227 goto out;
228 }
229 ret = 0;
230out:
221 return ret; 231 return ret;
222} 232}
223#endif 233#endif
234#endif
224 235
225static int xen_pcifront_enable_irq(struct pci_dev *dev) 236static int xen_pcifront_enable_irq(struct pci_dev *dev)
226{ 237{
227 int rc; 238 int rc;
228 int share = 1; 239 int share = 1;
240 u8 gsi;
229 241
230 dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq); 242 rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
231 243 if (rc < 0) {
232 if (dev->irq < 0) 244 dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
233 return -EINVAL; 245 rc);
246 return rc;
247 }
234 248
235 if (dev->irq < NR_IRQS_LEGACY) 249 if (gsi < NR_IRQS_LEGACY)
236 share = 0; 250 share = 0;
237 251
238 rc = xen_allocate_pirq(dev->irq, share, "pcifront"); 252 rc = xen_allocate_pirq(gsi, share, "pcifront");
239 if (rc < 0) { 253 if (rc < 0) {
240 dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n", 254 dev_warn(&dev->dev, "Xen PCI: failed to register GSI%d: %d\n",
241 dev->irq, rc); 255 gsi, rc);
242 return rc; 256 return rc;
243 } 257 }
258
259 dev->irq = rc;
260 dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
244 return 0; 261 return 0;
245} 262}
246 263
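
The side-by-side rendering of the xen_hvm_setup_msi_irqs() hunk above is hard to follow, so here is the reworked per-descriptor flow laid out linearly (a simplified reconstruction with error paths trimmed; all names are from the hunk):

	list_for_each_entry(msidesc, &dev->msi_list, list) {
		__read_msi_msg(msidesc, &msg);
		/* Recover the pirq previously stashed in the MSI message. */
		pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
		       ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);

		if (msg.data != XEN_PIRQ_MSI_DATA ||
		    xen_irq_from_pirq(pirq) < 0) {
			/* No usable binding yet: allocate a fresh pirq and
			 * point the device's MSI message at it. */
			pirq = xen_allocate_pirq_msi(dev, msidesc);
			xen_msi_compose_msg(dev, pirq, &msg);
			__write_msi_msg(msidesc, &msg);
		}
		/* Either way, bind the pirq to a Linux irq. */
		irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0,
				(type == PCI_CAP_ID_MSIX) ? "msi-x" : "msi");
	}
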
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index cd6f184c3b3..28071bb31db 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -16,21 +16,19 @@
16#include <linux/serial_8250.h> 16#include <linux/serial_8250.h>
17 17
18#include <asm/ce4100.h> 18#include <asm/ce4100.h>
19#include <asm/prom.h>
19#include <asm/setup.h> 20#include <asm/setup.h>
21#include <asm/i8259.h>
20#include <asm/io.h> 22#include <asm/io.h>
23#include <asm/io_apic.h>
21 24
22static int ce4100_i8042_detect(void) 25static int ce4100_i8042_detect(void)
23{ 26{
24 return 0; 27 return 0;
25} 28}
26 29
27static void __init sdv_find_smp_config(void)
28{
29}
30
31#ifdef CONFIG_SERIAL_8250 30#ifdef CONFIG_SERIAL_8250
32 31
33
34static unsigned int mem_serial_in(struct uart_port *p, int offset) 32static unsigned int mem_serial_in(struct uart_port *p, int offset)
35{ 33{
36 offset = offset << p->regshift; 34 offset = offset << p->regshift;
@@ -119,6 +117,15 @@ static void __init sdv_arch_setup(void)
119 sdv_serial_fixup(); 117 sdv_serial_fixup();
120} 118}
121 119
120#ifdef CONFIG_X86_IO_APIC
121static void __cpuinit sdv_pci_init(void)
122{
123 x86_of_pci_init();
124 /* We can't set this earlier, because we need to calibrate the timer */
125 legacy_pic = &null_legacy_pic;
126}
127#endif
128
122/* 129/*
123 * CE4100 specific x86_init function overrides and early setup 130 * CE4100 specific x86_init function overrides and early setup
124 * calls. 131 * calls.
@@ -129,6 +136,11 @@ void __init x86_ce4100_early_setup(void)
129 x86_platform.i8042_detect = ce4100_i8042_detect; 136 x86_platform.i8042_detect = ce4100_i8042_detect;
130 x86_init.resources.probe_roms = x86_init_noop; 137 x86_init.resources.probe_roms = x86_init_noop;
131 x86_init.mpparse.get_smp_config = x86_init_uint_noop; 138 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
132 x86_init.mpparse.find_smp_config = sdv_find_smp_config; 139 x86_init.mpparse.find_smp_config = x86_init_noop;
133 x86_init.pci.init = ce4100_pci_init; 140 x86_init.pci.init = ce4100_pci_init;
141
142#ifdef CONFIG_X86_IO_APIC
143 x86_init.pci.init_irq = sdv_pci_init;
144 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck;
145#endif
134} 146}
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
new file mode 100644
index 00000000000..dc701ea5854
--- /dev/null
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -0,0 +1,428 @@
1/*
2 * CE4100 on Falcon Falls
3 *
4 * (c) Copyright 2010 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; version 2 of the License.
9 */
10/dts-v1/;
11/ {
12 model = "intel,falconfalls";
13 compatible = "intel,falconfalls";
14 #address-cells = <1>;
15 #size-cells = <1>;
16
17 cpus {
18 #address-cells = <1>;
19 #size-cells = <0>;
20
21 cpu@0 {
22 device_type = "cpu";
23 compatible = "intel,ce4100";
24 reg = <0>;
25 lapic = <&lapic0>;
26 };
27 };
28
29 soc@0 {
30 #address-cells = <1>;
31 #size-cells = <1>;
32 compatible = "intel,ce4100-cp";
33 ranges;
34
35 ioapic1: interrupt-controller@fec00000 {
36 #interrupt-cells = <2>;
37 compatible = "intel,ce4100-ioapic";
38 interrupt-controller;
39 reg = <0xfec00000 0x1000>;
40 };
41
42 timer@fed00000 {
43 compatible = "intel,ce4100-hpet";
44 reg = <0xfed00000 0x200>;
45 };
46
47 lapic0: interrupt-controller@fee00000 {
48 compatible = "intel,ce4100-lapic";
49 reg = <0xfee00000 0x1000>;
50 };
51
52 pci@3fc {
53 #address-cells = <3>;
54 #size-cells = <2>;
55 compatible = "intel,ce4100-pci", "pci";
56 device_type = "pci";
57 bus-range = <0 0>;
58 ranges = <0x2000000 0 0xbffff000 0xbffff000 0 0x1000
59 0x2000000 0 0xdffe0000 0xdffe0000 0 0x1000
60 0x0000000 0 0x0 0x0 0 0x100>;
61
62 /* Secondary IO-APIC */
63 ioapic2: interrupt-controller@0,1 {
64 #interrupt-cells = <2>;
65 compatible = "intel,ce4100-ioapic";
66 interrupt-controller;
67 reg = <0x100 0x0 0x0 0x0 0x0>;
68 assigned-addresses = <0x02000000 0x0 0xbffff000 0x0 0x1000>;
69 };
70
71 pci@1,0 {
72 #address-cells = <3>;
73 #size-cells = <2>;
74 compatible = "intel,ce4100-pci", "pci";
75 device_type = "pci";
76 bus-range = <1 1>;
77 ranges = <0x2000000 0 0xdffe0000 0x2000000 0 0xdffe0000 0 0x1000>;
78
79 interrupt-parent = <&ioapic2>;
80
81 display@2,0 {
82 compatible = "pci8086,2e5b.2",
83 "pci8086,2e5b",
84 "pciclass038000",
85 "pciclass0380";
86
87 reg = <0x11000 0x0 0x0 0x0 0x0>;
88 interrupts = <0 1>;
89 };
90
91 multimedia@3,0 {
92 compatible = "pci8086,2e5c.2",
93 "pci8086,2e5c",
94 "pciclass048000",
95 "pciclass0480";
96
97 reg = <0x11800 0x0 0x0 0x0 0x0>;
98 interrupts = <2 1>;
99 };
100
101 multimedia@4,0 {
102 compatible = "pci8086,2e5d.2",
103 "pci8086,2e5d",
104 "pciclass048000",
105 "pciclass0480";
106
107 reg = <0x12000 0x0 0x0 0x0 0x0>;
108 interrupts = <4 1>;
109 };
110
111 multimedia@4,1 {
112 compatible = "pci8086,2e5e.2",
113 "pci8086,2e5e",
114 "pciclass048000",
115 "pciclass0480";
116
117 reg = <0x12100 0x0 0x0 0x0 0x0>;
118 interrupts = <5 1>;
119 };
120
121 sound@6,0 {
122 compatible = "pci8086,2e5f.2",
123 "pci8086,2e5f",
124 "pciclass040100",
125 "pciclass0401";
126
127 reg = <0x13000 0x0 0x0 0x0 0x0>;
128 interrupts = <6 1>;
129 };
130
131 sound@6,1 {
132 compatible = "pci8086,2e5f.2",
133 "pci8086,2e5f",
134 "pciclass040100",
135 "pciclass0401";
136
137 reg = <0x13100 0x0 0x0 0x0 0x0>;
138 interrupts = <7 1>;
139 };
140
141 sound@6,2 {
142 compatible = "pci8086,2e60.2",
143 "pci8086,2e60",
144 "pciclass040100",
145 "pciclass0401";
146
147 reg = <0x13200 0x0 0x0 0x0 0x0>;
148 interrupts = <8 1>;
149 };
150
151 display@8,0 {
152 compatible = "pci8086,2e61.2",
153 "pci8086,2e61",
154 "pciclass038000",
155 "pciclass0380";
156
157 reg = <0x14000 0x0 0x0 0x0 0x0>;
158 interrupts = <9 1>;
159 };
160
161 display@8,1 {
162 compatible = "pci8086,2e62.2",
163 "pci8086,2e62",
164 "pciclass038000",
165 "pciclass0380";
166
167 reg = <0x14100 0x0 0x0 0x0 0x0>;
168 interrupts = <10 1>;
169 };
170
171 multimedia@8,2 {
172 compatible = "pci8086,2e63.2",
173 "pci8086,2e63",
174 "pciclass048000",
175 "pciclass0480";
176
177 reg = <0x14200 0x0 0x0 0x0 0x0>;
178 interrupts = <11 1>;
179 };
180
181 entertainment-encryption@9,0 {
182 compatible = "pci8086,2e64.2",
183 "pci8086,2e64",
184 "pciclass101000",
185 "pciclass1010";
186
187 reg = <0x14800 0x0 0x0 0x0 0x0>;
188 interrupts = <12 1>;
189 };
190
191 localbus@a,0 {
192 compatible = "pci8086,2e65.2",
193 "pci8086,2e65",
194 "pciclassff0000",
195 "pciclassff00";
196
197 reg = <0x15000 0x0 0x0 0x0 0x0>;
198 };
199
200 serial@b,0 {
201 compatible = "pci8086,2e66.2",
202 "pci8086,2e66",
203 "pciclass070003",
204 "pciclass0700";
205
206 reg = <0x15800 0x0 0x0 0x0 0x0>;
207 interrupts = <14 1>;
208 };
209
210 gpio@b,1 {
211 compatible = "pci8086,2e67.2",
212 "pci8086,2e67",
213 "pciclassff0000",
214 "pciclassff00";
215
216 #gpio-cells = <2>;
217 reg = <0x15900 0x0 0x0 0x0 0x0>;
218 interrupts = <15 1>;
219 gpio-controller;
220 };
221
222 i2c-controller@b,2 {
223 #address-cells = <2>;
224 #size-cells = <1>;
225 compatible = "pci8086,2e68.2",
226 "pci8086,2e68",
227 "pciclass,ff0000",
228 "pciclass,ff00";
229
230 reg = <0x15a00 0x0 0x0 0x0 0x0>;
231 interrupts = <16 1>;
232 ranges = <0 0 0x02000000 0 0xdffe0500 0x100
233 1 0 0x02000000 0 0xdffe0600 0x100
234 2 0 0x02000000 0 0xdffe0700 0x100>;
235
236 i2c@0 {
237 #address-cells = <1>;
238 #size-cells = <0>;
239 compatible = "intel,ce4100-i2c-controller";
240 reg = <0 0 0x100>;
241 };
242
243 i2c@1 {
244 #address-cells = <1>;
245 #size-cells = <0>;
246 compatible = "intel,ce4100-i2c-controller";
247 reg = <1 0 0x100>;
248
249 gpio@26 {
250 #gpio-cells = <2>;
251 compatible = "ti,pcf8575";
252 reg = <0x26>;
253 gpio-controller;
254 };
255 };
256
257 i2c@2 {
258 #address-cells = <1>;
259 #size-cells = <0>;
260 compatible = "intel,ce4100-i2c-controller";
261 reg = <2 0 0x100>;
262
263 gpio@26 {
264 #gpio-cells = <2>;
265 compatible = "ti,pcf8575";
266 reg = <0x26>;
267 gpio-controller;
268 };
269 };
270 };
271
272 smard-card@b,3 {
273 compatible = "pci8086,2e69.2",
274 "pci8086,2e69",
275 "pciclass070500",
276 "pciclass0705";
277
278 reg = <0x15b00 0x0 0x0 0x0 0x0>;
279 interrupts = <15 1>;
280 };
281
282 spi-controller@b,4 {
283 #address-cells = <1>;
284 #size-cells = <0>;
285 compatible =
286 "pci8086,2e6a.2",
287 "pci8086,2e6a",
288 "pciclass,ff0000",
289 "pciclass,ff00";
290
291 reg = <0x15c00 0x0 0x0 0x0 0x0>;
292 interrupts = <15 1>;
293
294 dac@0 {
295 compatible = "ti,pcm1755";
296 reg = <0>;
297 spi-max-frequency = <115200>;
298 };
299
300 dac@1 {
301 compatible = "ti,pcm1609a";
302 reg = <1>;
303 spi-max-frequency = <115200>;
304 };
305
306 eeprom@2 {
307 compatible = "atmel,at93c46";
308 reg = <2>;
309 spi-max-frequency = <115200>;
310 };
311 };
312
313 multimedia@b,7 {
314 compatible = "pci8086,2e6d.2",
315 "pci8086,2e6d",
316 "pciclassff0000",
317 "pciclassff00";
318
319 reg = <0x15f00 0x0 0x0 0x0 0x0>;
320 };
321
322 ethernet@c,0 {
323 compatible = "pci8086,2e6e.2",
324 "pci8086,2e6e",
325 "pciclass020000",
326 "pciclass0200";
327
328 reg = <0x16000 0x0 0x0 0x0 0x0>;
329 interrupts = <21 1>;
330 };
331
332 clock@c,1 {
333 compatible = "pci8086,2e6f.2",
334 "pci8086,2e6f",
335 "pciclassff0000",
336 "pciclassff00";
337
338 reg = <0x16100 0x0 0x0 0x0 0x0>;
339 interrupts = <3 1>;
340 };
341
342 usb@d,0 {
343 compatible = "pci8086,2e70.2",
344 "pci8086,2e70",
345 "pciclass0c0320",
346 "pciclass0c03";
347
348 reg = <0x16800 0x0 0x0 0x0 0x0>;
349 interrupts = <22 3>;
350 };
351
352 usb@d,1 {
353 compatible = "pci8086,2e70.2",
354 "pci8086,2e70",
355 "pciclass0c0320",
356 "pciclass0c03";
357
358 reg = <0x16900 0x0 0x0 0x0 0x0>;
359 interrupts = <22 3>;
360 };
361
362 sata@e,0 {
363 compatible = "pci8086,2e71.0",
364 "pci8086,2e71",
365 "pciclass010601",
366 "pciclass0106";
367
368 reg = <0x17000 0x0 0x0 0x0 0x0>;
369 interrupts = <23 3>;
370 };
371
372 flash@f,0 {
373 compatible = "pci8086,701.1",
374 "pci8086,701",
375 "pciclass050100",
376 "pciclass0501";
377
378 reg = <0x17800 0x0 0x0 0x0 0x0>;
379 interrupts = <13 1>;
380 };
381
382 entertainment-encryption@10,0 {
383 compatible = "pci8086,702.1",
384 "pci8086,702",
385 "pciclass101000",
386 "pciclass1010";
387
388 reg = <0x18000 0x0 0x0 0x0 0x0>;
389 };
390
391 co-processor@11,0 {
392 compatible = "pci8086,703.1",
393 "pci8086,703",
394 "pciclass0b4000",
395 "pciclass0b40";
396
397 reg = <0x18800 0x0 0x0 0x0 0x0>;
398 interrupts = <1 1>;
399 };
400
401 multimedia@12,0 {
402 compatible = "pci8086,704.0",
403 "pci8086,704",
404 "pciclass048000",
405 "pciclass0480";
406
407 reg = <0x19000 0x0 0x0 0x0 0x0>;
408 };
409 };
410
411 isa@1f,0 {
412 #address-cells = <2>;
413 #size-cells = <1>;
414 compatible = "isa";
415 ranges = <1 0 0 0 0 0x100>;
416
417 rtc@70 {
418 compatible = "intel,ce4100-rtc", "motorola,mc146818";
419 interrupts = <8 3>;
420 interrupt-parent = <&ioapic1>;
421 ctrl-reg = <2>;
422 freq-reg = <0x26>;
423 reg = <1 0x70 2>;
424 };
425 };
426 };
427 };
428};
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index ea6529e93c6..5c0207bf959 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -31,6 +31,7 @@
31#include <asm/apic.h> 31#include <asm/apic.h>
32#include <asm/io_apic.h> 32#include <asm/io_apic.h>
33#include <asm/mrst.h> 33#include <asm/mrst.h>
34#include <asm/mrst-vrtc.h>
34#include <asm/io.h> 35#include <asm/io.h>
35#include <asm/i8259.h> 36#include <asm/i8259.h>
36#include <asm/intel_scu_ipc.h> 37#include <asm/intel_scu_ipc.h>
@@ -268,6 +269,7 @@ void __init x86_mrst_early_setup(void)
268 269
269 x86_platform.calibrate_tsc = mrst_calibrate_tsc; 270 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
270 x86_platform.i8042_detect = mrst_i8042_detect; 271 x86_platform.i8042_detect = mrst_i8042_detect;
272 x86_init.timers.wallclock_init = mrst_rtc_init;
271 x86_init.pci.init = pci_mrst_init; 273 x86_init.pci.init = pci_mrst_init;
272 x86_init.pci.fixup_irqs = x86_init_noop; 274 x86_init.pci.fixup_irqs = x86_init_noop;
273 275
diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c
index 32cd7edd71a..04cf645feb9 100644
--- a/arch/x86/platform/mrst/vrtc.c
+++ b/arch/x86/platform/mrst/vrtc.c
@@ -100,22 +100,14 @@ int vrtc_set_mmss(unsigned long nowtime)
100 100
101void __init mrst_rtc_init(void) 101void __init mrst_rtc_init(void)
102{ 102{
103 unsigned long rtc_paddr; 103 unsigned long vrtc_paddr = sfi_mrtc_array[0].phys_addr;
104 void __iomem *virt_base;
105 104
106 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); 105 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
107 if (!sfi_mrtc_num) 106 if (!sfi_mrtc_num || !vrtc_paddr)
108 return; 107 return;
109 108
110 rtc_paddr = sfi_mrtc_array[0].phys_addr; 109 vrtc_virt_base = (void __iomem *)set_fixmap_offset_nocache(FIX_LNW_VRTC,
111 110 vrtc_paddr);
112 /* vRTC's register address may not be page aligned */
113 set_fixmap_nocache(FIX_LNW_VRTC, rtc_paddr);
114
115 virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC);
116 virt_base += rtc_paddr & ~PAGE_MASK;
117 vrtc_virt_base = virt_base;
118
119 x86_platform.get_wallclock = vrtc_get_time; 111 x86_platform.get_wallclock = vrtc_get_time;
120 x86_platform.set_wallclock = vrtc_set_mmss; 112 x86_platform.set_wallclock = vrtc_set_mmss;
121} 113}
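
set_fixmap_offset_nocache() above folds the removed page-offset handling into a single call; the vRTC registers need it because their physical address may not be page aligned. The removed code was equivalent to this sketch:

	/* Map the page containing vrtc_paddr uncached, then add back the
	 * sub-page offset by hand -- what the old code spelled out. */
	set_fixmap_nocache(FIX_LNW_VRTC, vrtc_paddr);
	vrtc_virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC) +
			 (vrtc_paddr & ~PAGE_MASK);
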
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index e797428b163..c2a8cab65e5 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,4 +1,4 @@
1obj-$(CONFIG_OLPC) += olpc.o 1obj-$(CONFIG_OLPC) += olpc.o
2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o 2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
3obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o 3obj-$(CONFIG_OLPC) += olpc_ofw.o
4obj-$(CONFIG_OLPC_OPENFIRMWARE_DT) += olpc_dt.o 4obj-$(CONFIG_OF_PROMTREE) += olpc_dt.o
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 7b24460917d..374a05d8ad2 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -131,7 +131,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
131 unsigned long mmr_offset, int limit) 131 unsigned long mmr_offset, int limit)
132{ 132{
133 const struct cpumask *eligible_cpu = cpumask_of(cpu); 133 const struct cpumask *eligible_cpu = cpumask_of(cpu);
134 struct irq_cfg *cfg = get_irq_chip_data(irq); 134 struct irq_cfg *cfg = irq_get_chip_data(irq);
135 unsigned long mmr_value; 135 unsigned long mmr_value;
136 struct uv_IO_APIC_route_entry *entry; 136 struct uv_IO_APIC_route_entry *entry;
137 int mmr_pnode, err; 137 int mmr_pnode, err;
@@ -148,7 +148,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
148 else 148 else
149 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 149 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
150 150
151 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, 151 irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
152 irq_name); 152 irq_name);
153 153
154 mmr_value = 0; 154 mmr_value = 0;
diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
index 63203767174..fe4cf829487 100644
--- a/arch/x86/platform/visws/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
@@ -569,11 +569,13 @@ out_unlock:
569static struct irqaction master_action = { 569static struct irqaction master_action = {
570 .handler = piix4_master_intr, 570 .handler = piix4_master_intr,
571 .name = "PIIX4-8259", 571 .name = "PIIX4-8259",
572 .flags = IRQF_NO_THREAD,
572}; 573};
573 574
574static struct irqaction cascade_action = { 575static struct irqaction cascade_action = {
575 .handler = no_action, 576 .handler = no_action,
576 .name = "cascade", 577 .name = "cascade",
578 .flags = IRQF_NO_THREAD,
577}; 579};
578 580
579static inline void set_piix4_virtual_irq_type(void) 581static inline void set_piix4_virtual_irq_type(void)
@@ -606,7 +608,7 @@ static void __init visws_pre_intr_init(void)
606 chip = &cobalt_irq_type; 608 chip = &cobalt_irq_type;
607 609
608 if (chip) 610 if (chip)
609 set_irq_chip(i, chip); 611 irq_set_chip(i, chip);
610 } 612 }
611 613
612 setup_irq(CO_IRQ_8259, &master_action); 614 setup_irq(CO_IRQ_8259, &master_action);
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 5b54892e4bc..e4343fe488e 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -48,3 +48,11 @@ config XEN_DEBUG_FS
48 help 48 help
49 Enable statistics output and various tuning options in debugfs. 49 Enable statistics output and various tuning options in debugfs.
50 Enabling this option may incur a significant performance overhead. 50 Enabling this option may incur a significant performance overhead.
51
52config XEN_DEBUG
53 bool "Enable Xen debug checks"
54 depends on XEN
55 default n
56 help
57 Enable various WARN_ON checks in the Xen MMU code.
58 Enabling this option WILL incur a significant performance overhead.
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 50542efe45f..49dbd78ec3c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1284,15 +1284,14 @@ static int init_hvm_pv_info(int *major, int *minor)
1284 1284
1285 xen_setup_features(); 1285 xen_setup_features();
1286 1286
1287 pv_info = xen_info; 1287 pv_info.name = "Xen HVM";
1288 pv_info.kernel_rpl = 0;
1289 1288
1290 xen_domain_type = XEN_HVM_DOMAIN; 1289 xen_domain_type = XEN_HVM_DOMAIN;
1291 1290
1292 return 0; 1291 return 0;
1293} 1292}
1294 1293
1295void xen_hvm_init_shared_info(void) 1294void __ref xen_hvm_init_shared_info(void)
1296{ 1295{
1297 int cpu; 1296 int cpu;
1298 struct xen_add_to_physmap xatp; 1297 struct xen_add_to_physmap xatp;
@@ -1331,6 +1330,8 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1331 switch (action) { 1330 switch (action) {
1332 case CPU_UP_PREPARE: 1331 case CPU_UP_PREPARE:
1333 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 1332 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1333 if (xen_have_vector_callback)
1334 xen_init_lock_cpu(cpu);
1334 break; 1335 break;
1335 default: 1336 default:
1336 break; 1337 break;
@@ -1355,6 +1356,7 @@ static void __init xen_hvm_guest_init(void)
1355 1356
1356 if (xen_feature(XENFEAT_hvm_callback_vector)) 1357 if (xen_feature(XENFEAT_hvm_callback_vector))
1357 xen_have_vector_callback = 1; 1358 xen_have_vector_callback = 1;
1359 xen_hvm_smp_init();
1358 register_cpu_notifier(&xen_hvm_cpu_notifier); 1360 register_cpu_notifier(&xen_hvm_cpu_notifier);
1359 xen_unplug_emulated_devices(); 1361 xen_unplug_emulated_devices();
1360 have_vcpu_info_placement = 0; 1362 have_vcpu_info_placement = 0;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index f6089421147..3f6f3347aa1 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -46,6 +46,7 @@
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/gfp.h> 47#include <linux/gfp.h>
48#include <linux/memblock.h> 48#include <linux/memblock.h>
49#include <linux/seq_file.h>
49 50
50#include <asm/pgtable.h> 51#include <asm/pgtable.h>
51#include <asm/tlbflush.h> 52#include <asm/tlbflush.h>
@@ -416,8 +417,12 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
416 if (val & _PAGE_PRESENT) { 417 if (val & _PAGE_PRESENT) {
417 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 418 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
418 pteval_t flags = val & PTE_FLAGS_MASK; 419 pteval_t flags = val & PTE_FLAGS_MASK;
419 unsigned long mfn = pfn_to_mfn(pfn); 420 unsigned long mfn;
420 421
422 if (!xen_feature(XENFEAT_auto_translated_physmap))
423 mfn = get_phys_to_machine(pfn);
424 else
425 mfn = pfn;
421 /* 426 /*
422 * If there's no mfn for the pfn, then just create an 427 * If there's no mfn for the pfn, then just create an
423 * empty non-present pte. Unfortunately this loses 428 * empty non-present pte. Unfortunately this loses
@@ -427,8 +432,18 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
427 if (unlikely(mfn == INVALID_P2M_ENTRY)) { 432 if (unlikely(mfn == INVALID_P2M_ENTRY)) {
428 mfn = 0; 433 mfn = 0;
429 flags = 0; 434 flags = 0;
435 } else {
436 /*
 437			 * It is paramount to do this test _after_ the
 438			 * INVALID_P2M_ENTRY check, as INVALID_P2M_ENTRY &
 439			 * IDENTITY_FRAME_BIT also resolves to true.
440 */
441 mfn &= ~FOREIGN_FRAME_BIT;
442 if (mfn & IDENTITY_FRAME_BIT) {
443 mfn &= ~IDENTITY_FRAME_BIT;
444 flags |= _PAGE_IOMAP;
445 }
430 } 446 }
431
432 val = ((pteval_t)mfn << PAGE_SHIFT) | flags; 447 val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
433 } 448 }
434 449
@@ -532,6 +547,41 @@ pte_t xen_make_pte(pteval_t pte)
532} 547}
533PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); 548PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
534 549
550#ifdef CONFIG_XEN_DEBUG
551pte_t xen_make_pte_debug(pteval_t pte)
552{
553 phys_addr_t addr = (pte & PTE_PFN_MASK);
554 phys_addr_t other_addr;
555 bool io_page = false;
556 pte_t _pte;
557
558 if (pte & _PAGE_IOMAP)
559 io_page = true;
560
561 _pte = xen_make_pte(pte);
562
563 if (!addr)
564 return _pte;
565
566 if (io_page &&
567 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
568 other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
569 WARN(addr != other_addr,
570 "0x%lx is using VM_IO, but it is 0x%lx!\n",
571 (unsigned long)addr, (unsigned long)other_addr);
572 } else {
573 pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
574 other_addr = (_pte.pte & PTE_PFN_MASK);
575 WARN((addr == other_addr) && (!io_page) && (!iomap_set),
576 "0x%lx is missing VM_IO (and wasn't fixed)!\n",
577 (unsigned long)addr);
578 }
579
580 return _pte;
581}
582PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
583#endif
584
535pgd_t xen_make_pgd(pgdval_t pgd) 585pgd_t xen_make_pgd(pgdval_t pgd)
536{ 586{
537 pgd = pte_pfn_to_mfn(pgd); 587 pgd = pte_pfn_to_mfn(pgd);
@@ -1441,7 +1491,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1441 * early_ioremap fixmap slot, make sure it is RO. 1491 * early_ioremap fixmap slot, make sure it is RO.
1442 */ 1492 */
1443 if (!is_early_ioremap_ptep(ptep) && 1493 if (!is_early_ioremap_ptep(ptep) &&
1444 pfn >= e820_table_start && pfn < e820_table_end) 1494 pfn >= pgt_buf_start && pfn < pgt_buf_end)
1445 pte = pte_wrprotect(pte); 1495 pte = pte_wrprotect(pte);
1446 1496
1447 return pte; 1497 return pte;
@@ -1940,6 +1990,9 @@ __init void xen_ident_map_ISA(void)
1940 1990
1941static __init void xen_post_allocator_init(void) 1991static __init void xen_post_allocator_init(void)
1942{ 1992{
1993#ifdef CONFIG_XEN_DEBUG
1994 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
1995#endif
1943 pv_mmu_ops.set_pte = xen_set_pte; 1996 pv_mmu_ops.set_pte = xen_set_pte;
1944 pv_mmu_ops.set_pmd = xen_set_pmd; 1997 pv_mmu_ops.set_pmd = xen_set_pmd;
1945 pv_mmu_ops.set_pud = xen_set_pud; 1998 pv_mmu_ops.set_pud = xen_set_pud;
@@ -2072,7 +2125,7 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2072 in_frames[i] = virt_to_mfn(vaddr); 2125 in_frames[i] = virt_to_mfn(vaddr);
2073 2126
2074 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); 2127 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2075 set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); 2128 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2076 2129
2077 if (out_frames) 2130 if (out_frames)
2078 out_frames[i] = virt_to_pfn(vaddr); 2131 out_frames[i] = virt_to_pfn(vaddr);
@@ -2351,6 +2404,18 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
2351 2404
2352#ifdef CONFIG_XEN_DEBUG_FS 2405#ifdef CONFIG_XEN_DEBUG_FS
2353 2406
2407static int p2m_dump_open(struct inode *inode, struct file *filp)
2408{
2409 return single_open(filp, p2m_dump_show, NULL);
2410}
2411
2412static const struct file_operations p2m_dump_fops = {
2413 .open = p2m_dump_open,
2414 .read = seq_read,
2415 .llseek = seq_lseek,
2416 .release = single_release,
2417};
2418
2354static struct dentry *d_mmu_debug; 2419static struct dentry *d_mmu_debug;
2355 2420
2356static int __init xen_mmu_debugfs(void) 2421static int __init xen_mmu_debugfs(void)
@@ -2406,6 +2471,7 @@ static int __init xen_mmu_debugfs(void)
2406 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, 2471 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2407 &mmu_stats.prot_commit_batched); 2472 &mmu_stats.prot_commit_batched);
2408 2473
2474 debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
2409 return 0; 2475 return 0;
2410} 2476}
2411fs_initcall(xen_mmu_debugfs); 2477fs_initcall(xen_mmu_debugfs);
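
The p2m debugfs file added above uses the stock seq_file single-shot pattern: single_open() binds a show callback (p2m_dump_show, defined in p2m.c) and the seq_read/seq_lseek/single_release trio does the rest. Any read-only dump can be wired up the same way; a generic sketch, where example_show and "example" are hypothetical names:

	static int example_show(struct seq_file *m, void *v)
	{
		seq_printf(m, "state dump goes here\n");
		return 0;
	}

	static int example_open(struct inode *inode, struct file *filp)
	{
		return single_open(filp, example_show, NULL);
	}

	static const struct file_operations example_fops = {
		.open		= example_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};

	/* debugfs_create_file("example", 0400, d_mmu_debug, NULL,
	 *		       &example_fops); */
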
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index fd12d7ce7ff..215a3ce6106 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -23,6 +23,129 @@
23 * P2M_PER_PAGE depends on the architecture, as a mfn is always 23 * P2M_PER_PAGE depends on the architecture, as a mfn is always
24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to 24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
25 * 512 and 1024 entries respectively. 25 * 512 and 1024 entries respectively.
26 *
27 * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
28 *
 29 * However, not all entries are filled with MFNs. Specifically, any leaf
 30 * entry, top root entry, or middle entry that is void is assumed to be
 31 * "missing". So (for example)
 32 * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
33 *
34 * We also have the possibility of setting 1-1 mappings on certain regions, so
35 * that:
36 * pfn_to_mfn(0xc0000)=0xc0000
37 *
 38 * The benefit of this is that for non-RAM regions (think PCI BARs, or
 39 * ACPI spaces) we can create mappings easily, because the PFN value
 40 * matches the MFN.
41 *
42 * For this to work efficiently we have one new page p2m_identity and
 43 * allocate (via reserve_brk) any other pages we need to cover the sides
 44 * (1GB or 4MB boundary violations). All entries in p2m_identity are set to
 45 * INVALID_P2M_ENTRY type (the Xen toolstack only recognizes that and MFNs,
46 * no other fancy value).
47 *
48 * On lookup we spot that the entry points to p2m_identity and return the
49 * identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
50 * If the entry points to an allocated page, we just proceed as before and
51 * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
52 * appropriate functions (pfn_to_mfn).
53 *
54 * The reason for having the IDENTITY_FRAME_BIT instead of just returning the
55 * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
 56 * non-identity pfn. To protect ourselves against that, we elect to set (and get) the
57 * IDENTITY_FRAME_BIT on all identity mapped PFNs.
58 *
59 * This simplistic diagram is used to explain the more subtle piece of code.
 60 * There is also a diagram of the P2M at the end that can help.
 61 * Imagine your E820 looking like so:
62 *
63 * 1GB 2GB
64 * /-------------------+---------\/----\ /----------\ /---+-----\
65 * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
66 * \-------------------+---------/\----/ \----------/ \---+-----/
67 * ^- 1029MB ^- 2001MB
68 *
69 * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
70 * 2048MB = 524288 (0x80000)]
71 *
72 * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
73 * is actually not present (would have to kick the balloon driver to put it in).
74 *
75 * When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
76 * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
 77 * PFN and the end PFN (263424 and 512256 respectively). The first step
78 * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
79 * covers 512^2 of page estate (1GB) and in case the start or end PFN is not
80 * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn
81 * to end pfn. We reserve_brk top leaf pages if they are missing (means they
82 * point to p2m_mid_missing).
83 *
84 * With the E820 example above, 263424 is not 1GB aligned so we allocate a
85 * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
 86 * Each entry in the allocated page is "missing" (points to p2m_missing).
87 *
 88 * The next stage is to determine if we need to do a more granular boundary
 89 * check on the 4MB (or 2MB, depending on architecture) boundaries of the
 90 * start and end pfns. If the start pfn or end pfn violates that boundary,
 91 * we reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer
92 * granularity of setting which PFNs are missing and which ones are identity.
93 * In our example 263424 and 512256 both fail the check so we reserve_brk two
94 * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
95 * values) and assign them to p2m[1][2] and p2m[1][488] respectively.
96 *
97 * At this point we would at minimum reserve_brk one page, but could be up to
98 * three. Each call to set_phys_range_identity has at maximum a three page
99 * cost. If we were to query the P2M at this stage, all those entries from
100 * start PFN through end PFN (so 1029MB -> 2001MB) would return
101 * INVALID_P2M_ENTRY ("missing").
102 *
103 * The next step is to walk from the start pfn to the end pfn setting
104 * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
105 * If we find that the middle leaf is pointing to p2m_missing we can swap it
106 * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this
 107 * point we do not need to worry about boundary alignment (so no need to
108 * reserve_brk a middle page, figure out which PFNs are "missing" and which
109 * ones are identity), as that has been done earlier. If we find that the
110 * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
111 * that page (which covers 512 PFNs) and set the appropriate PFN with
112 * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we
113 * set from p2m[1][2][256->511] and p2m[1][488][0->256] with
114 * IDENTITY_FRAME_BIT set.
115 *
116 * All other regions that are void (or not filled) either point to p2m_missing
117 * (considered missing) or have the default value of INVALID_P2M_ENTRY (also
118 * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
119 * contain the INVALID_P2M_ENTRY value and are considered "missing."
120 *
 121 * This is what the p2m ends up looking like (for the E820 above) with this
122 * fabulous drawing:
123 *
124 * p2m /--------------\
125 * /-----\ | &mfn_list[0],| /-----------------\
126 * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. |
127 * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] |
128 * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] |
129 * |-----| \ | [p2m_identity]+\\ | .... |
130 * | 2 |--\ \-------------------->| ... | \\ \----------------/
131 * |-----| \ \---------------/ \\
132 * | 3 |\ \ \\ p2m_identity
133 * |-----| \ \-------------------->/---------------\ /-----------------\
134 * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... |
135 * \-----/ / | [p2m_identity]+-->| ..., ~0 |
136 * / /---------------\ | .... | \-----------------/
137 * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. |
138 * / | IDENTITY[@256]|<----/ \---------------/
139 * / | ~0, ~0, .... |
140 * | \---------------/
141 * |
142 * p2m_missing p2m_missing
143 * /------------------\ /------------\
144 * | [p2m_mid_missing]+---->| ~0, ~0, ~0 |
145 * | [p2m_mid_missing]+---->| ..., ~0 |
146 * \------------------/ \------------/
147 *
 148 * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_FRAME_BIT)
26 */ 149 */
27 150
28#include <linux/init.h> 151#include <linux/init.h>
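
To make the comment's example concrete: on x86-64 (P2M_PER_PAGE = P2M_MID_PER_PAGE = 512) the two boundary PFNs from the E820 sketch split into the three p2m indices as follows, landing exactly in the p2m[1][2] and p2m[1][488] leaves named above (a worked example, not part of the patch):

	/* pfn 263424 (0x40500, 1029MB, start):
	 *	topidx = 263424 / (512 * 512) = 1
	 *	mididx = (263424 / 512) % 512 = 514 % 512  = 2
	 *	idx    = 263424 % 512         = 256	-> p2m[1][2][256]
	 *
	 * pfn 512256 (0x7D100, 2001MB, end, exclusive):
	 *	topidx = 512256 / (512 * 512) = 1
	 *	mididx = (512256 / 512) % 512 = 1000 % 512 = 488
	 *	idx    = 512256 % 512         = 256	-> p2m[1][488][256]
	 */
	unsigned topidx = p2m_top_index(pfn);
	unsigned mididx = p2m_mid_index(pfn);
	unsigned idx    = p2m_index(pfn);
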
@@ -30,6 +153,7 @@
30#include <linux/list.h> 153#include <linux/list.h>
31#include <linux/hash.h> 154#include <linux/hash.h>
32#include <linux/sched.h> 155#include <linux/sched.h>
156#include <linux/seq_file.h>
33 157
34#include <asm/cache.h> 158#include <asm/cache.h>
35#include <asm/setup.h> 159#include <asm/setup.h>
@@ -59,9 +183,15 @@ static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
59static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); 183static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
60static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); 184static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
61 185
186static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
187
62RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); 188RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
63RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); 189RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
64 190
191/* We might hit two boundary violations, at the start and at the end; at
192 * most, each boundary violation will require three middle nodes. */
193RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
194
65static inline unsigned p2m_top_index(unsigned long pfn) 195static inline unsigned p2m_top_index(unsigned long pfn)
66{ 196{
67 BUG_ON(pfn >= MAX_P2M_PFN); 197 BUG_ON(pfn >= MAX_P2M_PFN);
@@ -136,7 +266,7 @@ static void p2m_init(unsigned long *p2m)
136 * - After resume we're called from within stop_machine, but the mfn 266 * - After resume we're called from within stop_machine, but the mfn
 137 * tree should already be completely allocated. 267 * tree should already be completely allocated.
138 */ 268 */
139void xen_build_mfn_list_list(void) 269void __ref xen_build_mfn_list_list(void)
140{ 270{
141 unsigned long pfn; 271 unsigned long pfn;
142 272
@@ -221,6 +351,9 @@ void __init xen_build_dynamic_phys_to_machine(void)
221 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); 351 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
222 p2m_top_init(p2m_top); 352 p2m_top_init(p2m_top);
223 353
354 p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
355 p2m_init(p2m_identity);
356
224 /* 357 /*
225 * The domain builder gives us a pre-constructed p2m array in 358 * The domain builder gives us a pre-constructed p2m array in
226 * mfn_list for all the pages initially given to us, so we just 359 * mfn_list for all the pages initially given to us, so we just
@@ -266,6 +399,14 @@ unsigned long get_phys_to_machine(unsigned long pfn)
266 mididx = p2m_mid_index(pfn); 399 mididx = p2m_mid_index(pfn);
267 idx = p2m_index(pfn); 400 idx = p2m_index(pfn);
268 401
402 /*
 403	 * The p2m_*identity and p2m_*missing pages are both filled with
 404	 * INVALID_P2M_ENTRY, so returning that value here would lose the
 405	 * identity information.
406 */
407 if (p2m_top[topidx][mididx] == p2m_identity)
408 return IDENTITY_FRAME(pfn);
409
269 return p2m_top[topidx][mididx][idx]; 410 return p2m_top[topidx][mididx][idx];
270} 411}
271EXPORT_SYMBOL_GPL(get_phys_to_machine); 412EXPORT_SYMBOL_GPL(get_phys_to_machine);
@@ -335,9 +476,11 @@ static bool alloc_p2m(unsigned long pfn)
335 p2m_top_mfn_p[topidx] = mid_mfn; 476 p2m_top_mfn_p[topidx] = mid_mfn;
336 } 477 }
337 478
338 if (p2m_top[topidx][mididx] == p2m_missing) { 479 if (p2m_top[topidx][mididx] == p2m_identity ||
480 p2m_top[topidx][mididx] == p2m_missing) {
339 /* p2m leaf page is missing */ 481 /* p2m leaf page is missing */
340 unsigned long *p2m; 482 unsigned long *p2m;
483 unsigned long *p2m_orig = p2m_top[topidx][mididx];
341 484
342 p2m = alloc_p2m_page(); 485 p2m = alloc_p2m_page();
343 if (!p2m) 486 if (!p2m)
@@ -345,7 +488,7 @@ static bool alloc_p2m(unsigned long pfn)
345 488
346 p2m_init(p2m); 489 p2m_init(p2m);
347 490
348 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) 491 if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
349 free_p2m_page(p2m); 492 free_p2m_page(p2m);
350 else 493 else
351 mid_mfn[mididx] = virt_to_mfn(p2m); 494 mid_mfn[mididx] = virt_to_mfn(p2m);
@@ -354,11 +497,91 @@ static bool alloc_p2m(unsigned long pfn)
354 return true; 497 return true;
355} 498}
356 499
500bool __early_alloc_p2m(unsigned long pfn)
501{
502 unsigned topidx, mididx, idx;
503
504 topidx = p2m_top_index(pfn);
505 mididx = p2m_mid_index(pfn);
506 idx = p2m_index(pfn);
507
 508	/* Pfff.. No boundary cross-over, let's get out. */
509 if (!idx)
510 return false;
511
512 WARN(p2m_top[topidx][mididx] == p2m_identity,
513 "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
514 topidx, mididx);
515
516 /*
 517	 * This could be done by xen_build_dynamic_phys_to_machine().
518 */
519 if (p2m_top[topidx][mididx] != p2m_missing)
520 return false;
521
522 /* Boundary cross-over for the edges: */
523 if (idx) {
524 unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
525
526 p2m_init(p2m);
527
528 p2m_top[topidx][mididx] = p2m;
529
530 }
531 return idx != 0;
532}
533unsigned long set_phys_range_identity(unsigned long pfn_s,
534 unsigned long pfn_e)
535{
536 unsigned long pfn;
537
538 if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN))
539 return 0;
540
541 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
542 return pfn_e - pfn_s;
543
544 if (pfn_s > pfn_e)
545 return 0;
546
547 for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1));
548 pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
549 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
550 {
551 unsigned topidx = p2m_top_index(pfn);
552 if (p2m_top[topidx] == p2m_mid_missing) {
553 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
554
555 p2m_mid_init(mid);
556
557 p2m_top[topidx] = mid;
558 }
559 }
560
561 __early_alloc_p2m(pfn_s);
562 __early_alloc_p2m(pfn_e);
563
564 for (pfn = pfn_s; pfn < pfn_e; pfn++)
565 if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
566 break;
567
568 if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
569 "Identity mapping failed. We are %ld short of 1-1 mappings!\n",
570 (pfn_e - pfn_s) - (pfn - pfn_s)))
571 printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
572
573 return pfn - pfn_s;
574}
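The first loop in set_phys_range_identity() pre-allocates mid pages over whole P2M_MID_PER_PAGE * P2M_PER_PAGE spans. A rough stand-alone illustration of that rounding, assuming 4 KiB pages and 8-byte p2m entries (512 entries per page, so each span covers 262144 PFNs, i.e. 1 GiB of address space):

    #include <stdio.h>

    #define P2M_PER_PAGE      512UL                  /* assumed: 4096 / 8 */
    #define P2M_MID_PER_PAGE  512UL
    #define SPAN              (P2M_MID_PER_PAGE * P2M_PER_PAGE)
    #define ALIGN_UP(x, a)    (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
        unsigned long pfn_s = 0xa0, pfn_e = 0x100000;
        unsigned long pfn;

        /* Same rounding as the for-loop in set_phys_range_identity(). */
        for (pfn = pfn_s & ~(SPAN - 1); pfn < ALIGN_UP(pfn_e, SPAN); pfn += SPAN)
            printf("mid page needed for PFNs %#lx..%#lx\n", pfn, pfn + SPAN - 1);
        return 0;
    }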
575
357/* Try to install p2m mapping; fail if intermediate bits missing */ 576/* Try to install p2m mapping; fail if intermediate bits missing */
358bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) 577bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
359{ 578{
360 unsigned topidx, mididx, idx; 579 unsigned topidx, mididx, idx;
361 580
581 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
582 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
583 return true;
584 }
362 if (unlikely(pfn >= MAX_P2M_PFN)) { 585 if (unlikely(pfn >= MAX_P2M_PFN)) {
363 BUG_ON(mfn != INVALID_P2M_ENTRY); 586 BUG_ON(mfn != INVALID_P2M_ENTRY);
364 return true; 587 return true;
@@ -368,6 +591,21 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
368 mididx = p2m_mid_index(pfn); 591 mididx = p2m_mid_index(pfn);
369 idx = p2m_index(pfn); 592 idx = p2m_index(pfn);
370 593
 594 /* For sparse holes where the p2m leaf has a real PFN along with
595 * PCI holes, stick in the PFN as the MFN value.
596 */
597 if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
598 if (p2m_top[topidx][mididx] == p2m_identity)
599 return true;
600
601 /* Swap over from MISSING to IDENTITY if needed. */
602 if (p2m_top[topidx][mididx] == p2m_missing) {
603 WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
604 p2m_identity) != p2m_missing);
605 return true;
606 }
607 }
608
371 if (p2m_top[topidx][mididx] == p2m_missing) 609 if (p2m_top[topidx][mididx] == p2m_missing)
372 return mfn == INVALID_P2M_ENTRY; 610 return mfn == INVALID_P2M_ENTRY;
373 611
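The MISSING-to-IDENTITY swap above relies on cmpxchg() so a racing allocator cannot clobber the leaf pointer. A user-space sketch of the same idea, with the GCC builtin standing in for the kernel primitive:

    #include <stdio.h>

    static unsigned long missing_leaf[1], identity_leaf[1];
    static unsigned long *slot = missing_leaf;   /* plays p2m_top[topidx][mididx] */

    int main(void)
    {
        /* Install the identity leaf only if the slot still holds the
         * shared "missing" page; a loser of the race sees the old value. */
        unsigned long *prev = __sync_val_compare_and_swap(&slot,
                                  missing_leaf, identity_leaf);

        if (prev == missing_leaf)
            printf("swapped MISSING -> IDENTITY\n");
        else
            printf("lost the race; slot already %p\n", (void *)prev);
        return 0;
    }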
@@ -378,11 +616,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
378 616
379bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) 617bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
380{ 618{
381 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
382 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
383 return true;
384 }
385
386 if (unlikely(!__set_phys_to_machine(pfn, mfn))) { 619 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
387 if (!alloc_p2m(pfn)) 620 if (!alloc_p2m(pfn))
388 return false; 621 return false;
@@ -421,7 +654,7 @@ int m2p_add_override(unsigned long mfn, struct page *page)
421{ 654{
422 unsigned long flags; 655 unsigned long flags;
423 unsigned long pfn; 656 unsigned long pfn;
424 unsigned long address; 657 unsigned long uninitialized_var(address);
425 unsigned level; 658 unsigned level;
426 pte_t *ptep = NULL; 659 pte_t *ptep = NULL;
427 660
@@ -455,7 +688,7 @@ int m2p_remove_override(struct page *page)
455 unsigned long flags; 688 unsigned long flags;
456 unsigned long mfn; 689 unsigned long mfn;
457 unsigned long pfn; 690 unsigned long pfn;
458 unsigned long address; 691 unsigned long uninitialized_var(address);
459 unsigned level; 692 unsigned level;
460 pte_t *ptep = NULL; 693 pte_t *ptep = NULL;
461 694
@@ -520,3 +753,80 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
520 return ret; 753 return ret;
521} 754}
522EXPORT_SYMBOL_GPL(m2p_find_override_pfn); 755EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
756
757#ifdef CONFIG_XEN_DEBUG_FS
758
759int p2m_dump_show(struct seq_file *m, void *v)
760{
761 static const char * const level_name[] = { "top", "middle",
762 "entry", "abnormal" };
763 static const char * const type_name[] = { "identity", "missing",
764 "pfn", "abnormal"};
765#define TYPE_IDENTITY 0
766#define TYPE_MISSING 1
767#define TYPE_PFN 2
768#define TYPE_UNKNOWN 3
769 unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
770 unsigned int uninitialized_var(prev_level);
771 unsigned int uninitialized_var(prev_type);
772
773 if (!p2m_top)
774 return 0;
775
776 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) {
777 unsigned topidx = p2m_top_index(pfn);
778 unsigned mididx = p2m_mid_index(pfn);
779 unsigned idx = p2m_index(pfn);
780 unsigned lvl, type;
781
782 lvl = 4;
783 type = TYPE_UNKNOWN;
784 if (p2m_top[topidx] == p2m_mid_missing) {
785 lvl = 0; type = TYPE_MISSING;
786 } else if (p2m_top[topidx] == NULL) {
787 lvl = 0; type = TYPE_UNKNOWN;
788 } else if (p2m_top[topidx][mididx] == NULL) {
789 lvl = 1; type = TYPE_UNKNOWN;
790 } else if (p2m_top[topidx][mididx] == p2m_identity) {
791 lvl = 1; type = TYPE_IDENTITY;
792 } else if (p2m_top[topidx][mididx] == p2m_missing) {
793 lvl = 1; type = TYPE_MISSING;
794 } else if (p2m_top[topidx][mididx][idx] == 0) {
795 lvl = 2; type = TYPE_UNKNOWN;
796 } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) {
797 lvl = 2; type = TYPE_IDENTITY;
798 } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) {
799 lvl = 2; type = TYPE_MISSING;
800 } else if (p2m_top[topidx][mididx][idx] == pfn) {
801 lvl = 2; type = TYPE_PFN;
802 } else if (p2m_top[topidx][mididx][idx] != pfn) {
803 lvl = 2; type = TYPE_PFN;
804 }
805 if (pfn == 0) {
806 prev_level = lvl;
807 prev_type = type;
808 }
809 if (pfn == MAX_DOMAIN_PAGES-1) {
810 lvl = 3;
811 type = TYPE_UNKNOWN;
812 }
813 if (prev_type != type) {
814 seq_printf(m, " [0x%lx->0x%lx] %s\n",
815 prev_pfn_type, pfn, type_name[prev_type]);
816 prev_pfn_type = pfn;
817 prev_type = type;
818 }
819 if (prev_level != lvl) {
820 seq_printf(m, " [0x%lx->0x%lx] level %s\n",
821 prev_pfn_level, pfn, level_name[prev_level]);
822 prev_pfn_level = pfn;
823 prev_level = lvl;
824 }
825 }
826 return 0;
827#undef TYPE_IDENTITY
828#undef TYPE_MISSING
829#undef TYPE_PFN
830#undef TYPE_UNKNOWN
831}
832#endif
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index a8a66a50d44..fa0269a9937 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -52,6 +52,8 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
52 52
53static __init void xen_add_extra_mem(unsigned long pages) 53static __init void xen_add_extra_mem(unsigned long pages)
54{ 54{
55 unsigned long pfn;
56
55 u64 size = (u64)pages * PAGE_SIZE; 57 u64 size = (u64)pages * PAGE_SIZE;
56 u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; 58 u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
57 59
@@ -66,6 +68,9 @@ static __init void xen_add_extra_mem(unsigned long pages)
66 xen_extra_mem_size += size; 68 xen_extra_mem_size += size;
67 69
68 xen_max_p2m_pfn = PFN_DOWN(extra_start + size); 70 xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
71
72 for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
73 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
69} 74}
70 75
71static unsigned long __init xen_release_chunk(phys_addr_t start_addr, 76static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
@@ -104,7 +109,7 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
104 WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", 109 WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
105 start, end, ret); 110 start, end, ret);
106 if (ret == 1) { 111 if (ret == 1) {
107 set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 112 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
108 len++; 113 len++;
109 } 114 }
110 } 115 }
@@ -138,12 +143,55 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
138 return released; 143 return released;
139} 144}
140 145
146static unsigned long __init xen_set_identity(const struct e820entry *list,
147 ssize_t map_size)
148{
149 phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
150 phys_addr_t start_pci = last;
151 const struct e820entry *entry;
152 unsigned long identity = 0;
153 int i;
154
155 for (i = 0, entry = list; i < map_size; i++, entry++) {
156 phys_addr_t start = entry->addr;
157 phys_addr_t end = start + entry->size;
158
159 if (start < last)
160 start = last;
161
162 if (end <= start)
163 continue;
164
165 /* Skip over the 1MB region. */
166 if (last > end)
167 continue;
168
169 if (entry->type == E820_RAM) {
170 if (start > start_pci)
171 identity += set_phys_range_identity(
172 PFN_UP(start_pci), PFN_DOWN(start));
173
 174 /* Without saving 'last' we would gobble RAM too
175 * at the end of the loop. */
176 last = end;
177 start_pci = end;
178 continue;
179 }
180 start_pci = min(start, start_pci);
181 last = end;
182 }
183 if (last > start_pci)
184 identity += set_phys_range_identity(
185 PFN_UP(start_pci), PFN_DOWN(last));
186 return identity;
187}
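xen_set_identity() treats everything between the end of one RAM entry and the start of the next as a candidate 1-1 range. A compilable toy version of the same walk, with a made-up memory map (the real code also special-cases the region below ISA_END_ADDRESS for domU and skips entries swallowed by the 1MB region):

    #include <stdio.h>

    struct ent { unsigned long addr, size; int ram; };

    int main(void)
    {
        /* Made-up map: low RAM, the legacy VGA/BIOS hole, main RAM,
         * then a PCI hole below 4GB. */
        struct ent list[] = {
            { 0x0,        0x9f000,    1 },
            { 0x9f000,    0x61000,    0 },
            { 0x100000,   0x1ff00000, 1 },
            { 0x20000000, 0x10000000, 0 },
        };
        unsigned long last = 0, start_pci = 0;
        unsigned int i;

        for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) {
            unsigned long start = list[i].addr;
            unsigned long end = start + list[i].size;

            if (start < last)
                start = last;
            if (end <= start)
                continue;

            if (list[i].ram) {
                if (start > start_pci)
                    printf("1-1 range %#lx -> %#lx\n", start_pci, start);
                last = end;
                start_pci = end;
                continue;
            }
            start_pci = start < start_pci ? start : start_pci;
            last = end;
        }
        if (last > start_pci)
            printf("1-1 range %#lx -> %#lx\n", start_pci, last);
        return 0;
    }

With this map the walk emits the VGA/BIOS hole (0x9f000 -> 0x100000) and the trailing PCI hole (0x20000000 -> 0x30000000).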
141/** 188/**
142 * machine_specific_memory_setup - Hook for machine specific memory setup. 189 * machine_specific_memory_setup - Hook for machine specific memory setup.
143 **/ 190 **/
144char * __init xen_memory_setup(void) 191char * __init xen_memory_setup(void)
145{ 192{
146 static struct e820entry map[E820MAX] __initdata; 193 static struct e820entry map[E820MAX] __initdata;
194 static struct e820entry map_raw[E820MAX] __initdata;
147 195
148 unsigned long max_pfn = xen_start_info->nr_pages; 196 unsigned long max_pfn = xen_start_info->nr_pages;
149 unsigned long long mem_end; 197 unsigned long long mem_end;
@@ -151,6 +199,7 @@ char * __init xen_memory_setup(void)
151 struct xen_memory_map memmap; 199 struct xen_memory_map memmap;
152 unsigned long extra_pages = 0; 200 unsigned long extra_pages = 0;
153 unsigned long extra_limit; 201 unsigned long extra_limit;
202 unsigned long identity_pages = 0;
154 int i; 203 int i;
155 int op; 204 int op;
156 205
@@ -176,6 +225,7 @@ char * __init xen_memory_setup(void)
176 } 225 }
177 BUG_ON(rc); 226 BUG_ON(rc);
178 227
228 memcpy(map_raw, map, sizeof(map));
179 e820.nr_map = 0; 229 e820.nr_map = 0;
180 xen_extra_mem_start = mem_end; 230 xen_extra_mem_start = mem_end;
181 for (i = 0; i < memmap.nr_entries; i++) { 231 for (i = 0; i < memmap.nr_entries; i++) {
@@ -194,6 +244,15 @@ char * __init xen_memory_setup(void)
194 end -= delta; 244 end -= delta;
195 245
196 extra_pages += PFN_DOWN(delta); 246 extra_pages += PFN_DOWN(delta);
247 /*
 248 * Mark RAM below 4GB that is not ours as unusable.
 249 * This prevents "System RAM" address space from being
 250 * used as a potential resource for I/O addresses (which
 251 * happens when 'allocate_resource' is called).
252 */
253 if (delta &&
254 (xen_initial_domain() && end < 0x100000000ULL))
255 e820_add_region(end, delta, E820_UNUSABLE);
197 } 256 }
198 257
199 if (map[i].size > 0 && end > xen_extra_mem_start) 258 if (map[i].size > 0 && end > xen_extra_mem_start)
@@ -251,6 +310,13 @@ char * __init xen_memory_setup(void)
251 310
252 xen_add_extra_mem(extra_pages); 311 xen_add_extra_mem(extra_pages);
253 312
313 /*
314 * Set P2M for all non-RAM pages and E820 gaps to be identity
315 * type PFNs. We supply it with the non-sanitized version
316 * of the E820.
317 */
318 identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
319 printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
254 return "Xen"; 320 return "Xen";
255} 321}
256 322
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 72a4c795904..30612441ed9 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -509,3 +509,41 @@ void __init xen_smp_init(void)
509 xen_fill_possible_map(); 509 xen_fill_possible_map();
510 xen_init_spinlocks(); 510 xen_init_spinlocks();
511} 511}
512
513static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
514{
515 native_smp_prepare_cpus(max_cpus);
516 WARN_ON(xen_smp_intr_init(0));
517
518 if (!xen_have_vector_callback)
519 return;
520 xen_init_lock_cpu(0);
521 xen_init_spinlocks();
522}
523
524static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
525{
526 int rc;
527 rc = native_cpu_up(cpu);
528 WARN_ON (xen_smp_intr_init(cpu));
529 return rc;
530}
531
532static void xen_hvm_cpu_die(unsigned int cpu)
533{
534 unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
535 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
536 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
537 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
538 native_cpu_die(cpu);
539}
540
541void __init xen_hvm_smp_init(void)
542{
543 smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
544 smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
545 smp_ops.cpu_up = xen_hvm_cpu_up;
546 smp_ops.cpu_die = xen_hvm_cpu_die;
547 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
548 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
549}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 9bbd63a129b..45329c8c226 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,7 +12,7 @@
12#include "xen-ops.h" 12#include "xen-ops.h"
13#include "mmu.h" 13#include "mmu.h"
14 14
15void xen_pre_suspend(void) 15void xen_arch_pre_suspend(void)
16{ 16{
17 xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); 17 xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
18 xen_start_info->console.domU.mfn = 18 xen_start_info->console.domU.mfn =
@@ -26,8 +26,9 @@ void xen_pre_suspend(void)
26 BUG(); 26 BUG();
27} 27}
28 28
29void xen_hvm_post_suspend(int suspend_cancelled) 29void xen_arch_hvm_post_suspend(int suspend_cancelled)
30{ 30{
31#ifdef CONFIG_XEN_PVHVM
31 int cpu; 32 int cpu;
32 xen_hvm_init_shared_info(); 33 xen_hvm_init_shared_info();
33 xen_callback_vector(); 34 xen_callback_vector();
@@ -37,9 +38,10 @@ void xen_hvm_post_suspend(int suspend_cancelled)
37 xen_setup_runstate_info(cpu); 38 xen_setup_runstate_info(cpu);
38 } 39 }
39 } 40 }
41#endif
40} 42}
41 43
42void xen_post_suspend(int suspend_cancelled) 44void xen_arch_post_suspend(int suspend_cancelled)
43{ 45{
44 xen_build_mfn_list_list(); 46 xen_build_mfn_list_list();
45 47
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 067759e3d6a..2e2d370a47b 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -397,7 +397,9 @@ void xen_setup_timer(int cpu)
397 name = "<timer kasprintf failed>"; 397 name = "<timer kasprintf failed>";
398 398
399 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, 399 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
400 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER, 400 IRQF_DISABLED|IRQF_PERCPU|
401 IRQF_NOBALANCING|IRQF_TIMER|
402 IRQF_FORCE_RESUME,
401 name, NULL); 403 name, NULL);
402 404
403 evt = &per_cpu(xen_clock_events, cpu); 405 evt = &per_cpu(xen_clock_events, cpu);
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 1a5ff24e29c..aaa7291c925 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -28,9 +28,9 @@ ENTRY(startup_xen)
28 __FINIT 28 __FINIT
29 29
30.pushsection .text 30.pushsection .text
31 .align PAGE_SIZE_asm 31 .align PAGE_SIZE
32ENTRY(hypercall_page) 32ENTRY(hypercall_page)
33 .skip PAGE_SIZE_asm 33 .skip PAGE_SIZE
34.popsection 34.popsection
35 35
36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9d41bf98575..3112f55638c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -64,10 +64,12 @@ void xen_setup_vcpu_info_placement(void);
64 64
65#ifdef CONFIG_SMP 65#ifdef CONFIG_SMP
66void xen_smp_init(void); 66void xen_smp_init(void);
67void __init xen_hvm_smp_init(void);
67 68
68extern cpumask_var_t xen_cpu_initialized_map; 69extern cpumask_var_t xen_cpu_initialized_map;
69#else 70#else
70static inline void xen_smp_init(void) {} 71static inline void xen_smp_init(void) {}
72static inline void xen_hvm_smp_init(void) {}
71#endif 73#endif
72 74
73#ifdef CONFIG_PARAVIRT_SPINLOCKS 75#ifdef CONFIG_PARAVIRT_SPINLOCKS
diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h
index e39edf5c86f..249619e7e7f 100644
--- a/arch/xtensa/include/asm/rwsem.h
+++ b/arch/xtensa/include/asm/rwsem.h
@@ -17,44 +17,12 @@
17#error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead." 17#error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead."
18#endif 18#endif
19 19
20#include <linux/list.h>
21#include <linux/spinlock.h>
22#include <asm/atomic.h>
23#include <asm/system.h>
24
25/*
26 * the semaphore definition
27 */
28struct rw_semaphore {
29 signed long count;
30#define RWSEM_UNLOCKED_VALUE 0x00000000 20#define RWSEM_UNLOCKED_VALUE 0x00000000
31#define RWSEM_ACTIVE_BIAS 0x00000001 21#define RWSEM_ACTIVE_BIAS 0x00000001
32#define RWSEM_ACTIVE_MASK 0x0000ffff 22#define RWSEM_ACTIVE_MASK 0x0000ffff
33#define RWSEM_WAITING_BIAS (-0x00010000) 23#define RWSEM_WAITING_BIAS (-0x00010000)
34#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS 24#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
35#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) 25#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
36 spinlock_t wait_lock;
37 struct list_head wait_list;
38};
39
40#define __RWSEM_INITIALIZER(name) \
41 { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \
42 LIST_HEAD_INIT((name).wait_list) }
43
44#define DECLARE_RWSEM(name) \
45 struct rw_semaphore name = __RWSEM_INITIALIZER(name)
46
47extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
48extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
49extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
50extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
51
52static inline void init_rwsem(struct rw_semaphore *sem)
53{
54 sem->count = RWSEM_UNLOCKED_VALUE;
55 spin_lock_init(&sem->wait_lock);
56 INIT_LIST_HEAD(&sem->wait_list);
57}
58 26
59/* 27/*
60 * lock for reading 28 * lock for reading
@@ -160,9 +128,4 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
160 return atomic_add_return(delta, (atomic_t *)(&sem->count)); 128 return atomic_add_return(delta, (atomic_t *)(&sem->count));
161} 129}
162 130
163static inline int rwsem_is_locked(struct rw_semaphore *sem)
164{
165 return (sem->count != 0);
166}
167
168#endif /* _XTENSA_RWSEM_H */ 131#endif /* _XTENSA_RWSEM_H */
diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c
index 19df764f639..f3e5eb43f71 100644
--- a/arch/xtensa/kernel/time.c
+++ b/arch/xtensa/kernel/time.c
@@ -96,16 +96,12 @@ again:
96 update_process_times(user_mode(get_irq_regs())); 96 update_process_times(user_mode(get_irq_regs()));
97#endif 97#endif
98 98
99 write_seqlock(&xtime_lock); 99 xtime_update(1); /* Linux handler in kernel/time/timekeeping */
100
101 do_timer(1); /* Linux handler in kernel/timer.c */
102 100
103 /* Note that writing CCOMPARE clears the interrupt. */ 101 /* Note that writing CCOMPARE clears the interrupt. */
104 102
105 next += CCOUNT_PER_JIFFY; 103 next += CCOUNT_PER_JIFFY;
106 set_linux_timer(next); 104 set_linux_timer(next);
107
108 write_sequnlock(&xtime_lock);
109 } 105 }
110 106
111 /* Allow platform to do something useful (Wdog). */ 107 /* Allow platform to do something useful (Wdog). */
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 5eb25eb3ea4..3b5c3189fd9 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -274,7 +274,7 @@ acpi_table_parse_srat(enum acpi_srat_type id,
274 274
275int __init acpi_numa_init(void) 275int __init acpi_numa_init(void)
276{ 276{
277 int ret = 0; 277 int cnt = 0;
278 278
279 /* 279 /*
280 * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= 280 * Should not limit number with cpu num that is from NR_CPUS or nr_cpus=
@@ -288,7 +288,7 @@ int __init acpi_numa_init(void)
288 acpi_parse_x2apic_affinity, 0); 288 acpi_parse_x2apic_affinity, 0);
289 acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, 289 acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
290 acpi_parse_processor_affinity, 0); 290 acpi_parse_processor_affinity, 0);
291 ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, 291 cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
292 acpi_parse_memory_affinity, 292 acpi_parse_memory_affinity,
293 NR_NODE_MEMBLKS); 293 NR_NODE_MEMBLKS);
294 } 294 }
@@ -297,7 +297,10 @@ int __init acpi_numa_init(void)
297 acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); 297 acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
298 298
299 acpi_numa_arch_fixup(); 299 acpi_numa_arch_fixup();
300 return ret; 300
301 if (cnt <= 0)
302 return cnt ?: -ENOENT;
303 return 0;
301} 304}
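The `cnt ?: -ENOENT` above is the GNU "Elvis" operator: `a ?: b` evaluates to `a` if it is non-zero, else `b`, without evaluating `a` twice. A stand-alone check of the new return convention (builds with GCC or Clang):

    #include <stdio.h>
    #include <errno.h>

    static int numa_init_result(int cnt)
    {
        if (cnt <= 0)
            return cnt ?: -ENOENT;  /* keep a real error, map "no entries" to -ENOENT */
        return 0;
    }

    int main(void)
    {
        /* error passthrough, empty SRAT, success */
        printf("%d %d %d\n", numa_init_result(-5),
               numa_init_result(0), numa_init_result(3));
        return 0;
    }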
302 305
303int acpi_get_pxm(acpi_handle h) 306int acpi_get_pxm(acpi_handle h)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index d7aa39e349a..9cb8668ff5f 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -120,6 +120,10 @@ static DEFINE_SPINLOCK(minor_lock);
120#define EXTENDED (1<<EXT_SHIFT) 120#define EXTENDED (1<<EXT_SHIFT)
121#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED)) 121#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
122#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED)) 122#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
123#define EMULATED_HD_DISK_MINOR_OFFSET (0)
124#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
125#define EMULATED_SD_DISK_MINOR_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET + (4 * 16))
126#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_HD_DISK_NAME_OFFSET + 4)
123 127
124#define DEV_NAME "xvd" /* name in /dev */ 128#define DEV_NAME "xvd" /* name in /dev */
125 129
@@ -281,7 +285,7 @@ static int blkif_queue_request(struct request *req)
281 info->shadow[id].request = req; 285 info->shadow[id].request = req;
282 286
283 ring_req->id = id; 287 ring_req->id = id;
284 ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req); 288 ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
285 ring_req->handle = info->handle; 289 ring_req->handle = info->handle;
286 290
287 ring_req->operation = rq_data_dir(req) ? 291 ring_req->operation = rq_data_dir(req) ?
@@ -317,7 +321,7 @@ static int blkif_queue_request(struct request *req)
317 rq_data_dir(req) ); 321 rq_data_dir(req) );
318 322
319 info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); 323 info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
320 ring_req->seg[i] = 324 ring_req->u.rw.seg[i] =
321 (struct blkif_request_segment) { 325 (struct blkif_request_segment) {
322 .gref = ref, 326 .gref = ref,
323 .first_sect = fsect, 327 .first_sect = fsect,
@@ -434,6 +438,65 @@ static void xlvbd_flush(struct blkfront_info *info)
434 info->feature_flush ? "enabled" : "disabled"); 438 info->feature_flush ? "enabled" : "disabled");
435} 439}
436 440
441static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
442{
443 int major;
444 major = BLKIF_MAJOR(vdevice);
445 *minor = BLKIF_MINOR(vdevice);
446 switch (major) {
447 case XEN_IDE0_MAJOR:
448 *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
449 *minor = ((*minor / 64) * PARTS_PER_DISK) +
450 EMULATED_HD_DISK_MINOR_OFFSET;
451 break;
452 case XEN_IDE1_MAJOR:
453 *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
454 *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
455 EMULATED_HD_DISK_MINOR_OFFSET;
456 break;
457 case XEN_SCSI_DISK0_MAJOR:
458 *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
459 *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
460 break;
461 case XEN_SCSI_DISK1_MAJOR:
462 case XEN_SCSI_DISK2_MAJOR:
463 case XEN_SCSI_DISK3_MAJOR:
464 case XEN_SCSI_DISK4_MAJOR:
465 case XEN_SCSI_DISK5_MAJOR:
466 case XEN_SCSI_DISK6_MAJOR:
467 case XEN_SCSI_DISK7_MAJOR:
468 *offset = (*minor / PARTS_PER_DISK) +
469 ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
470 EMULATED_SD_DISK_NAME_OFFSET;
471 *minor = *minor +
472 ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
473 EMULATED_SD_DISK_MINOR_OFFSET;
474 break;
475 case XEN_SCSI_DISK8_MAJOR:
476 case XEN_SCSI_DISK9_MAJOR:
477 case XEN_SCSI_DISK10_MAJOR:
478 case XEN_SCSI_DISK11_MAJOR:
479 case XEN_SCSI_DISK12_MAJOR:
480 case XEN_SCSI_DISK13_MAJOR:
481 case XEN_SCSI_DISK14_MAJOR:
482 case XEN_SCSI_DISK15_MAJOR:
483 *offset = (*minor / PARTS_PER_DISK) +
484 ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
485 EMULATED_SD_DISK_NAME_OFFSET;
486 *minor = *minor +
487 ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
488 EMULATED_SD_DISK_MINOR_OFFSET;
489 break;
490 case XENVBD_MAJOR:
491 *offset = *minor / PARTS_PER_DISK;
492 break;
493 default:
494 printk(KERN_WARNING "blkfront: your disk configuration is "
495 "incorrect, please use an xvd device instead\n");
496 return -ENODEV;
497 }
498 return 0;
499}
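A worked check of the IDE branch above, assuming PARTS_PER_DISK is 16 as elsewhere in blkfront: IDE minors advance in steps of 64 per disk, so hda (minor 0) and hdb (minor 64) land on xvda and xvdb with minors 0 and 16.

    #include <stdio.h>

    #define PARTS_PER_DISK 16   /* assumed, matches blkfront's default */

    int main(void)
    {
        int minor;

        for (minor = 0; minor <= 64; minor += 64) {
            int offset    = minor / 64;                    /* disk name index */
            int xvd_minor = (minor / 64) * PARTS_PER_DISK; /* new minor base */

            printf("hd minor %2d -> xvd%c (minor %d)\n",
                   minor, 'a' + offset, xvd_minor);
        }
        return 0;
    }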
437 500
438static int xlvbd_alloc_gendisk(blkif_sector_t capacity, 501static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
439 struct blkfront_info *info, 502 struct blkfront_info *info,
@@ -441,7 +504,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
441{ 504{
442 struct gendisk *gd; 505 struct gendisk *gd;
443 int nr_minors = 1; 506 int nr_minors = 1;
444 int err = -ENODEV; 507 int err;
445 unsigned int offset; 508 unsigned int offset;
446 int minor; 509 int minor;
447 int nr_parts; 510 int nr_parts;
@@ -456,12 +519,20 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
456 } 519 }
457 520
458 if (!VDEV_IS_EXTENDED(info->vdevice)) { 521 if (!VDEV_IS_EXTENDED(info->vdevice)) {
459 minor = BLKIF_MINOR(info->vdevice); 522 err = xen_translate_vdev(info->vdevice, &minor, &offset);
460 nr_parts = PARTS_PER_DISK; 523 if (err)
524 return err;
525 nr_parts = PARTS_PER_DISK;
461 } else { 526 } else {
462 minor = BLKIF_MINOR_EXT(info->vdevice); 527 minor = BLKIF_MINOR_EXT(info->vdevice);
463 nr_parts = PARTS_PER_EXT_DISK; 528 nr_parts = PARTS_PER_EXT_DISK;
529 offset = minor / nr_parts;
530 if (xen_hvm_domain() && offset <= EMULATED_HD_DISK_NAME_OFFSET + 4)
531 printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
 532 "emulated IDE disks,\n\t choose an xvd device name "
533 "from xvde on\n", info->vdevice);
464 } 534 }
535 err = -ENODEV;
465 536
466 if ((minor % nr_parts) == 0) 537 if ((minor % nr_parts) == 0)
467 nr_minors = nr_parts; 538 nr_minors = nr_parts;
@@ -475,8 +546,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
475 if (gd == NULL) 546 if (gd == NULL)
476 goto release; 547 goto release;
477 548
478 offset = minor / nr_parts;
479
480 if (nr_minors > 1) { 549 if (nr_minors > 1) {
481 if (offset < 26) 550 if (offset < 26)
482 sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset); 551 sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
@@ -615,7 +684,7 @@ static void blkif_completion(struct blk_shadow *s)
615{ 684{
616 int i; 685 int i;
617 for (i = 0; i < s->req.nr_segments; i++) 686 for (i = 0; i < s->req.nr_segments; i++)
618 gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL); 687 gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL);
619} 688}
620 689
621static irqreturn_t blkif_interrupt(int irq, void *dev_id) 690static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -932,7 +1001,7 @@ static int blkif_recover(struct blkfront_info *info)
932 /* Rewrite any grant references invalidated by susp/resume. */ 1001 /* Rewrite any grant references invalidated by susp/resume. */
933 for (j = 0; j < req->nr_segments; j++) 1002 for (j = 0; j < req->nr_segments; j++)
934 gnttab_grant_foreign_access_ref( 1003 gnttab_grant_foreign_access_ref(
935 req->seg[j].gref, 1004 req->u.rw.seg[j].gref,
936 info->xbdev->otherend_id, 1005 info->xbdev->otherend_id,
937 pfn_to_mfn(info->shadow[req->id].frame[j]), 1006 pfn_to_mfn(info->shadow[req->id].frame[j]),
938 rq_data_dir(info->shadow[req->id].request)); 1007 rq_data_dir(info->shadow[req->id].request));
diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index e6d75627c6c..33dc2298af7 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -53,6 +53,8 @@ MODULE_LICENSE("GPL");
53 53
54#define RTC_BITS 55 /* 55 bits for this implementation */ 54#define RTC_BITS 55 /* 55 bits for this implementation */
55 55
56static struct k_clock sgi_clock;
57
56extern unsigned long sn_rtc_cycles_per_second; 58extern unsigned long sn_rtc_cycles_per_second;
57 59
58#define RTC_COUNTER_ADDR ((long *)LOCAL_MMR_ADDR(SH_RTC)) 60#define RTC_COUNTER_ADDR ((long *)LOCAL_MMR_ADDR(SH_RTC))
@@ -487,7 +489,7 @@ static int sgi_clock_get(clockid_t clockid, struct timespec *tp)
487 return 0; 489 return 0;
488}; 490};
489 491
490static int sgi_clock_set(clockid_t clockid, struct timespec *tp) 492static int sgi_clock_set(const clockid_t clockid, const struct timespec *tp)
491{ 493{
492 494
493 u64 nsec; 495 u64 nsec;
@@ -763,15 +765,21 @@ static int sgi_timer_set(struct k_itimer *timr, int flags,
763 return err; 765 return err;
764} 766}
765 767
768static int sgi_clock_getres(const clockid_t which_clock, struct timespec *tp)
769{
770 tp->tv_sec = 0;
771 tp->tv_nsec = sgi_clock_period;
772 return 0;
773}
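sgi_clock_getres() reports one RTC cycle as the clock resolution, using the period that the registration change below computes once at init. For a hypothetical 50 MHz RTC (made-up rate, for illustration only) that works out to 20 ns:

    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000UL

    int main(void)
    {
        unsigned long sn_rtc_cycles_per_second = 50000000UL;  /* assumed rate */
        unsigned long sgi_clock_period = NSEC_PER_SEC / sn_rtc_cycles_per_second;

        printf("resolution: %lu ns\n", sgi_clock_period);
        return 0;
    }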
774
766static struct k_clock sgi_clock = { 775static struct k_clock sgi_clock = {
767 .res = 0, 776 .clock_set = sgi_clock_set,
768 .clock_set = sgi_clock_set, 777 .clock_get = sgi_clock_get,
769 .clock_get = sgi_clock_get, 778 .clock_getres = sgi_clock_getres,
770 .timer_create = sgi_timer_create, 779 .timer_create = sgi_timer_create,
771 .nsleep = do_posix_clock_nonanosleep, 780 .timer_set = sgi_timer_set,
772 .timer_set = sgi_timer_set, 781 .timer_del = sgi_timer_del,
773 .timer_del = sgi_timer_del, 782 .timer_get = sgi_timer_get
774 .timer_get = sgi_timer_get
775}; 783};
776 784
777/** 785/**
@@ -831,8 +839,8 @@ static int __init mmtimer_init(void)
831 (unsigned long) node); 839 (unsigned long) node);
832 } 840 }
833 841
834 sgi_clock_period = sgi_clock.res = NSEC_PER_SEC / sn_rtc_cycles_per_second; 842 sgi_clock_period = NSEC_PER_SEC / sn_rtc_cycles_per_second;
835 register_posix_clock(CLOCK_SGI_CYCLE, &sgi_clock); 843 posix_timers_register_clock(CLOCK_SGI_CYCLE, &sgi_clock);
836 844
837 printk(KERN_INFO "%s: v%s, %ld MHz\n", MMTIMER_DESC, MMTIMER_VERSION, 845 printk(KERN_INFO "%s: v%s, %ld MHz\n", MMTIMER_DESC, MMTIMER_VERSION,
838 sn_rtc_cycles_per_second/(unsigned long)1E6); 846 sn_rtc_cycles_per_second/(unsigned long)1E6);
diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c
index 61653f07967..1b46a9d9f90 100644
--- a/drivers/i2c/busses/i2c-ocores.c
+++ b/drivers/i2c/busses/i2c-ocores.c
@@ -330,9 +330,7 @@ static int __devinit ocores_i2c_probe(struct platform_device *pdev)
330 i2c->adap = ocores_adapter; 330 i2c->adap = ocores_adapter;
331 i2c_set_adapdata(&i2c->adap, i2c); 331 i2c_set_adapdata(&i2c->adap, i2c);
332 i2c->adap.dev.parent = &pdev->dev; 332 i2c->adap.dev.parent = &pdev->dev;
333#ifdef CONFIG_OF
334 i2c->adap.dev.of_node = pdev->dev.of_node; 333 i2c->adap.dev.of_node = pdev->dev.of_node;
335#endif
336 334
337 /* add i2c adapter to i2c tree */ 335 /* add i2c adapter to i2c tree */
338 ret = i2c_add_adapter(&i2c->adap); 336 ret = i2c_add_adapter(&i2c->adap);
@@ -390,15 +388,11 @@ static int ocores_i2c_resume(struct platform_device *pdev)
390#define ocores_i2c_resume NULL 388#define ocores_i2c_resume NULL
391#endif 389#endif
392 390
393#ifdef CONFIG_OF
394static struct of_device_id ocores_i2c_match[] = { 391static struct of_device_id ocores_i2c_match[] = {
395 { 392 { .compatible = "opencores,i2c-ocores", },
396 .compatible = "opencores,i2c-ocores", 393 {},
397 },
398 {},
399}; 394};
400MODULE_DEVICE_TABLE(of, ocores_i2c_match); 395MODULE_DEVICE_TABLE(of, ocores_i2c_match);
401#endif
402 396
403/* work with hotplug and coldplug */ 397/* work with hotplug and coldplug */
404MODULE_ALIAS("platform:ocores-i2c"); 398MODULE_ALIAS("platform:ocores-i2c");
@@ -411,9 +405,7 @@ static struct platform_driver ocores_i2c_driver = {
411 .driver = { 405 .driver = {
412 .owner = THIS_MODULE, 406 .owner = THIS_MODULE,
413 .name = "ocores-i2c", 407 .name = "ocores-i2c",
414#ifdef CONFIG_OF 408 .of_match_table = ocores_i2c_match,
415 .of_match_table = ocores_i2c_match,
416#endif
417 }, 409 },
418}; 410};
419 411
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index f0bd5bcdf56..045ba6efea4 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -537,9 +537,7 @@ i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info)
537 client->dev.parent = &client->adapter->dev; 537 client->dev.parent = &client->adapter->dev;
538 client->dev.bus = &i2c_bus_type; 538 client->dev.bus = &i2c_bus_type;
539 client->dev.type = &i2c_client_type; 539 client->dev.type = &i2c_client_type;
540#ifdef CONFIG_OF
541 client->dev.of_node = info->of_node; 540 client->dev.of_node = info->of_node;
542#endif
543 541
544 dev_set_name(&client->dev, "%d-%04x", i2c_adapter_id(adap), 542 dev_set_name(&client->dev, "%d-%04x", i2c_adapter_id(adap),
545 client->addr); 543 client->addr);
diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c
index fd877f633dd..2f7fc0c5146 100644
--- a/drivers/mmc/host/mmc_spi.c
+++ b/drivers/mmc/host/mmc_spi.c
@@ -1516,21 +1516,17 @@ static int __devexit mmc_spi_remove(struct spi_device *spi)
1516 return 0; 1516 return 0;
1517} 1517}
1518 1518
1519#if defined(CONFIG_OF)
1520static struct of_device_id mmc_spi_of_match_table[] __devinitdata = { 1519static struct of_device_id mmc_spi_of_match_table[] __devinitdata = {
1521 { .compatible = "mmc-spi-slot", }, 1520 { .compatible = "mmc-spi-slot", },
1522 {}, 1521 {},
1523}; 1522};
1524#endif
1525 1523
1526static struct spi_driver mmc_spi_driver = { 1524static struct spi_driver mmc_spi_driver = {
1527 .driver = { 1525 .driver = {
1528 .name = "mmc_spi", 1526 .name = "mmc_spi",
1529 .bus = &spi_bus_type, 1527 .bus = &spi_bus_type,
1530 .owner = THIS_MODULE, 1528 .owner = THIS_MODULE,
1531#if defined(CONFIG_OF)
1532 .of_match_table = mmc_spi_of_match_table, 1529 .of_match_table = mmc_spi_of_match_table,
1533#endif
1534 }, 1530 },
1535 .probe = mmc_spi_probe, 1531 .probe = mmc_spi_probe,
1536 .remove = __devexit_p(mmc_spi_remove), 1532 .remove = __devexit_p(mmc_spi_remove),
diff --git a/drivers/net/ethoc.c b/drivers/net/ethoc.c
index b79d7e1555d..db0290f05bd 100644
--- a/drivers/net/ethoc.c
+++ b/drivers/net/ethoc.c
@@ -1163,15 +1163,11 @@ static int ethoc_resume(struct platform_device *pdev)
1163# define ethoc_resume NULL 1163# define ethoc_resume NULL
1164#endif 1164#endif
1165 1165
1166#ifdef CONFIG_OF
1167static struct of_device_id ethoc_match[] = { 1166static struct of_device_id ethoc_match[] = {
1168 { 1167 { .compatible = "opencores,ethoc", },
1169 .compatible = "opencores,ethoc",
1170 },
1171 {}, 1168 {},
1172}; 1169};
1173MODULE_DEVICE_TABLE(of, ethoc_match); 1170MODULE_DEVICE_TABLE(of, ethoc_match);
1174#endif
1175 1171
1176static struct platform_driver ethoc_driver = { 1172static struct platform_driver ethoc_driver = {
1177 .probe = ethoc_probe, 1173 .probe = ethoc_probe,
@@ -1181,9 +1177,7 @@ static struct platform_driver ethoc_driver = {
1181 .driver = { 1177 .driver = {
1182 .name = "ethoc", 1178 .name = "ethoc",
1183 .owner = THIS_MODULE, 1179 .owner = THIS_MODULE,
1184#ifdef CONFIG_OF
1185 .of_match_table = ethoc_match, 1180 .of_match_table = ethoc_match,
1186#endif
1187 }, 1181 },
1188}; 1182};
1189 1183
diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
index 3c6e100a3ad..d06a6374ed6 100644
--- a/drivers/of/Kconfig
+++ b/drivers/of/Kconfig
@@ -69,4 +69,10 @@ config OF_MDIO
69 help 69 help
70 OpenFirmware MDIO bus (Ethernet PHY) accessors 70 OpenFirmware MDIO bus (Ethernet PHY) accessors
71 71
72config OF_PCI
73 def_tristate PCI
74 depends on PCI && (PPC || MICROBLAZE || X86)
75 help
76 OpenFirmware PCI bus accessors
77
72endmenu # OF 78endmenu # OF
diff --git a/drivers/of/Makefile b/drivers/of/Makefile
index 3ab21a0a490..f7861ed2f28 100644
--- a/drivers/of/Makefile
+++ b/drivers/of/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_OF_I2C) += of_i2c.o
9obj-$(CONFIG_OF_NET) += of_net.o 9obj-$(CONFIG_OF_NET) += of_net.o
10obj-$(CONFIG_OF_SPI) += of_spi.o 10obj-$(CONFIG_OF_SPI) += of_spi.o
11obj-$(CONFIG_OF_MDIO) += of_mdio.o 11obj-$(CONFIG_OF_MDIO) += of_mdio.o
12obj-$(CONFIG_OF_PCI) += of_pci.o
diff --git a/drivers/of/of_pci.c b/drivers/of/of_pci.c
new file mode 100644
index 00000000000..ac1ec54e4fd
--- /dev/null
+++ b/drivers/of/of_pci.c
@@ -0,0 +1,92 @@
1#include <linux/kernel.h>
2#include <linux/of_pci.h>
3#include <linux/of_irq.h>
4#include <asm/prom.h>
5
6/**
7 * of_irq_map_pci - Resolve the interrupt for a PCI device
8 * @pdev: the device whose interrupt is to be resolved
9 * @out_irq: structure of_irq filled by this function
10 *
11 * This function resolves the PCI interrupt for a given PCI device. If a
12 * device-node exists for a given pci_dev, it will use normal OF tree
13 * walking. If not, it will implement standard swizzling and walk up the
 14 * PCI tree until a device-node is found, at which point it will finish
15 * resolving using the OF tree walking.
16 */
17int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq)
18{
19 struct device_node *dn, *ppnode;
20 struct pci_dev *ppdev;
21 u32 lspec;
22 __be32 lspec_be;
23 __be32 laddr[3];
24 u8 pin;
25 int rc;
26
 27 /* Check if we have a device node; if yes, fall back to standard
28 * device tree parsing
29 */
30 dn = pci_device_to_OF_node(pdev);
31 if (dn) {
32 rc = of_irq_map_one(dn, 0, out_irq);
33 if (!rc)
34 return rc;
35 }
36
37 /* Ok, we don't, time to have fun. Let's start by building up an
 38 * interrupt spec. We assume #interrupt-cells is 1, which is standard
 39 * for PCI. If yours differs, don't use this routine.
40 */
41 rc = pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
42 if (rc != 0)
43 return rc;
44 /* No pin, exit */
45 if (pin == 0)
46 return -ENODEV;
47
48 /* Now we walk up the PCI tree */
49 lspec = pin;
50 for (;;) {
51 /* Get the pci_dev of our parent */
52 ppdev = pdev->bus->self;
53
54 /* Ouch, it's a host bridge... */
55 if (ppdev == NULL) {
56 ppnode = pci_bus_to_OF_node(pdev->bus);
57
 58 /* No node for host bridge? Give up. */
59 if (ppnode == NULL)
60 return -EINVAL;
61 } else {
62 /* We found a P2P bridge, check if it has a node */
63 ppnode = pci_device_to_OF_node(ppdev);
64 }
65
66 /* Ok, we have found a parent with a device-node, hand over to
67 * the OF parsing code.
68 * We build a unit address from the linux device to be used for
69 * resolution. Note that we use the linux bus number which may
70 * not match your firmware bus numbering.
71 * Fortunately, in most cases, interrupt-map-mask doesn't
72 * include the bus number as part of the matching.
73 * You should still be careful about that though if you intend
 74 * to rely on this function (i.e. you ship firmware that doesn't
75 * create device nodes for all PCI devices).
76 */
77 if (ppnode)
78 break;
79
80 /* We can only get here if we hit a P2P bridge with no node,
81 * let's do standard swizzling and try again
82 */
83 lspec = pci_swizzle_interrupt_pin(pdev, lspec);
84 pdev = ppdev;
85 }
86
87 lspec_be = cpu_to_be32(lspec);
88 laddr[0] = cpu_to_be32((pdev->bus->number << 16) | (pdev->devfn << 8));
89 laddr[1] = laddr[2] = cpu_to_be32(0);
90 return of_irq_map_raw(ppnode, &lspec_be, 1, laddr, out_irq);
91}
92EXPORT_SYMBOL_GPL(of_irq_map_pci);
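When a P2P bridge has no device node, the loop above falls back to the standard swizzle, which rotates the interrupt pin by the device's slot number at each bridge. A user-space sketch of what pci_swizzle_interrupt_pin() computes (the slot numbers below are invented):

    #include <stdio.h>

    static unsigned swizzle(unsigned slot, unsigned pin)
    {
        return (((pin - 1) + slot) % 4) + 1;   /* INTA..INTD are 1..4 */
    }

    int main(void)
    {
        unsigned pin = 1;                  /* device asserts INTA */
        unsigned slots[] = { 3, 0, 2 };    /* slots on the path to the root */
        unsigned i;

        for (i = 0; i < sizeof(slots) / sizeof(slots[0]); i++) {
            pin = swizzle(slots[i], pin);
            printf("after bridge %u: INT%c\n", i, 'A' + pin - 1);
        }
        return 0;
    }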
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 3a5a6fcc0ea..492b7d807fe 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -243,7 +243,7 @@ struct pci_ops pcifront_bus_ops = {
243 243
244#ifdef CONFIG_PCI_MSI 244#ifdef CONFIG_PCI_MSI
245static int pci_frontend_enable_msix(struct pci_dev *dev, 245static int pci_frontend_enable_msix(struct pci_dev *dev,
246 int **vector, int nvec) 246 int vector[], int nvec)
247{ 247{
248 int err; 248 int err;
249 int i; 249 int i;
@@ -277,18 +277,24 @@ static int pci_frontend_enable_msix(struct pci_dev *dev,
277 if (likely(!err)) { 277 if (likely(!err)) {
278 if (likely(!op.value)) { 278 if (likely(!op.value)) {
279 /* we get the result */ 279 /* we get the result */
280 for (i = 0; i < nvec; i++) 280 for (i = 0; i < nvec; i++) {
281 *(*vector+i) = op.msix_entries[i].vector; 281 if (op.msix_entries[i].vector <= 0) {
282 return 0; 282 dev_warn(&dev->dev, "MSI-X entry %d is invalid: %d!\n",
283 i, op.msix_entries[i].vector);
284 err = -EINVAL;
285 vector[i] = -1;
286 continue;
287 }
288 vector[i] = op.msix_entries[i].vector;
289 }
283 } else { 290 } else {
284 printk(KERN_DEBUG "enable msix get value %x\n", 291 printk(KERN_DEBUG "enable msix get value %x\n",
285 op.value); 292 op.value);
286 return op.value;
287 } 293 }
288 } else { 294 } else {
289 dev_err(&dev->dev, "enable msix get err %x\n", err); 295 dev_err(&dev->dev, "enable msix get err %x\n", err);
290 return err;
291 } 296 }
297 return err;
292} 298}
293 299
294static void pci_frontend_disable_msix(struct pci_dev *dev) 300static void pci_frontend_disable_msix(struct pci_dev *dev)
@@ -310,7 +316,7 @@ static void pci_frontend_disable_msix(struct pci_dev *dev)
310 dev_err(&dev->dev, "pci_disable_msix get err %x\n", err); 316 dev_err(&dev->dev, "pci_disable_msix get err %x\n", err);
311} 317}
312 318
313static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector) 319static int pci_frontend_enable_msi(struct pci_dev *dev, int vector[])
314{ 320{
315 int err; 321 int err;
316 struct xen_pci_op op = { 322 struct xen_pci_op op = {
@@ -324,7 +330,13 @@ static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector)
324 330
325 err = do_pci_op(pdev, &op); 331 err = do_pci_op(pdev, &op);
326 if (likely(!err)) { 332 if (likely(!err)) {
327 *(*vector) = op.value; 333 vector[0] = op.value;
334 if (op.value <= 0) {
335 dev_warn(&dev->dev, "MSI entry is invalid: %d!\n",
336 op.value);
337 err = -EINVAL;
338 vector[0] = -1;
339 }
328 } else { 340 } else {
329 dev_err(&dev->dev, "pci frontend enable msi failed for dev " 341 dev_err(&dev->dev, "pci frontend enable msi failed for dev "
330 "%x:%x\n", op.bus, op.devfn); 342 "%x:%x\n", op.bus, op.devfn);
@@ -733,8 +745,7 @@ static void free_pdev(struct pcifront_device *pdev)
733 745
734 pcifront_free_roots(pdev); 746 pcifront_free_roots(pdev);
735 747
736 /*For PCIE_AER error handling job*/ 748 cancel_work_sync(&pdev->op_work);
737 flush_scheduled_work();
738 749
739 if (pdev->irq >= 0) 750 if (pdev->irq >= 0)
740 unbind_from_irqhandler(pdev->irq, pdev); 751 unbind_from_irqhandler(pdev->irq, pdev);
diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c
index c404b61386b..09b4437b3e6 100644
--- a/drivers/rtc/class.c
+++ b/drivers/rtc/class.c
@@ -117,6 +117,7 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev,
117 struct module *owner) 117 struct module *owner)
118{ 118{
119 struct rtc_device *rtc; 119 struct rtc_device *rtc;
120 struct rtc_wkalrm alrm;
120 int id, err; 121 int id, err;
121 122
122 if (idr_pre_get(&rtc_idr, GFP_KERNEL) == 0) { 123 if (idr_pre_get(&rtc_idr, GFP_KERNEL) == 0) {
@@ -166,6 +167,12 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev,
166 rtc->pie_timer.function = rtc_pie_update_irq; 167 rtc->pie_timer.function = rtc_pie_update_irq;
167 rtc->pie_enabled = 0; 168 rtc->pie_enabled = 0;
168 169
170 /* Check to see if there is an ALARM already set in hw */
171 err = __rtc_read_alarm(rtc, &alrm);
172
173 if (!err && !rtc_valid_tm(&alrm.time))
174 rtc_set_alarm(rtc, &alrm);
175
169 strlcpy(rtc->name, name, RTC_DEVICE_NAME_SIZE); 176 strlcpy(rtc->name, name, RTC_DEVICE_NAME_SIZE);
170 dev_set_name(&rtc->dev, "rtc%d", id); 177 dev_set_name(&rtc->dev, "rtc%d", id);
171 178
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index cb2f0728fd7..8ec6b069a7f 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -116,6 +116,186 @@ int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs)
116} 116}
117EXPORT_SYMBOL_GPL(rtc_set_mmss); 117EXPORT_SYMBOL_GPL(rtc_set_mmss);
118 118
119static int rtc_read_alarm_internal(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
120{
121 int err;
122
123 err = mutex_lock_interruptible(&rtc->ops_lock);
124 if (err)
125 return err;
126
127 if (rtc->ops == NULL)
128 err = -ENODEV;
129 else if (!rtc->ops->read_alarm)
130 err = -EINVAL;
131 else {
132 memset(alarm, 0, sizeof(struct rtc_wkalrm));
133 err = rtc->ops->read_alarm(rtc->dev.parent, alarm);
134 }
135
136 mutex_unlock(&rtc->ops_lock);
137 return err;
138}
139
140int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
141{
142 int err;
143 struct rtc_time before, now;
144 int first_time = 1;
145 unsigned long t_now, t_alm;
146 enum { none, day, month, year } missing = none;
147 unsigned days;
148
149 /* The lower level RTC driver may return -1 in some fields,
150 * creating invalid alarm->time values, for reasons like:
151 *
152 * - The hardware may not be capable of filling them in;
153 * many alarms match only on time-of-day fields, not
154 * day/month/year calendar data.
155 *
156 * - Some hardware uses illegal values as "wildcard" match
157 * values, which non-Linux firmware (like a BIOS) may try
158 * to set up as e.g. "alarm 15 minutes after each hour".
159 * Linux uses only oneshot alarms.
160 *
161 * When we see that here, we deal with it by using values from
162 * a current RTC timestamp for any missing (-1) values. The
163 * RTC driver prevents "periodic alarm" modes.
164 *
 165 * But this can be racy, because some fields of the RTC timestamp
166 * may have wrapped in the interval since we read the RTC alarm,
167 * which would lead to us inserting inconsistent values in place
168 * of the -1 fields.
169 *
170 * Reading the alarm and timestamp in the reverse sequence
171 * would have the same race condition, and not solve the issue.
172 *
173 * So, we must first read the RTC timestamp,
174 * then read the RTC alarm value,
175 * and then read a second RTC timestamp.
176 *
177 * If any fields of the second timestamp have changed
178 * when compared with the first timestamp, then we know
179 * our timestamp may be inconsistent with that used by
180 * the low-level rtc_read_alarm_internal() function.
181 *
182 * So, when the two timestamps disagree, we just loop and do
183 * the process again to get a fully consistent set of values.
184 *
185 * This could all instead be done in the lower level driver,
186 * but since more than one lower level RTC implementation needs it,
187 * then it's probably best best to do it here instead of there..
188 */
189
190 /* Get the "before" timestamp */
191 err = rtc_read_time(rtc, &before);
192 if (err < 0)
193 return err;
194 do {
195 if (!first_time)
196 memcpy(&before, &now, sizeof(struct rtc_time));
197 first_time = 0;
198
199 /* get the RTC alarm values, which may be incomplete */
200 err = rtc_read_alarm_internal(rtc, alarm);
201 if (err)
202 return err;
203
204 /* full-function RTCs won't have such missing fields */
205 if (rtc_valid_tm(&alarm->time) == 0)
206 return 0;
207
208 /* get the "after" timestamp, to detect wrapped fields */
209 err = rtc_read_time(rtc, &now);
210 if (err < 0)
211 return err;
212
213 /* note that tm_sec is a "don't care" value here: */
 214 } while (before.tm_min != now.tm_min
215 || before.tm_hour != now.tm_hour
216 || before.tm_mon != now.tm_mon
217 || before.tm_year != now.tm_year);
218
219 /* Fill in the missing alarm fields using the timestamp; we
220 * know there's at least one since alarm->time is invalid.
221 */
222 if (alarm->time.tm_sec == -1)
223 alarm->time.tm_sec = now.tm_sec;
224 if (alarm->time.tm_min == -1)
225 alarm->time.tm_min = now.tm_min;
226 if (alarm->time.tm_hour == -1)
227 alarm->time.tm_hour = now.tm_hour;
228
229 /* For simplicity, only support date rollover for now */
230 if (alarm->time.tm_mday == -1) {
231 alarm->time.tm_mday = now.tm_mday;
232 missing = day;
233 }
234 if (alarm->time.tm_mon == -1) {
235 alarm->time.tm_mon = now.tm_mon;
236 if (missing == none)
237 missing = month;
238 }
239 if (alarm->time.tm_year == -1) {
240 alarm->time.tm_year = now.tm_year;
241 if (missing == none)
242 missing = year;
243 }
244
245 /* with luck, no rollover is needed */
246 rtc_tm_to_time(&now, &t_now);
247 rtc_tm_to_time(&alarm->time, &t_alm);
248 if (t_now < t_alm)
249 goto done;
250
251 switch (missing) {
252
 253 /* 24 hour rollover ... if it's now 10am Monday, an alarm
 254 * that will trigger at 5am will do so at 5am Tuesday, which
255 * could also be in the next month or year. This is a common
256 * case, especially for PCs.
257 */
258 case day:
259 dev_dbg(&rtc->dev, "alarm rollover: %s\n", "day");
260 t_alm += 24 * 60 * 60;
261 rtc_time_to_tm(t_alm, &alarm->time);
262 break;
263
 264 /* Month rollover ... if it's the 31st, an alarm on the 3rd will
265 * be next month. An alarm matching on the 30th, 29th, or 28th
266 * may end up in the month after that! Many newer PCs support
267 * this type of alarm.
268 */
269 case month:
270 dev_dbg(&rtc->dev, "alarm rollover: %s\n", "month");
271 do {
272 if (alarm->time.tm_mon < 11)
273 alarm->time.tm_mon++;
274 else {
275 alarm->time.tm_mon = 0;
276 alarm->time.tm_year++;
277 }
278 days = rtc_month_days(alarm->time.tm_mon,
279 alarm->time.tm_year);
280 } while (days < alarm->time.tm_mday);
281 break;
282
283 /* Year rollover ... easy except for leap years! */
284 case year:
285 dev_dbg(&rtc->dev, "alarm rollover: %s\n", "year");
286 do {
287 alarm->time.tm_year++;
288 } while (rtc_valid_tm(&alarm->time) != 0);
289 break;
290
291 default:
292 dev_warn(&rtc->dev, "alarm rollover not handled\n");
293 }
294
295done:
296 return 0;
297}
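The month-rollover loop above has to skip months too short for the alarm's day-of-month. A stand-alone run of the same loop, modelling rtc_month_days() with a plain table (non-leap year assumed): starting from January with tm_mday 31, it settles on March.

    #include <stdio.h>

    static const unsigned days_in_month[12] =
        { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 };

    int main(void)
    {
        int mon = 0, year = 111, mday = 31;  /* Jan, years-since-1900, day 31 */
        unsigned days;

        do {
            if (mon < 11)
                mon++;
            else {
                mon = 0;
                year++;
            }
            days = days_in_month[mon];
        } while (days < (unsigned)mday);

        printf("alarm moves to month %d (0-based), year %d\n", mon, year);
        return 0;
    }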
298
119int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) 299int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
120{ 300{
121 int err; 301 int err;
diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c
index 26d1cf5d19a..518a76ec71c 100644
--- a/drivers/rtc/rtc-at91rm9200.c
+++ b/drivers/rtc/rtc-at91rm9200.c
@@ -183,33 +183,6 @@ static int at91_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
183 return 0; 183 return 0;
184} 184}
185 185
186/*
187 * Handle commands from user-space
188 */
189static int at91_rtc_ioctl(struct device *dev, unsigned int cmd,
190 unsigned long arg)
191{
192 int ret = 0;
193
194 pr_debug("%s(): cmd=%08x, arg=%08lx.\n", __func__, cmd, arg);
195
196 /* important: scrub old status before enabling IRQs */
197 switch (cmd) {
198 case RTC_UIE_OFF: /* update off */
199 at91_sys_write(AT91_RTC_IDR, AT91_RTC_SECEV);
200 break;
201 case RTC_UIE_ON: /* update on */
202 at91_sys_write(AT91_RTC_SCCR, AT91_RTC_SECEV);
203 at91_sys_write(AT91_RTC_IER, AT91_RTC_SECEV);
204 break;
205 default:
206 ret = -ENOIOCTLCMD;
207 break;
208 }
209
210 return ret;
211}
212
213static int at91_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) 186static int at91_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
214{ 187{
215 pr_debug("%s(): cmd=%08x\n", __func__, enabled); 188 pr_debug("%s(): cmd=%08x\n", __func__, enabled);
@@ -269,7 +242,6 @@ static irqreturn_t at91_rtc_interrupt(int irq, void *dev_id)
269} 242}
270 243
271static const struct rtc_class_ops at91_rtc_ops = { 244static const struct rtc_class_ops at91_rtc_ops = {
272 .ioctl = at91_rtc_ioctl,
273 .read_time = at91_rtc_readtime, 245 .read_time = at91_rtc_readtime,
274 .set_time = at91_rtc_settime, 246 .set_time = at91_rtc_settime,
275 .read_alarm = at91_rtc_readalarm, 247 .read_alarm = at91_rtc_readalarm,
diff --git a/drivers/rtc/rtc-at91sam9.c b/drivers/rtc/rtc-at91sam9.c
index 5469c52cba3..a3ad957507d 100644
--- a/drivers/rtc/rtc-at91sam9.c
+++ b/drivers/rtc/rtc-at91sam9.c
@@ -216,33 +216,6 @@ static int at91_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
216 return 0; 216 return 0;
217} 217}
218 218
219/*
220 * Handle commands from user-space
221 */
222static int at91_rtc_ioctl(struct device *dev, unsigned int cmd,
223 unsigned long arg)
224{
225 struct sam9_rtc *rtc = dev_get_drvdata(dev);
226 int ret = 0;
227 u32 mr = rtt_readl(rtc, MR);
228
229 dev_dbg(dev, "ioctl: cmd=%08x, arg=%08lx, mr %08x\n", cmd, arg, mr);
230
231 switch (cmd) {
232 case RTC_UIE_OFF: /* update off */
233 rtt_writel(rtc, MR, mr & ~AT91_RTT_RTTINCIEN);
234 break;
235 case RTC_UIE_ON: /* update on */
236 rtt_writel(rtc, MR, mr | AT91_RTT_RTTINCIEN);
237 break;
238 default:
239 ret = -ENOIOCTLCMD;
240 break;
241 }
242
243 return ret;
244}
245
246static int at91_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) 219static int at91_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
247{ 220{
248 struct sam9_rtc *rtc = dev_get_drvdata(dev); 221 struct sam9_rtc *rtc = dev_get_drvdata(dev);
@@ -303,7 +276,6 @@ static irqreturn_t at91_rtc_interrupt(int irq, void *_rtc)
303} 276}
304 277
305static const struct rtc_class_ops at91_rtc_ops = { 278static const struct rtc_class_ops at91_rtc_ops = {
306 .ioctl = at91_rtc_ioctl,
307 .read_time = at91_rtc_readtime, 279 .read_time = at91_rtc_readtime,
308 .set_time = at91_rtc_settime, 280 .set_time = at91_rtc_settime,
309 .read_alarm = at91_rtc_readalarm, 281 .read_alarm = at91_rtc_readalarm,
diff --git a/drivers/rtc/rtc-bfin.c b/drivers/rtc/rtc-bfin.c
index 17971d93354..ca9cff85ab8 100644
--- a/drivers/rtc/rtc-bfin.c
+++ b/drivers/rtc/rtc-bfin.c
@@ -240,32 +240,6 @@ static void bfin_rtc_int_set_alarm(struct bfin_rtc *rtc)
240 */ 240 */
241 bfin_rtc_int_set(rtc->rtc_alarm.tm_yday == -1 ? RTC_ISTAT_ALARM : RTC_ISTAT_ALARM_DAY); 241 bfin_rtc_int_set(rtc->rtc_alarm.tm_yday == -1 ? RTC_ISTAT_ALARM : RTC_ISTAT_ALARM_DAY);
242} 242}
243static int bfin_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
244{
245 struct bfin_rtc *rtc = dev_get_drvdata(dev);
246 int ret = 0;
247
248 dev_dbg_stamp(dev);
249
250 bfin_rtc_sync_pending(dev);
251
252 switch (cmd) {
253 case RTC_UIE_ON:
254 dev_dbg_stamp(dev);
255 bfin_rtc_int_set(RTC_ISTAT_SEC);
256 break;
257 case RTC_UIE_OFF:
258 dev_dbg_stamp(dev);
259 bfin_rtc_int_clear(~RTC_ISTAT_SEC);
260 break;
261
262 default:
263 dev_dbg_stamp(dev);
264 ret = -ENOIOCTLCMD;
265 }
266
267 return ret;
268}
269 243
270static int bfin_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) 244static int bfin_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
271{ 245{
@@ -358,7 +332,6 @@ static int bfin_rtc_proc(struct device *dev, struct seq_file *seq)
358} 332}
359 333
360static struct rtc_class_ops bfin_rtc_ops = { 334static struct rtc_class_ops bfin_rtc_ops = {
361 .ioctl = bfin_rtc_ioctl,
362 .read_time = bfin_rtc_read_time, 335 .read_time = bfin_rtc_read_time,
363 .set_time = bfin_rtc_set_time, 336 .set_time = bfin_rtc_set_time,
364 .read_alarm = bfin_rtc_read_alarm, 337 .read_alarm = bfin_rtc_read_alarm,
diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index c7ff8df347e..911e75cdc12 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -37,6 +37,8 @@
 #include <linux/mod_devicetable.h>
 #include <linux/log2.h>
 #include <linux/pm.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
 
 /* this is for "generic access to PC-style RTC" using CMOS_READ/CMOS_WRITE */
 #include <asm-generic/rtc.h>
@@ -375,50 +377,6 @@ static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t)
 	return 0;
 }
 
-static int cmos_irq_set_freq(struct device *dev, int freq)
-{
-	struct cmos_rtc *cmos = dev_get_drvdata(dev);
-	int f;
-	unsigned long flags;
-
-	if (!is_valid_irq(cmos->irq))
-		return -ENXIO;
-
-	if (!is_power_of_2(freq))
-		return -EINVAL;
-	/* 0 = no irqs; 1 = 2^15 Hz ... 15 = 2^0 Hz */
-	f = ffs(freq);
-	if (f-- > 16)
-		return -EINVAL;
-	f = 16 - f;
-
-	spin_lock_irqsave(&rtc_lock, flags);
-	hpet_set_periodic_freq(freq);
-	CMOS_WRITE(RTC_REF_CLCK_32KHZ | f, RTC_FREQ_SELECT);
-	spin_unlock_irqrestore(&rtc_lock, flags);
-
-	return 0;
-}
-
-static int cmos_irq_set_state(struct device *dev, int enabled)
-{
-	struct cmos_rtc *cmos = dev_get_drvdata(dev);
-	unsigned long flags;
-
-	if (!is_valid_irq(cmos->irq))
-		return -ENXIO;
-
-	spin_lock_irqsave(&rtc_lock, flags);
-
-	if (enabled)
-		cmos_irq_enable(cmos, RTC_PIE);
-	else
-		cmos_irq_disable(cmos, RTC_PIE);
-
-	spin_unlock_irqrestore(&rtc_lock, flags);
-	return 0;
-}
-
 static int cmos_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	struct cmos_rtc *cmos = dev_get_drvdata(dev);
@@ -438,25 +396,6 @@ static int cmos_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int cmos_update_irq_enable(struct device *dev, unsigned int enabled)
-{
-	struct cmos_rtc *cmos = dev_get_drvdata(dev);
-	unsigned long flags;
-
-	if (!is_valid_irq(cmos->irq))
-		return -EINVAL;
-
-	spin_lock_irqsave(&rtc_lock, flags);
-
-	if (enabled)
-		cmos_irq_enable(cmos, RTC_UIE);
-	else
-		cmos_irq_disable(cmos, RTC_UIE);
-
-	spin_unlock_irqrestore(&rtc_lock, flags);
-	return 0;
-}
-
 #if defined(CONFIG_RTC_INTF_PROC) || defined(CONFIG_RTC_INTF_PROC_MODULE)
 
 static int cmos_procfs(struct device *dev, struct seq_file *seq)
@@ -501,10 +440,7 @@ static const struct rtc_class_ops cmos_rtc_ops = {
 	.read_alarm = cmos_read_alarm,
 	.set_alarm = cmos_set_alarm,
 	.proc = cmos_procfs,
-	.irq_set_freq = cmos_irq_set_freq,
-	.irq_set_state = cmos_irq_set_state,
 	.alarm_irq_enable = cmos_alarm_irq_enable,
-	.update_irq_enable = cmos_update_irq_enable,
 };
 
 /*----------------------------------------------------------------*/
@@ -1123,6 +1059,47 @@ static struct pnp_driver cmos_pnp_driver = {
 
 #endif /* CONFIG_PNP */
 
+#ifdef CONFIG_OF
+static const struct of_device_id of_cmos_match[] = {
+	{
+		.compatible = "motorola,mc146818",
+	},
+	{ },
+};
+MODULE_DEVICE_TABLE(of, of_cmos_match);
+
+static __init void cmos_of_init(struct platform_device *pdev)
+{
+	struct device_node *node = pdev->dev.of_node;
+	struct rtc_time time;
+	int ret;
+	const __be32 *val;
+
+	if (!node)
+		return;
+
+	val = of_get_property(node, "ctrl-reg", NULL);
+	if (val)
+		CMOS_WRITE(be32_to_cpup(val), RTC_CONTROL);
+
+	val = of_get_property(node, "freq-reg", NULL);
+	if (val)
+		CMOS_WRITE(be32_to_cpup(val), RTC_FREQ_SELECT);
+
+	get_rtc_time(&time);
+	ret = rtc_valid_tm(&time);
+	if (ret) {
+		struct rtc_time def_time = {
+			.tm_year = 1,
+			.tm_mday = 1,
+		};
+		set_rtc_time(&def_time);
+	}
+}
+#else
+static inline void cmos_of_init(struct platform_device *pdev) {}
+#define of_cmos_match NULL
+#endif
 /*----------------------------------------------------------------*/
 
 /* Platform setup should have set up an RTC device, when PNP is
@@ -1131,6 +1108,7 @@ static struct pnp_driver cmos_pnp_driver = {
 
 static int __init cmos_platform_probe(struct platform_device *pdev)
 {
+	cmos_of_init(pdev);
 	cmos_wake_setup(&pdev->dev);
 	return cmos_do_probe(&pdev->dev,
 			platform_get_resource(pdev, IORESOURCE_IO, 0),
@@ -1162,6 +1140,7 @@ static struct pnp_driver cmos_pnp_driver = {
 #ifdef CONFIG_PM
 		.pm = &cmos_pm_ops,
 #endif
+		.of_match_table = of_cmos_match,
 	}
 };
 
diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c
index 34647fc1ee9..8d46838dff8 100644
--- a/drivers/rtc/rtc-davinci.c
+++ b/drivers/rtc/rtc-davinci.c
@@ -231,10 +231,6 @@ davinci_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
 	case RTC_WIE_OFF:
 		rtc_ctrl &= ~PRTCSS_RTC_CTRL_WEN;
 		break;
-	case RTC_UIE_OFF:
-	case RTC_UIE_ON:
-		ret = -ENOTTY;
-		break;
 	default:
 		ret = -ENOIOCTLCMD;
 	}
@@ -473,55 +469,6 @@ static int davinci_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
 	return 0;
 }
 
-static int davinci_rtc_irq_set_state(struct device *dev, int enabled)
-{
-	struct davinci_rtc *davinci_rtc = dev_get_drvdata(dev);
-	unsigned long flags;
-	u8 rtc_ctrl;
-
-	spin_lock_irqsave(&davinci_rtc_lock, flags);
-
-	rtc_ctrl = rtcss_read(davinci_rtc, PRTCSS_RTC_CTRL);
-
-	if (enabled) {
-		while (rtcss_read(davinci_rtc, PRTCSS_RTC_CTRL)
-		       & PRTCSS_RTC_CTRL_WDTBUS)
-			cpu_relax();
-
-		rtc_ctrl |= PRTCSS_RTC_CTRL_TE;
-		rtcss_write(davinci_rtc, rtc_ctrl, PRTCSS_RTC_CTRL);
-
-		rtcss_write(davinci_rtc, 0x0, PRTCSS_RTC_CLKC_CNT);
-
-		rtc_ctrl |= PRTCSS_RTC_CTRL_TIEN |
-			    PRTCSS_RTC_CTRL_TMMD |
-			    PRTCSS_RTC_CTRL_TMRFLG;
-	} else
-		rtc_ctrl &= ~PRTCSS_RTC_CTRL_TIEN;
-
-	rtcss_write(davinci_rtc, rtc_ctrl, PRTCSS_RTC_CTRL);
-
-	spin_unlock_irqrestore(&davinci_rtc_lock, flags);
-
-	return 0;
-}
-
-static int davinci_rtc_irq_set_freq(struct device *dev, int freq)
-{
-	struct davinci_rtc *davinci_rtc = dev_get_drvdata(dev);
-	unsigned long flags;
-	u16 tmr_counter = (0x8000 >> (ffs(freq) - 1));
-
-	spin_lock_irqsave(&davinci_rtc_lock, flags);
-
-	rtcss_write(davinci_rtc, tmr_counter & 0xFF, PRTCSS_RTC_TMR0);
-	rtcss_write(davinci_rtc, (tmr_counter & 0xFF00) >> 8, PRTCSS_RTC_TMR1);
-
-	spin_unlock_irqrestore(&davinci_rtc_lock, flags);
-
-	return 0;
-}
-
 static struct rtc_class_ops davinci_rtc_ops = {
 	.ioctl = davinci_rtc_ioctl,
 	.read_time = davinci_rtc_read_time,
@@ -529,8 +476,6 @@ static struct rtc_class_ops davinci_rtc_ops = {
 	.alarm_irq_enable = davinci_rtc_alarm_irq_enable,
 	.read_alarm = davinci_rtc_read_alarm,
 	.set_alarm = davinci_rtc_set_alarm,
-	.irq_set_state = davinci_rtc_irq_set_state,
-	.irq_set_freq = davinci_rtc_irq_set_freq,
 };
 
 static int __init davinci_rtc_probe(struct platform_device *pdev)
diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c
index 37268e97de4..3fffd708711 100644
--- a/drivers/rtc/rtc-ds1511.c
+++ b/drivers/rtc/rtc-ds1511.c
@@ -397,29 +397,12 @@ static int ds1511_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int ds1511_rtc_update_irq_enable(struct device *dev,
-	unsigned int enabled)
-{
-	struct platform_device *pdev = to_platform_device(dev);
-	struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
-
-	if (pdata->irq <= 0)
-		return -EINVAL;
-	if (enabled)
-		pdata->irqen |= RTC_UF;
-	else
-		pdata->irqen &= ~RTC_UF;
-	ds1511_rtc_update_alarm(pdata);
-	return 0;
-}
-
 static const struct rtc_class_ops ds1511_rtc_ops = {
 	.read_time = ds1511_rtc_read_time,
 	.set_time = ds1511_rtc_set_time,
 	.read_alarm = ds1511_rtc_read_alarm,
 	.set_alarm = ds1511_rtc_set_alarm,
 	.alarm_irq_enable = ds1511_rtc_alarm_irq_enable,
-	.update_irq_enable = ds1511_rtc_update_irq_enable,
 };
 
  static ssize_t
diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c
index ff432e2ca27..fee41b97c9e 100644
--- a/drivers/rtc/rtc-ds1553.c
+++ b/drivers/rtc/rtc-ds1553.c
@@ -227,29 +227,12 @@ static int ds1553_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int ds1553_rtc_update_irq_enable(struct device *dev,
-	unsigned int enabled)
-{
-	struct platform_device *pdev = to_platform_device(dev);
-	struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
-
-	if (pdata->irq <= 0)
-		return -EINVAL;
-	if (enabled)
-		pdata->irqen |= RTC_UF;
-	else
-		pdata->irqen &= ~RTC_UF;
-	ds1553_rtc_update_alarm(pdata);
-	return 0;
-}
-
 static const struct rtc_class_ops ds1553_rtc_ops = {
 	.read_time = ds1553_rtc_read_time,
 	.set_time = ds1553_rtc_set_time,
 	.read_alarm = ds1553_rtc_read_alarm,
 	.set_alarm = ds1553_rtc_set_alarm,
 	.alarm_irq_enable = ds1553_rtc_alarm_irq_enable,
-	.update_irq_enable = ds1553_rtc_update_irq_enable,
 };
 
 static ssize_t ds1553_nvram_read(struct file *filp, struct kobject *kobj,
diff --git a/drivers/rtc/rtc-ds3232.c b/drivers/rtc/rtc-ds3232.c
index 950735415a7..27b7bf672ac 100644
--- a/drivers/rtc/rtc-ds3232.c
+++ b/drivers/rtc/rtc-ds3232.c
@@ -339,23 +339,6 @@ static int ds3232_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int ds3232_update_irq_enable(struct device *dev, unsigned int enabled)
-{
-	struct i2c_client *client = to_i2c_client(dev);
-	struct ds3232 *ds3232 = i2c_get_clientdata(client);
-
-	if (client->irq <= 0)
-		return -EINVAL;
-
-	if (enabled)
-		ds3232->rtc->irq_data |= RTC_UF;
-	else
-		ds3232->rtc->irq_data &= ~RTC_UF;
-
-	ds3232_update_alarm(client);
-	return 0;
-}
-
 static irqreturn_t ds3232_irq(int irq, void *dev_id)
 {
 	struct i2c_client *client = dev_id;
@@ -406,7 +389,6 @@ static const struct rtc_class_ops ds3232_rtc_ops = {
 	.read_alarm = ds3232_read_alarm,
 	.set_alarm = ds3232_set_alarm,
 	.alarm_irq_enable = ds3232_alarm_irq_enable,
-	.update_irq_enable = ds3232_update_irq_enable,
 };
 
 static int __devinit ds3232_probe(struct i2c_client *client,
diff --git a/drivers/rtc/rtc-jz4740.c b/drivers/rtc/rtc-jz4740.c
index 2e16f72c905..b6473631d18 100644
--- a/drivers/rtc/rtc-jz4740.c
+++ b/drivers/rtc/rtc-jz4740.c
@@ -168,12 +168,6 @@ static int jz4740_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 	return ret;
 }
 
-static int jz4740_rtc_update_irq_enable(struct device *dev, unsigned int enable)
-{
-	struct jz4740_rtc *rtc = dev_get_drvdata(dev);
-	return jz4740_rtc_ctrl_set_bits(rtc, JZ_RTC_CTRL_1HZ_IRQ, enable);
-}
-
 static int jz4740_rtc_alarm_irq_enable(struct device *dev, unsigned int enable)
 {
 	struct jz4740_rtc *rtc = dev_get_drvdata(dev);
@@ -185,7 +179,6 @@ static struct rtc_class_ops jz4740_rtc_ops = {
 	.set_mmss = jz4740_rtc_set_mmss,
 	.read_alarm = jz4740_rtc_read_alarm,
 	.set_alarm = jz4740_rtc_set_alarm,
-	.update_irq_enable = jz4740_rtc_update_irq_enable,
 	.alarm_irq_enable = jz4740_rtc_alarm_irq_enable,
 };
 
diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c
index 5314b153bfb..c4200646955 100644
--- a/drivers/rtc/rtc-mc13xxx.c
+++ b/drivers/rtc/rtc-mc13xxx.c
@@ -282,12 +282,6 @@ static irqreturn_t mc13xxx_rtc_update_handler(int irq, void *dev)
 	return IRQ_HANDLED;
 }
 
-static int mc13xxx_rtc_update_irq_enable(struct device *dev,
-		unsigned int enabled)
-{
-	return mc13xxx_rtc_irq_enable(dev, enabled, MC13XXX_IRQ_1HZ);
-}
-
 static int mc13xxx_rtc_alarm_irq_enable(struct device *dev,
 		unsigned int enabled)
 {
@@ -300,7 +294,6 @@ static const struct rtc_class_ops mc13xxx_rtc_ops = {
 	.read_alarm = mc13xxx_rtc_read_alarm,
 	.set_alarm = mc13xxx_rtc_set_alarm,
 	.alarm_irq_enable = mc13xxx_rtc_alarm_irq_enable,
-	.update_irq_enable = mc13xxx_rtc_update_irq_enable,
 };
 
 static irqreturn_t mc13xxx_rtc_reset_handler(int irq, void *dev)
diff --git a/drivers/rtc/rtc-mpc5121.c b/drivers/rtc/rtc-mpc5121.c
index dfcdf0901d2..b40c1ff1ebc 100644
--- a/drivers/rtc/rtc-mpc5121.c
+++ b/drivers/rtc/rtc-mpc5121.c
@@ -240,32 +240,12 @@ static int mpc5121_rtc_alarm_irq_enable(struct device *dev,
 	return 0;
 }
 
-static int mpc5121_rtc_update_irq_enable(struct device *dev,
-		unsigned int enabled)
-{
-	struct mpc5121_rtc_data *rtc = dev_get_drvdata(dev);
-	struct mpc5121_rtc_regs __iomem *regs = rtc->regs;
-	int val;
-
-	val = in_8(&regs->int_enable);
-
-	if (enabled)
-		val = (val & ~0x8) | 0x1;
-	else
-		val &= ~0x1;
-
-	out_8(&regs->int_enable, val);
-
-	return 0;
-}
-
 static const struct rtc_class_ops mpc5121_rtc_ops = {
 	.read_time = mpc5121_rtc_read_time,
 	.set_time = mpc5121_rtc_set_time,
 	.read_alarm = mpc5121_rtc_read_alarm,
 	.set_alarm = mpc5121_rtc_set_alarm,
 	.alarm_irq_enable = mpc5121_rtc_alarm_irq_enable,
-	.update_irq_enable = mpc5121_rtc_update_irq_enable,
 };
 
 static int __devinit mpc5121_rtc_probe(struct platform_device *op,
diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c
index 1db62db8469..b86bc328463 100644
--- a/drivers/rtc/rtc-mrst.c
+++ b/drivers/rtc/rtc-mrst.c
@@ -62,6 +62,17 @@ static inline int is_intr(u8 rtc_intr)
 	return rtc_intr & RTC_IRQMASK;
 }
 
+static inline unsigned char vrtc_is_updating(void)
+{
+	unsigned char uip;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rtc_lock, flags);
+	uip = (vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP);
+	spin_unlock_irqrestore(&rtc_lock, flags);
+	return uip;
+}
+
 /*
  * rtc_time's year contains the increment over 1900, but vRTC's YEAR
  * register can't be programmed to value larger than 0x64, so vRTC
@@ -76,7 +87,7 @@ static int mrst_read_time(struct device *dev, struct rtc_time *time)
 {
 	unsigned long flags;
 
-	if (rtc_is_updating())
+	if (vrtc_is_updating())
 		mdelay(20);
 
 	spin_lock_irqsave(&rtc_lock, flags);
@@ -236,25 +247,6 @@ static int mrst_set_alarm(struct device *dev, struct rtc_wkalrm *t)
 	return 0;
 }
 
-static int mrst_irq_set_state(struct device *dev, int enabled)
-{
-	struct mrst_rtc *mrst = dev_get_drvdata(dev);
-	unsigned long flags;
-
-	if (!mrst->irq)
-		return -ENXIO;
-
-	spin_lock_irqsave(&rtc_lock, flags);
-
-	if (enabled)
-		mrst_irq_enable(mrst, RTC_PIE);
-	else
-		mrst_irq_disable(mrst, RTC_PIE);
-
-	spin_unlock_irqrestore(&rtc_lock, flags);
-	return 0;
-}
-
 /* Currently, the vRTC doesn't support UIE ON/OFF */
 static int mrst_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
@@ -301,7 +293,6 @@ static const struct rtc_class_ops mrst_rtc_ops = {
 	.read_alarm = mrst_read_alarm,
 	.set_alarm = mrst_set_alarm,
 	.proc = mrst_procfs,
-	.irq_set_state = mrst_irq_set_state,
 	.alarm_irq_enable = mrst_rtc_alarm_irq_enable,
 };
 
diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c
index 0b06c1e03fd..826ab64a8fa 100644
--- a/drivers/rtc/rtc-mxc.c
+++ b/drivers/rtc/rtc-mxc.c
@@ -274,12 +274,6 @@ static int mxc_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int mxc_rtc_update_irq_enable(struct device *dev, unsigned int enabled)
-{
-	mxc_rtc_irq_enable(dev, RTC_1HZ_BIT, enabled);
-	return 0;
-}
-
 /*
  * This function reads the current RTC time into tm in Gregorian date.
  */
@@ -368,7 +362,6 @@ static struct rtc_class_ops mxc_rtc_ops = {
 	.read_alarm = mxc_rtc_read_alarm,
 	.set_alarm = mxc_rtc_set_alarm,
 	.alarm_irq_enable = mxc_rtc_alarm_irq_enable,
-	.update_irq_enable = mxc_rtc_update_irq_enable,
 };
 
 static int __init mxc_rtc_probe(struct platform_device *pdev)
diff --git a/drivers/rtc/rtc-nuc900.c b/drivers/rtc/rtc-nuc900.c
index ddb0857e15a..781068d62f2 100644
--- a/drivers/rtc/rtc-nuc900.c
+++ b/drivers/rtc/rtc-nuc900.c
@@ -134,20 +134,6 @@ static void nuc900_rtc_bin2bcd(struct device *dev, struct rtc_time *settm,
 	gettm->bcd_hour = bin2bcd(settm->tm_hour) << 16;
 }
 
-static int nuc900_update_irq_enable(struct device *dev, unsigned int enabled)
-{
-	struct nuc900_rtc *rtc = dev_get_drvdata(dev);
-
-	if (enabled)
-		__raw_writel(__raw_readl(rtc->rtc_reg + REG_RTC_RIER)|
-				(TICKINTENB), rtc->rtc_reg + REG_RTC_RIER);
-	else
-		__raw_writel(__raw_readl(rtc->rtc_reg + REG_RTC_RIER)&
-				(~TICKINTENB), rtc->rtc_reg + REG_RTC_RIER);
-
-	return 0;
-}
-
 static int nuc900_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	struct nuc900_rtc *rtc = dev_get_drvdata(dev);
@@ -234,7 +220,6 @@ static struct rtc_class_ops nuc900_rtc_ops = {
 	.read_alarm = nuc900_rtc_read_alarm,
 	.set_alarm = nuc900_rtc_set_alarm,
 	.alarm_irq_enable = nuc900_alarm_irq_enable,
-	.update_irq_enable = nuc900_update_irq_enable,
 };
 
 static int __devinit nuc900_rtc_probe(struct platform_device *pdev)
diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c
index b4dbf3a319b..de0dd7b1f14 100644
--- a/drivers/rtc/rtc-omap.c
+++ b/drivers/rtc/rtc-omap.c
@@ -135,44 +135,6 @@ static irqreturn_t rtc_irq(int irq, void *rtc)
 	return IRQ_HANDLED;
 }
 
-#ifdef CONFIG_RTC_INTF_DEV
-
-static int
-omap_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
-{
-	u8 reg;
-
-	switch (cmd) {
-	case RTC_UIE_OFF:
-	case RTC_UIE_ON:
-		break;
-	default:
-		return -ENOIOCTLCMD;
-	}
-
-	local_irq_disable();
-	rtc_wait_not_busy();
-	reg = rtc_read(OMAP_RTC_INTERRUPTS_REG);
-	switch (cmd) {
-	/* UIE = Update Interrupt Enable (1/second) */
-	case RTC_UIE_OFF:
-		reg &= ~OMAP_RTC_INTERRUPTS_IT_TIMER;
-		break;
-	case RTC_UIE_ON:
-		reg |= OMAP_RTC_INTERRUPTS_IT_TIMER;
-		break;
-	}
-	rtc_wait_not_busy();
-	rtc_write(reg, OMAP_RTC_INTERRUPTS_REG);
-	local_irq_enable();
-
-	return 0;
-}
-
-#else
-#define omap_rtc_ioctl NULL
-#endif
-
 static int omap_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	u8 reg;
@@ -313,7 +275,6 @@ static int omap_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
 }
 
 static struct rtc_class_ops omap_rtc_ops = {
-	.ioctl = omap_rtc_ioctl,
 	.read_time = omap_rtc_read_time,
 	.set_time = omap_rtc_set_time,
 	.read_alarm = omap_rtc_read_alarm,
diff --git a/drivers/rtc/rtc-pcap.c b/drivers/rtc/rtc-pcap.c
index 25c0b3fd44f..a633abc4289 100644
--- a/drivers/rtc/rtc-pcap.c
+++ b/drivers/rtc/rtc-pcap.c
@@ -131,18 +131,12 @@ static int pcap_rtc_alarm_irq_enable(struct device *dev, unsigned int en)
 	return pcap_rtc_irq_enable(dev, PCAP_IRQ_TODA, en);
 }
 
-static int pcap_rtc_update_irq_enable(struct device *dev, unsigned int en)
-{
-	return pcap_rtc_irq_enable(dev, PCAP_IRQ_1HZ, en);
-}
-
 static const struct rtc_class_ops pcap_rtc_ops = {
 	.read_time = pcap_rtc_read_time,
 	.read_alarm = pcap_rtc_read_alarm,
 	.set_alarm = pcap_rtc_set_alarm,
 	.set_mmss = pcap_rtc_set_mmss,
 	.alarm_irq_enable = pcap_rtc_alarm_irq_enable,
-	.update_irq_enable = pcap_rtc_update_irq_enable,
 };
 
 static int __devinit pcap_rtc_probe(struct platform_device *pdev)
diff --git a/drivers/rtc/rtc-pcf50633.c b/drivers/rtc/rtc-pcf50633.c
index 16edf94ab42..f90c574f9d0 100644
--- a/drivers/rtc/rtc-pcf50633.c
+++ b/drivers/rtc/rtc-pcf50633.c
@@ -106,25 +106,6 @@ pcf50633_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int
-pcf50633_rtc_update_irq_enable(struct device *dev, unsigned int enabled)
-{
-	struct pcf50633_rtc *rtc = dev_get_drvdata(dev);
-	int err;
-
-	if (enabled)
-		err = pcf50633_irq_unmask(rtc->pcf, PCF50633_IRQ_SECOND);
-	else
-		err = pcf50633_irq_mask(rtc->pcf, PCF50633_IRQ_SECOND);
-
-	if (err < 0)
-		return err;
-
-	rtc->second_enabled = enabled;
-
-	return 0;
-}
-
 static int pcf50633_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
 	struct pcf50633_rtc *rtc;
@@ -262,8 +243,7 @@ static struct rtc_class_ops pcf50633_rtc_ops = {
 	.set_time = pcf50633_rtc_set_time,
 	.read_alarm = pcf50633_rtc_read_alarm,
 	.set_alarm = pcf50633_rtc_set_alarm,
 	.alarm_irq_enable = pcf50633_rtc_alarm_irq_enable,
-	.update_irq_enable = pcf50633_rtc_update_irq_enable,
 };
 
 static void pcf50633_rtc_irq(int irq, void *data)
diff --git a/drivers/rtc/rtc-pl030.c b/drivers/rtc/rtc-pl030.c
index bbdb2f02798..d554368c9f5 100644
--- a/drivers/rtc/rtc-pl030.c
+++ b/drivers/rtc/rtc-pl030.c
@@ -35,11 +35,6 @@ static irqreturn_t pl030_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static int pl030_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
-{
-	return -ENOIOCTLCMD;
-}
-
 static int pl030_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 {
 	struct pl030_rtc *rtc = dev_get_drvdata(dev);
@@ -96,7 +91,6 @@ static int pl030_set_time(struct device *dev, struct rtc_time *tm)
 }
 
 static const struct rtc_class_ops pl030_ops = {
-	.ioctl = pl030_ioctl,
 	.read_time = pl030_read_time,
 	.set_time = pl030_set_time,
 	.read_alarm = pl030_read_alarm,
diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c
index b7a6690e5b3..d829ea63c4f 100644
--- a/drivers/rtc/rtc-pl031.c
+++ b/drivers/rtc/rtc-pl031.c
@@ -293,57 +293,6 @@ static int pl031_set_alarm(struct device *dev, struct rtc_wkalrm *alarm)
 	return ret;
 }
 
-/* Periodic interrupt is only available in ST variants. */
-static int pl031_irq_set_state(struct device *dev, int enabled)
-{
-	struct pl031_local *ldata = dev_get_drvdata(dev);
-
-	if (enabled == 1) {
-		/* Clear any pending timer interrupt. */
-		writel(RTC_BIT_PI, ldata->base + RTC_ICR);
-
-		writel(readl(ldata->base + RTC_IMSC) | RTC_BIT_PI,
-			ldata->base + RTC_IMSC);
-
-		/* Now start the timer */
-		writel(readl(ldata->base + RTC_TCR) | RTC_TCR_EN,
-			ldata->base + RTC_TCR);
-
-	} else {
-		writel(readl(ldata->base + RTC_IMSC) & (~RTC_BIT_PI),
-			ldata->base + RTC_IMSC);
-
-		/* Also stop the timer */
-		writel(readl(ldata->base + RTC_TCR) & (~RTC_TCR_EN),
-			ldata->base + RTC_TCR);
-	}
-	/* Wait at least 1 RTC32 clock cycle to ensure next access
-	 * to RTC_TCR will succeed.
-	 */
-	udelay(40);
-
-	return 0;
-}
-
-static int pl031_irq_set_freq(struct device *dev, int freq)
-{
-	struct pl031_local *ldata = dev_get_drvdata(dev);
-
-	/* Cant set timer if it is already enabled */
-	if (readl(ldata->base + RTC_TCR) & RTC_TCR_EN) {
-		dev_err(dev, "can't change frequency while timer enabled\n");
-		return -EINVAL;
-	}
-
-	/* If self start bit in RTC_TCR is set timer will start here,
-	 * but we never set that bit. Instead we start the timer when
-	 * set_state is called with enabled == 1.
-	 */
-	writel(RTC_TIMER_FREQ / freq, ldata->base + RTC_TLR);
-
-	return 0;
-}
-
 static int pl031_remove(struct amba_device *adev)
 {
 	struct pl031_local *ldata = dev_get_drvdata(&adev->dev);
@@ -440,8 +389,6 @@ static struct rtc_class_ops stv1_pl031_ops = {
 	.read_alarm = pl031_read_alarm,
 	.set_alarm = pl031_set_alarm,
 	.alarm_irq_enable = pl031_alarm_irq_enable,
-	.irq_set_state = pl031_irq_set_state,
-	.irq_set_freq = pl031_irq_set_freq,
 };
 
 /* And the second ST derivative */
@@ -451,8 +398,6 @@ static struct rtc_class_ops stv2_pl031_ops = {
 	.read_alarm = pl031_stv2_read_alarm,
 	.set_alarm = pl031_stv2_set_alarm,
 	.alarm_irq_enable = pl031_alarm_irq_enable,
-	.irq_set_state = pl031_irq_set_state,
-	.irq_set_freq = pl031_irq_set_freq,
 };
 
 static struct amba_id pl031_ids[] = {
diff --git a/drivers/rtc/rtc-proc.c b/drivers/rtc/rtc-proc.c
index 242bbf86c74..0a59fda5c09 100644
--- a/drivers/rtc/rtc-proc.c
+++ b/drivers/rtc/rtc-proc.c
@@ -69,6 +69,14 @@ static int rtc_proc_show(struct seq_file *seq, void *offset)
 			alrm.enabled ? "yes" : "no");
 		seq_printf(seq, "alrm_pending\t: %s\n",
 			alrm.pending ? "yes" : "no");
+		seq_printf(seq, "update IRQ enabled\t: %s\n",
+			(rtc->uie_rtctimer.enabled) ? "yes" : "no");
+		seq_printf(seq, "periodic IRQ enabled\t: %s\n",
+			(rtc->pie_enabled) ? "yes" : "no");
+		seq_printf(seq, "periodic IRQ frequency\t: %d\n",
+			rtc->irq_freq);
+		seq_printf(seq, "max user IRQ frequency\t: %d\n",
+			rtc->max_user_freq);
 	}
 
 	seq_printf(seq, "24hr\t\t: yes\n");
diff --git a/drivers/rtc/rtc-pxa.c b/drivers/rtc/rtc-pxa.c
index 29e867a1aaa..fc9f4991574 100644
--- a/drivers/rtc/rtc-pxa.c
+++ b/drivers/rtc/rtc-pxa.c
@@ -209,32 +209,6 @@ static void pxa_rtc_release(struct device *dev)
 	free_irq(pxa_rtc->irq_1Hz, dev);
 }
 
-static int pxa_periodic_irq_set_freq(struct device *dev, int freq)
-{
-	struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev);
-	int period_ms;
-
-	if (freq < 1 || freq > MAXFREQ_PERIODIC)
-		return -EINVAL;
-
-	period_ms = 1000 / freq;
-	rtc_writel(pxa_rtc, PIAR, period_ms);
-
-	return 0;
-}
-
-static int pxa_periodic_irq_set_state(struct device *dev, int enabled)
-{
-	struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev);
-
-	if (enabled)
-		rtsr_set_bits(pxa_rtc, RTSR_PIALE | RTSR_PICE);
-	else
-		rtsr_clear_bits(pxa_rtc, RTSR_PIALE | RTSR_PICE);
-
-	return 0;
-}
-
 static int pxa_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev);
@@ -250,21 +224,6 @@ static int pxa_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int pxa_update_irq_enable(struct device *dev, unsigned int enabled)
-{
-	struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev);
-
-	spin_lock_irq(&pxa_rtc->lock);
-
-	if (enabled)
-		rtsr_set_bits(pxa_rtc, RTSR_HZE);
-	else
-		rtsr_clear_bits(pxa_rtc, RTSR_HZE);
-
-	spin_unlock_irq(&pxa_rtc->lock);
-	return 0;
-}
-
 static int pxa_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
 	struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev);
@@ -346,10 +305,7 @@ static const struct rtc_class_ops pxa_rtc_ops = {
 	.read_alarm = pxa_rtc_read_alarm,
 	.set_alarm = pxa_rtc_set_alarm,
 	.alarm_irq_enable = pxa_alarm_irq_enable,
-	.update_irq_enable = pxa_update_irq_enable,
 	.proc = pxa_rtc_proc,
-	.irq_set_state = pxa_periodic_irq_set_state,
-	.irq_set_freq = pxa_periodic_irq_set_freq,
 };
 
 static int __init pxa_rtc_probe(struct platform_device *pdev)
diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c
index 6aaa1550e3b..85c1b848dd7 100644
--- a/drivers/rtc/rtc-rs5c372.c
+++ b/drivers/rtc/rtc-rs5c372.c
@@ -281,57 +281,6 @@ static int rs5c372_rtc_set_time(struct device *dev, struct rtc_time *tm)
 	return rs5c372_set_datetime(to_i2c_client(dev), tm);
 }
 
-#if defined(CONFIG_RTC_INTF_DEV) || defined(CONFIG_RTC_INTF_DEV_MODULE)
-
-static int
-rs5c_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
-{
-	struct i2c_client *client = to_i2c_client(dev);
-	struct rs5c372 *rs5c = i2c_get_clientdata(client);
-	unsigned char buf;
-	int status, addr;
-
-	buf = rs5c->regs[RS5C_REG_CTRL1];
-	switch (cmd) {
-	case RTC_UIE_OFF:
-	case RTC_UIE_ON:
-		/* some 327a modes use a different IRQ pin for 1Hz irqs */
-		if (rs5c->type == rtc_rs5c372a
-				&& (buf & RS5C372A_CTRL1_SL1))
-			return -ENOIOCTLCMD;
-	default:
-		return -ENOIOCTLCMD;
-	}
-
-	status = rs5c_get_regs(rs5c);
-	if (status < 0)
-		return status;
-
-	addr = RS5C_ADDR(RS5C_REG_CTRL1);
-	switch (cmd) {
-	case RTC_UIE_OFF: /* update off */
-		buf &= ~RS5C_CTRL1_CT_MASK;
-		break;
-	case RTC_UIE_ON: /* update on */
-		buf &= ~RS5C_CTRL1_CT_MASK;
-		buf |= RS5C_CTRL1_CT4;
-		break;
-	}
-
-	if (i2c_smbus_write_byte_data(client, addr, buf) < 0) {
-		printk(KERN_WARNING "%s: can't update alarm\n",
-			rs5c->rtc->name);
-		status = -EIO;
-	} else
-		rs5c->regs[RS5C_REG_CTRL1] = buf;
-
-	return status;
-}
-
-#else
-#define rs5c_rtc_ioctl NULL
-#endif
-
 
 static int rs5c_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
@@ -480,7 +429,6 @@ static int rs5c372_rtc_proc(struct device *dev, struct seq_file *seq)
 
 static const struct rtc_class_ops rs5c372_rtc_ops = {
 	.proc = rs5c372_rtc_proc,
-	.ioctl = rs5c_rtc_ioctl,
 	.read_time = rs5c372_rtc_read_time,
 	.set_time = rs5c372_rtc_set_time,
 	.read_alarm = rs5c_read_alarm,
diff --git a/drivers/rtc/rtc-rx8025.c b/drivers/rtc/rtc-rx8025.c
index af32a62e12a..fde172fb2ab 100644
--- a/drivers/rtc/rtc-rx8025.c
+++ b/drivers/rtc/rtc-rx8025.c
@@ -424,37 +424,12 @@ static int rx8025_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int rx8025_irq_set_state(struct device *dev, int enabled)
-{
-	struct i2c_client *client = to_i2c_client(dev);
-	struct rx8025_data *rx8025 = i2c_get_clientdata(client);
-	int ctrl1;
-	int err;
-
-	if (client->irq <= 0)
-		return -ENXIO;
-
-	ctrl1 = rx8025->ctrl1 & ~RX8025_BIT_CTRL1_CT;
-	if (enabled)
-		ctrl1 |= RX8025_BIT_CTRL1_CT_1HZ;
-	if (ctrl1 != rx8025->ctrl1) {
-		rx8025->ctrl1 = ctrl1;
-		err = rx8025_write_reg(rx8025->client, RX8025_REG_CTRL1,
-				       rx8025->ctrl1);
-		if (err)
-			return err;
-	}
-
-	return 0;
-}
-
 static struct rtc_class_ops rx8025_rtc_ops = {
 	.read_time = rx8025_get_time,
 	.set_time = rx8025_set_time,
 	.read_alarm = rx8025_read_alarm,
	.set_alarm = rx8025_set_alarm,
 	.alarm_irq_enable = rx8025_alarm_irq_enable,
-	.irq_set_state = rx8025_irq_set_state,
 };
 
 /*
diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c
index b80fa288240..714964913e5 100644
--- a/drivers/rtc/rtc-s3c.c
+++ b/drivers/rtc/rtc-s3c.c
@@ -93,37 +93,6 @@ static int s3c_rtc_setaie(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int s3c_rtc_setpie(struct device *dev, int enabled)
-{
-	unsigned int tmp;
-
-	pr_debug("%s: pie=%d\n", __func__, enabled);
-
-	spin_lock_irq(&s3c_rtc_pie_lock);
-
-	if (s3c_rtc_cpu_type == TYPE_S3C64XX) {
-		tmp = readw(s3c_rtc_base + S3C2410_RTCCON);
-		tmp &= ~S3C64XX_RTCCON_TICEN;
-
-		if (enabled)
-			tmp |= S3C64XX_RTCCON_TICEN;
-
-		writew(tmp, s3c_rtc_base + S3C2410_RTCCON);
-	} else {
-		tmp = readb(s3c_rtc_base + S3C2410_TICNT);
-		tmp &= ~S3C2410_TICNT_ENABLE;
-
-		if (enabled)
-			tmp |= S3C2410_TICNT_ENABLE;
-
-		writeb(tmp, s3c_rtc_base + S3C2410_TICNT);
-	}
-
-	spin_unlock_irq(&s3c_rtc_pie_lock);
-
-	return 0;
-}
-
 static int s3c_rtc_setfreq(struct device *dev, int freq)
 {
 	struct platform_device *pdev = to_platform_device(dev);
@@ -379,8 +348,6 @@ static const struct rtc_class_ops s3c_rtcops = {
 	.set_time = s3c_rtc_settime,
 	.read_alarm = s3c_rtc_getalarm,
 	.set_alarm = s3c_rtc_setalarm,
-	.irq_set_freq = s3c_rtc_setfreq,
-	.irq_set_state = s3c_rtc_setpie,
 	.proc = s3c_rtc_proc,
 	.alarm_irq_enable = s3c_rtc_setaie,
 };
diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c
index 5dfe5ffcb0d..0b40bb88a88 100644
--- a/drivers/rtc/rtc-sa1100.c
+++ b/drivers/rtc/rtc-sa1100.c
@@ -43,7 +43,6 @@
 #define RTC_DEF_TRIM 0
 
 static const unsigned long RTC_FREQ = 1024;
-static unsigned long timer_freq;
 static struct rtc_time rtc_alarm;
 static DEFINE_SPINLOCK(sa1100_rtc_lock);
 
@@ -156,114 +155,11 @@ static irqreturn_t sa1100_rtc_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static int sa1100_irq_set_freq(struct device *dev, int freq)
-{
-	if (freq < 1 || freq > timer_freq) {
-		return -EINVAL;
-	} else {
-		struct rtc_device *rtc = (struct rtc_device *)dev;
-
-		rtc->irq_freq = freq;
-
-		return 0;
-	}
-}
-
-static int rtc_timer1_count;
-
-static int sa1100_irq_set_state(struct device *dev, int enabled)
-{
-	spin_lock_irq(&sa1100_rtc_lock);
-	if (enabled) {
-		struct rtc_device *rtc = (struct rtc_device *)dev;
-
-		OSMR1 = timer_freq / rtc->irq_freq + OSCR;
-		OIER |= OIER_E1;
-		rtc_timer1_count = 1;
-	} else {
-		OIER &= ~OIER_E1;
-	}
-	spin_unlock_irq(&sa1100_rtc_lock);
-
-	return 0;
-}
-
-static inline int sa1100_timer1_retrigger(struct rtc_device *rtc)
-{
-	unsigned long diff;
-	unsigned long period = timer_freq / rtc->irq_freq;
-
-	spin_lock_irq(&sa1100_rtc_lock);
-
-	do {
-		OSMR1 += period;
-		diff = OSMR1 - OSCR;
-		/* If OSCR > OSMR1, diff is a very large number (unsigned
-		 * math). This means we have a lost interrupt. */
-	} while (diff > period);
-	OIER |= OIER_E1;
-
-	spin_unlock_irq(&sa1100_rtc_lock);
-
-	return 0;
-}
-
-static irqreturn_t timer1_interrupt(int irq, void *dev_id)
-{
-	struct platform_device *pdev = to_platform_device(dev_id);
-	struct rtc_device *rtc = platform_get_drvdata(pdev);
-
-	/*
-	 * If we match for the first time, rtc_timer1_count will be 1.
-	 * Otherwise, we wrapped around (very unlikely but
-	 * still possible) so compute the amount of missed periods.
-	 * The match reg is updated only when the data is actually retrieved
-	 * to avoid unnecessary interrupts.
-	 */
-	OSSR = OSSR_M1;	/* clear match on timer1 */
-
-	rtc_update_irq(rtc, rtc_timer1_count, RTC_PF | RTC_IRQF);
-
-	if (rtc_timer1_count == 1)
-		rtc_timer1_count =
-			(rtc->irq_freq * ((1 << 30) / (timer_freq >> 2)));
-
-	/* retrigger. */
-	sa1100_timer1_retrigger(rtc);
-
-	return IRQ_HANDLED;
-}
-
-static int sa1100_rtc_read_callback(struct device *dev, int data)
-{
-	if (data & RTC_PF) {
-		struct rtc_device *rtc = (struct rtc_device *)dev;
-
-		/* interpolate missed periods and set match for the next */
-		unsigned long period = timer_freq / rtc->irq_freq;
-		unsigned long oscr = OSCR;
-		unsigned long osmr1 = OSMR1;
-		unsigned long missed = (oscr - osmr1)/period;
-		data += missed << 8;
-		OSSR = OSSR_M1;	/* clear match on timer 1 */
-		OSMR1 = osmr1 + (missed + 1)*period;
-		/* Ensure we didn't miss another match in the mean time.
-		 * Here we compare (match - OSCR) 8 instead of 0 --
-		 * see comment in pxa_timer_interrupt() for explanation.
-		 */
-		while ((signed long)((osmr1 = OSMR1) - OSCR) <= 8) {
-			data += 0x100;
-			OSSR = OSSR_M1;	/* clear match on timer 1 */
-			OSMR1 = osmr1 + period;
-		}
-	}
-	return data;
-}
-
 static int sa1100_rtc_open(struct device *dev)
 {
 	int ret;
-	struct rtc_device *rtc = (struct rtc_device *)dev;
+	struct platform_device *plat_dev = to_platform_device(dev);
+	struct rtc_device *rtc = platform_get_drvdata(plat_dev);
 
 	ret = request_irq(IRQ_RTC1Hz, sa1100_rtc_interrupt, IRQF_DISABLED,
 			"rtc 1Hz", dev);
@@ -277,19 +173,11 @@ static int sa1100_rtc_open(struct device *dev)
 		dev_err(dev, "IRQ %d already in use.\n", IRQ_RTCAlrm);
 		goto fail_ai;
 	}
-	ret = request_irq(IRQ_OST1, timer1_interrupt, IRQF_DISABLED,
-		"rtc timer", dev);
-	if (ret) {
-		dev_err(dev, "IRQ %d already in use.\n", IRQ_OST1);
-		goto fail_pi;
-	}
 	rtc->max_user_freq = RTC_FREQ;
-	sa1100_irq_set_freq(dev, RTC_FREQ);
+	rtc_irq_set_freq(rtc, NULL, RTC_FREQ);
 
 	return 0;
 
- fail_pi:
-	free_irq(IRQ_RTCAlrm, dev);
 fail_ai:
 	free_irq(IRQ_RTC1Hz, dev);
 fail_ui:
@@ -304,30 +192,10 @@ static void sa1100_rtc_release(struct device *dev)
 	OSSR = OSSR_M1;
 	spin_unlock_irq(&sa1100_rtc_lock);
 
-	free_irq(IRQ_OST1, dev);
 	free_irq(IRQ_RTCAlrm, dev);
 	free_irq(IRQ_RTC1Hz, dev);
 }
 
-
-static int sa1100_rtc_ioctl(struct device *dev, unsigned int cmd,
-		unsigned long arg)
-{
-	switch (cmd) {
-	case RTC_UIE_OFF:
-		spin_lock_irq(&sa1100_rtc_lock);
-		RTSR &= ~RTSR_HZE;
-		spin_unlock_irq(&sa1100_rtc_lock);
-		return 0;
-	case RTC_UIE_ON:
-		spin_lock_irq(&sa1100_rtc_lock);
-		RTSR |= RTSR_HZE;
-		spin_unlock_irq(&sa1100_rtc_lock);
-		return 0;
-	}
-	return -ENOIOCTLCMD;
-}
-
 static int sa1100_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	spin_lock_irq(&sa1100_rtc_lock);
@@ -386,31 +254,20 @@ static int sa1100_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 
 static int sa1100_rtc_proc(struct device *dev, struct seq_file *seq)
 {
-	struct rtc_device *rtc = (struct rtc_device *)dev;
-
-	seq_printf(seq, "trim/divider\t: 0x%08x\n", (u32) RTTR);
-	seq_printf(seq, "update_IRQ\t: %s\n",
-		(RTSR & RTSR_HZE) ? "yes" : "no");
-	seq_printf(seq, "periodic_IRQ\t: %s\n",
-		(OIER & OIER_E1) ? "yes" : "no");
-	seq_printf(seq, "periodic_freq\t: %d\n", rtc->irq_freq);
-	seq_printf(seq, "RTSR\t\t: 0x%08x\n", (u32)RTSR);
+	seq_printf(seq, "trim/divider\t\t: 0x%08x\n", (u32) RTTR);
+	seq_printf(seq, "RTSR\t\t\t: 0x%08x\n", (u32)RTSR);
 
 	return 0;
 }
 
 static const struct rtc_class_ops sa1100_rtc_ops = {
 	.open = sa1100_rtc_open,
-	.read_callback = sa1100_rtc_read_callback,
 	.release = sa1100_rtc_release,
-	.ioctl = sa1100_rtc_ioctl,
 	.read_time = sa1100_rtc_read_time,
 	.set_time = sa1100_rtc_set_time,
 	.read_alarm = sa1100_rtc_read_alarm,
 	.set_alarm = sa1100_rtc_set_alarm,
 	.proc = sa1100_rtc_proc,
-	.irq_set_freq = sa1100_irq_set_freq,
-	.irq_set_state = sa1100_irq_set_state,
 	.alarm_irq_enable = sa1100_rtc_alarm_irq_enable,
 };
 
@@ -418,8 +275,6 @@ static int sa1100_rtc_probe(struct platform_device *pdev)
 {
 	struct rtc_device *rtc;
 
-	timer_freq = get_clock_tick_rate();
-
 	/*
 	 * According to the manual we should be able to let RTTR be zero
 	 * and then a default diviser for a 32.768KHz clock is used.
@@ -445,11 +300,6 @@ static int sa1100_rtc_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, rtc);
 
-	/* Set the irq_freq */
-	/*TODO: Find out who is messing with this value after we initialize
-	 * it here.*/
-	rtc->irq_freq = RTC_FREQ;
-
 	/* Fix for a nasty initialization problem the in SA11xx RTSR register.
 	 * See also the comments in sa1100_rtc_interrupt().
 	 *
diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c
index 93314a9e7fa..e55dc1ac83a 100644
--- a/drivers/rtc/rtc-sh.c
+++ b/drivers/rtc/rtc-sh.c
@@ -344,27 +344,6 @@ static inline void sh_rtc_setcie(struct device *dev, unsigned int enable)
 	spin_unlock_irq(&rtc->lock);
 }
 
-static int sh_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
-{
-	struct sh_rtc *rtc = dev_get_drvdata(dev);
-	unsigned int ret = 0;
-
-	switch (cmd) {
-	case RTC_UIE_OFF:
-		rtc->periodic_freq &= ~PF_OXS;
-		sh_rtc_setcie(dev, 0);
-		break;
-	case RTC_UIE_ON:
-		rtc->periodic_freq |= PF_OXS;
-		sh_rtc_setcie(dev, 1);
-		break;
-	default:
-		ret = -ENOIOCTLCMD;
-	}
-
-	return ret;
-}
-
 static int sh_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	sh_rtc_setaie(dev, enabled);
@@ -598,13 +577,10 @@ static int sh_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
 }
 
 static struct rtc_class_ops sh_rtc_ops = {
-	.ioctl = sh_rtc_ioctl,
 	.read_time = sh_rtc_read_time,
 	.set_time = sh_rtc_set_time,
 	.read_alarm = sh_rtc_read_alarm,
 	.set_alarm = sh_rtc_set_alarm,
-	.irq_set_state = sh_rtc_irq_set_state,
-	.irq_set_freq = sh_rtc_irq_set_freq,
 	.proc = sh_rtc_proc,
 	.alarm_irq_enable = sh_rtc_alarm_irq_enable,
 };
diff --git a/drivers/rtc/rtc-stmp3xxx.c b/drivers/rtc/rtc-stmp3xxx.c
index 7e7d0c806f2..572e9534b59 100644
--- a/drivers/rtc/rtc-stmp3xxx.c
+++ b/drivers/rtc/rtc-stmp3xxx.c
@@ -115,19 +115,6 @@ static int stmp3xxx_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int stmp3xxx_update_irq_enable(struct device *dev, unsigned int enabled)
-{
-	struct stmp3xxx_rtc_data *rtc_data = dev_get_drvdata(dev);
-
-	if (enabled)
-		stmp3xxx_setl(BM_RTC_CTRL_ONEMSEC_IRQ_EN,
-				rtc_data->io + HW_RTC_CTRL);
-	else
-		stmp3xxx_clearl(BM_RTC_CTRL_ONEMSEC_IRQ_EN,
-				rtc_data->io + HW_RTC_CTRL);
-	return 0;
-}
-
 static int stmp3xxx_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
 {
 	struct stmp3xxx_rtc_data *rtc_data = dev_get_drvdata(dev);
@@ -149,8 +136,6 @@ static int stmp3xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
 static struct rtc_class_ops stmp3xxx_rtc_ops = {
 	.alarm_irq_enable =
 			  stmp3xxx_alarm_irq_enable,
-	.update_irq_enable =
-			  stmp3xxx_update_irq_enable,
 	.read_time = stmp3xxx_rtc_gettime,
 	.set_mmss = stmp3xxx_rtc_set_mmss,
 	.read_alarm = stmp3xxx_rtc_read_alarm,
diff --git a/drivers/rtc/rtc-test.c b/drivers/rtc/rtc-test.c
index a82d6fe9707..7e96254bd36 100644
--- a/drivers/rtc/rtc-test.c
+++ b/drivers/rtc/rtc-test.c
@@ -78,11 +78,16 @@ static ssize_t test_irq_store(struct device *dev,
 	struct rtc_device *rtc = platform_get_drvdata(plat_dev);
 
 	retval = count;
-	if (strncmp(buf, "tick", 4) == 0)
+	if (strncmp(buf, "tick", 4) == 0 && rtc->pie_enabled)
 		rtc_update_irq(rtc, 1, RTC_PF | RTC_IRQF);
-	else if (strncmp(buf, "alarm", 5) == 0)
-		rtc_update_irq(rtc, 1, RTC_AF | RTC_IRQF);
-	else if (strncmp(buf, "update", 6) == 0)
+	else if (strncmp(buf, "alarm", 5) == 0) {
+		struct rtc_wkalrm alrm;
+		int err = rtc_read_alarm(rtc, &alrm);
+
+		if (!err && alrm.enabled)
+			rtc_update_irq(rtc, 1, RTC_AF | RTC_IRQF);
+
+	} else if (strncmp(buf, "update", 6) == 0 && rtc->uie_rtctimer.enabled)
 		rtc_update_irq(rtc, 1, RTC_UF | RTC_IRQF);
 	else
 		retval = -EINVAL;
diff --git a/drivers/rtc/rtc-twl.c b/drivers/rtc/rtc-twl.c
index ed1b8682812..f9a2799c44d 100644
--- a/drivers/rtc/rtc-twl.c
+++ b/drivers/rtc/rtc-twl.c
@@ -213,18 +213,6 @@ static int twl_rtc_alarm_irq_enable(struct device *dev, unsigned enabled)
 	return ret;
 }
 
-static int twl_rtc_update_irq_enable(struct device *dev, unsigned enabled)
-{
-	int ret;
-
-	if (enabled)
-		ret = set_rtc_irq_bit(BIT_RTC_INTERRUPTS_REG_IT_TIMER_M);
-	else
-		ret = mask_rtc_irq_bit(BIT_RTC_INTERRUPTS_REG_IT_TIMER_M);
-
-	return ret;
-}
-
 /*
  * Gets current TWL RTC time and date parameters.
  *
@@ -433,7 +421,6 @@ static struct rtc_class_ops twl_rtc_ops = {
 	.read_alarm = twl_rtc_read_alarm,
 	.set_alarm = twl_rtc_set_alarm,
 	.alarm_irq_enable = twl_rtc_alarm_irq_enable,
-	.update_irq_enable = twl_rtc_update_irq_enable,
 };
 
 /*----------------------------------------------------------------------*/
diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c
index 769190ac6d1..c5698cda366 100644
--- a/drivers/rtc/rtc-vr41xx.c
+++ b/drivers/rtc/rtc-vr41xx.c
@@ -207,36 +207,6 @@ static int vr41xx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
 	return 0;
 }
 
-static int vr41xx_rtc_irq_set_freq(struct device *dev, int freq)
-{
-	u64 count;
-
-	if (!is_power_of_2(freq))
-		return -EINVAL;
-	count = RTC_FREQUENCY;
-	do_div(count, freq);
-
-	spin_lock_irq(&rtc_lock);
-
-	periodic_count = count;
-	rtc1_write(RTCL1LREG, periodic_count);
-	rtc1_write(RTCL1HREG, periodic_count >> 16);
-
-	spin_unlock_irq(&rtc_lock);
-
-	return 0;
-}
-
-static int vr41xx_rtc_irq_set_state(struct device *dev, int enabled)
-{
-	if (enabled)
-		enable_irq(pie_irq);
-	else
-		disable_irq(pie_irq);
-
-	return 0;
-}
-
 static int vr41xx_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -308,8 +278,6 @@ static const struct rtc_class_ops vr41xx_rtc_ops = {
 	.set_time = vr41xx_rtc_set_time,
 	.read_alarm = vr41xx_rtc_read_alarm,
 	.set_alarm = vr41xx_rtc_set_alarm,
-	.irq_set_freq = vr41xx_rtc_irq_set_freq,
-	.irq_set_state = vr41xx_rtc_irq_set_state,
 };
 
 static int __devinit rtc_probe(struct platform_device *pdev)
diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c
index 82931dc65c0..bdc909bd56d 100644
--- a/drivers/rtc/rtc-wm831x.c
+++ b/drivers/rtc/rtc-wm831x.c
@@ -315,21 +315,6 @@ static int wm831x_rtc_alarm_irq_enable(struct device *dev,
 	return wm831x_rtc_stop_alarm(wm831x_rtc);
 }
 
-static int wm831x_rtc_update_irq_enable(struct device *dev,
-					unsigned int enabled)
-{
-	struct wm831x_rtc *wm831x_rtc = dev_get_drvdata(dev);
-	int val;
-
-	if (enabled)
-		val = 1 << WM831X_RTC_PINT_FREQ_SHIFT;
-	else
-		val = 0;
-
-	return wm831x_set_bits(wm831x_rtc->wm831x, WM831X_RTC_CONTROL,
-			       WM831X_RTC_PINT_FREQ_MASK, val);
-}
-
 static irqreturn_t wm831x_alm_irq(int irq, void *data)
 {
 	struct wm831x_rtc *wm831x_rtc = data;
@@ -354,7 +339,6 @@ static const struct rtc_class_ops wm831x_rtc_ops = {
 	.read_alarm = wm831x_rtc_readalarm,
 	.set_alarm = wm831x_rtc_setalarm,
 	.alarm_irq_enable = wm831x_rtc_alarm_irq_enable,
-	.update_irq_enable = wm831x_rtc_update_irq_enable,
 };
 
 #ifdef CONFIG_PM
diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c
index 3d0dc76b38a..66421426e40 100644
--- a/drivers/rtc/rtc-wm8350.c
+++ b/drivers/rtc/rtc-wm8350.c
@@ -302,26 +302,6 @@ static int wm8350_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
 	return ret;
 }
 
-static int wm8350_rtc_update_irq_enable(struct device *dev,
-					unsigned int enabled)
-{
-	struct wm8350 *wm8350 = dev_get_drvdata(dev);
-
-	/* Suppress duplicate changes since genirq nests enable and
-	 * disable calls. */
-	if (enabled == wm8350->rtc.update_enabled)
-		return 0;
-
-	if (enabled)
-		wm8350_unmask_irq(wm8350, WM8350_IRQ_RTC_SEC);
-	else
-		wm8350_mask_irq(wm8350, WM8350_IRQ_RTC_SEC);
-
-	wm8350->rtc.update_enabled = enabled;
-
-	return 0;
-}
-
 static irqreturn_t wm8350_rtc_alarm_handler(int irq, void *data)
 {
 	struct wm8350 *wm8350 = data;
@@ -357,7 +337,6 @@ static const struct rtc_class_ops wm8350_rtc_ops = {
 	.read_alarm = wm8350_rtc_readalarm,
 	.set_alarm = wm8350_rtc_setalarm,
 	.alarm_irq_enable = wm8350_rtc_alarm_irq_enable,
-	.update_irq_enable = wm8350_rtc_update_irq_enable,
 };
 
 #ifdef CONFIG_PM
diff --git a/drivers/spi/pxa2xx_spi.c b/drivers/spi/pxa2xx_spi.c
index 95928833855..a429b01d028 100644
--- a/drivers/spi/pxa2xx_spi.c
+++ b/drivers/spi/pxa2xx_spi.c
@@ -1557,9 +1557,7 @@ static int __devinit pxa2xx_spi_probe(struct platform_device *pdev)
1557 drv_data->ssp = ssp; 1557 drv_data->ssp = ssp;
1558 1558
1559 master->dev.parent = &pdev->dev; 1559 master->dev.parent = &pdev->dev;
1560#ifdef CONFIG_OF
1561 master->dev.of_node = pdev->dev.of_node; 1560 master->dev.of_node = pdev->dev.of_node;
1562#endif
1563 /* the spi->mode bits understood by this driver: */ 1561 /* the spi->mode bits understood by this driver: */
1564 master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH; 1562 master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH;
1565 1563
diff --git a/drivers/spi/pxa2xx_spi_pci.c b/drivers/spi/pxa2xx_spi_pci.c
index 19752b09e15..378e504f89e 100644
--- a/drivers/spi/pxa2xx_spi_pci.c
+++ b/drivers/spi/pxa2xx_spi_pci.c
@@ -89,9 +89,7 @@ static int __devinit ce4100_spi_probe(struct pci_dev *dev,
89 goto err_nomem; 89 goto err_nomem;
90 90
91 pdev->dev.parent = &dev->dev; 91 pdev->dev.parent = &dev->dev;
92#ifdef CONFIG_OF
93 pdev->dev.of_node = dev->dev.of_node; 92 pdev->dev.of_node = dev->dev.of_node;
94#endif
95 ssp = &spi_info->ssp; 93 ssp = &spi_info->ssp;
96 ssp->phys_base = pci_resource_start(dev, 0); 94 ssp->phys_base = pci_resource_start(dev, 0);
97 ssp->mmio_base = ioremap(phys_beg, phys_len); 95 ssp->mmio_base = ioremap(phys_beg, phys_len);
diff --git a/drivers/spi/xilinx_spi.c b/drivers/spi/xilinx_spi.c
index 7adaef62a99..4d2c75df886 100644
--- a/drivers/spi/xilinx_spi.c
+++ b/drivers/spi/xilinx_spi.c
@@ -351,14 +351,12 @@ static irqreturn_t xilinx_spi_irq(int irq, void *dev_id)
351 return IRQ_HANDLED; 351 return IRQ_HANDLED;
352} 352}
353 353
354#ifdef CONFIG_OF
355static const struct of_device_id xilinx_spi_of_match[] = { 354static const struct of_device_id xilinx_spi_of_match[] = {
356 { .compatible = "xlnx,xps-spi-2.00.a", }, 355 { .compatible = "xlnx,xps-spi-2.00.a", },
357 { .compatible = "xlnx,xps-spi-2.00.b", }, 356 { .compatible = "xlnx,xps-spi-2.00.b", },
358 {} 357 {}
359}; 358};
360MODULE_DEVICE_TABLE(of, xilinx_spi_of_match); 359MODULE_DEVICE_TABLE(of, xilinx_spi_of_match);
361#endif
362 360
363struct spi_master *xilinx_spi_init(struct device *dev, struct resource *mem, 361struct spi_master *xilinx_spi_init(struct device *dev, struct resource *mem,
364 u32 irq, s16 bus_num, int num_cs, int little_endian, int bits_per_word) 362 u32 irq, s16 bus_num, int num_cs, int little_endian, int bits_per_word)
@@ -394,9 +392,7 @@ struct spi_master *xilinx_spi_init(struct device *dev, struct resource *mem,
394 392
395 master->bus_num = bus_num; 393 master->bus_num = bus_num;
396 master->num_chipselect = num_cs; 394 master->num_chipselect = num_cs;
397#ifdef CONFIG_OF
398 master->dev.of_node = dev->of_node; 395 master->dev.of_node = dev->of_node;
399#endif
400 396
401 xspi->mem = *mem; 397 xspi->mem = *mem;
402 xspi->irq = irq; 398 xspi->irq = irq;
@@ -539,9 +535,7 @@ static struct platform_driver xilinx_spi_driver = {
539 .driver = { 535 .driver = {
540 .name = XILINX_SPI_NAME, 536 .name = XILINX_SPI_NAME,
541 .owner = THIS_MODULE, 537 .owner = THIS_MODULE,
542#ifdef CONFIG_OF
543 .of_match_table = xilinx_spi_of_match, 538 .of_match_table = xilinx_spi_of_match,
544#endif
545 }, 539 },
546}; 540};
547 541
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 43f9f02c7db..718050ace08 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -232,7 +232,7 @@ static int increase_reservation(unsigned long nr_pages)
232 set_phys_to_machine(pfn, frame_list[i]); 232 set_phys_to_machine(pfn, frame_list[i]);
233 233
234 /* Link back into the page tables if not highmem. */ 234 /* Link back into the page tables if not highmem. */
235 if (pfn < max_low_pfn) { 235 if (!xen_hvm_domain() && pfn < max_low_pfn) {
236 int ret; 236 int ret;
237 ret = HYPERVISOR_update_va_mapping( 237 ret = HYPERVISOR_update_va_mapping(
238 (unsigned long)__va(pfn << PAGE_SHIFT), 238 (unsigned long)__va(pfn << PAGE_SHIFT),
@@ -280,7 +280,7 @@ static int decrease_reservation(unsigned long nr_pages)
280 280
281 scrub_page(page); 281 scrub_page(page);
282 282
283 if (!PageHighMem(page)) { 283 if (!xen_hvm_domain() && !PageHighMem(page)) {
284 ret = HYPERVISOR_update_va_mapping( 284 ret = HYPERVISOR_update_va_mapping(
285 (unsigned long)__va(pfn << PAGE_SHIFT), 285 (unsigned long)__va(pfn << PAGE_SHIFT),
286 __pte_ma(0), 0); 286 __pte_ma(0), 0);
@@ -296,7 +296,7 @@ static int decrease_reservation(unsigned long nr_pages)
296 /* No more mappings: invalidate P2M and add to balloon. */ 296 /* No more mappings: invalidate P2M and add to balloon. */
297 for (i = 0; i < nr_pages; i++) { 297 for (i = 0; i < nr_pages; i++) {
298 pfn = mfn_to_pfn(frame_list[i]); 298 pfn = mfn_to_pfn(frame_list[i]);
299 set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 299 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
300 balloon_append(pfn_to_page(pfn)); 300 balloon_append(pfn_to_page(pfn));
301 } 301 }
302 302
@@ -392,15 +392,19 @@ static struct notifier_block xenstore_notifier;
392 392
393static int __init balloon_init(void) 393static int __init balloon_init(void)
394{ 394{
395 unsigned long pfn, extra_pfn_end; 395 unsigned long pfn, nr_pages, extra_pfn_end;
396 struct page *page; 396 struct page *page;
397 397
398 if (!xen_pv_domain()) 398 if (!xen_domain())
399 return -ENODEV; 399 return -ENODEV;
400 400
401 pr_info("xen_balloon: Initialising balloon driver.\n"); 401 pr_info("xen_balloon: Initialising balloon driver.\n");
402 402
403 balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn); 403 if (xen_pv_domain())
404 nr_pages = xen_start_info->nr_pages;
405 else
406 nr_pages = max_pfn;
407 balloon_stats.current_pages = min(nr_pages, max_pfn);
404 balloon_stats.target_pages = balloon_stats.current_pages; 408 balloon_stats.target_pages = balloon_stats.current_pages;
405 balloon_stats.balloon_low = 0; 409 balloon_stats.balloon_low = 0;
406 balloon_stats.balloon_high = 0; 410 balloon_stats.balloon_high = 0;
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 74681478100..0ad1699a1b3 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -114,7 +114,7 @@ struct cpu_evtchn_s {
114static __initdata struct cpu_evtchn_s init_evtchn_mask = { 114static __initdata struct cpu_evtchn_s init_evtchn_mask = {
115 .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul, 115 .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul,
116}; 116};
117static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask; 117static struct cpu_evtchn_s __refdata *cpu_evtchn_mask_p = &init_evtchn_mask;
118 118
119static inline unsigned long *cpu_evtchn_mask(int cpu) 119static inline unsigned long *cpu_evtchn_mask(int cpu)
120{ 120{
@@ -277,7 +277,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
277 277
278 BUG_ON(irq == -1); 278 BUG_ON(irq == -1);
279#ifdef CONFIG_SMP 279#ifdef CONFIG_SMP
280 cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu)); 280 cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu));
281#endif 281#endif
282 282
283 clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq))); 283 clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq)));
@@ -294,7 +294,7 @@ static void init_evtchn_cpu_bindings(void)
294 294
295 /* By default all event channels notify CPU#0. */ 295 /* By default all event channels notify CPU#0. */
296 for_each_irq_desc(i, desc) { 296 for_each_irq_desc(i, desc) {
297 cpumask_copy(desc->affinity, cpumask_of(0)); 297 cpumask_copy(desc->irq_data.affinity, cpumask_of(0));
298 } 298 }
299#endif 299#endif
300 300
@@ -376,81 +376,69 @@ static void unmask_evtchn(int port)
376 put_cpu(); 376 put_cpu();
377} 377}
378 378
379static int get_nr_hw_irqs(void) 379static int xen_allocate_irq_dynamic(void)
380{ 380{
381 int ret = 1; 381 int first = 0;
382 int irq;
382 383
383#ifdef CONFIG_X86_IO_APIC 384#ifdef CONFIG_X86_IO_APIC
384 ret = get_nr_irqs_gsi(); 385 /*
386 * For an HVM guest or domain 0 which see "real" (emulated or
 387 * actual respectively) GSIs we allocate dynamic IRQs
388 * e.g. those corresponding to event channels or MSIs
389 * etc. from the range above those "real" GSIs to avoid
390 * collisions.
391 */
392 if (xen_initial_domain() || xen_hvm_domain())
393 first = get_nr_irqs_gsi();
385#endif 394#endif
386 395
387 return ret; 396retry:
388} 397 irq = irq_alloc_desc_from(first, -1);
389 398
390static int find_unbound_pirq(int type) 399 if (irq == -ENOMEM && first > NR_IRQS_LEGACY) {
391{ 400 printk(KERN_ERR "Out of dynamic IRQ space and eating into GSI space. You should increase nr_irqs\n");
392 int rc, i; 401 first = max(NR_IRQS_LEGACY, first - NR_IRQS_LEGACY);
393 struct physdev_get_free_pirq op_get_free_pirq; 402 goto retry;
394 op_get_free_pirq.type = type; 403 }
395 404
396 rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq); 405 if (irq < 0)
397 if (!rc) 406 panic("No available IRQ to bind to: increase nr_irqs!\n");
398 return op_get_free_pirq.pirq;
399 407
400 for (i = 0; i < nr_irqs; i++) { 408 return irq;
401 if (pirq_to_irq[i] < 0)
402 return i;
403 }
404 return -1;
405} 409}
406 410
407static int find_unbound_irq(void) 411static int xen_allocate_irq_gsi(unsigned gsi)
408{ 412{
409 struct irq_data *data; 413 int irq;
410 int irq, res;
411 int bottom = get_nr_hw_irqs();
412 int top = nr_irqs-1;
413
414 if (bottom == nr_irqs)
415 goto no_irqs;
416 414
417 /* This loop starts from the top of IRQ space and goes down. 415 /*
418 * We need this b/c if we have a PCI device in a Xen PV guest 416 * A PV guest has no concept of a GSI (since it has no ACPI
419 * we do not have an IO-APIC (though the backend might have them) 417 * nor access to/knowledge of the physical APICs). Therefore
420 * mapped in. To not have a collision of physical IRQs with the Xen 418 * all IRQs are dynamically allocated from the entire IRQ
421 * event channels start at the top of the IRQ space for virtual IRQs. 419 * space.
422 */ 420 */
423 for (irq = top; irq > bottom; irq--) { 421 if (xen_pv_domain() && !xen_initial_domain())
424 data = irq_get_irq_data(irq); 422 return xen_allocate_irq_dynamic();
425 /* only 15->0 have init'd desc; handle irq > 16 */
426 if (!data)
427 break;
428 if (data->chip == &no_irq_chip)
429 break;
430 if (data->chip != &xen_dynamic_chip)
431 continue;
432 if (irq_info[irq].type == IRQT_UNBOUND)
433 return irq;
434 }
435
436 if (irq == bottom)
437 goto no_irqs;
438 423
439 res = irq_alloc_desc_at(irq, -1); 424 /* Legacy IRQ descriptors are already allocated by the arch. */
425 if (gsi < NR_IRQS_LEGACY)
426 return gsi;
440 427
441 if (WARN_ON(res != irq)) 428 irq = irq_alloc_desc_at(gsi, -1);
442 return -1; 429 if (irq < 0)
430 panic("Unable to allocate to IRQ%d (%d)\n", gsi, irq);
443 431
444 return irq; 432 return irq;
445
446no_irqs:
447 panic("No available IRQ to bind to: increase nr_irqs!\n");
448} 433}
449 434
450static bool identity_mapped_irq(unsigned irq) 435static void xen_free_irq(unsigned irq)
451{ 436{
452 /* identity map all the hardware irqs */ 437 /* Legacy IRQ descriptors are managed by the arch. */
453 return irq < get_nr_hw_irqs(); 438 if (irq < NR_IRQS_LEGACY)
439 return;
440
441 irq_free_desc(irq);
454} 442}
455 443
456static void pirq_unmask_notify(int irq) 444static void pirq_unmask_notify(int irq)
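The retry logic in xen_allocate_irq_dynamic() above is compact but easy to misread: allocation starts above the range of "real" GSIs, and each -ENOMEM backs the search floor off toward (but never below) the legacy IRQs before trying again. Below is a minimal userspace sketch of that strategy; NR_IRQS_LEGACY matches the usual kernel constant, but sim_alloc_from() and the descriptor array are stand-ins for irq_alloc_desc_from(), not kernel API.

#include <stdio.h>
#include <errno.h>

#define NR_IRQS_LEGACY 16   /* matches the usual kernel value on x86 */
#define SIM_NR_IRQS    64   /* size of our toy descriptor space */

static int used[SIM_NR_IRQS];

/* Toy stand-in for irq_alloc_desc_from(): first free slot >= from. */
static int sim_alloc_from(int from)
{
    int i;

    for (i = from; i < SIM_NR_IRQS; i++) {
        if (!used[i]) {
            used[i] = 1;
            return i;
        }
    }
    return -ENOMEM;
}

/* Mirrors the shape of xen_allocate_irq_dynamic(): lower the floor, retry. */
static int allocate_dynamic(int first)
{
    int irq;

retry:
    irq = sim_alloc_from(first);
    if (irq == -ENOMEM && first > NR_IRQS_LEGACY) {
        fprintf(stderr, "out of dynamic IRQ space, backing off toward GSIs\n");
        first = (first - NR_IRQS_LEGACY > NR_IRQS_LEGACY)
                    ? first - NR_IRQS_LEGACY : NR_IRQS_LEGACY;
        goto retry;
    }
    return irq;
}

int main(void)
{
    int i;

    for (i = 0; i < 48; i++)   /* pretend GSIs 0..47 are already taken */
        used[i] = 1;
    printf("allocated irq %d\n", allocate_dynamic(48));
    return 0;
}

Once first has been clamped down to NR_IRQS_LEGACY the guard fails and the error propagates; that is the point at which the kernel version panics.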
@@ -486,7 +474,7 @@ static bool probing_irq(int irq)
486 return desc && desc->action == NULL; 474 return desc && desc->action == NULL;
487} 475}
488 476
489static unsigned int startup_pirq(unsigned int irq) 477static unsigned int __startup_pirq(unsigned int irq)
490{ 478{
491 struct evtchn_bind_pirq bind_pirq; 479 struct evtchn_bind_pirq bind_pirq;
492 struct irq_info *info = info_for_irq(irq); 480 struct irq_info *info = info_for_irq(irq);
@@ -524,9 +512,15 @@ out:
524 return 0; 512 return 0;
525} 513}
526 514
527static void shutdown_pirq(unsigned int irq) 515static unsigned int startup_pirq(struct irq_data *data)
516{
517 return __startup_pirq(data->irq);
518}
519
520static void shutdown_pirq(struct irq_data *data)
528{ 521{
529 struct evtchn_close close; 522 struct evtchn_close close;
523 unsigned int irq = data->irq;
530 struct irq_info *info = info_for_irq(irq); 524 struct irq_info *info = info_for_irq(irq);
531 int evtchn = evtchn_from_irq(irq); 525 int evtchn = evtchn_from_irq(irq);
532 526
@@ -546,20 +540,20 @@ static void shutdown_pirq(unsigned int irq)
546 info->evtchn = 0; 540 info->evtchn = 0;
547} 541}
548 542
549static void enable_pirq(unsigned int irq) 543static void enable_pirq(struct irq_data *data)
550{ 544{
551 startup_pirq(irq); 545 startup_pirq(data);
552} 546}
553 547
554static void disable_pirq(unsigned int irq) 548static void disable_pirq(struct irq_data *data)
555{ 549{
556} 550}
557 551
558static void ack_pirq(unsigned int irq) 552static void ack_pirq(struct irq_data *data)
559{ 553{
560 int evtchn = evtchn_from_irq(irq); 554 int evtchn = evtchn_from_irq(data->irq);
561 555
562 move_native_irq(irq); 556 move_native_irq(data->irq);
563 557
564 if (VALID_EVTCHN(evtchn)) { 558 if (VALID_EVTCHN(evtchn)) {
565 mask_evtchn(evtchn); 559 mask_evtchn(evtchn);
@@ -567,23 +561,6 @@ static void ack_pirq(unsigned int irq)
567 } 561 }
568} 562}
569 563
570static void end_pirq(unsigned int irq)
571{
572 int evtchn = evtchn_from_irq(irq);
573 struct irq_desc *desc = irq_to_desc(irq);
574
575 if (WARN_ON(!desc))
576 return;
577
578 if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) ==
579 (IRQ_DISABLED|IRQ_PENDING)) {
580 shutdown_pirq(irq);
581 } else if (VALID_EVTCHN(evtchn)) {
582 unmask_evtchn(evtchn);
583 pirq_unmask_notify(irq);
584 }
585}
586
587static int find_irq_by_gsi(unsigned gsi) 564static int find_irq_by_gsi(unsigned gsi)
588{ 565{
589 int irq; 566 int irq;
@@ -638,14 +615,7 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name)
638 goto out; /* XXX need refcount? */ 615 goto out; /* XXX need refcount? */
639 } 616 }
640 617
641 /* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore 618 irq = xen_allocate_irq_gsi(gsi);
642 * we are using the !xen_initial_domain() to drop in the function.*/
643 if (identity_mapped_irq(gsi) || (!xen_initial_domain() &&
644 xen_pv_domain())) {
645 irq = gsi;
646 irq_alloc_desc_at(irq, -1);
647 } else
648 irq = find_unbound_irq();
649 619
650 set_irq_chip_and_handler_name(irq, &xen_pirq_chip, 620 set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
651 handle_level_irq, name); 621 handle_level_irq, name);
@@ -658,7 +628,7 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name)
658 * this in the priv domain. */ 628 * this in the priv domain. */
659 if (xen_initial_domain() && 629 if (xen_initial_domain() &&
660 HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { 630 HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
661 irq_free_desc(irq); 631 xen_free_irq(irq);
662 irq = -ENOSPC; 632 irq = -ENOSPC;
663 goto out; 633 goto out;
664 } 634 }
@@ -674,87 +644,46 @@ out:
674} 644}
675 645
676#ifdef CONFIG_PCI_MSI 646#ifdef CONFIG_PCI_MSI
677#include <linux/msi.h> 647int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc)
678#include "../pci/msi.h"
679
680void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc)
681{ 648{
682 spin_lock(&irq_mapping_update_lock); 649 int rc;
683 650 struct physdev_get_free_pirq op_get_free_pirq;
684 if (alloc & XEN_ALLOC_IRQ) {
685 *irq = find_unbound_irq();
686 if (*irq == -1)
687 goto out;
688 }
689
690 if (alloc & XEN_ALLOC_PIRQ) {
691 *pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI);
692 if (*pirq == -1)
693 goto out;
694 }
695 651
696 set_irq_chip_and_handler_name(*irq, &xen_pirq_chip, 652 op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI;
697 handle_level_irq, name); 653 rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);
698 654
699 irq_info[*irq] = mk_pirq_info(0, *pirq, 0, 0); 655 WARN_ONCE(rc == -ENOSYS,
700 pirq_to_irq[*pirq] = *irq; 656 "hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n");
701 657
702out: 658 return rc ? -1 : op_get_free_pirq.pirq;
703 spin_unlock(&irq_mapping_update_lock);
704} 659}
705 660
706int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) 661int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
662 int pirq, int vector, const char *name)
707{ 663{
708 int irq = -1; 664 int irq, ret;
709 struct physdev_map_pirq map_irq;
710 int rc;
711 int pos;
712 u32 table_offset, bir;
713
714 memset(&map_irq, 0, sizeof(map_irq));
715 map_irq.domid = DOMID_SELF;
716 map_irq.type = MAP_PIRQ_TYPE_MSI;
717 map_irq.index = -1;
718 map_irq.pirq = -1;
719 map_irq.bus = dev->bus->number;
720 map_irq.devfn = dev->devfn;
721
722 if (type == PCI_CAP_ID_MSIX) {
723 pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
724
725 pci_read_config_dword(dev, msix_table_offset_reg(pos),
726 &table_offset);
727 bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
728
729 map_irq.table_base = pci_resource_start(dev, bir);
730 map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
731 }
732 665
733 spin_lock(&irq_mapping_update_lock); 666 spin_lock(&irq_mapping_update_lock);
734 667
735 irq = find_unbound_irq(); 668 irq = xen_allocate_irq_dynamic();
736
737 if (irq == -1) 669 if (irq == -1)
738 goto out; 670 goto out;
739 671
740 rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
741 if (rc) {
742 printk(KERN_WARNING "xen map irq failed %d\n", rc);
743
744 irq_free_desc(irq);
745
746 irq = -1;
747 goto out;
748 }
749 irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index);
750
751 set_irq_chip_and_handler_name(irq, &xen_pirq_chip, 672 set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
752 handle_level_irq, 673 handle_level_irq, name);
753 (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
754 674
675 irq_info[irq] = mk_pirq_info(0, pirq, 0, vector);
676 pirq_to_irq[pirq] = irq;
677 ret = irq_set_msi_desc(irq, msidesc);
678 if (ret < 0)
679 goto error_irq;
755out: 680out:
756 spin_unlock(&irq_mapping_update_lock); 681 spin_unlock(&irq_mapping_update_lock);
757 return irq; 682 return irq;
683error_irq:
684 spin_unlock(&irq_mapping_update_lock);
685 xen_free_irq(irq);
686 return -1;
758} 687}
759#endif 688#endif
760 689
@@ -779,11 +708,12 @@ int xen_destroy_irq(int irq)
779 printk(KERN_WARNING "unmap irq failed %d\n", rc); 708 printk(KERN_WARNING "unmap irq failed %d\n", rc);
780 goto out; 709 goto out;
781 } 710 }
782 pirq_to_irq[info->u.pirq.pirq] = -1;
783 } 711 }
712 pirq_to_irq[info->u.pirq.pirq] = -1;
713
784 irq_info[irq] = mk_unbound_info(); 714 irq_info[irq] = mk_unbound_info();
785 715
786 irq_free_desc(irq); 716 xen_free_irq(irq);
787 717
788out: 718out:
789 spin_unlock(&irq_mapping_update_lock); 719 spin_unlock(&irq_mapping_update_lock);
@@ -814,7 +744,7 @@ int bind_evtchn_to_irq(unsigned int evtchn)
814 irq = evtchn_to_irq[evtchn]; 744 irq = evtchn_to_irq[evtchn];
815 745
816 if (irq == -1) { 746 if (irq == -1) {
817 irq = find_unbound_irq(); 747 irq = xen_allocate_irq_dynamic();
818 748
819 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, 749 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
820 handle_fasteoi_irq, "event"); 750 handle_fasteoi_irq, "event");
@@ -839,7 +769,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
839 irq = per_cpu(ipi_to_irq, cpu)[ipi]; 769 irq = per_cpu(ipi_to_irq, cpu)[ipi];
840 770
841 if (irq == -1) { 771 if (irq == -1) {
842 irq = find_unbound_irq(); 772 irq = xen_allocate_irq_dynamic();
843 if (irq < 0) 773 if (irq < 0)
844 goto out; 774 goto out;
845 775
@@ -875,7 +805,7 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
875 irq = per_cpu(virq_to_irq, cpu)[virq]; 805 irq = per_cpu(virq_to_irq, cpu)[virq];
876 806
877 if (irq == -1) { 807 if (irq == -1) {
878 irq = find_unbound_irq(); 808 irq = xen_allocate_irq_dynamic();
879 809
880 set_irq_chip_and_handler_name(irq, &xen_percpu_chip, 810 set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
881 handle_percpu_irq, "virq"); 811 handle_percpu_irq, "virq");
@@ -934,7 +864,7 @@ static void unbind_from_irq(unsigned int irq)
934 if (irq_info[irq].type != IRQT_UNBOUND) { 864 if (irq_info[irq].type != IRQT_UNBOUND) {
935 irq_info[irq] = mk_unbound_info(); 865 irq_info[irq] = mk_unbound_info();
936 866
937 irq_free_desc(irq); 867 xen_free_irq(irq);
938 } 868 }
939 869
940 spin_unlock(&irq_mapping_update_lock); 870 spin_unlock(&irq_mapping_update_lock);
@@ -990,7 +920,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
990 if (irq < 0) 920 if (irq < 0)
991 return irq; 921 return irq;
992 922
993 irqflags |= IRQF_NO_SUSPEND; 923 irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME;
994 retval = request_irq(irq, handler, irqflags, devname, dev_id); 924 retval = request_irq(irq, handler, irqflags, devname, dev_id);
995 if (retval != 0) { 925 if (retval != 0) {
996 unbind_from_irq(irq); 926 unbind_from_irq(irq);
@@ -1234,11 +1164,12 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
1234 return 0; 1164 return 0;
1235} 1165}
1236 1166
1237static int set_affinity_irq(unsigned irq, const struct cpumask *dest) 1167static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
1168 bool force)
1238{ 1169{
1239 unsigned tcpu = cpumask_first(dest); 1170 unsigned tcpu = cpumask_first(dest);
1240 1171
1241 return rebind_irq_to_cpu(irq, tcpu); 1172 return rebind_irq_to_cpu(data->irq, tcpu);
1242} 1173}
1243 1174
1244int resend_irq_on_evtchn(unsigned int irq) 1175int resend_irq_on_evtchn(unsigned int irq)
@@ -1257,35 +1188,35 @@ int resend_irq_on_evtchn(unsigned int irq)
1257 return 1; 1188 return 1;
1258} 1189}
1259 1190
1260static void enable_dynirq(unsigned int irq) 1191static void enable_dynirq(struct irq_data *data)
1261{ 1192{
1262 int evtchn = evtchn_from_irq(irq); 1193 int evtchn = evtchn_from_irq(data->irq);
1263 1194
1264 if (VALID_EVTCHN(evtchn)) 1195 if (VALID_EVTCHN(evtchn))
1265 unmask_evtchn(evtchn); 1196 unmask_evtchn(evtchn);
1266} 1197}
1267 1198
1268static void disable_dynirq(unsigned int irq) 1199static void disable_dynirq(struct irq_data *data)
1269{ 1200{
1270 int evtchn = evtchn_from_irq(irq); 1201 int evtchn = evtchn_from_irq(data->irq);
1271 1202
1272 if (VALID_EVTCHN(evtchn)) 1203 if (VALID_EVTCHN(evtchn))
1273 mask_evtchn(evtchn); 1204 mask_evtchn(evtchn);
1274} 1205}
1275 1206
1276static void ack_dynirq(unsigned int irq) 1207static void ack_dynirq(struct irq_data *data)
1277{ 1208{
1278 int evtchn = evtchn_from_irq(irq); 1209 int evtchn = evtchn_from_irq(data->irq);
1279 1210
1280 move_masked_irq(irq); 1211 move_masked_irq(data->irq);
1281 1212
1282 if (VALID_EVTCHN(evtchn)) 1213 if (VALID_EVTCHN(evtchn))
1283 unmask_evtchn(evtchn); 1214 unmask_evtchn(evtchn);
1284} 1215}
1285 1216
1286static int retrigger_dynirq(unsigned int irq) 1217static int retrigger_dynirq(struct irq_data *data)
1287{ 1218{
1288 int evtchn = evtchn_from_irq(irq); 1219 int evtchn = evtchn_from_irq(data->irq);
1289 struct shared_info *sh = HYPERVISOR_shared_info; 1220 struct shared_info *sh = HYPERVISOR_shared_info;
1290 int ret = 0; 1221 int ret = 0;
1291 1222
@@ -1334,7 +1265,7 @@ static void restore_cpu_pirqs(void)
1334 1265
1335 printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); 1266 printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
1336 1267
1337 startup_pirq(irq); 1268 __startup_pirq(irq);
1338 } 1269 }
1339} 1270}
1340 1271
@@ -1445,7 +1376,6 @@ void xen_poll_irq(int irq)
1445void xen_irq_resume(void) 1376void xen_irq_resume(void)
1446{ 1377{
1447 unsigned int cpu, irq, evtchn; 1378 unsigned int cpu, irq, evtchn;
1448 struct irq_desc *desc;
1449 1379
1450 init_evtchn_cpu_bindings(); 1380 init_evtchn_cpu_bindings();
1451 1381
@@ -1465,66 +1395,48 @@ void xen_irq_resume(void)
1465 restore_cpu_ipis(cpu); 1395 restore_cpu_ipis(cpu);
1466 } 1396 }
1467 1397
1468 /*
1469 * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These
1470 * are not handled by the IRQ core.
1471 */
1472 for_each_irq_desc(irq, desc) {
1473 if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND))
1474 continue;
1475 if (desc->status & IRQ_DISABLED)
1476 continue;
1477
1478 evtchn = evtchn_from_irq(irq);
1479 if (evtchn == -1)
1480 continue;
1481
1482 unmask_evtchn(evtchn);
1483 }
1484
1485 restore_cpu_pirqs(); 1398 restore_cpu_pirqs();
1486} 1399}
1487 1400
1488static struct irq_chip xen_dynamic_chip __read_mostly = { 1401static struct irq_chip xen_dynamic_chip __read_mostly = {
1489 .name = "xen-dyn", 1402 .name = "xen-dyn",
1490 1403
1491 .disable = disable_dynirq, 1404 .irq_disable = disable_dynirq,
1492 .mask = disable_dynirq, 1405 .irq_mask = disable_dynirq,
1493 .unmask = enable_dynirq, 1406 .irq_unmask = enable_dynirq,
1494 1407
1495 .eoi = ack_dynirq, 1408 .irq_eoi = ack_dynirq,
1496 .set_affinity = set_affinity_irq, 1409 .irq_set_affinity = set_affinity_irq,
1497 .retrigger = retrigger_dynirq, 1410 .irq_retrigger = retrigger_dynirq,
1498}; 1411};
1499 1412
1500static struct irq_chip xen_pirq_chip __read_mostly = { 1413static struct irq_chip xen_pirq_chip __read_mostly = {
1501 .name = "xen-pirq", 1414 .name = "xen-pirq",
1502 1415
1503 .startup = startup_pirq, 1416 .irq_startup = startup_pirq,
1504 .shutdown = shutdown_pirq, 1417 .irq_shutdown = shutdown_pirq,
1505 1418
1506 .enable = enable_pirq, 1419 .irq_enable = enable_pirq,
1507 .unmask = enable_pirq, 1420 .irq_unmask = enable_pirq,
1508 1421
1509 .disable = disable_pirq, 1422 .irq_disable = disable_pirq,
1510 .mask = disable_pirq, 1423 .irq_mask = disable_pirq,
1511 1424
1512 .ack = ack_pirq, 1425 .irq_ack = ack_pirq,
1513 .end = end_pirq,
1514 1426
1515 .set_affinity = set_affinity_irq, 1427 .irq_set_affinity = set_affinity_irq,
1516 1428
1517 .retrigger = retrigger_dynirq, 1429 .irq_retrigger = retrigger_dynirq,
1518}; 1430};
1519 1431
1520static struct irq_chip xen_percpu_chip __read_mostly = { 1432static struct irq_chip xen_percpu_chip __read_mostly = {
1521 .name = "xen-percpu", 1433 .name = "xen-percpu",
1522 1434
1523 .disable = disable_dynirq, 1435 .irq_disable = disable_dynirq,
1524 .mask = disable_dynirq, 1436 .irq_mask = disable_dynirq,
1525 .unmask = enable_dynirq, 1437 .irq_unmask = enable_dynirq,
1526 1438
1527 .ack = ack_dynirq, 1439 .irq_ack = ack_dynirq,
1528}; 1440};
1529 1441
1530int xen_set_callback_via(uint64_t via) 1442int xen_set_callback_via(uint64_t via)
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 24177272bcb..ebb292859b5 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -34,42 +34,38 @@ enum shutdown_state {
34/* Ignore multiple shutdown requests. */ 34/* Ignore multiple shutdown requests. */
35static enum shutdown_state shutting_down = SHUTDOWN_INVALID; 35static enum shutdown_state shutting_down = SHUTDOWN_INVALID;
36 36
37#ifdef CONFIG_PM_SLEEP 37struct suspend_info {
38static int xen_hvm_suspend(void *data) 38 int cancelled;
39{ 39 unsigned long arg; /* extra hypercall argument */
40 int err; 40 void (*pre)(void);
41 struct sched_shutdown r = { .reason = SHUTDOWN_suspend }; 41 void (*post)(int cancelled);
42 int *cancelled = data; 42};
43
44 BUG_ON(!irqs_disabled());
45
46 err = sysdev_suspend(PMSG_SUSPEND);
47 if (err) {
48 printk(KERN_ERR "xen_hvm_suspend: sysdev_suspend failed: %d\n",
49 err);
50 return err;
51 }
52
53 *cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r);
54 43
55 xen_hvm_post_suspend(*cancelled); 44static void xen_hvm_post_suspend(int cancelled)
45{
46 xen_arch_hvm_post_suspend(cancelled);
56 gnttab_resume(); 47 gnttab_resume();
48}
57 49
58 if (!*cancelled) { 50static void xen_pre_suspend(void)
59 xen_irq_resume(); 51{
60 xen_console_resume(); 52 xen_mm_pin_all();
61 xen_timer_resume(); 53 gnttab_suspend();
62 } 54 xen_arch_pre_suspend();
63 55}
64 sysdev_resume();
65 56
66 return 0; 57static void xen_post_suspend(int cancelled)
58{
59 xen_arch_post_suspend(cancelled);
60 gnttab_resume();
61 xen_mm_unpin_all();
67} 62}
68 63
64#ifdef CONFIG_PM_SLEEP
69static int xen_suspend(void *data) 65static int xen_suspend(void *data)
70{ 66{
67 struct suspend_info *si = data;
71 int err; 68 int err;
72 int *cancelled = data;
73 69
74 BUG_ON(!irqs_disabled()); 70 BUG_ON(!irqs_disabled());
75 71
@@ -80,22 +76,20 @@ static int xen_suspend(void *data)
80 return err; 76 return err;
81 } 77 }
82 78
83 xen_mm_pin_all(); 79 if (si->pre)
84 gnttab_suspend(); 80 si->pre();
85 xen_pre_suspend();
86 81
87 /* 82 /*
88 * This hypercall returns 1 if suspend was cancelled 83 * This hypercall returns 1 if suspend was cancelled
89 * or the domain was merely checkpointed, and 0 if it 84 * or the domain was merely checkpointed, and 0 if it
90 * is resuming in a new domain. 85 * is resuming in a new domain.
91 */ 86 */
92 *cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); 87 si->cancelled = HYPERVISOR_suspend(si->arg);
93 88
94 xen_post_suspend(*cancelled); 89 if (si->post)
95 gnttab_resume(); 90 si->post(si->cancelled);
96 xen_mm_unpin_all();
97 91
98 if (!*cancelled) { 92 if (!si->cancelled) {
99 xen_irq_resume(); 93 xen_irq_resume();
100 xen_console_resume(); 94 xen_console_resume();
101 xen_timer_resume(); 95 xen_timer_resume();
@@ -109,7 +103,7 @@ static int xen_suspend(void *data)
109static void do_suspend(void) 103static void do_suspend(void)
110{ 104{
111 int err; 105 int err;
112 int cancelled = 1; 106 struct suspend_info si;
113 107
114 shutting_down = SHUTDOWN_SUSPEND; 108 shutting_down = SHUTDOWN_SUSPEND;
115 109
@@ -139,20 +133,29 @@ static void do_suspend(void)
139 goto out_resume; 133 goto out_resume;
140 } 134 }
141 135
142 if (xen_hvm_domain()) 136 si.cancelled = 1;
143 err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0)); 137
144 else 138 if (xen_hvm_domain()) {
145 err = stop_machine(xen_suspend, &cancelled, cpumask_of(0)); 139 si.arg = 0UL;
140 si.pre = NULL;
141 si.post = &xen_hvm_post_suspend;
142 } else {
143 si.arg = virt_to_mfn(xen_start_info);
144 si.pre = &xen_pre_suspend;
145 si.post = &xen_post_suspend;
146 }
147
148 err = stop_machine(xen_suspend, &si, cpumask_of(0));
146 149
147 dpm_resume_noirq(PMSG_RESUME); 150 dpm_resume_noirq(PMSG_RESUME);
148 151
149 if (err) { 152 if (err) {
150 printk(KERN_ERR "failed to start xen_suspend: %d\n", err); 153 printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
151 cancelled = 1; 154 si.cancelled = 1;
152 } 155 }
153 156
154out_resume: 157out_resume:
155 if (!cancelled) { 158 if (!si.cancelled) {
156 xen_arch_resume(); 159 xen_arch_resume();
157 xs_resume(); 160 xs_resume();
158 } else 161 } else
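The struct suspend_info refactor above collapses the separate PV and HVM suspend routines into one body parameterised by an extra hypercall argument and optional pre/post hooks. A small userspace model of the same shape, with HYPERVISOR_suspend() replaced by a stub that reports a cancelled suspend, shows how a single common routine serves both configurations; all function names here are illustrative, not kernel API.

#include <stdio.h>

struct suspend_info {
    int cancelled;
    unsigned long arg;          /* extra hypercall argument */
    void (*pre)(void);          /* optional: PV-only preparation */
    void (*post)(int cancelled);
};

static void pv_pre(void)   { puts("pin mm, suspend grant tables"); }
static void pv_post(int c) { printf("PV post-suspend, cancelled=%d\n", c); }
static void hvm_post(int c){ printf("HVM post-suspend, cancelled=%d\n", c); }

/* Stub for HYPERVISOR_suspend(): pretend the suspend was cancelled. */
static int fake_hypercall(unsigned long arg)
{
    (void)arg;
    return 1;
}

static int do_suspend_common(struct suspend_info *si)
{
    if (si->pre)
        si->pre();
    si->cancelled = fake_hypercall(si->arg);
    if (si->post)
        si->post(si->cancelled);
    return 0;
}

int main(void)
{
    struct suspend_info pv  = { .arg = 0x1234, .pre = pv_pre, .post = pv_post };
    struct suspend_info hvm = { .arg = 0,      .post = hvm_post };

    do_suspend_common(&pv);
    do_suspend_common(&hvm);
    return 0;
}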
@@ -172,12 +175,39 @@ out:
172} 175}
173#endif /* CONFIG_PM_SLEEP */ 176#endif /* CONFIG_PM_SLEEP */
174 177
178struct shutdown_handler {
179 const char *command;
180 void (*cb)(void);
181};
182
183static void do_poweroff(void)
184{
185 shutting_down = SHUTDOWN_POWEROFF;
186 orderly_poweroff(false);
187}
188
189static void do_reboot(void)
190{
191 shutting_down = SHUTDOWN_POWEROFF; /* ? */
192 ctrl_alt_del();
193}
194
175static void shutdown_handler(struct xenbus_watch *watch, 195static void shutdown_handler(struct xenbus_watch *watch,
176 const char **vec, unsigned int len) 196 const char **vec, unsigned int len)
177{ 197{
178 char *str; 198 char *str;
179 struct xenbus_transaction xbt; 199 struct xenbus_transaction xbt;
180 int err; 200 int err;
201 static struct shutdown_handler handlers[] = {
202 { "poweroff", do_poweroff },
203 { "halt", do_poweroff },
204 { "reboot", do_reboot },
205#ifdef CONFIG_PM_SLEEP
206 { "suspend", do_suspend },
207#endif
208 {NULL, NULL},
209 };
210 static struct shutdown_handler *handler;
181 211
182 if (shutting_down != SHUTDOWN_INVALID) 212 if (shutting_down != SHUTDOWN_INVALID)
183 return; 213 return;
@@ -194,7 +224,14 @@ static void shutdown_handler(struct xenbus_watch *watch,
194 return; 224 return;
195 } 225 }
196 226
197 xenbus_write(xbt, "control", "shutdown", ""); 227 for (handler = &handlers[0]; handler->command; handler++) {
228 if (strcmp(str, handler->command) == 0)
229 break;
230 }
231
232 /* Only acknowledge commands which we are prepared to handle. */
233 if (handler->cb)
234 xenbus_write(xbt, "control", "shutdown", "");
198 235
199 err = xenbus_transaction_end(xbt, 0); 236 err = xenbus_transaction_end(xbt, 0);
200 if (err == -EAGAIN) { 237 if (err == -EAGAIN) {
@@ -202,17 +239,8 @@ static void shutdown_handler(struct xenbus_watch *watch,
202 goto again; 239 goto again;
203 } 240 }
204 241
205 if (strcmp(str, "poweroff") == 0 || 242 if (handler->cb) {
206 strcmp(str, "halt") == 0) { 243 handler->cb();
207 shutting_down = SHUTDOWN_POWEROFF;
208 orderly_poweroff(false);
209 } else if (strcmp(str, "reboot") == 0) {
210 shutting_down = SHUTDOWN_POWEROFF; /* ? */
211 ctrl_alt_del();
212#ifdef CONFIG_PM_SLEEP
213 } else if (strcmp(str, "suspend") == 0) {
214 do_suspend();
215#endif
216 } else { 244 } else {
217 printk(KERN_INFO "Ignoring shutdown request: %s\n", str); 245 printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
218 shutting_down = SHUTDOWN_INVALID; 246 shutting_down = SHUTDOWN_INVALID;
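Moving from the if/else chain to a NULL-terminated handler table, as the hunks above do, turns "only acknowledge commands which we are prepared to handle" into a single pointer test: an unmatched command leaves the cursor on the sentinel entry, whose cb is NULL. A freestanding model of the lookup, with stub callbacks in place of the real poweroff/reboot actions:

#include <stdio.h>
#include <string.h>

struct shutdown_handler {
    const char *command;
    void (*cb)(void);
};

static void do_poweroff(void) { puts("poweroff"); }
static void do_reboot(void)   { puts("reboot"); }

static const struct shutdown_handler handlers[] = {
    { "poweroff", do_poweroff },
    { "halt",     do_poweroff },
    { "reboot",   do_reboot   },
    { NULL, NULL },             /* sentinel ends the scan */
};

static void dispatch(const char *str)
{
    const struct shutdown_handler *h;

    for (h = &handlers[0]; h->command; h++)
        if (strcmp(str, h->command) == 0)
            break;

    /* h->cb is NULL iff we fell off the sentinel: unknown command. */
    if (h->cb)
        h->cb();
    else
        printf("ignoring shutdown request: %s\n", str);
}

int main(void)
{
    dispatch("halt");
    dispatch("hibernate");      /* not in the table */
    return 0;
}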
@@ -291,27 +319,18 @@ static int shutdown_event(struct notifier_block *notifier,
291 return NOTIFY_DONE; 319 return NOTIFY_DONE;
292} 320}
293 321
294static int __init __setup_shutdown_event(void)
295{
296 /* Delay initialization in the PV on HVM case */
297 if (xen_hvm_domain())
298 return 0;
299
300 if (!xen_pv_domain())
301 return -ENODEV;
302
303 return xen_setup_shutdown_event();
304}
305
306int xen_setup_shutdown_event(void) 322int xen_setup_shutdown_event(void)
307{ 323{
308 static struct notifier_block xenstore_notifier = { 324 static struct notifier_block xenstore_notifier = {
309 .notifier_call = shutdown_event 325 .notifier_call = shutdown_event
310 }; 326 };
327
328 if (!xen_domain())
329 return -ENODEV;
311 register_xenstore_notifier(&xenstore_notifier); 330 register_xenstore_notifier(&xenstore_notifier);
312 331
313 return 0; 332 return 0;
314} 333}
315EXPORT_SYMBOL_GPL(xen_setup_shutdown_event); 334EXPORT_SYMBOL_GPL(xen_setup_shutdown_event);
316 335
317subsys_initcall(__setup_shutdown_event); 336subsys_initcall(xen_setup_shutdown_event);
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
index afbe041f42c..319dd0a94d5 100644
--- a/drivers/xen/platform-pci.c
+++ b/drivers/xen/platform-pci.c
@@ -156,9 +156,6 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
156 if (ret) 156 if (ret)
157 goto out; 157 goto out;
158 xenbus_probe(NULL); 158 xenbus_probe(NULL);
159 ret = xen_setup_shutdown_event();
160 if (ret)
161 goto out;
162 return 0; 159 return 0;
163 160
164out: 161out:
diff --git a/fs/Kconfig b/fs/Kconfig
index 3db9caa57ed..7cb53aafac1 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
47 def_bool n 47 def_bool n
48 48
49config EXPORTFS 49config EXPORTFS
50 tristate 50 bool
51 51
52config FILE_LOCKING 52config FILE_LOCKING
53 bool "Enable POSIX file locking API" if EXPERT 53 bool "Enable POSIX file locking API" if EXPERT
diff --git a/fs/Makefile b/fs/Makefile
index a7f7cef0c0c..ba01202844c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -48,6 +48,8 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
48obj-$(CONFIG_NFS_COMMON) += nfs_common/ 48obj-$(CONFIG_NFS_COMMON) += nfs_common/
49obj-$(CONFIG_GENERIC_ACL) += generic_acl.o 49obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
50 50
51obj-$(CONFIG_FHANDLE) += fhandle.o
52
51obj-y += quota/ 53obj-y += quota/
52 54
53obj-$(CONFIG_PROC_FS) += proc/ 55obj-$(CONFIG_PROC_FS) += proc/
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ff27d7a477b..b4ffad859ad 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
21 int len = *max_len; 21 int len = *max_len;
22 int type; 22 int type;
23 23
24 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || 24 if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
25 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) 25 *max_len = BTRFS_FID_SIZE_CONNECTABLE;
26 return 255; 26 return 255;
27 } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
28 *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 return 255;
30 }
27 31
28 len = BTRFS_FID_SIZE_NON_CONNECTABLE; 32 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 type = FILEID_BTRFS_WITHOUT_PARENT; 33 type = FILEID_BTRFS_WITHOUT_PARENT;
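This hunk, like the exportfs and fat changes further down, tightens the encode_fh contract: when the caller's buffer is too small, the encoder now writes the required length back through *max_len before returning 255, so the caller can size a second attempt correctly. A userspace sketch of that contract, with made-up handle sizes and a retrying caller:

#include <stdio.h>
#include <stdint.h>

#define FID_SIZE_NON_CONNECTABLE 2   /* illustrative, in 32-bit words */
#define FID_SIZE_CONNECTABLE     4

/* Returns a handle type on success, 255 on overflow (and fixes *max_len). */
static int encode_fh(uint32_t *fh, int *max_len, int connectable)
{
    int i;
    int need = connectable ? FID_SIZE_CONNECTABLE
                           : FID_SIZE_NON_CONNECTABLE;

    if (*max_len < need) {
        *max_len = need;         /* tell the caller how much it needs */
        return 255;
    }
    for (i = 0; i < need; i++)
        fh[i] = 0xabcd0000u + i; /* fake identifier words */
    *max_len = need;
    return 1;                    /* fake FILEID_* type */
}

int main(void)
{
    uint32_t fh[8];
    int len = 0;                 /* deliberately too small */

    if (encode_fh(fh, &len, 1) == 255) {
        printf("buffer too small, retrying with %d words\n", len);
        encode_fh(fh, &len, 1);
    }
    printf("encoded %d words\n", len);
    return 0;
}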
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9007bbd01db..4a0107e1874 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4806,9 +4806,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4806 int err; 4806 int err;
4807 int drop_inode = 0; 4807 int drop_inode = 0;
4808 4808
4809 if (inode->i_nlink == 0)
4810 return -ENOENT;
4811
4812 /* do not allow sys_link's with other subvols of the same device */ 4809 /* do not allow sys_link's with other subvols of the same device */
4813 if (root->objectid != BTRFS_I(inode)->root->objectid) 4810 if (root->objectid != BTRFS_I(inode)->root->objectid)
4814 return -EPERM; 4811 return -EPERM;
diff --git a/fs/compat.c b/fs/compat.c
index 691c3fd8ce1..c6d31a3bab8 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -262,35 +262,19 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
262 */ 262 */
263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) 263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
264{ 264{
265 struct path path; 265 struct kstatfs tmp;
266 int error; 266 int error = user_statfs(pathname, &tmp);
267 267 if (!error)
268 error = user_path(pathname, &path); 268 error = put_compat_statfs(buf, &tmp);
269 if (!error) {
270 struct kstatfs tmp;
271 error = vfs_statfs(&path, &tmp);
272 if (!error)
273 error = put_compat_statfs(buf, &tmp);
274 path_put(&path);
275 }
276 return error; 269 return error;
277} 270}
278 271
279asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf) 272asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf)
280{ 273{
281 struct file * file;
282 struct kstatfs tmp; 274 struct kstatfs tmp;
283 int error; 275 int error = fd_statfs(fd, &tmp);
284
285 error = -EBADF;
286 file = fget(fd);
287 if (!file)
288 goto out;
289 error = vfs_statfs(&file->f_path, &tmp);
290 if (!error) 276 if (!error)
291 error = put_compat_statfs(buf, &tmp); 277 error = put_compat_statfs(buf, &tmp);
292 fput(file);
293out:
294 return error; 278 return error;
295} 279}
296 280
@@ -329,41 +313,29 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
329 313
330asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) 314asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
331{ 315{
332 struct path path; 316 struct kstatfs tmp;
333 int error; 317 int error;
334 318
335 if (sz != sizeof(*buf)) 319 if (sz != sizeof(*buf))
336 return -EINVAL; 320 return -EINVAL;
337 321
338 error = user_path(pathname, &path); 322 error = user_statfs(pathname, &tmp);
339 if (!error) { 323 if (!error)
340 struct kstatfs tmp; 324 error = put_compat_statfs64(buf, &tmp);
341 error = vfs_statfs(&path, &tmp);
342 if (!error)
343 error = put_compat_statfs64(buf, &tmp);
344 path_put(&path);
345 }
346 return error; 325 return error;
347} 326}
348 327
349asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf) 328asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf)
350{ 329{
351 struct file * file;
352 struct kstatfs tmp; 330 struct kstatfs tmp;
353 int error; 331 int error;
354 332
355 if (sz != sizeof(*buf)) 333 if (sz != sizeof(*buf))
356 return -EINVAL; 334 return -EINVAL;
357 335
358 error = -EBADF; 336 error = fd_statfs(fd, &tmp);
359 file = fget(fd);
360 if (!file)
361 goto out;
362 error = vfs_statfs(&file->f_path, &tmp);
363 if (!error) 337 if (!error)
364 error = put_compat_statfs64(buf, &tmp); 338 error = put_compat_statfs64(buf, &tmp);
365 fput(file);
366out:
367 return error; 339 return error;
368} 340}
369 341
@@ -2312,3 +2284,16 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd,
2312} 2284}
2313 2285
2314#endif /* CONFIG_TIMERFD */ 2286#endif /* CONFIG_TIMERFD */
2287
2288#ifdef CONFIG_FHANDLE
2289/*
2290 * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
2291 * doesn't set the O_LARGEFILE flag.
2292 */
2293asmlinkage long
2294compat_sys_open_by_handle_at(int mountdirfd,
2295 struct file_handle __user *handle, int flags)
2296{
2297 return do_handle_open(mountdirfd, handle, flags);
2298}
2299#endif
diff --git a/fs/dcache.c b/fs/dcache.c
index 611ffe928c0..a39fe47c466 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -296,8 +296,12 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
296 __releases(parent->d_lock) 296 __releases(parent->d_lock)
297 __releases(dentry->d_inode->i_lock) 297 __releases(dentry->d_inode->i_lock)
298{ 298{
299 dentry->d_parent = NULL;
300 list_del(&dentry->d_u.d_child); 299 list_del(&dentry->d_u.d_child);
300 /*
301 * Inform try_to_ascend() that we are no longer attached to the
302 * dentry tree
303 */
304 dentry->d_flags |= DCACHE_DISCONNECTED;
301 if (parent) 305 if (parent)
302 spin_unlock(&parent->d_lock); 306 spin_unlock(&parent->d_lock);
303 dentry_iput(dentry); 307 dentry_iput(dentry);
@@ -1012,6 +1016,35 @@ void shrink_dcache_for_umount(struct super_block *sb)
1012} 1016}
1013 1017
1014/* 1018/*
1019 * This tries to ascend one level of parenthood, but
1020 * we can race with renaming, so we need to re-check
1021 * the parenthood after dropping the lock and check
1022 * that the sequence number still matches.
1023 */
1024static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq)
1025{
1026 struct dentry *new = old->d_parent;
1027
1028 rcu_read_lock();
1029 spin_unlock(&old->d_lock);
1030 spin_lock(&new->d_lock);
1031
1032 /*
1033 * might go back up the wrong parent if we have had a rename
1034 * or deletion
1035 */
1036 if (new != old->d_parent ||
1037 (old->d_flags & DCACHE_DISCONNECTED) ||
1038 (!locked && read_seqretry(&rename_lock, seq))) {
1039 spin_unlock(&new->d_lock);
1040 new = NULL;
1041 }
1042 rcu_read_unlock();
1043 return new;
1044}
1045
1046
1047/*
1015 * Search for at least 1 mount point in the dentry's subdirs. 1048 * Search for at least 1 mount point in the dentry's subdirs.
1016 * We descend to the next level whenever the d_subdirs 1049 * We descend to the next level whenever the d_subdirs
1017 * list is non-empty and continue searching. 1050 * list is non-empty and continue searching.
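try_to_ascend() exists because between dropping the child's d_lock and taking the parent's, a concurrent rename or delete can invalidate the cached parent pointer; the helper therefore re-validates under the new lock and returns NULL so the caller restarts the walk from the top. The toy below mimics that check-after-relock discipline with a plain version counter standing in for rename_lock's sequence count and a detached flag standing in for DCACHE_DISCONNECTED; it is single-threaded and purely illustrative.

#include <stdio.h>

struct node {
    struct node *parent;
    int detached;               /* stands in for DCACHE_DISCONNECTED */
};

static unsigned rename_seq;     /* bumped whenever the tree mutates */

/* Re-validate the cached parent after "relocking"; NULL means retry. */
static struct node *try_to_ascend(struct node *old, unsigned seq)
{
    struct node *new = old->parent;

    /* The kernel also re-reads d_parent under the new lock here. */
    if (old->detached || rename_seq != seq)
        return NULL;
    return new;
}

int main(void)
{
    struct node root = { 0 };
    struct node child = { .parent = &root };
    unsigned seq = rename_seq;

    printf("ascend ok: %p\n", (void *)try_to_ascend(&child, seq));

    rename_seq++;               /* a concurrent rename happened */
    printf("ascend after rename: %p (caller restarts)\n",
           (void *)try_to_ascend(&child, seq));
    return 0;
}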
@@ -1066,24 +1099,10 @@ resume:
1066 * All done at this level ... ascend and resume the search. 1099 * All done at this level ... ascend and resume the search.
1067 */ 1100 */
1068 if (this_parent != parent) { 1101 if (this_parent != parent) {
1069 struct dentry *tmp; 1102 struct dentry *child = this_parent;
1070 struct dentry *child; 1103 this_parent = try_to_ascend(this_parent, locked, seq);
1071 1104 if (!this_parent)
1072 tmp = this_parent->d_parent;
1073 rcu_read_lock();
1074 spin_unlock(&this_parent->d_lock);
1075 child = this_parent;
1076 this_parent = tmp;
1077 spin_lock(&this_parent->d_lock);
1078 /* might go back up the wrong parent if we have had a rename
1079 * or deletion */
1080 if (this_parent != child->d_parent ||
1081 (!locked && read_seqretry(&rename_lock, seq))) {
1082 spin_unlock(&this_parent->d_lock);
1083 rcu_read_unlock();
1084 goto rename_retry; 1105 goto rename_retry;
1085 }
1086 rcu_read_unlock();
1087 next = child->d_u.d_child.next; 1106 next = child->d_u.d_child.next;
1088 goto resume; 1107 goto resume;
1089 } 1108 }
@@ -1181,24 +1200,10 @@ resume:
1181 * All done at this level ... ascend and resume the search. 1200 * All done at this level ... ascend and resume the search.
1182 */ 1201 */
1183 if (this_parent != parent) { 1202 if (this_parent != parent) {
1184 struct dentry *tmp; 1203 struct dentry *child = this_parent;
1185 struct dentry *child; 1204 this_parent = try_to_ascend(this_parent, locked, seq);
1186 1205 if (!this_parent)
1187 tmp = this_parent->d_parent;
1188 rcu_read_lock();
1189 spin_unlock(&this_parent->d_lock);
1190 child = this_parent;
1191 this_parent = tmp;
1192 spin_lock(&this_parent->d_lock);
1193 /* might go back up the wrong parent if we have had a rename
1194 * or deletion */
1195 if (this_parent != child->d_parent ||
1196 (!locked && read_seqretry(&rename_lock, seq))) {
1197 spin_unlock(&this_parent->d_lock);
1198 rcu_read_unlock();
1199 goto rename_retry; 1206 goto rename_retry;
1200 }
1201 rcu_read_unlock();
1202 next = child->d_u.d_child.next; 1207 next = child->d_u.d_child.next;
1203 goto resume; 1208 goto resume;
1204 } 1209 }
@@ -2942,28 +2947,14 @@ resume:
2942 spin_unlock(&dentry->d_lock); 2947 spin_unlock(&dentry->d_lock);
2943 } 2948 }
2944 if (this_parent != root) { 2949 if (this_parent != root) {
2945 struct dentry *tmp; 2950 struct dentry *child = this_parent;
2946 struct dentry *child;
2947
2948 tmp = this_parent->d_parent;
2949 if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { 2951 if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
2950 this_parent->d_flags |= DCACHE_GENOCIDE; 2952 this_parent->d_flags |= DCACHE_GENOCIDE;
2951 this_parent->d_count--; 2953 this_parent->d_count--;
2952 } 2954 }
2953 rcu_read_lock(); 2955 this_parent = try_to_ascend(this_parent, locked, seq);
2954 spin_unlock(&this_parent->d_lock); 2956 if (!this_parent)
2955 child = this_parent;
2956 this_parent = tmp;
2957 spin_lock(&this_parent->d_lock);
2958 /* might go back up the wrong parent if we have had a rename
2959 * or deletion */
2960 if (this_parent != child->d_parent ||
2961 (!locked && read_seqretry(&rename_lock, seq))) {
2962 spin_unlock(&this_parent->d_lock);
2963 rcu_read_unlock();
2964 goto rename_retry; 2957 goto rename_retry;
2965 }
2966 rcu_read_unlock();
2967 next = child->d_u.d_child.next; 2958 next = child->d_u.d_child.next;
2968 goto resume; 2959 goto resume;
2969 } 2960 }
diff --git a/fs/exec.c b/fs/exec.c
index 52a447d9b6a..ba99e1abb1a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -115,13 +115,16 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
115 struct file *file; 115 struct file *file;
116 char *tmp = getname(library); 116 char *tmp = getname(library);
117 int error = PTR_ERR(tmp); 117 int error = PTR_ERR(tmp);
118 static const struct open_flags uselib_flags = {
119 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
120 .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
121 .intent = LOOKUP_OPEN
122 };
118 123
119 if (IS_ERR(tmp)) 124 if (IS_ERR(tmp))
120 goto out; 125 goto out;
121 126
122 file = do_filp_open(AT_FDCWD, tmp, 127 file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
123 O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
124 MAY_READ | MAY_EXEC | MAY_OPEN);
125 putname(tmp); 128 putname(tmp);
126 error = PTR_ERR(file); 129 error = PTR_ERR(file);
127 if (IS_ERR(file)) 130 if (IS_ERR(file))
@@ -721,10 +724,13 @@ struct file *open_exec(const char *name)
721{ 724{
722 struct file *file; 725 struct file *file;
723 int err; 726 int err;
727 static const struct open_flags open_exec_flags = {
728 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
729 .acc_mode = MAY_EXEC | MAY_OPEN,
730 .intent = LOOKUP_OPEN
731 };
724 732
725 file = do_filp_open(AT_FDCWD, name, 733 file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW);
726 O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
727 MAY_EXEC | MAY_OPEN);
728 if (IS_ERR(file)) 734 if (IS_ERR(file))
729 goto out; 735 goto out;
730 736
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 4b6825740dd..b05acb79613 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -320,9 +320,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid,
320 struct inode * inode = dentry->d_inode; 320 struct inode * inode = dentry->d_inode;
321 int len = *max_len; 321 int len = *max_len;
322 int type = FILEID_INO32_GEN; 322 int type = FILEID_INO32_GEN;
323 323
324 if (len < 2 || (connectable && len < 4)) 324 if (connectable && (len < 4)) {
325 *max_len = 4;
326 return 255;
327 } else if (len < 2) {
328 *max_len = 2;
325 return 255; 329 return 255;
330 }
326 331
327 len = 2; 332 len = 2;
328 fid->i32.ino = inode->i_ino; 333 fid->i32.ino = inode->i_ino;
@@ -369,6 +374,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
369 /* 374 /*
370 * Try to get any dentry for the given file handle from the filesystem. 375 * Try to get any dentry for the given file handle from the filesystem.
371 */ 376 */
377 if (!nop || !nop->fh_to_dentry)
378 return ERR_PTR(-ESTALE);
372 result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); 379 result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
373 if (!result) 380 if (!result)
374 result = ERR_PTR(-ESTALE); 381 result = ERR_PTR(-ESTALE);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index b27ba71810e..561f6925626 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2253,13 +2253,6 @@ static int ext3_link (struct dentry * old_dentry,
2253 2253
2254 dquot_initialize(dir); 2254 dquot_initialize(dir);
2255 2255
2256 /*
2257 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2258 * otherwise has the potential to corrupt the orphan inode list.
2259 */
2260 if (inode->i_nlink == 0)
2261 return -ENOENT;
2262
2263retry: 2256retry:
2264 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 2257 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2265 EXT3_INDEX_EXTRA_TRANS_BLOCKS); 2258 EXT3_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 85c8cc8f247..9cc19a1dea8 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1936,6 +1936,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1936 sb->s_qcop = &ext3_qctl_operations; 1936 sb->s_qcop = &ext3_qctl_operations;
1937 sb->dq_op = &ext3_quota_operations; 1937 sb->dq_op = &ext3_quota_operations;
1938#endif 1938#endif
1939 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
1939 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 1940 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
1940 mutex_init(&sbi->s_orphan_lock); 1941 mutex_init(&sbi->s_orphan_lock);
1941 mutex_init(&sbi->s_resize_lock); 1942 mutex_init(&sbi->s_resize_lock);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5485390d32c..e781b7ea563 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2304,13 +2304,6 @@ static int ext4_link(struct dentry *old_dentry,
2304 2304
2305 dquot_initialize(dir); 2305 dquot_initialize(dir);
2306 2306
2307 /*
2308 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2309 * otherwise has the potential to corrupt the orphan inode list.
2310 */
2311 if (inode->i_nlink == 0)
2312 return -ENOENT;
2313
2314retry: 2307retry:
2315 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2308 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2316 EXT4_INDEX_EXTRA_TRANS_BLOCKS); 2309 EXT4_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f6a318f836b..5977b356a43 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3415,6 +3415,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3415 sb->s_qcop = &ext4_qctl_operations; 3415 sb->s_qcop = &ext4_qctl_operations;
3416 sb->dq_op = &ext4_quota_operations; 3416 sb->dq_op = &ext4_quota_operations;
3417#endif 3417#endif
3418 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
3419
3418 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 3420 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
3419 mutex_init(&sbi->s_orphan_lock); 3421 mutex_init(&sbi->s_orphan_lock);
3420 mutex_init(&sbi->s_resize_lock); 3422 mutex_init(&sbi->s_resize_lock);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 86753fe10bd..0e277ec4b61 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -757,8 +757,10 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
757 struct inode *inode = de->d_inode; 757 struct inode *inode = de->d_inode;
758 u32 ipos_h, ipos_m, ipos_l; 758 u32 ipos_h, ipos_m, ipos_l;
759 759
760 if (len < 5) 760 if (len < 5) {
761 *lenp = 5;
761 return 255; /* no room */ 762 return 255; /* no room */
763 }
762 764
763 ipos_h = MSDOS_I(inode)->i_pos >> 8; 765 ipos_h = MSDOS_I(inode)->i_pos >> 8;
764 ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24; 766 ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index cb1026181bd..6c82e5bac03 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -131,7 +131,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
131SYSCALL_DEFINE1(dup, unsigned int, fildes) 131SYSCALL_DEFINE1(dup, unsigned int, fildes)
132{ 132{
133 int ret = -EBADF; 133 int ret = -EBADF;
134 struct file *file = fget(fildes); 134 struct file *file = fget_raw(fildes);
135 135
136 if (file) { 136 if (file) {
137 ret = get_unused_fd(); 137 ret = get_unused_fd();
@@ -426,15 +426,35 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
426 return err; 426 return err;
427} 427}
428 428
429static int check_fcntl_cmd(unsigned cmd)
430{
431 switch (cmd) {
432 case F_DUPFD:
433 case F_DUPFD_CLOEXEC:
434 case F_GETFD:
435 case F_SETFD:
436 case F_GETFL:
437 return 1;
438 }
439 return 0;
440}
441
429SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) 442SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
430{ 443{
431 struct file *filp; 444 struct file *filp;
432 long err = -EBADF; 445 long err = -EBADF;
433 446
434 filp = fget(fd); 447 filp = fget_raw(fd);
435 if (!filp) 448 if (!filp)
436 goto out; 449 goto out;
437 450
451 if (unlikely(filp->f_mode & FMODE_PATH)) {
452 if (!check_fcntl_cmd(cmd)) {
453 fput(filp);
454 goto out;
455 }
456 }
457
438 err = security_file_fcntl(filp, cmd, arg); 458 err = security_file_fcntl(filp, cmd, arg);
439 if (err) { 459 if (err) {
440 fput(filp); 460 fput(filp);
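This whitelist is what makes O_PATH descriptors usable while keeping everything else at -EBADF: only F_DUPFD, F_DUPFD_CLOEXEC, F_GETFD, F_SETFD and F_GETFL are permitted on a file opened in FMODE_PATH mode. On a kernel with O_PATH support (and a glibc new enough to define the flag) the asymmetry is visible from ordinary userspace; F_GETFL succeeds on an O_PATH descriptor while, for example, F_SETLK is refused:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
    struct flock fl = { .l_type = F_RDLCK, .l_whence = SEEK_SET };
    int fd = open("/", O_PATH | O_DIRECTORY);

    if (fd < 0) {
        perror("open(O_PATH)");  /* kernels before 2.6.39 lack O_PATH */
        return 1;
    }

    /* F_GETFL is on the whitelist, so this succeeds... */
    printf("F_GETFL -> %#x\n", fcntl(fd, F_GETFL));

    /* ...but anything off the whitelist gets -EBADF. */
    if (fcntl(fd, F_SETLK, &fl) < 0)
        printf("F_SETLK -> %s (as expected)\n", strerror(errno));

    close(fd);
    return 0;
}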
@@ -456,10 +476,17 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
456 long err; 476 long err;
457 477
458 err = -EBADF; 478 err = -EBADF;
459 filp = fget(fd); 479 filp = fget_raw(fd);
460 if (!filp) 480 if (!filp)
461 goto out; 481 goto out;
462 482
483 if (unlikely(filp->f_mode & FMODE_PATH)) {
484 if (!check_fcntl_cmd(cmd)) {
485 fput(filp);
486 goto out;
487 }
488 }
489
463 err = security_file_fcntl(filp, cmd, arg); 490 err = security_file_fcntl(filp, cmd, arg);
464 if (err) { 491 if (err) {
465 fput(filp); 492 fput(filp);
@@ -808,14 +835,14 @@ static int __init fcntl_init(void)
808 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY 835 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
809 * is defined as O_NONBLOCK on some platforms and not on others. 836 * is defined as O_NONBLOCK on some platforms and not on others.
810 */ 837 */
811 BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( 838 BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
812 O_RDONLY | O_WRONLY | O_RDWR | 839 O_RDONLY | O_WRONLY | O_RDWR |
813 O_CREAT | O_EXCL | O_NOCTTY | 840 O_CREAT | O_EXCL | O_NOCTTY |
814 O_TRUNC | O_APPEND | /* O_NONBLOCK | */ 841 O_TRUNC | O_APPEND | /* O_NONBLOCK | */
815 __O_SYNC | O_DSYNC | FASYNC | 842 __O_SYNC | O_DSYNC | FASYNC |
816 O_DIRECT | O_LARGEFILE | O_DIRECTORY | 843 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
817 O_NOFOLLOW | O_NOATIME | O_CLOEXEC | 844 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
818 __FMODE_EXEC 845 __FMODE_EXEC | O_PATH
819 )); 846 ));
820 847
821 fasync_cache = kmem_cache_create("fasync_cache", 848 fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/fs/fhandle.c b/fs/fhandle.c
new file mode 100644
index 00000000000..bf93ad2bee0
--- /dev/null
+++ b/fs/fhandle.c
@@ -0,0 +1,265 @@
1#include <linux/syscalls.h>
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/file.h>
5#include <linux/mount.h>
6#include <linux/namei.h>
7#include <linux/exportfs.h>
8#include <linux/fs_struct.h>
9#include <linux/fsnotify.h>
10#include <asm/uaccess.h>
11#include "internal.h"
12
13static long do_sys_name_to_handle(struct path *path,
14 struct file_handle __user *ufh,
15 int __user *mnt_id)
16{
17 long retval;
18 struct file_handle f_handle;
19 int handle_dwords, handle_bytes;
20 struct file_handle *handle = NULL;
21
22 /*
 23 * We need to make sure the file system
 24 * supports decoding of the file handle
25 */
26 if (!path->mnt->mnt_sb->s_export_op ||
27 !path->mnt->mnt_sb->s_export_op->fh_to_dentry)
28 return -EOPNOTSUPP;
29
30 if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
31 return -EFAULT;
32
33 if (f_handle.handle_bytes > MAX_HANDLE_SZ)
34 return -EINVAL;
35
36 handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
37 GFP_KERNEL);
38 if (!handle)
39 return -ENOMEM;
40
 41 /* convert handle size to a multiple of sizeof(u32) */
42 handle_dwords = f_handle.handle_bytes >> 2;
43
44 /* we ask for a non-connected handle */
45 retval = exportfs_encode_fh(path->dentry,
46 (struct fid *)handle->f_handle,
47 &handle_dwords, 0);
48 handle->handle_type = retval;
49 /* convert handle size to bytes */
50 handle_bytes = handle_dwords * sizeof(u32);
51 handle->handle_bytes = handle_bytes;
52 if ((handle->handle_bytes > f_handle.handle_bytes) ||
53 (retval == 255) || (retval == -ENOSPC)) {
54 /* As per the old exportfs_encode_fh documentation
55 * we could return ENOSPC to indicate overflow,
56 * but file systems have always returned 255, so
57 * handle both values.
58 */
59 /*
60 * set the handle size to zero so we copy only
61 * the non-variable part of the file_handle
62 */
63 handle_bytes = 0;
64 retval = -EOVERFLOW;
65 } else
66 retval = 0;
67 /* copy the mount id */
68 if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) ||
69 copy_to_user(ufh, handle,
70 sizeof(struct file_handle) + handle_bytes))
71 retval = -EFAULT;
72 kfree(handle);
73 return retval;
74}
75
76/**
77 * sys_name_to_handle_at: convert name to handle
78 * @dfd: directory relative to which name is interpreted if not absolute
79 * @name: name that should be converted to handle.
80 * @handle: resulting file handle
81 * @mnt_id: mount id of the file system containing the file
82 * @flag: flag value to indicate whether to follow symlinks or not
83 *
84 * @handle->handle_bytes indicates the space available to store the
85 * variable part of the file handle in bytes. If there is not
86 * enough space, the field is updated to return the minimum
87 * value required.
88 */
89SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
90 struct file_handle __user *, handle, int __user *, mnt_id,
91 int, flag)
92{
93 struct path path;
94 int lookup_flags;
95 int err;
96
97 if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
98 return -EINVAL;
99
100 lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
101 if (flag & AT_EMPTY_PATH)
102 lookup_flags |= LOOKUP_EMPTY;
103 err = user_path_at(dfd, name, lookup_flags, &path);
104 if (!err) {
105 err = do_sys_name_to_handle(&path, handle, mnt_id);
106 path_put(&path);
107 }
108 return err;
109}
110
111static struct vfsmount *get_vfsmount_from_fd(int fd)
112{
113 struct path path;
114
115 if (fd == AT_FDCWD) {
116 struct fs_struct *fs = current->fs;
117 spin_lock(&fs->lock);
118 path = fs->pwd;
119 mntget(path.mnt);
120 spin_unlock(&fs->lock);
121 } else {
122 int fput_needed;
123 struct file *file = fget_light(fd, &fput_needed);
124 if (!file)
125 return ERR_PTR(-EBADF);
126 path = file->f_path;
127 mntget(path.mnt);
128 fput_light(file, fput_needed);
129 }
130 return path.mnt;
131}
132
133static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
134{
135 return 1;
136}
137
138static int do_handle_to_path(int mountdirfd, struct file_handle *handle,
139 struct path *path)
140{
141 int retval = 0;
142 int handle_dwords;
143
144 path->mnt = get_vfsmount_from_fd(mountdirfd);
145 if (IS_ERR(path->mnt)) {
146 retval = PTR_ERR(path->mnt);
147 goto out_err;
148 }
149 /* change the handle size to a multiple of sizeof(u32) */
150 handle_dwords = handle->handle_bytes >> 2;
151 path->dentry = exportfs_decode_fh(path->mnt,
152 (struct fid *)handle->f_handle,
153 handle_dwords, handle->handle_type,
154 vfs_dentry_acceptable, NULL);
155 if (IS_ERR(path->dentry)) {
156 retval = PTR_ERR(path->dentry);
157 goto out_mnt;
158 }
159 return 0;
160out_mnt:
161 mntput(path->mnt);
162out_err:
163 return retval;
164}
165
166static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
167 struct path *path)
168{
169 int retval = 0;
170 struct file_handle f_handle;
171 struct file_handle *handle = NULL;
172
173 /*
174 * With a handle we don't look at the execute bit on
175 * the directory. Ideally we would like CAP_DAC_SEARCH,
176 * but we don't have that.
177 */
178 if (!capable(CAP_DAC_READ_SEARCH)) {
179 retval = -EPERM;
180 goto out_err;
181 }
182 if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
183 retval = -EFAULT;
184 goto out_err;
185 }
186 if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
187 (f_handle.handle_bytes == 0)) {
188 retval = -EINVAL;
189 goto out_err;
190 }
191 handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
192 GFP_KERNEL);
193 if (!handle) {
194 retval = -ENOMEM;
195 goto out_err;
196 }
197 /* copy the full handle */
198 if (copy_from_user(handle, ufh,
199 sizeof(struct file_handle) +
200 f_handle.handle_bytes)) {
201 retval = -EFAULT;
202 goto out_handle;
203 }
204
205 retval = do_handle_to_path(mountdirfd, handle, path);
206
207out_handle:
208 kfree(handle);
209out_err:
210 return retval;
211}
212
213long do_handle_open(int mountdirfd,
214 struct file_handle __user *ufh, int open_flag)
215{
216 long retval = 0;
217 struct path path;
218 struct file *file;
219 int fd;
220
221 retval = handle_to_path(mountdirfd, ufh, &path);
222 if (retval)
223 return retval;
224
225 fd = get_unused_fd_flags(open_flag);
226 if (fd < 0) {
227 path_put(&path);
228 return fd;
229 }
230 file = file_open_root(path.dentry, path.mnt, "", open_flag);
231 if (IS_ERR(file)) {
232 put_unused_fd(fd);
233 retval = PTR_ERR(file);
234 } else {
235 retval = fd;
236 fsnotify_open(file);
237 fd_install(fd, file);
238 }
239 path_put(&path);
240 return retval;
241}
242
243/**
244 * sys_open_by_handle_at: Open the file handle
245 * @mountdirfd: directory file descriptor
246 * @handle: file handle to be opened
247 * @flags: open flags.
248 *
249 * @mountdirfd indicates the directory file descriptor
250 * of the mount point. The file handle is decoded relative
251 * to the vfsmount pointed to by @mountdirfd. The @flags
252 * value is the same as the open(2) flags.
253 */
254SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
255 struct file_handle __user *, handle,
256 int, flags)
257{
258 long ret;
259
260 if (force_o_largefile())
261 flags |= O_LARGEFILE;
262
263 ret = do_handle_open(mountdirfd, handle, flags);
264 return ret;
265}
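Taken together, the new fs/fhandle.c gives userspace a two-step API: name_to_handle_at() encodes a handle plus mount id, and open_by_handle_at() (gated on CAP_DAC_READ_SEARCH in handle_to_path() above) decodes it back into an open file. A hedged usage sketch, assuming a libc that does not yet wrap the syscalls and that the __NR_* numbers are present in the installed headers; struct file_handle is declared locally to mirror the kernel ABI.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

#define MAX_HANDLE_SZ 128	/* assumed to match the kernel's limit */

struct file_handle {
	unsigned int handle_bytes;	/* in: space available; out: space used */
	int handle_type;
	unsigned char f_handle[0];
};

int main(int argc, char **argv)
{
	struct file_handle *h;
	int mnt_id, mfd, fd;

	if (argc < 2)
		return 2;
	h = malloc(sizeof(*h) + MAX_HANDLE_SZ);
	if (!h)
		return 2;
	h->handle_bytes = MAX_HANDLE_SZ;
	if (syscall(__NR_name_to_handle_at, AT_FDCWD, argv[1],
		    h, &mnt_id, 0) < 0) {
		perror("name_to_handle_at");
		return 1;
	}
	/* any descriptor on the same mount will do for decoding;
	 * using "/" assumes the file lives on the root mount */
	mfd = open("/", O_RDONLY | O_DIRECTORY);
	fd = syscall(__NR_open_by_handle_at, mfd, h, O_RDONLY);
	if (fd < 0)
		perror("open_by_handle_at");	/* EPERM without CAP_DAC_READ_SEARCH */
	return fd < 0;
}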
diff --git a/fs/file_table.c b/fs/file_table.c
index eb36b6b17e2..74a9544ac77 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -276,11 +276,10 @@ struct file *fget(unsigned int fd)
276 rcu_read_lock(); 276 rcu_read_lock();
277 file = fcheck_files(files, fd); 277 file = fcheck_files(files, fd);
278 if (file) { 278 if (file) {
279 if (!atomic_long_inc_not_zero(&file->f_count)) { 279 /* File object ref couldn't be taken */
280 /* File object ref couldn't be taken */ 280 if (file->f_mode & FMODE_PATH ||
281 rcu_read_unlock(); 281 !atomic_long_inc_not_zero(&file->f_count))
282 return NULL; 282 file = NULL;
283 }
284 } 283 }
285 rcu_read_unlock(); 284 rcu_read_unlock();
286 285
@@ -289,6 +288,25 @@ struct file *fget(unsigned int fd)
289 288
290EXPORT_SYMBOL(fget); 289EXPORT_SYMBOL(fget);
291 290
291struct file *fget_raw(unsigned int fd)
292{
293 struct file *file;
294 struct files_struct *files = current->files;
295
296 rcu_read_lock();
297 file = fcheck_files(files, fd);
298 if (file) {
299 /* File object ref couldn't be taken */
300 if (!atomic_long_inc_not_zero(&file->f_count))
301 file = NULL;
302 }
303 rcu_read_unlock();
304
305 return file;
306}
307
308EXPORT_SYMBOL(fget_raw);
309
292/* 310/*
293 * Lightweight file lookup - no refcnt increment if fd table isn't shared. 311 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
294 * 312 *
@@ -313,6 +331,33 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
313 *fput_needed = 0; 331 *fput_needed = 0;
314 if (atomic_read(&files->count) == 1) { 332 if (atomic_read(&files->count) == 1) {
315 file = fcheck_files(files, fd); 333 file = fcheck_files(files, fd);
334 if (file && (file->f_mode & FMODE_PATH))
335 file = NULL;
336 } else {
337 rcu_read_lock();
338 file = fcheck_files(files, fd);
339 if (file) {
340 if (!(file->f_mode & FMODE_PATH) &&
341 atomic_long_inc_not_zero(&file->f_count))
342 *fput_needed = 1;
343 else
344 /* Didn't get the reference; the file is already being freed */
345 file = NULL;
346 }
347 rcu_read_unlock();
348 }
349
350 return file;
351}
352
353struct file *fget_raw_light(unsigned int fd, int *fput_needed)
354{
355 struct file *file;
356 struct files_struct *files = current->files;
357
358 *fput_needed = 0;
359 if (atomic_read(&files->count) == 1) {
360 file = fcheck_files(files, fd);
316 } else { 361 } else {
317 rcu_read_lock(); 362 rcu_read_lock();
318 file = fcheck_files(files, fd); 363 file = fcheck_files(files, fd);
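fget_raw() and fget_raw_light() are the variants that let FMODE_PATH files through, leaving the O_PATH policy decision to the caller, while plain fget()/fget_light() now hide such descriptors entirely. A hedged kernel-style sketch of a caller opting in; the syscall itself is hypothetical.

#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/fs.h>

/* hypothetical syscall: reports whether an fd was opened with O_PATH */
SYSCALL_DEFINE1(example_is_path_fd, unsigned int, fd)
{
	struct file *filp = fget_raw(fd);	/* keeps FMODE_PATH files */
	long ret;

	if (!filp)
		return -EBADF;
	ret = (filp->f_mode & FMODE_PATH) != 0;
	fput(filp);
	return ret;
}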
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9e3f68cc1bd..051b1a08452 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -637,8 +637,10 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
637 u64 nodeid; 637 u64 nodeid;
638 u32 generation; 638 u32 generation;
639 639
640 if (*max_len < len) 640 if (*max_len < len) {
641 *max_len = len;
641 return 255; 642 return 255;
643 }
642 644
643 nodeid = get_fuse_inode(inode)->nodeid; 645 nodeid = get_fuse_inode(inode)->nodeid;
644 generation = inode->i_generation; 646 generation = inode->i_generation;
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9023db8184f..b5a5e60df0d 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -36,9 +36,13 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
36 struct super_block *sb = inode->i_sb; 36 struct super_block *sb = inode->i_sb;
37 struct gfs2_inode *ip = GFS2_I(inode); 37 struct gfs2_inode *ip = GFS2_I(inode);
38 38
39 if (*len < GFS2_SMALL_FH_SIZE || 39 if (connectable && (*len < GFS2_LARGE_FH_SIZE)) {
40 (connectable && *len < GFS2_LARGE_FH_SIZE)) 40 *len = GFS2_LARGE_FH_SIZE;
41 return 255; 41 return 255;
42 } else if (*len < GFS2_SMALL_FH_SIZE) {
43 *len = GFS2_SMALL_FH_SIZE;
44 return 255;
45 }
42 46
43 fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); 47 fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
44 fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); 48 fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
diff --git a/fs/inode.c b/fs/inode.c
index 0647d80accf..9910c039f02 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -84,16 +84,13 @@ static struct hlist_head *inode_hashtable __read_mostly;
84DEFINE_SPINLOCK(inode_lock); 84DEFINE_SPINLOCK(inode_lock);
85 85
86/* 86/*
87 * iprune_sem provides exclusion between the kswapd or try_to_free_pages 87 * iprune_sem provides exclusion between the icache shrinking and the
88 * icache shrinking path, and the umount path. Without this exclusion, 88 * umount path.
89 * by the time prune_icache calls iput for the inode whose pages it has
90 * been invalidating, or by the time it calls clear_inode & destroy_inode
91 * from its final dispose_list, the struct super_block they refer to
92 * (for inode->i_sb->s_op) may already have been freed and reused.
93 * 89 *
94 * We make this an rwsem because the fastpath is icache shrinking. In 90 * We don't actually need it to protect anything in the umount path,
95 * some cases a filesystem may be doing a significant amount of work in 91 * but only need to cycle through it to make sure any inode that
96 * its inode reclaim code, so this should improve parallelism. 92 * prune_icache took off the LRU list has been fully torn down by the
93 * time we are past evict_inodes.
97 */ 94 */
98static DECLARE_RWSEM(iprune_sem); 95static DECLARE_RWSEM(iprune_sem);
99 96
@@ -516,17 +513,12 @@ void evict_inodes(struct super_block *sb)
516 struct inode *inode, *next; 513 struct inode *inode, *next;
517 LIST_HEAD(dispose); 514 LIST_HEAD(dispose);
518 515
519 down_write(&iprune_sem);
520
521 spin_lock(&inode_lock); 516 spin_lock(&inode_lock);
522 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 517 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
523 if (atomic_read(&inode->i_count)) 518 if (atomic_read(&inode->i_count))
524 continue; 519 continue;
525 520 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
526 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
527 WARN_ON(1);
528 continue; 521 continue;
529 }
530 522
531 inode->i_state |= I_FREEING; 523 inode->i_state |= I_FREEING;
532 524
@@ -542,6 +534,13 @@ void evict_inodes(struct super_block *sb)
542 spin_unlock(&inode_lock); 534 spin_unlock(&inode_lock);
543 535
544 dispose_list(&dispose); 536 dispose_list(&dispose);
537
538 /*
539 * Cycle through iprune_sem to make sure any inode that prune_icache
540 * moved off the list before we took the lock has been fully torn
541 * down.
542 */
543 down_write(&iprune_sem);
545 up_write(&iprune_sem); 544 up_write(&iprune_sem);
546} 545}
547 546
@@ -561,8 +560,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
561 struct inode *inode, *next; 560 struct inode *inode, *next;
562 LIST_HEAD(dispose); 561 LIST_HEAD(dispose);
563 562
564 down_write(&iprune_sem);
565
566 spin_lock(&inode_lock); 563 spin_lock(&inode_lock);
567 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 564 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
568 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) 565 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
@@ -590,7 +587,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
590 spin_unlock(&inode_lock); 587 spin_unlock(&inode_lock);
591 588
592 dispose_list(&dispose); 589 dispose_list(&dispose);
593 up_write(&iprune_sem);
594 590
595 return busy; 591 return busy;
596} 592}
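The rewritten iprune_sem comment describes a pure barrier: evict_inodes() no longer holds the semaphore across its own work, it just cycles through the write side so that any prune_icache reader still in flight must finish first. A hedged illustration of the idiom in isolation:

#include <linux/rwsem.h>

static DECLARE_RWSEM(barrier_sem);

static void reader_side(void)		/* e.g. the icache shrinker */
{
	down_read(&barrier_sem);
	/* work the waiter must observe as completed */
	up_read(&barrier_sem);
}

static void wait_for_readers(void)	/* e.g. the umount path */
{
	/*
	 * Nothing is protected here; acquiring the write side merely
	 * waits until every reader that got in first has dropped it.
	 */
	down_write(&barrier_sem);
	up_write(&barrier_sem);
}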
diff --git a/fs/internal.h b/fs/internal.h
index 9b976b57d7f..f3d15de44b1 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -106,6 +106,19 @@ extern void put_super(struct super_block *sb);
106struct nameidata; 106struct nameidata;
107extern struct file *nameidata_to_filp(struct nameidata *); 107extern struct file *nameidata_to_filp(struct nameidata *);
108extern void release_open_intent(struct nameidata *); 108extern void release_open_intent(struct nameidata *);
109struct open_flags {
110 int open_flag;
111 int mode;
112 int acc_mode;
113 int intent;
114};
115extern struct file *do_filp_open(int dfd, const char *pathname,
116 const struct open_flags *op, int lookup_flags);
117extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
118 const char *, const struct open_flags *, int lookup_flags);
119
120extern long do_handle_open(int mountdirfd,
121 struct file_handle __user *ufh, int open_flag);
109 122
110/* 123/*
111 * inode.c 124 * inode.c
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index ed752cb3847..dd4687ff30d 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -124,9 +124,13 @@ isofs_export_encode_fh(struct dentry *dentry,
124 * offset of the inode and the upper 16 bits of fh32[1] to 124 * offset of the inode and the upper 16 bits of fh32[1] to
125 * hold the offset of the parent. 125 * hold the offset of the parent.
126 */ 126 */
127 127 if (connectable && (len < 5)) {
128 if (len < 3 || (connectable && len < 5)) 128 *max_len = 5;
129 return 255;
130 } else if (len < 3) {
131 *max_len = 3;
129 return 255; 132 return 255;
133 }
130 134
131 len = 3; 135 len = 3;
132 fh32[0] = ei->i_iget5_block; 136 fh32[0] = ei->i_iget5_block;
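The fuse, gfs2 and isofs hunks all converge on the same convention: when the caller's buffer is too small, ->encode_fh() stores the required length (in 4-byte words) through *max_len before returning 255, which is what lets do_sys_name_to_handle() report the minimum handle size back to userspace. A hedged sketch of a minimal encoder following that convention; myfs and its handle layout are hypothetical.

#include <linux/exportfs.h>
#include <linux/fs.h>

#define MYFS_FH_LEN 2	/* hypothetical: ino + generation, in u32 words */

static int myfs_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len,
			  int connectable)
{
	struct inode *inode = dentry->d_inode;

	if (*max_len < MYFS_FH_LEN) {
		*max_len = MYFS_FH_LEN;	/* tell the caller what we need */
		return 255;		/* historical overflow value */
	}
	fh[0] = inode->i_ino;		/* truncated to 32 bits in this sketch */
	fh[1] = inode->i_generation;
	*max_len = MYFS_FH_LEN;
	return 1;			/* hypothetical handle type; connectable ignored */
}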
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 5a2b269428a..3f04a180493 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -809,9 +809,6 @@ static int jfs_link(struct dentry *old_dentry,
809 if (ip->i_nlink == JFS_LINK_MAX) 809 if (ip->i_nlink == JFS_LINK_MAX)
810 return -EMLINK; 810 return -EMLINK;
811 811
812 if (ip->i_nlink == 0)
813 return -ENOENT;
814
815 dquot_initialize(dir); 812 dquot_initialize(dir);
816 813
817 tid = txBegin(ip->i_sb, 0); 814 tid = txBegin(ip->i_sb, 0);
diff --git a/fs/namei.c b/fs/namei.c
index a4689eb2df2..b912b7abe74 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page)
136 return retval; 136 return retval;
137} 137}
138 138
139char * getname(const char __user * filename) 139static char *getname_flags(const char __user * filename, int flags)
140{ 140{
141 char *tmp, *result; 141 char *tmp, *result;
142 142
@@ -147,14 +147,21 @@ char * getname(const char __user * filename)
147 147
148 result = tmp; 148 result = tmp;
149 if (retval < 0) { 149 if (retval < 0) {
150 __putname(tmp); 150 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
151 result = ERR_PTR(retval); 151 __putname(tmp);
152 result = ERR_PTR(retval);
153 }
152 } 154 }
153 } 155 }
154 audit_getname(result); 156 audit_getname(result);
155 return result; 157 return result;
156} 158}
157 159
160char *getname(const char __user * filename)
161{
162 return getname_flags(filename, 0);
163}
164
158#ifdef CONFIG_AUDITSYSCALL 165#ifdef CONFIG_AUDITSYSCALL
159void putname(const char *name) 166void putname(const char *name)
160{ 167{
@@ -401,9 +408,11 @@ static int nameidata_drop_rcu(struct nameidata *nd)
401{ 408{
402 struct fs_struct *fs = current->fs; 409 struct fs_struct *fs = current->fs;
403 struct dentry *dentry = nd->path.dentry; 410 struct dentry *dentry = nd->path.dentry;
411 int want_root = 0;
404 412
405 BUG_ON(!(nd->flags & LOOKUP_RCU)); 413 BUG_ON(!(nd->flags & LOOKUP_RCU));
406 if (nd->root.mnt) { 414 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
415 want_root = 1;
407 spin_lock(&fs->lock); 416 spin_lock(&fs->lock);
408 if (nd->root.mnt != fs->root.mnt || 417 if (nd->root.mnt != fs->root.mnt ||
409 nd->root.dentry != fs->root.dentry) 418 nd->root.dentry != fs->root.dentry)
@@ -414,7 +423,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
414 goto err; 423 goto err;
415 BUG_ON(nd->inode != dentry->d_inode); 424 BUG_ON(nd->inode != dentry->d_inode);
416 spin_unlock(&dentry->d_lock); 425 spin_unlock(&dentry->d_lock);
417 if (nd->root.mnt) { 426 if (want_root) {
418 path_get(&nd->root); 427 path_get(&nd->root);
419 spin_unlock(&fs->lock); 428 spin_unlock(&fs->lock);
420 } 429 }
@@ -427,7 +436,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
427err: 436err:
428 spin_unlock(&dentry->d_lock); 437 spin_unlock(&dentry->d_lock);
429err_root: 438err_root:
430 if (nd->root.mnt) 439 if (want_root)
431 spin_unlock(&fs->lock); 440 spin_unlock(&fs->lock);
432 return -ECHILD; 441 return -ECHILD;
433} 442}
@@ -454,9 +463,11 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
454{ 463{
455 struct fs_struct *fs = current->fs; 464 struct fs_struct *fs = current->fs;
456 struct dentry *parent = nd->path.dentry; 465 struct dentry *parent = nd->path.dentry;
466 int want_root = 0;
457 467
458 BUG_ON(!(nd->flags & LOOKUP_RCU)); 468 BUG_ON(!(nd->flags & LOOKUP_RCU));
459 if (nd->root.mnt) { 469 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
470 want_root = 1;
460 spin_lock(&fs->lock); 471 spin_lock(&fs->lock);
461 if (nd->root.mnt != fs->root.mnt || 472 if (nd->root.mnt != fs->root.mnt ||
462 nd->root.dentry != fs->root.dentry) 473 nd->root.dentry != fs->root.dentry)
@@ -476,7 +487,7 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
476 parent->d_count++; 487 parent->d_count++;
477 spin_unlock(&dentry->d_lock); 488 spin_unlock(&dentry->d_lock);
478 spin_unlock(&parent->d_lock); 489 spin_unlock(&parent->d_lock);
479 if (nd->root.mnt) { 490 if (want_root) {
480 path_get(&nd->root); 491 path_get(&nd->root);
481 spin_unlock(&fs->lock); 492 spin_unlock(&fs->lock);
482 } 493 }
@@ -490,7 +501,7 @@ err:
490 spin_unlock(&dentry->d_lock); 501 spin_unlock(&dentry->d_lock);
491 spin_unlock(&parent->d_lock); 502 spin_unlock(&parent->d_lock);
492err_root: 503err_root:
493 if (nd->root.mnt) 504 if (want_root)
494 spin_unlock(&fs->lock); 505 spin_unlock(&fs->lock);
495 return -ECHILD; 506 return -ECHILD;
496} 507}
@@ -498,8 +509,16 @@ err_root:
498/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ 509/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
499static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) 510static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
500{ 511{
501 if (nd->flags & LOOKUP_RCU) 512 if (nd->flags & LOOKUP_RCU) {
502 return nameidata_dentry_drop_rcu(nd, dentry); 513 if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
514 nd->flags &= ~LOOKUP_RCU;
515 if (!(nd->flags & LOOKUP_ROOT))
516 nd->root.mnt = NULL;
517 rcu_read_unlock();
518 br_read_unlock(vfsmount_lock);
519 return -ECHILD;
520 }
521 }
503 return 0; 522 return 0;
504} 523}
505 524
@@ -518,7 +537,8 @@ static int nameidata_drop_rcu_last(struct nameidata *nd)
518 537
519 BUG_ON(!(nd->flags & LOOKUP_RCU)); 538 BUG_ON(!(nd->flags & LOOKUP_RCU));
520 nd->flags &= ~LOOKUP_RCU; 539 nd->flags &= ~LOOKUP_RCU;
521 nd->root.mnt = NULL; 540 if (!(nd->flags & LOOKUP_ROOT))
541 nd->root.mnt = NULL;
522 spin_lock(&dentry->d_lock); 542 spin_lock(&dentry->d_lock);
523 if (!__d_rcu_to_refcount(dentry, nd->seq)) 543 if (!__d_rcu_to_refcount(dentry, nd->seq))
524 goto err_unlock; 544 goto err_unlock;
@@ -539,14 +559,6 @@ err_unlock:
539 return -ECHILD; 559 return -ECHILD;
540} 560}
541 561
542/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
543static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
544{
545 if (likely(nd->flags & LOOKUP_RCU))
546 return nameidata_drop_rcu_last(nd);
547 return 0;
548}
549
550/** 562/**
551 * release_open_intent - free up open intent resources 563 * release_open_intent - free up open intent resources
552 * @nd: pointer to nameidata 564 * @nd: pointer to nameidata
@@ -590,42 +602,8 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
590 return dentry; 602 return dentry;
591} 603}
592 604
593static inline struct dentry *
594do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd)
595{
596 int status = d_revalidate(dentry, nd);
597 if (likely(status > 0))
598 return dentry;
599 if (status == -ECHILD) {
600 if (nameidata_dentry_drop_rcu(nd, dentry))
601 return ERR_PTR(-ECHILD);
602 return do_revalidate(dentry, nd);
603 }
604 if (status < 0)
605 return ERR_PTR(status);
606 /* Don't d_invalidate in rcu-walk mode */
607 if (nameidata_dentry_drop_rcu(nd, dentry))
608 return ERR_PTR(-ECHILD);
609 if (!d_invalidate(dentry)) {
610 dput(dentry);
611 dentry = NULL;
612 }
613 return dentry;
614}
615
616static inline int need_reval_dot(struct dentry *dentry)
617{
618 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
619 return 0;
620
621 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
622 return 0;
623
624 return 1;
625}
626
627/* 605/*
628 * force_reval_path - force revalidation of a dentry 606 * handle_reval_path - force revalidation of a dentry
629 * 607 *
630 * In some situations the path walking code will trust dentries without 608 * In some situations the path walking code will trust dentries without
631 * revalidating them. This causes problems for filesystems that depend on 609 * revalidating them. This causes problems for filesystems that depend on
@@ -639,27 +617,28 @@ static inline int need_reval_dot(struct dentry *dentry)
639 * invalidate the dentry. It's up to the caller to handle putting references 617 * invalidate the dentry. It's up to the caller to handle putting references
640 * to the path if necessary. 618 * to the path if necessary.
641 */ 619 */
642static int 620static inline int handle_reval_path(struct nameidata *nd)
643force_reval_path(struct path *path, struct nameidata *nd)
644{ 621{
622 struct dentry *dentry = nd->path.dentry;
645 int status; 623 int status;
646 struct dentry *dentry = path->dentry;
647 624
648 /* 625 if (likely(!(nd->flags & LOOKUP_JUMPED)))
649 * only check on filesystems where it's possible for the dentry to 626 return 0;
650 * become stale. 627
651 */ 628 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
652 if (!need_reval_dot(dentry)) 629 return 0;
630
631 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
653 return 0; 632 return 0;
654 633
634 /* Note: we do not d_invalidate() */
655 status = d_revalidate(dentry, nd); 635 status = d_revalidate(dentry, nd);
656 if (status > 0) 636 if (status > 0)
657 return 0; 637 return 0;
658 638
659 if (!status) { 639 if (!status)
660 d_invalidate(dentry);
661 status = -ESTALE; 640 status = -ESTALE;
662 } 641
663 return status; 642 return status;
664} 643}
665 644
@@ -728,6 +707,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
728 path_put(&nd->path); 707 path_put(&nd->path);
729 nd->path = nd->root; 708 nd->path = nd->root;
730 path_get(&nd->root); 709 path_get(&nd->root);
710 nd->flags |= LOOKUP_JUMPED;
731 } 711 }
732 nd->inode = nd->path.dentry->d_inode; 712 nd->inode = nd->path.dentry->d_inode;
733 713
@@ -757,19 +737,42 @@ static inline void path_to_nameidata(const struct path *path,
757 nd->path.dentry = path->dentry; 737 nd->path.dentry = path->dentry;
758} 738}
759 739
740static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
741{
742 struct inode *inode = link->dentry->d_inode;
743 if (!IS_ERR(cookie) && inode->i_op->put_link)
744 inode->i_op->put_link(link->dentry, nd, cookie);
745 path_put(link);
746}
747
760static __always_inline int 748static __always_inline int
761__do_follow_link(const struct path *link, struct nameidata *nd, void **p) 749follow_link(struct path *link, struct nameidata *nd, void **p)
762{ 750{
763 int error; 751 int error;
764 struct dentry *dentry = link->dentry; 752 struct dentry *dentry = link->dentry;
765 753
766 BUG_ON(nd->flags & LOOKUP_RCU); 754 BUG_ON(nd->flags & LOOKUP_RCU);
767 755
756 if (link->mnt == nd->path.mnt)
757 mntget(link->mnt);
758
759 if (unlikely(current->total_link_count >= 40)) {
760 *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
761 path_put(&nd->path);
762 return -ELOOP;
763 }
764 cond_resched();
765 current->total_link_count++;
766
768 touch_atime(link->mnt, dentry); 767 touch_atime(link->mnt, dentry);
769 nd_set_link(nd, NULL); 768 nd_set_link(nd, NULL);
770 769
771 if (link->mnt == nd->path.mnt) 770 error = security_inode_follow_link(link->dentry, nd);
772 mntget(link->mnt); 771 if (error) {
772 *p = ERR_PTR(error); /* no ->put_link(), please */
773 path_put(&nd->path);
774 return error;
775 }
773 776
774 nd->last_type = LAST_BIND; 777 nd->last_type = LAST_BIND;
775 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 778 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
@@ -780,56 +783,18 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
780 if (s) 783 if (s)
781 error = __vfs_follow_link(nd, s); 784 error = __vfs_follow_link(nd, s);
782 else if (nd->last_type == LAST_BIND) { 785 else if (nd->last_type == LAST_BIND) {
783 error = force_reval_path(&nd->path, nd); 786 nd->flags |= LOOKUP_JUMPED;
784 if (error) 787 nd->inode = nd->path.dentry->d_inode;
788 if (nd->inode->i_op->follow_link) {
789 /* stepped on a _really_ weird one */
785 path_put(&nd->path); 790 path_put(&nd->path);
791 error = -ELOOP;
792 }
786 } 793 }
787 } 794 }
788 return error; 795 return error;
789} 796}
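The new put_link() helper pairs with follow_link(): whatever cookie ->follow_link() returns is handed back to ->put_link() once the walk is done with the link body. A hedged sketch of a filesystem implementing that protocol; MYFS_I() and its link field are hypothetical.

#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/err.h>

static void *myfs_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	/* MYFS_I(inode)->link is a hypothetical NUL-terminated target */
	char *body = kstrdup(MYFS_I(dentry->d_inode)->link, GFP_KERNEL);

	if (!body)
		return ERR_PTR(-ENOMEM);
	nd_set_link(nd, body);
	return body;			/* cookie passed to ->put_link() */
}

static void myfs_put_link(struct dentry *dentry, struct nameidata *nd,
			  void *cookie)
{
	kfree(cookie);			/* release what ->follow_link() made */
}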
790 797
791/*
792 * This limits recursive symlink follows to 8, while
793 * limiting consecutive symlinks to 40.
794 *
795 * Without that kind of total limit, nasty chains of consecutive
796 * symlinks can cause almost arbitrarily long lookups.
797 */
798static inline int do_follow_link(struct inode *inode, struct path *path, struct nameidata *nd)
799{
800 void *cookie;
801 int err = -ELOOP;
802
803 /* We drop rcu-walk here */
804 if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
805 return -ECHILD;
806 BUG_ON(inode != path->dentry->d_inode);
807
808 if (current->link_count >= MAX_NESTED_LINKS)
809 goto loop;
810 if (current->total_link_count >= 40)
811 goto loop;
812 BUG_ON(nd->depth >= MAX_NESTED_LINKS);
813 cond_resched();
814 err = security_inode_follow_link(path->dentry, nd);
815 if (err)
816 goto loop;
817 current->link_count++;
818 current->total_link_count++;
819 nd->depth++;
820 err = __do_follow_link(path, nd, &cookie);
821 if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
822 path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
823 path_put(path);
824 current->link_count--;
825 nd->depth--;
826 return err;
827loop:
828 path_put_conditional(path, nd);
829 path_put(&nd->path);
830 return err;
831}
832
833static int follow_up_rcu(struct path *path) 798static int follow_up_rcu(struct path *path)
834{ 799{
835 struct vfsmount *parent; 800 struct vfsmount *parent;
@@ -1068,7 +1033,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1068 1033
1069 seq = read_seqcount_begin(&parent->d_seq); 1034 seq = read_seqcount_begin(&parent->d_seq);
1070 if (read_seqcount_retry(&old->d_seq, nd->seq)) 1035 if (read_seqcount_retry(&old->d_seq, nd->seq))
1071 return -ECHILD; 1036 goto failed;
1072 inode = parent->d_inode; 1037 inode = parent->d_inode;
1073 nd->path.dentry = parent; 1038 nd->path.dentry = parent;
1074 nd->seq = seq; 1039 nd->seq = seq;
@@ -1081,8 +1046,15 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1081 } 1046 }
1082 __follow_mount_rcu(nd, &nd->path, &inode, true); 1047 __follow_mount_rcu(nd, &nd->path, &inode, true);
1083 nd->inode = inode; 1048 nd->inode = inode;
1084
1085 return 0; 1049 return 0;
1050
1051failed:
1052 nd->flags &= ~LOOKUP_RCU;
1053 if (!(nd->flags & LOOKUP_ROOT))
1054 nd->root.mnt = NULL;
1055 rcu_read_unlock();
1056 br_read_unlock(vfsmount_lock);
1057 return -ECHILD;
1086} 1058}
1087 1059
1088/* 1060/*
@@ -1216,68 +1188,85 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1216{ 1188{
1217 struct vfsmount *mnt = nd->path.mnt; 1189 struct vfsmount *mnt = nd->path.mnt;
1218 struct dentry *dentry, *parent = nd->path.dentry; 1190 struct dentry *dentry, *parent = nd->path.dentry;
1219 struct inode *dir; 1191 int need_reval = 1;
1192 int status = 1;
1220 int err; 1193 int err;
1221 1194
1222 /* 1195 /*
1223 * See if the low-level filesystem might want
1224 * to use its own hash..
1225 */
1226 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1227 err = parent->d_op->d_hash(parent, nd->inode, name);
1228 if (err < 0)
1229 return err;
1230 }
1231
1232 /*
1233 * Rename seqlock is not required here because in the off chance 1196 * Rename seqlock is not required here because in the off chance
1234 * of a false negative due to a concurrent rename, we're going to 1197 * of a false negative due to a concurrent rename, we're going to
1235 * do the non-racy lookup, below. 1198 * do the non-racy lookup, below.
1236 */ 1199 */
1237 if (nd->flags & LOOKUP_RCU) { 1200 if (nd->flags & LOOKUP_RCU) {
1238 unsigned seq; 1201 unsigned seq;
1239
1240 *inode = nd->inode; 1202 *inode = nd->inode;
1241 dentry = __d_lookup_rcu(parent, name, &seq, inode); 1203 dentry = __d_lookup_rcu(parent, name, &seq, inode);
1242 if (!dentry) { 1204 if (!dentry)
1243 if (nameidata_drop_rcu(nd)) 1205 goto unlazy;
1244 return -ECHILD; 1206
1245 goto need_lookup;
1246 }
1247 /* Memory barrier in read_seqcount_begin of child is enough */ 1207 /* Memory barrier in read_seqcount_begin of child is enough */
1248 if (__read_seqcount_retry(&parent->d_seq, nd->seq)) 1208 if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1249 return -ECHILD; 1209 return -ECHILD;
1250
1251 nd->seq = seq; 1210 nd->seq = seq;
1211
1252 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { 1212 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1253 dentry = do_revalidate_rcu(dentry, nd); 1213 status = d_revalidate(dentry, nd);
1254 if (!dentry) 1214 if (unlikely(status <= 0)) {
1255 goto need_lookup; 1215 if (status != -ECHILD)
1256 if (IS_ERR(dentry)) 1216 need_reval = 0;
1257 goto fail; 1217 goto unlazy;
1258 if (!(nd->flags & LOOKUP_RCU)) 1218 }
1259 goto done;
1260 } 1219 }
1261 path->mnt = mnt; 1220 path->mnt = mnt;
1262 path->dentry = dentry; 1221 path->dentry = dentry;
1263 if (likely(__follow_mount_rcu(nd, path, inode, false))) 1222 if (likely(__follow_mount_rcu(nd, path, inode, false)))
1264 return 0; 1223 return 0;
1265 if (nameidata_drop_rcu(nd)) 1224unlazy:
1266 return -ECHILD; 1225 if (dentry) {
1267 /* fallthru */ 1226 if (nameidata_dentry_drop_rcu(nd, dentry))
1227 return -ECHILD;
1228 } else {
1229 if (nameidata_drop_rcu(nd))
1230 return -ECHILD;
1231 }
1232 } else {
1233 dentry = __d_lookup(parent, name);
1268 } 1234 }
1269 dentry = __d_lookup(parent, name); 1235
1270 if (!dentry) 1236retry:
1271 goto need_lookup; 1237 if (unlikely(!dentry)) {
1272found: 1238 struct inode *dir = parent->d_inode;
1273 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { 1239 BUG_ON(nd->inode != dir);
1274 dentry = do_revalidate(dentry, nd); 1240
1275 if (!dentry) 1241 mutex_lock(&dir->i_mutex);
1276 goto need_lookup; 1242 dentry = d_lookup(parent, name);
1277 if (IS_ERR(dentry)) 1243 if (likely(!dentry)) {
1278 goto fail; 1244 dentry = d_alloc_and_lookup(parent, name, nd);
1245 if (IS_ERR(dentry)) {
1246 mutex_unlock(&dir->i_mutex);
1247 return PTR_ERR(dentry);
1248 }
1249 /* known good */
1250 need_reval = 0;
1251 status = 1;
1252 }
1253 mutex_unlock(&dir->i_mutex);
1279 } 1254 }
1280done: 1255 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1256 status = d_revalidate(dentry, nd);
1257 if (unlikely(status <= 0)) {
1258 if (status < 0) {
1259 dput(dentry);
1260 return status;
1261 }
1262 if (!d_invalidate(dentry)) {
1263 dput(dentry);
1264 dentry = NULL;
1265 need_reval = 1;
1266 goto retry;
1267 }
1268 }
1269
1281 path->mnt = mnt; 1270 path->mnt = mnt;
1282 path->dentry = dentry; 1271 path->dentry = dentry;
1283 err = follow_managed(path, nd->flags); 1272 err = follow_managed(path, nd->flags);
@@ -1287,39 +1276,113 @@ done:
1287 } 1276 }
1288 *inode = path->dentry->d_inode; 1277 *inode = path->dentry->d_inode;
1289 return 0; 1278 return 0;
1279}
1280
1281static inline int may_lookup(struct nameidata *nd)
1282{
1283 if (nd->flags & LOOKUP_RCU) {
1284 int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
1285 if (err != -ECHILD)
1286 return err;
1287 if (nameidata_drop_rcu(nd))
1288 return -ECHILD;
1289 }
1290 return exec_permission(nd->inode, 0);
1291}
1290 1292
1291need_lookup: 1293static inline int handle_dots(struct nameidata *nd, int type)
1292 dir = parent->d_inode; 1294{
1293 BUG_ON(nd->inode != dir); 1295 if (type == LAST_DOTDOT) {
1296 if (nd->flags & LOOKUP_RCU) {
1297 if (follow_dotdot_rcu(nd))
1298 return -ECHILD;
1299 } else
1300 follow_dotdot(nd);
1301 }
1302 return 0;
1303}
1294 1304
1295 mutex_lock(&dir->i_mutex); 1305static void terminate_walk(struct nameidata *nd)
1296 /* 1306{
1297 * First re-do the cached lookup just in case it was created 1307 if (!(nd->flags & LOOKUP_RCU)) {
1298 * while we waited for the directory semaphore, or the first 1308 path_put(&nd->path);
1299 * lookup failed due to an unrelated rename. 1309 } else {
1300 * 1310 nd->flags &= ~LOOKUP_RCU;
1301 * This could use version numbering or similar to avoid unnecessary 1311 if (!(nd->flags & LOOKUP_ROOT))
1302 * cache lookups, but then we'd have to do the first lookup in the 1312 nd->root.mnt = NULL;
1303 * non-racy way. However in the common case here, everything should 1313 rcu_read_unlock();
1304 * be hot in cache, so would it be a big win? 1314 br_read_unlock(vfsmount_lock);
1305 */
1306 dentry = d_lookup(parent, name);
1307 if (likely(!dentry)) {
1308 dentry = d_alloc_and_lookup(parent, name, nd);
1309 mutex_unlock(&dir->i_mutex);
1310 if (IS_ERR(dentry))
1311 goto fail;
1312 goto done;
1313 } 1315 }
1316}
1317
1318static inline int walk_component(struct nameidata *nd, struct path *path,
1319 struct qstr *name, int type, int follow)
1320{
1321 struct inode *inode;
1322 int err;
1314 /* 1323 /*
1315 * Uhhuh! Nasty case: the cache was re-populated while 1324 * "." and ".." are special - ".." especially so because it has
1316 * we waited on the semaphore. Need to revalidate. 1325 * to be able to know about the current root directory and
1326 * parent relationships.
1317 */ 1327 */
1318 mutex_unlock(&dir->i_mutex); 1328 if (unlikely(type != LAST_NORM))
1319 goto found; 1329 return handle_dots(nd, type);
1330 err = do_lookup(nd, name, path, &inode);
1331 if (unlikely(err)) {
1332 terminate_walk(nd);
1333 return err;
1334 }
1335 if (!inode) {
1336 path_to_nameidata(path, nd);
1337 terminate_walk(nd);
1338 return -ENOENT;
1339 }
1340 if (unlikely(inode->i_op->follow_link) && follow) {
1341 if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
1342 return -ECHILD;
1343 BUG_ON(inode != path->dentry->d_inode);
1344 return 1;
1345 }
1346 path_to_nameidata(path, nd);
1347 nd->inode = inode;
1348 return 0;
1349}
1320 1350
1321fail: 1351/*
1322 return PTR_ERR(dentry); 1352 * This limits recursive symlink follows to 8, while
1353 * limiting consecutive symlinks to 40.
1354 *
1355 * Without that kind of total limit, nasty chains of consecutive
1356 * symlinks can cause almost arbitrarily long lookups.
1357 */
1358static inline int nested_symlink(struct path *path, struct nameidata *nd)
1359{
1360 int res;
1361
1362 BUG_ON(nd->depth >= MAX_NESTED_LINKS);
1363 if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
1364 path_put_conditional(path, nd);
1365 path_put(&nd->path);
1366 return -ELOOP;
1367 }
1368
1369 nd->depth++;
1370 current->link_count++;
1371
1372 do {
1373 struct path link = *path;
1374 void *cookie;
1375
1376 res = follow_link(&link, nd, &cookie);
1377 if (!res)
1378 res = walk_component(nd, path, &nd->last,
1379 nd->last_type, LOOKUP_FOLLOW);
1380 put_link(nd, &link, cookie);
1381 } while (res > 0);
1382
1383 current->link_count--;
1384 nd->depth--;
1385 return res;
1323} 1386}
1324 1387
1325/* 1388/*
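nested_symlink() keeps the two historical bounds: at most MAX_NESTED_LINKS (8) levels of recursion, with the 40-traversal total now counted in follow_link() above. A hedged userspace sketch that trips the total limit with a flat chain, no deep nesting required:

#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char from[32], to[32];
	int i;

	close(open("link0", O_CREAT | O_WRONLY, 0644));	/* chain target */
	for (i = 1; i <= 41; i++) {
		snprintf(from, sizeof(from), "link%d", i);
		snprintf(to, sizeof(to), "link%d", i - 1);
		symlink(to, from);	/* link<i> -> link<i-1> */
	}
	if (open("link41", O_RDONLY) < 0 && errno == ELOOP)
		printf("ELOOP after %d chained symlinks\n", i - 1);
	return 0;
}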
@@ -1339,30 +1402,18 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1339 while (*name=='/') 1402 while (*name=='/')
1340 name++; 1403 name++;
1341 if (!*name) 1404 if (!*name)
1342 goto return_reval; 1405 return 0;
1343
1344 if (nd->depth)
1345 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
1346 1406
1347 /* At this point we know we have a real path component. */ 1407 /* At this point we know we have a real path component. */
1348 for(;;) { 1408 for(;;) {
1349 struct inode *inode;
1350 unsigned long hash; 1409 unsigned long hash;
1351 struct qstr this; 1410 struct qstr this;
1352 unsigned int c; 1411 unsigned int c;
1412 int type;
1353 1413
1354 nd->flags |= LOOKUP_CONTINUE; 1414 nd->flags |= LOOKUP_CONTINUE;
1355 if (nd->flags & LOOKUP_RCU) { 1415
1356 err = exec_permission(nd->inode, IPERM_FLAG_RCU); 1416 err = may_lookup(nd);
1357 if (err == -ECHILD) {
1358 if (nameidata_drop_rcu(nd))
1359 return -ECHILD;
1360 goto exec_again;
1361 }
1362 } else {
1363exec_again:
1364 err = exec_permission(nd->inode, 0);
1365 }
1366 if (err) 1417 if (err)
1367 break; 1418 break;
1368 1419
@@ -1378,52 +1429,43 @@ exec_again:
1378 this.len = name - (const char *) this.name; 1429 this.len = name - (const char *) this.name;
1379 this.hash = end_name_hash(hash); 1430 this.hash = end_name_hash(hash);
1380 1431
1432 type = LAST_NORM;
1433 if (this.name[0] == '.') switch (this.len) {
1434 case 2:
1435 if (this.name[1] == '.') {
1436 type = LAST_DOTDOT;
1437 nd->flags |= LOOKUP_JUMPED;
1438 }
1439 break;
1440 case 1:
1441 type = LAST_DOT;
1442 }
1443 if (likely(type == LAST_NORM)) {
1444 struct dentry *parent = nd->path.dentry;
1445 nd->flags &= ~LOOKUP_JUMPED;
1446 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1447 err = parent->d_op->d_hash(parent, nd->inode,
1448 &this);
1449 if (err < 0)
1450 break;
1451 }
1452 }
1453
1381 /* remove trailing slashes? */ 1454 /* remove trailing slashes? */
1382 if (!c) 1455 if (!c)
1383 goto last_component; 1456 goto last_component;
1384 while (*++name == '/'); 1457 while (*++name == '/');
1385 if (!*name) 1458 if (!*name)
1386 goto last_with_slashes; 1459 goto last_component;
1387 1460
1388 /* 1461 err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
1389 * "." and ".." are special - ".." especially so because it has 1462 if (err < 0)
1390 * to be able to know about the current root directory and 1463 return err;
1391 * parent relationships.
1392 */
1393 if (this.name[0] == '.') switch (this.len) {
1394 default:
1395 break;
1396 case 2:
1397 if (this.name[1] != '.')
1398 break;
1399 if (nd->flags & LOOKUP_RCU) {
1400 if (follow_dotdot_rcu(nd))
1401 return -ECHILD;
1402 } else
1403 follow_dotdot(nd);
1404 /* fallthrough */
1405 case 1:
1406 continue;
1407 }
1408 /* This does the actual lookups.. */
1409 err = do_lookup(nd, &this, &next, &inode);
1410 if (err)
1411 break;
1412 err = -ENOENT;
1413 if (!inode)
1414 goto out_dput;
1415 1464
1416 if (inode->i_op->follow_link) { 1465 if (err) {
1417 err = do_follow_link(inode, &next, nd); 1466 err = nested_symlink(&next, nd);
1418 if (err) 1467 if (err)
1419 goto return_err; 1468 return err;
1420 nd->inode = nd->path.dentry->d_inode;
1421 err = -ENOENT;
1422 if (!nd->inode)
1423 break;
1424 } else {
1425 path_to_nameidata(&next, nd);
1426 nd->inode = inode;
1427 } 1469 }
1428 err = -ENOTDIR; 1470 err = -ENOTDIR;
1429 if (!nd->inode->i_op->lookup) 1471 if (!nd->inode->i_op->lookup)
@@ -1431,210 +1473,109 @@ exec_again:
1431 continue; 1473 continue;
1432 /* here ends the main loop */ 1474 /* here ends the main loop */
1433 1475
1434last_with_slashes:
1435 lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1436last_component: 1476last_component:
1437 /* Clear LOOKUP_CONTINUE iff it was previously unset */ 1477 /* Clear LOOKUP_CONTINUE iff it was previously unset */
1438 nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; 1478 nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
1439 if (lookup_flags & LOOKUP_PARENT)
1440 goto lookup_parent;
1441 if (this.name[0] == '.') switch (this.len) {
1442 default:
1443 break;
1444 case 2:
1445 if (this.name[1] != '.')
1446 break;
1447 if (nd->flags & LOOKUP_RCU) {
1448 if (follow_dotdot_rcu(nd))
1449 return -ECHILD;
1450 } else
1451 follow_dotdot(nd);
1452 /* fallthrough */
1453 case 1:
1454 goto return_reval;
1455 }
1456 err = do_lookup(nd, &this, &next, &inode);
1457 if (err)
1458 break;
1459 if (inode && unlikely(inode->i_op->follow_link) &&
1460 (lookup_flags & LOOKUP_FOLLOW)) {
1461 err = do_follow_link(inode, &next, nd);
1462 if (err)
1463 goto return_err;
1464 nd->inode = nd->path.dentry->d_inode;
1465 } else {
1466 path_to_nameidata(&next, nd);
1467 nd->inode = inode;
1468 }
1469 err = -ENOENT;
1470 if (!nd->inode)
1471 break;
1472 if (lookup_flags & LOOKUP_DIRECTORY) {
1473 err = -ENOTDIR;
1474 if (!nd->inode->i_op->lookup)
1475 break;
1476 }
1477 goto return_base;
1478lookup_parent:
1479 nd->last = this; 1479 nd->last = this;
1480 nd->last_type = LAST_NORM; 1480 nd->last_type = type;
1481 if (this.name[0] != '.')
1482 goto return_base;
1483 if (this.len == 1)
1484 nd->last_type = LAST_DOT;
1485 else if (this.len == 2 && this.name[1] == '.')
1486 nd->last_type = LAST_DOTDOT;
1487 else
1488 goto return_base;
1489return_reval:
1490 /*
1491 * We bypassed the ordinary revalidation routines.
1492 * We may need to check the cached dentry for staleness.
1493 */
1494 if (need_reval_dot(nd->path.dentry)) {
1495 if (nameidata_drop_rcu_last_maybe(nd))
1496 return -ECHILD;
1497 /* Note: we do not d_invalidate() */
1498 err = d_revalidate(nd->path.dentry, nd);
1499 if (!err)
1500 err = -ESTALE;
1501 if (err < 0)
1502 break;
1503 return 0;
1504 }
1505return_base:
1506 if (nameidata_drop_rcu_last_maybe(nd))
1507 return -ECHILD;
1508 return 0; 1481 return 0;
1509out_dput:
1510 if (!(nd->flags & LOOKUP_RCU))
1511 path_put_conditional(&next, nd);
1512 break;
1513 } 1482 }
1514 if (!(nd->flags & LOOKUP_RCU)) 1483 terminate_walk(nd);
1515 path_put(&nd->path);
1516return_err:
1517 return err; 1484 return err;
1518} 1485}
1519 1486
1520static inline int path_walk_rcu(const char *name, struct nameidata *nd) 1487static int path_init(int dfd, const char *name, unsigned int flags,
1521{ 1488 struct nameidata *nd, struct file **fp)
1522 current->total_link_count = 0;
1523
1524 return link_path_walk(name, nd);
1525}
1526
1527static inline int path_walk_simple(const char *name, struct nameidata *nd)
1528{
1529 current->total_link_count = 0;
1530
1531 return link_path_walk(name, nd);
1532}
1533
1534static int path_walk(const char *name, struct nameidata *nd)
1535{
1536 struct path save = nd->path;
1537 int result;
1538
1539 current->total_link_count = 0;
1540
1541 /* make sure the stuff we saved doesn't go away */
1542 path_get(&save);
1543
1544 result = link_path_walk(name, nd);
1545 if (result == -ESTALE) {
1546 /* nd->path had been dropped */
1547 current->total_link_count = 0;
1548 nd->path = save;
1549 nd->inode = save.dentry->d_inode;
1550 path_get(&nd->path);
1551 nd->flags |= LOOKUP_REVAL;
1552 result = link_path_walk(name, nd);
1553 }
1554
1555 path_put(&save);
1556
1557 return result;
1558}
1559
1560static void path_finish_rcu(struct nameidata *nd)
1561{
1562 if (nd->flags & LOOKUP_RCU) {
1563 /* RCU dangling. Cancel it. */
1564 nd->flags &= ~LOOKUP_RCU;
1565 nd->root.mnt = NULL;
1566 rcu_read_unlock();
1567 br_read_unlock(vfsmount_lock);
1568 }
1569 if (nd->file)
1570 fput(nd->file);
1571}
1572
1573static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1574{ 1489{
1575 int retval = 0; 1490 int retval = 0;
1576 int fput_needed; 1491 int fput_needed;
1577 struct file *file; 1492 struct file *file;
1578 1493
1579 nd->last_type = LAST_ROOT; /* if there are only slashes... */ 1494 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1580 nd->flags = flags | LOOKUP_RCU; 1495 nd->flags = flags | LOOKUP_JUMPED;
1581 nd->depth = 0; 1496 nd->depth = 0;
1497 if (flags & LOOKUP_ROOT) {
1498 struct inode *inode = nd->root.dentry->d_inode;
1499 if (*name) {
1500 if (!inode->i_op->lookup)
1501 return -ENOTDIR;
1502 retval = inode_permission(inode, MAY_EXEC);
1503 if (retval)
1504 return retval;
1505 }
1506 nd->path = nd->root;
1507 nd->inode = inode;
1508 if (flags & LOOKUP_RCU) {
1509 br_read_lock(vfsmount_lock);
1510 rcu_read_lock();
1511 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1512 } else {
1513 path_get(&nd->path);
1514 }
1515 return 0;
1516 }
1517
1582 nd->root.mnt = NULL; 1518 nd->root.mnt = NULL;
1583 nd->file = NULL;
1584 1519
1585 if (*name=='/') { 1520 if (*name=='/') {
1586 struct fs_struct *fs = current->fs; 1521 if (flags & LOOKUP_RCU) {
1587 unsigned seq; 1522 br_read_lock(vfsmount_lock);
1588 1523 rcu_read_lock();
1589 br_read_lock(vfsmount_lock); 1524 set_root_rcu(nd);
1590 rcu_read_lock(); 1525 } else {
1591 1526 set_root(nd);
1592 do { 1527 path_get(&nd->root);
1593 seq = read_seqcount_begin(&fs->seq); 1528 }
1594 nd->root = fs->root; 1529 nd->path = nd->root;
1595 nd->path = nd->root;
1596 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1597 } while (read_seqcount_retry(&fs->seq, seq));
1598
1599 } else if (dfd == AT_FDCWD) { 1530 } else if (dfd == AT_FDCWD) {
1600 struct fs_struct *fs = current->fs; 1531 if (flags & LOOKUP_RCU) {
1601 unsigned seq; 1532 struct fs_struct *fs = current->fs;
1602 1533 unsigned seq;
1603 br_read_lock(vfsmount_lock);
1604 rcu_read_lock();
1605 1534
1606 do { 1535 br_read_lock(vfsmount_lock);
1607 seq = read_seqcount_begin(&fs->seq); 1536 rcu_read_lock();
1608 nd->path = fs->pwd;
1609 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1610 } while (read_seqcount_retry(&fs->seq, seq));
1611 1537
1538 do {
1539 seq = read_seqcount_begin(&fs->seq);
1540 nd->path = fs->pwd;
1541 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1542 } while (read_seqcount_retry(&fs->seq, seq));
1543 } else {
1544 get_fs_pwd(current->fs, &nd->path);
1545 }
1612 } else { 1546 } else {
1613 struct dentry *dentry; 1547 struct dentry *dentry;
1614 1548
1615 file = fget_light(dfd, &fput_needed); 1549 file = fget_raw_light(dfd, &fput_needed);
1616 retval = -EBADF; 1550 retval = -EBADF;
1617 if (!file) 1551 if (!file)
1618 goto out_fail; 1552 goto out_fail;
1619 1553
1620 dentry = file->f_path.dentry; 1554 dentry = file->f_path.dentry;
1621 1555
1622 retval = -ENOTDIR; 1556 if (*name) {
1623 if (!S_ISDIR(dentry->d_inode->i_mode)) 1557 retval = -ENOTDIR;
1624 goto fput_fail; 1558 if (!S_ISDIR(dentry->d_inode->i_mode))
1559 goto fput_fail;
1625 1560
1626 retval = file_permission(file, MAY_EXEC); 1561 retval = file_permission(file, MAY_EXEC);
1627 if (retval) 1562 if (retval)
1628 goto fput_fail; 1563 goto fput_fail;
1564 }
1629 1565
1630 nd->path = file->f_path; 1566 nd->path = file->f_path;
1631 if (fput_needed) 1567 if (flags & LOOKUP_RCU) {
1632 nd->file = file; 1568 if (fput_needed)
1633 1569 *fp = file;
1634 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 1570 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1635 br_read_lock(vfsmount_lock); 1571 br_read_lock(vfsmount_lock);
1636 rcu_read_lock(); 1572 rcu_read_lock();
1573 } else {
1574 path_get(&file->f_path);
1575 fput_light(file, fput_needed);
1576 }
1637 } 1577 }
1578
1638 nd->inode = nd->path.dentry->d_inode; 1579 nd->inode = nd->path.dentry->d_inode;
1639 return 0; 1580 return 0;
1640 1581
@@ -1644,60 +1585,23 @@ out_fail:
1644 return retval; 1585 return retval;
1645} 1586}
1646 1587
1647static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) 1588static inline int lookup_last(struct nameidata *nd, struct path *path)
1648{ 1589{
1649 int retval = 0; 1590 if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
1650 int fput_needed; 1591 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1651 struct file *file;
1652
1653 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1654 nd->flags = flags;
1655 nd->depth = 0;
1656 nd->root.mnt = NULL;
1657
1658 if (*name=='/') {
1659 set_root(nd);
1660 nd->path = nd->root;
1661 path_get(&nd->root);
1662 } else if (dfd == AT_FDCWD) {
1663 get_fs_pwd(current->fs, &nd->path);
1664 } else {
1665 struct dentry *dentry;
1666
1667 file = fget_light(dfd, &fput_needed);
1668 retval = -EBADF;
1669 if (!file)
1670 goto out_fail;
1671
1672 dentry = file->f_path.dentry;
1673
1674 retval = -ENOTDIR;
1675 if (!S_ISDIR(dentry->d_inode->i_mode))
1676 goto fput_fail;
1677
1678 retval = file_permission(file, MAY_EXEC);
1679 if (retval)
1680 goto fput_fail;
1681 1592
1682 nd->path = file->f_path; 1593 nd->flags &= ~LOOKUP_PARENT;
1683 path_get(&file->f_path); 1594 return walk_component(nd, path, &nd->last, nd->last_type,
1684 1595 nd->flags & LOOKUP_FOLLOW);
1685 fput_light(file, fput_needed);
1686 }
1687 nd->inode = nd->path.dentry->d_inode;
1688 return 0;
1689
1690fput_fail:
1691 fput_light(file, fput_needed);
1692out_fail:
1693 return retval;
1694} 1596}
1695 1597
1696/* Returns 0 and nd will be valid on success; returns error otherwise. */ 1598static int path_lookupat(int dfd, const char *name,
1697static int do_path_lookup(int dfd, const char *name, 1599static int path_lookupat(int dfd, const char *name,
1698 unsigned int flags, struct nameidata *nd) 1600 unsigned int flags, struct nameidata *nd)
1699{ 1601{
1700 int retval; 1602 struct file *base = NULL;
1603 struct path path;
1604 int err;
1701 1605
1702 /* 1606 /*
1703 * Path walking is largely split up into 2 different synchronisation 1607 * Path walking is largely split up into 2 different synchronisation
@@ -1713,44 +1617,75 @@ static int do_path_lookup(int dfd, const char *name,
1713 * be handled by restarting a traditional ref-walk (which will always 1617 * be handled by restarting a traditional ref-walk (which will always
1714 * be able to complete). 1618 * be able to complete).
1715 */ 1619 */
1716 retval = path_init_rcu(dfd, name, flags, nd); 1620 err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
1717 if (unlikely(retval)) 1621
1718 return retval; 1622 if (unlikely(err))
1719 retval = path_walk_rcu(name, nd); 1623 return err;
1720 path_finish_rcu(nd); 1624
1721 if (nd->root.mnt) { 1625 current->total_link_count = 0;
1722 path_put(&nd->root); 1626 err = link_path_walk(name, nd);
1723 nd->root.mnt = NULL; 1627
1628 if (!err && !(flags & LOOKUP_PARENT)) {
1629 err = lookup_last(nd, &path);
1630 while (err > 0) {
1631 void *cookie;
1632 struct path link = path;
1633 nd->flags |= LOOKUP_PARENT;
1634 err = follow_link(&link, nd, &cookie);
1635 if (!err)
1636 err = lookup_last(nd, &path);
1637 put_link(nd, &link, cookie);
1638 }
1724 } 1639 }
1725 1640
1726 if (unlikely(retval == -ECHILD || retval == -ESTALE)) { 1641 if (nd->flags & LOOKUP_RCU) {
1727 /* slower, locked walk */ 1642 /* went all way through without dropping RCU */
1728 if (retval == -ESTALE) 1643 BUG_ON(err);
1729 flags |= LOOKUP_REVAL; 1644 if (nameidata_drop_rcu_last(nd))
1730 retval = path_init(dfd, name, flags, nd); 1645 err = -ECHILD;
1731 if (unlikely(retval)) 1646 }
1732 return retval; 1647
1733 retval = path_walk(name, nd); 1648 if (!err)
1734 if (nd->root.mnt) { 1649 err = handle_reval_path(nd);
1735 path_put(&nd->root); 1650
1736 nd->root.mnt = NULL; 1651 if (!err && nd->flags & LOOKUP_DIRECTORY) {
1652 if (!nd->inode->i_op->lookup) {
1653 path_put(&nd->path);
1654 return -ENOTDIR;
1737 } 1655 }
1738 } 1656 }
1739 1657
1658 if (base)
1659 fput(base);
1660
1661 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
1662 path_put(&nd->root);
1663 nd->root.mnt = NULL;
1664 }
1665 return err;
1666}
1667
1668static int do_path_lookup(int dfd, const char *name,
1669 unsigned int flags, struct nameidata *nd)
1670{
1671 int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
1672 if (unlikely(retval == -ECHILD))
1673 retval = path_lookupat(dfd, name, flags, nd);
1674 if (unlikely(retval == -ESTALE))
1675 retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
1676
1740 if (likely(!retval)) { 1677 if (likely(!retval)) {
1741 if (unlikely(!audit_dummy_context())) { 1678 if (unlikely(!audit_dummy_context())) {
1742 if (nd->path.dentry && nd->inode) 1679 if (nd->path.dentry && nd->inode)
1743 audit_inode(name, nd->path.dentry); 1680 audit_inode(name, nd->path.dentry);
1744 } 1681 }
1745 } 1682 }
1746
1747 return retval; 1683 return retval;
1748} 1684}
1749 1685
1750int path_lookup(const char *name, unsigned int flags, 1686int kern_path_parent(const char *name, struct nameidata *nd)
1751 struct nameidata *nd)
1752{ 1687{
1753 return do_path_lookup(AT_FDCWD, name, flags, nd); 1688 return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
1754} 1689}
1755 1690
1756int kern_path(const char *name, unsigned int flags, struct path *path) 1691int kern_path(const char *name, unsigned int flags, struct path *path)
@@ -1774,29 +1709,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1774 const char *name, unsigned int flags, 1709 const char *name, unsigned int flags,
1775 struct nameidata *nd) 1710 struct nameidata *nd)
1776{ 1711{
1777 int retval; 1712 nd->root.dentry = dentry;
1778 1713 nd->root.mnt = mnt;
1779 /* same as do_path_lookup */ 1714 /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
1780 nd->last_type = LAST_ROOT; 1715 return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd);
1781 nd->flags = flags;
1782 nd->depth = 0;
1783
1784 nd->path.dentry = dentry;
1785 nd->path.mnt = mnt;
1786 path_get(&nd->path);
1787 nd->root = nd->path;
1788 path_get(&nd->root);
1789 nd->inode = nd->path.dentry->d_inode;
1790
1791 retval = path_walk(name, nd);
1792 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1793 nd->inode))
1794 audit_inode(name, nd->path.dentry);
1795
1796 path_put(&nd->root);
1797 nd->root.mnt = NULL;
1798
1799 return retval;
1800} 1716}
1801 1717
1802static struct dentry *__lookup_hash(struct qstr *name, 1718static struct dentry *__lookup_hash(struct qstr *name,
@@ -1811,17 +1727,6 @@ static struct dentry *__lookup_hash(struct qstr *name,
1811 return ERR_PTR(err); 1727 return ERR_PTR(err);
1812 1728
1813 /* 1729 /*
1814 * See if the low-level filesystem might want
1815 * to use its own hash..
1816 */
1817 if (base->d_flags & DCACHE_OP_HASH) {
1818 err = base->d_op->d_hash(base, inode, name);
1819 dentry = ERR_PTR(err);
1820 if (err < 0)
1821 goto out;
1822 }
1823
1824 /*
1825 * Don't bother with __d_lookup: callers are for creat as 1730 * Don't bother with __d_lookup: callers are for creat as
1826 * well as unlink, so a lot of the time it would cost 1731 * well as unlink, so a lot of the time it would cost
1827 * a double lookup. 1732 * a double lookup.
@@ -1833,7 +1738,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
1833 1738
1834 if (!dentry) 1739 if (!dentry)
1835 dentry = d_alloc_and_lookup(base, name, nd); 1740 dentry = d_alloc_and_lookup(base, name, nd);
1836out: 1741
1837 return dentry; 1742 return dentry;
1838} 1743}
1839 1744
@@ -1847,28 +1752,6 @@ static struct dentry *lookup_hash(struct nameidata *nd)
1847 return __lookup_hash(&nd->last, nd->path.dentry, nd); 1752 return __lookup_hash(&nd->last, nd->path.dentry, nd);
1848} 1753}
1849 1754
1850static int __lookup_one_len(const char *name, struct qstr *this,
1851 struct dentry *base, int len)
1852{
1853 unsigned long hash;
1854 unsigned int c;
1855
1856 this->name = name;
1857 this->len = len;
1858 if (!len)
1859 return -EACCES;
1860
1861 hash = init_name_hash();
1862 while (len--) {
1863 c = *(const unsigned char *)name++;
1864 if (c == '/' || c == '\0')
1865 return -EACCES;
1866 hash = partial_name_hash(c, hash);
1867 }
1868 this->hash = end_name_hash(hash);
1869 return 0;
1870}
1871
1872/** 1755/**
1873 * lookup_one_len - filesystem helper to lookup single pathname component 1756 * lookup_one_len - filesystem helper to lookup single pathname component
1874 * @name: pathname component to lookup 1757 * @name: pathname component to lookup
@@ -1882,14 +1765,34 @@ static int __lookup_one_len(const char *name, struct qstr *this,
1882 */ 1765 */
1883struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) 1766struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1884{ 1767{
1885 int err;
1886 struct qstr this; 1768 struct qstr this;
1769 unsigned long hash;
1770 unsigned int c;
1887 1771
1888 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); 1772 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
1889 1773
1890 err = __lookup_one_len(name, &this, base, len); 1774 this.name = name;
1891 if (err) 1775 this.len = len;
1892 return ERR_PTR(err); 1776 if (!len)
1777 return ERR_PTR(-EACCES);
1778
1779 hash = init_name_hash();
1780 while (len--) {
1781 c = *(const unsigned char *)name++;
1782 if (c == '/' || c == '\0')
1783 return ERR_PTR(-EACCES);
1784 hash = partial_name_hash(c, hash);
1785 }
1786 this.hash = end_name_hash(hash);
1787 /*
1788 * See if the low-level filesystem might want
1789 * to use its own hash..
1790 */
1791 if (base->d_flags & DCACHE_OP_HASH) {
1792 int err = base->d_op->d_hash(base, base->d_inode, &this);
1793 if (err < 0)
1794 return ERR_PTR(err);
1795 }
1893 1796
1894 return __lookup_hash(&this, base, NULL); 1797 return __lookup_hash(&this, base, NULL);
1895} 1798}
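
The rework above folds __lookup_one_len() into lookup_one_len() but keeps the helper's contract: a single component with no '/' or NUL, and the parent's i_mutex held by the caller, which is exactly what the WARN_ON_ONCE() asserts. A minimal in-kernel usage sketch; grab_child() and its locking boilerplate are illustrative, not part of this patch:

    /* Illustrative caller, not from this patch: looks up one child of
     * a pinned directory dentry under the parent's i_mutex. */
    static struct dentry *grab_child(struct dentry *my_dir, const char *name)
    {
            struct dentry *child;

            mutex_lock(&my_dir->d_inode->i_mutex);
            child = lookup_one_len(name, my_dir, strlen(name));
            mutex_unlock(&my_dir->d_inode->i_mutex);

            return child;   /* ERR_PTR() on failure; dput() when done */
    }
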
@@ -1898,7 +1801,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
1898 struct path *path) 1801 struct path *path)
1899{ 1802{
1900 struct nameidata nd; 1803 struct nameidata nd;
1901 char *tmp = getname(name); 1804 char *tmp = getname_flags(name, flags);
1902 int err = PTR_ERR(tmp); 1805 int err = PTR_ERR(tmp);
1903 if (!IS_ERR(tmp)) { 1806 if (!IS_ERR(tmp)) {
1904 1807
@@ -2078,12 +1981,16 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
2078 return error; 1981 return error;
2079} 1982}
2080 1983
2081int may_open(struct path *path, int acc_mode, int flag) 1984static int may_open(struct path *path, int acc_mode, int flag)
2082{ 1985{
2083 struct dentry *dentry = path->dentry; 1986 struct dentry *dentry = path->dentry;
2084 struct inode *inode = dentry->d_inode; 1987 struct inode *inode = dentry->d_inode;
2085 int error; 1988 int error;
2086 1989
1990 /* O_PATH? */
1991 if (!acc_mode)
1992 return 0;
1993
2087 if (!inode) 1994 if (!inode)
2088 return -ENOENT; 1995 return -ENOENT;
2089 1996
@@ -2152,34 +2059,6 @@ static int handle_truncate(struct file *filp)
2152} 2059}
2153 2060
2154/* 2061/*
2155 * Be careful about ever adding any more callers of this
2156 * function. Its flags must be in the namei format, not
2157 * what get passed to sys_open().
2158 */
2159static int __open_namei_create(struct nameidata *nd, struct path *path,
2160 int open_flag, int mode)
2161{
2162 int error;
2163 struct dentry *dir = nd->path.dentry;
2164
2165 if (!IS_POSIXACL(dir->d_inode))
2166 mode &= ~current_umask();
2167 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
2168 if (error)
2169 goto out_unlock;
2170 error = vfs_create(dir->d_inode, path->dentry, mode, nd);
2171out_unlock:
2172 mutex_unlock(&dir->d_inode->i_mutex);
2173 dput(nd->path.dentry);
2174 nd->path.dentry = path->dentry;
2175
2176 if (error)
2177 return error;
2178 /* Don't check for write permission, don't truncate */
2179 return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
2180}
2181
2182/*
2183 * Note that while the flag value (low two bits) for sys_open means: 2062 * Note that while the flag value (low two bits) for sys_open means:
2184 * 00 - read-only 2063 * 00 - read-only
2185 * 01 - write-only 2064 * 01 - write-only
@@ -2203,126 +2082,115 @@ static inline int open_to_namei_flags(int flag)
2203 return flag; 2082 return flag;
2204} 2083}
2205 2084
2206static int open_will_truncate(int flag, struct inode *inode)
2207{
2208 /*
2209 * We'll never write to the fs underlying
2210 * a device file.
2211 */
2212 if (special_file(inode->i_mode))
2213 return 0;
2214 return (flag & O_TRUNC);
2215}
2216
2217static struct file *finish_open(struct nameidata *nd,
2218 int open_flag, int acc_mode)
2219{
2220 struct file *filp;
2221 int will_truncate;
2222 int error;
2223
2224 will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
2225 if (will_truncate) {
2226 error = mnt_want_write(nd->path.mnt);
2227 if (error)
2228 goto exit;
2229 }
2230 error = may_open(&nd->path, acc_mode, open_flag);
2231 if (error) {
2232 if (will_truncate)
2233 mnt_drop_write(nd->path.mnt);
2234 goto exit;
2235 }
2236 filp = nameidata_to_filp(nd);
2237 if (!IS_ERR(filp)) {
2238 error = ima_file_check(filp, acc_mode);
2239 if (error) {
2240 fput(filp);
2241 filp = ERR_PTR(error);
2242 }
2243 }
2244 if (!IS_ERR(filp)) {
2245 if (will_truncate) {
2246 error = handle_truncate(filp);
2247 if (error) {
2248 fput(filp);
2249 filp = ERR_PTR(error);
2250 }
2251 }
2252 }
2253 /*
2254 * It is now safe to drop the mnt write
2255 * because the filp has had a write taken
2256 * on its behalf.
2257 */
2258 if (will_truncate)
2259 mnt_drop_write(nd->path.mnt);
2260 path_put(&nd->path);
2261 return filp;
2262
2263exit:
2264 path_put(&nd->path);
2265 return ERR_PTR(error);
2266}
2267
2268/* 2085/*
2269 * Handle O_CREAT case for do_filp_open 2086 * Handle the last step of open()
2270 */ 2087 */
2271static struct file *do_last(struct nameidata *nd, struct path *path, 2088static struct file *do_last(struct nameidata *nd, struct path *path,
2272 int open_flag, int acc_mode, 2089 const struct open_flags *op, const char *pathname)
2273 int mode, const char *pathname)
2274{ 2090{
2275 struct dentry *dir = nd->path.dentry; 2091 struct dentry *dir = nd->path.dentry;
2092 struct dentry *dentry;
2093 int open_flag = op->open_flag;
2094 int will_truncate = open_flag & O_TRUNC;
2095 int want_write = 0;
2096 int acc_mode = op->acc_mode;
2276 struct file *filp; 2097 struct file *filp;
2277 int error = -EISDIR; 2098 int error;
2099
2100 nd->flags &= ~LOOKUP_PARENT;
2101 nd->flags |= op->intent;
2278 2102
2279 switch (nd->last_type) { 2103 switch (nd->last_type) {
2280 case LAST_DOTDOT: 2104 case LAST_DOTDOT:
2281 follow_dotdot(nd);
2282 dir = nd->path.dentry;
2283 case LAST_DOT: 2105 case LAST_DOT:
2284 if (need_reval_dot(dir)) { 2106 error = handle_dots(nd, nd->last_type);
2285 int status = d_revalidate(nd->path.dentry, nd); 2107 if (error)
2286 if (!status) 2108 return ERR_PTR(error);
2287 status = -ESTALE;
2288 if (status < 0) {
2289 error = status;
2290 goto exit;
2291 }
2292 }
2293 /* fallthrough */ 2109 /* fallthrough */
2294 case LAST_ROOT: 2110 case LAST_ROOT:
2295 goto exit; 2111 if (nd->flags & LOOKUP_RCU) {
2112 if (nameidata_drop_rcu_last(nd))
2113 return ERR_PTR(-ECHILD);
2114 }
2115 error = handle_reval_path(nd);
2116 if (error)
2117 goto exit;
2118 audit_inode(pathname, nd->path.dentry);
2119 if (open_flag & O_CREAT) {
2120 error = -EISDIR;
2121 goto exit;
2122 }
2123 goto ok;
2296 case LAST_BIND: 2124 case LAST_BIND:
2125 /* can't be RCU mode here */
2126 error = handle_reval_path(nd);
2127 if (error)
2128 goto exit;
2297 audit_inode(pathname, dir); 2129 audit_inode(pathname, dir);
2298 goto ok; 2130 goto ok;
2299 } 2131 }
2300 2132
2133 if (!(open_flag & O_CREAT)) {
2134 int symlink_ok = 0;
2135 if (nd->last.name[nd->last.len])
2136 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2137 if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
2138 symlink_ok = 1;
2139 /* we _can_ be in RCU mode here */
2140 error = walk_component(nd, path, &nd->last, LAST_NORM,
2141 !symlink_ok);
2142 if (error < 0)
2143 return ERR_PTR(error);
2144 if (error) /* symlink */
2145 return NULL;
2146 /* sayonara */
2147 if (nd->flags & LOOKUP_RCU) {
2148 if (nameidata_drop_rcu_last(nd))
2149 return ERR_PTR(-ECHILD);
2150 }
2151
2152 error = -ENOTDIR;
2153 if (nd->flags & LOOKUP_DIRECTORY) {
2154 if (!nd->inode->i_op->lookup)
2155 goto exit;
2156 }
2157 audit_inode(pathname, nd->path.dentry);
2158 goto ok;
2159 }
2160
2161 /* create side of things */
2162
2163 if (nd->flags & LOOKUP_RCU) {
2164 if (nameidata_drop_rcu_last(nd))
2165 return ERR_PTR(-ECHILD);
2166 }
2167
2168 audit_inode(pathname, dir);
2169 error = -EISDIR;
2301 /* trailing slashes? */ 2170 /* trailing slashes? */
2302 if (nd->last.name[nd->last.len]) 2171 if (nd->last.name[nd->last.len])
2303 goto exit; 2172 goto exit;
2304 2173
2305 mutex_lock(&dir->d_inode->i_mutex); 2174 mutex_lock(&dir->d_inode->i_mutex);
2306 2175
2307 path->dentry = lookup_hash(nd); 2176 dentry = lookup_hash(nd);
2308 path->mnt = nd->path.mnt; 2177 error = PTR_ERR(dentry);
2309 2178 if (IS_ERR(dentry)) {
2310 error = PTR_ERR(path->dentry);
2311 if (IS_ERR(path->dentry)) {
2312 mutex_unlock(&dir->d_inode->i_mutex); 2179 mutex_unlock(&dir->d_inode->i_mutex);
2313 goto exit; 2180 goto exit;
2314 } 2181 }
2315 2182
2316 if (IS_ERR(nd->intent.open.file)) { 2183 path->dentry = dentry;
2317 error = PTR_ERR(nd->intent.open.file); 2184 path->mnt = nd->path.mnt;
2318 goto exit_mutex_unlock;
2319 }
2320 2185
2321 /* Negative dentry, just create the file */ 2186 /* Negative dentry, just create the file */
2322 if (!path->dentry->d_inode) { 2187 if (!dentry->d_inode) {
2188 int mode = op->mode;
2189 if (!IS_POSIXACL(dir->d_inode))
2190 mode &= ~current_umask();
2323 /* 2191 /*
2324 * This write is needed to ensure that a 2192 * This write is needed to ensure that a
2325 * ro->rw transition does not occur between 2193 * rw->ro transition does not occur between
2326 * the time when the file is created and when 2194 * the time when the file is created and when
2327 * a permanent write count is taken through 2195 * a permanent write count is taken through
2328 * the 'struct file' in nameidata_to_filp(). 2196 * the 'struct file' in nameidata_to_filp().
@@ -2330,22 +2198,21 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2330 error = mnt_want_write(nd->path.mnt); 2198 error = mnt_want_write(nd->path.mnt);
2331 if (error) 2199 if (error)
2332 goto exit_mutex_unlock; 2200 goto exit_mutex_unlock;
2333 error = __open_namei_create(nd, path, open_flag, mode); 2201 want_write = 1;
2334 if (error) { 2202 /* Don't check for write permission, don't truncate */
2335 mnt_drop_write(nd->path.mnt); 2203 open_flag &= ~O_TRUNC;
2336 goto exit; 2204 will_truncate = 0;
2337 } 2205 acc_mode = MAY_OPEN;
2338 filp = nameidata_to_filp(nd); 2206 error = security_path_mknod(&nd->path, dentry, mode, 0);
2339 mnt_drop_write(nd->path.mnt); 2207 if (error)
2340 path_put(&nd->path); 2208 goto exit_mutex_unlock;
2341 if (!IS_ERR(filp)) { 2209 error = vfs_create(dir->d_inode, dentry, mode, nd);
2342 error = ima_file_check(filp, acc_mode); 2210 if (error)
2343 if (error) { 2211 goto exit_mutex_unlock;
2344 fput(filp); 2212 mutex_unlock(&dir->d_inode->i_mutex);
2345 filp = ERR_PTR(error); 2213 dput(nd->path.dentry);
2346 } 2214 nd->path.dentry = dentry;
2347 } 2215 goto common;
2348 return filp;
2349 } 2216 }
2350 2217
2351 /* 2218 /*
@@ -2375,7 +2242,40 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2375 if (S_ISDIR(nd->inode->i_mode)) 2242 if (S_ISDIR(nd->inode->i_mode))
2376 goto exit; 2243 goto exit;
2377ok: 2244ok:
2378 filp = finish_open(nd, open_flag, acc_mode); 2245 if (!S_ISREG(nd->inode->i_mode))
2246 will_truncate = 0;
2247
2248 if (will_truncate) {
2249 error = mnt_want_write(nd->path.mnt);
2250 if (error)
2251 goto exit;
2252 want_write = 1;
2253 }
2254common:
2255 error = may_open(&nd->path, acc_mode, open_flag);
2256 if (error)
2257 goto exit;
2258 filp = nameidata_to_filp(nd);
2259 if (!IS_ERR(filp)) {
2260 error = ima_file_check(filp, op->acc_mode);
2261 if (error) {
2262 fput(filp);
2263 filp = ERR_PTR(error);
2264 }
2265 }
2266 if (!IS_ERR(filp)) {
2267 if (will_truncate) {
2268 error = handle_truncate(filp);
2269 if (error) {
2270 fput(filp);
2271 filp = ERR_PTR(error);
2272 }
2273 }
2274 }
2275out:
2276 if (want_write)
2277 mnt_drop_write(nd->path.mnt);
2278 path_put(&nd->path);
2379 return filp; 2279 return filp;
2380 2280
2381exit_mutex_unlock: 2281exit_mutex_unlock:
@@ -2383,204 +2283,103 @@ exit_mutex_unlock:
2383exit_dput: 2283exit_dput:
2384 path_put_conditional(path, nd); 2284 path_put_conditional(path, nd);
2385exit: 2285exit:
2386 path_put(&nd->path); 2286 filp = ERR_PTR(error);
2387 return ERR_PTR(error); 2287 goto out;
2388} 2288}
2389 2289
2390/* 2290static struct file *path_openat(int dfd, const char *pathname,
2391 * Note that the low bits of the passed in "open_flag" 2291 struct nameidata *nd, const struct open_flags *op, int flags)
2392 * are not the same as in the local variable "flag". See
2393 * open_to_namei_flags() for more details.
2394 */
2395struct file *do_filp_open(int dfd, const char *pathname,
2396 int open_flag, int mode, int acc_mode)
2397{ 2292{
2293 struct file *base = NULL;
2398 struct file *filp; 2294 struct file *filp;
2399 struct nameidata nd;
2400 int error;
2401 struct path path; 2295 struct path path;
2402 int count = 0; 2296 int error;
2403 int flag = open_to_namei_flags(open_flag);
2404 int flags;
2405
2406 if (!(open_flag & O_CREAT))
2407 mode = 0;
2408
2409 /* Must never be set by userspace */
2410 open_flag &= ~FMODE_NONOTIFY;
2411
2412 /*
2413 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
2414 * check for O_DSYNC if they need any syncing at all, we enforce that it's
2415 * always set instead of having to deal with possibly weird behaviour
2416 * for malicious applications setting only __O_SYNC.
2417 */
2418 if (open_flag & __O_SYNC)
2419 open_flag |= O_DSYNC;
2420
2421 if (!acc_mode)
2422 acc_mode = MAY_OPEN | ACC_MODE(open_flag);
2423
2424 /* O_TRUNC implies we need access checks for write permissions */
2425 if (open_flag & O_TRUNC)
2426 acc_mode |= MAY_WRITE;
2427
2428 /* Allow the LSM permission hook to distinguish append
2429 access from general write access. */
2430 if (open_flag & O_APPEND)
2431 acc_mode |= MAY_APPEND;
2432
2433 flags = LOOKUP_OPEN;
2434 if (open_flag & O_CREAT) {
2435 flags |= LOOKUP_CREATE;
2436 if (open_flag & O_EXCL)
2437 flags |= LOOKUP_EXCL;
2438 }
2439 if (open_flag & O_DIRECTORY)
2440 flags |= LOOKUP_DIRECTORY;
2441 if (!(open_flag & O_NOFOLLOW))
2442 flags |= LOOKUP_FOLLOW;
2443 2297
2444 filp = get_empty_filp(); 2298 filp = get_empty_filp();
2445 if (!filp) 2299 if (!filp)
2446 return ERR_PTR(-ENFILE); 2300 return ERR_PTR(-ENFILE);
2447 2301
2448 filp->f_flags = open_flag; 2302 filp->f_flags = op->open_flag;
2449 nd.intent.open.file = filp; 2303 nd->intent.open.file = filp;
2450 nd.intent.open.flags = flag; 2304 nd->intent.open.flags = open_to_namei_flags(op->open_flag);
2451 nd.intent.open.create_mode = mode; 2305 nd->intent.open.create_mode = op->mode;
2452
2453 if (open_flag & O_CREAT)
2454 goto creat;
2455 2306
2456 /* !O_CREAT, simple open */ 2307 error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
2457 error = do_path_lookup(dfd, pathname, flags, &nd);
2458 if (unlikely(error)) 2308 if (unlikely(error))
2459 goto out_filp2;
2460 error = -ELOOP;
2461 if (!(nd.flags & LOOKUP_FOLLOW)) {
2462 if (nd.inode->i_op->follow_link)
2463 goto out_path2;
2464 }
2465 error = -ENOTDIR;
2466 if (nd.flags & LOOKUP_DIRECTORY) {
2467 if (!nd.inode->i_op->lookup)
2468 goto out_path2;
2469 }
2470 audit_inode(pathname, nd.path.dentry);
2471 filp = finish_open(&nd, open_flag, acc_mode);
2472out2:
2473 release_open_intent(&nd);
2474 return filp;
2475
2476out_path2:
2477 path_put(&nd.path);
2478out_filp2:
2479 filp = ERR_PTR(error);
2480 goto out2;
2481
2482creat:
2483 /* OK, have to create the file. Find the parent. */
2484 error = path_init_rcu(dfd, pathname,
2485 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2486 if (error)
2487 goto out_filp; 2309 goto out_filp;
2488 error = path_walk_rcu(pathname, &nd);
2489 path_finish_rcu(&nd);
2490 if (unlikely(error == -ECHILD || error == -ESTALE)) {
2491 /* slower, locked walk */
2492 if (error == -ESTALE) {
2493reval:
2494 flags |= LOOKUP_REVAL;
2495 }
2496 error = path_init(dfd, pathname,
2497 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2498 if (error)
2499 goto out_filp;
2500 2310
2501 error = path_walk_simple(pathname, &nd); 2311 current->total_link_count = 0;
2502 } 2312 error = link_path_walk(pathname, nd);
2503 if (unlikely(error)) 2313 if (unlikely(error))
2504 goto out_filp; 2314 goto out_filp;
2505 if (unlikely(!audit_dummy_context()))
2506 audit_inode(pathname, nd.path.dentry);
2507 2315
2508 /* 2316 filp = do_last(nd, &path, op, pathname);
2509 * We have the parent and last component.
2510 */
2511 nd.flags = flags;
2512 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
2513 while (unlikely(!filp)) { /* trailing symlink */ 2317 while (unlikely(!filp)) { /* trailing symlink */
2514 struct path link = path; 2318 struct path link = path;
2515 struct inode *linki = link.dentry->d_inode;
2516 void *cookie; 2319 void *cookie;
2517 error = -ELOOP; 2320 if (!(nd->flags & LOOKUP_FOLLOW)) {
2518 if (!(nd.flags & LOOKUP_FOLLOW)) 2321 path_put_conditional(&path, nd);
2519 goto exit_dput; 2322 path_put(&nd->path);
2520 if (count++ == 32) 2323 filp = ERR_PTR(-ELOOP);
2521 goto exit_dput; 2324 break;
2522 /*
2523 * This is subtle. Instead of calling do_follow_link() we do
2524 * the thing by hand. The reason is that this way we have zero
2525 * link_count and path_walk() (called from ->follow_link)
2526 * honoring LOOKUP_PARENT. After that we have the parent and
2527 * last component, i.e. we are in the same situation as after
2528 * the first path_walk(). Well, almost - if the last component
2529 * is normal we get its copy stored in nd->last.name and we will
2530 * have to putname() it when we are done. Procfs-like symlinks
2531 * just set LAST_BIND.
2532 */
2533 nd.flags |= LOOKUP_PARENT;
2534 error = security_inode_follow_link(link.dentry, &nd);
2535 if (error)
2536 goto exit_dput;
2537 error = __do_follow_link(&link, &nd, &cookie);
2538 if (unlikely(error)) {
2539 if (!IS_ERR(cookie) && linki->i_op->put_link)
2540 linki->i_op->put_link(link.dentry, &nd, cookie);
2541 /* nd.path had been dropped */
2542 nd.path = link;
2543 goto out_path;
2544 } 2325 }
2545 nd.flags &= ~LOOKUP_PARENT; 2326 nd->flags |= LOOKUP_PARENT;
2546 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2327 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
2547 if (linki->i_op->put_link) 2328 error = follow_link(&link, nd, &cookie);
2548 linki->i_op->put_link(link.dentry, &nd, cookie); 2329 if (unlikely(error))
2549 path_put(&link); 2330 filp = ERR_PTR(error);
2331 else
2332 filp = do_last(nd, &path, op, pathname);
2333 put_link(nd, &link, cookie);
2550 } 2334 }
2551out: 2335out:
2552 if (nd.root.mnt) 2336 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
2553 path_put(&nd.root); 2337 path_put(&nd->root);
2554 if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) 2338 if (base)
2555 goto reval; 2339 fput(base);
2556 release_open_intent(&nd); 2340 release_open_intent(nd);
2557 return filp; 2341 return filp;
2558 2342
2559exit_dput:
2560 path_put_conditional(&path, &nd);
2561out_path:
2562 path_put(&nd.path);
2563out_filp: 2343out_filp:
2564 filp = ERR_PTR(error); 2344 filp = ERR_PTR(error);
2565 goto out; 2345 goto out;
2566} 2346}
2567 2347
2568/** 2348struct file *do_filp_open(int dfd, const char *pathname,
2569 * filp_open - open file and return file pointer 2349 const struct open_flags *op, int flags)
2570 *
2571 * @filename: path to open
2572 * @flags: open flags as per the open(2) second argument
2573 * @mode: mode for the new file if O_CREAT is set, else ignored
2574 *
2575 * This is the helper to open a file from kernelspace if you really
2576 * have to. But in general you should not do this, so please move
2577 * along, nothing to see here..
2578 */
2579struct file *filp_open(const char *filename, int flags, int mode)
2580{ 2350{
2581 return do_filp_open(AT_FDCWD, filename, flags, mode, 0); 2351 struct nameidata nd;
2352 struct file *filp;
2353
2354 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
2355 if (unlikely(filp == ERR_PTR(-ECHILD)))
2356 filp = path_openat(dfd, pathname, &nd, op, flags);
2357 if (unlikely(filp == ERR_PTR(-ESTALE)))
2358 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
2359 return filp;
2360}
2361
2362struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
2363 const char *name, const struct open_flags *op, int flags)
2364{
2365 struct nameidata nd;
2366 struct file *file;
2367
2368 nd.root.mnt = mnt;
2369 nd.root.dentry = dentry;
2370
2371 flags |= LOOKUP_ROOT;
2372
2373 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
2374 return ERR_PTR(-ELOOP);
2375
2376 file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
2377 if (unlikely(file == ERR_PTR(-ECHILD)))
2378 file = path_openat(-1, name, &nd, op, flags);
2379 if (unlikely(file == ERR_PTR(-ESTALE)))
2380 file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
2381 return file;
2582} 2382}
2583EXPORT_SYMBOL(filp_open);
2584 2383
2585/** 2384/**
2586 * lookup_create - lookup a dentry, creating it if it doesn't exist 2385 * lookup_create - lookup a dentry, creating it if it doesn't exist
@@ -3119,7 +2918,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
3119 return error; 2918 return error;
3120 2919
3121 mutex_lock(&inode->i_mutex); 2920 mutex_lock(&inode->i_mutex);
3122 error = dir->i_op->link(old_dentry, dir, new_dentry); 2921 /* Make sure we don't allow creating a hardlink to an unlinked file */
2922 if (inode->i_nlink == 0)
2923 error = -ENOENT;
2924 else
2925 error = dir->i_op->link(old_dentry, dir, new_dentry);
3123 mutex_unlock(&inode->i_mutex); 2926 mutex_unlock(&inode->i_mutex);
3124 if (!error) 2927 if (!error)
3125 fsnotify_link(dir, inode, new_dentry); 2928 fsnotify_link(dir, inode, new_dentry);
@@ -3141,15 +2944,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
3141 struct dentry *new_dentry; 2944 struct dentry *new_dentry;
3142 struct nameidata nd; 2945 struct nameidata nd;
3143 struct path old_path; 2946 struct path old_path;
2947 int how = 0;
3144 int error; 2948 int error;
3145 char *to; 2949 char *to;
3146 2950
3147 if ((flags & ~AT_SYMLINK_FOLLOW) != 0) 2951 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
3148 return -EINVAL; 2952 return -EINVAL;
2953 /*
2954 * To use null names we require CAP_DAC_READ_SEARCH.
2955 * This ensures that not everyone will be able to create
2956 * a hardlink using the passed file descriptor.
2957 */
2958 if (flags & AT_EMPTY_PATH) {
2959 if (!capable(CAP_DAC_READ_SEARCH))
2960 return -ENOENT;
2961 how = LOOKUP_EMPTY;
2962 }
2963
2964 if (flags & AT_SYMLINK_FOLLOW)
2965 how |= LOOKUP_FOLLOW;
3149 2966
3150 error = user_path_at(olddfd, oldname, 2967 error = user_path_at(olddfd, oldname, how, &old_path);
3151 flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
3152 &old_path);
3153 if (error) 2968 if (error)
3154 return error; 2969 return error;
3155 2970
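
From userspace, the hunk above amounts to a capability-gated "flink": an open file descriptor can be given a name again. A hedged sketch, assuming the AT_EMPTY_PATH value from the fcntl.h hunk later in this diff; the caller needs CAP_DAC_READ_SEARCH, and the vfs_link() change above means the inode's link count must not already be zero:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    #ifndef AT_EMPTY_PATH
    #define AT_EMPTY_PATH 0x1000    /* value from the fcntl.h hunk below */
    #endif

    /* Illustrative helper: give the inode behind fd an additional name. */
    static int flink(int fd, const char *newname)
    {
            return linkat(fd, "", AT_FDCWD, newname, AT_EMPTY_PATH);
    }
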
@@ -3586,7 +3401,7 @@ EXPORT_SYMBOL(page_readlink);
3586EXPORT_SYMBOL(__page_symlink); 3401EXPORT_SYMBOL(__page_symlink);
3587EXPORT_SYMBOL(page_symlink); 3402EXPORT_SYMBOL(page_symlink);
3588EXPORT_SYMBOL(page_symlink_inode_operations); 3403EXPORT_SYMBOL(page_symlink_inode_operations);
3589EXPORT_SYMBOL(path_lookup); 3404EXPORT_SYMBOL(kern_path_parent);
3590EXPORT_SYMBOL(kern_path); 3405EXPORT_SYMBOL(kern_path);
3591EXPORT_SYMBOL(vfs_path_lookup); 3406EXPORT_SYMBOL(vfs_path_lookup);
3592EXPORT_SYMBOL(inode_permission); 3407EXPORT_SYMBOL(inode_permission);
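
Worth calling out in the new do_filp_open() above is the three-stage walk cascade: one attempt in rcu-walk mode, a fallback to ref-walk when rcu-walk bails out with -ECHILD, and a final LOOKUP_REVAL pass for stale file handles; do_file_open_root() repeats the same shape. A distilled sketch of the pattern, with the hypothetical try_walk() standing in for path_openat():

    /* Sketch only: try_walk() is a hypothetical stand-in for path_openat(). */
    static struct file *open_cascade(int dfd, const char *name,
                                     const struct open_flags *op, int flags)
    {
            struct file *filp;

            filp = try_walk(dfd, name, op, flags | LOOKUP_RCU);
            if (filp == ERR_PTR(-ECHILD))           /* rcu-walk gave up */
                    filp = try_walk(dfd, name, op, flags);
            if (filp == ERR_PTR(-ESTALE))           /* stale object, revalidate */
                    filp = try_walk(dfd, name, op, flags | LOOKUP_REVAL);
            return filp;
    }
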
diff --git a/fs/namespace.c b/fs/namespace.c
index d1edf26025d..dffe6f49ab9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1002,6 +1002,18 @@ const struct seq_operations mounts_op = {
1002 .show = show_vfsmnt 1002 .show = show_vfsmnt
1003}; 1003};
1004 1004
1005static int uuid_is_nil(u8 *uuid)
1006{
1007 int i;
1008 u8 *cp = (u8 *)uuid;
1009
1010 for (i = 0; i < 16; i++) {
1011 if (*cp++)
1012 return 0;
1013 }
1014 return 1;
1015}
1016
1005static int show_mountinfo(struct seq_file *m, void *v) 1017static int show_mountinfo(struct seq_file *m, void *v)
1006{ 1018{
1007 struct proc_mounts *p = m->private; 1019 struct proc_mounts *p = m->private;
@@ -1040,6 +1052,10 @@ static int show_mountinfo(struct seq_file *m, void *v)
1040 if (IS_MNT_UNBINDABLE(mnt)) 1052 if (IS_MNT_UNBINDABLE(mnt))
1041 seq_puts(m, " unbindable"); 1053 seq_puts(m, " unbindable");
1042 1054
1055 if (!uuid_is_nil(mnt->mnt_sb->s_uuid))
1056 /* print the uuid */
1057 seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid);
1058
1043 /* Filesystem specific data */ 1059 /* Filesystem specific data */
1044 seq_puts(m, " - "); 1060 seq_puts(m, " - ");
1045 show_type(m, sb); 1061 show_type(m, sb);
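
For consumers of /proc/self/mountinfo, the uuid tag added above joins the optional fields, which end at the " - " separator, so robust parsers should scan for the tag rather than counting columns. A small userspace sketch under that assumption:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char line[4096];
            FILE *f = fopen("/proc/self/mountinfo", "r");

            if (!f)
                    return 1;
            while (fgets(line, sizeof(line), f)) {
                    char *uuid = strstr(line, " uuid:");
                    char *sep = strstr(line, " - ");

                    /* only trust the tag if it precedes the separator */
                    if (uuid && sep && uuid < sep)
                            printf("%.36s\n", uuid + 6);
            }
            fclose(f);
            return 0;
    }
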
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index bf9cbd242dd..124e8fcb0dd 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -22,30 +22,17 @@
22 22
23static struct file *do_open(char *name, int flags) 23static struct file *do_open(char *name, int flags)
24{ 24{
25 struct nameidata nd;
26 struct vfsmount *mnt; 25 struct vfsmount *mnt;
27 int error; 26 struct file *file;
28 27
29 mnt = do_kern_mount("nfsd", 0, "nfsd", NULL); 28 mnt = do_kern_mount("nfsd", 0, "nfsd", NULL);
30 if (IS_ERR(mnt)) 29 if (IS_ERR(mnt))
31 return (struct file *)mnt; 30 return (struct file *)mnt;
32 31
33 error = vfs_path_lookup(mnt->mnt_root, mnt, name, 0, &nd); 32 file = file_open_root(mnt->mnt_root, mnt, name, flags);
34 mntput(mnt); /* drop do_kern_mount reference */
35 if (error)
36 return ERR_PTR(error);
37
38 if (flags == O_RDWR)
39 error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
40 else
41 error = may_open(&nd.path, MAY_WRITE, flags);
42 33
43 if (!error) 34 mntput(mnt); /* drop do_kern_mount reference */
44 return dentry_open(nd.path.dentry, nd.path.mnt, flags, 35 return file;
45 current_cred());
46
47 path_put(&nd.path);
48 return ERR_PTR(error);
49} 36}
50 37
51static struct { 38static struct {
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 5dbc3062b4f..254652a9b54 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -197,8 +197,12 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
197 dentry->d_name.len, dentry->d_name.name, 197 dentry->d_name.len, dentry->d_name.name,
198 fh, len, connectable); 198 fh, len, connectable);
199 199
200 if (len < 3 || (connectable && len < 6)) { 200 if (connectable && (len < 6)) {
201 mlog(ML_ERROR, "fh buffer is too small for encoding\n"); 201 *max_len = 6;
202 type = 255;
203 goto bail;
204 } else if (len < 3) {
205 *max_len = 3;
202 type = 255; 206 type = 255;
203 goto bail; 207 goto bail;
204 } 208 }
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 19ebc5aad39..29623da133c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4379,7 +4379,7 @@ static int ocfs2_user_path_parent(const char __user *path,
4379 if (IS_ERR(s)) 4379 if (IS_ERR(s))
4380 return PTR_ERR(s); 4380 return PTR_ERR(s);
4381 4381
4382 error = path_lookup(s, LOOKUP_PARENT, nd); 4382 error = kern_path_parent(s, nd);
4383 if (error) 4383 if (error)
4384 putname(s); 4384 putname(s);
4385 else 4385 else
diff --git a/fs/open.c b/fs/open.c
index b47aab39c05..3cac0bda46d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -573,13 +573,15 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
573{ 573{
574 struct path path; 574 struct path path;
575 int error = -EINVAL; 575 int error = -EINVAL;
576 int follow; 576 int lookup_flags;
577 577
578 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) 578 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
579 goto out; 579 goto out;
580 580
581 follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; 581 lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
582 error = user_path_at(dfd, filename, follow, &path); 582 if (flag & AT_EMPTY_PATH)
583 lookup_flags |= LOOKUP_EMPTY;
584 error = user_path_at(dfd, filename, lookup_flags, &path);
583 if (error) 585 if (error)
584 goto out; 586 goto out;
585 error = mnt_want_write(path.mnt); 587 error = mnt_want_write(path.mnt);
@@ -669,11 +671,16 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
669 int (*open)(struct inode *, struct file *), 671 int (*open)(struct inode *, struct file *),
670 const struct cred *cred) 672 const struct cred *cred)
671{ 673{
674 static const struct file_operations empty_fops = {};
672 struct inode *inode; 675 struct inode *inode;
673 int error; 676 int error;
674 677
675 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | 678 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
676 FMODE_PREAD | FMODE_PWRITE; 679 FMODE_PREAD | FMODE_PWRITE;
680
681 if (unlikely(f->f_flags & O_PATH))
682 f->f_mode = FMODE_PATH;
683
677 inode = dentry->d_inode; 684 inode = dentry->d_inode;
678 if (f->f_mode & FMODE_WRITE) { 685 if (f->f_mode & FMODE_WRITE) {
679 error = __get_file_write_access(inode, mnt); 686 error = __get_file_write_access(inode, mnt);
@@ -687,9 +694,15 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
687 f->f_path.dentry = dentry; 694 f->f_path.dentry = dentry;
688 f->f_path.mnt = mnt; 695 f->f_path.mnt = mnt;
689 f->f_pos = 0; 696 f->f_pos = 0;
690 f->f_op = fops_get(inode->i_fop);
691 file_sb_list_add(f, inode->i_sb); 697 file_sb_list_add(f, inode->i_sb);
692 698
699 if (unlikely(f->f_mode & FMODE_PATH)) {
700 f->f_op = &empty_fops;
701 return f;
702 }
703
704 f->f_op = fops_get(inode->i_fop);
705
693 error = security_dentry_open(f, cred); 706 error = security_dentry_open(f, cred);
694 if (error) 707 if (error)
695 goto cleanup_all; 708 goto cleanup_all;
@@ -890,15 +903,110 @@ void fd_install(unsigned int fd, struct file *file)
890 903
891EXPORT_SYMBOL(fd_install); 904EXPORT_SYMBOL(fd_install);
892 905
906static inline int build_open_flags(int flags, int mode, struct open_flags *op)
907{
908 int lookup_flags = 0;
909 int acc_mode;
910
911 if (!(flags & O_CREAT))
912 mode = 0;
913 op->mode = mode;
914
915 /* Must never be set by userspace */
916 flags &= ~FMODE_NONOTIFY;
917
918 /*
919 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
920 * check for O_DSYNC if they need any syncing at all, we enforce that it's
921 * always set instead of having to deal with possibly weird behaviour
922 * for malicious applications setting only __O_SYNC.
923 */
924 if (flags & __O_SYNC)
925 flags |= O_DSYNC;
926
927 /*
928 * If O_PATH is set in the open flags, then we
929 * cannot allow anything other than the flags below.
930 */
931 if (flags & O_PATH) {
932 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
933 acc_mode = 0;
934 } else {
935 acc_mode = MAY_OPEN | ACC_MODE(flags);
936 }
937
938 op->open_flag = flags;
939
940 /* O_TRUNC implies we need access checks for write permissions */
941 if (flags & O_TRUNC)
942 acc_mode |= MAY_WRITE;
943
944 /* Allow the LSM permission hook to distinguish append
945 access from general write access. */
946 if (flags & O_APPEND)
947 acc_mode |= MAY_APPEND;
948
949 op->acc_mode = acc_mode;
950
951 op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
952
953 if (flags & O_CREAT) {
954 op->intent |= LOOKUP_CREATE;
955 if (flags & O_EXCL)
956 op->intent |= LOOKUP_EXCL;
957 }
958
959 if (flags & O_DIRECTORY)
960 lookup_flags |= LOOKUP_DIRECTORY;
961 if (!(flags & O_NOFOLLOW))
962 lookup_flags |= LOOKUP_FOLLOW;
963 return lookup_flags;
964}
965
966/**
967 * filp_open - open file and return file pointer
968 *
969 * @filename: path to open
970 * @flags: open flags as per the open(2) second argument
971 * @mode: mode for the new file if O_CREAT is set, else ignored
972 *
973 * This is the helper to open a file from kernelspace if you really
974 * have to. But in general you should not do this, so please move
975 * along, nothing to see here..
976 */
977struct file *filp_open(const char *filename, int flags, int mode)
978{
979 struct open_flags op;
980 int lookup = build_open_flags(flags, mode, &op);
981 return do_filp_open(AT_FDCWD, filename, &op, lookup);
982}
983EXPORT_SYMBOL(filp_open);
984
985struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
986 const char *filename, int flags)
987{
988 struct open_flags op;
989 int lookup = build_open_flags(flags, 0, &op);
990 if (flags & O_CREAT)
991 return ERR_PTR(-EINVAL);
992 if (!filename && (flags & O_DIRECTORY))
993 if (!dentry->d_inode->i_op->lookup)
994 return ERR_PTR(-ENOTDIR);
995 return do_file_open_root(dentry, mnt, filename, &op, lookup);
996}
997EXPORT_SYMBOL(file_open_root);
998
893long do_sys_open(int dfd, const char __user *filename, int flags, int mode) 999long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
894{ 1000{
1001 struct open_flags op;
1002 int lookup = build_open_flags(flags, mode, &op);
895 char *tmp = getname(filename); 1003 char *tmp = getname(filename);
896 int fd = PTR_ERR(tmp); 1004 int fd = PTR_ERR(tmp);
897 1005
898 if (!IS_ERR(tmp)) { 1006 if (!IS_ERR(tmp)) {
899 fd = get_unused_fd_flags(flags); 1007 fd = get_unused_fd_flags(flags);
900 if (fd >= 0) { 1008 if (fd >= 0) {
901 struct file *f = do_filp_open(dfd, tmp, flags, mode, 0); 1009 struct file *f = do_filp_open(dfd, tmp, &op, lookup);
902 if (IS_ERR(f)) { 1010 if (IS_ERR(f)) {
903 put_unused_fd(fd); 1011 put_unused_fd(fd);
904 fd = PTR_ERR(f); 1012 fd = PTR_ERR(f);
@@ -968,8 +1076,10 @@ int filp_close(struct file *filp, fl_owner_t id)
968 if (filp->f_op && filp->f_op->flush) 1076 if (filp->f_op && filp->f_op->flush)
969 retval = filp->f_op->flush(filp, id); 1077 retval = filp->f_op->flush(filp, id);
970 1078
971 dnotify_flush(filp, id); 1079 if (likely(!(filp->f_mode & FMODE_PATH))) {
972 locks_remove_posix(filp, id); 1080 dnotify_flush(filp, id);
1081 locks_remove_posix(filp, id);
1082 }
973 fput(filp); 1083 fput(filp);
974 return retval; 1084 return retval;
975} 1085}
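
Taken together, the fs/open.c hunks define the userspace-visible O_PATH contract: the descriptor carries FMODE_PATH, empty file_operations, and skips the dnotify/posix-locks teardown on close, so it can locate objects but not do I/O. A hedged userspace sketch; the O_PATH value comes from the asm-generic/fcntl.h hunk below and may be missing from older libc headers:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <unistd.h>

    #ifndef O_PATH
    #define O_PATH 010000000        /* from the fcntl.h hunk below */
    #endif

    int main(void)
    {
            struct stat st;
            int dfd = open("/etc", O_PATH | O_DIRECTORY);

            if (dfd < 0)
                    return 1;
            /* no read/write through dfd, but it anchors *at() lookups: */
            if (fstatat(dfd, "hostname", &st, 0) == 0)
                    printf("size: %lld\n", (long long)st.st_size);
            close(dfd);
            return 0;
    }
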
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index be03a0b08b4..764b86a0196 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "osf.h" 11#include "osf.h"
12 12
13#define MAX_OSF_PARTITIONS 8 13#define MAX_OSF_PARTITIONS 18
14 14
15int osf_partition(struct parsed_partitions *state) 15int osf_partition(struct parsed_partitions *state)
16{ 16{
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0bae036831e..1bba24bad82 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1593,8 +1593,13 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1593 struct inode *inode = dentry->d_inode; 1593 struct inode *inode = dentry->d_inode;
1594 int maxlen = *lenp; 1594 int maxlen = *lenp;
1595 1595
1596 if (maxlen < 3) 1596 if (need_parent && (maxlen < 5)) {
1597 *lenp = 5;
1597 return 255; 1598 return 255;
1599 } else if (maxlen < 3) {
1600 *lenp = 3;
1601 return 255;
1602 }
1598 1603
1599 data[0] = inode->i_ino; 1604 data[0] = inode->i_ino;
1600 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); 1605 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 68fdf45cc6c..4b2eb564fda 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1122,10 +1122,6 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1122 reiserfs_write_unlock(dir->i_sb); 1122 reiserfs_write_unlock(dir->i_sb);
1123 return -EMLINK; 1123 return -EMLINK;
1124 } 1124 }
1125 if (inode->i_nlink == 0) {
1126 reiserfs_write_unlock(dir->i_sb);
1127 return -ENOENT;
1128 }
1129 1125
1130 /* inc before scheduling so reiserfs_unlink knows we are here */ 1126 /* inc before scheduling so reiserfs_unlink knows we are here */
1131 inc_nlink(inode); 1127 inc_nlink(inode);
diff --git a/fs/stat.c b/fs/stat.c
index d5c61cf2b70..961039121cb 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,13 +75,16 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
75 int error = -EINVAL; 75 int error = -EINVAL;
76 int lookup_flags = 0; 76 int lookup_flags = 0;
77 77
78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0) 78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
79 AT_EMPTY_PATH)) != 0)
79 goto out; 80 goto out;
80 81
81 if (!(flag & AT_SYMLINK_NOFOLLOW)) 82 if (!(flag & AT_SYMLINK_NOFOLLOW))
82 lookup_flags |= LOOKUP_FOLLOW; 83 lookup_flags |= LOOKUP_FOLLOW;
83 if (flag & AT_NO_AUTOMOUNT) 84 if (flag & AT_NO_AUTOMOUNT)
84 lookup_flags |= LOOKUP_NO_AUTOMOUNT; 85 lookup_flags |= LOOKUP_NO_AUTOMOUNT;
86 if (flag & AT_EMPTY_PATH)
87 lookup_flags |= LOOKUP_EMPTY;
85 88
86 error = user_path_at(dfd, filename, lookup_flags, &path); 89 error = user_path_at(dfd, filename, lookup_flags, &path);
87 if (error) 90 if (error)
@@ -297,7 +300,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
297 if (bufsiz <= 0) 300 if (bufsiz <= 0)
298 return -EINVAL; 301 return -EINVAL;
299 302
300 error = user_path_at(dfd, pathname, 0, &path); 303 error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path);
301 if (!error) { 304 if (!error) {
302 struct inode *inode = path.dentry->d_inode; 305 struct inode *inode = path.dentry->d_inode;
303 306
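
The readlinkat() change above is easy to miss: passing LOOKUP_EMPTY unconditionally makes an empty pathname acceptable, so a symlink captured with O_PATH|O_NOFOLLOW can be read back through its descriptor. A sketch, assuming /etc/localtime is a symlink on the test machine:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    #ifndef O_PATH
    #define O_PATH 010000000        /* from the fcntl.h hunk below */
    #endif

    int main(void)
    {
            char buf[4096];
            ssize_t n;
            int fd = open("/etc/localtime", O_PATH | O_NOFOLLOW);

            if (fd < 0)
                    return 1;
            n = readlinkat(fd, "", buf, sizeof(buf) - 1);
            if (n < 0)
                    return 1;
            buf[n] = '\0';
            printf("-> %s\n", buf);
            close(fd);
            return 0;
    }
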
diff --git a/fs/statfs.c b/fs/statfs.c
index 30ea8c8a996..8244924dec5 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -73,149 +73,135 @@ int vfs_statfs(struct path *path, struct kstatfs *buf)
73} 73}
74EXPORT_SYMBOL(vfs_statfs); 74EXPORT_SYMBOL(vfs_statfs);
75 75
76static int do_statfs_native(struct path *path, struct statfs *buf) 76int user_statfs(const char __user *pathname, struct kstatfs *st)
77{ 77{
78 struct kstatfs st; 78 struct path path;
79 int retval; 79 int error = user_path(pathname, &path);
80 if (!error) {
81 error = vfs_statfs(&path, st);
82 path_put(&path);
83 }
84 return error;
85}
80 86
81 retval = vfs_statfs(path, &st); 87int fd_statfs(int fd, struct kstatfs *st)
82 if (retval) 88{
83 return retval; 89 struct file *file = fget(fd);
90 int error = -EBADF;
91 if (file) {
92 error = vfs_statfs(&file->f_path, st);
93 fput(file);
94 }
95 return error;
96}
84 97
85 if (sizeof(*buf) == sizeof(st)) 98static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
86 memcpy(buf, &st, sizeof(st)); 99{
100 struct statfs buf;
101
102 if (sizeof(buf) == sizeof(*st))
103 memcpy(&buf, st, sizeof(*st));
87 else { 104 else {
88 if (sizeof buf->f_blocks == 4) { 105 if (sizeof buf.f_blocks == 4) {
89 if ((st.f_blocks | st.f_bfree | st.f_bavail | 106 if ((st->f_blocks | st->f_bfree | st->f_bavail |
90 st.f_bsize | st.f_frsize) & 107 st->f_bsize | st->f_frsize) &
91 0xffffffff00000000ULL) 108 0xffffffff00000000ULL)
92 return -EOVERFLOW; 109 return -EOVERFLOW;
93 /* 110 /*
94 * f_files and f_ffree may be -1; it's okay to stuff 111 * f_files and f_ffree may be -1; it's okay to stuff
95 * that into 32 bits 112 * that into 32 bits
96 */ 113 */
97 if (st.f_files != -1 && 114 if (st->f_files != -1 &&
98 (st.f_files & 0xffffffff00000000ULL)) 115 (st->f_files & 0xffffffff00000000ULL))
99 return -EOVERFLOW; 116 return -EOVERFLOW;
100 if (st.f_ffree != -1 && 117 if (st->f_ffree != -1 &&
101 (st.f_ffree & 0xffffffff00000000ULL)) 118 (st->f_ffree & 0xffffffff00000000ULL))
102 return -EOVERFLOW; 119 return -EOVERFLOW;
103 } 120 }
104 121
105 buf->f_type = st.f_type; 122 buf.f_type = st->f_type;
106 buf->f_bsize = st.f_bsize; 123 buf.f_bsize = st->f_bsize;
107 buf->f_blocks = st.f_blocks; 124 buf.f_blocks = st->f_blocks;
108 buf->f_bfree = st.f_bfree; 125 buf.f_bfree = st->f_bfree;
109 buf->f_bavail = st.f_bavail; 126 buf.f_bavail = st->f_bavail;
110 buf->f_files = st.f_files; 127 buf.f_files = st->f_files;
111 buf->f_ffree = st.f_ffree; 128 buf.f_ffree = st->f_ffree;
112 buf->f_fsid = st.f_fsid; 129 buf.f_fsid = st->f_fsid;
113 buf->f_namelen = st.f_namelen; 130 buf.f_namelen = st->f_namelen;
114 buf->f_frsize = st.f_frsize; 131 buf.f_frsize = st->f_frsize;
115 buf->f_flags = st.f_flags; 132 buf.f_flags = st->f_flags;
116 memset(buf->f_spare, 0, sizeof(buf->f_spare)); 133 memset(buf.f_spare, 0, sizeof(buf.f_spare));
117 } 134 }
135 if (copy_to_user(p, &buf, sizeof(buf)))
136 return -EFAULT;
118 return 0; 137 return 0;
119} 138}
120 139
121static int do_statfs64(struct path *path, struct statfs64 *buf) 140static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
122{ 141{
123 struct kstatfs st; 142 struct statfs64 buf;
124 int retval; 143 if (sizeof(buf) == sizeof(*st))
125 144 memcpy(&buf, st, sizeof(*st));
126 retval = vfs_statfs(path, &st);
127 if (retval)
128 return retval;
129
130 if (sizeof(*buf) == sizeof(st))
131 memcpy(buf, &st, sizeof(st));
132 else { 145 else {
133 buf->f_type = st.f_type; 146 buf.f_type = st->f_type;
134 buf->f_bsize = st.f_bsize; 147 buf.f_bsize = st->f_bsize;
135 buf->f_blocks = st.f_blocks; 148 buf.f_blocks = st->f_blocks;
136 buf->f_bfree = st.f_bfree; 149 buf.f_bfree = st->f_bfree;
137 buf->f_bavail = st.f_bavail; 150 buf.f_bavail = st->f_bavail;
138 buf->f_files = st.f_files; 151 buf.f_files = st->f_files;
139 buf->f_ffree = st.f_ffree; 152 buf.f_ffree = st->f_ffree;
140 buf->f_fsid = st.f_fsid; 153 buf.f_fsid = st->f_fsid;
141 buf->f_namelen = st.f_namelen; 154 buf.f_namelen = st->f_namelen;
142 buf->f_frsize = st.f_frsize; 155 buf.f_frsize = st->f_frsize;
143 buf->f_flags = st.f_flags; 156 buf.f_flags = st->f_flags;
144 memset(buf->f_spare, 0, sizeof(buf->f_spare)); 157 memset(buf.f_spare, 0, sizeof(buf.f_spare));
145 } 158 }
159 if (copy_to_user(p, &buf, sizeof(buf)))
160 return -EFAULT;
146 return 0; 161 return 0;
147} 162}
148 163
149SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) 164SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
150{ 165{
151 struct path path; 166 struct kstatfs st;
152 int error; 167 int error = user_statfs(pathname, &st);
153 168 if (!error)
154 error = user_path(pathname, &path); 169 error = do_statfs_native(&st, buf);
155 if (!error) {
156 struct statfs tmp;
157 error = do_statfs_native(&path, &tmp);
158 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
159 error = -EFAULT;
160 path_put(&path);
161 }
162 return error; 170 return error;
163} 171}
164 172
165SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) 173SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
166{ 174{
167 struct path path; 175 struct kstatfs st;
168 long error; 176 int error;
169
170 if (sz != sizeof(*buf)) 177 if (sz != sizeof(*buf))
171 return -EINVAL; 178 return -EINVAL;
172 error = user_path(pathname, &path); 179 error = user_statfs(pathname, &st);
173 if (!error) { 180 if (!error)
174 struct statfs64 tmp; 181 error = do_statfs64(&st, buf);
175 error = do_statfs64(&path, &tmp);
176 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
177 error = -EFAULT;
178 path_put(&path);
179 }
180 return error; 182 return error;
181} 183}
182 184
183SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) 185SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
184{ 186{
185 struct file *file; 187 struct kstatfs st;
186 struct statfs tmp; 188 int error = fd_statfs(fd, &st);
187 int error; 189 if (!error)
188 190 error = do_statfs_native(&st, buf);
189 error = -EBADF;
190 file = fget(fd);
191 if (!file)
192 goto out;
193 error = do_statfs_native(&file->f_path, &tmp);
194 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
195 error = -EFAULT;
196 fput(file);
197out:
198 return error; 191 return error;
199} 192}
200 193
201SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) 194SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
202{ 195{
203 struct file *file; 196 struct kstatfs st;
204 struct statfs64 tmp;
205 int error; 197 int error;
206 198
207 if (sz != sizeof(*buf)) 199 if (sz != sizeof(*buf))
208 return -EINVAL; 200 return -EINVAL;
209 201
210 error = -EBADF; 202 error = fd_statfs(fd, &st);
211 file = fget(fd); 203 if (!error)
212 if (!file) 204 error = do_statfs64(&st, buf);
213 goto out;
214 error = do_statfs64(&file->f_path, &tmp);
215 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
216 error = -EFAULT;
217 fput(file);
218out:
219 return error; 205 return error;
220} 206}
221 207
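
The fs/statfs.c restructuring is behavior-preserving for userspace: copy_to_user() moves from the syscall bodies into do_statfs_native()/do_statfs64(), the EOVERFLOW checks for 32-bit struct statfs survive intact, and user_statfs()/fd_statfs() become reusable helpers. For reference, the userspace-visible failure mode that is being kept:

    #include <stdio.h>
    #include <sys/vfs.h>

    int main(void)
    {
            struct statfs sf;

            if (statfs("/", &sf) != 0) {
                    perror("statfs");       /* EOVERFLOW possible on 32-bit */
                    return 1;
            }
            printf("bsize=%ld blocks=%llu bfree=%llu\n",
                   (long)sf.f_bsize,
                   (unsigned long long)sf.f_blocks,
                   (unsigned long long)sf.f_bfree);
            return 0;
    }
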
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 14f64b689d7..7217d67a80a 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -522,24 +522,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
522 ubifs_assert(mutex_is_locked(&dir->i_mutex)); 522 ubifs_assert(mutex_is_locked(&dir->i_mutex));
523 ubifs_assert(mutex_is_locked(&inode->i_mutex)); 523 ubifs_assert(mutex_is_locked(&inode->i_mutex));
524 524
525 /*
526 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
527 * otherwise has the potential to corrupt the orphan inode list.
528 *
529 * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and
530 * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not
531 * lock 'dirA->i_mutex', so this is possible. Both of the functions
532 * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes
533 * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this
534 * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA'
535 * to the list of orphans. After this, 'vfs_link()' will link
536 * 'dirB/fileB' to 'inodeA'. This is a problem because, for example,
537 * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode
538 * to the list of orphans.
539 */
540 if (inode->i_nlink == 0)
541 return -ENOENT;
542
543 err = dbg_check_synced_i_size(inode); 525 err = dbg_check_synced_i_size(inode);
544 if (err) 526 if (err)
545 return err; 527 return err;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index b7c338d5e9d..f1dce848ef9 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1286,8 +1286,13 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
1286 struct fid *fid = (struct fid *)fh; 1286 struct fid *fid = (struct fid *)fh;
1287 int type = FILEID_UDF_WITHOUT_PARENT; 1287 int type = FILEID_UDF_WITHOUT_PARENT;
1288 1288
1289 if (len < 3 || (connectable && len < 5)) 1289 if (connectable && (len < 5)) {
1290 *lenp = 5;
1291 return 255;
1292 } else if (len < 3) {
1293 *lenp = 3;
1290 return 255; 1294 return 255;
1295 }
1291 1296
1292 *lenp = 3; 1297 *lenp = 3;
1293 fid->udf.block = location.logicalBlockNum; 1298 fid->udf.block = location.logicalBlockNum;
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index fc0114da7fd..f4f878fc008 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -89,8 +89,10 @@ xfs_fs_encode_fh(
89 * seven combinations work. The real answer is "don't use v2". 89 * seven combinations work. The real answer is "don't use v2".
90 */ 90 */
91 len = xfs_fileid_length(fileid_type); 91 len = xfs_fileid_length(fileid_type);
92 if (*max_len < len) 92 if (*max_len < len) {
93 *max_len = len;
93 return 255; 94 return 255;
95 }
94 *max_len = len; 96 *max_len = len;
95 97
96 switch (fileid_type) { 98 switch (fileid_type) {
diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h
index 2bcc5c7c22a..61e03dd7939 100644
--- a/include/asm-generic/cputime.h
+++ b/include/asm-generic/cputime.h
@@ -30,6 +30,9 @@ typedef u64 cputime64_t;
30#define cputime64_to_jiffies64(__ct) (__ct) 30#define cputime64_to_jiffies64(__ct) (__ct)
31#define jiffies64_to_cputime64(__jif) (__jif) 31#define jiffies64_to_cputime64(__jif) (__jif)
32#define cputime_to_cputime64(__ct) ((u64) __ct) 32#define cputime_to_cputime64(__ct) ((u64) __ct)
33#define cputime64_gt(__a, __b) ((__a) > (__b))
34
35#define nsecs_to_cputime64(__ct) nsecs_to_jiffies64(__ct)
33 36
34 37
35/* 38/*
diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h
index 0fc16e3f0bf..84793c7025e 100644
--- a/include/asm-generic/fcntl.h
+++ b/include/asm-generic/fcntl.h
@@ -80,6 +80,10 @@
80#define O_SYNC (__O_SYNC|O_DSYNC) 80#define O_SYNC (__O_SYNC|O_DSYNC)
81#endif 81#endif
82 82
83#ifndef O_PATH
84#define O_PATH 010000000
85#endif
86
83#ifndef O_NDELAY 87#ifndef O_NDELAY
84#define O_NDELAY O_NONBLOCK 88#define O_NDELAY O_NONBLOCK
85#endif 89#endif
diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h
index 3c2344f4813..01f227e1425 100644
--- a/include/asm-generic/futex.h
+++ b/include/asm-generic/futex.h
@@ -6,7 +6,7 @@
6#include <asm/errno.h> 6#include <asm/errno.h>
7 7
8static inline int 8static inline int
9futex_atomic_op_inuser (int encoded_op, int __user *uaddr) 9futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
10{ 10{
11 int op = (encoded_op >> 28) & 7; 11 int op = (encoded_op >> 28) & 7;
12 int cmp = (encoded_op >> 24) & 15; 12 int cmp = (encoded_op >> 24) & 15;
@@ -16,7 +16,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
16 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 16 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
17 oparg = 1 << oparg; 17 oparg = 1 << oparg;
18 18
19 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) 19 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
20 return -EFAULT; 20 return -EFAULT;
21 21
22 pagefault_disable(); 22 pagefault_disable();
@@ -48,7 +48,8 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
48} 48}
49 49
50static inline int 50static inline int
51futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) 51futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
52 u32 oldval, u32 newval)
52{ 53{
53 return -ENOSYS; 54 return -ENOSYS;
54} 55}
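
The new futex_atomic_cmpxchg_inatomic() signature separates what the old one conflated: the return value now reports only fault status, while the word actually read from userspace comes back through *uval, so callers can tell "access failed" from "comparison failed". A sketch of the intended calling pattern; the wrapper itself is hypothetical:

    /* Hypothetical caller: a fault propagates as-is, a lost race
     * becomes -EAGAIN so the caller can retry. */
    static int cmpxchg_futex_word(u32 __user *uaddr, u32 oldval, u32 newval)
    {
            u32 curval;
            int ret;

            ret = futex_atomic_cmpxchg_inatomic(&curval, uaddr, oldval, newval);
            if (ret)
                    return ret;
            return curval == oldval ? 0 : -EAGAIN;
    }
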
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index b3bfabc258f..c1a1216e29c 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -11,6 +11,7 @@ extern char _sinittext[], _einittext[];
11extern char _end[]; 11extern char _end[];
12extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[]; 12extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
13extern char __kprobes_text_start[], __kprobes_text_end[]; 13extern char __kprobes_text_start[], __kprobes_text_end[];
14extern char __entry_text_start[], __entry_text_end[];
14extern char __initdata_begin[], __initdata_end[]; 15extern char __initdata_begin[], __initdata_end[];
15extern char __start_rodata[], __end_rodata[]; 16extern char __start_rodata[], __end_rodata[];
16 17
diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h
index b969770196c..57af0338d27 100644
--- a/include/asm-generic/unistd.h
+++ b/include/asm-generic/unistd.h
@@ -646,9 +646,13 @@ __SYSCALL(__NR_prlimit64, sys_prlimit64)
646__SYSCALL(__NR_fanotify_init, sys_fanotify_init) 646__SYSCALL(__NR_fanotify_init, sys_fanotify_init)
647#define __NR_fanotify_mark 263 647#define __NR_fanotify_mark 263
648__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) 648__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
649#define __NR_name_to_handle_at 264
650__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
651#define __NR_open_by_handle_at 265
652__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
649 653
650#undef __NR_syscalls 654#undef __NR_syscalls
651#define __NR_syscalls 264 655#define __NR_syscalls 266
652 656
653/* 657/*
654 * All syscalls below here should go away really, 658 * All syscalls below here should go away really,
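
Syscalls 264/265 above are the file-handle pair this series wires up. There were no libc wrappers at the time, so a userspace sketch has to go through syscall(2); the structure below mirrors the struct file_handle layout the series introduces (handle_bytes, handle_type, flexible handle data) and is declared by hand here as an assumption:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef __NR_name_to_handle_at
    #define __NR_name_to_handle_at 264      /* asm-generic number, per the hunk above */
    #endif

    #define MY_MAX_HANDLE_SZ 128            /* MAX_HANDLE_SZ, exportfs.h hunk below */

    struct my_file_handle {                 /* assumed layout, see lead-in */
            unsigned int handle_bytes;
            int handle_type;
            unsigned char f_handle[MY_MAX_HANDLE_SZ];
    };

    int main(int argc, char **argv)
    {
            struct my_file_handle fh = { .handle_bytes = MY_MAX_HANDLE_SZ };
            int mount_id;

            if (argc < 2)
                    return 2;
            if (syscall(__NR_name_to_handle_at, AT_FDCWD, argv[1],
                        &fh, &mount_id, 0) < 0) {
                    perror("name_to_handle_at");
                    return 1;
            }
            printf("type=%d bytes=%u mount_id=%d\n",
                   fh.handle_type, fh.handle_bytes, mount_id);
            return 0;
    }
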
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index fe77e3395b4..906c3ceca9a 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -424,6 +424,12 @@
424 *(.kprobes.text) \ 424 *(.kprobes.text) \
425 VMLINUX_SYMBOL(__kprobes_text_end) = .; 425 VMLINUX_SYMBOL(__kprobes_text_end) = .;
426 426
427#define ENTRY_TEXT \
428 ALIGN_FUNCTION(); \
429 VMLINUX_SYMBOL(__entry_text_start) = .; \
430 *(.entry.text) \
431 VMLINUX_SYMBOL(__entry_text_end) = .;
432
427#ifdef CONFIG_FUNCTION_GRAPH_TRACER 433#ifdef CONFIG_FUNCTION_GRAPH_TRACER
428#define IRQENTRY_TEXT \ 434#define IRQENTRY_TEXT \
429 ALIGN_FUNCTION(); \ 435 ALIGN_FUNCTION(); \
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ce104e33cd2..e654fa23991 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -474,7 +474,8 @@ struct cgroup_subsys {
474 struct cgroup *old_cgrp, struct task_struct *tsk, 474 struct cgroup *old_cgrp, struct task_struct *tsk,
475 bool threadgroup); 475 bool threadgroup);
476 void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); 476 void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
477 void (*exit)(struct cgroup_subsys *ss, struct task_struct *task); 477 void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
478 struct cgroup *old_cgrp, struct task_struct *task);
478 int (*populate)(struct cgroup_subsys *ss, 479 int (*populate)(struct cgroup_subsys *ss,
479 struct cgroup *cgrp); 480 struct cgroup *cgrp);
480 void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp); 481 void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp);
@@ -626,6 +627,7 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,
626/* Get id and depth of css */ 627/* Get id and depth of css */
627unsigned short css_id(struct cgroup_subsys_state *css); 628unsigned short css_id(struct cgroup_subsys_state *css);
628unsigned short css_depth(struct cgroup_subsys_state *css); 629unsigned short css_depth(struct cgroup_subsys_state *css);
630struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
629 631
630#else /* !CONFIG_CGROUPS */ 632#else /* !CONFIG_CGROUPS */
631 633
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ccefff02b6c..cdbfcb8780e 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -65,4 +65,8 @@ SUBSYS(net_cls)
65SUBSYS(blkio) 65SUBSYS(blkio)
66#endif 66#endif
67 67
68#ifdef CONFIG_CGROUP_PERF
69SUBSYS(perf)
70#endif
71
68/* */ 72/* */
diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h
index 597692f1fc8..65970b811e2 100644
--- a/include/linux/debugobjects.h
+++ b/include/linux/debugobjects.h
@@ -34,7 +34,10 @@ struct debug_obj {
34 34
35/** 35/**
36 * struct debug_obj_descr - object type specific debug description structure 36 * struct debug_obj_descr - object type specific debug description structure
37 *
37 * @name: name of the object type 38 * @name: name of the object type
39 * @debug_hint: function returning an address which has an associated
40 * kernel symbol, to allow identifying the object
38 * @fixup_init: fixup function, which is called when the init check 41 * @fixup_init: fixup function, which is called when the init check
39 * fails 42 * fails
40 * @fixup_activate: fixup function, which is called when the activate check 43 * @fixup_activate: fixup function, which is called when the activate check
@@ -46,7 +49,7 @@ struct debug_obj {
46 */ 49 */
47struct debug_obj_descr { 50struct debug_obj_descr {
48 const char *name; 51 const char *name;
49 52 void *(*debug_hint) (void *addr);
50 int (*fixup_init) (void *addr, enum debug_obj_state state); 53 int (*fixup_init) (void *addr, enum debug_obj_state state);
51 int (*fixup_activate) (void *addr, enum debug_obj_state state); 54 int (*fixup_activate) (void *addr, enum debug_obj_state state);
52 int (*fixup_destroy) (void *addr, enum debug_obj_state state); 55 int (*fixup_destroy) (void *addr, enum debug_obj_state state);
diff --git a/include/linux/device.h b/include/linux/device.h
index 1bf5cf0b451..ca5d25225aa 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -128,9 +128,7 @@ struct device_driver {
128 128
129 bool suppress_bind_attrs; /* disables bind/unbind via sysfs */ 129 bool suppress_bind_attrs; /* disables bind/unbind via sysfs */
130 130
131#if defined(CONFIG_OF)
132 const struct of_device_id *of_match_table; 131 const struct of_device_id *of_match_table;
133#endif
134 132
135 int (*probe) (struct device *dev); 133 int (*probe) (struct device *dev);
136 int (*remove) (struct device *dev); 134 int (*remove) (struct device *dev);
@@ -441,9 +439,8 @@ struct device {
441 override */ 439 override */
442 /* arch specific additions */ 440 /* arch specific additions */
443 struct dev_archdata archdata; 441 struct dev_archdata archdata;
444#ifdef CONFIG_OF 442
445 struct device_node *of_node; 443 struct device_node *of_node; /* associated device tree node */
446#endif
447 444
448 dev_t devt; /* dev_t, creates the sysfs "dev" */ 445 dev_t devt; /* dev_t, creates the sysfs "dev" */
449 446
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 28028988c86..33a42f24b27 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -8,6 +8,9 @@ struct inode;
8struct super_block; 8struct super_block;
9struct vfsmount; 9struct vfsmount;
10 10
11/* limit the handle size to NFSv4 handle size now */
12#define MAX_HANDLE_SZ 128
13
11/* 14/*
12 * The fileid_type identifies how the file within the filesystem is encoded. 15 * The fileid_type identifies how the file within the filesystem is encoded.
13 * In theory this is freely set and parsed by the filesystem, but we try to 16 * In theory this is freely set and parsed by the filesystem, but we try to
@@ -121,8 +124,10 @@ struct fid {
121 * set, the encode_fh() should store sufficient information so that a good 124 * set, the encode_fh() should store sufficient information so that a good
122 * attempt can be made to find not only the file but also its place in the 125 * attempt can be made to find not only the file but also its place in the
123 * filesystem. This typically means storing a reference to de->d_parent in 126 * filesystem. This typically means storing a reference to de->d_parent in
124 * the filehandle fragment. encode_fh() should return the number of bytes 127 * the filehandle fragment. encode_fh() should return the fileid_type on
125 * stored or a negative error code such as %-ENOSPC 128 * success; on error it returns 255 (if the space needed to encode the fh is
129 * greater than @max_len*4 bytes). On error @max_len contains the minimum
130 * size (in 4-byte units) needed to encode the file handle.
126 * 131 *
127 * fh_to_dentry: 132 * fh_to_dentry:
128 * @fh_to_dentry is given a &struct super_block (@sb) and a file handle 133 * @fh_to_dentry is given a &struct super_block (@sb) and a file handle
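
A minimal sketch of an encode_fh() following the convention described above: return the fileid_type on success; on overflow return 255 and write the required word count back through @max_len. The helper name and the ino/generation layout are illustrative, not part of this patch:

#include <linux/exportfs.h>
#include <linux/fs.h>

static int sketch_encode_fh(struct dentry *de, __u32 *fh, int *max_len,
			    int connectable)
{
	int len = connectable ? 3 : 2;	/* length in 4-byte words */

	if (*max_len < len) {
		*max_len = len;		/* report the minimum size needed */
		return 255;		/* error: not enough space */
	}
	fh[0] = de->d_inode->i_ino;
	fh[1] = de->d_inode->i_generation;
	if (connectable)
		fh[2] = parent_ino(de);	/* reference to de->d_parent */
	*max_len = len;
	return connectable ? FILEID_INO32_GEN_PARENT : FILEID_INO32_GEN;
}
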
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index a562fa5fb4e..f550f894ba1 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -46,6 +46,7 @@
46 unlinking file. */ 46 unlinking file. */
47#define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ 47#define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */
48#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */ 48#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */
49#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */
49 50
50#ifdef __KERNEL__ 51#ifdef __KERNEL__
51 52
diff --git a/include/linux/file.h b/include/linux/file.h
index e85baebf627..21a79958541 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -29,6 +29,8 @@ static inline void fput_light(struct file *file, int fput_needed)
29 29
30extern struct file *fget(unsigned int fd); 30extern struct file *fget(unsigned int fd);
31extern struct file *fget_light(unsigned int fd, int *fput_needed); 31extern struct file *fget_light(unsigned int fd, int *fput_needed);
32extern struct file *fget_raw(unsigned int fd);
33extern struct file *fget_raw_light(unsigned int fd, int *fput_needed);
32extern void set_close_on_exec(unsigned int fd, int flag); 34extern void set_close_on_exec(unsigned int fd, int flag);
33extern void put_filp(struct file *); 35extern void put_filp(struct file *);
34extern int alloc_fd(unsigned start, unsigned flags); 36extern int alloc_fd(unsigned start, unsigned flags);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e38b50a4b9d..13df14e2c42 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -102,6 +102,9 @@ struct inodes_stat_t {
102/* File is huge (eg. /dev/kmem): treat loff_t as unsigned */ 102/* File is huge (eg. /dev/kmem): treat loff_t as unsigned */
103#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) 103#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000)
104 104
105/* File is opened with O_PATH; almost nothing can be done with it */
106#define FMODE_PATH ((__force fmode_t)0x4000)
107
105/* File was opened by fanotify and shouldn't generate fanotify events */ 108/* File was opened by fanotify and shouldn't generate fanotify events */
106#define FMODE_NONOTIFY ((__force fmode_t)0x1000000) 109#define FMODE_NONOTIFY ((__force fmode_t)0x1000000)
107 110
@@ -978,6 +981,13 @@ struct file {
978#endif 981#endif
979}; 982};
980 983
984struct file_handle {
985 __u32 handle_bytes;
986 int handle_type;
987 /* file identifier */
988 unsigned char f_handle[0];
989};
990
981#define get_file(x) atomic_long_inc(&(x)->f_count) 991#define get_file(x) atomic_long_inc(&(x)->f_count)
982#define fput_atomic(x) atomic_long_add_unless(&(x)->f_count, -1, 1) 992#define fput_atomic(x) atomic_long_add_unless(&(x)->f_count, -1, 1)
983#define file_count(x) atomic_long_read(&(x)->f_count) 993#define file_count(x) atomic_long_read(&(x)->f_count)
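
Since @f_handle is a zero-length trailing array, handle buffers must be allocated with room for the opaque bytes; a kernel-side sketch (function name hypothetical), capped at the NFSv4-derived MAX_HANDLE_SZ from exportfs.h:

#include <linux/exportfs.h>
#include <linux/slab.h>

static struct file_handle *sketch_alloc_handle(u32 bytes)
{
	struct file_handle *fh;

	if (bytes > MAX_HANDLE_SZ)
		return NULL;
	fh = kzalloc(sizeof(*fh) + bytes, GFP_KERNEL);	/* header + payload */
	if (fh)
		fh->handle_bytes = bytes;
	return fh;
}
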
@@ -1401,6 +1411,7 @@ struct super_block {
1401 wait_queue_head_t s_wait_unfrozen; 1411 wait_queue_head_t s_wait_unfrozen;
1402 1412
1403 char s_id[32]; /* Informational name */ 1413 char s_id[32]; /* Informational name */
1414 u8 s_uuid[16]; /* UUID */
1404 1415
1405 void *s_fs_info; /* Filesystem private info */ 1416 void *s_fs_info; /* Filesystem private info */
1406 fmode_t s_mode; 1417 fmode_t s_mode;
@@ -1874,6 +1885,8 @@ extern void drop_collected_mounts(struct vfsmount *);
1874extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, 1885extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
1875 struct vfsmount *); 1886 struct vfsmount *);
1876extern int vfs_statfs(struct path *, struct kstatfs *); 1887extern int vfs_statfs(struct path *, struct kstatfs *);
1888extern int user_statfs(const char __user *, struct kstatfs *);
1889extern int fd_statfs(int, struct kstatfs *);
1877extern int statfs_by_dentry(struct dentry *, struct kstatfs *); 1890extern int statfs_by_dentry(struct dentry *, struct kstatfs *);
1878extern int freeze_super(struct super_block *super); 1891extern int freeze_super(struct super_block *super);
1879extern int thaw_super(struct super_block *super); 1892extern int thaw_super(struct super_block *super);
@@ -1990,6 +2003,8 @@ extern int do_fallocate(struct file *file, int mode, loff_t offset,
1990extern long do_sys_open(int dfd, const char __user *filename, int flags, 2003extern long do_sys_open(int dfd, const char __user *filename, int flags,
1991 int mode); 2004 int mode);
1992extern struct file *filp_open(const char *, int, int); 2005extern struct file *filp_open(const char *, int, int);
2006extern struct file *file_open_root(struct dentry *, struct vfsmount *,
2007 const char *, int);
1993extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, 2008extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
1994 const struct cred *); 2009 const struct cred *);
1995extern int filp_close(struct file *, fl_owner_t id); 2010extern int filp_close(struct file *, fl_owner_t id);
@@ -2205,10 +2220,6 @@ extern struct file *create_read_pipe(struct file *f, int flags);
2205extern struct file *create_write_pipe(int flags); 2220extern struct file *create_write_pipe(int flags);
2206extern void free_write_pipe(struct file *); 2221extern void free_write_pipe(struct file *);
2207 2222
2208extern struct file *do_filp_open(int dfd, const char *pathname,
2209 int open_flag, int mode, int acc_mode);
2210extern int may_open(struct path *, int, int);
2211
2212extern int kernel_read(struct file *, loff_t, char *, unsigned long); 2223extern int kernel_read(struct file *, loff_t, char *, unsigned long);
2213extern struct file * open_exec(const char *); 2224extern struct file * open_exec(const char *);
2214 2225
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index dcd6a7c3a43..ca29e03c1fa 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -428,6 +428,7 @@ extern void unregister_ftrace_graph(void);
428 428
429extern void ftrace_graph_init_task(struct task_struct *t); 429extern void ftrace_graph_init_task(struct task_struct *t);
430extern void ftrace_graph_exit_task(struct task_struct *t); 430extern void ftrace_graph_exit_task(struct task_struct *t);
431extern void ftrace_graph_init_idle_task(struct task_struct *t, int cpu);
431 432
432static inline int task_curr_ret_stack(struct task_struct *t) 433static inline int task_curr_ret_stack(struct task_struct *t)
433{ 434{
@@ -451,6 +452,7 @@ static inline void unpause_graph_tracing(void)
451 452
452static inline void ftrace_graph_init_task(struct task_struct *t) { } 453static inline void ftrace_graph_init_task(struct task_struct *t) { }
453static inline void ftrace_graph_exit_task(struct task_struct *t) { } 454static inline void ftrace_graph_exit_task(struct task_struct *t) { }
455static inline void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) { }
454 456
455static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc, 457static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc,
456 trace_func_graph_ent_t entryfunc) 458 trace_func_graph_ent_t entryfunc)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 47e3997f7b5..22b32af1b5e 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -37,7 +37,6 @@ struct trace_entry {
37 unsigned char flags; 37 unsigned char flags;
38 unsigned char preempt_count; 38 unsigned char preempt_count;
39 int pid; 39 int pid;
40 int lock_depth;
41}; 40};
42 41
43#define FTRACE_MAX_EVENT \ 42#define FTRACE_MAX_EVENT \
@@ -208,7 +207,6 @@ struct ftrace_event_call {
208 207
209#define PERF_MAX_TRACE_SIZE 2048 208#define PERF_MAX_TRACE_SIZE 2048
210 209
211#define MAX_FILTER_PRED 32
212#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ 210#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */
213 211
214extern void destroy_preds(struct ftrace_event_call *call); 212extern void destroy_preds(struct ftrace_event_call *call);
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index f376ddc64c4..62f500c724f 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -54,11 +54,13 @@ enum hrtimer_restart {
54 * 0x00 inactive 54 * 0x00 inactive
55 * 0x01 enqueued into rbtree 55 * 0x01 enqueued into rbtree
56 * 0x02 callback function running 56 * 0x02 callback function running
57 * 0x04 timer is migrated to another cpu
57 * 58 *
58 * Special cases: 59 * Special cases:
59 * 0x03 callback function running and enqueued 60 * 0x03 callback function running and enqueued
60 * (was requeued on another CPU) 61 * (was requeued on another CPU)
61 * 0x09 timer was migrated on CPU hotunplug 62 * 0x05 timer was migrated on CPU hotunplug
63 *
62 * The "callback function running and enqueued" status is only possible on 64 * The "callback function running and enqueued" status is only possible on
63 * SMP. It happens for example when a posix timer expired and the callback 65 * SMP. It happens for example when a posix timer expired and the callback
64 * queued a signal. Between dropping the lock which protects the posix timer 66 * queued a signal. Between dropping the lock which protects the posix timer
@@ -67,8 +69,11 @@ enum hrtimer_restart {
67 * as otherwise the timer could be removed before the softirq code finishes the 69 * as otherwise the timer could be removed before the softirq code finishes the
68 * handling of the timer. 70 * handling of the timer.
69 * 71 *
70 * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state to 72 * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
71 * preserve the HRTIMER_STATE_CALLBACK bit in the above scenario. 73 * to preserve the HRTIMER_STATE_CALLBACK bit in the above scenario. This
74 * also affects HRTIMER_STATE_MIGRATE where the preservation is not
75 * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
76 * enqueued on the new cpu.
72 * 77 *
73 * All state transitions are protected by cpu_base->lock. 78 * All state transitions are protected by cpu_base->lock.
74 */ 79 */
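
In code terms, the composite states listed above are plain bit ORs of the values documented in this header; a sketch, assuming the HRTIMER_STATE_* constants defined here:

#include <linux/hrtimer.h>

static void sketch_hrtimer_states(void)
{
	/* 0x03: callback running and re-enqueued (SMP only) */
	unsigned long cb_queued = HRTIMER_STATE_CALLBACK | HRTIMER_STATE_ENQUEUED;
	/* 0x05: migrated on CPU hotunplug, ENQUEUED preserved */
	unsigned long migrated  = HRTIMER_STATE_MIGRATE | HRTIMER_STATE_ENQUEUED;

	/* hrtimer_active() below is just "state != HRTIMER_STATE_INACTIVE",
	 * so both composites count as active. */
	(void)cb_queued;
	(void)migrated;
}
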
@@ -148,7 +153,12 @@ struct hrtimer_clock_base {
148#endif 153#endif
149}; 154};
150 155
151#define HRTIMER_MAX_CLOCK_BASES 2 156enum hrtimer_base_type {
157 HRTIMER_BASE_REALTIME,
158 HRTIMER_BASE_MONOTONIC,
159 HRTIMER_BASE_BOOTTIME,
160 HRTIMER_MAX_CLOCK_BASES,
161};
152 162
153/* 163/*
154 * struct hrtimer_cpu_base - the per cpu clock bases 164 * struct hrtimer_cpu_base - the per cpu clock bases
@@ -308,6 +318,7 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
308 318
309extern ktime_t ktime_get(void); 319extern ktime_t ktime_get(void);
310extern ktime_t ktime_get_real(void); 320extern ktime_t ktime_get_real(void);
321extern ktime_t ktime_get_boottime(void);
311 322
312 323
313DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 324DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
@@ -370,8 +381,9 @@ extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp);
370extern ktime_t hrtimer_get_next_event(void); 381extern ktime_t hrtimer_get_next_event(void);
371 382
372/* 383/*
373 * A timer is active, when it is enqueued into the rbtree or the callback 384 * A timer is active when it is enqueued into the rbtree, when the
374 * function is running. 385 * callback function is running, or when it is being migrated
386 * to another cpu.
375 */ 387 */
376static inline int hrtimer_active(const struct hrtimer *timer) 388static inline int hrtimer_active(const struct hrtimer *timer)
377{ 389{
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 903576df88d..06a8d9c7de9 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -258,9 +258,7 @@ struct i2c_board_info {
258 unsigned short addr; 258 unsigned short addr;
259 void *platform_data; 259 void *platform_data;
260 struct dev_archdata *archdata; 260 struct dev_archdata *archdata;
261#ifdef CONFIG_OF
262 struct device_node *of_node; 261 struct device_node *of_node;
263#endif
264 int irq; 262 int irq;
265}; 263};
266 264
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 55e0d4253e4..59b72ca1c5d 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -14,6 +14,8 @@
14#include <linux/smp.h> 14#include <linux/smp.h>
15#include <linux/percpu.h> 15#include <linux/percpu.h>
16#include <linux/hrtimer.h> 16#include <linux/hrtimer.h>
17#include <linux/kref.h>
18#include <linux/workqueue.h>
17 19
18#include <asm/atomic.h> 20#include <asm/atomic.h>
19#include <asm/ptrace.h> 21#include <asm/ptrace.h>
@@ -55,7 +57,8 @@
55 * Used by threaded interrupts which need to keep the 57 * Used by threaded interrupts which need to keep the
56 * irq line disabled until the threaded handler has been run. 58 * irq line disabled until the threaded handler has been run.
57 * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend 59 * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend
58 * 60 * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set
61 * IRQF_NO_THREAD - Interrupt cannot be threaded
59 */ 62 */
60#define IRQF_DISABLED 0x00000020 63#define IRQF_DISABLED 0x00000020
61#define IRQF_SAMPLE_RANDOM 0x00000040 64#define IRQF_SAMPLE_RANDOM 0x00000040
@@ -67,22 +70,10 @@
67#define IRQF_IRQPOLL 0x00001000 70#define IRQF_IRQPOLL 0x00001000
68#define IRQF_ONESHOT 0x00002000 71#define IRQF_ONESHOT 0x00002000
69#define IRQF_NO_SUSPEND 0x00004000 72#define IRQF_NO_SUSPEND 0x00004000
73#define IRQF_FORCE_RESUME 0x00008000
74#define IRQF_NO_THREAD 0x00010000
70 75
71#define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND) 76#define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
72
73/*
74 * Bits used by threaded handlers:
75 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
76 * IRQTF_DIED - handler thread died
77 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
78 * IRQTF_AFFINITY - irq thread is requested to adjust affinity
79 */
80enum {
81 IRQTF_RUNTHREAD,
82 IRQTF_DIED,
83 IRQTF_WARNED,
84 IRQTF_AFFINITY,
85};
86 77
87/* 78/*
88 * These values can be returned by request_any_context_irq() and 79 * These values can be returned by request_any_context_irq() and
@@ -110,6 +101,7 @@ typedef irqreturn_t (*irq_handler_t)(int, void *);
110 * @thread_fn: interrupt handler function for threaded interrupts 101 * @thread_fn: interrupt handler function for threaded interrupts
111 * @thread: thread pointer for threaded interrupts 102 * @thread: thread pointer for threaded interrupts
112 * @thread_flags: flags related to @thread 103 * @thread_flags: flags related to @thread
104 * @thread_mask: bitmask for keeping track of @thread activity
113 */ 105 */
114struct irqaction { 106struct irqaction {
115 irq_handler_t handler; 107 irq_handler_t handler;
@@ -120,6 +112,7 @@ struct irqaction {
120 irq_handler_t thread_fn; 112 irq_handler_t thread_fn;
121 struct task_struct *thread; 113 struct task_struct *thread;
122 unsigned long thread_flags; 114 unsigned long thread_flags;
115 unsigned long thread_mask;
123 const char *name; 116 const char *name;
124 struct proc_dir_entry *dir; 117 struct proc_dir_entry *dir;
125} ____cacheline_internodealigned_in_smp; 118} ____cacheline_internodealigned_in_smp;
@@ -240,6 +233,35 @@ extern int irq_can_set_affinity(unsigned int irq);
240extern int irq_select_affinity(unsigned int irq); 233extern int irq_select_affinity(unsigned int irq);
241 234
242extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m); 235extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m);
236
237/**
238 * struct irq_affinity_notify - context for notification of IRQ affinity changes
239 * @irq: Interrupt to which notification applies
240 * @kref: Reference count, for internal use
241 * @work: Work item, for internal use
242 * @notify: Function to be called on change. This will be
243 * called in process context.
244 * @release: Function to be called on release. This will be
245 * called in process context. Once registered, the
246 * structure must only be freed when this function is
247 * called or later.
248 */
249struct irq_affinity_notify {
250 unsigned int irq;
251 struct kref kref;
252 struct work_struct work;
253 void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
254 void (*release)(struct kref *ref);
255};
256
257extern int
258irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
259
260static inline void irq_run_affinity_notifiers(void)
261{
262 flush_scheduled_work();
263}
264
243#else /* CONFIG_SMP */ 265#else /* CONFIG_SMP */
244 266
245static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m) 267static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
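
A hypothetical driver-side use of the notifier interface added here; every my_* name is illustrative:

#include <linux/interrupt.h>

static void my_affinity_notify(struct irq_affinity_notify *notify,
			       const cpumask_t *mask)
{
	/* process context: retarget per-CPU resources to @mask here */
}

static void my_affinity_release(struct kref *ref)
{
	/* last reference gone; the structure may be freed from now on */
}

static struct irq_affinity_notify my_notify = {
	.notify  = my_affinity_notify,
	.release = my_affinity_release,
};

static int my_probe_irq(unsigned int irq)
{
	/* the core initializes ->irq, ->kref and ->work itself */
	return irq_set_affinity_notifier(irq, &my_notify);
}
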
@@ -255,7 +277,7 @@ static inline int irq_can_set_affinity(unsigned int irq)
255static inline int irq_select_affinity(unsigned int irq) { return 0; } 277static inline int irq_select_affinity(unsigned int irq) { return 0; }
256 278
257static inline int irq_set_affinity_hint(unsigned int irq, 279static inline int irq_set_affinity_hint(unsigned int irq,
258 const struct cpumask *m) 280 const struct cpumask *m)
259{ 281{
260 return -EINVAL; 282 return -EINVAL;
261} 283}
@@ -314,16 +336,24 @@ static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long
314} 336}
315 337
316/* IRQ wakeup (PM) control: */ 338/* IRQ wakeup (PM) control: */
317extern int set_irq_wake(unsigned int irq, unsigned int on); 339extern int irq_set_irq_wake(unsigned int irq, unsigned int on);
340
341#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
342/* Please do not use: Use the replacement functions instead */
343static inline int set_irq_wake(unsigned int irq, unsigned int on)
344{
345 return irq_set_irq_wake(irq, on);
346}
347#endif
318 348
319static inline int enable_irq_wake(unsigned int irq) 349static inline int enable_irq_wake(unsigned int irq)
320{ 350{
321 return set_irq_wake(irq, 1); 351 return irq_set_irq_wake(irq, 1);
322} 352}
323 353
324static inline int disable_irq_wake(unsigned int irq) 354static inline int disable_irq_wake(unsigned int irq)
325{ 355{
326 return set_irq_wake(irq, 0); 356 return irq_set_irq_wake(irq, 0);
327} 357}
328 358
329#else /* !CONFIG_GENERIC_HARDIRQS */ 359#else /* !CONFIG_GENERIC_HARDIRQS */
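
A sketch of the usual caller pattern for the wake helpers, which now route through irq_set_irq_wake(); the driver names and private data are hypothetical:

#include <linux/device.h>
#include <linux/interrupt.h>

struct my_priv { int irq; };		/* hypothetical driver data */

static int my_suspend(struct device *dev)
{
	struct my_priv *p = dev_get_drvdata(dev);

	if (device_may_wakeup(dev))
		enable_irq_wake(p->irq);	/* irq_set_irq_wake(irq, 1) */
	return 0;
}

static int my_resume(struct device *dev)
{
	struct my_priv *p = dev_get_drvdata(dev);

	if (device_may_wakeup(dev))
		disable_irq_wake(p->irq);	/* irq_set_irq_wake(irq, 0) */
	return 0;
}
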
@@ -353,6 +383,13 @@ static inline int disable_irq_wake(unsigned int irq)
353} 383}
354#endif /* CONFIG_GENERIC_HARDIRQS */ 384#endif /* CONFIG_GENERIC_HARDIRQS */
355 385
386
387#ifdef CONFIG_IRQ_FORCED_THREADING
388extern bool force_irqthreads;
389#else
390#define force_irqthreads (0)
391#endif
392
356#ifndef __ARCH_SET_SOFTIRQ_PENDING 393#ifndef __ARCH_SET_SOFTIRQ_PENDING
357#define set_softirq_pending(x) (local_softirq_pending() = (x)) 394#define set_softirq_pending(x) (local_softirq_pending() = (x))
358#define or_softirq_pending(x) (local_softirq_pending() |= (x)) 395#define or_softirq_pending(x) (local_softirq_pending() |= (x))
@@ -426,6 +463,13 @@ extern void raise_softirq(unsigned int nr);
426 */ 463 */
427DECLARE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); 464DECLARE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
428 465
466DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
467
468static inline struct task_struct *this_cpu_ksoftirqd(void)
469{
470 return this_cpu_read(ksoftirqd);
471}
472
429/* Try to send a softirq to a remote cpu. If this cannot be done, the 473/* Try to send a softirq to a remote cpu. If this cannot be done, the
430 * work will be queued to the local cpu. 474 * work will be queued to the local cpu.
431 */ 475 */
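
A sketch of the check this accessor enables, e.g. in accounting code that must attribute time spent in the softirq daemon:

#include <linux/interrupt.h>
#include <linux/sched.h>

static inline bool sketch_in_ksoftirqd(void)
{
	/* is the current task this CPU's softirq daemon? */
	return current == this_cpu_ksoftirqd();
}
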
@@ -645,6 +689,7 @@ static inline void init_irq_proc(void)
645 689
646struct seq_file; 690struct seq_file;
647int show_interrupts(struct seq_file *p, void *v); 691int show_interrupts(struct seq_file *p, void *v);
692int arch_show_interrupts(struct seq_file *p, int prec);
648 693
649extern int early_irq_init(void); 694extern int early_irq_init(void);
650extern int arch_probe_nr_irqs(void); 695extern int arch_probe_nr_irqs(void);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 80fcb53057b..1d3577f30d4 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -29,61 +29,104 @@
29#include <asm/irq_regs.h> 29#include <asm/irq_regs.h>
30 30
31struct irq_desc; 31struct irq_desc;
32struct irq_data;
32typedef void (*irq_flow_handler_t)(unsigned int irq, 33typedef void (*irq_flow_handler_t)(unsigned int irq,
33 struct irq_desc *desc); 34 struct irq_desc *desc);
34 35typedef void (*irq_preflow_handler_t)(struct irq_data *data);
35 36
36/* 37/*
37 * IRQ line status. 38 * IRQ line status.
38 * 39 *
39 * Bits 0-7 are reserved for the IRQF_* bits in linux/interrupt.h 40 * Bits 0-7 are the same as the IRQF_* bits in linux/interrupt.h
41 *
42 * IRQ_TYPE_NONE - default, unspecified type
43 * IRQ_TYPE_EDGE_RISING - rising edge triggered
44 * IRQ_TYPE_EDGE_FALLING - falling edge triggered
45 * IRQ_TYPE_EDGE_BOTH - rising and falling edge triggered
46 * IRQ_TYPE_LEVEL_HIGH - high level triggered
47 * IRQ_TYPE_LEVEL_LOW - low level triggered
48 * IRQ_TYPE_LEVEL_MASK - Mask to filter out the level bits
49 * IRQ_TYPE_SENSE_MASK - Mask for all the above bits
50 * IRQ_TYPE_PROBE - Special flag for probing in progress
51 *
52 * Bits which can be modified via irq_set/clear/modify_status_flags()
53 * IRQ_LEVEL - Interrupt is level type. Will also be
54 * updated in the code when the above trigger
55 * bits are modified via set_irq_type()
56 * IRQ_PER_CPU - Mark an interrupt PER_CPU. Will protect
57 * it from affinity setting
58 * IRQ_NOPROBE - Interrupt cannot be probed by autoprobing
59 * IRQ_NOREQUEST - Interrupt cannot be requested via
60 * request_irq()
61 * IRQ_NOAUTOEN - Interrupt is not automatically enabled in
62 * request/setup_irq()
63 * IRQ_NO_BALANCING - Interrupt cannot be balanced (affinity set)
64 * IRQ_MOVE_PCNTXT - Interrupt can be migrated from process context
65 * IRQ_NESTED_THREAD - Interrupt nests into another thread
66 *
67 * Deprecated bits. They are kept updated as long as
68 * CONFIG_GENERIC_HARDIRQS_NO_COMPAT is not set. Will go away soon. These bits
69 * are internal state of the core code and if you really need to acces
70 * them then talk to the genirq maintainer instead of hacking
71 * something weird.
40 * 72 *
41 * IRQ types
42 */ 73 */
43#define IRQ_TYPE_NONE 0x00000000 /* Default, unspecified type */ 74enum {
44#define IRQ_TYPE_EDGE_RISING 0x00000001 /* Edge rising type */ 75 IRQ_TYPE_NONE = 0x00000000,
45#define IRQ_TYPE_EDGE_FALLING 0x00000002 /* Edge falling type */ 76 IRQ_TYPE_EDGE_RISING = 0x00000001,
46#define IRQ_TYPE_EDGE_BOTH (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING) 77 IRQ_TYPE_EDGE_FALLING = 0x00000002,
47#define IRQ_TYPE_LEVEL_HIGH 0x00000004 /* Level high type */ 78 IRQ_TYPE_EDGE_BOTH = (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING),
48#define IRQ_TYPE_LEVEL_LOW 0x00000008 /* Level low type */ 79 IRQ_TYPE_LEVEL_HIGH = 0x00000004,
49#define IRQ_TYPE_SENSE_MASK 0x0000000f /* Mask of the above */ 80 IRQ_TYPE_LEVEL_LOW = 0x00000008,
50#define IRQ_TYPE_PROBE 0x00000010 /* Probing in progress */ 81 IRQ_TYPE_LEVEL_MASK = (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH),
51 82 IRQ_TYPE_SENSE_MASK = 0x0000000f,
52/* Internal flags */ 83
53#define IRQ_INPROGRESS 0x00000100 /* IRQ handler active - do not enter! */ 84 IRQ_TYPE_PROBE = 0x00000010,
54#define IRQ_DISABLED 0x00000200 /* IRQ disabled - do not enter! */ 85
55#define IRQ_PENDING 0x00000400 /* IRQ pending - replay on enable */ 86 IRQ_LEVEL = (1 << 8),
56#define IRQ_REPLAY 0x00000800 /* IRQ has been replayed but not acked yet */ 87 IRQ_PER_CPU = (1 << 9),
57#define IRQ_AUTODETECT 0x00001000 /* IRQ is being autodetected */ 88 IRQ_NOPROBE = (1 << 10),
58#define IRQ_WAITING 0x00002000 /* IRQ not yet seen - for autodetection */ 89 IRQ_NOREQUEST = (1 << 11),
59#define IRQ_LEVEL 0x00004000 /* IRQ level triggered */ 90 IRQ_NOAUTOEN = (1 << 12),
60#define IRQ_MASKED 0x00008000 /* IRQ masked - shouldn't be seen again */ 91 IRQ_NO_BALANCING = (1 << 13),
61#define IRQ_PER_CPU 0x00010000 /* IRQ is per CPU */ 92 IRQ_MOVE_PCNTXT = (1 << 14),
62#define IRQ_NOPROBE 0x00020000 /* IRQ is not valid for probing */ 93 IRQ_NESTED_THREAD = (1 << 15),
63#define IRQ_NOREQUEST 0x00040000 /* IRQ cannot be requested */ 94
64#define IRQ_NOAUTOEN 0x00080000 /* IRQ will not be enabled on request irq */ 95#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
65#define IRQ_WAKEUP 0x00100000 /* IRQ triggers system wakeup */ 96 IRQ_INPROGRESS = (1 << 16),
66#define IRQ_MOVE_PENDING 0x00200000 /* need to re-target IRQ destination */ 97 IRQ_REPLAY = (1 << 17),
67#define IRQ_NO_BALANCING 0x00400000 /* IRQ is excluded from balancing */ 98 IRQ_WAITING = (1 << 18),
68#define IRQ_SPURIOUS_DISABLED 0x00800000 /* IRQ was disabled by the spurious trap */ 99 IRQ_DISABLED = (1 << 19),
69#define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ 100 IRQ_PENDING = (1 << 20),
70#define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/ 101 IRQ_MASKED = (1 << 21),
71#define IRQ_SUSPENDED 0x04000000 /* IRQ has gone through suspend sequence */ 102 IRQ_MOVE_PENDING = (1 << 22),
72#define IRQ_ONESHOT 0x08000000 /* IRQ is not unmasked after hardirq */ 103 IRQ_AFFINITY_SET = (1 << 23),
73#define IRQ_NESTED_THREAD 0x10000000 /* IRQ is nested into another, no own handler thread */ 104 IRQ_WAKEUP = (1 << 24),
105#endif
106};
74 107
75#define IRQF_MODIFY_MASK \ 108#define IRQF_MODIFY_MASK \
76 (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ 109 (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
77 IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \ 110 IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
78 IRQ_PER_CPU) 111 IRQ_PER_CPU | IRQ_NESTED_THREAD)
79 112
80#ifdef CONFIG_IRQ_PER_CPU 113#define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING)
81# define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) 114
82# define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) 115static inline __deprecated bool CHECK_IRQ_PER_CPU(unsigned int status)
83#else 116{
84# define CHECK_IRQ_PER_CPU(var) 0 117 return status & IRQ_PER_CPU;
85# define IRQ_NO_BALANCING_MASK IRQ_NO_BALANCING 118}
86#endif 119
120/*
121 * Return value for chip->irq_set_affinity()
122 *
123 * IRQ_SET_MASK_OK - OK, core updates irq_data.affinity
124 * IRQ_SET_MASK_NOCPY - OK, chip did update irq_data.affinity
125 */
126enum {
127 IRQ_SET_MASK_OK = 0,
128 IRQ_SET_MASK_OK_NOCOPY,
129};
87 130
88struct msi_desc; 131struct msi_desc;
89 132
@@ -91,6 +134,8 @@ struct msi_desc;
91 * struct irq_data - per irq and irq chip data passed down to chip functions 134 * struct irq_data - per irq and irq chip data passed down to chip functions
92 * @irq: interrupt number 135 * @irq: interrupt number
93 * @node: node index useful for balancing 136 * @node: node index useful for balancing
137 * @state_use_accessors: status information for irq chip functions.
138 * Use accessor functions to deal with it
94 * @chip: low level interrupt hardware access 139 * @chip: low level interrupt hardware access
95 * @handler_data: per-IRQ data for the irq_chip methods 140 * @handler_data: per-IRQ data for the irq_chip methods
96 * @chip_data: platform-specific per-chip private data for the chip 141 * @chip_data: platform-specific per-chip private data for the chip
@@ -105,6 +150,7 @@ struct msi_desc;
105struct irq_data { 150struct irq_data {
106 unsigned int irq; 151 unsigned int irq;
107 unsigned int node; 152 unsigned int node;
153 unsigned int state_use_accessors;
108 struct irq_chip *chip; 154 struct irq_chip *chip;
109 void *handler_data; 155 void *handler_data;
110 void *chip_data; 156 void *chip_data;
@@ -114,6 +160,80 @@ struct irq_data {
114#endif 160#endif
115}; 161};
116 162
163/*
164 * Bit masks for irq_data.state
165 *
166 * IRQD_TRIGGER_MASK - Mask for the trigger type bits
167 * IRQD_SETAFFINITY_PENDING - Affinity setting is pending
168 * IRQD_NO_BALANCING - Balancing disabled for this IRQ
169 * IRQD_PER_CPU - Interrupt is per cpu
170 * IRQD_AFFINITY_SET - Interrupt affinity was set
171 * IRQD_LEVEL - Interrupt is level triggered
172 * IRQD_WAKEUP_STATE - Interrupt is configured for wakeup
173 * from suspend
174 * IRQD_MOVE_PCNTXT - Interrupt can be moved in process
175 * context
176 */
177enum {
178 IRQD_TRIGGER_MASK = 0xf,
179 IRQD_SETAFFINITY_PENDING = (1 << 8),
180 IRQD_NO_BALANCING = (1 << 10),
181 IRQD_PER_CPU = (1 << 11),
182 IRQD_AFFINITY_SET = (1 << 12),
183 IRQD_LEVEL = (1 << 13),
184 IRQD_WAKEUP_STATE = (1 << 14),
185 IRQD_MOVE_PCNTXT = (1 << 15),
186};
187
188static inline bool irqd_is_setaffinity_pending(struct irq_data *d)
189{
190 return d->state_use_accessors & IRQD_SETAFFINITY_PENDING;
191}
192
193static inline bool irqd_is_per_cpu(struct irq_data *d)
194{
195 return d->state_use_accessors & IRQD_PER_CPU;
196}
197
198static inline bool irqd_can_balance(struct irq_data *d)
199{
200 return !(d->state_use_accessors & (IRQD_PER_CPU | IRQD_NO_BALANCING));
201}
202
203static inline bool irqd_affinity_was_set(struct irq_data *d)
204{
205 return d->state_use_accessors & IRQD_AFFINITY_SET;
206}
207
208static inline u32 irqd_get_trigger_type(struct irq_data *d)
209{
210 return d->state_use_accessors & IRQD_TRIGGER_MASK;
211}
212
213/*
214 * Must only be called inside irq_chip.irq_set_type() functions.
215 */
216static inline void irqd_set_trigger_type(struct irq_data *d, u32 type)
217{
218 d->state_use_accessors &= ~IRQD_TRIGGER_MASK;
219 d->state_use_accessors |= type & IRQD_TRIGGER_MASK;
220}
221
222static inline bool irqd_is_level_type(struct irq_data *d)
223{
224 return d->state_use_accessors & IRQD_LEVEL;
225}
226
227static inline bool irqd_is_wakeup_set(struct irq_data *d)
228{
229 return d->state_use_accessors & IRQD_WAKEUP_STATE;
230}
231
232static inline bool irqd_can_move_in_process_context(struct irq_data *d)
233{
234 return d->state_use_accessors & IRQD_MOVE_PCNTXT;
235}
236
117/** 237/**
118 * struct irq_chip - hardware interrupt chip descriptor 238 * struct irq_chip - hardware interrupt chip descriptor
119 * 239 *
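
A sketch of a chip ->irq_set_type() callback built on the accessors above; note that irqd_set_trigger_type() may, per the comment, only be called from such functions:

#include <linux/irq.h>

static int my_chip_set_type(struct irq_data *d, unsigned int type)
{
	if (type & ~IRQ_TYPE_SENSE_MASK)
		return -EINVAL;
	/* ... write the edge/level configuration to the controller ... */
	irqd_set_trigger_type(d, type);	/* record it in irq_data.state */
	return 0;
}
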
@@ -150,6 +270,7 @@ struct irq_data {
150 * @irq_set_wake: enable/disable power-management wake-on of an IRQ 270 * @irq_set_wake: enable/disable power-management wake-on of an IRQ
151 * @irq_bus_lock: function to lock access to slow bus (i2c) chips 271 * @irq_bus_lock: function to lock access to slow bus (i2c) chips
152 * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips 272 * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips
273 * @flags: chip specific flags
153 * 274 *
154 * @release: release function solely used by UML 275 * @release: release function solely used by UML
155 */ 276 */
@@ -196,12 +317,27 @@ struct irq_chip {
196 void (*irq_bus_lock)(struct irq_data *data); 317 void (*irq_bus_lock)(struct irq_data *data);
197 void (*irq_bus_sync_unlock)(struct irq_data *data); 318 void (*irq_bus_sync_unlock)(struct irq_data *data);
198 319
320 unsigned long flags;
321
199 /* Currently used only by UML, might disappear one day. */ 322 /* Currently used only by UML, might disappear one day. */
200#ifdef CONFIG_IRQ_RELEASE_METHOD 323#ifdef CONFIG_IRQ_RELEASE_METHOD
201 void (*release)(unsigned int irq, void *dev_id); 324 void (*release)(unsigned int irq, void *dev_id);
202#endif 325#endif
203}; 326};
204 327
328/*
329 * irq_chip specific flags
330 *
331 * IRQCHIP_SET_TYPE_MASKED: Mask before calling chip.irq_set_type()
332 * IRQCHIP_EOI_IF_HANDLED: Only issue irq_eoi() when irq was handled
333 * IRQCHIP_MASK_ON_SUSPEND: Mask non wake irqs in the suspend path
334 */
335enum {
336 IRQCHIP_SET_TYPE_MASKED = (1 << 0),
337 IRQCHIP_EOI_IF_HANDLED = (1 << 1),
338 IRQCHIP_MASK_ON_SUSPEND = (1 << 2),
339};
340
205/* This include will go away once we isolated irq_desc usage to core code */ 341/* This include will go away once we isolated irq_desc usage to core code */
206#include <linux/irqdesc.h> 342#include <linux/irqdesc.h>
207 343
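
A sketch of a chip opting into the new behavioural flags; the my_chip_* helpers are hypothetical (my_chip_set_type as sketched earlier):

#include <linux/irq.h>

static void my_chip_mask(struct irq_data *d);
static void my_chip_unmask(struct irq_data *d);
static int my_chip_set_type(struct irq_data *d, unsigned int type);

static struct irq_chip my_chip = {
	.name		= "my-chip",
	.irq_mask	= my_chip_mask,
	.irq_unmask	= my_chip_unmask,
	.irq_set_type	= my_chip_set_type,
	/* mask before set_type, keep non-wake irqs masked over suspend */
	.flags		= IRQCHIP_SET_TYPE_MASKED | IRQCHIP_MASK_ON_SUSPEND,
};
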
@@ -218,7 +354,7 @@ struct irq_chip {
218# define ARCH_IRQ_INIT_FLAGS 0 354# define ARCH_IRQ_INIT_FLAGS 0
219#endif 355#endif
220 356
221#define IRQ_DEFAULT_INIT_FLAGS (IRQ_DISABLED | ARCH_IRQ_INIT_FLAGS) 357#define IRQ_DEFAULT_INIT_FLAGS ARCH_IRQ_INIT_FLAGS
222 358
223struct irqaction; 359struct irqaction;
224extern int setup_irq(unsigned int irq, struct irqaction *new); 360extern int setup_irq(unsigned int irq, struct irqaction *new);
@@ -229,9 +365,13 @@ extern void remove_irq(unsigned int irq, struct irqaction *act);
229#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) 365#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ)
230void move_native_irq(int irq); 366void move_native_irq(int irq);
231void move_masked_irq(int irq); 367void move_masked_irq(int irq);
368void irq_move_irq(struct irq_data *data);
369void irq_move_masked_irq(struct irq_data *data);
232#else 370#else
233static inline void move_native_irq(int irq) { } 371static inline void move_native_irq(int irq) { }
234static inline void move_masked_irq(int irq) { } 372static inline void move_masked_irq(int irq) { }
373static inline void irq_move_irq(struct irq_data *data) { }
374static inline void irq_move_masked_irq(struct irq_data *data) { }
235#endif 375#endif
236 376
237extern int no_irq_affinity; 377extern int no_irq_affinity;
@@ -267,23 +407,23 @@ extern struct irq_chip no_irq_chip;
267extern struct irq_chip dummy_irq_chip; 407extern struct irq_chip dummy_irq_chip;
268 408
269extern void 409extern void
270set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, 410irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
271 irq_flow_handler_t handle);
272extern void
273set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
274 irq_flow_handler_t handle, const char *name); 411 irq_flow_handler_t handle, const char *name);
275 412
413static inline void irq_set_chip_and_handler(unsigned int irq, struct irq_chip *chip,
414 irq_flow_handler_t handle)
415{
416 irq_set_chip_and_handler_name(irq, chip, handle, NULL);
417}
418
276extern void 419extern void
277__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 420__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
278 const char *name); 421 const char *name);
279 422
280/*
281 * Set a highlevel flow handler for a given IRQ:
282 */
283static inline void 423static inline void
284set_irq_handler(unsigned int irq, irq_flow_handler_t handle) 424irq_set_handler(unsigned int irq, irq_flow_handler_t handle)
285{ 425{
286 __set_irq_handler(irq, handle, 0, NULL); 426 __irq_set_handler(irq, handle, 0, NULL);
287} 427}
288 428
289/* 429/*
@@ -292,14 +432,11 @@ set_irq_handler(unsigned int irq, irq_flow_handler_t handle)
292 * IRQ_NOREQUEST and IRQ_NOPROBE) 432 * IRQ_NOREQUEST and IRQ_NOPROBE)
293 */ 433 */
294static inline void 434static inline void
295set_irq_chained_handler(unsigned int irq, 435irq_set_chained_handler(unsigned int irq, irq_flow_handler_t handle)
296 irq_flow_handler_t handle)
297{ 436{
298 __set_irq_handler(irq, handle, 1, NULL); 437 __irq_set_handler(irq, handle, 1, NULL);
299} 438}
300 439
301extern void set_irq_nested_thread(unsigned int irq, int nest);
302
303void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set); 440void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set);
304 441
305static inline void irq_set_status_flags(unsigned int irq, unsigned long set) 442static inline void irq_set_status_flags(unsigned int irq, unsigned long set)
@@ -312,16 +449,24 @@ static inline void irq_clear_status_flags(unsigned int irq, unsigned long clr)
312 irq_modify_status(irq, clr, 0); 449 irq_modify_status(irq, clr, 0);
313} 450}
314 451
315static inline void set_irq_noprobe(unsigned int irq) 452static inline void irq_set_noprobe(unsigned int irq)
316{ 453{
317 irq_modify_status(irq, 0, IRQ_NOPROBE); 454 irq_modify_status(irq, 0, IRQ_NOPROBE);
318} 455}
319 456
320static inline void set_irq_probe(unsigned int irq) 457static inline void irq_set_probe(unsigned int irq)
321{ 458{
322 irq_modify_status(irq, IRQ_NOPROBE, 0); 459 irq_modify_status(irq, IRQ_NOPROBE, 0);
323} 460}
324 461
462static inline void irq_set_nested_thread(unsigned int irq, bool nest)
463{
464 if (nest)
465 irq_set_status_flags(irq, IRQ_NESTED_THREAD);
466 else
467 irq_clear_status_flags(irq, IRQ_NESTED_THREAD);
468}
469
325/* Handle dynamic irq creation and destruction */ 470/* Handle dynamic irq creation and destruction */
326extern unsigned int create_irq_nr(unsigned int irq_want, int node); 471extern unsigned int create_irq_nr(unsigned int irq_want, int node);
327extern int create_irq(void); 472extern int create_irq(void);
@@ -338,14 +483,14 @@ static inline void dynamic_irq_init(unsigned int irq)
338} 483}
339 484
340/* Set/get chip/data for an IRQ: */ 485/* Set/get chip/data for an IRQ: */
341extern int set_irq_chip(unsigned int irq, struct irq_chip *chip); 486extern int irq_set_chip(unsigned int irq, struct irq_chip *chip);
342extern int set_irq_data(unsigned int irq, void *data); 487extern int irq_set_handler_data(unsigned int irq, void *data);
343extern int set_irq_chip_data(unsigned int irq, void *data); 488extern int irq_set_chip_data(unsigned int irq, void *data);
344extern int set_irq_type(unsigned int irq, unsigned int type); 489extern int irq_set_irq_type(unsigned int irq, unsigned int type);
345extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); 490extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry);
346extern struct irq_data *irq_get_irq_data(unsigned int irq); 491extern struct irq_data *irq_get_irq_data(unsigned int irq);
347 492
348static inline struct irq_chip *get_irq_chip(unsigned int irq) 493static inline struct irq_chip *irq_get_chip(unsigned int irq)
349{ 494{
350 struct irq_data *d = irq_get_irq_data(irq); 495 struct irq_data *d = irq_get_irq_data(irq);
351 return d ? d->chip : NULL; 496 return d ? d->chip : NULL;
@@ -356,7 +501,7 @@ static inline struct irq_chip *irq_data_get_irq_chip(struct irq_data *d)
356 return d->chip; 501 return d->chip;
357} 502}
358 503
359static inline void *get_irq_chip_data(unsigned int irq) 504static inline void *irq_get_chip_data(unsigned int irq)
360{ 505{
361 struct irq_data *d = irq_get_irq_data(irq); 506 struct irq_data *d = irq_get_irq_data(irq);
362 return d ? d->chip_data : NULL; 507 return d ? d->chip_data : NULL;
@@ -367,18 +512,18 @@ static inline void *irq_data_get_irq_chip_data(struct irq_data *d)
367 return d->chip_data; 512 return d->chip_data;
368} 513}
369 514
370static inline void *get_irq_data(unsigned int irq) 515static inline void *irq_get_handler_data(unsigned int irq)
371{ 516{
372 struct irq_data *d = irq_get_irq_data(irq); 517 struct irq_data *d = irq_get_irq_data(irq);
373 return d ? d->handler_data : NULL; 518 return d ? d->handler_data : NULL;
374} 519}
375 520
376static inline void *irq_data_get_irq_data(struct irq_data *d) 521static inline void *irq_data_get_irq_handler_data(struct irq_data *d)
377{ 522{
378 return d->handler_data; 523 return d->handler_data;
379} 524}
380 525
381static inline struct msi_desc *get_irq_msi(unsigned int irq) 526static inline struct msi_desc *irq_get_msi_desc(unsigned int irq)
382{ 527{
383 struct irq_data *d = irq_get_irq_data(irq); 528 struct irq_data *d = irq_get_irq_data(irq);
384 return d ? d->msi_desc : NULL; 529 return d ? d->msi_desc : NULL;
@@ -389,6 +534,89 @@ static inline struct msi_desc *irq_data_get_msi(struct irq_data *d)
389 return d->msi_desc; 534 return d->msi_desc;
390} 535}
391 536
537#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
538/* Please do not use: Use the replacement functions instead */
539static inline int set_irq_chip(unsigned int irq, struct irq_chip *chip)
540{
541 return irq_set_chip(irq, chip);
542}
543static inline int set_irq_data(unsigned int irq, void *data)
544{
545 return irq_set_handler_data(irq, data);
546}
547static inline int set_irq_chip_data(unsigned int irq, void *data)
548{
549 return irq_set_chip_data(irq, data);
550}
551static inline int set_irq_type(unsigned int irq, unsigned int type)
552{
553 return irq_set_irq_type(irq, type);
554}
555static inline int set_irq_msi(unsigned int irq, struct msi_desc *entry)
556{
557 return irq_set_msi_desc(irq, entry);
558}
559static inline struct irq_chip *get_irq_chip(unsigned int irq)
560{
561 return irq_get_chip(irq);
562}
563static inline void *get_irq_chip_data(unsigned int irq)
564{
565 return irq_get_chip_data(irq);
566}
567static inline void *get_irq_data(unsigned int irq)
568{
569 return irq_get_handler_data(irq);
570}
571static inline void *irq_data_get_irq_data(struct irq_data *d)
572{
573 return irq_data_get_irq_handler_data(d);
574}
575static inline struct msi_desc *get_irq_msi(unsigned int irq)
576{
577 return irq_get_msi_desc(irq);
578}
579static inline void set_irq_noprobe(unsigned int irq)
580{
581 irq_set_noprobe(irq);
582}
583static inline void set_irq_probe(unsigned int irq)
584{
585 irq_set_probe(irq);
586}
587static inline void set_irq_nested_thread(unsigned int irq, int nest)
588{
589 irq_set_nested_thread(irq, nest);
590}
591static inline void
592set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
593 irq_flow_handler_t handle, const char *name)
594{
595 irq_set_chip_and_handler_name(irq, chip, handle, name);
596}
597static inline void
598set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
599 irq_flow_handler_t handle)
600{
601 irq_set_chip_and_handler(irq, chip, handle);
602}
603static inline void
604__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
605 const char *name)
606{
607 __irq_set_handler(irq, handle, is_chained, name);
608}
609static inline void set_irq_handler(unsigned int irq, irq_flow_handler_t handle)
610{
611 irq_set_handler(irq, handle);
612}
613static inline void
614set_irq_chained_handler(unsigned int irq, irq_flow_handler_t handle)
615{
616 irq_set_chained_handler(irq, handle);
617}
618#endif
619
392int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node); 620int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node);
393void irq_free_descs(unsigned int irq, unsigned int cnt); 621void irq_free_descs(unsigned int irq, unsigned int cnt);
394int irq_reserve_irqs(unsigned int from, unsigned int cnt); 622int irq_reserve_irqs(unsigned int from, unsigned int cnt);
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index c1a95b7b58d..00218371518 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -8,6 +8,7 @@
8 * For now it's included from <linux/irq.h> 8 * For now it's included from <linux/irq.h>
9 */ 9 */
10 10
11struct irq_affinity_notify;
11struct proc_dir_entry; 12struct proc_dir_entry;
12struct timer_rand_state; 13struct timer_rand_state;
13/** 14/**
@@ -18,13 +19,16 @@ struct timer_rand_state;
18 * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()] 19 * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()]
19 * @action: the irq action chain 20 * @action: the irq action chain
20 * @status: status information 21 * @status: status information
22 * @core_internal_state__do_not_mess_with_it: core internal status information
21 * @depth: disable-depth, for nested irq_disable() calls 23 * @depth: disable-depth, for nested irq_disable() calls
22 * @wake_depth: enable depth, for multiple set_irq_wake() callers 24 * @wake_depth: enable depth, for multiple set_irq_wake() callers
23 * @irq_count: stats field to detect stalled irqs 25 * @irq_count: stats field to detect stalled irqs
24 * @last_unhandled: aging timer for unhandled count 26 * @last_unhandled: aging timer for unhandled count
25 * @irqs_unhandled: stats field for spurious unhandled interrupts 27 * @irqs_unhandled: stats field for spurious unhandled interrupts
26 * @lock: locking for SMP 28 * @lock: locking for SMP
29 * @affinity_notify: context for notification of affinity changes
27 * @pending_mask: pending rebalanced interrupts 30 * @pending_mask: pending rebalanced interrupts
31 * @threads_oneshot: bitfield to handle shared oneshot threads
28 * @threads_active: number of irqaction threads currently running 32 * @threads_active: number of irqaction threads currently running
29 * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers 33 * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers
30 * @dir: /proc/irq/ procfs entry 34 * @dir: /proc/irq/ procfs entry
@@ -45,6 +49,7 @@ struct irq_desc {
45 struct { 49 struct {
46 unsigned int irq; 50 unsigned int irq;
47 unsigned int node; 51 unsigned int node;
52 unsigned int pad_do_not_even_think_about_it;
48 struct irq_chip *chip; 53 struct irq_chip *chip;
49 void *handler_data; 54 void *handler_data;
50 void *chip_data; 55 void *chip_data;
@@ -59,9 +64,16 @@ struct irq_desc {
59 struct timer_rand_state *timer_rand_state; 64 struct timer_rand_state *timer_rand_state;
60 unsigned int __percpu *kstat_irqs; 65 unsigned int __percpu *kstat_irqs;
61 irq_flow_handler_t handle_irq; 66 irq_flow_handler_t handle_irq;
67#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
68 irq_preflow_handler_t preflow_handler;
69#endif
62 struct irqaction *action; /* IRQ action list */ 70 struct irqaction *action; /* IRQ action list */
71#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
72 unsigned int status_use_accessors;
73#else
63 unsigned int status; /* IRQ status */ 74 unsigned int status; /* IRQ status */
64 75#endif
76 unsigned int core_internal_state__do_not_mess_with_it;
65 unsigned int depth; /* nested irq disables */ 77 unsigned int depth; /* nested irq disables */
66 unsigned int wake_depth; /* nested wake enables */ 78 unsigned int wake_depth; /* nested wake enables */
67 unsigned int irq_count; /* For detecting broken IRQs */ 79 unsigned int irq_count; /* For detecting broken IRQs */
@@ -70,10 +82,12 @@ struct irq_desc {
70 raw_spinlock_t lock; 82 raw_spinlock_t lock;
71#ifdef CONFIG_SMP 83#ifdef CONFIG_SMP
72 const struct cpumask *affinity_hint; 84 const struct cpumask *affinity_hint;
85 struct irq_affinity_notify *affinity_notify;
73#ifdef CONFIG_GENERIC_PENDING_IRQ 86#ifdef CONFIG_GENERIC_PENDING_IRQ
74 cpumask_var_t pending_mask; 87 cpumask_var_t pending_mask;
75#endif 88#endif
76#endif 89#endif
90 unsigned long threads_oneshot;
77 atomic_t threads_active; 91 atomic_t threads_active;
78 wait_queue_head_t wait_for_threads; 92 wait_queue_head_t wait_for_threads;
79#ifdef CONFIG_PROC_FS 93#ifdef CONFIG_PROC_FS
@@ -95,10 +109,51 @@ static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
95 109
96#ifdef CONFIG_GENERIC_HARDIRQS 110#ifdef CONFIG_GENERIC_HARDIRQS
97 111
98#define get_irq_desc_chip(desc) ((desc)->irq_data.chip) 112static inline struct irq_data *irq_desc_get_irq_data(struct irq_desc *desc)
99#define get_irq_desc_chip_data(desc) ((desc)->irq_data.chip_data) 113{
100#define get_irq_desc_data(desc) ((desc)->irq_data.handler_data) 114 return &desc->irq_data;
101#define get_irq_desc_msi(desc) ((desc)->irq_data.msi_desc) 115}
116
117static inline struct irq_chip *irq_desc_get_chip(struct irq_desc *desc)
118{
119 return desc->irq_data.chip;
120}
121
122static inline void *irq_desc_get_chip_data(struct irq_desc *desc)
123{
124 return desc->irq_data.chip_data;
125}
126
127static inline void *irq_desc_get_handler_data(struct irq_desc *desc)
128{
129 return desc->irq_data.handler_data;
130}
131
132static inline struct msi_desc *irq_desc_get_msi_desc(struct irq_desc *desc)
133{
134 return desc->irq_data.msi_desc;
135}
136
137#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
138static inline struct irq_chip *get_irq_desc_chip(struct irq_desc *desc)
139{
140 return irq_desc_get_chip(desc);
141}
142static inline void *get_irq_desc_data(struct irq_desc *desc)
143{
144 return irq_desc_get_handler_data(desc);
145}
146
147static inline void *get_irq_desc_chip_data(struct irq_desc *desc)
148{
149 return irq_desc_get_chip_data(desc);
150}
151
152static inline struct msi_desc *get_irq_desc_msi(struct irq_desc *desc)
153{
154 return irq_desc_get_msi_desc(desc);
155}
156#endif
102 157
103/* 158/*
104 * Architectures call this to let the generic IRQ layer 159 * Architectures call this to let the generic IRQ layer
@@ -123,6 +178,7 @@ static inline int irq_has_action(unsigned int irq)
123 return desc->action != NULL; 178 return desc->action != NULL;
124} 179}
125 180
181#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
126static inline int irq_balancing_disabled(unsigned int irq) 182static inline int irq_balancing_disabled(unsigned int irq)
127{ 183{
128 struct irq_desc *desc; 184 struct irq_desc *desc;
@@ -130,6 +186,7 @@ static inline int irq_balancing_disabled(unsigned int irq)
130 desc = irq_to_desc(irq); 186 desc = irq_to_desc(irq);
131 return desc->status & IRQ_NO_BALANCING_MASK; 187 return desc->status & IRQ_NO_BALANCING_MASK;
132} 188}
189#endif
133 190
134/* caller has locked the irq_desc and both params are valid */ 191/* caller has locked the irq_desc and both params are valid */
135static inline void __set_irq_handler_unlocked(int irq, 192static inline void __set_irq_handler_unlocked(int irq,
@@ -140,6 +197,17 @@ static inline void __set_irq_handler_unlocked(int irq,
140 desc = irq_to_desc(irq); 197 desc = irq_to_desc(irq);
141 desc->handle_irq = handler; 198 desc->handle_irq = handler;
142} 199}
200
201#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
202static inline void
203__irq_set_preflow_handler(unsigned int irq, irq_preflow_handler_t handler)
204{
205 struct irq_desc *desc;
206
207 desc = irq_to_desc(irq);
208 desc->preflow_handler = handler;
209}
210#endif
143#endif 211#endif
144 212
145#endif 213#endif
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 6811f4bfc6e..922aa313c9f 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -307,6 +307,7 @@ extern clock_t jiffies_to_clock_t(long x);
307extern unsigned long clock_t_to_jiffies(unsigned long x); 307extern unsigned long clock_t_to_jiffies(unsigned long x);
308extern u64 jiffies_64_to_clock_t(u64 x); 308extern u64 jiffies_64_to_clock_t(u64 x);
309extern u64 nsec_to_clock_t(u64 x); 309extern u64 nsec_to_clock_t(u64 x);
310extern u64 nsecs_to_jiffies64(u64 n);
310extern unsigned long nsecs_to_jiffies(u64 n); 311extern unsigned long nsecs_to_jiffies(u64 n);
311 312
312#define TIMESTAMP_SIZE 30 313#define TIMESTAMP_SIZE 30
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index ce0775aa64c..7ff16f7d3ed 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -64,7 +64,7 @@ struct kthread_work {
64}; 64};
65 65
66#define KTHREAD_WORKER_INIT(worker) { \ 66#define KTHREAD_WORKER_INIT(worker) { \
67 .lock = SPIN_LOCK_UNLOCKED, \ 67 .lock = __SPIN_LOCK_UNLOCKED((worker).lock), \
68 .work_list = LIST_HEAD_INIT((worker).work_list), \ 68 .work_list = LIST_HEAD_INIT((worker).work_list), \
69 } 69 }
70 70
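
With this fix, statically defined workers get a distinct lockdep key per instance instead of sharing the global SPIN_LOCK_UNLOCKED one; a usage sketch with hypothetical names:

#include <linux/kthread.h>

static void my_work_fn(struct kthread_work *work)
{
	/* runs in the worker's kthread */
}

static DEFINE_KTHREAD_WORKER(my_worker);	/* expands KTHREAD_WORKER_INIT */
static DEFINE_KTHREAD_WORK(my_work, my_work_fn);

static int __init my_init(void)
{
	struct task_struct *t;

	t = kthread_run(kthread_worker_fn, &my_worker, "my_worker");
	if (IS_ERR(t))
		return PTR_ERR(t);
	queue_kthread_work(&my_worker, &my_work);
	return 0;
}
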
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f6385fc17ad..679300c050f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1309,8 +1309,6 @@ int add_from_early_node_map(struct range *range, int az,
1309 int nr_range, int nid); 1309 int nr_range, int nid);
1310u64 __init find_memory_core_early(int nid, u64 size, u64 align, 1310u64 __init find_memory_core_early(int nid, u64 size, u64 align,
1311 u64 goal, u64 limit); 1311 u64 goal, u64 limit);
1312void *__alloc_memory_core_early(int nodeid, u64 size, u64 align,
1313 u64 goal, u64 limit);
1314typedef int (*work_fn_t)(unsigned long, unsigned long, void *); 1312typedef int (*work_fn_t)(unsigned long, unsigned long, void *);
1315extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); 1313extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
1316extern void sparse_memory_present_with_active_regions(int nid); 1314extern void sparse_memory_present_with_active_regions(int nid);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index f276d4fa01f..9c8603872c3 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -19,7 +19,6 @@ struct nameidata {
19 struct path path; 19 struct path path;
20 struct qstr last; 20 struct qstr last;
21 struct path root; 21 struct path root;
22 struct file *file;
23 struct inode *inode; /* path.dentry.d_inode */ 22 struct inode *inode; /* path.dentry.d_inode */
24 unsigned int flags; 23 unsigned int flags;
25 unsigned seq; 24 unsigned seq;
@@ -63,6 +62,10 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
63#define LOOKUP_EXCL 0x0400 62#define LOOKUP_EXCL 0x0400
64#define LOOKUP_RENAME_TARGET 0x0800 63#define LOOKUP_RENAME_TARGET 0x0800
65 64
65#define LOOKUP_JUMPED 0x1000
66#define LOOKUP_ROOT 0x2000
67#define LOOKUP_EMPTY 0x4000
68
66extern int user_path_at(int, const char __user *, unsigned, struct path *); 69extern int user_path_at(int, const char __user *, unsigned, struct path *);
67 70
68#define user_path(name, path) user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path) 71#define user_path(name, path) user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path)
@@ -72,7 +75,7 @@ extern int user_path_at(int, const char __user *, unsigned, struct path *);
72 75
73extern int kern_path(const char *, unsigned, struct path *); 76extern int kern_path(const char *, unsigned, struct path *);
74 77
75extern int path_lookup(const char *, unsigned, struct nameidata *); 78extern int kern_path_parent(const char *, struct nameidata *);
76extern int vfs_path_lookup(struct dentry *, struct vfsmount *, 79extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
77 const char *, unsigned int, struct nameidata *); 80 const char *, unsigned int, struct nameidata *);
78 81
diff --git a/include/linux/of.h b/include/linux/of.h
index cad7cf0ab27..266db1d0baa 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -23,8 +23,6 @@
23 23
24#include <asm/byteorder.h> 24#include <asm/byteorder.h>
25 25
26#ifdef CONFIG_OF
27
28typedef u32 phandle; 26typedef u32 phandle;
29typedef u32 ihandle; 27typedef u32 ihandle;
30 28
@@ -65,11 +63,18 @@ struct device_node {
65#endif 63#endif
66}; 64};
67 65
66#ifdef CONFIG_OF
67
68/* Pointer for first entry in chain of all nodes. */ 68/* Pointer for first entry in chain of all nodes. */
69extern struct device_node *allnodes; 69extern struct device_node *allnodes;
70extern struct device_node *of_chosen; 70extern struct device_node *of_chosen;
71extern rwlock_t devtree_lock; 71extern rwlock_t devtree_lock;
72 72
73static inline bool of_have_populated_dt(void)
74{
75 return allnodes != NULL;
76}
77
73static inline bool of_node_is_root(const struct device_node *node) 78static inline bool of_node_is_root(const struct device_node *node)
74{ 79{
75 return node && (node->parent == NULL); 80 return node && (node->parent == NULL);
@@ -222,5 +227,12 @@ extern void of_attach_node(struct device_node *);
222extern void of_detach_node(struct device_node *); 227extern void of_detach_node(struct device_node *);
223#endif 228#endif
224 229
230#else
231
232static inline bool of_have_populated_dt(void)
233{
234 return false;
235}
236
225#endif /* CONFIG_OF */ 237#endif /* CONFIG_OF */
226#endif /* _LINUX_OF_H */ 238#endif /* _LINUX_OF_H */
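
A sketch of the intended use: board code can branch on an actually populated tree, and the CONFIG_OF=n stub returning false removes the need for an #ifdef. The legacy fallback helper is hypothetical:

#include <linux/init.h>
#include <linux/of.h>

static int my_register_legacy_devices(void);	/* hypothetical fallback */

static int __init my_board_init(void)
{
	if (!of_have_populated_dt())
		return my_register_legacy_devices();
	/* device-tree path: devices are created from the tree */
	return 0;
}
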
diff --git a/include/linux/of_pci.h b/include/linux/of_pci.h
new file mode 100644
index 00000000000..85a27b650d7
--- /dev/null
+++ b/include/linux/of_pci.h
@@ -0,0 +1,9 @@
1#ifndef __OF_PCI_H
2#define __OF_PCI_H
3
4#include <linux/pci.h>
5
6struct pci_dev;
7struct of_irq;
8int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq);
9#endif
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dda5b0a3ff6..614615b8d42 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -225,8 +225,14 @@ struct perf_event_attr {
225 }; 225 };
226 226
227 __u32 bp_type; 227 __u32 bp_type;
228 __u64 bp_addr; 228 union {
229 __u64 bp_len; 229 __u64 bp_addr;
230 __u64 config1; /* extension of config */
231 };
232 union {
233 __u64 bp_len;
234 __u64 config2; /* extension of config1 */
235 };
230}; 236};
231 237
232/* 238/*
@@ -464,6 +470,7 @@ enum perf_callchain_context {
464 470
465#define PERF_FLAG_FD_NO_GROUP (1U << 0) 471#define PERF_FLAG_FD_NO_GROUP (1U << 0)
466#define PERF_FLAG_FD_OUTPUT (1U << 1) 472#define PERF_FLAG_FD_OUTPUT (1U << 1)
473#define PERF_FLAG_PID_CGROUP (1U << 2) /* pid=cgroup id, per-cpu mode only */
467 474
468#ifdef __KERNEL__ 475#ifdef __KERNEL__
469/* 476/*
@@ -471,6 +478,7 @@ enum perf_callchain_context {
471 */ 478 */
472 479
473#ifdef CONFIG_PERF_EVENTS 480#ifdef CONFIG_PERF_EVENTS
481# include <linux/cgroup.h>
474# include <asm/perf_event.h> 482# include <asm/perf_event.h>
475# include <asm/local64.h> 483# include <asm/local64.h>
476#endif 484#endif
@@ -539,6 +547,9 @@ struct hw_perf_event {
539 unsigned long event_base; 547 unsigned long event_base;
540 int idx; 548 int idx;
541 int last_cpu; 549 int last_cpu;
550 unsigned int extra_reg;
551 u64 extra_config;
552 int extra_alloc;
542 }; 553 };
543 struct { /* software */ 554 struct { /* software */
544 struct hrtimer hrtimer; 555 struct hrtimer hrtimer;
@@ -716,6 +727,22 @@ struct swevent_hlist {
716#define PERF_ATTACH_GROUP 0x02 727#define PERF_ATTACH_GROUP 0x02
717#define PERF_ATTACH_TASK 0x04 728#define PERF_ATTACH_TASK 0x04
718 729
730#ifdef CONFIG_CGROUP_PERF
731/*
732 * perf_cgroup_info keeps track of time_enabled for a cgroup.
733 * This is a per-cpu dynamically allocated data structure.
734 */
735struct perf_cgroup_info {
736 u64 time;
737 u64 timestamp;
738};
739
740struct perf_cgroup {
741 struct cgroup_subsys_state css;
742 struct perf_cgroup_info *info; /* timing info, one per cpu */
743};
744#endif
745
719/** 746/**
720 * struct perf_event - performance event kernel representation: 747 * struct perf_event - performance event kernel representation:
721 */ 748 */
@@ -832,6 +859,11 @@ struct perf_event {
832 struct event_filter *filter; 859 struct event_filter *filter;
833#endif 860#endif
834 861
862#ifdef CONFIG_CGROUP_PERF
863	struct perf_cgroup *cgrp; /* cgroup the event is attached to */
864 int cgrp_defer_enabled;
865#endif
866
835#endif /* CONFIG_PERF_EVENTS */ 867#endif /* CONFIG_PERF_EVENTS */
836}; 868};
837 869
@@ -886,6 +918,7 @@ struct perf_event_context {
886 u64 generation; 918 u64 generation;
887 int pin_count; 919 int pin_count;
888 struct rcu_head rcu_head; 920 struct rcu_head rcu_head;
921 int nr_cgroups; /* cgroup events present */
889}; 922};
890 923
891/* 924/*
@@ -905,6 +938,9 @@ struct perf_cpu_context {
905 struct list_head rotation_list; 938 struct list_head rotation_list;
906 int jiffies_interval; 939 int jiffies_interval;
907 struct pmu *active_pmu; 940 struct pmu *active_pmu;
941#ifdef CONFIG_CGROUP_PERF
942 struct perf_cgroup *cgrp;
943#endif
908}; 944};
909 945
910struct perf_output_handle { 946struct perf_output_handle {
@@ -1040,11 +1076,11 @@ have_event:
1040 __perf_sw_event(event_id, nr, nmi, regs, addr); 1076 __perf_sw_event(event_id, nr, nmi, regs, addr);
1041} 1077}
1042 1078
1043extern atomic_t perf_task_events; 1079extern atomic_t perf_sched_events;
1044 1080
1045static inline void perf_event_task_sched_in(struct task_struct *task) 1081static inline void perf_event_task_sched_in(struct task_struct *task)
1046{ 1082{
1047 COND_STMT(&perf_task_events, __perf_event_task_sched_in(task)); 1083 COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task));
1048} 1084}
1049 1085
1050static inline 1086static inline
@@ -1052,7 +1088,7 @@ void perf_event_task_sched_out(struct task_struct *task, struct task_struct *nex
1052{ 1088{
1053 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); 1089 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1054 1090
1055 COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next)); 1091 COND_STMT(&perf_sched_events, __perf_event_task_sched_out(task, next));
1056} 1092}
1057 1093
1058extern void perf_event_mmap(struct vm_area_struct *vma); 1094extern void perf_event_mmap(struct vm_area_struct *vma);
@@ -1083,6 +1119,10 @@ extern int sysctl_perf_event_paranoid;
1083extern int sysctl_perf_event_mlock; 1119extern int sysctl_perf_event_mlock;
1084extern int sysctl_perf_event_sample_rate; 1120extern int sysctl_perf_event_sample_rate;
1085 1121
1122extern int perf_proc_update_handler(struct ctl_table *table, int write,
1123 void __user *buffer, size_t *lenp,
1124 loff_t *ppos);
1125
1086static inline bool perf_paranoid_tracepoint_raw(void) 1126static inline bool perf_paranoid_tracepoint_raw(void)
1087{ 1127{
1088 return sysctl_perf_event_paranoid > -1; 1128 return sysctl_perf_event_paranoid > -1;
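The new unions let a raw hardware event carry two extra 64-bit configuration words without growing perf_event_attr: config1 aliases bp_addr and config2 aliases bp_len, so breakpoint events and extended raw events share the same storage. A user-space sketch, with entirely hypothetical event encodings:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static long open_raw_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size    = sizeof(attr);
	attr.type    = PERF_TYPE_RAW;
	attr.config  = 0x01b7;	/* hypothetical raw event code */
	attr.config1 = 0x8fff;	/* extra PMU register; aliases bp_addr */

	/* monitor the calling task on any CPU */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}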
diff --git a/include/linux/plist.h b/include/linux/plist.h
index 7254eda078e..c9b9f322c8d 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -31,15 +31,17 @@
31 * 31 *
32 * Simple ASCII art explanation: 32 * Simple ASCII art explanation:
33 * 33 *
34 * |HEAD | 34 * pl:prio_list (only for plist_node)
35 * | | 35 * nl:node_list
36 * |prio_list.prev|<------------------------------------| 36 * HEAD| NODE(S)
37 * |prio_list.next|<->|pl|<->|pl|<--------------->|pl|<-| 37 * |
38 * |10 | |10| |21| |21| |21| |40| (prio) 38 * ||------------------------------------|
39 * | | | | | | | | | | | | 39 * ||->|pl|<->|pl|<--------------->|pl|<-|
40 * | | | | | | | | | | | | 40 * | |10| |21| |21| |21| |40| (prio)
41 * |node_list.next|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-| 41 * | | | | | | | | | | |
42 * |node_list.prev|<------------------------------------| 42 * | | | | | | | | | | |
43 * |->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-|
44 * |-------------------------------------------|
43 * 45 *
44 * The nodes on the prio_list list are sorted by priority to simplify 46 * The nodes on the prio_list list are sorted by priority to simplify
45 * the insertion of new nodes. There are no nodes with duplicate 47 * the insertion of new nodes. There are no nodes with duplicate
@@ -78,7 +80,6 @@
78#include <linux/spinlock_types.h> 80#include <linux/spinlock_types.h>
79 81
80struct plist_head { 82struct plist_head {
81 struct list_head prio_list;
82 struct list_head node_list; 83 struct list_head node_list;
83#ifdef CONFIG_DEBUG_PI_LIST 84#ifdef CONFIG_DEBUG_PI_LIST
84 raw_spinlock_t *rawlock; 85 raw_spinlock_t *rawlock;
@@ -88,7 +89,8 @@ struct plist_head {
88 89
89struct plist_node { 90struct plist_node {
90 int prio; 91 int prio;
91 struct plist_head plist; 92 struct list_head prio_list;
93 struct list_head node_list;
92}; 94};
93 95
94#ifdef CONFIG_DEBUG_PI_LIST 96#ifdef CONFIG_DEBUG_PI_LIST
@@ -100,7 +102,6 @@ struct plist_node {
100#endif 102#endif
101 103
102#define _PLIST_HEAD_INIT(head) \ 104#define _PLIST_HEAD_INIT(head) \
103 .prio_list = LIST_HEAD_INIT((head).prio_list), \
104 .node_list = LIST_HEAD_INIT((head).node_list) 105 .node_list = LIST_HEAD_INIT((head).node_list)
105 106
106/** 107/**
@@ -133,7 +134,8 @@ struct plist_node {
133#define PLIST_NODE_INIT(node, __prio) \ 134#define PLIST_NODE_INIT(node, __prio) \
134{ \ 135{ \
135 .prio = (__prio), \ 136 .prio = (__prio), \
136 .plist = { _PLIST_HEAD_INIT((node).plist) }, \ 137 .prio_list = LIST_HEAD_INIT((node).prio_list), \
138 .node_list = LIST_HEAD_INIT((node).node_list), \
137} 139}
138 140
139/** 141/**
@@ -144,7 +146,6 @@ struct plist_node {
144static inline void 146static inline void
145plist_head_init(struct plist_head *head, spinlock_t *lock) 147plist_head_init(struct plist_head *head, spinlock_t *lock)
146{ 148{
147 INIT_LIST_HEAD(&head->prio_list);
148 INIT_LIST_HEAD(&head->node_list); 149 INIT_LIST_HEAD(&head->node_list);
149#ifdef CONFIG_DEBUG_PI_LIST 150#ifdef CONFIG_DEBUG_PI_LIST
150 head->spinlock = lock; 151 head->spinlock = lock;
@@ -160,7 +161,6 @@ plist_head_init(struct plist_head *head, spinlock_t *lock)
160static inline void 161static inline void
161plist_head_init_raw(struct plist_head *head, raw_spinlock_t *lock) 162plist_head_init_raw(struct plist_head *head, raw_spinlock_t *lock)
162{ 163{
163 INIT_LIST_HEAD(&head->prio_list);
164 INIT_LIST_HEAD(&head->node_list); 164 INIT_LIST_HEAD(&head->node_list);
165#ifdef CONFIG_DEBUG_PI_LIST 165#ifdef CONFIG_DEBUG_PI_LIST
166 head->rawlock = lock; 166 head->rawlock = lock;
@@ -176,7 +176,8 @@ plist_head_init_raw(struct plist_head *head, raw_spinlock_t *lock)
176static inline void plist_node_init(struct plist_node *node, int prio) 176static inline void plist_node_init(struct plist_node *node, int prio)
177{ 177{
178 node->prio = prio; 178 node->prio = prio;
179 plist_head_init(&node->plist, NULL); 179 INIT_LIST_HEAD(&node->prio_list);
180 INIT_LIST_HEAD(&node->node_list);
180} 181}
181 182
182extern void plist_add(struct plist_node *node, struct plist_head *head); 183extern void plist_add(struct plist_node *node, struct plist_head *head);
@@ -188,7 +189,7 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
188 * @head: the head for your list 189 * @head: the head for your list
189 */ 190 */
190#define plist_for_each(pos, head) \ 191#define plist_for_each(pos, head) \
191 list_for_each_entry(pos, &(head)->node_list, plist.node_list) 192 list_for_each_entry(pos, &(head)->node_list, node_list)
192 193
193/** 194/**
194 * plist_for_each_safe - iterate safely over a plist of given type 195 * plist_for_each_safe - iterate safely over a plist of given type
@@ -199,7 +200,7 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
199 * Iterate over a plist of given type, safe against removal of list entry. 200 * Iterate over a plist of given type, safe against removal of list entry.
200 */ 201 */
201#define plist_for_each_safe(pos, n, head) \ 202#define plist_for_each_safe(pos, n, head) \
202 list_for_each_entry_safe(pos, n, &(head)->node_list, plist.node_list) 203 list_for_each_entry_safe(pos, n, &(head)->node_list, node_list)
203 204
204/** 205/**
205 * plist_for_each_entry - iterate over list of given type 206 * plist_for_each_entry - iterate over list of given type
@@ -208,7 +209,7 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
208 * @mem: the name of the list_struct within the struct 209 * @mem: the name of the list_struct within the struct
209 */ 210 */
210#define plist_for_each_entry(pos, head, mem) \ 211#define plist_for_each_entry(pos, head, mem) \
211 list_for_each_entry(pos, &(head)->node_list, mem.plist.node_list) 212 list_for_each_entry(pos, &(head)->node_list, mem.node_list)
212 213
213/** 214/**
214 * plist_for_each_entry_safe - iterate safely over list of given type 215 * plist_for_each_entry_safe - iterate safely over list of given type
@@ -220,7 +221,7 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
220 * Iterate over list of given type, safe against removal of list entry. 221 * Iterate over list of given type, safe against removal of list entry.
221 */ 222 */
222#define plist_for_each_entry_safe(pos, n, head, m) \ 223#define plist_for_each_entry_safe(pos, n, head, m) \
223 list_for_each_entry_safe(pos, n, &(head)->node_list, m.plist.node_list) 224 list_for_each_entry_safe(pos, n, &(head)->node_list, m.node_list)
224 225
225/** 226/**
226 * plist_head_empty - return !0 if a plist_head is empty 227 * plist_head_empty - return !0 if a plist_head is empty
@@ -237,7 +238,7 @@ static inline int plist_head_empty(const struct plist_head *head)
237 */ 238 */
238static inline int plist_node_empty(const struct plist_node *node) 239static inline int plist_node_empty(const struct plist_node *node)
239{ 240{
240 return plist_head_empty(&node->plist); 241 return list_empty(&node->node_list);
241} 242}
242 243
243/* All functions below assume the plist_head is not empty. */ 244/* All functions below assume the plist_head is not empty. */
@@ -285,7 +286,7 @@ static inline int plist_node_empty(const struct plist_node *node)
285static inline struct plist_node *plist_first(const struct plist_head *head) 286static inline struct plist_node *plist_first(const struct plist_head *head)
286{ 287{
287 return list_entry(head->node_list.next, 288 return list_entry(head->node_list.next,
288 struct plist_node, plist.node_list); 289 struct plist_node, node_list);
289} 290}
290 291
291/** 292/**
@@ -297,7 +298,7 @@ static inline struct plist_node *plist_first(const struct plist_head *head)
297static inline struct plist_node *plist_last(const struct plist_head *head) 298static inline struct plist_node *plist_last(const struct plist_head *head)
298{ 299{
299 return list_entry(head->node_list.prev, 300 return list_entry(head->node_list.prev,
300 struct plist_node, plist.node_list); 301 struct plist_node, node_list);
301} 302}
302 303
303#endif 304#endif
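With the head's prio_list gone, a plist_head is just a single list_head and all priority bookkeeping lives in the nodes themselves. The user-facing API is unchanged; a sketch, with struct waiter as a hypothetical embedding type:

struct waiter {
	struct plist_node node;
	int id;
};

static DEFINE_SPINLOCK(q_lock);
static struct plist_head q;
static struct waiter a = { .id = 1 };

static void plist_demo(void)
{
	struct waiter *pos;

	plist_head_init(&q, &q_lock);
	plist_node_init(&a.node, 10);	/* priority 10 */
	plist_add(&a.node, &q);

	plist_for_each_entry(pos, &q, node)
		pr_info("waiter %d, prio %d\n", pos->id, pos->node.prio);
}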
diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h
new file mode 100644
index 00000000000..369e19d3750
--- /dev/null
+++ b/include/linux/posix-clock.h
@@ -0,0 +1,150 @@
1/*
2 * posix-clock.h - support for dynamic clock devices
3 *
4 * Copyright (C) 2010 OMICRON electronics GmbH
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20#ifndef _LINUX_POSIX_CLOCK_H_
21#define _LINUX_POSIX_CLOCK_H_
22
23#include <linux/cdev.h>
24#include <linux/fs.h>
25#include <linux/poll.h>
26#include <linux/posix-timers.h>
27
28struct posix_clock;
29
30/**
31 * struct posix_clock_operations - functional interface to the clock
32 *
33 * Every posix clock is represented by a character device. Drivers may
34 * optionally offer extended capabilities by implementing the
35 * character device methods. The character device file operations are
36 * first handled by the clock device layer, then passed on to the
37 * driver by calling these functions.
38 *
39 * @owner: The clock driver should set this to THIS_MODULE
40 * @clock_adjtime: Adjust the clock
41 * @clock_gettime: Read the current time
42 * @clock_getres: Get the clock resolution
43 * @clock_settime: Set the current time value
44 * @timer_create: Create a new timer
45 * @timer_delete: Remove a previously created timer
46 * @timer_gettime: Get remaining time and interval of a timer
47 * @timer_settime: Set a timer's initial expiration and interval
48 * @fasync: Optional character device fasync method
49 * @mmap: Optional character device mmap method
50 * @open: Optional character device open method
51 * @release: Optional character device release method
52 * @ioctl: Optional character device ioctl method
53 * @read: Optional character device read method
54 * @poll: Optional character device poll method
55 */
56struct posix_clock_operations {
57 struct module *owner;
58
59 int (*clock_adjtime)(struct posix_clock *pc, struct timex *tx);
60
61 int (*clock_gettime)(struct posix_clock *pc, struct timespec *ts);
62
63 int (*clock_getres) (struct posix_clock *pc, struct timespec *ts);
64
65 int (*clock_settime)(struct posix_clock *pc,
66 const struct timespec *ts);
67
68 int (*timer_create) (struct posix_clock *pc, struct k_itimer *kit);
69
70 int (*timer_delete) (struct posix_clock *pc, struct k_itimer *kit);
71
72 void (*timer_gettime)(struct posix_clock *pc,
73 struct k_itimer *kit, struct itimerspec *tsp);
74
75 int (*timer_settime)(struct posix_clock *pc,
76 struct k_itimer *kit, int flags,
77 struct itimerspec *tsp, struct itimerspec *old);
78 /*
79 * Optional character device methods:
80 */
81 int (*fasync) (struct posix_clock *pc,
82 int fd, struct file *file, int on);
83
84 long (*ioctl) (struct posix_clock *pc,
85 unsigned int cmd, unsigned long arg);
86
87 int (*mmap) (struct posix_clock *pc,
88 struct vm_area_struct *vma);
89
90 int (*open) (struct posix_clock *pc, fmode_t f_mode);
91
92 uint (*poll) (struct posix_clock *pc,
93 struct file *file, poll_table *wait);
94
95 int (*release) (struct posix_clock *pc);
96
97 ssize_t (*read) (struct posix_clock *pc,
98 uint flags, char __user *buf, size_t cnt);
99};
100
101/**
102 * struct posix_clock - represents a dynamic posix clock
103 *
104 * @ops: Functional interface to the clock
105 * @cdev: Character device instance for this clock
106 * @kref: Reference count.
107 * @mutex: Protects the 'zombie' field from concurrent access.
108 * @zombie: If 'zombie' is true, then the hardware has disappeared.
109 * @release: A function to free the structure when the reference count reaches
110 * zero. May be NULL if structure is statically allocated.
111 *
112 * Drivers should embed their struct posix_clock within a private
113 * structure, obtaining a reference to it during callbacks using
114 * container_of().
115 */
116struct posix_clock {
117 struct posix_clock_operations ops;
118 struct cdev cdev;
119 struct kref kref;
120 struct mutex mutex;
121 bool zombie;
122 void (*release)(struct posix_clock *clk);
123};
124
125/**
126 * posix_clock_register() - register a new clock
127 * @clk: Pointer to the clock. Caller must provide 'ops' and 'release'
128 * @devid: Allocated device id
129 *
130 * A clock driver calls this function to register itself with the
131 * clock device subsystem. If 'clk' points to dynamically allocated
132 * memory, then the caller must provide a 'release' function to free
133 * that memory.
134 *
135 * Returns zero on success, non-zero otherwise.
136 */
137int posix_clock_register(struct posix_clock *clk, dev_t devid);
138
139/**
140 * posix_clock_unregister() - unregister a clock
141 * @clk: Clock instance previously registered via posix_clock_register()
142 *
143 * A clock driver calls this function to remove itself from the clock
144 * device subsystem. The posix_clock itself will remain (in an
145 * inactive state) until its reference count drops to zero, at which
146 * point it will be deallocated with its 'release' method.
147 */
148void posix_clock_unregister(struct posix_clock *clk);
149
150#endif
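A minimal driver sketch following the kernel-doc above: embed the posix_clock in private data, recover it in callbacks with container_of(), and supply a release() because the structure is dynamically allocated. All my_* names are hypothetical:

struct my_clock {
	struct posix_clock clock;
	/* ... hardware state ... */
};

static int my_gettime(struct posix_clock *pc, struct timespec *ts)
{
	struct my_clock *my = container_of(pc, struct my_clock, clock);

	/* read the hardware time into *ts */
	ts->tv_sec = 0;
	ts->tv_nsec = 0;
	return 0;
}

static void my_release(struct posix_clock *pc)
{
	kfree(container_of(pc, struct my_clock, clock));
}

static struct posix_clock_operations my_ops = {
	.owner		= THIS_MODULE,
	.clock_gettime	= my_gettime,
};

static int my_register(dev_t devid)	/* devid from alloc_chrdev_region() */
{
	struct my_clock *my = kzalloc(sizeof(*my), GFP_KERNEL);

	if (!my)
		return -ENOMEM;
	my->clock.ops = my_ops;
	my->clock.release = my_release;	/* frees the containing my_clock */
	return posix_clock_register(&my->clock, devid);
}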
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 3e23844a699..d51243ae072 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -4,6 +4,7 @@
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/list.h> 5#include <linux/list.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/timex.h>
7 8
8union cpu_time_count { 9union cpu_time_count {
9 cputime_t cpu; 10 cputime_t cpu;
@@ -17,10 +18,21 @@ struct cpu_timer_list {
17 int firing; 18 int firing;
18}; 19};
19 20
21/*
22 * Bit fields within a clockid:
23 *
24 * The most significant 29 bits hold either a pid or a file descriptor.
25 *
26 * Bit 2 indicates whether a cpu clock refers to a thread or a process.
27 *
28 * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3.
29 *
30 * A clockid is invalid if bits 2, 1, and 0 are all set.
31 */
20#define CPUCLOCK_PID(clock) ((pid_t) ~((clock) >> 3)) 32#define CPUCLOCK_PID(clock) ((pid_t) ~((clock) >> 3))
21#define CPUCLOCK_PERTHREAD(clock) \ 33#define CPUCLOCK_PERTHREAD(clock) \
22 (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0) 34 (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
23#define CPUCLOCK_PID_MASK 7 35
24#define CPUCLOCK_PERTHREAD_MASK 4 36#define CPUCLOCK_PERTHREAD_MASK 4
25#define CPUCLOCK_WHICH(clock) ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK) 37#define CPUCLOCK_WHICH(clock) ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)
26#define CPUCLOCK_CLOCK_MASK 3 38#define CPUCLOCK_CLOCK_MASK 3
@@ -28,12 +40,17 @@ struct cpu_timer_list {
28#define CPUCLOCK_VIRT 1 40#define CPUCLOCK_VIRT 1
29#define CPUCLOCK_SCHED 2 41#define CPUCLOCK_SCHED 2
30#define CPUCLOCK_MAX 3 42#define CPUCLOCK_MAX 3
43#define CLOCKFD CPUCLOCK_MAX
44#define CLOCKFD_MASK (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK)
31 45
32#define MAKE_PROCESS_CPUCLOCK(pid, clock) \ 46#define MAKE_PROCESS_CPUCLOCK(pid, clock) \
33 ((~(clockid_t) (pid) << 3) | (clockid_t) (clock)) 47 ((~(clockid_t) (pid) << 3) | (clockid_t) (clock))
34#define MAKE_THREAD_CPUCLOCK(tid, clock) \ 48#define MAKE_THREAD_CPUCLOCK(tid, clock) \
35 MAKE_PROCESS_CPUCLOCK((tid), (clock) | CPUCLOCK_PERTHREAD_MASK) 49 MAKE_PROCESS_CPUCLOCK((tid), (clock) | CPUCLOCK_PERTHREAD_MASK)
36 50
51#define FD_TO_CLOCKID(fd) ((~(clockid_t) (fd) << 3) | CLOCKFD)
52#define CLOCKID_TO_FD(clk) ((unsigned int) ~((clk) >> 3))
53
37/* POSIX.1b interval timer structure. */ 54/* POSIX.1b interval timer structure. */
38struct k_itimer { 55struct k_itimer {
39 struct list_head list; /* free/ allocate list */ 56 struct list_head list; /* free/ allocate list */
@@ -67,10 +84,11 @@ struct k_itimer {
67}; 84};
68 85
69struct k_clock { 86struct k_clock {
70 int res; /* in nanoseconds */
71 int (*clock_getres) (const clockid_t which_clock, struct timespec *tp); 87 int (*clock_getres) (const clockid_t which_clock, struct timespec *tp);
72 int (*clock_set) (const clockid_t which_clock, struct timespec * tp); 88 int (*clock_set) (const clockid_t which_clock,
89 const struct timespec *tp);
73 int (*clock_get) (const clockid_t which_clock, struct timespec * tp); 90 int (*clock_get) (const clockid_t which_clock, struct timespec * tp);
91 int (*clock_adj) (const clockid_t which_clock, struct timex *tx);
74 int (*timer_create) (struct k_itimer *timer); 92 int (*timer_create) (struct k_itimer *timer);
75 int (*nsleep) (const clockid_t which_clock, int flags, 93 int (*nsleep) (const clockid_t which_clock, int flags,
76 struct timespec *, struct timespec __user *); 94 struct timespec *, struct timespec __user *);
@@ -84,28 +102,14 @@ struct k_clock {
84 struct itimerspec * cur_setting); 102 struct itimerspec * cur_setting);
85}; 103};
86 104
87void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock); 105extern struct k_clock clock_posix_cpu;
106extern struct k_clock clock_posix_dynamic;
88 107
89/* error handlers for timer_create, nanosleep and settime */ 108void posix_timers_register_clock(const clockid_t clock_id, struct k_clock *new_clock);
90int do_posix_clock_nonanosleep(const clockid_t, int flags, struct timespec *,
91 struct timespec __user *);
92int do_posix_clock_nosettime(const clockid_t, struct timespec *tp);
93 109
94/* function to call to trigger timer event */ 110/* function to call to trigger timer event */
95int posix_timer_event(struct k_itimer *timr, int si_private); 111int posix_timer_event(struct k_itimer *timr, int si_private);
96 112
97int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *ts);
98int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *ts);
99int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *ts);
100int posix_cpu_timer_create(struct k_itimer *timer);
101int posix_cpu_nsleep(const clockid_t which_clock, int flags,
102 struct timespec *rqtp, struct timespec __user *rmtp);
103long posix_cpu_nsleep_restart(struct restart_block *restart_block);
104int posix_cpu_timer_set(struct k_itimer *timer, int flags,
105 struct itimerspec *new, struct itimerspec *old);
106int posix_cpu_timer_del(struct k_itimer *timer);
107void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp);
108
109void posix_cpu_timer_schedule(struct k_itimer *timer); 113void posix_cpu_timer_schedule(struct k_itimer *timer);
110 114
111void run_posix_cpu_timers(struct task_struct *task); 115void run_posix_cpu_timers(struct task_struct *task);
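The CLOCKFD encoding above is what lets user space address a dynamic clock through an ordinary clock_gettime() call: open the character device, fold the descriptor into a clockid with FD_TO_CLOCKID(), and use it. A user-space sketch (/dev/ptp0 is a hypothetical device node; the macro mirrors the header definition):

#include <fcntl.h>
#include <stdio.h>
#include <time.h>

#define CLOCKFD			3
#define FD_TO_CLOCKID(fd)	((~(clockid_t) (fd) << 3) | CLOCKFD)

int main(void)
{
	struct timespec ts;
	int fd = open("/dev/ptp0", O_RDONLY);

	if (fd < 0)
		return 1;
	if (clock_gettime(FD_TO_CLOCKID(fd), &ts))
		return 1;
	printf("%ld.%09ld\n", (long) ts.tv_sec, ts.tv_nsec);
	return 0;
}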
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 8d3a2486544..ab38ac80b0f 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -100,6 +100,8 @@ void ring_buffer_free(struct ring_buffer *buffer);
100 100
101int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size); 101int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
102 102
103void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val);
104
103struct ring_buffer_event *ring_buffer_lock_reserve(struct ring_buffer *buffer, 105struct ring_buffer_event *ring_buffer_lock_reserve(struct ring_buffer *buffer,
104 unsigned long length); 106 unsigned long length);
105int ring_buffer_unlock_commit(struct ring_buffer *buffer, 107int ring_buffer_unlock_commit(struct ring_buffer *buffer,
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 89c3e518299..2ca7e8a7806 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -133,7 +133,6 @@ extern struct class *rtc_class;
133 * The (current) exceptions are mostly filesystem hooks: 133 * The (current) exceptions are mostly filesystem hooks:
134 * - the proc() hook for procfs 134 * - the proc() hook for procfs
135 * - non-ioctl() chardev hooks: open(), release(), read_callback() 135 * - non-ioctl() chardev hooks: open(), release(), read_callback()
136 * - periodic irq calls: irq_set_state(), irq_set_freq()
137 * 136 *
138 * REVISIT those periodic irq calls *do* have ops_lock when they're 137 * REVISIT those periodic irq calls *do* have ops_lock when they're
139 * issued through ioctl() ... 138 * issued through ioctl() ...
@@ -148,11 +147,8 @@ struct rtc_class_ops {
148 int (*set_alarm)(struct device *, struct rtc_wkalrm *); 147 int (*set_alarm)(struct device *, struct rtc_wkalrm *);
149 int (*proc)(struct device *, struct seq_file *); 148 int (*proc)(struct device *, struct seq_file *);
150 int (*set_mmss)(struct device *, unsigned long secs); 149 int (*set_mmss)(struct device *, unsigned long secs);
151 int (*irq_set_state)(struct device *, int enabled);
152 int (*irq_set_freq)(struct device *, int freq);
153 int (*read_callback)(struct device *, int data); 150 int (*read_callback)(struct device *, int data);
154 int (*alarm_irq_enable)(struct device *, unsigned int enabled); 151 int (*alarm_irq_enable)(struct device *, unsigned int enabled);
155 int (*update_irq_enable)(struct device *, unsigned int enabled);
156}; 152};
157 153
158#define RTC_DEVICE_NAME_SIZE 20 154#define RTC_DEVICE_NAME_SIZE 20
@@ -227,6 +223,7 @@ extern void rtc_device_unregister(struct rtc_device *rtc);
227extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); 223extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm);
228extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); 224extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm);
229extern int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs); 225extern int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs);
226int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm);
230extern int rtc_read_alarm(struct rtc_device *rtc, 227extern int rtc_read_alarm(struct rtc_device *rtc,
231 struct rtc_wkalrm *alrm); 228 struct rtc_wkalrm *alrm);
232extern int rtc_set_alarm(struct rtc_device *rtc, 229extern int rtc_set_alarm(struct rtc_device *rtc,
diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
index bd31808c7d8..cc0072e93e3 100644
--- a/include/linux/rwlock_types.h
+++ b/include/linux/rwlock_types.h
@@ -43,14 +43,6 @@ typedef struct {
43 RW_DEP_MAP_INIT(lockname) } 43 RW_DEP_MAP_INIT(lockname) }
44#endif 44#endif
45 45
46/*
47 * RW_LOCK_UNLOCKED defeat lockdep state tracking and is hence
48 * deprecated.
49 *
50 * Please use DEFINE_RWLOCK() or __RW_LOCK_UNLOCKED() as appropriate.
51 */
52#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(old_style_rw_init)
53
54#define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) 46#define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x)
55 47
56#endif /* __LINUX_RWLOCK_TYPES_H */ 48#endif /* __LINUX_RWLOCK_TYPES_H */
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index bdfcc252797..34701241b67 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -12,15 +12,7 @@
12#error "please don't include linux/rwsem-spinlock.h directly, use linux/rwsem.h instead" 12#error "please don't include linux/rwsem-spinlock.h directly, use linux/rwsem.h instead"
13#endif 13#endif
14 14
15#include <linux/spinlock.h>
16#include <linux/list.h>
17
18#ifdef __KERNEL__ 15#ifdef __KERNEL__
19
20#include <linux/types.h>
21
22struct rwsem_waiter;
23
24/* 16/*
25 * the rw-semaphore definition 17 * the rw-semaphore definition
26 * - if activity is 0 then there are no active readers or writers 18 * - if activity is 0 then there are no active readers or writers
@@ -37,28 +29,7 @@ struct rw_semaphore {
37#endif 29#endif
38}; 30};
39 31
40#ifdef CONFIG_DEBUG_LOCK_ALLOC 32#define RWSEM_UNLOCKED_VALUE 0x00000000
41# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
42#else
43# define __RWSEM_DEP_MAP_INIT(lockname)
44#endif
45
46#define __RWSEM_INITIALIZER(name) \
47{ 0, __SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \
48 __RWSEM_DEP_MAP_INIT(name) }
49
50#define DECLARE_RWSEM(name) \
51 struct rw_semaphore name = __RWSEM_INITIALIZER(name)
52
53extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
54 struct lock_class_key *key);
55
56#define init_rwsem(sem) \
57do { \
58 static struct lock_class_key __key; \
59 \
60 __init_rwsem((sem), #sem, &__key); \
61} while (0)
62 33
63extern void __down_read(struct rw_semaphore *sem); 34extern void __down_read(struct rw_semaphore *sem);
64extern int __down_read_trylock(struct rw_semaphore *sem); 35extern int __down_read_trylock(struct rw_semaphore *sem);
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index efd348fe8ca..a8afe9cd000 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -11,6 +11,9 @@
11 11
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/list.h>
15#include <linux/spinlock.h>
16
14#include <asm/system.h> 17#include <asm/system.h>
15#include <asm/atomic.h> 18#include <asm/atomic.h>
16 19
@@ -19,9 +22,57 @@ struct rw_semaphore;
19#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK 22#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
20#include <linux/rwsem-spinlock.h> /* use a generic implementation */ 23#include <linux/rwsem-spinlock.h> /* use a generic implementation */
21#else 24#else
22#include <asm/rwsem.h> /* use an arch-specific implementation */ 25/* All arch specific implementations share the same struct */
26struct rw_semaphore {
27 long count;
28 spinlock_t wait_lock;
29 struct list_head wait_list;
30#ifdef CONFIG_DEBUG_LOCK_ALLOC
31 struct lockdep_map dep_map;
32#endif
33};
34
35extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
36extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
37extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
38extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
39
40/* Include the arch specific part */
41#include <asm/rwsem.h>
42
43/* In all implementations count != 0 means locked */
44static inline int rwsem_is_locked(struct rw_semaphore *sem)
45{
46 return sem->count != 0;
47}
48
49#endif
50
51/* Common initializer macros and functions */
52
53#ifdef CONFIG_DEBUG_LOCK_ALLOC
54# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
55#else
56# define __RWSEM_DEP_MAP_INIT(lockname)
23#endif 57#endif
24 58
59#define __RWSEM_INITIALIZER(name) \
60 { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED(name.wait_lock), \
61 LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) }
62
63#define DECLARE_RWSEM(name) \
64 struct rw_semaphore name = __RWSEM_INITIALIZER(name)
65
66extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
67 struct lock_class_key *key);
68
69#define init_rwsem(sem) \
70do { \
71 static struct lock_class_key __key; \
72 \
73 __init_rwsem((sem), #sem, &__key); \
74} while (0)
75
25/* 76/*
26 * lock for reading 77 * lock for reading
27 */ 78 */
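Since the initializer and init_rwsem() now live in the common header, the spinlock-based and arch-optimized builds declare and use semaphores identically; a sketch:

static DECLARE_RWSEM(cfg_sem);

static void cfg_read(void)
{
	down_read(&cfg_sem);
	/* ... read the shared configuration ... */
	up_read(&cfg_sem);
}

static void cfg_update(void)
{
	down_write(&cfg_sem);
	/* ... modify it; rwsem_is_locked(&cfg_sem) is now non-zero ... */
	up_write(&cfg_sem);
}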
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 777d8a5ed06..c15936fe998 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1058,6 +1058,7 @@ struct sched_class {
1058 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 1058 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1059 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 1059 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1060 void (*yield_task) (struct rq *rq); 1060 void (*yield_task) (struct rq *rq);
1061 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
1061 1062
1062 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1063 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1063 1064
@@ -1084,12 +1085,10 @@ struct sched_class {
1084 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1085 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
1085 void (*task_fork) (struct task_struct *p); 1086 void (*task_fork) (struct task_struct *p);
1086 1087
1087 void (*switched_from) (struct rq *this_rq, struct task_struct *task, 1088 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1088 int running); 1089 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1089 void (*switched_to) (struct rq *this_rq, struct task_struct *task,
1090 int running);
1091 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 1090 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1092 int oldprio, int running); 1091 int oldprio);
1093 1092
1094 unsigned int (*get_rr_interval) (struct rq *rq, 1093 unsigned int (*get_rr_interval) (struct rq *rq,
1095 struct task_struct *task); 1094 struct task_struct *task);
@@ -1715,7 +1714,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
1715/* 1714/*
1716 * Per process flags 1715 * Per process flags
1717 */ 1716 */
1718#define PF_KSOFTIRQD 0x00000001 /* I am ksoftirqd */
1719#define PF_STARTING 0x00000002 /* being created */ 1717#define PF_STARTING 0x00000002 /* being created */
1720#define PF_EXITING 0x00000004 /* getting shut down */ 1718#define PF_EXITING 0x00000004 /* getting shut down */
1721#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ 1719#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
@@ -1945,8 +1943,6 @@ int sched_rt_handler(struct ctl_table *table, int write,
1945 void __user *buffer, size_t *lenp, 1943 void __user *buffer, size_t *lenp,
1946 loff_t *ppos); 1944 loff_t *ppos);
1947 1945
1948extern unsigned int sysctl_sched_compat_yield;
1949
1950#ifdef CONFIG_SCHED_AUTOGROUP 1946#ifdef CONFIG_SCHED_AUTOGROUP
1951extern unsigned int sysctl_sched_autogroup_enabled; 1947extern unsigned int sysctl_sched_autogroup_enabled;
1952 1948
@@ -1977,6 +1973,7 @@ static inline int rt_mutex_getprio(struct task_struct *p)
1977# define rt_mutex_adjust_pi(p) do { } while (0) 1973# define rt_mutex_adjust_pi(p) do { } while (0)
1978#endif 1974#endif
1979 1975
1976extern bool yield_to(struct task_struct *p, bool preempt);
1980extern void set_user_nice(struct task_struct *p, long nice); 1977extern void set_user_nice(struct task_struct *p, long nice);
1981extern int task_prio(const struct task_struct *p); 1978extern int task_prio(const struct task_struct *p);
1982extern int task_nice(const struct task_struct *p); 1979extern int task_nice(const struct task_struct *p);
@@ -2049,7 +2046,7 @@ extern void release_uids(struct user_namespace *ns);
2049 2046
2050#include <asm/current.h> 2047#include <asm/current.h>
2051 2048
2052extern void do_timer(unsigned long ticks); 2049extern void xtime_update(unsigned long ticks);
2053 2050
2054extern int wake_up_state(struct task_struct *tsk, unsigned int state); 2051extern int wake_up_state(struct task_struct *tsk, unsigned int state);
2055extern int wake_up_process(struct task_struct *tsk); 2052extern int wake_up_process(struct task_struct *tsk);
@@ -2578,13 +2575,6 @@ static inline void inc_syscw(struct task_struct *tsk)
2578#define TASK_SIZE_OF(tsk) TASK_SIZE 2575#define TASK_SIZE_OF(tsk) TASK_SIZE
2579#endif 2576#endif
2580 2577
2581/*
2582 * Call the function if the target task is executing on a CPU right now:
2583 */
2584extern void task_oncpu_function_call(struct task_struct *p,
2585 void (*func) (void *info), void *info);
2586
2587
2588#ifdef CONFIG_MM_OWNER 2578#ifdef CONFIG_MM_OWNER
2589extern void mm_update_next_owner(struct mm_struct *mm); 2579extern void mm_update_next_owner(struct mm_struct *mm);
2590extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); 2580extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
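yield_to() is a directed yield: instead of handing the CPU to whoever is next, the caller nominates a specific task (the motivating user is virtualization, boosting a lock-holding vCPU). A hedged sketch, assuming the bool return reports whether the yield actually took place:

/* 'target' is the task we would like to run next */
static void boost(struct task_struct *target)
{
	if (!yield_to(target, true))	/* true: may preempt target's CPU */
		cond_resched();		/* directed yield failed; yield normally */
}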
diff --git a/include/linux/security.h b/include/linux/security.h
index b2b7f9749f5..debbd97db7a 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -53,7 +53,7 @@ struct audit_krule;
53 */ 53 */
54extern int cap_capable(struct task_struct *tsk, const struct cred *cred, 54extern int cap_capable(struct task_struct *tsk, const struct cred *cred,
55 int cap, int audit); 55 int cap, int audit);
56extern int cap_settime(struct timespec *ts, struct timezone *tz); 56extern int cap_settime(const struct timespec *ts, const struct timezone *tz);
57extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode); 57extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode);
58extern int cap_ptrace_traceme(struct task_struct *parent); 58extern int cap_ptrace_traceme(struct task_struct *parent);
59extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); 59extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
@@ -1387,7 +1387,7 @@ struct security_operations {
1387 int (*quotactl) (int cmds, int type, int id, struct super_block *sb); 1387 int (*quotactl) (int cmds, int type, int id, struct super_block *sb);
1388 int (*quota_on) (struct dentry *dentry); 1388 int (*quota_on) (struct dentry *dentry);
1389 int (*syslog) (int type); 1389 int (*syslog) (int type);
1390 int (*settime) (struct timespec *ts, struct timezone *tz); 1390 int (*settime) (const struct timespec *ts, const struct timezone *tz);
1391 int (*vm_enough_memory) (struct mm_struct *mm, long pages); 1391 int (*vm_enough_memory) (struct mm_struct *mm, long pages);
1392 1392
1393 int (*bprm_set_creds) (struct linux_binprm *bprm); 1393 int (*bprm_set_creds) (struct linux_binprm *bprm);
@@ -1669,7 +1669,7 @@ int security_sysctl(struct ctl_table *table, int op);
1669int security_quotactl(int cmds, int type, int id, struct super_block *sb); 1669int security_quotactl(int cmds, int type, int id, struct super_block *sb);
1670int security_quota_on(struct dentry *dentry); 1670int security_quota_on(struct dentry *dentry);
1671int security_syslog(int type); 1671int security_syslog(int type);
1672int security_settime(struct timespec *ts, struct timezone *tz); 1672int security_settime(const struct timespec *ts, const struct timezone *tz);
1673int security_vm_enough_memory(long pages); 1673int security_vm_enough_memory(long pages);
1674int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); 1674int security_vm_enough_memory_mm(struct mm_struct *mm, long pages);
1675int security_vm_enough_memory_kern(long pages); 1675int security_vm_enough_memory_kern(long pages);
@@ -1904,7 +1904,8 @@ static inline int security_syslog(int type)
1904 return 0; 1904 return 0;
1905} 1905}
1906 1906
1907static inline int security_settime(struct timespec *ts, struct timezone *tz) 1907static inline int security_settime(const struct timespec *ts,
1908 const struct timezone *tz)
1908{ 1909{
1909 return cap_settime(ts, tz); 1910 return cap_settime(ts, tz);
1910} 1911}
diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
index 851b7783720..73548eb13a5 100644
--- a/include/linux/spinlock_types.h
+++ b/include/linux/spinlock_types.h
@@ -81,14 +81,6 @@ typedef struct spinlock {
81#define __SPIN_LOCK_UNLOCKED(lockname) \ 81#define __SPIN_LOCK_UNLOCKED(lockname) \
82 (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname) 82 (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
83 83
84/*
85 * SPIN_LOCK_UNLOCKED defeats lockdep state tracking and is hence
86 * deprecated.
87 * Please use DEFINE_SPINLOCK() or __SPIN_LOCK_UNLOCKED() as
88 * appropriate.
89 */
90#define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(old_style_spin_init)
91
92#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) 84#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
93 85
94#include <linux/rwlock_types.h> 86#include <linux/rwlock_types.h>
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 98664db1be4..1f5c18e6f4f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -62,6 +62,7 @@ struct robust_list_head;
62struct getcpu_cache; 62struct getcpu_cache;
63struct old_linux_dirent; 63struct old_linux_dirent;
64struct perf_event_attr; 64struct perf_event_attr;
65struct file_handle;
65 66
66#include <linux/types.h> 67#include <linux/types.h>
67#include <linux/aio_abi.h> 68#include <linux/aio_abi.h>
@@ -132,11 +133,11 @@ extern struct trace_event_functions exit_syscall_print_funcs;
132 .class = &event_class_syscall_enter, \ 133 .class = &event_class_syscall_enter, \
133 .event.funcs = &enter_syscall_print_funcs, \ 134 .event.funcs = &enter_syscall_print_funcs, \
134 .data = (void *)&__syscall_meta_##sname,\ 135 .data = (void *)&__syscall_meta_##sname,\
136 .flags = TRACE_EVENT_FL_CAP_ANY, \
135 }; \ 137 }; \
136 static struct ftrace_event_call __used \ 138 static struct ftrace_event_call __used \
137 __attribute__((section("_ftrace_events"))) \ 139 __attribute__((section("_ftrace_events"))) \
138 *__event_enter_##sname = &event_enter_##sname; \ 140 *__event_enter_##sname = &event_enter_##sname;
139 __TRACE_EVENT_FLAGS(enter_##sname, TRACE_EVENT_FL_CAP_ANY)
140 141
141#define SYSCALL_TRACE_EXIT_EVENT(sname) \ 142#define SYSCALL_TRACE_EXIT_EVENT(sname) \
142 static struct syscall_metadata __syscall_meta_##sname; \ 143 static struct syscall_metadata __syscall_meta_##sname; \
@@ -146,11 +147,11 @@ extern struct trace_event_functions exit_syscall_print_funcs;
146 .class = &event_class_syscall_exit, \ 147 .class = &event_class_syscall_exit, \
147 .event.funcs = &exit_syscall_print_funcs, \ 148 .event.funcs = &exit_syscall_print_funcs, \
148 .data = (void *)&__syscall_meta_##sname,\ 149 .data = (void *)&__syscall_meta_##sname,\
150 .flags = TRACE_EVENT_FL_CAP_ANY, \
149 }; \ 151 }; \
150 static struct ftrace_event_call __used \ 152 static struct ftrace_event_call __used \
151 __attribute__((section("_ftrace_events"))) \ 153 __attribute__((section("_ftrace_events"))) \
152 *__event_exit_##sname = &event_exit_##sname; \ 154 *__event_exit_##sname = &event_exit_##sname;
153 __TRACE_EVENT_FLAGS(exit_##sname, TRACE_EVENT_FL_CAP_ANY)
154 155
155#define SYSCALL_METADATA(sname, nb) \ 156#define SYSCALL_METADATA(sname, nb) \
156 SYSCALL_TRACE_ENTER_EVENT(sname); \ 157 SYSCALL_TRACE_ENTER_EVENT(sname); \
@@ -158,6 +159,7 @@ extern struct trace_event_functions exit_syscall_print_funcs;
158 static struct syscall_metadata __used \ 159 static struct syscall_metadata __used \
159 __syscall_meta_##sname = { \ 160 __syscall_meta_##sname = { \
160 .name = "sys"#sname, \ 161 .name = "sys"#sname, \
162 .syscall_nr = -1, /* Filled in at boot */ \
161 .nb_args = nb, \ 163 .nb_args = nb, \
162 .types = types_##sname, \ 164 .types = types_##sname, \
163 .args = args_##sname, \ 165 .args = args_##sname, \
@@ -175,6 +177,7 @@ extern struct trace_event_functions exit_syscall_print_funcs;
175 static struct syscall_metadata __used \ 177 static struct syscall_metadata __used \
176 __syscall_meta__##sname = { \ 178 __syscall_meta__##sname = { \
177 .name = "sys_"#sname, \ 179 .name = "sys_"#sname, \
180 .syscall_nr = -1, /* Filled in at boot */ \
178 .nb_args = 0, \ 181 .nb_args = 0, \
179 .enter_event = &event_enter__##sname, \ 182 .enter_event = &event_enter__##sname, \
180 .exit_event = &event_exit__##sname, \ 183 .exit_event = &event_exit__##sname, \
@@ -313,6 +316,8 @@ asmlinkage long sys_clock_settime(clockid_t which_clock,
313 const struct timespec __user *tp); 316 const struct timespec __user *tp);
314asmlinkage long sys_clock_gettime(clockid_t which_clock, 317asmlinkage long sys_clock_gettime(clockid_t which_clock,
315 struct timespec __user *tp); 318 struct timespec __user *tp);
319asmlinkage long sys_clock_adjtime(clockid_t which_clock,
320 struct timex __user *tx);
316asmlinkage long sys_clock_getres(clockid_t which_clock, 321asmlinkage long sys_clock_getres(clockid_t which_clock,
317 struct timespec __user *tp); 322 struct timespec __user *tp);
318asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags, 323asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags,
@@ -832,5 +837,10 @@ asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len,
832 unsigned long prot, unsigned long flags, 837 unsigned long prot, unsigned long flags,
833 unsigned long fd, unsigned long pgoff); 838 unsigned long fd, unsigned long pgoff);
834asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg); 839asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg);
835 840asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
841 struct file_handle __user *handle,
842 int __user *mnt_id, int flag);
843asmlinkage long sys_open_by_handle_at(int mountdirfd,
844 struct file_handle __user *handle,
845 int flags);
836#endif 846#endif
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index c9069654417..20fc303947d 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -18,9 +18,6 @@ struct compat_timespec;
18struct restart_block { 18struct restart_block {
19 long (*fn)(struct restart_block *); 19 long (*fn)(struct restart_block *);
20 union { 20 union {
21 struct {
22 unsigned long arg0, arg1, arg2, arg3;
23 };
24 /* For futex_wait and futex_wait_requeue_pi */ 21 /* For futex_wait and futex_wait_requeue_pi */
25 struct { 22 struct {
26 u32 __user *uaddr; 23 u32 __user *uaddr;
diff --git a/include/linux/time.h b/include/linux/time.h
index 1e6d3b59238..454a2620578 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -113,8 +113,6 @@ static inline struct timespec timespec_sub(struct timespec lhs,
113#define timespec_valid(ts) \ 113#define timespec_valid(ts) \
114 (((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC)) 114 (((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC))
115 115
116extern seqlock_t xtime_lock;
117
118extern void read_persistent_clock(struct timespec *ts); 116extern void read_persistent_clock(struct timespec *ts);
119extern void read_boot_clock(struct timespec *ts); 117extern void read_boot_clock(struct timespec *ts);
120extern int update_persistent_clock(struct timespec now); 118extern int update_persistent_clock(struct timespec now);
@@ -125,8 +123,9 @@ extern int timekeeping_suspended;
125unsigned long get_seconds(void); 123unsigned long get_seconds(void);
126struct timespec current_kernel_time(void); 124struct timespec current_kernel_time(void);
127struct timespec __current_kernel_time(void); /* does not take xtime_lock */ 125struct timespec __current_kernel_time(void); /* does not take xtime_lock */
128struct timespec __get_wall_to_monotonic(void); /* does not take xtime_lock */
129struct timespec get_monotonic_coarse(void); 126struct timespec get_monotonic_coarse(void);
127void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
128 struct timespec *wtom, struct timespec *sleep);
130 129
131#define CURRENT_TIME (current_kernel_time()) 130#define CURRENT_TIME (current_kernel_time())
132#define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 }) 131#define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 })
@@ -147,8 +146,9 @@ static inline u32 arch_gettimeoffset(void) { return 0; }
147#endif 146#endif
148 147
149extern void do_gettimeofday(struct timeval *tv); 148extern void do_gettimeofday(struct timeval *tv);
150extern int do_settimeofday(struct timespec *tv); 149extern int do_settimeofday(const struct timespec *tv);
151extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz); 150extern int do_sys_settimeofday(const struct timespec *tv,
151 const struct timezone *tz);
152#define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts) 152#define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts)
153extern long do_utimes(int dfd, const char __user *filename, struct timespec *times, int flags); 153extern long do_utimes(int dfd, const char __user *filename, struct timespec *times, int flags);
154struct itimerval; 154struct itimerval;
@@ -162,12 +162,13 @@ extern void getnstime_raw_and_real(struct timespec *ts_raw,
162 struct timespec *ts_real); 162 struct timespec *ts_real);
163extern void getboottime(struct timespec *ts); 163extern void getboottime(struct timespec *ts);
164extern void monotonic_to_bootbased(struct timespec *ts); 164extern void monotonic_to_bootbased(struct timespec *ts);
165extern void get_monotonic_boottime(struct timespec *ts);
165 166
166extern struct timespec timespec_trunc(struct timespec t, unsigned gran); 167extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
167extern int timekeeping_valid_for_hres(void); 168extern int timekeeping_valid_for_hres(void);
168extern u64 timekeeping_max_deferment(void); 169extern u64 timekeeping_max_deferment(void);
169extern void update_wall_time(void);
170extern void timekeeping_leap_insert(int leapsecond); 170extern void timekeeping_leap_insert(int leapsecond);
171extern int timekeeping_inject_offset(struct timespec *ts);
171 172
172struct tms; 173struct tms;
173extern void do_sys_times(struct tms *); 174extern void do_sys_times(struct tms *);
@@ -292,6 +293,7 @@ struct itimerval {
292#define CLOCK_MONOTONIC_RAW 4 293#define CLOCK_MONOTONIC_RAW 4
293#define CLOCK_REALTIME_COARSE 5 294#define CLOCK_REALTIME_COARSE 5
294#define CLOCK_MONOTONIC_COARSE 6 295#define CLOCK_MONOTONIC_COARSE 6
296#define CLOCK_BOOTTIME 7
295 297
296/* 298/*
297 * The IDs of various hardware clocks: 299 * The IDs of various hardware clocks:
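CLOCK_BOOTTIME pairs with the new get_monotonic_boottime() helper: it behaves like CLOCK_MONOTONIC but keeps counting across suspend. From user space it is just another clockid; a sketch (the fallback define assumes the uapi headers have not yet caught up):

#include <stdio.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7	/* from the hunk above */
#endif

int main(void)
{
	struct timespec ts;

	if (clock_gettime(CLOCK_BOOTTIME, &ts) == 0)
		printf("up (incl. suspend): %ld s\n", (long) ts.tv_sec);
	return 0;
}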
diff --git a/include/linux/timex.h b/include/linux/timex.h
index d23999f9499..aa60fe7b6ed 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -73,7 +73,7 @@ struct timex {
73 long tolerance; /* clock frequency tolerance (ppm) 73 long tolerance; /* clock frequency tolerance (ppm)
74 * (read only) 74 * (read only)
75 */ 75 */
76 struct timeval time; /* (read only) */ 76 struct timeval time; /* (read only, except for ADJ_SETOFFSET) */
77 long tick; /* (modified) usecs between clock ticks */ 77 long tick; /* (modified) usecs between clock ticks */
78 78
79 long ppsfreq; /* pps frequency (scaled ppm) (ro) */ 79 long ppsfreq; /* pps frequency (scaled ppm) (ro) */
@@ -102,6 +102,7 @@ struct timex {
102#define ADJ_STATUS 0x0010 /* clock status */ 102#define ADJ_STATUS 0x0010 /* clock status */
103#define ADJ_TIMECONST 0x0020 /* pll time constant */ 103#define ADJ_TIMECONST 0x0020 /* pll time constant */
104#define ADJ_TAI 0x0080 /* set TAI offset */ 104#define ADJ_TAI 0x0080 /* set TAI offset */
105#define ADJ_SETOFFSET 0x0100 /* add 'time' to current time */
105#define ADJ_MICRO 0x1000 /* select microsecond resolution */ 106#define ADJ_MICRO 0x1000 /* select microsecond resolution */
106#define ADJ_NANO 0x2000 /* select nanosecond resolution */ 107#define ADJ_NANO 0x2000 /* select nanosecond resolution */
107#define ADJ_TICK 0x4000 /* tick value */ 108#define ADJ_TICK 0x4000 /* tick value */
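ADJ_SETOFFSET turns the otherwise read-only 'time' field into an input: the kernel adds it to the current time in one atomic step, which is how clock_adjtime() (declared earlier in this series) can step a clock rather than slew it. A sketch, assuming ADJ_NANO makes tv_usec carry nanoseconds and that the new flag has reached the installed headers:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = {
		.modes        = ADJ_SETOFFSET | ADJ_NANO,
		.time.tv_sec  = 1,		/* step forward by 1.5 s */
		.time.tv_usec = 500000000,	/* nanoseconds under ADJ_NANO */
	};

	if (adjtimex(&tx) < 0)
		perror("adjtimex");
	return 0;
}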
diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h
index 7eee77895cb..4cbbcef6baa 100644
--- a/include/trace/events/mce.h
+++ b/include/trace/events/mce.h
@@ -17,36 +17,36 @@ TRACE_EVENT(mce_record,
17 TP_STRUCT__entry( 17 TP_STRUCT__entry(
18 __field( u64, mcgcap ) 18 __field( u64, mcgcap )
19 __field( u64, mcgstatus ) 19 __field( u64, mcgstatus )
20 __field( u8, bank )
21 __field( u64, status ) 20 __field( u64, status )
22 __field( u64, addr ) 21 __field( u64, addr )
23 __field( u64, misc ) 22 __field( u64, misc )
24 __field( u64, ip ) 23 __field( u64, ip )
25 __field( u8, cs )
26 __field( u64, tsc ) 24 __field( u64, tsc )
27 __field( u64, walltime ) 25 __field( u64, walltime )
28 __field( u32, cpu ) 26 __field( u32, cpu )
29 __field( u32, cpuid ) 27 __field( u32, cpuid )
30 __field( u32, apicid ) 28 __field( u32, apicid )
31 __field( u32, socketid ) 29 __field( u32, socketid )
30 __field( u8, cs )
31 __field( u8, bank )
32 __field( u8, cpuvendor ) 32 __field( u8, cpuvendor )
33 ), 33 ),
34 34
35 TP_fast_assign( 35 TP_fast_assign(
36 __entry->mcgcap = m->mcgcap; 36 __entry->mcgcap = m->mcgcap;
37 __entry->mcgstatus = m->mcgstatus; 37 __entry->mcgstatus = m->mcgstatus;
38 __entry->bank = m->bank;
39 __entry->status = m->status; 38 __entry->status = m->status;
40 __entry->addr = m->addr; 39 __entry->addr = m->addr;
41 __entry->misc = m->misc; 40 __entry->misc = m->misc;
42 __entry->ip = m->ip; 41 __entry->ip = m->ip;
43 __entry->cs = m->cs;
44 __entry->tsc = m->tsc; 42 __entry->tsc = m->tsc;
45 __entry->walltime = m->time; 43 __entry->walltime = m->time;
46 __entry->cpu = m->extcpu; 44 __entry->cpu = m->extcpu;
47 __entry->cpuid = m->cpuid; 45 __entry->cpuid = m->cpuid;
48 __entry->apicid = m->apicid; 46 __entry->apicid = m->apicid;
49 __entry->socketid = m->socketid; 47 __entry->socketid = m->socketid;
48 __entry->cs = m->cs;
49 __entry->bank = m->bank;
50 __entry->cpuvendor = m->cpuvendor; 50 __entry->cpuvendor = m->cpuvendor;
51 ), 51 ),
52 52
diff --git a/include/trace/events/module.h b/include/trace/events/module.h
index c6bae36547e..21a546d27c0 100644
--- a/include/trace/events/module.h
+++ b/include/trace/events/module.h
@@ -108,14 +108,14 @@ TRACE_EVENT(module_request,
108 TP_ARGS(name, wait, ip), 108 TP_ARGS(name, wait, ip),
109 109
110 TP_STRUCT__entry( 110 TP_STRUCT__entry(
111 __field( bool, wait )
112 __field( unsigned long, ip ) 111 __field( unsigned long, ip )
112 __field( bool, wait )
113 __string( name, name ) 113 __string( name, name )
114 ), 114 ),
115 115
116 TP_fast_assign( 116 TP_fast_assign(
117 __entry->wait = wait;
118 __entry->ip = ip; 117 __entry->ip = ip;
118 __entry->wait = wait;
119 __assign_str(name, name); 119 __assign_str(name, name);
120 ), 120 ),
121 121
@@ -129,4 +129,3 @@ TRACE_EVENT(module_request,
129 129
130/* This part must be outside protection */ 130/* This part must be outside protection */
131#include <trace/define_trace.h> 131#include <trace/define_trace.h>
132
diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
index f10293c41b1..0c68ae22da2 100644
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -19,14 +19,14 @@ TRACE_EVENT(kfree_skb,
19 19
20 TP_STRUCT__entry( 20 TP_STRUCT__entry(
21 __field( void *, skbaddr ) 21 __field( void *, skbaddr )
22 __field( unsigned short, protocol )
23 __field( void *, location ) 22 __field( void *, location )
23 __field( unsigned short, protocol )
24 ), 24 ),
25 25
26 TP_fast_assign( 26 TP_fast_assign(
27 __entry->skbaddr = skb; 27 __entry->skbaddr = skb;
28 __entry->protocol = ntohs(skb->protocol);
29 __entry->location = location; 28 __entry->location = location;
29 __entry->protocol = ntohs(skb->protocol);
30 ), 30 ),
31 31
32 TP_printk("skbaddr=%p protocol=%u location=%p", 32 TP_printk("skbaddr=%p protocol=%u location=%p",
diff --git a/include/xen/events.h b/include/xen/events.h
index 00f53ddcc06..962da2ced5b 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -75,11 +75,9 @@ int xen_allocate_pirq(unsigned gsi, int shareable, char *name);
75int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name); 75int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name);
76 76
77#ifdef CONFIG_PCI_MSI 77#ifdef CONFIG_PCI_MSI
78/* Allocate an irq and a pirq to be used with MSIs. */ 78int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc);
79#define XEN_ALLOC_PIRQ (1 << 0) 79int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
80#define XEN_ALLOC_IRQ (1 << 1) 80 int pirq, int vector, const char *name);
81void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc_mask);
82int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type);
83#endif 81#endif
84 82
85/* De-allocates the above mentioned physical interrupt. */ 83/* De-allocates the above mentioned physical interrupt. */
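
The two XEN_ALLOC_* flags and the old prototypes are replaced by a two-step allocate/bind interface. A hedged sketch of a caller, based only on the prototypes above; the assumptions that xen_allocate_pirq_msi() returns the new pirq (negative errno on failure) and that a zero vector lets Xen pick one are mine, not the patch's:

    /* Sketch only; see assumptions in the note above. */
    static int setup_msi(struct pci_dev *dev, struct msi_desc *msidesc)
    {
        int pirq, irq;

        pirq = xen_allocate_pirq_msi(dev, msidesc);
        if (pirq < 0)
            return pirq;

        irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq,
                                       0 /* vector: assumed "any" */,
                                       "pcifront-msi");
        return irq;
    }
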
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index c2d1fa4dc1e..61e523af3c4 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -51,11 +51,7 @@ typedef uint64_t blkif_sector_t;
51 */ 51 */
52#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 52#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
53 53
54struct blkif_request { 54struct blkif_request_rw {
55 uint8_t operation; /* BLKIF_OP_??? */
56 uint8_t nr_segments; /* number of segments */
57 blkif_vdev_t handle; /* only for read/write requests */
58 uint64_t id; /* private guest value, echoed in resp */
59 blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ 55 blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
60 struct blkif_request_segment { 56 struct blkif_request_segment {
61 grant_ref_t gref; /* reference to I/O buffer frame */ 57 grant_ref_t gref; /* reference to I/O buffer frame */
@@ -65,6 +61,16 @@ struct blkif_request {
65 } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 61 } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
66}; 62};
67 63
64struct blkif_request {
65 uint8_t operation; /* BLKIF_OP_??? */
66 uint8_t nr_segments; /* number of segments */
67 blkif_vdev_t handle; /* only for read/write requests */
68 uint64_t id; /* private guest value, echoed in resp */
69 union {
70 struct blkif_request_rw rw;
71 } u;
72};
73
68struct blkif_response { 74struct blkif_response {
69 uint64_t id; /* copied from request */ 75 uint64_t id; /* copied from request */
70 uint8_t operation; /* copied from request */ 76 uint8_t operation; /* copied from request */
@@ -91,4 +97,25 @@ DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
91#define VDISK_REMOVABLE 0x2 97#define VDISK_REMOVABLE 0x2
92#define VDISK_READONLY 0x4 98#define VDISK_READONLY 0x4
93 99
100/* Xen-defined major numbers for virtual disks, they look strangely
101 * familiar */
102#define XEN_IDE0_MAJOR 3
103#define XEN_IDE1_MAJOR 22
104#define XEN_SCSI_DISK0_MAJOR 8
105#define XEN_SCSI_DISK1_MAJOR 65
106#define XEN_SCSI_DISK2_MAJOR 66
107#define XEN_SCSI_DISK3_MAJOR 67
108#define XEN_SCSI_DISK4_MAJOR 68
109#define XEN_SCSI_DISK5_MAJOR 69
110#define XEN_SCSI_DISK6_MAJOR 70
111#define XEN_SCSI_DISK7_MAJOR 71
112#define XEN_SCSI_DISK8_MAJOR 128
113#define XEN_SCSI_DISK9_MAJOR 129
114#define XEN_SCSI_DISK10_MAJOR 130
115#define XEN_SCSI_DISK11_MAJOR 131
116#define XEN_SCSI_DISK12_MAJOR 132
117#define XEN_SCSI_DISK13_MAJOR 133
118#define XEN_SCSI_DISK14_MAJOR 134
119#define XEN_SCSI_DISK15_MAJOR 135
120
94#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */ 121#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
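
Read/write-specific fields now sit behind u.rw, leaving the union free for future request types. A sketch of how a block frontend fills in a request under the new layout; BLKIF_OP_READ and the segment fields come from this header (the first_sect/last_sect members are elided in the hunk above), while the ring machinery that produces ring_req is assumed:

    /* Sketch: ring_req obtained from the shared ring, one segment. */
    static void fill_read_request(struct blkif_request *ring_req,
                                  blkif_vdev_t handle, uint64_t id,
                                  blkif_sector_t sector, grant_ref_t gref)
    {
        ring_req->operation   = BLKIF_OP_READ;
        ring_req->nr_segments = 1;
        ring_req->handle      = handle;
        ring_req->id          = id;
        /* r/w-specific fields now live behind the union: */
        ring_req->u.rw.sector_number     = sector;
        ring_req->u.rw.seg[0].gref       = gref;
        ring_req->u.rw.seg[0].first_sect = 0;
        ring_req->u.rw.seg[0].last_sect  = 7;  /* whole 4K frame */
    }
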
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 2befa3e2f1b..b33257bc7e8 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -30,7 +30,7 @@
30#define __HYPERVISOR_stack_switch 3 30#define __HYPERVISOR_stack_switch 3
31#define __HYPERVISOR_set_callbacks 4 31#define __HYPERVISOR_set_callbacks 4
32#define __HYPERVISOR_fpu_taskswitch 5 32#define __HYPERVISOR_fpu_taskswitch 5
33#define __HYPERVISOR_sched_op 6 33#define __HYPERVISOR_sched_op_compat 6
34#define __HYPERVISOR_dom0_op 7 34#define __HYPERVISOR_dom0_op 7
35#define __HYPERVISOR_set_debugreg 8 35#define __HYPERVISOR_set_debugreg 8
36#define __HYPERVISOR_get_debugreg 9 36#define __HYPERVISOR_get_debugreg 9
@@ -52,7 +52,7 @@
52#define __HYPERVISOR_mmuext_op 26 52#define __HYPERVISOR_mmuext_op 26
53#define __HYPERVISOR_acm_op 27 53#define __HYPERVISOR_acm_op 27
54#define __HYPERVISOR_nmi_op 28 54#define __HYPERVISOR_nmi_op 28
55#define __HYPERVISOR_sched_op_new 29 55#define __HYPERVISOR_sched_op 29
56#define __HYPERVISOR_callback_op 30 56#define __HYPERVISOR_callback_op 30
57#define __HYPERVISOR_xenoprof_op 31 57#define __HYPERVISOR_xenoprof_op 31
58#define __HYPERVISOR_event_channel_op 32 58#define __HYPERVISOR_event_channel_op 32
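
Hypercall slot 6 is demoted to __HYPERVISOR_sched_op_compat and slot 29 (previously sched_op_new) becomes the canonical __HYPERVISOR_sched_op, so code written against the current ABI keeps its spelling while the old ABI is explicitly marked compat. For instance, the usual yield wrapper (standard Xen usage, not part of this hunk) now expands to hypercall 29:

    static inline void xen_yield(void)
    {
        /* SCHEDOP_yield via the canonical sched_op hypercall. */
        HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
    }
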
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 98b92154a26..03c85d7387f 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -5,9 +5,9 @@
5 5
6DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); 6DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
7 7
8void xen_pre_suspend(void); 8void xen_arch_pre_suspend(void);
9void xen_post_suspend(int suspend_cancelled); 9void xen_arch_post_suspend(int suspend_cancelled);
10void xen_hvm_post_suspend(int suspend_cancelled); 10void xen_arch_hvm_post_suspend(int suspend_cancelled);
11 11
12void xen_mm_pin_all(void); 12void xen_mm_pin_all(void);
13void xen_mm_unpin_all(void); 13void xen_mm_unpin_all(void);
diff --git a/init/Kconfig b/init/Kconfig
index be788c0957d..5721d27af62 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -287,6 +287,18 @@ config BSD_PROCESS_ACCT_V3
287 for processing it. A preliminary version of these tools is available 287 for processing it. A preliminary version of these tools is available
288 at <http://www.gnu.org/software/acct/>. 288 at <http://www.gnu.org/software/acct/>.
289 289
290config FHANDLE
291 bool "open by fhandle syscalls"
292 select EXPORTFS
293 help
294 If you say Y here, a user level program will be able to map
295 file names to handle and then later use the handle for
296 different file system operations. This is useful in implementing
297 userspace file servers, which now track files using handles instead
298 of names. The handle would remain the same even if file names
299 get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2)
300 syscalls.
301
290config TASKSTATS 302config TASKSTATS
291 bool "Export task/process statistics through netlink (EXPERIMENTAL)" 303 bool "Export task/process statistics through netlink (EXPERIMENTAL)"
292 depends on NET 304 depends on NET
@@ -683,6 +695,16 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
683 select this option (if, for some reason, they need to disable it 695 select this option (if, for some reason, they need to disable it
684 then noswapaccount does the trick). 696 then noswapaccount does the trick).
685 697
698config CGROUP_PERF
699 bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
700 depends on PERF_EVENTS && CGROUPS
701 help
702 This option extends the per-cpu mode to restrict monitoring to
703 threads which belong to the cgroup specified and run on the
704 designated cpu.
705
706 Say N if unsure.
707
686menuconfig CGROUP_SCHED 708menuconfig CGROUP_SCHED
687 bool "Group CPU scheduler" 709 bool "Group CPU scheduler"
688 depends on EXPERIMENTAL 710 depends on EXPERIMENTAL
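
CONFIG_FHANDLE enables the name_to_handle_at(2)/open_by_handle_at(2) pair described in the help text. A user-space sketch of the round trip; this uses the glibc wrappers that only shipped later (2.14), so on older libcs the raw syscall numbers are needed, and open_by_handle_at() additionally requires CAP_DAC_READ_SEARCH:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
        struct file_handle *fh;
        int mount_id, mount_fd, fd;

        if (argc < 2)
            return 1;

        fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
        if (!fh)
            return 1;
        fh->handle_bytes = MAX_HANDLE_SZ;

        if (name_to_handle_at(AT_FDCWD, argv[1], fh, &mount_id, 0) == -1) {
            perror("name_to_handle_at");
            return 1;
        }

        /* The handle stays valid even if argv[1] is renamed. */
        mount_fd = open(argv[1], O_RDONLY);   /* any fd on that mount */
        fd = open_by_handle_at(mount_fd, fh, O_RDONLY);
        if (fd == -1)
            perror("open_by_handle_at");      /* needs CAP_DAC_READ_SEARCH */
        return 0;
    }
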
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d2e3c786646..e683869365d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
144} 144}
145 145
146/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
147static struct audit_parent *audit_init_parent(struct nameidata *ndp) 147static struct audit_parent *audit_init_parent(struct path *path)
148{ 148{
149 struct inode *inode = ndp->path.dentry->d_inode; 149 struct inode *inode = path->dentry->d_inode;
150 struct audit_parent *parent; 150 struct audit_parent *parent;
151 int ret; 151 int ret;
152 152
@@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
353} 353}
354 354
355/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
356static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) 356static int audit_get_nd(struct audit_watch *watch, struct path *parent)
357{ 357{
358 struct nameidata *ndparent, *ndwatch; 358 struct nameidata nd;
359 struct dentry *d;
359 int err; 360 int err;
360 361
361 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); 362 err = kern_path_parent(watch->path, &nd);
362 if (unlikely(!ndparent)) 363 if (err)
363 return -ENOMEM; 364 return err;
364 365
365 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); 366 if (nd.last_type != LAST_NORM) {
366 if (unlikely(!ndwatch)) { 367 path_put(&nd.path);
367 kfree(ndparent); 368 return -EINVAL;
368 return -ENOMEM;
369 } 369 }
370 370
371 err = path_lookup(path, LOOKUP_PARENT, ndparent); 371 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
372 if (err) { 372 d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
373 kfree(ndparent); 373 if (IS_ERR(d)) {
374 kfree(ndwatch); 374 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
375 return err; 375 path_put(&nd.path);
376 return PTR_ERR(d);
376 } 377 }
377 378 if (d->d_inode) {
378 err = path_lookup(path, 0, ndwatch); 379 /* update watch filter fields */
379 if (err) { 380 watch->dev = d->d_inode->i_sb->s_dev;
380 kfree(ndwatch); 381 watch->ino = d->d_inode->i_ino;
381 ndwatch = NULL;
382 } 382 }
383 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
383 384
384 *ndp = ndparent; 385 *parent = nd.path;
385 *ndw = ndwatch; 386 dput(d);
386
387 return 0; 387 return 0;
388} 388}
389 389
390/* Release resources used for watch path information. */
391static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
392{
393 if (ndp) {
394 path_put(&ndp->path);
395 kfree(ndp);
396 }
397 if (ndw) {
398 path_put(&ndw->path);
399 kfree(ndw);
400 }
401}
402
403/* Associate the given rule with an existing parent. 390/* Associate the given rule with an existing parent.
404 * Caller must hold audit_filter_mutex. */ 391 * Caller must hold audit_filter_mutex. */
405static void audit_add_to_parent(struct audit_krule *krule, 392static void audit_add_to_parent(struct audit_krule *krule,
@@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
440{ 427{
441 struct audit_watch *watch = krule->watch; 428 struct audit_watch *watch = krule->watch;
442 struct audit_parent *parent; 429 struct audit_parent *parent;
443 struct nameidata *ndp = NULL, *ndw = NULL; 430 struct path parent_path;
444 int h, ret = 0; 431 int h, ret = 0;
445 432
446 mutex_unlock(&audit_filter_mutex); 433 mutex_unlock(&audit_filter_mutex);
447 434
448 /* Avoid calling path_lookup under audit_filter_mutex. */ 435 /* Avoid calling path_lookup under audit_filter_mutex. */
449 ret = audit_get_nd(watch->path, &ndp, &ndw); 436 ret = audit_get_nd(watch, &parent_path);
450 if (ret) {
451 /* caller expects mutex locked */
452 mutex_lock(&audit_filter_mutex);
453 goto error;
454 }
455 437
438 /* caller expects mutex locked */
456 mutex_lock(&audit_filter_mutex); 439 mutex_lock(&audit_filter_mutex);
457 440
458 /* update watch filter fields */ 441 if (ret)
459 if (ndw) { 442 return ret;
460 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
461 watch->ino = ndw->path.dentry->d_inode->i_ino;
462 }
463 443
464 /* either find an old parent or attach a new one */ 444 /* either find an old parent or attach a new one */
465 parent = audit_find_parent(ndp->path.dentry->d_inode); 445 parent = audit_find_parent(parent_path.dentry->d_inode);
466 if (!parent) { 446 if (!parent) {
467 parent = audit_init_parent(ndp); 447 parent = audit_init_parent(&parent_path);
468 if (IS_ERR(parent)) { 448 if (IS_ERR(parent)) {
469 ret = PTR_ERR(parent); 449 ret = PTR_ERR(parent);
470 goto error; 450 goto error;
@@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
479 h = audit_hash_ino((u32)watch->ino); 459 h = audit_hash_ino((u32)watch->ino);
480 *list = &audit_inode_hash[h]; 460 *list = &audit_inode_hash[h];
481error: 461error:
482 audit_put_nd(ndp, ndw); /* NULL args OK */ 462 path_put(&parent_path);
483 return ret; 463 return ret;
484
485} 464}
486 465
487void audit_remove_watch_rule(struct audit_krule *krule) 466void audit_remove_watch_rule(struct audit_krule *krule)
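
Net effect of the audit_watch rework: two heap-allocated full nameidata lookups are replaced by one kern_path_parent() plus a locked lookup_one_len() of the final component. Stripped of the audit bookkeeping, the idiom introduced above looks like this (pathname is a placeholder; error paths condensed):

    struct nameidata nd;
    struct dentry *d;
    int err;

    err = kern_path_parent(pathname, &nd);   /* parent dir + last component */
    if (err)
        return err;

    mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
    d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
    /* ... inspect d->d_inode (may be negative), then: */
    mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
    dput(d);
    path_put(&nd.path);
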
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b24d7027b83..95362d15128 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4230,20 +4230,8 @@ void cgroup_post_fork(struct task_struct *child)
4230 */ 4230 */
4231void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4231void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4232{ 4232{
4233 int i;
4234 struct css_set *cg; 4233 struct css_set *cg;
4235 4234 int i;
4236 if (run_callbacks && need_forkexit_callback) {
4237 /*
4238 * modular subsystems can't use callbacks, so no need to lock
4239 * the subsys array
4240 */
4241 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4242 struct cgroup_subsys *ss = subsys[i];
4243 if (ss->exit)
4244 ss->exit(ss, tsk);
4245 }
4246 }
4247 4235
4248 /* 4236 /*
4249 * Unlink from the css_set task list if necessary. 4237 * Unlink from the css_set task list if necessary.
@@ -4261,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4261 task_lock(tsk); 4249 task_lock(tsk);
4262 cg = tsk->cgroups; 4250 cg = tsk->cgroups;
4263 tsk->cgroups = &init_css_set; 4251 tsk->cgroups = &init_css_set;
4252
4253 if (run_callbacks && need_forkexit_callback) {
4254 /*
4255 * modular subsystems can't use callbacks, so no need to lock
4256 * the subsys array
4257 */
4258 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4259 struct cgroup_subsys *ss = subsys[i];
4260 if (ss->exit) {
4261 struct cgroup *old_cgrp =
4262 rcu_dereference_raw(cg->subsys[i])->cgroup;
4263 struct cgroup *cgrp = task_cgroup(tsk, i);
4264 ss->exit(ss, cgrp, old_cgrp, tsk);
4265 }
4266 }
4267 }
4264 task_unlock(tsk); 4268 task_unlock(tsk);
4269
4265 if (cg) 4270 if (cg)
4266 put_css_set_taskexit(cg); 4271 put_css_set_taskexit(cg);
4267} 4272}
@@ -4813,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
4813 return ret; 4818 return ret;
4814} 4819}
4815 4820
4821/*
4822 * get corresponding css from file open on cgroupfs directory
4823 */
4824struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
4825{
4826 struct cgroup *cgrp;
4827 struct inode *inode;
4828 struct cgroup_subsys_state *css;
4829
4830 inode = f->f_dentry->d_inode;
4831 /* check in cgroup filesystem dir */
4832 if (inode->i_op != &cgroup_dir_inode_operations)
4833 return ERR_PTR(-EBADF);
4834
4835 if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
4836 return ERR_PTR(-EINVAL);
4837
4838 /* get cgroup */
4839 cgrp = __d_cgrp(f->f_dentry);
4840 css = cgrp->subsys[id];
4841 return css ? css : ERR_PTR(-ENOENT);
4842}
4843
4816#ifdef CONFIG_CGROUP_DEBUG 4844#ifdef CONFIG_CGROUP_DEBUG
4817static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 4845static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4818 struct cgroup *cont) 4846 struct cgroup *cont)
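
cgroup_css_from_dir() lets a subsystem translate a user-supplied cgroupfs directory fd into its own per-cgroup state; the perf-cgroup support added elsewhere in this series is the intended consumer. A sketch of such a caller (lifetime/pinning rules elided; perf_subsys_id is the perf_event subsystem's generated id):

    static struct cgroup_subsys_state *get_perf_css(int fd)
    {
        struct cgroup_subsys_state *css;
        struct file *file = fget(fd);

        if (!file)
            return ERR_PTR(-EBADF);

        css = cgroup_css_from_dir(file, perf_subsys_id);
        /* Sketch: a real caller pins the css before dropping the file. */
        fput(file);
        return css;
    }
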
diff --git a/kernel/compat.c b/kernel/compat.c
index c9e2ec0b34a..38b1d2c1cbe 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o,
52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; 52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
53} 53}
54 54
55static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
56{
57 memset(txc, 0, sizeof(struct timex));
58
59 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
60 __get_user(txc->modes, &utp->modes) ||
61 __get_user(txc->offset, &utp->offset) ||
62 __get_user(txc->freq, &utp->freq) ||
63 __get_user(txc->maxerror, &utp->maxerror) ||
64 __get_user(txc->esterror, &utp->esterror) ||
65 __get_user(txc->status, &utp->status) ||
66 __get_user(txc->constant, &utp->constant) ||
67 __get_user(txc->precision, &utp->precision) ||
68 __get_user(txc->tolerance, &utp->tolerance) ||
69 __get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
70 __get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
71 __get_user(txc->tick, &utp->tick) ||
72 __get_user(txc->ppsfreq, &utp->ppsfreq) ||
73 __get_user(txc->jitter, &utp->jitter) ||
74 __get_user(txc->shift, &utp->shift) ||
75 __get_user(txc->stabil, &utp->stabil) ||
76 __get_user(txc->jitcnt, &utp->jitcnt) ||
77 __get_user(txc->calcnt, &utp->calcnt) ||
78 __get_user(txc->errcnt, &utp->errcnt) ||
79 __get_user(txc->stbcnt, &utp->stbcnt))
80 return -EFAULT;
81
82 return 0;
83}
84
85static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
86{
87 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
88 __put_user(txc->modes, &utp->modes) ||
89 __put_user(txc->offset, &utp->offset) ||
90 __put_user(txc->freq, &utp->freq) ||
91 __put_user(txc->maxerror, &utp->maxerror) ||
92 __put_user(txc->esterror, &utp->esterror) ||
93 __put_user(txc->status, &utp->status) ||
94 __put_user(txc->constant, &utp->constant) ||
95 __put_user(txc->precision, &utp->precision) ||
96 __put_user(txc->tolerance, &utp->tolerance) ||
97 __put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
98 __put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
99 __put_user(txc->tick, &utp->tick) ||
100 __put_user(txc->ppsfreq, &utp->ppsfreq) ||
101 __put_user(txc->jitter, &utp->jitter) ||
102 __put_user(txc->shift, &utp->shift) ||
103 __put_user(txc->stabil, &utp->stabil) ||
104 __put_user(txc->jitcnt, &utp->jitcnt) ||
105 __put_user(txc->calcnt, &utp->calcnt) ||
106 __put_user(txc->errcnt, &utp->errcnt) ||
107 __put_user(txc->stbcnt, &utp->stbcnt) ||
108 __put_user(txc->tai, &utp->tai))
109 return -EFAULT;
110 return 0;
111}
112
55asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, 113asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
56 struct timezone __user *tz) 114 struct timezone __user *tz)
57{ 115{
@@ -617,6 +675,29 @@ long compat_sys_clock_gettime(clockid_t which_clock,
617 return err; 675 return err;
618} 676}
619 677
678long compat_sys_clock_adjtime(clockid_t which_clock,
679 struct compat_timex __user *utp)
680{
681 struct timex txc;
682 mm_segment_t oldfs;
683 int err, ret;
684
685 err = compat_get_timex(&txc, utp);
686 if (err)
687 return err;
688
689 oldfs = get_fs();
690 set_fs(KERNEL_DS);
691 ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
692 set_fs(oldfs);
693
694 err = compat_put_timex(utp, &txc);
695 if (err)
696 return err;
697
698 return ret;
699}
700
620long compat_sys_clock_getres(clockid_t which_clock, 701long compat_sys_clock_getres(clockid_t which_clock,
621 struct compat_timespec __user *tp) 702 struct compat_timespec __user *tp)
622{ 703{
@@ -951,58 +1032,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
951asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) 1032asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
952{ 1033{
953 struct timex txc; 1034 struct timex txc;
954 int ret; 1035 int err, ret;
955
956 memset(&txc, 0, sizeof(struct timex));
957 1036
958 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || 1037 err = compat_get_timex(&txc, utp);
959 __get_user(txc.modes, &utp->modes) || 1038 if (err)
960 __get_user(txc.offset, &utp->offset) || 1039 return err;
961 __get_user(txc.freq, &utp->freq) ||
962 __get_user(txc.maxerror, &utp->maxerror) ||
963 __get_user(txc.esterror, &utp->esterror) ||
964 __get_user(txc.status, &utp->status) ||
965 __get_user(txc.constant, &utp->constant) ||
966 __get_user(txc.precision, &utp->precision) ||
967 __get_user(txc.tolerance, &utp->tolerance) ||
968 __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
969 __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
970 __get_user(txc.tick, &utp->tick) ||
971 __get_user(txc.ppsfreq, &utp->ppsfreq) ||
972 __get_user(txc.jitter, &utp->jitter) ||
973 __get_user(txc.shift, &utp->shift) ||
974 __get_user(txc.stabil, &utp->stabil) ||
975 __get_user(txc.jitcnt, &utp->jitcnt) ||
976 __get_user(txc.calcnt, &utp->calcnt) ||
977 __get_user(txc.errcnt, &utp->errcnt) ||
978 __get_user(txc.stbcnt, &utp->stbcnt))
979 return -EFAULT;
980 1040
981 ret = do_adjtimex(&txc); 1041 ret = do_adjtimex(&txc);
982 1042
983 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || 1043 err = compat_put_timex(utp, &txc);
984 __put_user(txc.modes, &utp->modes) || 1044 if (err)
985 __put_user(txc.offset, &utp->offset) || 1045 return err;
986 __put_user(txc.freq, &utp->freq) ||
987 __put_user(txc.maxerror, &utp->maxerror) ||
988 __put_user(txc.esterror, &utp->esterror) ||
989 __put_user(txc.status, &utp->status) ||
990 __put_user(txc.constant, &utp->constant) ||
991 __put_user(txc.precision, &utp->precision) ||
992 __put_user(txc.tolerance, &utp->tolerance) ||
993 __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
994 __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
995 __put_user(txc.tick, &utp->tick) ||
996 __put_user(txc.ppsfreq, &utp->ppsfreq) ||
997 __put_user(txc.jitter, &utp->jitter) ||
998 __put_user(txc.shift, &utp->shift) ||
999 __put_user(txc.stabil, &utp->stabil) ||
1000 __put_user(txc.jitcnt, &utp->jitcnt) ||
1001 __put_user(txc.calcnt, &utp->calcnt) ||
1002 __put_user(txc.errcnt, &utp->errcnt) ||
1003 __put_user(txc.stbcnt, &utp->stbcnt) ||
1004 __put_user(txc.tai, &utp->tai))
1005 ret = -EFAULT;
1006 1046
1007 return ret; 1047 return ret;
1008} 1048}
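
compat_sys_clock_adjtime() above shows the stock compat pattern that the new compat_get_timex()/compat_put_timex() helpers also let compat_sys_adjtimex() share: convert the 32-bit struct in, widen the address-space limit so the native syscall accepts a kernel pointer, then convert back out. The skeleton, with sys_foo() and struct foo standing in for the native call and type:

    struct foo native;
    mm_segment_t oldfs = get_fs();
    long ret;

    set_fs(KERNEL_DS);      /* sys_foo() may now take a kernel pointer */
    ret = sys_foo((struct foo __user *)&native);
    set_fs(oldfs);
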
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a9d6dd53a6..2343c132c5a 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar;
35static struct thread_group_cred init_tgcred = { 35static struct thread_group_cred init_tgcred = {
36 .usage = ATOMIC_INIT(2), 36 .usage = ATOMIC_INIT(2),
37 .tgid = 0, 37 .tgid = 0,
38 .lock = SPIN_LOCK_UNLOCKED, 38 .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
39}; 39};
40#endif 40#endif
41 41
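
SPIN_LOCK_UNLOCKED gives every static lock the same lockdep class, which is why it is deprecated; __SPIN_LOCK_UNLOCKED(name) derives a unique class key from the name, as the replacement above does. For comparison (struct foo/bar are illustrative):

    /* Preferred for a standalone static lock: */
    static DEFINE_SPINLOCK(my_lock);

    /* For a lock embedded in a static initializer, as in init_tgcred: */
    static struct foo bar = {
        .lock = __SPIN_LOCK_UNLOCKED(bar.lock),
    };
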
diff --git a/kernel/futex.c b/kernel/futex.c
index b766d28accd..bda41571538 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
381 return NULL; 381 return NULL;
382} 382}
383 383
384static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 384static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
385 u32 uval, u32 newval)
385{ 386{
386 u32 curval; 387 int ret;
387 388
388 pagefault_disable(); 389 pagefault_disable();
389 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 390 ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
390 pagefault_enable(); 391 pagefault_enable();
391 392
392 return curval; 393 return ret;
393} 394}
394 395
395static int get_futex_value_locked(u32 *dest, u32 __user *from) 396static int get_futex_value_locked(u32 *dest, u32 __user *from)
@@ -674,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
674 struct task_struct *task, int set_waiters) 675 struct task_struct *task, int set_waiters)
675{ 676{
676 int lock_taken, ret, ownerdied = 0; 677 int lock_taken, ret, ownerdied = 0;
677 u32 uval, newval, curval; 678 u32 uval, newval, curval, vpid = task_pid_vnr(task);
678 679
679retry: 680retry:
680 ret = lock_taken = 0; 681 ret = lock_taken = 0;
@@ -684,19 +685,17 @@ retry:
684 * (by doing a 0 -> TID atomic cmpxchg), while holding all 685 * (by doing a 0 -> TID atomic cmpxchg), while holding all
685 * the locks. It will most likely not succeed. 686 * the locks. It will most likely not succeed.
686 */ 687 */
687 newval = task_pid_vnr(task); 688 newval = vpid;
688 if (set_waiters) 689 if (set_waiters)
689 newval |= FUTEX_WAITERS; 690 newval |= FUTEX_WAITERS;
690 691
691 curval = cmpxchg_futex_value_locked(uaddr, 0, newval); 692 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
692
693 if (unlikely(curval == -EFAULT))
694 return -EFAULT; 693 return -EFAULT;
695 694
696 /* 695 /*
697 * Detect deadlocks. 696 * Detect deadlocks.
698 */ 697 */
699 if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) 698 if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
700 return -EDEADLK; 699 return -EDEADLK;
701 700
702 /* 701 /*
@@ -723,14 +722,12 @@ retry:
723 */ 722 */
724 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 723 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
725 /* Keep the OWNER_DIED bit */ 724 /* Keep the OWNER_DIED bit */
726 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); 725 newval = (curval & ~FUTEX_TID_MASK) | vpid;
727 ownerdied = 0; 726 ownerdied = 0;
728 lock_taken = 1; 727 lock_taken = 1;
729 } 728 }
730 729
731 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 730 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
732
733 if (unlikely(curval == -EFAULT))
734 return -EFAULT; 731 return -EFAULT;
735 if (unlikely(curval != uval)) 732 if (unlikely(curval != uval))
736 goto retry; 733 goto retry;
@@ -775,6 +772,24 @@ retry:
775 return ret; 772 return ret;
776} 773}
777 774
775/**
776 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
777 * @q: The futex_q to unqueue
778 *
779 * The q->lock_ptr must not be NULL and must be held by the caller.
780 */
781static void __unqueue_futex(struct futex_q *q)
782{
783 struct futex_hash_bucket *hb;
784
785 if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr)
786 || plist_node_empty(&q->list)))
787 return;
788
789 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
790 plist_del(&q->list, &hb->chain);
791}
792
778/* 793/*
779 * The hash bucket lock must be held when this is called. 794 * The hash bucket lock must be held when this is called.
780 * Afterwards, the futex_q must not be accessed. 795 * Afterwards, the futex_q must not be accessed.
@@ -792,7 +807,7 @@ static void wake_futex(struct futex_q *q)
792 */ 807 */
793 get_task_struct(p); 808 get_task_struct(p);
794 809
795 plist_del(&q->list, &q->list.plist); 810 __unqueue_futex(q);
796 /* 811 /*
797 * The waiting task can free the futex_q as soon as 812 * The waiting task can free the futex_q as soon as
798 * q->lock_ptr = NULL is written, without taking any locks. A 813 * q->lock_ptr = NULL is written, without taking any locks. A
@@ -843,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
843 858
844 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 859 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
845 860
846 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 861 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
847
848 if (curval == -EFAULT)
849 ret = -EFAULT; 862 ret = -EFAULT;
850 else if (curval != uval) 863 else if (curval != uval)
851 ret = -EINVAL; 864 ret = -EINVAL;
@@ -880,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
880 * There is no waiter, so we unlock the futex. The owner died 893 * There is no waiter, so we unlock the futex. The owner died
881 * bit has not to be preserved here. We are the owner: 894 * bit has not to be preserved here. We are the owner:
882 */ 895 */
883 oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); 896 if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
884 897 return -EFAULT;
885 if (oldval == -EFAULT)
886 return oldval;
887 if (oldval != uval) 898 if (oldval != uval)
888 return -EAGAIN; 899 return -EAGAIN;
889 900
@@ -1071,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1071 plist_del(&q->list, &hb1->chain); 1082 plist_del(&q->list, &hb1->chain);
1072 plist_add(&q->list, &hb2->chain); 1083 plist_add(&q->list, &hb2->chain);
1073 q->lock_ptr = &hb2->lock; 1084 q->lock_ptr = &hb2->lock;
1074#ifdef CONFIG_DEBUG_PI_LIST
1075 q->list.plist.spinlock = &hb2->lock;
1076#endif
1077 } 1085 }
1078 get_futex_key_refs(key2); 1086 get_futex_key_refs(key2);
1079 q->key = *key2; 1087 q->key = *key2;
@@ -1100,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1100 get_futex_key_refs(key); 1108 get_futex_key_refs(key);
1101 q->key = *key; 1109 q->key = *key;
1102 1110
1103 WARN_ON(plist_node_empty(&q->list)); 1111 __unqueue_futex(q);
1104 plist_del(&q->list, &q->list.plist);
1105 1112
1106 WARN_ON(!q->rt_waiter); 1113 WARN_ON(!q->rt_waiter);
1107 q->rt_waiter = NULL; 1114 q->rt_waiter = NULL;
1108 1115
1109 q->lock_ptr = &hb->lock; 1116 q->lock_ptr = &hb->lock;
1110#ifdef CONFIG_DEBUG_PI_LIST
1111 q->list.plist.spinlock = &hb->lock;
1112#endif
1113 1117
1114 wake_up_state(q->task, TASK_NORMAL); 1118 wake_up_state(q->task, TASK_NORMAL);
1115} 1119}
@@ -1457,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1457 prio = min(current->normal_prio, MAX_RT_PRIO); 1461 prio = min(current->normal_prio, MAX_RT_PRIO);
1458 1462
1459 plist_node_init(&q->list, prio); 1463 plist_node_init(&q->list, prio);
1460#ifdef CONFIG_DEBUG_PI_LIST
1461 q->list.plist.spinlock = &hb->lock;
1462#endif
1463 plist_add(&q->list, &hb->chain); 1464 plist_add(&q->list, &hb->chain);
1464 q->task = current; 1465 q->task = current;
1465 spin_unlock(&hb->lock); 1466 spin_unlock(&hb->lock);
@@ -1504,8 +1505,7 @@ retry:
1504 spin_unlock(lock_ptr); 1505 spin_unlock(lock_ptr);
1505 goto retry; 1506 goto retry;
1506 } 1507 }
1507 WARN_ON(plist_node_empty(&q->list)); 1508 __unqueue_futex(q);
1508 plist_del(&q->list, &q->list.plist);
1509 1509
1510 BUG_ON(q->pi_state); 1510 BUG_ON(q->pi_state);
1511 1511
@@ -1525,8 +1525,7 @@ retry:
1525static void unqueue_me_pi(struct futex_q *q) 1525static void unqueue_me_pi(struct futex_q *q)
1526 __releases(q->lock_ptr) 1526 __releases(q->lock_ptr)
1527{ 1527{
1528 WARN_ON(plist_node_empty(&q->list)); 1528 __unqueue_futex(q);
1529 plist_del(&q->list, &q->list.plist);
1530 1529
1531 BUG_ON(!q->pi_state); 1530 BUG_ON(!q->pi_state);
1532 free_pi_state(q->pi_state); 1531 free_pi_state(q->pi_state);
@@ -1556,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1556 1555
1557 /* 1556 /*
1558 * We are here either because we stole the rtmutex from the 1557 * We are here either because we stole the rtmutex from the
1559 * pending owner or we are the pending owner which failed to 1558 * previous highest priority waiter or we are the highest priority
1560 * get the rtmutex. We have to replace the pending owner TID 1559 * waiter but failed to get the rtmutex the first time.
1561 * in the user space variable. This must be atomic as we have 1560 * We have to replace the newowner TID in the user space variable.
1562 * to preserve the owner died bit here. 1561 * This must be atomic as we have to preserve the owner died bit here.
1563 * 1562 *
1564 * Note: We write the user space value _before_ changing the pi_state 1563 * Note: We write the user space value _before_ changing the pi_state
1565 * because we can fault here. Imagine swapped out pages or a fork 1564 * because we can fault here. Imagine swapped out pages or a fork
@@ -1578,9 +1577,7 @@ retry:
1578 while (1) { 1577 while (1) {
1579 newval = (uval & FUTEX_OWNER_DIED) | newtid; 1578 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1580 1579
1581 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 1580 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1582
1583 if (curval == -EFAULT)
1584 goto handle_fault; 1581 goto handle_fault;
1585 if (curval == uval) 1582 if (curval == uval)
1586 break; 1583 break;
@@ -1608,8 +1605,8 @@ retry:
1608 1605
1609 /* 1606 /*
1610 * To handle the page fault we need to drop the hash bucket 1607 * To handle the page fault we need to drop the hash bucket
1611 * lock here. That gives the other task (either the pending 1608 * lock here. That gives the other task (either the highest priority
1612 * owner itself or the task which stole the rtmutex) the 1609 * waiter itself or the task which stole the rtmutex) the
1613 * chance to try the fixup of the pi_state. So once we are 1610 * chance to try the fixup of the pi_state. So once we are
1614 * back from handling the fault we need to check the pi_state 1611 * back from handling the fault we need to check the pi_state
1615 * after reacquiring the hash bucket lock and before trying to 1612 * after reacquiring the hash bucket lock and before trying to
@@ -1685,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1685 /* 1682 /*
1686 * pi_state is incorrect, some other task did a lock steal and 1683 * pi_state is incorrect, some other task did a lock steal and
1687 * we returned due to timeout or signal without taking the 1684 * we returned due to timeout or signal without taking the
1688 * rt_mutex. Too late. We can access the rt_mutex_owner without 1685 * rt_mutex. Too late.
1689 * locking, as the other task is now blocked on the hash bucket
1690 * lock. Fix the state up.
1691 */ 1686 */
1687 raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1692 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 1688 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1689 if (!owner)
1690 owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1691 raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1693 ret = fixup_pi_state_owner(uaddr, q, owner); 1692 ret = fixup_pi_state_owner(uaddr, q, owner);
1694 goto out; 1693 goto out;
1695 } 1694 }
1696 1695
1697 /* 1696 /*
1698 * Paranoia check. If we did not take the lock, then we should not be 1697 * Paranoia check. If we did not take the lock, then we should not be
1699 * the owner, nor the pending owner, of the rt_mutex. 1698 * the owner of the rt_mutex.
1700 */ 1699 */
1701 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) 1700 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1702 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " 1701 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
@@ -1781,13 +1780,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1781 * 1780 *
1782 * The basic logical guarantee of a futex is that it blocks ONLY 1781 * The basic logical guarantee of a futex is that it blocks ONLY
1783 * if cond(var) is known to be true at the time of blocking, for 1782 * if cond(var) is known to be true at the time of blocking, for
1784 * any cond. If we queued after testing *uaddr, that would open 1783 * any cond. If we locked the hash-bucket after testing *uaddr, that
1785 * a race condition where we could block indefinitely with 1784 * would open a race condition where we could block indefinitely with
1786 * cond(var) false, which would violate the guarantee. 1785 * cond(var) false, which would violate the guarantee.
1787 * 1786 *
1788 * A consequence is that futex_wait() can return zero and absorb 1787 * On the other hand, we insert q and release the hash-bucket only
1789 * a wakeup when *uaddr != val on entry to the syscall. This is 1788 * after testing *uaddr. This guarantees that futex_wait() will NOT
1790 * rare, but normal. 1789 * absorb a wakeup if *uaddr does not match the desired values
1790 * while the syscall executes.
1791 */ 1791 */
1792retry: 1792retry:
1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); 1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
@@ -2046,9 +2046,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2046{ 2046{
2047 struct futex_hash_bucket *hb; 2047 struct futex_hash_bucket *hb;
2048 struct futex_q *this, *next; 2048 struct futex_q *this, *next;
2049 u32 uval;
2050 struct plist_head *head; 2049 struct plist_head *head;
2051 union futex_key key = FUTEX_KEY_INIT; 2050 union futex_key key = FUTEX_KEY_INIT;
2051 u32 uval, vpid = task_pid_vnr(current);
2052 int ret; 2052 int ret;
2053 2053
2054retry: 2054retry:
@@ -2057,7 +2057,7 @@ retry:
2057 /* 2057 /*
2058 * We release only a lock we actually own: 2058 * We release only a lock we actually own:
2059 */ 2059 */
2060 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2060 if ((uval & FUTEX_TID_MASK) != vpid)
2061 return -EPERM; 2061 return -EPERM;
2062 2062
2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); 2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
@@ -2072,17 +2072,14 @@ retry:
2072 * again. If it succeeds then we can return without waking 2072 * again. If it succeeds then we can return without waking
2073 * anyone else up: 2073 * anyone else up:
2074 */ 2074 */
2075 if (!(uval & FUTEX_OWNER_DIED)) 2075 if (!(uval & FUTEX_OWNER_DIED) &&
2076 uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); 2076 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2077
2078
2079 if (unlikely(uval == -EFAULT))
2080 goto pi_faulted; 2077 goto pi_faulted;
2081 /* 2078 /*
2082 * Rare case: we managed to release the lock atomically, 2079 * Rare case: we managed to release the lock atomically,
2083 * no need to wake anyone else up: 2080 * no need to wake anyone else up:
2084 */ 2081 */
2085 if (unlikely(uval == task_pid_vnr(current))) 2082 if (unlikely(uval == vpid))
2086 goto out_unlock; 2083 goto out_unlock;
2087 2084
2088 /* 2085 /*
@@ -2167,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2167 * We were woken prior to requeue by a timeout or a signal. 2164 * We were woken prior to requeue by a timeout or a signal.
2168 * Unqueue the futex_q and determine which it was. 2165 * Unqueue the futex_q and determine which it was.
2169 */ 2166 */
2170 plist_del(&q->list, &q->list.plist); 2167 plist_del(&q->list, &hb->chain);
2171 2168
2172 /* Handle spurious wakeups gracefully */ 2169 /* Handle spurious wakeups gracefully */
2173 ret = -EWOULDBLOCK; 2170 ret = -EWOULDBLOCK;
@@ -2463,11 +2460,20 @@ retry:
2463 * userspace. 2460 * userspace.
2464 */ 2461 */
2465 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 2462 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2466 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); 2463 /*
2467 2464 * We are not holding a lock here, but we want to have
2468 if (nval == -EFAULT) 2465 * the pagefault_disable/enable() protection because
2469 return -1; 2466 * we want to handle the fault gracefully. If the
2470 2467 * access fails we try to fault in the futex with R/W
2468 * verification via get_user_pages. get_user() above
2469 * does not guarantee R/W access. If that fails we
2470 * give up and leave the futex locked.
2471 */
2472 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2473 if (fault_in_user_writeable(uaddr))
2474 return -1;
2475 goto retry;
2476 }
2471 if (nval != uval) 2477 if (nval != uval)
2472 goto retry; 2478 goto retry;
2473 2479
@@ -2678,8 +2684,7 @@ static int __init futex_init(void)
2678 * implementation, the non-functional ones will return 2684 * implementation, the non-functional ones will return
2679 * -ENOSYS. 2685 * -ENOSYS.
2680 */ 2686 */
2681 curval = cmpxchg_futex_value_locked(NULL, 0, 0); 2687 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2682 if (curval == -EFAULT)
2683 futex_cmpxchg_enabled = 1; 2688 futex_cmpxchg_enabled = 1;
2684 2689
2685 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2690 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
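
The recurring change in this file: cmpxchg_futex_value_locked() (and the per-arch futex_atomic_cmpxchg_inatomic() beneath it, reworked earlier in this series) now returns 0 or -EFAULT and delivers the old futex word through a pointer, instead of overloading -EFAULT onto the value itself. Every call site collapses to the same shape:

    u32 curval;

    if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
        return -EFAULT;   /* the user-space access faulted */
    if (curval != uval)
        goto retry;       /* lost the race: reread and try again */
    /* success: *uaddr atomically went from uval to newval */
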
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0c8d7c04861..9017478c5d4 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -53,11 +53,10 @@
53/* 53/*
54 * The timer bases: 54 * The timer bases:
55 * 55 *
56 * Note: If we want to add new timer bases, we have to skip the two 56 * There are more clockids then hrtimer bases. Thus, we index
57 * clock ids captured by the cpu-timers. We do this by holding empty 57 * into the timer bases by the hrtimer_base_type enum. When trying
58 * entries rather than doing math adjustment of the clock ids. 58 * to reach a base using a clockid, hrtimer_clockid_to_base()
59 * This ensures that we capture erroneous accesses to these clock ids 59 * is used to convert from clockid to the proper hrtimer_base_type.
60 * rather than moving them into the range of valid clock id's.
61 */ 60 */
62DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 61DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
63{ 62{
@@ -74,30 +73,39 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
74 .get_time = &ktime_get, 73 .get_time = &ktime_get,
75 .resolution = KTIME_LOW_RES, 74 .resolution = KTIME_LOW_RES,
76 }, 75 },
76 {
77 .index = CLOCK_BOOTTIME,
78 .get_time = &ktime_get_boottime,
79 .resolution = KTIME_LOW_RES,
80 },
77 } 81 }
78}; 82};
79 83
84static int hrtimer_clock_to_base_table[MAX_CLOCKS];
85
86static inline int hrtimer_clockid_to_base(clockid_t clock_id)
87{
88 return hrtimer_clock_to_base_table[clock_id];
89}
90
91
80/* 92/*
81 * Get the coarse grained time at the softirq based on xtime and 93 * Get the coarse grained time at the softirq based on xtime and
82 * wall_to_monotonic. 94 * wall_to_monotonic.
83 */ 95 */
84static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) 96static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
85{ 97{
86 ktime_t xtim, tomono; 98 ktime_t xtim, mono, boot;
87 struct timespec xts, tom; 99 struct timespec xts, tom, slp;
88 unsigned long seq;
89 100
90 do { 101 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
91 seq = read_seqbegin(&xtime_lock);
92 xts = __current_kernel_time();
93 tom = __get_wall_to_monotonic();
94 } while (read_seqretry(&xtime_lock, seq));
95 102
96 xtim = timespec_to_ktime(xts); 103 xtim = timespec_to_ktime(xts);
97 tomono = timespec_to_ktime(tom); 104 mono = ktime_add(xtim, timespec_to_ktime(tom));
98 base->clock_base[CLOCK_REALTIME].softirq_time = xtim; 105 boot = ktime_add(mono, timespec_to_ktime(slp));
99 base->clock_base[CLOCK_MONOTONIC].softirq_time = 106 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
100 ktime_add(xtim, tomono); 107 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
108 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
101} 109}
102 110
103/* 111/*
@@ -184,10 +192,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
184 struct hrtimer_cpu_base *new_cpu_base; 192 struct hrtimer_cpu_base *new_cpu_base;
185 int this_cpu = smp_processor_id(); 193 int this_cpu = smp_processor_id();
186 int cpu = hrtimer_get_target(this_cpu, pinned); 194 int cpu = hrtimer_get_target(this_cpu, pinned);
195 int basenum = hrtimer_clockid_to_base(base->index);
187 196
188again: 197again:
189 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 198 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
190 new_base = &new_cpu_base->clock_base[base->index]; 199 new_base = &new_cpu_base->clock_base[basenum];
191 200
192 if (base != new_base) { 201 if (base != new_base) {
193 /* 202 /*
@@ -334,6 +343,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe);
334 343
335static struct debug_obj_descr hrtimer_debug_descr; 344static struct debug_obj_descr hrtimer_debug_descr;
336 345
346static void *hrtimer_debug_hint(void *addr)
347{
348 return ((struct hrtimer *) addr)->function;
349}
350
337/* 351/*
338 * fixup_init is called when: 352 * fixup_init is called when:
339 * - an active object is initialized 353 * - an active object is initialized
@@ -393,6 +407,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
393 407
394static struct debug_obj_descr hrtimer_debug_descr = { 408static struct debug_obj_descr hrtimer_debug_descr = {
395 .name = "hrtimer", 409 .name = "hrtimer",
410 .debug_hint = hrtimer_debug_hint,
396 .fixup_init = hrtimer_fixup_init, 411 .fixup_init = hrtimer_fixup_init,
397 .fixup_activate = hrtimer_fixup_activate, 412 .fixup_activate = hrtimer_fixup_activate,
398 .fixup_free = hrtimer_fixup_free, 413 .fixup_free = hrtimer_fixup_free,
@@ -611,24 +626,23 @@ static int hrtimer_reprogram(struct hrtimer *timer,
611static void retrigger_next_event(void *arg) 626static void retrigger_next_event(void *arg)
612{ 627{
613 struct hrtimer_cpu_base *base; 628 struct hrtimer_cpu_base *base;
614 struct timespec realtime_offset, wtm; 629 struct timespec realtime_offset, wtm, sleep;
615 unsigned long seq;
616 630
617 if (!hrtimer_hres_active()) 631 if (!hrtimer_hres_active())
618 return; 632 return;
619 633
620 do { 634 get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm,
621 seq = read_seqbegin(&xtime_lock); 635 &sleep);
622 wtm = __get_wall_to_monotonic();
623 } while (read_seqretry(&xtime_lock, seq));
624 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); 636 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
625 637
626 base = &__get_cpu_var(hrtimer_bases); 638 base = &__get_cpu_var(hrtimer_bases);
627 639
628 /* Adjust CLOCK_REALTIME offset */ 640 /* Adjust CLOCK_REALTIME offset */
629 raw_spin_lock(&base->lock); 641 raw_spin_lock(&base->lock);
630 base->clock_base[CLOCK_REALTIME].offset = 642 base->clock_base[HRTIMER_BASE_REALTIME].offset =
631 timespec_to_ktime(realtime_offset); 643 timespec_to_ktime(realtime_offset);
644 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
645 timespec_to_ktime(sleep);
632 646
633 hrtimer_force_reprogram(base, 0); 647 hrtimer_force_reprogram(base, 0);
634 raw_spin_unlock(&base->lock); 648 raw_spin_unlock(&base->lock);
@@ -673,14 +687,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
673} 687}
674 688
675/* 689/*
676 * Initialize the high resolution related parts of a hrtimer
677 */
678static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
679{
680}
681
682
683/*
684 * When High resolution timers are active, try to reprogram. Note, that in case 690 * When High resolution timers are active, try to reprogram. Note, that in case
685 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry 691 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
686 * check happens. The timer gets enqueued into the rbtree. The reprogramming 692 * check happens. The timer gets enqueued into the rbtree. The reprogramming
@@ -725,8 +731,9 @@ static int hrtimer_switch_to_hres(void)
725 return 0; 731 return 0;
726 } 732 }
727 base->hres_active = 1; 733 base->hres_active = 1;
728 base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; 734 base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES;
729 base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; 735 base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES;
736 base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES;
730 737
731 tick_setup_sched_timer(); 738 tick_setup_sched_timer();
732 739
@@ -750,7 +757,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
750 return 0; 757 return 0;
751} 758}
752static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 759static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
753static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
754 760
755#endif /* CONFIG_HIGH_RES_TIMERS */ 761#endif /* CONFIG_HIGH_RES_TIMERS */
756 762
@@ -1121,6 +1127,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1121 enum hrtimer_mode mode) 1127 enum hrtimer_mode mode)
1122{ 1128{
1123 struct hrtimer_cpu_base *cpu_base; 1129 struct hrtimer_cpu_base *cpu_base;
1130 int base;
1124 1131
1125 memset(timer, 0, sizeof(struct hrtimer)); 1132 memset(timer, 0, sizeof(struct hrtimer));
1126 1133
@@ -1129,8 +1136,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1129 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) 1136 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
1130 clock_id = CLOCK_MONOTONIC; 1137 clock_id = CLOCK_MONOTONIC;
1131 1138
1132 timer->base = &cpu_base->clock_base[clock_id]; 1139 base = hrtimer_clockid_to_base(clock_id);
1133 hrtimer_init_timer_hres(timer); 1140 timer->base = &cpu_base->clock_base[base];
1134 timerqueue_init(&timer->node); 1141 timerqueue_init(&timer->node);
1135 1142
1136#ifdef CONFIG_TIMER_STATS 1143#ifdef CONFIG_TIMER_STATS
@@ -1165,9 +1172,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
1165int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) 1172int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1166{ 1173{
1167 struct hrtimer_cpu_base *cpu_base; 1174 struct hrtimer_cpu_base *cpu_base;
1175 int base = hrtimer_clockid_to_base(which_clock);
1168 1176
1169 cpu_base = &__raw_get_cpu_var(hrtimer_bases); 1177 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
1170 *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); 1178 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
1171 1179
1172 return 0; 1180 return 0;
1173} 1181}
@@ -1714,6 +1722,10 @@ static struct notifier_block __cpuinitdata hrtimers_nb = {
1714 1722
1715void __init hrtimers_init(void) 1723void __init hrtimers_init(void)
1716{ 1724{
1725 hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME;
1726 hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC;
1727 hrtimer_clock_to_base_table[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME;
1728
1717 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1729 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
1718 (void *)(long)smp_processor_id()); 1730 (void *)(long)smp_processor_id());
1719 register_cpu_notifier(&hrtimers_nb); 1731 register_cpu_notifier(&hrtimers_nb);
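
With CLOCK_BOOTTIME added, clock ids no longer index clock_base[] directly; hrtimer_clock_to_base_table[], filled in at hrtimers_init(), does the translation. Timer users are unaffected and can now also arm timers on the boot-time base, e.g. (my_callback is an illustrative handler, not from the patch):

    static enum hrtimer_restart my_callback(struct hrtimer *t)
    {
        return HRTIMER_NORESTART;
    }

    static struct hrtimer timer;

    /* __hrtimer_init() maps CLOCK_BOOTTIME to HRTIMER_BASE_BOOTTIME
     * via hrtimer_clockid_to_base(). */
    hrtimer_init(&timer, CLOCK_BOOTTIME, HRTIMER_MODE_REL);
    timer.function = my_callback;
    hrtimer_start(&timer, ktime_set(0, 500 * NSEC_PER_MSEC),
                  HRTIMER_MODE_REL);
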
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 8e42fec7686..09bef82d74c 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,5 +1,6 @@
1# Select this to activate the generic irq options below
1config HAVE_GENERIC_HARDIRQS 2config HAVE_GENERIC_HARDIRQS
2 def_bool n 3 bool
3 4
4if HAVE_GENERIC_HARDIRQS 5if HAVE_GENERIC_HARDIRQS
5menu "IRQ subsystem" 6menu "IRQ subsystem"
@@ -11,26 +12,44 @@ config GENERIC_HARDIRQS
11 12
12# Select this to disable the deprecated stuff 13# Select this to disable the deprecated stuff
13config GENERIC_HARDIRQS_NO_DEPRECATED 14config GENERIC_HARDIRQS_NO_DEPRECATED
14 def_bool n 15 bool
16
17config GENERIC_HARDIRQS_NO_COMPAT
18 bool
15 19
16# Options selectable by the architecture code 20# Options selectable by the architecture code
21
22# Make sparse irq Kconfig switch below available
17config HAVE_SPARSE_IRQ 23config HAVE_SPARSE_IRQ
18 def_bool n 24 bool
19 25
26# Enable the generic irq autoprobe mechanism
20config GENERIC_IRQ_PROBE 27config GENERIC_IRQ_PROBE
21 def_bool n 28 bool
29
30# Use the generic /proc/interrupts implementation
31config GENERIC_IRQ_SHOW
32 bool
22 33
34# Support for delayed migration from interrupt context
23config GENERIC_PENDING_IRQ 35config GENERIC_PENDING_IRQ
24 def_bool n 36 bool
25 37
38# Alpha specific irq affinity mechanism
26config AUTO_IRQ_AFFINITY 39config AUTO_IRQ_AFFINITY
27 def_bool n 40 bool
28
29config IRQ_PER_CPU
30 def_bool n
31 41
42# Tasklet based software resend for pending interrupts on enable_irq()
32config HARDIRQS_SW_RESEND 43config HARDIRQS_SW_RESEND
33 def_bool n 44 bool
45
46# Preflow handler support for fasteoi (sparc64)
47config IRQ_PREFLOW_FASTEOI
48 bool
49
50# Support forced irq threading
51config IRQ_FORCED_THREADING
52 bool
34 53
35config SPARSE_IRQ 54config SPARSE_IRQ
36 bool "Support sparse irq numbering" 55 bool "Support sparse irq numbering"
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 505798f86c3..394784c5706 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -17,7 +17,7 @@
17/* 17/*
18 * Autodetection depends on the fact that any interrupt that 18 * Autodetection depends on the fact that any interrupt that
19 * comes in on to an unassigned handler will get stuck with 19 * comes in on to an unassigned handler will get stuck with
20 * "IRQ_WAITING" cleared and the interrupt disabled. 20 * "IRQS_WAITING" cleared and the interrupt disabled.
21 */ 21 */
22static DEFINE_MUTEX(probing_active); 22static DEFINE_MUTEX(probing_active);
23 23
@@ -32,7 +32,6 @@ unsigned long probe_irq_on(void)
32{ 32{
33 struct irq_desc *desc; 33 struct irq_desc *desc;
34 unsigned long mask = 0; 34 unsigned long mask = 0;
35 unsigned int status;
36 int i; 35 int i;
37 36
38 /* 37 /*
@@ -46,13 +45,7 @@ unsigned long probe_irq_on(void)
46 */ 45 */
47 for_each_irq_desc_reverse(i, desc) { 46 for_each_irq_desc_reverse(i, desc) {
48 raw_spin_lock_irq(&desc->lock); 47 raw_spin_lock_irq(&desc->lock);
49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 48 if (!desc->action && irq_settings_can_probe(desc)) {
50 /*
51 * An old-style architecture might still have
52 * the handle_bad_irq handler there:
53 */
54 compat_irq_chip_set_default_handler(desc);
55
56 /* 49 /*
57 * Some chips need to know about probing in 50 * Some chips need to know about probing in
58 * progress: 51 * progress:
@@ -60,7 +53,7 @@ unsigned long probe_irq_on(void)
60 if (desc->irq_data.chip->irq_set_type) 53 if (desc->irq_data.chip->irq_set_type)
61 desc->irq_data.chip->irq_set_type(&desc->irq_data, 54 desc->irq_data.chip->irq_set_type(&desc->irq_data,
62 IRQ_TYPE_PROBE); 55 IRQ_TYPE_PROBE);
63 desc->irq_data.chip->irq_startup(&desc->irq_data); 56 irq_startup(desc);
64 } 57 }
65 raw_spin_unlock_irq(&desc->lock); 58 raw_spin_unlock_irq(&desc->lock);
66 } 59 }
@@ -75,10 +68,12 @@ unsigned long probe_irq_on(void)
75 */ 68 */
76 for_each_irq_desc_reverse(i, desc) { 69 for_each_irq_desc_reverse(i, desc) {
77 raw_spin_lock_irq(&desc->lock); 70 raw_spin_lock_irq(&desc->lock);
78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 71 if (!desc->action && irq_settings_can_probe(desc)) {
79 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
80 if (desc->irq_data.chip->irq_startup(&desc->irq_data)) 73 if (irq_startup(desc)) {
81 desc->status |= IRQ_PENDING; 74 irq_compat_set_pending(desc);
75 desc->istate |= IRQS_PENDING;
76 }
82 } 77 }
83 raw_spin_unlock_irq(&desc->lock); 78 raw_spin_unlock_irq(&desc->lock);
84 } 79 }
@@ -93,13 +88,12 @@ unsigned long probe_irq_on(void)
93 */ 88 */
94 for_each_irq_desc(i, desc) { 89 for_each_irq_desc(i, desc) {
95 raw_spin_lock_irq(&desc->lock); 90 raw_spin_lock_irq(&desc->lock);
96 status = desc->status;
97 91
98 if (status & IRQ_AUTODETECT) { 92 if (desc->istate & IRQS_AUTODETECT) {
99 /* It triggered already - consider it spurious. */ 93 /* It triggered already - consider it spurious. */
100 if (!(status & IRQ_WAITING)) { 94 if (!(desc->istate & IRQS_WAITING)) {
101 desc->status = status & ~IRQ_AUTODETECT; 95 desc->istate &= ~IRQS_AUTODETECT;
102 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 96 irq_shutdown(desc);
103 } else 97 } else
104 if (i < 32) 98 if (i < 32)
105 mask |= 1 << i; 99 mask |= 1 << i;
@@ -125,20 +119,18 @@ EXPORT_SYMBOL(probe_irq_on);
125 */ 119 */
126unsigned int probe_irq_mask(unsigned long val) 120unsigned int probe_irq_mask(unsigned long val)
127{ 121{
128 unsigned int status, mask = 0; 122 unsigned int mask = 0;
129 struct irq_desc *desc; 123 struct irq_desc *desc;
130 int i; 124 int i;
131 125
132 for_each_irq_desc(i, desc) { 126 for_each_irq_desc(i, desc) {
133 raw_spin_lock_irq(&desc->lock); 127 raw_spin_lock_irq(&desc->lock);
134 status = desc->status; 128 if (desc->istate & IRQS_AUTODETECT) {
135 129 if (i < 16 && !(desc->istate & IRQS_WAITING))
136 if (status & IRQ_AUTODETECT) {
137 if (i < 16 && !(status & IRQ_WAITING))
138 mask |= 1 << i; 130 mask |= 1 << i;
139 131
140 desc->status = status & ~IRQ_AUTODETECT; 132 desc->istate &= ~IRQS_AUTODETECT;
141 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 133 irq_shutdown(desc);
142 } 134 }
143 raw_spin_unlock_irq(&desc->lock); 135 raw_spin_unlock_irq(&desc->lock);
144 } 136 }
@@ -169,20 +161,18 @@ int probe_irq_off(unsigned long val)
169{ 161{
170 int i, irq_found = 0, nr_of_irqs = 0; 162 int i, irq_found = 0, nr_of_irqs = 0;
171 struct irq_desc *desc; 163 struct irq_desc *desc;
172 unsigned int status;
173 164
174 for_each_irq_desc(i, desc) { 165 for_each_irq_desc(i, desc) {
175 raw_spin_lock_irq(&desc->lock); 166 raw_spin_lock_irq(&desc->lock);
176 status = desc->status;
177 167
178 if (status & IRQ_AUTODETECT) { 168 if (desc->istate & IRQS_AUTODETECT) {
179 if (!(status & IRQ_WAITING)) { 169 if (!(desc->istate & IRQS_WAITING)) {
180 if (!nr_of_irqs) 170 if (!nr_of_irqs)
181 irq_found = i; 171 irq_found = i;
182 nr_of_irqs++; 172 nr_of_irqs++;
183 } 173 }
184 desc->status = status & ~IRQ_AUTODETECT; 174 desc->istate &= ~IRQS_AUTODETECT;
185 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 175 irq_shutdown(desc);
186 } 176 }
187 raw_spin_unlock_irq(&desc->lock); 177 raw_spin_unlock_irq(&desc->lock);
188 } 178 }
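
Two conversions repeat through the probe code: status bits migrate from desc->status to the new internal desc->istate (IRQ_AUTODETECT/IRQ_WAITING become IRQS_AUTODETECT/IRQS_WAITING), and raw chip callbacks are wrapped by irq_startup()/irq_shutdown(), which keep descriptor state coherent. Each converted block ends up shaped like this condensed restatement of the hunks above:

    raw_spin_lock_irq(&desc->lock);
    if (desc->istate & IRQS_AUTODETECT) {
        if (!(desc->istate & IRQS_WAITING))
            /* fired during probing: treat as spurious */;
        desc->istate &= ~IRQS_AUTODETECT;
        irq_shutdown(desc);   /* replaces chip->irq_shutdown(&desc->irq_data) */
    }
    raw_spin_unlock_irq(&desc->lock);
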
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index baa5c4acad8..c9c0601f061 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -19,140 +19,110 @@
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21/**
22 * set_irq_chip - set the irq chip for an irq 22 * irq_set_chip - set the irq chip for an irq
23 * @irq: irq number 23 * @irq: irq number
24 * @chip: pointer to irq chip description structure 24 * @chip: pointer to irq chip description structure
25 */ 25 */
26int set_irq_chip(unsigned int irq, struct irq_chip *chip) 26int irq_set_chip(unsigned int irq, struct irq_chip *chip)
27{ 27{
28 struct irq_desc *desc = irq_to_desc(irq);
29 unsigned long flags; 28 unsigned long flags;
29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
30 30
31 if (!desc) { 31 if (!desc)
32 WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
33 return -EINVAL; 32 return -EINVAL;
34 }
35 33
36 if (!chip) 34 if (!chip)
37 chip = &no_irq_chip; 35 chip = &no_irq_chip;
38 36
39 raw_spin_lock_irqsave(&desc->lock, flags);
40 irq_chip_set_defaults(chip); 37 irq_chip_set_defaults(chip);
41 desc->irq_data.chip = chip; 38 desc->irq_data.chip = chip;
42 raw_spin_unlock_irqrestore(&desc->lock, flags); 39 irq_put_desc_unlock(desc, flags);
43
44 return 0; 40 return 0;
45} 41}
46EXPORT_SYMBOL(set_irq_chip); 42EXPORT_SYMBOL(irq_set_chip);
47 43
48/** 44/**
49 * set_irq_type - set the irq trigger type for an irq 45 * irq_set_type - set the irq trigger type for an irq
50 * @irq: irq number 46 * @irq: irq number
51 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h 47 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
52 */ 48 */
53int set_irq_type(unsigned int irq, unsigned int type) 49int irq_set_irq_type(unsigned int irq, unsigned int type)
54{ 50{
55 struct irq_desc *desc = irq_to_desc(irq);
56 unsigned long flags; 51 unsigned long flags;
57 int ret = -ENXIO; 52 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
53 int ret = 0;
58 54
59 if (!desc) { 55 if (!desc)
60 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); 56 return -EINVAL;
61 return -ENODEV;
62 }
63 57
64 type &= IRQ_TYPE_SENSE_MASK; 58 type &= IRQ_TYPE_SENSE_MASK;
65 if (type == IRQ_TYPE_NONE) 59 if (type != IRQ_TYPE_NONE)
66 return 0; 60 ret = __irq_set_trigger(desc, irq, type);
67 61 irq_put_desc_busunlock(desc, flags);
68 raw_spin_lock_irqsave(&desc->lock, flags);
69 ret = __irq_set_trigger(desc, irq, type);
70 raw_spin_unlock_irqrestore(&desc->lock, flags);
71 return ret; 62 return ret;
72} 63}
73EXPORT_SYMBOL(set_irq_type); 64EXPORT_SYMBOL(irq_set_irq_type);
74 65
75/** 66/**
76 * set_irq_data - set irq type data for an irq 67 * irq_set_handler_data - set irq handler data for an irq
77 * @irq: Interrupt number 68 * @irq: Interrupt number
78 * @data: Pointer to interrupt specific data 69 * @data: Pointer to interrupt specific data
79 * 70 *
80 * Set the hardware irq controller data for an irq 71 * Set the hardware irq controller data for an irq
81 */ 72 */
82int set_irq_data(unsigned int irq, void *data) 73int irq_set_handler_data(unsigned int irq, void *data)
83{ 74{
84 struct irq_desc *desc = irq_to_desc(irq);
85 unsigned long flags; 75 unsigned long flags;
76 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
86 77
87 if (!desc) { 78 if (!desc)
88 printk(KERN_ERR
89 "Trying to install controller data for IRQ%d\n", irq);
90 return -EINVAL; 79 return -EINVAL;
91 }
92
93 raw_spin_lock_irqsave(&desc->lock, flags);
94 desc->irq_data.handler_data = data; 80 desc->irq_data.handler_data = data;
95 raw_spin_unlock_irqrestore(&desc->lock, flags); 81 irq_put_desc_unlock(desc, flags);
96 return 0; 82 return 0;
97} 83}
98EXPORT_SYMBOL(set_irq_data); 84EXPORT_SYMBOL(irq_set_handler_data);
99 85
100/** 86/**
101 * set_irq_msi - set MSI descriptor data for an irq 87 * irq_set_msi_desc - set MSI descriptor data for an irq
102 * @irq: Interrupt number 88 * @irq: Interrupt number
103 * @entry: Pointer to MSI descriptor data 89 * @entry: Pointer to MSI descriptor data
104 * 90 *
105 * Set the MSI descriptor entry for an irq 91 * Set the MSI descriptor entry for an irq
106 */ 92 */
107int set_irq_msi(unsigned int irq, struct msi_desc *entry) 93int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
108{ 94{
109 struct irq_desc *desc = irq_to_desc(irq);
110 unsigned long flags; 95 unsigned long flags;
96 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
111 97
112 if (!desc) { 98 if (!desc)
113 printk(KERN_ERR
114 "Trying to install msi data for IRQ%d\n", irq);
115 return -EINVAL; 99 return -EINVAL;
116 }
117
118 raw_spin_lock_irqsave(&desc->lock, flags);
119 desc->irq_data.msi_desc = entry; 100 desc->irq_data.msi_desc = entry;
120 if (entry) 101 if (entry)
121 entry->irq = irq; 102 entry->irq = irq;
122 raw_spin_unlock_irqrestore(&desc->lock, flags); 103 irq_put_desc_unlock(desc, flags);
123 return 0; 104 return 0;
124} 105}
125 106
126/** 107/**
127 * set_irq_chip_data - set irq chip data for an irq 108 * irq_set_chip_data - set irq chip data for an irq
128 * @irq: Interrupt number 109 * @irq: Interrupt number
129 * @data: Pointer to chip specific data 110 * @data: Pointer to chip specific data
130 * 111 *
131 * Set the hardware irq chip data for an irq 112 * Set the hardware irq chip data for an irq
132 */ 113 */
133int set_irq_chip_data(unsigned int irq, void *data) 114int irq_set_chip_data(unsigned int irq, void *data)
134{ 115{
135 struct irq_desc *desc = irq_to_desc(irq);
136 unsigned long flags; 116 unsigned long flags;
117 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
137 118
138 if (!desc) { 119 if (!desc)
139 printk(KERN_ERR
140 "Trying to install chip data for IRQ%d\n", irq);
141 return -EINVAL;
142 }
143
144 if (!desc->irq_data.chip) {
145 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
146 return -EINVAL; 120 return -EINVAL;
147 }
148
149 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->irq_data.chip_data = data; 121 desc->irq_data.chip_data = data;
151 raw_spin_unlock_irqrestore(&desc->lock, flags); 122 irq_put_desc_unlock(desc, flags);
152
153 return 0; 123 return 0;
154} 124}
155EXPORT_SYMBOL(set_irq_chip_data); 125EXPORT_SYMBOL(irq_set_chip_data);
156 126
157struct irq_data *irq_get_irq_data(unsigned int irq) 127struct irq_data *irq_get_irq_data(unsigned int irq)
158{ 128{
@@ -162,72 +132,75 @@ struct irq_data *irq_get_irq_data(unsigned int irq)
162} 132}
163EXPORT_SYMBOL_GPL(irq_get_irq_data); 133EXPORT_SYMBOL_GPL(irq_get_irq_data);
164 134
165/** 135static void irq_state_clr_disabled(struct irq_desc *desc)
166 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
167 *
168 * @irq: Interrupt number
169 * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
170 *
171 * The IRQ_NESTED_THREAD flag indicates that on
172 * request_threaded_irq() no separate interrupt thread should be
173 * created for the irq as the handler are called nested in the
174 * context of a demultiplexing interrupt handler thread.
175 */
176void set_irq_nested_thread(unsigned int irq, int nest)
177{ 136{
178 struct irq_desc *desc = irq_to_desc(irq); 137 desc->istate &= ~IRQS_DISABLED;
179 unsigned long flags; 138 irq_compat_clr_disabled(desc);
180
181 if (!desc)
182 return;
183
184 raw_spin_lock_irqsave(&desc->lock, flags);
185 if (nest)
186 desc->status |= IRQ_NESTED_THREAD;
187 else
188 desc->status &= ~IRQ_NESTED_THREAD;
189 raw_spin_unlock_irqrestore(&desc->lock, flags);
190} 139}
191EXPORT_SYMBOL_GPL(set_irq_nested_thread);
192 140
193/* 141static void irq_state_set_disabled(struct irq_desc *desc)
194 * default enable function
195 */
196static void default_enable(struct irq_data *data)
197{ 142{
198 struct irq_desc *desc = irq_data_to_desc(data); 143 desc->istate |= IRQS_DISABLED;
144 irq_compat_set_disabled(desc);
145}
199 146
200 desc->irq_data.chip->irq_unmask(&desc->irq_data); 147static void irq_state_clr_masked(struct irq_desc *desc)
201 desc->status &= ~IRQ_MASKED; 148{
149 desc->istate &= ~IRQS_MASKED;
150 irq_compat_clr_masked(desc);
202} 151}
203 152
204/* 153static void irq_state_set_masked(struct irq_desc *desc)
205 * default disable function
206 */
207static void default_disable(struct irq_data *data)
208{ 154{
155 desc->istate |= IRQS_MASKED;
156 irq_compat_set_masked(desc);
209} 157}
210 158
211/* 159int irq_startup(struct irq_desc *desc)
212 * default startup function
213 */
214static unsigned int default_startup(struct irq_data *data)
215{ 160{
216 struct irq_desc *desc = irq_data_to_desc(data); 161 irq_state_clr_disabled(desc);
162 desc->depth = 0;
163
164 if (desc->irq_data.chip->irq_startup) {
165 int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
166 irq_state_clr_masked(desc);
167 return ret;
168 }
217 169
218 desc->irq_data.chip->irq_enable(data); 170 irq_enable(desc);
219 return 0; 171 return 0;
220} 172}
221 173
222/* 174void irq_shutdown(struct irq_desc *desc)
223 * default shutdown function
224 */
225static void default_shutdown(struct irq_data *data)
226{ 175{
227 struct irq_desc *desc = irq_data_to_desc(data); 176 irq_state_set_disabled(desc);
177 desc->depth = 1;
178 if (desc->irq_data.chip->irq_shutdown)
179 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
180 if (desc->irq_data.chip->irq_disable)
181 desc->irq_data.chip->irq_disable(&desc->irq_data);
182 else
183 desc->irq_data.chip->irq_mask(&desc->irq_data);
184 irq_state_set_masked(desc);
185}
228 186
229 desc->irq_data.chip->irq_mask(&desc->irq_data); 187void irq_enable(struct irq_desc *desc)
230 desc->status |= IRQ_MASKED; 188{
189 irq_state_clr_disabled(desc);
190 if (desc->irq_data.chip->irq_enable)
191 desc->irq_data.chip->irq_enable(&desc->irq_data);
192 else
193 desc->irq_data.chip->irq_unmask(&desc->irq_data);
194 irq_state_clr_masked(desc);
195}
196
197void irq_disable(struct irq_desc *desc)
198{
199 irq_state_set_disabled(desc);
200 if (desc->irq_data.chip->irq_disable) {
201 desc->irq_data.chip->irq_disable(&desc->irq_data);
202 irq_state_set_masked(desc);
203 }
231} 204}
232 205
233#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED 206#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
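irq_startup(), irq_shutdown(), irq_enable() and irq_disable() above all follow one dispatch rule: call the chip's dedicated callback when it provides one, otherwise fall back to unmask/mask and track the result in istate. A standalone sketch of that rule (struct chip and the function names here are hypothetical):

#include <stdio.h>

struct chip {
	void (*irq_enable)(void);	/* optional, may be NULL */
	void (*irq_unmask)(void);	/* fallback */
};

static void my_unmask(void) { puts("unmask"); }
static void my_enable(void) { puts("enable"); }

static void do_enable(struct chip *c)
{
	if (c->irq_enable)
		c->irq_enable();	/* dedicated callback wins */
	else
		c->irq_unmask();	/* else fall back, like irq_enable() */
}

int main(void)
{
	struct chip plain = { .irq_unmask = my_unmask };
	struct chip fancy = { .irq_enable = my_enable, .irq_unmask = my_unmask };

	do_enable(&plain);	/* prints "unmask" */
	do_enable(&fancy);	/* prints "enable" */
	return 0;
}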
@@ -315,10 +288,6 @@ static void compat_bus_sync_unlock(struct irq_data *data)
315void irq_chip_set_defaults(struct irq_chip *chip) 288void irq_chip_set_defaults(struct irq_chip *chip)
316{ 289{
317#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED 290#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
318 /*
319 * Compat fixup functions need to be before we set the
320 * defaults for enable/disable/startup/shutdown
321 */
322 if (chip->enable) 291 if (chip->enable)
323 chip->irq_enable = compat_irq_enable; 292 chip->irq_enable = compat_irq_enable;
324 if (chip->disable) 293 if (chip->disable)
@@ -327,33 +296,8 @@ void irq_chip_set_defaults(struct irq_chip *chip)
327 chip->irq_shutdown = compat_irq_shutdown; 296 chip->irq_shutdown = compat_irq_shutdown;
328 if (chip->startup) 297 if (chip->startup)
329 chip->irq_startup = compat_irq_startup; 298 chip->irq_startup = compat_irq_startup;
330#endif
331 /*
332 * The real defaults
333 */
334 if (!chip->irq_enable)
335 chip->irq_enable = default_enable;
336 if (!chip->irq_disable)
337 chip->irq_disable = default_disable;
338 if (!chip->irq_startup)
339 chip->irq_startup = default_startup;
340 /*
341 * We use chip->irq_disable, when the user provided its own. When
342 * we have default_disable set for chip->irq_disable, then we need
343 * to use default_shutdown, otherwise the irq line is not
344 * disabled on free_irq():
345 */
346 if (!chip->irq_shutdown)
347 chip->irq_shutdown = chip->irq_disable != default_disable ?
348 chip->irq_disable : default_shutdown;
349
350#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
351 if (!chip->end) 299 if (!chip->end)
352 chip->end = dummy_irq_chip.end; 300 chip->end = dummy_irq_chip.end;
353
354 /*
355 * Now fix up the remaining compat handlers
356 */
357 if (chip->bus_lock) 301 if (chip->bus_lock)
358 chip->irq_bus_lock = compat_bus_lock; 302 chip->irq_bus_lock = compat_bus_lock;
359 if (chip->bus_sync_unlock) 303 if (chip->bus_sync_unlock)
@@ -388,22 +332,22 @@ static inline void mask_ack_irq(struct irq_desc *desc)
388 if (desc->irq_data.chip->irq_ack) 332 if (desc->irq_data.chip->irq_ack)
389 desc->irq_data.chip->irq_ack(&desc->irq_data); 333 desc->irq_data.chip->irq_ack(&desc->irq_data);
390 } 334 }
391 desc->status |= IRQ_MASKED; 335 irq_state_set_masked(desc);
392} 336}
393 337
394static inline void mask_irq(struct irq_desc *desc) 338void mask_irq(struct irq_desc *desc)
395{ 339{
396 if (desc->irq_data.chip->irq_mask) { 340 if (desc->irq_data.chip->irq_mask) {
397 desc->irq_data.chip->irq_mask(&desc->irq_data); 341 desc->irq_data.chip->irq_mask(&desc->irq_data);
398 desc->status |= IRQ_MASKED; 342 irq_state_set_masked(desc);
399 } 343 }
400} 344}
401 345
402static inline void unmask_irq(struct irq_desc *desc) 346void unmask_irq(struct irq_desc *desc)
403{ 347{
404 if (desc->irq_data.chip->irq_unmask) { 348 if (desc->irq_data.chip->irq_unmask) {
405 desc->irq_data.chip->irq_unmask(&desc->irq_data); 349 desc->irq_data.chip->irq_unmask(&desc->irq_data);
406 desc->status &= ~IRQ_MASKED; 350 irq_state_clr_masked(desc);
407 } 351 }
408} 352}
409 353
@@ -428,10 +372,11 @@ void handle_nested_irq(unsigned int irq)
428 kstat_incr_irqs_this_cpu(irq, desc); 372 kstat_incr_irqs_this_cpu(irq, desc);
429 373
430 action = desc->action; 374 action = desc->action;
431 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 375 if (unlikely(!action || (desc->istate & IRQS_DISABLED)))
432 goto out_unlock; 376 goto out_unlock;
433 377
434 desc->status |= IRQ_INPROGRESS; 378 irq_compat_set_progress(desc);
379 desc->istate |= IRQS_INPROGRESS;
435 raw_spin_unlock_irq(&desc->lock); 380 raw_spin_unlock_irq(&desc->lock);
436 381
437 action_ret = action->thread_fn(action->irq, action->dev_id); 382 action_ret = action->thread_fn(action->irq, action->dev_id);
@@ -439,13 +384,21 @@ void handle_nested_irq(unsigned int irq)
439 note_interrupt(irq, desc, action_ret); 384 note_interrupt(irq, desc, action_ret);
440 385
441 raw_spin_lock_irq(&desc->lock); 386 raw_spin_lock_irq(&desc->lock);
442 desc->status &= ~IRQ_INPROGRESS; 387 desc->istate &= ~IRQS_INPROGRESS;
388 irq_compat_clr_progress(desc);
443 389
444out_unlock: 390out_unlock:
445 raw_spin_unlock_irq(&desc->lock); 391 raw_spin_unlock_irq(&desc->lock);
446} 392}
447EXPORT_SYMBOL_GPL(handle_nested_irq); 393EXPORT_SYMBOL_GPL(handle_nested_irq);
448 394
395static bool irq_check_poll(struct irq_desc *desc)
396{
397 if (!(desc->istate & IRQS_POLL_INPROGRESS))
398 return false;
399 return irq_wait_for_poll(desc);
400}
401
449/** 402/**
450 * handle_simple_irq - Simple and software-decoded IRQs. 403 * handle_simple_irq - Simple and software-decoded IRQs.
451 * @irq: the interrupt number 404 * @irq: the interrupt number
@@ -461,29 +414,20 @@ EXPORT_SYMBOL_GPL(handle_nested_irq);
461void 414void
462handle_simple_irq(unsigned int irq, struct irq_desc *desc) 415handle_simple_irq(unsigned int irq, struct irq_desc *desc)
463{ 416{
464 struct irqaction *action;
465 irqreturn_t action_ret;
466
467 raw_spin_lock(&desc->lock); 417 raw_spin_lock(&desc->lock);
468 418
469 if (unlikely(desc->status & IRQ_INPROGRESS)) 419 if (unlikely(desc->istate & IRQS_INPROGRESS))
470 goto out_unlock; 420 if (!irq_check_poll(desc))
471 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 421 goto out_unlock;
422
423 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
472 kstat_incr_irqs_this_cpu(irq, desc); 424 kstat_incr_irqs_this_cpu(irq, desc);
473 425
474 action = desc->action; 426 if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED)))
475 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
476 goto out_unlock; 427 goto out_unlock;
477 428
478 desc->status |= IRQ_INPROGRESS; 429 handle_irq_event(desc);
479 raw_spin_unlock(&desc->lock);
480 430
481 action_ret = handle_IRQ_event(irq, action);
482 if (!noirqdebug)
483 note_interrupt(irq, desc, action_ret);
484
485 raw_spin_lock(&desc->lock);
486 desc->status &= ~IRQ_INPROGRESS;
487out_unlock: 431out_unlock:
488 raw_spin_unlock(&desc->lock); 432 raw_spin_unlock(&desc->lock);
489} 433}
@@ -501,42 +445,42 @@ out_unlock:
501void 445void
502handle_level_irq(unsigned int irq, struct irq_desc *desc) 446handle_level_irq(unsigned int irq, struct irq_desc *desc)
503{ 447{
504 struct irqaction *action;
505 irqreturn_t action_ret;
506
507 raw_spin_lock(&desc->lock); 448 raw_spin_lock(&desc->lock);
508 mask_ack_irq(desc); 449 mask_ack_irq(desc);
509 450
510 if (unlikely(desc->status & IRQ_INPROGRESS)) 451 if (unlikely(desc->istate & IRQS_INPROGRESS))
511 goto out_unlock; 452 if (!irq_check_poll(desc))
512 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 453 goto out_unlock;
454
455 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
513 kstat_incr_irqs_this_cpu(irq, desc); 456 kstat_incr_irqs_this_cpu(irq, desc);
514 457
515 /* 458 /*
 516 * If it's disabled or no action is available 459 * If it's disabled or no action is available
517 * keep it masked and get out of here 460 * keep it masked and get out of here
518 */ 461 */
519 action = desc->action; 462 if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED)))
520 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
521 goto out_unlock; 463 goto out_unlock;
522 464
523 desc->status |= IRQ_INPROGRESS; 465 handle_irq_event(desc);
524 raw_spin_unlock(&desc->lock);
525
526 action_ret = handle_IRQ_event(irq, action);
527 if (!noirqdebug)
528 note_interrupt(irq, desc, action_ret);
529 466
530 raw_spin_lock(&desc->lock); 467 if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT)))
531 desc->status &= ~IRQ_INPROGRESS;
532
533 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
534 unmask_irq(desc); 468 unmask_irq(desc);
535out_unlock: 469out_unlock:
536 raw_spin_unlock(&desc->lock); 470 raw_spin_unlock(&desc->lock);
537} 471}
538EXPORT_SYMBOL_GPL(handle_level_irq); 472EXPORT_SYMBOL_GPL(handle_level_irq);
539 473
474#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
475static inline void preflow_handler(struct irq_desc *desc)
476{
477 if (desc->preflow_handler)
478 desc->preflow_handler(&desc->irq_data);
479}
480#else
481static inline void preflow_handler(struct irq_desc *desc) { }
482#endif
483
540/** 484/**
541 * handle_fasteoi_irq - irq handler for transparent controllers 485 * handle_fasteoi_irq - irq handler for transparent controllers
542 * @irq: the interrupt number 486 * @irq: the interrupt number
@@ -550,42 +494,41 @@ EXPORT_SYMBOL_GPL(handle_level_irq);
550void 494void
551handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) 495handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
552{ 496{
553 struct irqaction *action;
554 irqreturn_t action_ret;
555
556 raw_spin_lock(&desc->lock); 497 raw_spin_lock(&desc->lock);
557 498
558 if (unlikely(desc->status & IRQ_INPROGRESS)) 499 if (unlikely(desc->istate & IRQS_INPROGRESS))
559 goto out; 500 if (!irq_check_poll(desc))
501 goto out;
560 502
561 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 503 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
562 kstat_incr_irqs_this_cpu(irq, desc); 504 kstat_incr_irqs_this_cpu(irq, desc);
563 505
564 /* 506 /*
 565 * If it's disabled or no action is available 507 * If it's disabled or no action is available
566 * then mask it and get out of here: 508 * then mask it and get out of here:
567 */ 509 */
568 action = desc->action; 510 if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) {
569 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 511 irq_compat_set_pending(desc);
570 desc->status |= IRQ_PENDING; 512 desc->istate |= IRQS_PENDING;
571 mask_irq(desc); 513 mask_irq(desc);
572 goto out; 514 goto out;
573 } 515 }
574 516
575 desc->status |= IRQ_INPROGRESS; 517 if (desc->istate & IRQS_ONESHOT)
576 desc->status &= ~IRQ_PENDING; 518 mask_irq(desc);
577 raw_spin_unlock(&desc->lock);
578 519
579 action_ret = handle_IRQ_event(irq, action); 520 preflow_handler(desc);
580 if (!noirqdebug) 521 handle_irq_event(desc);
581 note_interrupt(irq, desc, action_ret);
582 522
583 raw_spin_lock(&desc->lock); 523out_eoi:
584 desc->status &= ~IRQ_INPROGRESS;
585out:
586 desc->irq_data.chip->irq_eoi(&desc->irq_data); 524 desc->irq_data.chip->irq_eoi(&desc->irq_data);
587 525out_unlock:
588 raw_spin_unlock(&desc->lock); 526 raw_spin_unlock(&desc->lock);
527 return;
528out:
529 if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED))
530 goto out_eoi;
531 goto out_unlock;
589} 532}
590 533
591/** 534/**
@@ -609,32 +552,28 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
609{ 552{
610 raw_spin_lock(&desc->lock); 553 raw_spin_lock(&desc->lock);
611 554
612 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 555 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
613
614 /* 556 /*
 615 * If we're currently running this IRQ, or it's disabled, 557 * If we're currently running this IRQ, or it's disabled,
616 * we shouldn't process the IRQ. Mark it pending, handle 558 * we shouldn't process the IRQ. Mark it pending, handle
617 * the necessary masking and go out 559 * the necessary masking and go out
618 */ 560 */
619 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || 561 if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) ||
620 !desc->action)) { 562 !desc->action))) {
621 desc->status |= (IRQ_PENDING | IRQ_MASKED); 563 if (!irq_check_poll(desc)) {
622 mask_ack_irq(desc); 564 irq_compat_set_pending(desc);
623 goto out_unlock; 565 desc->istate |= IRQS_PENDING;
566 mask_ack_irq(desc);
567 goto out_unlock;
568 }
624 } 569 }
625 kstat_incr_irqs_this_cpu(irq, desc); 570 kstat_incr_irqs_this_cpu(irq, desc);
626 571
627 /* Start handling the irq */ 572 /* Start handling the irq */
628 desc->irq_data.chip->irq_ack(&desc->irq_data); 573 desc->irq_data.chip->irq_ack(&desc->irq_data);
629 574
630 /* Mark the IRQ currently in progress.*/
631 desc->status |= IRQ_INPROGRESS;
632
633 do { 575 do {
634 struct irqaction *action = desc->action; 576 if (unlikely(!desc->action)) {
635 irqreturn_t action_ret;
636
637 if (unlikely(!action)) {
638 mask_irq(desc); 577 mask_irq(desc);
639 goto out_unlock; 578 goto out_unlock;
640 } 579 }
@@ -644,22 +583,17 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
644 * one, we could have masked the irq. 583 * one, we could have masked the irq.
 645 * Re-enable it if it was not disabled in the meantime. 584 * Re-enable it if it was not disabled in the meantime.
646 */ 585 */
647 if (unlikely((desc->status & 586 if (unlikely(desc->istate & IRQS_PENDING)) {
648 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 587 if (!(desc->istate & IRQS_DISABLED) &&
649 (IRQ_PENDING | IRQ_MASKED))) { 588 (desc->istate & IRQS_MASKED))
650 unmask_irq(desc); 589 unmask_irq(desc);
651 } 590 }
652 591
653 desc->status &= ~IRQ_PENDING; 592 handle_irq_event(desc);
654 raw_spin_unlock(&desc->lock);
655 action_ret = handle_IRQ_event(irq, action);
656 if (!noirqdebug)
657 note_interrupt(irq, desc, action_ret);
658 raw_spin_lock(&desc->lock);
659 593
660 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); 594 } while ((desc->istate & IRQS_PENDING) &&
595 !(desc->istate & IRQS_DISABLED));
661 596
662 desc->status &= ~IRQ_INPROGRESS;
663out_unlock: 597out_unlock:
664 raw_spin_unlock(&desc->lock); 598 raw_spin_unlock(&desc->lock);
665} 599}
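The rewritten handle_edge_irq() keeps the behaviour of the old status-based loop: an edge that arrives while the handler runs is recorded only as IRQS_PENDING, and the do/while re-runs the handler until no edge is pending, so no event is lost. A single-threaded toy model of the replay loop (extra_edges simulates edges arriving mid-handler; this is a sketch, not kernel code):

#include <stdio.h>

#define IRQS_PENDING 0x200

static unsigned int istate;
static int extra_edges = 1;	/* edges that arrive while handling */

static void handle_event(void)
{
	istate &= ~IRQS_PENDING;
	puts("handler runs");
	if (extra_edges-- > 0)	/* an edge fires mid-handler */
		istate |= IRQS_PENDING;
}

int main(void)
{
	istate |= IRQS_PENDING;	/* first edge latched */
	do {
		handle_event();
	} while (istate & IRQS_PENDING);
	return 0;
}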
@@ -674,103 +608,84 @@ out_unlock:
674void 608void
675handle_percpu_irq(unsigned int irq, struct irq_desc *desc) 609handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
676{ 610{
677 irqreturn_t action_ret; 611 struct irq_chip *chip = irq_desc_get_chip(desc);
678 612
679 kstat_incr_irqs_this_cpu(irq, desc); 613 kstat_incr_irqs_this_cpu(irq, desc);
680 614
681 if (desc->irq_data.chip->irq_ack) 615 if (chip->irq_ack)
682 desc->irq_data.chip->irq_ack(&desc->irq_data); 616 chip->irq_ack(&desc->irq_data);
683 617
684 action_ret = handle_IRQ_event(irq, desc->action); 618 handle_irq_event_percpu(desc, desc->action);
685 if (!noirqdebug)
686 note_interrupt(irq, desc, action_ret);
687 619
688 if (desc->irq_data.chip->irq_eoi) 620 if (chip->irq_eoi)
689 desc->irq_data.chip->irq_eoi(&desc->irq_data); 621 chip->irq_eoi(&desc->irq_data);
690} 622}
691 623
692void 624void
693__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 625__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
694 const char *name) 626 const char *name)
695{ 627{
696 struct irq_desc *desc = irq_to_desc(irq);
697 unsigned long flags; 628 unsigned long flags;
629 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
698 630
699 if (!desc) { 631 if (!desc)
700 printk(KERN_ERR
701 "Trying to install type control for IRQ%d\n", irq);
702 return; 632 return;
703 }
704 633
705 if (!handle) 634 if (!handle) {
706 handle = handle_bad_irq; 635 handle = handle_bad_irq;
707 else if (desc->irq_data.chip == &no_irq_chip) { 636 } else {
708 printk(KERN_WARNING "Trying to install %sinterrupt handler " 637 if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
709 "for IRQ%d\n", is_chained ? "chained " : "", irq); 638 goto out;
710 /*
711 * Some ARM implementations install a handler for really dumb
712 * interrupt hardware without setting an irq_chip. This worked
713 * with the ARM no_irq_chip but the check in setup_irq would
714 * prevent us to setup the interrupt at all. Switch it to
715 * dummy_irq_chip for easy transition.
716 */
717 desc->irq_data.chip = &dummy_irq_chip;
718 } 639 }
719 640
720 chip_bus_lock(desc);
721 raw_spin_lock_irqsave(&desc->lock, flags);
722
723 /* Uninstall? */ 641 /* Uninstall? */
724 if (handle == handle_bad_irq) { 642 if (handle == handle_bad_irq) {
725 if (desc->irq_data.chip != &no_irq_chip) 643 if (desc->irq_data.chip != &no_irq_chip)
726 mask_ack_irq(desc); 644 mask_ack_irq(desc);
727 desc->status |= IRQ_DISABLED; 645 irq_compat_set_disabled(desc);
646 desc->istate |= IRQS_DISABLED;
728 desc->depth = 1; 647 desc->depth = 1;
729 } 648 }
730 desc->handle_irq = handle; 649 desc->handle_irq = handle;
731 desc->name = name; 650 desc->name = name;
732 651
733 if (handle != handle_bad_irq && is_chained) { 652 if (handle != handle_bad_irq && is_chained) {
734 desc->status &= ~IRQ_DISABLED; 653 irq_settings_set_noprobe(desc);
735 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 654 irq_settings_set_norequest(desc);
736 desc->depth = 0; 655 irq_startup(desc);
737 desc->irq_data.chip->irq_startup(&desc->irq_data);
738 } 656 }
739 raw_spin_unlock_irqrestore(&desc->lock, flags); 657out:
740 chip_bus_sync_unlock(desc); 658 irq_put_desc_busunlock(desc, flags);
741}
742EXPORT_SYMBOL_GPL(__set_irq_handler);
743
744void
745set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
746 irq_flow_handler_t handle)
747{
748 set_irq_chip(irq, chip);
749 __set_irq_handler(irq, handle, 0, NULL);
750} 659}
660EXPORT_SYMBOL_GPL(__irq_set_handler);
751 661
752void 662void
753set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, 663irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
754 irq_flow_handler_t handle, const char *name) 664 irq_flow_handler_t handle, const char *name)
755{ 665{
756 set_irq_chip(irq, chip); 666 irq_set_chip(irq, chip);
757 __set_irq_handler(irq, handle, 0, name); 667 __irq_set_handler(irq, handle, 0, name);
758} 668}
759 669
760void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) 670void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
761{ 671{
762 struct irq_desc *desc = irq_to_desc(irq);
763 unsigned long flags; 672 unsigned long flags;
673 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
764 674
765 if (!desc) 675 if (!desc)
766 return; 676 return;
677 irq_settings_clr_and_set(desc, clr, set);
678
679 irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
680 IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
681 if (irq_settings_has_no_balance_set(desc))
682 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
683 if (irq_settings_is_per_cpu(desc))
684 irqd_set(&desc->irq_data, IRQD_PER_CPU);
685 if (irq_settings_can_move_pcntxt(desc))
686 irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
767 687
768 /* Sanitize flags */ 688 irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
769 set &= IRQF_MODIFY_MASK;
770 clr &= IRQF_MODIFY_MASK;
771 689
772 raw_spin_lock_irqsave(&desc->lock, flags); 690 irq_put_desc_unlock(desc, flags);
773 desc->status &= ~clr;
774 desc->status |= set;
775 raw_spin_unlock_irqrestore(&desc->lock, flags);
776} 691}
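Most of the chip.c churn above comes from one refactoring: every setter used to open-code irq_to_desc() plus raw_spin_lock_irqsave(), and each now calls irq_get_desc_lock()/irq_put_desc_unlock() (or the buslock variants) instead. A userspace sketch of the lookup-and-lock pattern, with a pthread mutex standing in for desc->lock (compile with -pthread; all names here are illustrative):

#include <pthread.h>
#include <stdio.h>

#define NR 4

struct desc { pthread_mutex_t lock; int data; };

static struct desc table[NR] = {
	[0 ... NR - 1] = { PTHREAD_MUTEX_INITIALIZER, 0 }
};

/* NULL means "invalid irq"; otherwise the desc is returned locked */
static struct desc *get_desc_lock(unsigned int i)
{
	if (i >= NR)
		return NULL;
	pthread_mutex_lock(&table[i].lock);
	return &table[i];
}

static void put_desc_unlock(struct desc *d)
{
	pthread_mutex_unlock(&d->lock);
}

/* a setter shrinks to lookup, one assignment, release */
static int set_data(unsigned int i, int v)
{
	struct desc *d = get_desc_lock(i);

	if (!d)
		return -1;
	d->data = v;
	put_desc_unlock(d);
	return 0;
}

int main(void)
{
	printf("%d %d\n", set_data(1, 42), set_data(9, 7));	/* 0 -1 */
	return 0;
}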
diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h
new file mode 100644
index 00000000000..6bbaf66aca8
--- /dev/null
+++ b/kernel/irq/compat.h
@@ -0,0 +1,72 @@
1/*
2 * Compat layer for transition period
3 */
4#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
5static inline void irq_compat_set_progress(struct irq_desc *desc)
6{
7 desc->status |= IRQ_INPROGRESS;
8}
9
10static inline void irq_compat_clr_progress(struct irq_desc *desc)
11{
12 desc->status &= ~IRQ_INPROGRESS;
13}
14static inline void irq_compat_set_disabled(struct irq_desc *desc)
15{
16 desc->status |= IRQ_DISABLED;
17}
18static inline void irq_compat_clr_disabled(struct irq_desc *desc)
19{
20 desc->status &= ~IRQ_DISABLED;
21}
22static inline void irq_compat_set_pending(struct irq_desc *desc)
23{
24 desc->status |= IRQ_PENDING;
25}
26
27static inline void irq_compat_clr_pending(struct irq_desc *desc)
28{
29 desc->status &= ~IRQ_PENDING;
30}
31static inline void irq_compat_set_masked(struct irq_desc *desc)
32{
33 desc->status |= IRQ_MASKED;
34}
35
36static inline void irq_compat_clr_masked(struct irq_desc *desc)
37{
38 desc->status &= ~IRQ_MASKED;
39}
40static inline void irq_compat_set_move_pending(struct irq_desc *desc)
41{
42 desc->status |= IRQ_MOVE_PENDING;
43}
44
45static inline void irq_compat_clr_move_pending(struct irq_desc *desc)
46{
47 desc->status &= ~IRQ_MOVE_PENDING;
48}
49static inline void irq_compat_set_affinity(struct irq_desc *desc)
50{
51 desc->status |= IRQ_AFFINITY_SET;
52}
53
54static inline void irq_compat_clr_affinity(struct irq_desc *desc)
55{
56 desc->status &= ~IRQ_AFFINITY_SET;
57}
58#else
59static inline void irq_compat_set_progress(struct irq_desc *desc) { }
60static inline void irq_compat_clr_progress(struct irq_desc *desc) { }
61static inline void irq_compat_set_disabled(struct irq_desc *desc) { }
62static inline void irq_compat_clr_disabled(struct irq_desc *desc) { }
63static inline void irq_compat_set_pending(struct irq_desc *desc) { }
64static inline void irq_compat_clr_pending(struct irq_desc *desc) { }
65static inline void irq_compat_set_masked(struct irq_desc *desc) { }
66static inline void irq_compat_clr_masked(struct irq_desc *desc) { }
67static inline void irq_compat_set_move_pending(struct irq_desc *desc) { }
68static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { }
69static inline void irq_compat_set_affinity(struct irq_desc *desc) { }
70static inline void irq_compat_clr_affinity(struct irq_desc *desc) { }
71#endif
72
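compat.h exists purely for the transition period: every write to the new istate word is mirrored into the legacy status word so unconverted readers keep working, and once CONFIG_GENERIC_HARDIRQS_NO_COMPAT is set the mirrors become empty inlines and compile away. A reduced sketch of that dual-representation pattern (NO_COMPAT and the bit values here are illustrative):

#include <stdio.h>

#define IRQ_DISABLED	0x1	/* legacy status bit */
#define IRQS_DISABLED	0x1	/* new istate bit */

struct desc { unsigned int status, istate; };

#ifndef NO_COMPAT
static void compat_set_disabled(struct desc *d) { d->status |= IRQ_DISABLED; }
#else
static void compat_set_disabled(struct desc *d) { }
#endif

static void state_set_disabled(struct desc *d)
{
	d->istate |= IRQS_DISABLED;	/* the new representation */
	compat_set_disabled(d);		/* mirror for legacy readers */
}

int main(void)
{
	struct desc d = { 0, 0 };

	state_set_disabled(&d);
	printf("status=%#x istate=%#x\n", d.status, d.istate);
	return 0;
}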
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
new file mode 100644
index 00000000000..d1a33b7fa61
--- /dev/null
+++ b/kernel/irq/debug.h
@@ -0,0 +1,40 @@
1/*
2 * Debugging printout:
3 */
4
5#include <linux/kallsyms.h>
6
7#define P(f) if (desc->status & f) printk("%14s set\n", #f)
8#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
9
10static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
11{
12 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
13 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
14 printk("->handle_irq(): %p, ", desc->handle_irq);
15 print_symbol("%s\n", (unsigned long)desc->handle_irq);
16 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
17 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
18 printk("->action(): %p\n", desc->action);
19 if (desc->action) {
20 printk("->action->handler(): %p, ", desc->action->handler);
21 print_symbol("%s\n", (unsigned long)desc->action->handler);
22 }
23
24 P(IRQ_LEVEL);
25 P(IRQ_PER_CPU);
26 P(IRQ_NOPROBE);
27 P(IRQ_NOREQUEST);
28 P(IRQ_NOAUTOEN);
29
30 PS(IRQS_AUTODETECT);
31 PS(IRQS_INPROGRESS);
32 PS(IRQS_REPLAY);
33 PS(IRQS_WAITING);
34 PS(IRQS_DISABLED);
35 PS(IRQS_PENDING);
36 PS(IRQS_MASKED);
37}
38
39#undef P
40#undef PS
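The P()/PS() macros in debug.h rely on the preprocessor stringizing operator (#f), so each flag prints under its own name without a hand-written table. The same trick in a self-contained program:

#include <stdio.h>

#define FLAG_A 0x1
#define FLAG_B 0x2

/* #f stringizes the argument, so the flag prints its own name */
#define P(f) if (flags & f) printf("%14s set\n", #f)

int main(void)
{
	unsigned int flags = FLAG_B;

	P(FLAG_A);	/* silent */
	P(FLAG_B);	/* prints "        FLAG_B set" */
	return 0;
}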
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3540a719012..517561fc731 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -51,30 +51,92 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
51 "but no thread function available.", irq, action->name); 51 "but no thread function available.", irq, action->name);
52} 52}
53 53
54/** 54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
55 * handle_IRQ_event - irq action chain handler 55{
56 * @irq: the interrupt number 56 /*
57 * @action: the interrupt action chain for this irq 57 * Wake up the handler thread for this action. In case the
58 * 58 * thread crashed and was killed we just pretend that we
59 * Handles the action chain of an irq event 59 * handled the interrupt. The hardirq handler has disabled the
60 */ 60 * device interrupt, so no irq storm is lurking. If the
61irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) 61 * RUNTHREAD bit is already set, nothing to do.
62 */
63 if (test_bit(IRQTF_DIED, &action->thread_flags) ||
64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
65 return;
66
67 /*
68 * It's safe to OR the mask lockless here. We have only two
69 * places which write to threads_oneshot: This code and the
70 * irq thread.
71 *
72 * This code is the hard irq context and can never run on two
73 * cpus in parallel. If it ever does we have more serious
74 * problems than this bitmask.
75 *
76 * The irq threads of this irq which clear their "running" bit
77 * in threads_oneshot are serialized via desc->lock against
78 * each other and they are serialized against this code by
79 * IRQS_INPROGRESS.
80 *
81 * Hard irq handler:
82 *
83 * spin_lock(desc->lock);
84 * desc->state |= IRQS_INPROGRESS;
85 * spin_unlock(desc->lock);
86 * set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
87 * desc->threads_oneshot |= mask;
88 * spin_lock(desc->lock);
89 * desc->state &= ~IRQS_INPROGRESS;
90 * spin_unlock(desc->lock);
91 *
92 * irq thread:
93 *
94 * again:
95 * spin_lock(desc->lock);
96 * if (desc->state & IRQS_INPROGRESS) {
97 * spin_unlock(desc->lock);
98 * while(desc->state & IRQS_INPROGRESS)
99 * cpu_relax();
100 * goto again;
101 * }
102 * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
103 * desc->threads_oneshot &= ~mask;
104 * spin_unlock(desc->lock);
105 *
106 * So either the thread waits for us to clear IRQS_INPROGRESS
107 * or we are waiting in the flow handler for desc->lock to be
108 * released before we reach this point. The thread also checks
109 * IRQTF_RUNTHREAD under desc->lock. If set it leaves
110 * threads_oneshot untouched and runs the thread another time.
111 */
112 desc->threads_oneshot |= action->thread_mask;
113 wake_up_process(action->thread);
114}
115
116irqreturn_t
117handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
62{ 118{
63 irqreturn_t ret, retval = IRQ_NONE; 119 irqreturn_t retval = IRQ_NONE;
64 unsigned int status = 0; 120 unsigned int random = 0, irq = desc->irq_data.irq;
65 121
66 do { 122 do {
123 irqreturn_t res;
124
67 trace_irq_handler_entry(irq, action); 125 trace_irq_handler_entry(irq, action);
68 ret = action->handler(irq, action->dev_id); 126 res = action->handler(irq, action->dev_id);
69 trace_irq_handler_exit(irq, action, ret); 127 trace_irq_handler_exit(irq, action, res);
70 128
71 switch (ret) { 129 if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
130 irq, action->handler))
131 local_irq_disable();
132
133 switch (res) {
72 case IRQ_WAKE_THREAD: 134 case IRQ_WAKE_THREAD:
73 /* 135 /*
74 * Set result to handled so the spurious check 136 * Set result to handled so the spurious check
75 * does not trigger. 137 * does not trigger.
76 */ 138 */
77 ret = IRQ_HANDLED; 139 res = IRQ_HANDLED;
78 140
79 /* 141 /*
80 * Catch drivers which return WAKE_THREAD but 142 * Catch drivers which return WAKE_THREAD but
@@ -85,36 +147,56 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
85 break; 147 break;
86 } 148 }
87 149
88 /* 150 irq_wake_thread(desc, action);
89 * Wake up the handler thread for this
90 * action. In case the thread crashed and was
91 * killed we just pretend that we handled the
92 * interrupt. The hardirq handler above has
93 * disabled the device interrupt, so no irq
94 * storm is lurking.
95 */
96 if (likely(!test_bit(IRQTF_DIED,
97 &action->thread_flags))) {
98 set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
99 wake_up_process(action->thread);
100 }
101 151
102 /* Fall through to add to randomness */ 152 /* Fall through to add to randomness */
103 case IRQ_HANDLED: 153 case IRQ_HANDLED:
104 status |= action->flags; 154 random |= action->flags;
105 break; 155 break;
106 156
107 default: 157 default:
108 break; 158 break;
109 } 159 }
110 160
111 retval |= ret; 161 retval |= res;
112 action = action->next; 162 action = action->next;
113 } while (action); 163 } while (action);
114 164
115 if (status & IRQF_SAMPLE_RANDOM) 165 if (random & IRQF_SAMPLE_RANDOM)
116 add_interrupt_randomness(irq); 166 add_interrupt_randomness(irq);
117 local_irq_disable();
118 167
168 if (!noirqdebug)
169 note_interrupt(irq, desc, retval);
119 return retval; 170 return retval;
120} 171}
172
173irqreturn_t handle_irq_event(struct irq_desc *desc)
174{
175 struct irqaction *action = desc->action;
176 irqreturn_t ret;
177
178 irq_compat_clr_pending(desc);
179 desc->istate &= ~IRQS_PENDING;
180 irq_compat_set_progress(desc);
181 desc->istate |= IRQS_INPROGRESS;
182 raw_spin_unlock(&desc->lock);
183
184 ret = handle_irq_event_percpu(desc, action);
185
186 raw_spin_lock(&desc->lock);
187 desc->istate &= ~IRQS_INPROGRESS;
188 irq_compat_clr_progress(desc);
189 return ret;
190}
191
192/**
193 * handle_IRQ_event - irq action chain handler
194 * @irq: the interrupt number
195 * @action: the interrupt action chain for this irq
196 *
197 * Handles the action chain of an irq event
198 */
199irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
200{
201 return handle_irq_event_percpu(irq_to_desc(irq), action);
202}
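The key invariant in irq_wake_thread() above is the wake-once gate: test_and_set_bit(IRQTF_RUNTHREAD) lets only the first hard interrupt between two thread runs call wake_up_process(); later ones see the bit already set and return. A userspace model of the gate using a C11 atomic_flag in place of set_bit/wake_up_process (a sketch, not the kernel path):

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag runthread = ATOMIC_FLAG_INIT;

/* hard-irq side: only the first caller since the last run wakes */
static void wake_thread(void)
{
	if (atomic_flag_test_and_set(&runthread))
		return;		/* RUNTHREAD already set: nothing to do */
	puts("wake_up_process(thread)");
}

/* thread side: consume the wakeup before going back to sleep */
static void thread_finished(void)
{
	atomic_flag_clear(&runthread);
}

int main(void)
{
	wake_thread();		/* wakes */
	wake_thread();		/* swallowed */
	thread_finished();
	wake_thread();		/* wakes again */
	return 0;
}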
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 99c3bc8a6fb..6c6ec9a4902 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,5 +1,9 @@
1/* 1/*
2 * IRQ subsystem internal functions and variables: 2 * IRQ subsystem internal functions and variables:
3 *
 4 * Do not ever include this file from anything other than
 5 * kernel/irq/. Do not even think about using any information outside
 6 * of this file for your non-core code.
3 */ 7 */
4#include <linux/irqdesc.h> 8#include <linux/irqdesc.h>
5 9
@@ -9,25 +13,89 @@
9# define IRQ_BITMAP_BITS NR_IRQS 13# define IRQ_BITMAP_BITS NR_IRQS
10#endif 14#endif
11 15
16#define istate core_internal_state__do_not_mess_with_it
17
18#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
19# define status status_use_accessors
20#endif
21
12extern int noirqdebug; 22extern int noirqdebug;
13 23
24/*
25 * Bits used by threaded handlers:
26 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
27 * IRQTF_DIED - handler thread died
28 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
29 * IRQTF_AFFINITY - irq thread is requested to adjust affinity
30 * IRQTF_FORCED_THREAD - irq action is force threaded
31 */
32enum {
33 IRQTF_RUNTHREAD,
34 IRQTF_DIED,
35 IRQTF_WARNED,
36 IRQTF_AFFINITY,
37 IRQTF_FORCED_THREAD,
38};
39
40/*
41 * Bit masks for desc->state
42 *
43 * IRQS_AUTODETECT - autodetection in progress
44 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
45 * detection
46 * IRQS_POLL_INPROGRESS - polling in progress
47 * IRQS_INPROGRESS - Interrupt in progress
48 * IRQS_ONESHOT - irq is not unmasked in primary handler
49 * IRQS_REPLAY - irq is replayed
50 * IRQS_WAITING - irq is waiting
51 * IRQS_DISABLED - irq is disabled
52 * IRQS_PENDING - irq is pending and replayed later
53 * IRQS_MASKED - irq is masked
54 * IRQS_SUSPENDED - irq is suspended
55 */
56enum {
57 IRQS_AUTODETECT = 0x00000001,
58 IRQS_SPURIOUS_DISABLED = 0x00000002,
59 IRQS_POLL_INPROGRESS = 0x00000008,
60 IRQS_INPROGRESS = 0x00000010,
61 IRQS_ONESHOT = 0x00000020,
62 IRQS_REPLAY = 0x00000040,
63 IRQS_WAITING = 0x00000080,
64 IRQS_DISABLED = 0x00000100,
65 IRQS_PENDING = 0x00000200,
66 IRQS_MASKED = 0x00000400,
67 IRQS_SUSPENDED = 0x00000800,
68};
69
70#include "compat.h"
71#include "debug.h"
72#include "settings.h"
73
14#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) 74#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
15 75
16/* Set default functions for irq_chip structures: */ 76/* Set default functions for irq_chip structures: */
17extern void irq_chip_set_defaults(struct irq_chip *chip); 77extern void irq_chip_set_defaults(struct irq_chip *chip);
18 78
19/* Set default handler: */
20extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
21
22extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 79extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
23 unsigned long flags); 80 unsigned long flags);
24extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 81extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
25extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 82extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
26 83
84extern int irq_startup(struct irq_desc *desc);
85extern void irq_shutdown(struct irq_desc *desc);
86extern void irq_enable(struct irq_desc *desc);
87extern void irq_disable(struct irq_desc *desc);
88extern void mask_irq(struct irq_desc *desc);
89extern void unmask_irq(struct irq_desc *desc);
90
27extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 91extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
28 92
93irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
94irqreturn_t handle_irq_event(struct irq_desc *desc);
95
 29/* Resending of interrupts: */ 96/* Resending of interrupts: */
30void check_irq_resend(struct irq_desc *desc, unsigned int irq); 97void check_irq_resend(struct irq_desc *desc, unsigned int irq);
98bool irq_wait_for_poll(struct irq_desc *desc);
31 99
32#ifdef CONFIG_PROC_FS 100#ifdef CONFIG_PROC_FS
33extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 101extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
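Two things in this header are worth calling out: the IRQS_* enum defines the power-of-two bits that replace the old IRQ_* status flags, and the "#define istate core_internal_state__do_not_mess_with_it" line deliberately gives the struct field a hostile long name so that only files including this internal header get the short alias. The aliasing trick in isolation (the field and macro names below are made up):

#include <stdio.h>

struct obj {
	unsigned int internal_state__do_not_mess_with_it;
};

/* only "internal" translation units define the friendly alias */
#define state internal_state__do_not_mess_with_it

int main(void)
{
	struct obj o = { 0 };

	o.state |= 0x10;	/* expands to the long field name */
	printf("%#x\n", o.internal_state__do_not_mess_with_it);
	return 0;
}

A macro this broad is dangerous in general code; the kernel gets away with it by confining the define to a header that only kernel/irq/ may include.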
@@ -43,20 +111,10 @@ static inline void unregister_handler_proc(unsigned int irq,
43 struct irqaction *action) { } 111 struct irqaction *action) { }
44#endif 112#endif
45 113
46extern int irq_select_affinity_usr(unsigned int irq); 114extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
47 115
48extern void irq_set_thread_affinity(struct irq_desc *desc); 116extern void irq_set_thread_affinity(struct irq_desc *desc);
49 117
50#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
51static inline void irq_end(unsigned int irq, struct irq_desc *desc)
52{
53 if (desc->irq_data.chip && desc->irq_data.chip->end)
54 desc->irq_data.chip->end(irq);
55}
56#else
57static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
58#endif
59
60/* Inline functions for support of irq chips on slow busses */ 118/* Inline functions for support of irq chips on slow busses */
61static inline void chip_bus_lock(struct irq_desc *desc) 119static inline void chip_bus_lock(struct irq_desc *desc)
62{ 120{
@@ -70,43 +128,60 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
70 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); 128 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
71} 129}
72 130
131struct irq_desc *
132__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
133void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
134
135static inline struct irq_desc *
136irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
137{
138 return __irq_get_desc_lock(irq, flags, true);
139}
140
141static inline void
142irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
143{
144 __irq_put_desc_unlock(desc, flags, true);
145}
146
147static inline struct irq_desc *
148irq_get_desc_lock(unsigned int irq, unsigned long *flags)
149{
150 return __irq_get_desc_lock(irq, flags, false);
151}
152
153static inline void
154irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
155{
156 __irq_put_desc_unlock(desc, flags, false);
157}
158
73/* 159/*
74 * Debugging printout: 160 * Manipulation functions for irq_data.state
75 */ 161 */
162static inline void irqd_set_move_pending(struct irq_data *d)
163{
164 d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
165 irq_compat_set_move_pending(irq_data_to_desc(d));
166}
76 167
77#include <linux/kallsyms.h> 168static inline void irqd_clr_move_pending(struct irq_data *d)
78 169{
79#define P(f) if (desc->status & f) printk("%14s set\n", #f) 170 d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
171 irq_compat_clr_move_pending(irq_data_to_desc(d));
172}
80 173
81static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) 174static inline void irqd_clear(struct irq_data *d, unsigned int mask)
82{ 175{
83 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", 176 d->state_use_accessors &= ~mask;
84 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
85 printk("->handle_irq(): %p, ", desc->handle_irq);
86 print_symbol("%s\n", (unsigned long)desc->handle_irq);
87 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
88 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
89 printk("->action(): %p\n", desc->action);
90 if (desc->action) {
91 printk("->action->handler(): %p, ", desc->action->handler);
92 print_symbol("%s\n", (unsigned long)desc->action->handler);
93 }
94
95 P(IRQ_INPROGRESS);
96 P(IRQ_DISABLED);
97 P(IRQ_PENDING);
98 P(IRQ_REPLAY);
99 P(IRQ_AUTODETECT);
100 P(IRQ_WAITING);
101 P(IRQ_LEVEL);
102 P(IRQ_MASKED);
103#ifdef CONFIG_IRQ_PER_CPU
104 P(IRQ_PER_CPU);
105#endif
106 P(IRQ_NOPROBE);
107 P(IRQ_NOREQUEST);
108 P(IRQ_NOAUTOEN);
109} 177}
110 178
111#undef P 179static inline void irqd_set(struct irq_data *d, unsigned int mask)
180{
181 d->state_use_accessors |= mask;
182}
112 183
184static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
185{
186 return d->state_use_accessors & mask;
187}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2039bea31bd..dbccc799407 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -79,7 +79,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
79 desc->irq_data.chip_data = NULL; 79 desc->irq_data.chip_data = NULL;
80 desc->irq_data.handler_data = NULL; 80 desc->irq_data.handler_data = NULL;
81 desc->irq_data.msi_desc = NULL; 81 desc->irq_data.msi_desc = NULL;
82 desc->status = IRQ_DEFAULT_INIT_FLAGS; 82 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
83 desc->istate = IRQS_DISABLED;
83 desc->handle_irq = handle_bad_irq; 84 desc->handle_irq = handle_bad_irq;
84 desc->depth = 1; 85 desc->depth = 1;
85 desc->irq_count = 0; 86 desc->irq_count = 0;
@@ -206,6 +207,14 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
206 return NULL; 207 return NULL;
207} 208}
208 209
210static int irq_expand_nr_irqs(unsigned int nr)
211{
212 if (nr > IRQ_BITMAP_BITS)
213 return -ENOMEM;
214 nr_irqs = nr;
215 return 0;
216}
217
209int __init early_irq_init(void) 218int __init early_irq_init(void)
210{ 219{
211 int i, initcnt, node = first_online_node; 220 int i, initcnt, node = first_online_node;
@@ -238,7 +247,7 @@ int __init early_irq_init(void)
238 247
239struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 248struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
240 [0 ... NR_IRQS-1] = { 249 [0 ... NR_IRQS-1] = {
241 .status = IRQ_DEFAULT_INIT_FLAGS, 250 .istate = IRQS_DISABLED,
242 .handle_irq = handle_bad_irq, 251 .handle_irq = handle_bad_irq,
243 .depth = 1, 252 .depth = 1,
244 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), 253 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
@@ -260,8 +269,8 @@ int __init early_irq_init(void)
260 for (i = 0; i < count; i++) { 269 for (i = 0; i < count; i++) {
261 desc[i].irq_data.irq = i; 270 desc[i].irq_data.irq = i;
262 desc[i].irq_data.chip = &no_irq_chip; 271 desc[i].irq_data.chip = &no_irq_chip;
263 /* TODO : do this allocation on-demand ... */
264 desc[i].kstat_irqs = alloc_percpu(unsigned int); 272 desc[i].kstat_irqs = alloc_percpu(unsigned int);
273 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
265 alloc_masks(desc + i, GFP_KERNEL, node); 274 alloc_masks(desc + i, GFP_KERNEL, node);
266 desc_smp_init(desc + i, node); 275 desc_smp_init(desc + i, node);
267 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 276 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -286,24 +295,14 @@ static void free_desc(unsigned int irq)
286 295
287static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 296static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
288{ 297{
289#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
290 struct irq_desc *desc;
291 unsigned int i;
292
293 for (i = 0; i < cnt; i++) {
294 desc = irq_to_desc(start + i);
295 if (desc && !desc->kstat_irqs) {
296 unsigned int __percpu *stats = alloc_percpu(unsigned int);
297
298 if (!stats)
299 return -1;
300 if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
301 free_percpu(stats);
302 }
303 }
304#endif
305 return start; 298 return start;
306} 299}
300
301static int irq_expand_nr_irqs(unsigned int nr)
302{
303 return -ENOMEM;
304}
305
307#endif /* !CONFIG_SPARSE_IRQ */ 306#endif /* !CONFIG_SPARSE_IRQ */
308 307
309/* Dynamic interrupt handling */ 308/* Dynamic interrupt handling */
@@ -347,14 +346,17 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
347 346
348 mutex_lock(&sparse_irq_lock); 347 mutex_lock(&sparse_irq_lock);
349 348
350 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); 349 start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
350 from, cnt, 0);
351 ret = -EEXIST; 351 ret = -EEXIST;
352 if (irq >=0 && start != irq) 352 if (irq >=0 && start != irq)
353 goto err; 353 goto err;
354 354
355 ret = -ENOMEM; 355 if (start + cnt > nr_irqs) {
356 if (start >= nr_irqs) 356 ret = irq_expand_nr_irqs(start + cnt);
357 goto err; 357 if (ret)
358 goto err;
359 }
358 360
359 bitmap_set(allocated_irqs, start, cnt); 361 bitmap_set(allocated_irqs, start, cnt);
360 mutex_unlock(&sparse_irq_lock); 362 mutex_unlock(&sparse_irq_lock);
@@ -401,6 +403,26 @@ unsigned int irq_get_next_irq(unsigned int offset)
401 return find_next_bit(allocated_irqs, nr_irqs, offset); 403 return find_next_bit(allocated_irqs, nr_irqs, offset);
402} 404}
403 405
406struct irq_desc *
407__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
408{
409 struct irq_desc *desc = irq_to_desc(irq);
410
411 if (desc) {
412 if (bus)
413 chip_bus_lock(desc);
414 raw_spin_lock_irqsave(&desc->lock, *flags);
415 }
416 return desc;
417}
418
419void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
420{
421 raw_spin_unlock_irqrestore(&desc->lock, flags);
422 if (bus)
423 chip_bus_sync_unlock(desc);
424}
425
404/** 426/**
405 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 427 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
406 * @irq: irq number to initialize 428 * @irq: irq number to initialize
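irq_alloc_descs() above now searches the full IRQ_BITMAP_BITS range and grows nr_irqs on demand via irq_expand_nr_irqs(). The underlying allocation is a find-run-of-zero-bits-then-set operation; a toy version over a single 32-bit word (find_zero_area/alloc_range are illustrative stand-ins for bitmap_find_next_zero_area()/bitmap_set(), and cnt must stay below 32 here):

#include <stdio.h>

#define BITS 32

static unsigned int bmap;	/* toy allocated_irqs */

/* first position >= from with cnt consecutive clear bits */
static int find_zero_area(unsigned int from, unsigned int cnt)
{
	for (unsigned int s = from; s + cnt <= BITS; s++) {
		unsigned int run = ((1U << cnt) - 1) << s;

		if (!(bmap & run))
			return (int)s;
	}
	return -1;
}

static int alloc_range(unsigned int from, unsigned int cnt)
{
	int start = find_zero_area(from, cnt);

	if (start < 0)
		return -1;	/* the kernel would try to expand nr_irqs */
	bmap |= ((1U << cnt) - 1) << start;	/* bitmap_set() */
	return start;
}

int main(void)
{
	printf("%d\n", alloc_range(0, 4));	/* 0 */
	printf("%d\n", alloc_range(2, 4));	/* 4: bits 2-5 were busy */
	printf("%d\n", alloc_range(0, 2));	/* 8: first free run */
	return 0;
}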
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9033c1c7082..acd599a43bf 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,17 @@
17 17
18#include "internals.h" 18#include "internals.h"
19 19
20#ifdef CONFIG_IRQ_FORCED_THREADING
21__read_mostly bool force_irqthreads;
22
23static int __init setup_forced_irqthreads(char *arg)
24{
25 force_irqthreads = true;
26 return 0;
27}
28early_param("threadirqs", setup_forced_irqthreads);
29#endif
30
20/** 31/**
21 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 32 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
22 * @irq: interrupt number to wait for 33 * @irq: interrupt number to wait for
@@ -30,7 +41,7 @@
30void synchronize_irq(unsigned int irq) 41void synchronize_irq(unsigned int irq)
31{ 42{
32 struct irq_desc *desc = irq_to_desc(irq); 43 struct irq_desc *desc = irq_to_desc(irq);
33 unsigned int status; 44 unsigned int state;
34 45
35 if (!desc) 46 if (!desc)
36 return; 47 return;
@@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq)
42 * Wait until we're out of the critical section. This might 53 * Wait until we're out of the critical section. This might
43 * give the wrong answer due to the lack of memory barriers. 54 * give the wrong answer due to the lack of memory barriers.
44 */ 55 */
45 while (desc->status & IRQ_INPROGRESS) 56 while (desc->istate & IRQS_INPROGRESS)
46 cpu_relax(); 57 cpu_relax();
47 58
48 /* Ok, that indicated we're done: double-check carefully. */ 59 /* Ok, that indicated we're done: double-check carefully. */
49 raw_spin_lock_irqsave(&desc->lock, flags); 60 raw_spin_lock_irqsave(&desc->lock, flags);
50 status = desc->status; 61 state = desc->istate;
51 raw_spin_unlock_irqrestore(&desc->lock, flags); 62 raw_spin_unlock_irqrestore(&desc->lock, flags);
52 63
53 /* Oops, that failed? */ 64 /* Oops, that failed? */
54 } while (status & IRQ_INPROGRESS); 65 } while (state & IRQS_INPROGRESS);
55 66
56 /* 67 /*
57 * We made sure that no hardirq handler is running. Now verify 68 * We made sure that no hardirq handler is running. Now verify
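synchronize_irq() keeps its old structure under the new names: spin on IRQS_INPROGRESS without the lock (cheap but possibly stale), then take desc->lock for an authoritative snapshot, and repeat if the bit was still set. A userspace sketch of that spin-then-recheck pattern (a mutex stands in for the raw spinlock; compile with -pthread):

#include <pthread.h>

#define IRQS_INPROGRESS 0x10

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static volatile unsigned int istate;	/* a handler would set/clear this */

static void wait_for_handler(void)
{
	unsigned int state;

	do {
		/* unlocked spin: may see a stale value, but is cheap */
		while (istate & IRQS_INPROGRESS)
			;	/* cpu_relax() in the kernel */

		/* authoritative recheck under the lock */
		pthread_mutex_lock(&lock);
		state = istate;
		pthread_mutex_unlock(&lock);
	} while (state & IRQS_INPROGRESS);
}

int main(void)
{
	wait_for_handler();	/* nothing in progress: returns at once */
	return 0;
}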
@@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq)
73{ 84{
74 struct irq_desc *desc = irq_to_desc(irq); 85 struct irq_desc *desc = irq_to_desc(irq);
75 86
76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || 87 if (!desc || !irqd_can_balance(&desc->irq_data) ||
77 !desc->irq_data.chip->irq_set_affinity) 88 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
78 return 0; 89 return 0;
79 90
80 return 1; 91 return 1;
@@ -100,67 +111,169 @@ void irq_set_thread_affinity(struct irq_desc *desc)
100 } 111 }
101} 112}
102 113
114#ifdef CONFIG_GENERIC_PENDING_IRQ
115static inline bool irq_can_move_pcntxt(struct irq_desc *desc)
116{
117 return irq_settings_can_move_pcntxt(desc);
118}
119static inline bool irq_move_pending(struct irq_desc *desc)
120{
121 return irqd_is_setaffinity_pending(&desc->irq_data);
122}
123static inline void
124irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
125{
126 cpumask_copy(desc->pending_mask, mask);
127}
128static inline void
129irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
130{
131 cpumask_copy(mask, desc->pending_mask);
132}
133#else
134static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; }
135static inline bool irq_move_pending(struct irq_desc *desc) { return false; }
136static inline void
137irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
138static inline void
139irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
140#endif
141
103/** 142/**
104 * irq_set_affinity - Set the irq affinity of a given irq 143 * irq_set_affinity - Set the irq affinity of a given irq
105 * @irq: Interrupt to set affinity 144 * @irq: Interrupt to set affinity
106 * @cpumask: cpumask 145 * @cpumask: cpumask
107 * 146 *
108 */ 147 */
109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) 148int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
110{ 149{
111 struct irq_desc *desc = irq_to_desc(irq); 150 struct irq_desc *desc = irq_to_desc(irq);
112 struct irq_chip *chip = desc->irq_data.chip; 151 struct irq_chip *chip = desc->irq_data.chip;
113 unsigned long flags; 152 unsigned long flags;
153 int ret = 0;
114 154
115 if (!chip->irq_set_affinity) 155 if (!chip->irq_set_affinity)
116 return -EINVAL; 156 return -EINVAL;
117 157
118 raw_spin_lock_irqsave(&desc->lock, flags); 158 raw_spin_lock_irqsave(&desc->lock, flags);
119 159
120#ifdef CONFIG_GENERIC_PENDING_IRQ 160 if (irq_can_move_pcntxt(desc)) {
121 if (desc->status & IRQ_MOVE_PCNTXT) { 161 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
122 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { 162 switch (ret) {
123 cpumask_copy(desc->irq_data.affinity, cpumask); 163 case IRQ_SET_MASK_OK:
164 cpumask_copy(desc->irq_data.affinity, mask);
165 case IRQ_SET_MASK_OK_NOCOPY:
124 irq_set_thread_affinity(desc); 166 irq_set_thread_affinity(desc);
167 ret = 0;
125 } 168 }
169 } else {
170 irqd_set_move_pending(&desc->irq_data);
171 irq_copy_pending(desc, mask);
126 } 172 }
127 else { 173
128 desc->status |= IRQ_MOVE_PENDING; 174 if (desc->affinity_notify) {
129 cpumask_copy(desc->pending_mask, cpumask); 175 kref_get(&desc->affinity_notify->kref);
130 } 176 schedule_work(&desc->affinity_notify->work);
131#else
132 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
133 cpumask_copy(desc->irq_data.affinity, cpumask);
134 irq_set_thread_affinity(desc);
135 } 177 }
136#endif 178 irq_compat_set_affinity(desc);
137 desc->status |= IRQ_AFFINITY_SET; 179 irqd_set(&desc->irq_data, IRQD_AFFINITY_SET);
138 raw_spin_unlock_irqrestore(&desc->lock, flags); 180 raw_spin_unlock_irqrestore(&desc->lock, flags);
139 return 0; 181 return ret;
140} 182}
141 183
142int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) 184int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
143{ 185{
186 unsigned long flags;
187 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
188
189 if (!desc)
190 return -EINVAL;
191 desc->affinity_hint = m;
192 irq_put_desc_unlock(desc, flags);
193 return 0;
194}
195EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
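
A hedged usage sketch for the hint (hypothetical driver; all my_* names are invented): publish the preferred CPU for a queue vector so userspace irqbalance can honour it, and clear the hint again before the irq is freed.

	static irqreturn_t my_queue_isr(int irq, void *dev_id)
	{
		return IRQ_HANDLED;		/* real work elided */
	}

	static int my_setup_queue_irq(struct my_queue *q)
	{
		int err = request_irq(q->irq, my_queue_isr, 0, "my-queue", q);

		if (err)
			return err;
		return irq_set_affinity_hint(q->irq, cpumask_of(q->cpu));
	}

	static void my_teardown_queue_irq(struct my_queue *q)
	{
		irq_set_affinity_hint(q->irq, NULL);	/* must not dangle */
		free_irq(q->irq, q);
	}
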
196
197static void irq_affinity_notify(struct work_struct *work)
198{
199 struct irq_affinity_notify *notify =
200 container_of(work, struct irq_affinity_notify, work);
201 struct irq_desc *desc = irq_to_desc(notify->irq);
202 cpumask_var_t cpumask;
203 unsigned long flags;
204
205 if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
206 goto out;
207
208 raw_spin_lock_irqsave(&desc->lock, flags);
209 if (irq_move_pending(desc))
210 irq_get_pending(cpumask, desc);
211 else
212 cpumask_copy(cpumask, desc->irq_data.affinity);
213 raw_spin_unlock_irqrestore(&desc->lock, flags);
214
215 notify->notify(notify, cpumask);
216
217 free_cpumask_var(cpumask);
218out:
219 kref_put(&notify->kref, notify->release);
220}
221
222/**
223 * irq_set_affinity_notifier - control notification of IRQ affinity changes
224 * @irq: Interrupt for which to enable/disable notification
225 * @notify: Context for notification, or %NULL to disable
226 * notification. Function pointers must be initialised;

227 * the other fields will be initialised by this function.
228 *
229 * Must be called in process context. Notification may only be enabled
230 * after the IRQ is allocated and must be disabled before the IRQ is
231 * freed using free_irq().
232 */
233int
234irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
235{
144 struct irq_desc *desc = irq_to_desc(irq); 236 struct irq_desc *desc = irq_to_desc(irq);
237 struct irq_affinity_notify *old_notify;
145 unsigned long flags; 238 unsigned long flags;
146 239
240 /* The release function is promised process context */
241 might_sleep();
242
147 if (!desc) 243 if (!desc)
148 return -EINVAL; 244 return -EINVAL;
149 245
246 /* Complete initialisation of *notify */
247 if (notify) {
248 notify->irq = irq;
249 kref_init(&notify->kref);
250 INIT_WORK(&notify->work, irq_affinity_notify);
251 }
252
150 raw_spin_lock_irqsave(&desc->lock, flags); 253 raw_spin_lock_irqsave(&desc->lock, flags);
151 desc->affinity_hint = m; 254 old_notify = desc->affinity_notify;
255 desc->affinity_notify = notify;
152 raw_spin_unlock_irqrestore(&desc->lock, flags); 256 raw_spin_unlock_irqrestore(&desc->lock, flags);
153 257
258 if (old_notify)
259 kref_put(&old_notify->kref, old_notify->release);
260
154 return 0; 261 return 0;
155} 262}
156EXPORT_SYMBOL_GPL(irq_set_affinity_hint); 263EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
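
A sketch of a consumer, under the constraints spelled out in the kernel-doc above (process context, enable after the irq is allocated, disable before free_irq()); struct my_dev and the requeue helper are hypothetical:

	static void my_affinity_notify(struct irq_affinity_notify *n,
				       const cpumask_t *mask)
	{
		struct my_dev *md = container_of(n, struct my_dev, aff_notify);

		my_requeue_rx_work(md, mask);	/* hypothetical rebalancing */
	}

	static void my_affinity_release(struct kref *ref)
	{
		/* last reference dropped; runs in process context */
	}

	static int my_enable_affinity_notify(struct my_dev *md)
	{
		/* only the function pointers are ours to initialise */
		md->aff_notify.notify = my_affinity_notify;
		md->aff_notify.release = my_affinity_release;
		return irq_set_affinity_notifier(md->irq, &md->aff_notify);
	}
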
157 264
158#ifndef CONFIG_AUTO_IRQ_AFFINITY 265#ifndef CONFIG_AUTO_IRQ_AFFINITY
159/* 266/*
160 * Generic version of the affinity autoselector. 267 * Generic version of the affinity autoselector.
161 */ 268 */
162static int setup_affinity(unsigned int irq, struct irq_desc *desc) 269static int
270setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
163{ 271{
272 struct irq_chip *chip = irq_desc_get_chip(desc);
273 struct cpumask *set = irq_default_affinity;
274 int ret;
275
276 /* Excludes PER_CPU and NO_BALANCE interrupts */
164 if (!irq_can_set_affinity(irq)) 277 if (!irq_can_set_affinity(irq))
165 return 0; 278 return 0;
166 279
@@ -168,22 +281,29 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
168 * Preserve an userspace affinity setup, but make sure that 281 * Preserve an userspace affinity setup, but make sure that
169 * one of the targets is online. 282 * one of the targets is online.
170 */ 283 */
171 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 284 if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
172 if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) 285 if (cpumask_intersects(desc->irq_data.affinity,
173 < nr_cpu_ids) 286 cpu_online_mask))
174 goto set_affinity; 287 set = desc->irq_data.affinity;
175 else 288 else {
176 desc->status &= ~IRQ_AFFINITY_SET; 289 irq_compat_clr_affinity(desc);
290 irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
291 }
177 } 292 }
178 293
179 cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); 294 cpumask_and(mask, cpu_online_mask, set);
180set_affinity: 295 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
181 desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); 296 switch (ret) {
182 297 case IRQ_SET_MASK_OK:
298 cpumask_copy(desc->irq_data.affinity, mask);
299 case IRQ_SET_MASK_OK_NOCOPY:
300 irq_set_thread_affinity(desc);
301 }
183 return 0; 302 return 0;
184} 303}
185#else 304#else
186static inline int setup_affinity(unsigned int irq, struct irq_desc *d) 305static inline int
306setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
187{ 307{
188 return irq_select_affinity(irq); 308 return irq_select_affinity(irq);
189} 309}
@@ -192,23 +312,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
192/* 312/*
193 * Called when affinity is set via /proc/irq 313 * Called when affinity is set via /proc/irq
194 */ 314 */
195int irq_select_affinity_usr(unsigned int irq) 315int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
196{ 316{
197 struct irq_desc *desc = irq_to_desc(irq); 317 struct irq_desc *desc = irq_to_desc(irq);
198 unsigned long flags; 318 unsigned long flags;
199 int ret; 319 int ret;
200 320
201 raw_spin_lock_irqsave(&desc->lock, flags); 321 raw_spin_lock_irqsave(&desc->lock, flags);
202 ret = setup_affinity(irq, desc); 322 ret = setup_affinity(irq, desc, mask);
203 if (!ret)
204 irq_set_thread_affinity(desc);
205 raw_spin_unlock_irqrestore(&desc->lock, flags); 323 raw_spin_unlock_irqrestore(&desc->lock, flags);
206
207 return ret; 324 return ret;
208} 325}
209 326
210#else 327#else
211static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) 328static inline int
329setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
212{ 330{
213 return 0; 331 return 0;
214} 332}
@@ -219,13 +337,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
219 if (suspend) { 337 if (suspend) {
220 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) 338 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
221 return; 339 return;
222 desc->status |= IRQ_SUSPENDED; 340 desc->istate |= IRQS_SUSPENDED;
223 } 341 }
224 342
225 if (!desc->depth++) { 343 if (!desc->depth++)
226 desc->status |= IRQ_DISABLED; 344 irq_disable(desc);
227 desc->irq_data.chip->irq_disable(&desc->irq_data); 345}
228 } 346
347static int __disable_irq_nosync(unsigned int irq)
348{
349 unsigned long flags;
350 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
351
352 if (!desc)
353 return -EINVAL;
354 __disable_irq(desc, irq, false);
355 irq_put_desc_busunlock(desc, flags);
356 return 0;
229} 357}
230 358
231/** 359/**
@@ -241,17 +369,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
241 */ 369 */
242void disable_irq_nosync(unsigned int irq) 370void disable_irq_nosync(unsigned int irq)
243{ 371{
244 struct irq_desc *desc = irq_to_desc(irq); 372 __disable_irq_nosync(irq);
245 unsigned long flags;
246
247 if (!desc)
248 return;
249
250 chip_bus_lock(desc);
251 raw_spin_lock_irqsave(&desc->lock, flags);
252 __disable_irq(desc, irq, false);
253 raw_spin_unlock_irqrestore(&desc->lock, flags);
254 chip_bus_sync_unlock(desc);
255} 373}
256EXPORT_SYMBOL(disable_irq_nosync); 374EXPORT_SYMBOL(disable_irq_nosync);
257 375
@@ -269,21 +387,24 @@ EXPORT_SYMBOL(disable_irq_nosync);
269 */ 387 */
270void disable_irq(unsigned int irq) 388void disable_irq(unsigned int irq)
271{ 389{
272 struct irq_desc *desc = irq_to_desc(irq); 390 if (!__disable_irq_nosync(irq))
273
274 if (!desc)
275 return;
276
277 disable_irq_nosync(irq);
278 if (desc->action)
279 synchronize_irq(irq); 391 synchronize_irq(irq);
280} 392}
281EXPORT_SYMBOL(disable_irq); 393EXPORT_SYMBOL(disable_irq);
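
Since both paths now funnel through __disable_irq_nosync(), the depth semantics are unchanged; a hedged sketch of how the nesting behaves in a driver (my_* names invented):

	static void my_quiet_reconfig(struct my_dev *md)
	{
		disable_irq(md->irq);		/* depth 0 -> 1, waits for handlers */
		disable_irq_nosync(md->irq);	/* depth 1 -> 2, nests harmlessly   */
		my_poke_registers(md);		/* hypothetical critical section    */
		enable_irq(md->irq);		/* depth 2 -> 1, line stays off     */
		enable_irq(md->irq);		/* depth 1 -> 0, line live again    */
	}
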
282 394
283void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) 395void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
284{ 396{
285 if (resume) 397 if (resume) {
286 desc->status &= ~IRQ_SUSPENDED; 398 if (!(desc->istate & IRQS_SUSPENDED)) {
399 if (!desc->action)
400 return;
401 if (!(desc->action->flags & IRQF_FORCE_RESUME))
402 return;
 403 /* Pretend that it got disabled! */
404 desc->depth++;
405 }
406 desc->istate &= ~IRQS_SUSPENDED;
407 }
287 408
288 switch (desc->depth) { 409 switch (desc->depth) {
289 case 0: 410 case 0:
@@ -291,12 +412,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
291 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 412 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
292 break; 413 break;
293 case 1: { 414 case 1: {
294 unsigned int status = desc->status & ~IRQ_DISABLED; 415 if (desc->istate & IRQS_SUSPENDED)
295
296 if (desc->status & IRQ_SUSPENDED)
297 goto err_out; 416 goto err_out;
298 /* Prevent probing on this irq: */ 417 /* Prevent probing on this irq: */
299 desc->status = status | IRQ_NOPROBE; 418 irq_settings_set_noprobe(desc);
419 irq_enable(desc);
300 check_irq_resend(desc, irq); 420 check_irq_resend(desc, irq);
301 /* fall-through */ 421 /* fall-through */
302 } 422 }
@@ -318,21 +438,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
318 */ 438 */
319void enable_irq(unsigned int irq) 439void enable_irq(unsigned int irq)
320{ 440{
321 struct irq_desc *desc = irq_to_desc(irq);
322 unsigned long flags; 441 unsigned long flags;
442 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
323 443
324 if (!desc) 444 if (!desc)
325 return; 445 return;
446 if (WARN(!desc->irq_data.chip,
447 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
448 goto out;
326 449
327 if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable,
328 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
329 return;
330
331 chip_bus_lock(desc);
332 raw_spin_lock_irqsave(&desc->lock, flags);
333 __enable_irq(desc, irq, false); 450 __enable_irq(desc, irq, false);
334 raw_spin_unlock_irqrestore(&desc->lock, flags); 451out:
335 chip_bus_sync_unlock(desc); 452 irq_put_desc_busunlock(desc, flags);
336} 453}
337EXPORT_SYMBOL(enable_irq); 454EXPORT_SYMBOL(enable_irq);
338 455
@@ -348,7 +465,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
348} 465}
349 466
350/** 467/**
351 * set_irq_wake - control irq power management wakeup 468 * irq_set_irq_wake - control irq power management wakeup
352 * @irq: interrupt to control 469 * @irq: interrupt to control
353 * @on: enable/disable power management wakeup 470 * @on: enable/disable power management wakeup
354 * 471 *
@@ -359,23 +476,22 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
359 * Wakeup mode lets this IRQ wake the system from sleep 476 * Wakeup mode lets this IRQ wake the system from sleep
360 * states like "suspend to RAM". 477 * states like "suspend to RAM".
361 */ 478 */
362int set_irq_wake(unsigned int irq, unsigned int on) 479int irq_set_irq_wake(unsigned int irq, unsigned int on)
363{ 480{
364 struct irq_desc *desc = irq_to_desc(irq);
365 unsigned long flags; 481 unsigned long flags;
482 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
366 int ret = 0; 483 int ret = 0;
367 484
368 /* wakeup-capable irqs can be shared between drivers that 485 /* wakeup-capable irqs can be shared between drivers that
369 * don't need to have the same sleep mode behaviors. 486 * don't need to have the same sleep mode behaviors.
370 */ 487 */
371 raw_spin_lock_irqsave(&desc->lock, flags);
372 if (on) { 488 if (on) {
373 if (desc->wake_depth++ == 0) { 489 if (desc->wake_depth++ == 0) {
374 ret = set_irq_wake_real(irq, on); 490 ret = set_irq_wake_real(irq, on);
375 if (ret) 491 if (ret)
376 desc->wake_depth = 0; 492 desc->wake_depth = 0;
377 else 493 else
378 desc->status |= IRQ_WAKEUP; 494 irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
379 } 495 }
380 } else { 496 } else {
381 if (desc->wake_depth == 0) { 497 if (desc->wake_depth == 0) {
@@ -385,14 +501,13 @@ int set_irq_wake(unsigned int irq, unsigned int on)
385 if (ret) 501 if (ret)
386 desc->wake_depth = 1; 502 desc->wake_depth = 1;
387 else 503 else
388 desc->status &= ~IRQ_WAKEUP; 504 irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
389 } 505 }
390 } 506 }
391 507 irq_put_desc_busunlock(desc, flags);
392 raw_spin_unlock_irqrestore(&desc->lock, flags);
393 return ret; 508 return ret;
394} 509}
395EXPORT_SYMBOL(set_irq_wake); 510EXPORT_SYMBOL(irq_set_irq_wake);
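
The wake_depth refcounting means every enable must be balanced by a disable; a hedged suspend-hook sketch (hypothetical platform driver):

	static int my_suspend(struct device *dev)
	{
		struct my_dev *md = dev_get_drvdata(dev);

		if (device_may_wakeup(dev))
			irq_set_irq_wake(md->irq, 1);	/* wake_depth++ */
		return 0;
	}

	static int my_resume(struct device *dev)
	{
		struct my_dev *md = dev_get_drvdata(dev);

		if (device_may_wakeup(dev))
			irq_set_irq_wake(md->irq, 0);	/* wake_depth-- */
		return 0;
	}
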
396 511
397/* 512/*
398 * Internal function that tells the architecture code whether a 513 * Internal function that tells the architecture code whether a
@@ -401,43 +516,27 @@ EXPORT_SYMBOL(set_irq_wake);
401 */ 516 */
402int can_request_irq(unsigned int irq, unsigned long irqflags) 517int can_request_irq(unsigned int irq, unsigned long irqflags)
403{ 518{
404 struct irq_desc *desc = irq_to_desc(irq);
405 struct irqaction *action;
406 unsigned long flags; 519 unsigned long flags;
520 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
521 int canrequest = 0;
407 522
408 if (!desc) 523 if (!desc)
409 return 0; 524 return 0;
410 525
411 if (desc->status & IRQ_NOREQUEST) 526 if (irq_settings_can_request(desc)) {
412 return 0; 527 if (desc->action)
413 528 if (irqflags & desc->action->flags & IRQF_SHARED)
 414 raw_spin_lock_irqsave(&desc->lock, flags);	 529 canrequest = 1;
415 action = desc->action; 530 }
416 if (action) 531 irq_put_desc_unlock(desc, flags);
417 if (irqflags & action->flags & IRQF_SHARED) 532 return canrequest;
418 action = NULL;
419
420 raw_spin_unlock_irqrestore(&desc->lock, flags);
421
422 return !action;
423}
424
425void compat_irq_chip_set_default_handler(struct irq_desc *desc)
426{
427 /*
 428 * If the architecture still has not overridden
429 * the flow handler then zap the default. This
430 * should catch incorrect flow-type setting.
431 */
432 if (desc->handle_irq == &handle_bad_irq)
433 desc->handle_irq = NULL;
434} 533}
435 534
436int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 535int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
437 unsigned long flags) 536 unsigned long flags)
438{ 537{
439 int ret;
440 struct irq_chip *chip = desc->irq_data.chip; 538 struct irq_chip *chip = desc->irq_data.chip;
539 int ret, unmask = 0;
441 540
442 if (!chip || !chip->irq_set_type) { 541 if (!chip || !chip->irq_set_type) {
443 /* 542 /*
@@ -449,23 +548,43 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
449 return 0; 548 return 0;
450 } 549 }
451 550
551 flags &= IRQ_TYPE_SENSE_MASK;
552
553 if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
554 if (!(desc->istate & IRQS_MASKED))
555 mask_irq(desc);
556 if (!(desc->istate & IRQS_DISABLED))
557 unmask = 1;
558 }
559
452 /* caller masked out all except trigger mode flags */ 560 /* caller masked out all except trigger mode flags */
453 ret = chip->irq_set_type(&desc->irq_data, flags); 561 ret = chip->irq_set_type(&desc->irq_data, flags);
454 562
455 if (ret) 563 switch (ret) {
456 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", 564 case IRQ_SET_MASK_OK:
457 flags, irq, chip->irq_set_type); 565 irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
458 else { 566 irqd_set(&desc->irq_data, flags);
459 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) 567
460 flags |= IRQ_LEVEL; 568 case IRQ_SET_MASK_OK_NOCOPY:
461 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ 569 flags = irqd_get_trigger_type(&desc->irq_data);
462 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); 570 irq_settings_set_trigger_mask(desc, flags);
463 desc->status |= flags; 571 irqd_clear(&desc->irq_data, IRQD_LEVEL);
572 irq_settings_clr_level(desc);
573 if (flags & IRQ_TYPE_LEVEL_MASK) {
574 irq_settings_set_level(desc);
575 irqd_set(&desc->irq_data, IRQD_LEVEL);
576 }
464 577
465 if (chip != desc->irq_data.chip) 578 if (chip != desc->irq_data.chip)
466 irq_chip_set_defaults(desc->irq_data.chip); 579 irq_chip_set_defaults(desc->irq_data.chip);
580 ret = 0;
581 break;
582 default:
583 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
584 flags, irq, chip->irq_set_type);
467 } 585 }
468 586 if (unmask)
587 unmask_irq(desc);
469 return ret; 588 return ret;
470} 589}
471 590
@@ -509,8 +628,11 @@ static int irq_wait_for_interrupt(struct irqaction *action)
509 * handler finished. unmask if the interrupt has not been disabled and 628 * handler finished. unmask if the interrupt has not been disabled and
510 * is marked MASKED. 629 * is marked MASKED.
511 */ 630 */
512static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 631static void irq_finalize_oneshot(struct irq_desc *desc,
632 struct irqaction *action, bool force)
513{ 633{
634 if (!(desc->istate & IRQS_ONESHOT))
635 return;
514again: 636again:
515 chip_bus_lock(desc); 637 chip_bus_lock(desc);
516 raw_spin_lock_irq(&desc->lock); 638 raw_spin_lock_irq(&desc->lock);
@@ -522,26 +644,44 @@ again:
522 * The thread is faster done than the hard interrupt handler 644 * The thread is faster done than the hard interrupt handler
523 * on the other CPU. If we unmask the irq line then the 645 * on the other CPU. If we unmask the irq line then the
524 * interrupt can come in again and masks the line, leaves due 646 * interrupt can come in again and masks the line, leaves due
525 * to IRQ_INPROGRESS and the irq line is masked forever. 647 * to IRQS_INPROGRESS and the irq line is masked forever.
648 *
649 * This also serializes the state of shared oneshot handlers
 650 * versus "desc->threads_oneshot |= action->thread_mask;" in
651 * irq_wake_thread(). See the comment there which explains the
652 * serialization.
526 */ 653 */
527 if (unlikely(desc->status & IRQ_INPROGRESS)) { 654 if (unlikely(desc->istate & IRQS_INPROGRESS)) {
528 raw_spin_unlock_irq(&desc->lock); 655 raw_spin_unlock_irq(&desc->lock);
529 chip_bus_sync_unlock(desc); 656 chip_bus_sync_unlock(desc);
530 cpu_relax(); 657 cpu_relax();
531 goto again; 658 goto again;
532 } 659 }
533 660
534 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 661 /*
535 desc->status &= ~IRQ_MASKED; 662 * Now check again, whether the thread should run. Otherwise
663 * we would clear the threads_oneshot bit of this thread which
664 * was just set.
665 */
666 if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
667 goto out_unlock;
668
669 desc->threads_oneshot &= ~action->thread_mask;
670
671 if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) &&
672 (desc->istate & IRQS_MASKED)) {
673 irq_compat_clr_masked(desc);
674 desc->istate &= ~IRQS_MASKED;
536 desc->irq_data.chip->irq_unmask(&desc->irq_data); 675 desc->irq_data.chip->irq_unmask(&desc->irq_data);
537 } 676 }
677out_unlock:
538 raw_spin_unlock_irq(&desc->lock); 678 raw_spin_unlock_irq(&desc->lock);
539 chip_bus_sync_unlock(desc); 679 chip_bus_sync_unlock(desc);
540} 680}
541 681
542#ifdef CONFIG_SMP 682#ifdef CONFIG_SMP
543/* 683/*
 544 * Check whether we need to change the affinity of the interrupt thread.	 684 * Check whether we need to change the affinity of the interrupt thread.
545 */ 685 */
546static void 686static void
547irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 687irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
@@ -573,6 +713,32 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
573#endif 713#endif
574 714
575/* 715/*
 716 * Interrupts which are not explicitly requested as threaded
717 * interrupts rely on the implicit bh/preempt disable of the hard irq
718 * context. So we need to disable bh here to avoid deadlocks and other
719 * side effects.
720 */
721static void
722irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
723{
724 local_bh_disable();
725 action->thread_fn(action->irq, action->dev_id);
726 irq_finalize_oneshot(desc, action, false);
727 local_bh_enable();
728}
729
730/*
 731 * Interrupts explicitly requested as threaded interrupts want to be
 732 * preemptible - many of them need to sleep and wait for slow buses to
733 * complete.
734 */
735static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
736{
737 action->thread_fn(action->irq, action->dev_id);
738 irq_finalize_oneshot(desc, action, false);
739}
740
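
For contrast with the forced case above, a hedged sketch of an explicitly threaded request (all my_* names invented): the primary handler runs in hard irq context and only quiesces the device, the thread handler may sleep, and IRQF_ONESHOT keeps the line masked until the thread finishes.

	static irqreturn_t my_hardirq(int irq, void *dev_id)
	{
		struct my_dev *md = dev_id;

		if (!my_irq_raised(md))
			return IRQ_NONE;
		return IRQ_WAKE_THREAD;		/* defer to my_thread_fn */
	}

	static irqreturn_t my_thread_fn(int irq, void *dev_id)
	{
		my_talk_to_slow_bus(dev_id);	/* may sleep here */
		return IRQ_HANDLED;
	}

	static int my_probe_irq(struct my_dev *md)
	{
		return request_threaded_irq(md->irq, my_hardirq, my_thread_fn,
					    IRQF_ONESHOT, "my-dev", md);
	}
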
741/*
576 * Interrupt handler thread 742 * Interrupt handler thread
577 */ 743 */
578static int irq_thread(void *data) 744static int irq_thread(void *data)
@@ -582,7 +748,14 @@ static int irq_thread(void *data)
582 }; 748 };
583 struct irqaction *action = data; 749 struct irqaction *action = data;
584 struct irq_desc *desc = irq_to_desc(action->irq); 750 struct irq_desc *desc = irq_to_desc(action->irq);
585 int wake, oneshot = desc->status & IRQ_ONESHOT; 751 void (*handler_fn)(struct irq_desc *desc, struct irqaction *action);
752 int wake;
753
 754 if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
755 &action->thread_flags))
756 handler_fn = irq_forced_thread_fn;
757 else
758 handler_fn = irq_thread_fn;
586 759
587 sched_setscheduler(current, SCHED_FIFO, &param); 760 sched_setscheduler(current, SCHED_FIFO, &param);
588 current->irqaction = action; 761 current->irqaction = action;
@@ -594,23 +767,20 @@ static int irq_thread(void *data)
594 atomic_inc(&desc->threads_active); 767 atomic_inc(&desc->threads_active);
595 768
596 raw_spin_lock_irq(&desc->lock); 769 raw_spin_lock_irq(&desc->lock);
597 if (unlikely(desc->status & IRQ_DISABLED)) { 770 if (unlikely(desc->istate & IRQS_DISABLED)) {
598 /* 771 /*
599 * CHECKME: We might need a dedicated 772 * CHECKME: We might need a dedicated
600 * IRQ_THREAD_PENDING flag here, which 773 * IRQ_THREAD_PENDING flag here, which
601 * retriggers the thread in check_irq_resend() 774 * retriggers the thread in check_irq_resend()
602 * but AFAICT IRQ_PENDING should be fine as it 775 * but AFAICT IRQS_PENDING should be fine as it
603 * retriggers the interrupt itself --- tglx 776 * retriggers the interrupt itself --- tglx
604 */ 777 */
605 desc->status |= IRQ_PENDING; 778 irq_compat_set_pending(desc);
779 desc->istate |= IRQS_PENDING;
606 raw_spin_unlock_irq(&desc->lock); 780 raw_spin_unlock_irq(&desc->lock);
607 } else { 781 } else {
608 raw_spin_unlock_irq(&desc->lock); 782 raw_spin_unlock_irq(&desc->lock);
609 783 handler_fn(desc, action);
610 action->thread_fn(action->irq, action->dev_id);
611
612 if (oneshot)
613 irq_finalize_oneshot(action->irq, desc);
614 } 784 }
615 785
616 wake = atomic_dec_and_test(&desc->threads_active); 786 wake = atomic_dec_and_test(&desc->threads_active);
@@ -619,6 +789,9 @@ static int irq_thread(void *data)
619 wake_up(&desc->wait_for_threads); 789 wake_up(&desc->wait_for_threads);
620 } 790 }
621 791
792 /* Prevent a stale desc->threads_oneshot */
793 irq_finalize_oneshot(desc, action, true);
794
622 /* 795 /*
623 * Clear irqaction. Otherwise exit_irq_thread() would make 796 * Clear irqaction. Otherwise exit_irq_thread() would make
624 * fuzz about an active irq thread going into nirvana. 797 * fuzz about an active irq thread going into nirvana.
@@ -633,6 +806,7 @@ static int irq_thread(void *data)
633void exit_irq_thread(void) 806void exit_irq_thread(void)
634{ 807{
635 struct task_struct *tsk = current; 808 struct task_struct *tsk = current;
809 struct irq_desc *desc;
636 810
637 if (!tsk->irqaction) 811 if (!tsk->irqaction)
638 return; 812 return;
@@ -641,6 +815,14 @@ void exit_irq_thread(void)
641 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 815 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
642 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); 816 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
643 817
818 desc = irq_to_desc(tsk->irqaction->irq);
819
820 /*
821 * Prevent a stale desc->threads_oneshot. Must be called
822 * before setting the IRQTF_DIED flag.
823 */
824 irq_finalize_oneshot(desc, tsk->irqaction, true);
825
644 /* 826 /*
645 * Set the THREAD DIED flag to prevent further wakeups of the 827 * Set the THREAD DIED flag to prevent further wakeups of the
646 * soon to be gone threaded handler. 828 * soon to be gone threaded handler.
@@ -648,6 +830,22 @@ void exit_irq_thread(void)
648 set_bit(IRQTF_DIED, &tsk->irqaction->flags); 830 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
649} 831}
650 832
833static void irq_setup_forced_threading(struct irqaction *new)
834{
835 if (!force_irqthreads)
836 return;
837 if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
838 return;
839
840 new->flags |= IRQF_ONESHOT;
841
842 if (!new->thread_fn) {
843 set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
844 new->thread_fn = new->handler;
845 new->handler = irq_default_primary_handler;
846 }
847}
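
The net effect on a plain request_irq() action, spelled out (illustration only, not from this diff; my_isr is a placeholder):

	/*
	 *  before:  handler    = my_isr
	 *           thread_fn  = NULL
	 *  after:   handler    = irq_default_primary_handler
	 *           thread_fn  = my_isr       (IRQTF_FORCED_THREAD set)
	 *           flags     |= IRQF_ONESHOT (line masked until my_isr returns)
	 *
	 * Actions marked IRQF_NO_THREAD, IRQF_PERCPU or already IRQF_ONESHOT
	 * are left untouched, as the early returns above show.
	 */
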
848
651/* 849/*
652 * Internal function to register an irqaction - typically used to 850 * Internal function to register an irqaction - typically used to
653 * allocate special interrupts that are part of the architecture. 851 * allocate special interrupts that are part of the architecture.
@@ -657,9 +855,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
657{ 855{
658 struct irqaction *old, **old_ptr; 856 struct irqaction *old, **old_ptr;
659 const char *old_name = NULL; 857 const char *old_name = NULL;
660 unsigned long flags; 858 unsigned long flags, thread_mask = 0;
661 int nested, shared = 0; 859 int ret, nested, shared = 0;
662 int ret; 860 cpumask_var_t mask;
663 861
664 if (!desc) 862 if (!desc)
665 return -EINVAL; 863 return -EINVAL;
@@ -683,15 +881,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
683 rand_initialize_irq(irq); 881 rand_initialize_irq(irq);
684 } 882 }
685 883
686 /* Oneshot interrupts are not allowed with shared */
687 if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
688 return -EINVAL;
689
690 /* 884 /*
691 * Check whether the interrupt nests into another interrupt 885 * Check whether the interrupt nests into another interrupt
692 * thread. 886 * thread.
693 */ 887 */
694 nested = desc->status & IRQ_NESTED_THREAD; 888 nested = irq_settings_is_nested_thread(desc);
695 if (nested) { 889 if (nested) {
696 if (!new->thread_fn) 890 if (!new->thread_fn)
697 return -EINVAL; 891 return -EINVAL;
@@ -701,6 +895,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
701 * dummy function which warns when called. 895 * dummy function which warns when called.
702 */ 896 */
703 new->handler = irq_nested_primary_handler; 897 new->handler = irq_nested_primary_handler;
898 } else {
899 irq_setup_forced_threading(new);
704 } 900 }
705 901
706 /* 902 /*
@@ -724,6 +920,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
724 new->thread = t; 920 new->thread = t;
725 } 921 }
726 922
923 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
924 ret = -ENOMEM;
925 goto out_thread;
926 }
927
727 /* 928 /*
728 * The following block of code has to be executed atomically 929 * The following block of code has to be executed atomically
729 */ 930 */
@@ -735,29 +936,40 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
735 * Can't share interrupts unless both agree to and are 936 * Can't share interrupts unless both agree to and are
736 * the same type (level, edge, polarity). So both flag 937 * the same type (level, edge, polarity). So both flag
737 * fields must have IRQF_SHARED set and the bits which 938 * fields must have IRQF_SHARED set and the bits which
738 * set the trigger type must match. 939 * set the trigger type must match. Also all must
940 * agree on ONESHOT.
739 */ 941 */
740 if (!((old->flags & new->flags) & IRQF_SHARED) || 942 if (!((old->flags & new->flags) & IRQF_SHARED) ||
741 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { 943 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
944 ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
742 old_name = old->name; 945 old_name = old->name;
743 goto mismatch; 946 goto mismatch;
744 } 947 }
745 948
746#if defined(CONFIG_IRQ_PER_CPU)
747 /* All handlers must agree on per-cpuness */ 949 /* All handlers must agree on per-cpuness */
748 if ((old->flags & IRQF_PERCPU) != 950 if ((old->flags & IRQF_PERCPU) !=
749 (new->flags & IRQF_PERCPU)) 951 (new->flags & IRQF_PERCPU))
750 goto mismatch; 952 goto mismatch;
751#endif
752 953
753 /* add new interrupt at end of irq queue */ 954 /* add new interrupt at end of irq queue */
754 do { 955 do {
956 thread_mask |= old->thread_mask;
755 old_ptr = &old->next; 957 old_ptr = &old->next;
756 old = *old_ptr; 958 old = *old_ptr;
757 } while (old); 959 } while (old);
758 shared = 1; 960 shared = 1;
759 } 961 }
760 962
963 /*
964 * Setup the thread mask for this irqaction. Unlikely to have
965 * 32 resp 64 irqs sharing one line, but who knows.
966 */
967 if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) {
968 ret = -EBUSY;
969 goto out_mask;
970 }
971 new->thread_mask = 1 << ffz(thread_mask);
972
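
A worked example of the bit allocation (illustrative numbers): if three actions already share the line with thread_mask values 0x1, 0x2 and 0x8, the loop above accumulates thread_mask == 0xb; ffz(0xb) == 2, so the new action gets 1 << 2 == 0x4, the lowest free bit. Once all BITS_PER_LONG bits are taken, thread_mask == ~0UL and the oneshot request fails with -EBUSY as coded above.
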
761 if (!shared) { 973 if (!shared) {
762 irq_chip_set_defaults(desc->irq_data.chip); 974 irq_chip_set_defaults(desc->irq_data.chip);
763 975
@@ -769,42 +981,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
769 new->flags & IRQF_TRIGGER_MASK); 981 new->flags & IRQF_TRIGGER_MASK);
770 982
771 if (ret) 983 if (ret)
772 goto out_thread; 984 goto out_mask;
773 } else 985 }
774 compat_irq_chip_set_default_handler(desc);
775#if defined(CONFIG_IRQ_PER_CPU)
776 if (new->flags & IRQF_PERCPU)
777 desc->status |= IRQ_PER_CPU;
778#endif
779 986
780 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | 987 desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
781 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 988 IRQS_INPROGRESS | IRQS_ONESHOT | \
989 IRQS_WAITING);
990
991 if (new->flags & IRQF_PERCPU) {
992 irqd_set(&desc->irq_data, IRQD_PER_CPU);
993 irq_settings_set_per_cpu(desc);
994 }
782 995
783 if (new->flags & IRQF_ONESHOT) 996 if (new->flags & IRQF_ONESHOT)
784 desc->status |= IRQ_ONESHOT; 997 desc->istate |= IRQS_ONESHOT;
785 998
786 if (!(desc->status & IRQ_NOAUTOEN)) { 999 if (irq_settings_can_autoenable(desc))
787 desc->depth = 0; 1000 irq_startup(desc);
788 desc->status &= ~IRQ_DISABLED; 1001 else
789 desc->irq_data.chip->irq_startup(&desc->irq_data);
790 } else
791 /* Undo nested disables: */ 1002 /* Undo nested disables: */
792 desc->depth = 1; 1003 desc->depth = 1;
793 1004
794 /* Exclude IRQ from balancing if requested */ 1005 /* Exclude IRQ from balancing if requested */
795 if (new->flags & IRQF_NOBALANCING) 1006 if (new->flags & IRQF_NOBALANCING) {
796 desc->status |= IRQ_NO_BALANCING; 1007 irq_settings_set_no_balancing(desc);
1008 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
1009 }
797 1010
798 /* Set default affinity mask once everything is setup */ 1011 /* Set default affinity mask once everything is setup */
799 setup_affinity(irq, desc); 1012 setup_affinity(irq, desc, mask);
800 1013
801 } else if ((new->flags & IRQF_TRIGGER_MASK) 1014 } else if (new->flags & IRQF_TRIGGER_MASK) {
802 && (new->flags & IRQF_TRIGGER_MASK) 1015 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
803 != (desc->status & IRQ_TYPE_SENSE_MASK)) { 1016 unsigned int omsk = irq_settings_get_trigger_mask(desc);
804 /* hope the handler works with the actual trigger mode... */ 1017
805 pr_warning("IRQ %d uses trigger mode %d; requested %d\n", 1018 if (nmsk != omsk)
806 irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), 1019 /* hope the handler works with current trigger mode */
807 (int)(new->flags & IRQF_TRIGGER_MASK)); 1020 pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
1021 irq, nmsk, omsk);
808 } 1022 }
809 1023
810 new->irq = irq; 1024 new->irq = irq;
@@ -818,8 +1032,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
818 * Check whether we disabled the irq via the spurious handler 1032 * Check whether we disabled the irq via the spurious handler
819 * before. Reenable it and give it another chance. 1033 * before. Reenable it and give it another chance.
820 */ 1034 */
821 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { 1035 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
822 desc->status &= ~IRQ_SPURIOUS_DISABLED; 1036 desc->istate &= ~IRQS_SPURIOUS_DISABLED;
823 __enable_irq(desc, irq, false); 1037 __enable_irq(desc, irq, false);
824 } 1038 }
825 1039
@@ -849,6 +1063,9 @@ mismatch:
849#endif 1063#endif
850 ret = -EBUSY; 1064 ret = -EBUSY;
851 1065
1066out_mask:
1067 free_cpumask_var(mask);
1068
852out_thread: 1069out_thread:
853 raw_spin_unlock_irqrestore(&desc->lock, flags); 1070 raw_spin_unlock_irqrestore(&desc->lock, flags);
854 if (new->thread) { 1071 if (new->thread) {
@@ -871,9 +1088,14 @@ out_thread:
871 */ 1088 */
872int setup_irq(unsigned int irq, struct irqaction *act) 1089int setup_irq(unsigned int irq, struct irqaction *act)
873{ 1090{
1091 int retval;
874 struct irq_desc *desc = irq_to_desc(irq); 1092 struct irq_desc *desc = irq_to_desc(irq);
875 1093
876 return __setup_irq(irq, desc, act); 1094 chip_bus_lock(desc);
1095 retval = __setup_irq(irq, desc, act);
1096 chip_bus_sync_unlock(desc);
1097
1098 return retval;
877} 1099}
878EXPORT_SYMBOL_GPL(setup_irq); 1100EXPORT_SYMBOL_GPL(setup_irq);
879 1101
@@ -924,13 +1146,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
924#endif 1146#endif
925 1147
926 /* If this was the last handler, shut down the IRQ line: */ 1148 /* If this was the last handler, shut down the IRQ line: */
927 if (!desc->action) { 1149 if (!desc->action)
928 desc->status |= IRQ_DISABLED; 1150 irq_shutdown(desc);
929 if (desc->irq_data.chip->irq_shutdown)
930 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
931 else
932 desc->irq_data.chip->irq_disable(&desc->irq_data);
933 }
934 1151
935#ifdef CONFIG_SMP 1152#ifdef CONFIG_SMP
936 /* make sure affinity_hint is cleaned up */ 1153 /* make sure affinity_hint is cleaned up */
@@ -1004,6 +1221,11 @@ void free_irq(unsigned int irq, void *dev_id)
1004 if (!desc) 1221 if (!desc)
1005 return; 1222 return;
1006 1223
1224#ifdef CONFIG_SMP
1225 if (WARN_ON(desc->affinity_notify))
1226 desc->affinity_notify = NULL;
1227#endif
1228
1007 chip_bus_lock(desc); 1229 chip_bus_lock(desc);
1008 kfree(__free_irq(irq, dev_id)); 1230 kfree(__free_irq(irq, dev_id));
1009 chip_bus_sync_unlock(desc); 1231 chip_bus_sync_unlock(desc);
@@ -1074,7 +1296,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1074 if (!desc) 1296 if (!desc)
1075 return -EINVAL; 1297 return -EINVAL;
1076 1298
1077 if (desc->status & IRQ_NOREQUEST) 1299 if (!irq_settings_can_request(desc))
1078 return -EINVAL; 1300 return -EINVAL;
1079 1301
1080 if (!handler) { 1302 if (!handler) {
@@ -1149,7 +1371,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1149 if (!desc) 1371 if (!desc)
1150 return -EINVAL; 1372 return -EINVAL;
1151 1373
1152 if (desc->status & IRQ_NESTED_THREAD) { 1374 if (irq_settings_is_nested_thread(desc)) {
1153 ret = request_threaded_irq(irq, NULL, handler, 1375 ret = request_threaded_irq(irq, NULL, handler,
1154 flags, name, dev_id); 1376 flags, name, dev_id);
1155 return !ret ? IRQC_IS_NESTED : ret; 1377 return !ret ? IRQC_IS_NESTED : ret;
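
A hedged consumer sketch for the nested-thread path (hypothetical GPIO-expander client; my_* names invented): the same call works whether the line is a real hardirq or demultiplexed by a nested irq thread.

	static int my_request_event_irq(struct my_dev *md)
	{
		int ret;

		ret = request_any_context_irq(gpio_to_irq(md->gpio),
					      my_event_isr,
					      IRQF_TRIGGER_FALLING,
					      "my-event", md);
		if (ret < 0)
			return ret;
		/* on success ret is IRQC_IS_HARDIRQ or IRQC_IS_NESTED */
		return 0;
	}
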
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 441fd629ff0..ec4806d4778 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,23 +4,23 @@
4 4
5#include "internals.h" 5#include "internals.h"
6 6
7void move_masked_irq(int irq) 7void irq_move_masked_irq(struct irq_data *idata)
8{ 8{
9 struct irq_desc *desc = irq_to_desc(irq); 9 struct irq_desc *desc = irq_data_to_desc(idata);
10 struct irq_chip *chip = desc->irq_data.chip; 10 struct irq_chip *chip = idata->chip;
11 11
12 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 12 if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
13 return; 13 return;
14 14
15 /* 15 /*
16 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. 16 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
17 */ 17 */
18 if (CHECK_IRQ_PER_CPU(desc->status)) { 18 if (!irqd_can_balance(&desc->irq_data)) {
19 WARN_ON(1); 19 WARN_ON(1);
20 return; 20 return;
21 } 21 }
22 22
23 desc->status &= ~IRQ_MOVE_PENDING; 23 irqd_clr_move_pending(&desc->irq_data);
24 24
25 if (unlikely(cpumask_empty(desc->pending_mask))) 25 if (unlikely(cpumask_empty(desc->pending_mask)))
26 return; 26 return;
@@ -53,15 +53,20 @@ void move_masked_irq(int irq)
53 cpumask_clear(desc->pending_mask); 53 cpumask_clear(desc->pending_mask);
54} 54}
55 55
56void move_native_irq(int irq) 56void move_masked_irq(int irq)
57{
58 irq_move_masked_irq(irq_get_irq_data(irq));
59}
60
61void irq_move_irq(struct irq_data *idata)
57{ 62{
58 struct irq_desc *desc = irq_to_desc(irq); 63 struct irq_desc *desc = irq_data_to_desc(idata);
59 bool masked; 64 bool masked;
60 65
61 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 66 if (likely(!irqd_is_setaffinity_pending(idata)))
62 return; 67 return;
63 68
64 if (unlikely(desc->status & IRQ_DISABLED)) 69 if (unlikely(desc->istate & IRQS_DISABLED))
65 return; 70 return;
66 71
67 /* 72 /*
@@ -69,10 +74,15 @@ void move_native_irq(int irq)
69 * threaded interrupt with ONESHOT set, we can end up with an 74 * threaded interrupt with ONESHOT set, we can end up with an
70 * interrupt storm. 75 * interrupt storm.
71 */ 76 */
72 masked = desc->status & IRQ_MASKED; 77 masked = desc->istate & IRQS_MASKED;
73 if (!masked) 78 if (!masked)
74 desc->irq_data.chip->irq_mask(&desc->irq_data); 79 idata->chip->irq_mask(idata);
75 move_masked_irq(irq); 80 irq_move_masked_irq(idata);
76 if (!masked) 81 if (!masked)
77 desc->irq_data.chip->irq_unmask(&desc->irq_data); 82 idata->chip->irq_unmask(idata);
83}
84
85void move_native_irq(int irq)
86{
87 irq_move_irq(irq_get_irq_data(irq));
78} 88}
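
A sketch of a flow-handler-side caller of the new irq_data based API (hypothetical chip callbacks): an ack path that has already masked the line can call irq_move_masked_irq() directly and skip the mask/unmask dance that irq_move_irq() performs.

	static void my_ack_level(struct irq_data *data)
	{
		my_chip_mask_ack(data);		/* hypothetical mask + ack */
		irq_move_masked_irq(data);	/* safe: line is masked    */
		my_chip_unmask(data);
	}
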
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 0d4005d85b0..f76fc00c987 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -18,7 +18,7 @@
18 * During system-wide suspend or hibernation device drivers need to be prevented 18 * During system-wide suspend or hibernation device drivers need to be prevented
19 * from receiving interrupts and this function is provided for this purpose. 19 * from receiving interrupts and this function is provided for this purpose.
20 * It marks all interrupt lines in use, except for the timer ones, as disabled 20 * It marks all interrupt lines in use, except for the timer ones, as disabled
21 * and sets the IRQ_SUSPENDED flag for each of them. 21 * and sets the IRQS_SUSPENDED flag for each of them.
22 */ 22 */
23void suspend_device_irqs(void) 23void suspend_device_irqs(void)
24{ 24{
@@ -34,7 +34,7 @@ void suspend_device_irqs(void)
34 } 34 }
35 35
36 for_each_irq_desc(irq, desc) 36 for_each_irq_desc(irq, desc)
37 if (desc->status & IRQ_SUSPENDED) 37 if (desc->istate & IRQS_SUSPENDED)
38 synchronize_irq(irq); 38 synchronize_irq(irq);
39} 39}
40EXPORT_SYMBOL_GPL(suspend_device_irqs); 40EXPORT_SYMBOL_GPL(suspend_device_irqs);
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs);
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() 43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 * 44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that 45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQ_SUSPENDED flag set. 46 * have the IRQS_SUSPENDED flag set.
47 */ 47 */
48void resume_device_irqs(void) 48void resume_device_irqs(void)
49{ 49{
@@ -53,9 +53,6 @@ void resume_device_irqs(void)
53 for_each_irq_desc(irq, desc) { 53 for_each_irq_desc(irq, desc) {
54 unsigned long flags; 54 unsigned long flags;
55 55
56 if (!(desc->status & IRQ_SUSPENDED))
57 continue;
58
59 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true); 57 __enable_irq(desc, irq, true);
61 raw_spin_unlock_irqrestore(&desc->lock, flags); 58 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -71,9 +68,24 @@ int check_wakeup_irqs(void)
71 struct irq_desc *desc; 68 struct irq_desc *desc;
72 int irq; 69 int irq;
73 70
74 for_each_irq_desc(irq, desc) 71 for_each_irq_desc(irq, desc) {
75 if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) 72 if (irqd_is_wakeup_set(&desc->irq_data)) {
76 return -EBUSY; 73 if (desc->istate & IRQS_PENDING)
74 return -EBUSY;
75 continue;
76 }
77 /*
78 * Check the non wakeup interrupts whether they need
79 * to be masked before finally going into suspend
80 * state. That's for hardware which has no wakeup
81 * source configuration facility. The chip
82 * implementation indicates that with
83 * IRQCHIP_MASK_ON_SUSPEND.
84 */
85 if (desc->istate & IRQS_SUSPENDED &&
86 irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
87 mask_irq(desc);
88 }
77 89
78 return 0; 90 return 0;
79} 91}
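
A chip-side sketch for the new suspend handling (hypothetical chip): hardware with no wakeup-source configuration register opts in via IRQCHIP_MASK_ON_SUSPEND, and check_wakeup_irqs() then masks every suspended non-wake line before the system goes down.

	static struct irq_chip my_pm_chip = {
		.name		= "my-pm-chip",
		.irq_mask	= my_chip_mask,
		.irq_unmask	= my_chip_unmask,
		.flags		= IRQCHIP_MASK_ON_SUSPEND,
	};
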
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6c8a2a9f8a7..4cc2e5ed0be 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -11,6 +11,7 @@
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h>
14 15
15#include "internals.h" 16#include "internals.h"
16 17
@@ -24,7 +25,7 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
24 const struct cpumask *mask = desc->irq_data.affinity; 25 const struct cpumask *mask = desc->irq_data.affinity;
25 26
26#ifdef CONFIG_GENERIC_PENDING_IRQ 27#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 28 if (irqd_is_setaffinity_pending(&desc->irq_data))
28 mask = desc->pending_mask; 29 mask = desc->pending_mask;
29#endif 30#endif
30 seq_cpumask(m, mask); 31 seq_cpumask(m, mask);
@@ -65,8 +66,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
65 cpumask_var_t new_value; 66 cpumask_var_t new_value;
66 int err; 67 int err;
67 68
68 if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || 69 if (!irq_can_set_affinity(irq) || no_irq_affinity)
69 irq_balancing_disabled(irq))
70 return -EIO; 70 return -EIO;
71 71
72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
@@ -89,7 +89,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
89 if (!cpumask_intersects(new_value, cpu_online_mask)) { 89 if (!cpumask_intersects(new_value, cpu_online_mask)) {
90 /* Special case for empty set - allow the architecture 90 /* Special case for empty set - allow the architecture
91 code to set default SMP affinity. */ 91 code to set default SMP affinity. */
92 err = irq_select_affinity_usr(irq) ? -EINVAL : count; 92 err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count;
93 } else { 93 } else {
94 irq_set_affinity(irq, new_value); 94 irq_set_affinity(irq, new_value);
95 err = count; 95 err = count;
@@ -357,3 +357,65 @@ void init_irq_proc(void)
357 } 357 }
358} 358}
359 359
360#ifdef CONFIG_GENERIC_IRQ_SHOW
361
362int __weak arch_show_interrupts(struct seq_file *p, int prec)
363{
364 return 0;
365}
366
367int show_interrupts(struct seq_file *p, void *v)
368{
369 static int prec;
370
371 unsigned long flags, any_count = 0;
372 int i = *(loff_t *) v, j;
373 struct irqaction *action;
374 struct irq_desc *desc;
375
376 if (i > nr_irqs)
377 return 0;
378
379 if (i == nr_irqs)
380 return arch_show_interrupts(p, prec);
381
382 /* print header and calculate the width of the first column */
383 if (i == 0) {
384 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
385 j *= 10;
386
387 seq_printf(p, "%*s", prec + 8, "");
388 for_each_online_cpu(j)
389 seq_printf(p, "CPU%-8d", j);
390 seq_putc(p, '\n');
391 }
392
393 desc = irq_to_desc(i);
394 if (!desc)
395 return 0;
396
397 raw_spin_lock_irqsave(&desc->lock, flags);
398 for_each_online_cpu(j)
399 any_count |= kstat_irqs_cpu(i, j);
400 action = desc->action;
401 if (!action && !any_count)
402 goto out;
403
404 seq_printf(p, "%*d: ", prec, i);
405 for_each_online_cpu(j)
406 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
407 seq_printf(p, " %8s", desc->irq_data.chip->name);
408 seq_printf(p, "-%-8s", desc->name);
409
410 if (action) {
411 seq_printf(p, " %s", action->name);
412 while ((action = action->next) != NULL)
413 seq_printf(p, ", %s", action->name);
414 }
415
416 seq_putc(p, '\n');
417out:
418 raw_spin_unlock_irqrestore(&desc->lock, flags);
419 return 0;
420}
421#endif
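
A hedged sketch of an architecture overriding the new __weak hook (the MIS counter is made up): append one summary row after the numbered lines in /proc/interrupts, using the same prec column width.

	static DEFINE_PER_CPU(unsigned int, my_misrouted_count);

	int arch_show_interrupts(struct seq_file *p, int prec)
	{
		int j;

		seq_printf(p, "%*s: ", prec, "MIS");
		for_each_online_cpu(j)
			seq_printf(p, "%10u ", per_cpu(my_misrouted_count, j));
		seq_printf(p, "  Misrouted interrupts\n");
		return 0;
	}
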
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index dc49358b73f..ad683a99b1e 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -55,20 +55,19 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
55 */ 55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq) 56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{ 57{
58 unsigned int status = desc->status;
59
60 /*
61 * Make sure the interrupt is enabled, before resending it:
62 */
63 desc->irq_data.chip->irq_enable(&desc->irq_data);
64
65 /* 58 /*
66 * We do not resend level type interrupts. Level type 59 * We do not resend level type interrupts. Level type
67 * interrupts are resent by hardware when they are still 60 * interrupts are resent by hardware when they are still
68 * active. 61 * active.
69 */ 62 */
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 63 if (irq_settings_is_level(desc))
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 64 return;
65 if (desc->istate & IRQS_REPLAY)
66 return;
67 if (desc->istate & IRQS_PENDING) {
68 irq_compat_clr_pending(desc);
69 desc->istate &= ~IRQS_PENDING;
70 desc->istate |= IRQS_REPLAY;
72 71
73 if (!desc->irq_data.chip->irq_retrigger || 72 if (!desc->irq_data.chip->irq_retrigger ||
74 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { 73 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
new file mode 100644
index 00000000000..0227ad35827
--- /dev/null
+++ b/kernel/irq/settings.h
@@ -0,0 +1,138 @@
1/*
2 * Internal header to deal with irq_desc->status which will be renamed
3 * to irq_desc->settings.
4 */
5enum {
6 _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS,
7 _IRQ_PER_CPU = IRQ_PER_CPU,
8 _IRQ_LEVEL = IRQ_LEVEL,
9 _IRQ_NOPROBE = IRQ_NOPROBE,
10 _IRQ_NOREQUEST = IRQ_NOREQUEST,
11 _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
12 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
13 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
14 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
15 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
16};
17
18#define IRQ_INPROGRESS GOT_YOU_MORON
19#define IRQ_REPLAY GOT_YOU_MORON
20#define IRQ_WAITING GOT_YOU_MORON
21#define IRQ_DISABLED GOT_YOU_MORON
22#define IRQ_PENDING GOT_YOU_MORON
23#define IRQ_MASKED GOT_YOU_MORON
24#define IRQ_WAKEUP GOT_YOU_MORON
25#define IRQ_MOVE_PENDING GOT_YOU_MORON
26#define IRQ_PER_CPU GOT_YOU_MORON
27#define IRQ_NO_BALANCING GOT_YOU_MORON
28#define IRQ_AFFINITY_SET GOT_YOU_MORON
29#define IRQ_LEVEL GOT_YOU_MORON
30#define IRQ_NOPROBE GOT_YOU_MORON
31#define IRQ_NOREQUEST GOT_YOU_MORON
32#define IRQ_NOAUTOEN GOT_YOU_MORON
33#define IRQ_NESTED_THREAD GOT_YOU_MORON
34#undef IRQF_MODIFY_MASK
35#define IRQF_MODIFY_MASK GOT_YOU_MORON
36
37static inline void
38irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
39{
40 desc->status &= ~(clr & _IRQF_MODIFY_MASK);
41 desc->status |= (set & _IRQF_MODIFY_MASK);
42}
43
44static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
45{
46 return desc->status & _IRQ_PER_CPU;
47}
48
49static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
50{
51 desc->status |= _IRQ_PER_CPU;
52}
53
54static inline void irq_settings_set_no_balancing(struct irq_desc *desc)
55{
56 desc->status |= _IRQ_NO_BALANCING;
57}
58
59static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
60{
61 return desc->status & _IRQ_NO_BALANCING;
62}
63
64static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
65{
66 return desc->status & IRQ_TYPE_SENSE_MASK;
67}
68
69static inline void
70irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask)
71{
72 desc->status &= ~IRQ_TYPE_SENSE_MASK;
73 desc->status |= mask & IRQ_TYPE_SENSE_MASK;
74}
75
76static inline bool irq_settings_is_level(struct irq_desc *desc)
77{
78 return desc->status & _IRQ_LEVEL;
79}
80
81static inline void irq_settings_clr_level(struct irq_desc *desc)
82{
83 desc->status &= ~_IRQ_LEVEL;
84}
85
86static inline void irq_settings_set_level(struct irq_desc *desc)
87{
88 desc->status |= _IRQ_LEVEL;
89}
90
91static inline bool irq_settings_can_request(struct irq_desc *desc)
92{
93 return !(desc->status & _IRQ_NOREQUEST);
94}
95
96static inline void irq_settings_clr_norequest(struct irq_desc *desc)
97{
98 desc->status &= ~_IRQ_NOREQUEST;
99}
100
101static inline void irq_settings_set_norequest(struct irq_desc *desc)
102{
103 desc->status |= _IRQ_NOREQUEST;
104}
105
106static inline bool irq_settings_can_probe(struct irq_desc *desc)
107{
108 return !(desc->status & _IRQ_NOPROBE);
109}
110
111static inline void irq_settings_clr_noprobe(struct irq_desc *desc)
112{
113 desc->status &= ~_IRQ_NOPROBE;
114}
115
116static inline void irq_settings_set_noprobe(struct irq_desc *desc)
117{
118 desc->status |= _IRQ_NOPROBE;
119}
120
121static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
122{
123 return desc->status & _IRQ_MOVE_PCNTXT;
124}
125
126static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
127{
128 return !(desc->status & _IRQ_NOAUTOEN);
129}
130
131static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
132{
133 return desc->status & _IRQ_NESTED_THREAD;
134}
135
136/* Nothing should touch desc->status from now on */
137#undef status
138#define status USE_THE_PROPER_WRAPPERS_YOU_MORON
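
How the poisoning catches stragglers (illustration): after the redefinitions above, any leftover direct access such as

	desc->status |= IRQ_LEVEL;

preprocesses to

	desc->USE_THE_PROPER_WRAPPERS_YOU_MORON |= GOT_YOU_MORON;

and neither identifier exists anywhere, so the offending file fails to compile instead of silently corrupting the state words.
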
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 3089d3b9d5f..dd586ebf9c8 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -21,70 +21,94 @@ static int irqfixup __read_mostly;
21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) 21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
22static void poll_spurious_irqs(unsigned long dummy); 22static void poll_spurious_irqs(unsigned long dummy);
23static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); 23static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
24static int irq_poll_cpu;
25static atomic_t irq_poll_active;
26
27/*
28 * We wait here for a poller to finish.
29 *
30 * If the poll runs on this CPU, then we yell loudly and return
31 * false. That will leave the interrupt line disabled in the worst
32 * case, but it should never happen.
33 *
34 * We wait until the poller is done and then recheck disabled and
35 * action (about to be disabled). Only if it's still active, we return
36 * true and let the handler run.
37 */
38bool irq_wait_for_poll(struct irq_desc *desc)
39{
40 if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
41 "irq poll in progress on cpu %d for irq %d\n",
42 smp_processor_id(), desc->irq_data.irq))
43 return false;
44
45#ifdef CONFIG_SMP
46 do {
47 raw_spin_unlock(&desc->lock);
48 while (desc->istate & IRQS_INPROGRESS)
49 cpu_relax();
50 raw_spin_lock(&desc->lock);
51 } while (desc->istate & IRQS_INPROGRESS);
52 /* Might have been disabled in meantime */
53 return !(desc->istate & IRQS_DISABLED) && desc->action;
54#else
55 return false;
56#endif
57}
58
24 59
25/* 60/*
26 * Recovery handler for misrouted interrupts. 61 * Recovery handler for misrouted interrupts.
27 */ 62 */
28static int try_one_irq(int irq, struct irq_desc *desc) 63static int try_one_irq(int irq, struct irq_desc *desc, bool force)
29{ 64{
65 irqreturn_t ret = IRQ_NONE;
30 struct irqaction *action; 66 struct irqaction *action;
31 int ok = 0, work = 0;
32 67
33 raw_spin_lock(&desc->lock); 68 raw_spin_lock(&desc->lock);
34 /* Already running on another processor */
35 if (desc->status & IRQ_INPROGRESS) {
36 /*
37 * Already running: If it is shared get the other
38 * CPU to go looking for our mystery interrupt too
39 */
40 if (desc->action && (desc->action->flags & IRQF_SHARED))
41 desc->status |= IRQ_PENDING;
42 raw_spin_unlock(&desc->lock);
43 return ok;
44 }
45 /* Honour the normal IRQ locking */
46 desc->status |= IRQ_INPROGRESS;
47 action = desc->action;
48 raw_spin_unlock(&desc->lock);
49 69
50 while (action) { 70 /* PER_CPU and nested thread interrupts are never polled */
51 /* Only shared IRQ handlers are safe to call */ 71 if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc))
52 if (action->flags & IRQF_SHARED) { 72 goto out;
53 if (action->handler(irq, action->dev_id) ==
54 IRQ_HANDLED)
55 ok = 1;
56 }
57 action = action->next;
58 }
59 local_irq_disable();
60 /* Now clean up the flags */
61 raw_spin_lock(&desc->lock);
62 action = desc->action;
63 73
64 /* 74 /*
65 * While we were looking for a fixup someone queued a real 75 * Do not poll disabled interrupts unless the spurious
 66 * IRQ clashing with our walk:	 76 * disabled poller asks explicitly.
67 */ 77 */
68 while ((desc->status & IRQ_PENDING) && action) { 78 if ((desc->istate & IRQS_DISABLED) && !force)
79 goto out;
80
81 /*
82 * All handlers must agree on IRQF_SHARED, so we test just the
83 * first. Check for action->next as well.
84 */
85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || !action->next)
88 goto out;
89
90 /* Already running on another processor */
91 if (desc->istate & IRQS_INPROGRESS) {
69 /* 92 /*
70 * Perform real IRQ processing for the IRQ we deferred 93 * Already running: If it is shared get the other
94 * CPU to go looking for our mystery interrupt too
71 */ 95 */
72 work = 1; 96 irq_compat_set_pending(desc);
73 raw_spin_unlock(&desc->lock); 97 desc->istate |= IRQS_PENDING;
74 handle_IRQ_event(irq, action); 98 goto out;
75 raw_spin_lock(&desc->lock);
76 desc->status &= ~IRQ_PENDING;
77 } 99 }
78 desc->status &= ~IRQ_INPROGRESS;
79 /*
80 * If we did actual work for the real IRQ line we must let the
81 * IRQ controller clean up too
82 */
83 if (work)
84 irq_end(irq, desc);
85 raw_spin_unlock(&desc->lock);
86 100
87 return ok; 101 /* Mark it poll in progress */
102 desc->istate |= IRQS_POLL_INPROGRESS;
103 do {
104 if (handle_irq_event(desc) == IRQ_HANDLED)
105 ret = IRQ_HANDLED;
106 action = desc->action;
107 } while ((desc->istate & IRQS_PENDING) && action);
108 desc->istate &= ~IRQS_POLL_INPROGRESS;
109out:
110 raw_spin_unlock(&desc->lock);
111 return ret == IRQ_HANDLED;
88} 112}
89 113
90static int misrouted_irq(int irq) 114static int misrouted_irq(int irq)
@@ -92,6 +116,11 @@ static int misrouted_irq(int irq)
92 struct irq_desc *desc; 116 struct irq_desc *desc;
93 int i, ok = 0; 117 int i, ok = 0;
94 118
119 if (atomic_inc_return(&irq_poll_active) == 1)
120 goto out;
121
122 irq_poll_cpu = smp_processor_id();
123
95 for_each_irq_desc(i, desc) { 124 for_each_irq_desc(i, desc) {
96 if (!i) 125 if (!i)
97 continue; 126 continue;
@@ -99,9 +128,11 @@ static int misrouted_irq(int irq)
99 if (i == irq) /* Already tried */ 128 if (i == irq) /* Already tried */
100 continue; 129 continue;
101 130
102 if (try_one_irq(i, desc)) 131 if (try_one_irq(i, desc, false))
103 ok = 1; 132 ok = 1;
104 } 133 }
134out:
135 atomic_dec(&irq_poll_active);
105 /* So the caller can adjust the irq error counts */ 136 /* So the caller can adjust the irq error counts */
106 return ok; 137 return ok;
107} 138}
@@ -111,23 +142,28 @@ static void poll_spurious_irqs(unsigned long dummy)
 	struct irq_desc *desc;
 	int i;
 
+	if (atomic_inc_return(&irq_poll_active) != 1)
+		goto out;
+	irq_poll_cpu = smp_processor_id();
+
 	for_each_irq_desc(i, desc) {
-		unsigned int status;
+		unsigned int state;
 
 		if (!i)
 			continue;
 
 		/* Racy but it doesn't matter */
-		status = desc->status;
+		state = desc->istate;
 		barrier();
-		if (!(status & IRQ_SPURIOUS_DISABLED))
+		if (!(state & IRQS_SPURIOUS_DISABLED))
 			continue;
 
 		local_irq_disable();
-		try_one_irq(i, desc);
+		try_one_irq(i, desc, true);
 		local_irq_enable();
 	}
-
+out:
+	atomic_dec(&irq_poll_active);
 	mod_timer(&poll_spurious_irq_timer,
 		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
 }
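
/*
 * Editor's sketch (not part of the patch): the irq_poll_active counter
 * used above is a "single poller at a time" guard. A minimal standalone
 * rendering of the idiom, assuming an atomic_t initialized to zero:
 */
static atomic_t poll_active = ATOMIC_INIT(0);

static void poll_once(void)
{
	if (atomic_inc_return(&poll_active) != 1)
		goto out;	/* someone else is already polling; back off */

	/* ... single-threaded polling work goes here ... */
out:
	atomic_dec(&poll_active);	/* always undo our own increment */
}
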
@@ -139,15 +175,13 @@ static void poll_spurious_irqs(unsigned long dummy)
  *
  * (The other 100-of-100,000 interrupts may have been a correctly
  * functioning device sharing an IRQ with the failing one)
- *
- * Called under desc->lock
  */
-
 static void
 __report_bad_irq(unsigned int irq, struct irq_desc *desc,
 		 irqreturn_t action_ret)
 {
 	struct irqaction *action;
+	unsigned long flags;
 
 	if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
 		printk(KERN_ERR "irq event %d: bogus return value %x\n",
@@ -159,6 +193,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
 	dump_stack();
 	printk(KERN_ERR "handlers:\n");
 
+	/*
+	 * We need to take desc->lock here. note_interrupt() is called
+	 * w/o desc->lock held, but IRQ_PROGRESS set. We might race
+	 * with something else removing an action. It's ok to take
+	 * desc->lock here. See synchronize_irq().
+	 */
+	raw_spin_lock_irqsave(&desc->lock, flags);
 	action = desc->action;
 	while (action) {
 		printk(KERN_ERR "[<%p>]", action->handler);
@@ -167,6 +208,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
 		printk("\n");
 		action = action->next;
 	}
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
 
 static void
@@ -218,6 +260,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
 void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		    irqreturn_t action_ret)
 {
+	if (desc->istate & IRQS_POLL_INPROGRESS)
+		return;
+
 	if (unlikely(action_ret != IRQ_HANDLED)) {
 		/*
 		 * If we are seeing only the odd spurious IRQ caused by
@@ -254,9 +299,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		 * Now kill the IRQ
 		 */
 		printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
-		desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
+		desc->istate |= IRQS_SPURIOUS_DISABLED;
 		desc->depth++;
-		desc->irq_data.chip->irq_disable(&desc->irq_data);
+		irq_disable(desc);
 
 		mod_timer(&poll_spurious_irq_timer,
 			  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 656222fcf76..ed253aa24ba 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,13 +38,96 @@
 
 #include <asm/irq_regs.h>
 
+struct remote_function_call {
+	struct task_struct *p;
+	int (*func)(void *info);
+	void *info;
+	int ret;
+};
+
+static void remote_function(void *data)
+{
+	struct remote_function_call *tfc = data;
+	struct task_struct *p = tfc->p;
+
+	if (p) {
+		tfc->ret = -EAGAIN;
+		if (task_cpu(p) != smp_processor_id() || !task_curr(p))
+			return;
+	}
+
+	tfc->ret = tfc->func(tfc->info);
+}
+
+/**
+ * task_function_call - call a function on the cpu on which a task runs
+ * @p:		the task to evaluate
+ * @func:	the function to be called
+ * @info:	the function call argument
+ *
+ * Calls the function @func on the CPU on which the task is currently
+ * running; if that happens to be the local CPU, the function is called
+ * directly.
+ *
+ * returns: @func return value, or
+ *	    -ESRCH  - when the process isn't running
+ *	    -EAGAIN - when the process moved away
+ */
+static int
+task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+{
+	struct remote_function_call data = {
+		.p = p,
+		.func = func,
+		.info = info,
+		.ret = -ESRCH, /* No such (running) process */
+	};
+
+	if (task_curr(p))
+		smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+
+	return data.ret;
+}
+
+/**
+ * cpu_function_call - call a function on a given cpu
+ * @cpu:	the cpu on which to run the function
+ * @func:	the function to be called
+ * @info:	the function call argument
+ *
+ * Calls the function @func on the remote cpu.
+ *
+ * returns: @func return value or -ENXIO when the cpu is offline
+ */
+static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+{
+	struct remote_function_call data = {
+		.p = NULL,
+		.func = func,
+		.info = info,
+		.ret = -ENXIO, /* No such CPU */
+	};
+
+	smp_call_function_single(cpu, remote_function, &data, 1);
+
+	return data.ret;
+}
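
/*
 * Editor's sketch (not part of the patch): how the callers below use
 * task_function_call(). The callback runs on the task's CPU; -EAGAIN
 * means the task migrated between the check and the IPI, -ESRCH that
 * it is not running at all. my_func/my_example are hypothetical names.
 */
static int my_func(void *info)
{
	/* runs remotely, on the CPU where the task is current */
	return 0;
}

static int my_example(struct task_struct *p, void *info)
{
	int ret;

	do {
		ret = task_function_call(p, my_func, info);
	} while (ret == -EAGAIN);	/* task moved: try its new CPU */

	return ret;	/* 0 or -ESRCH; callers fall back to ctx->lock */
}
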
+
+#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
+		       PERF_FLAG_FD_OUTPUT  |\
+		       PERF_FLAG_PID_CGROUP)
+
 enum event_type_t {
 	EVENT_FLEXIBLE = 0x1,
 	EVENT_PINNED = 0x2,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
-atomic_t perf_task_events __read_mostly;
+/*
+ * perf_sched_events : >0 events exist
+ * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
+ */
+atomic_t perf_sched_events __read_mostly;
+static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
@@ -67,7 +150,24 @@ int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
 /*
  * max perf event sample rate
  */
-int sysctl_perf_event_sample_rate __read_mostly = 100000;
+#define DEFAULT_MAX_SAMPLE_RATE 100000
+int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+static int max_samples_per_tick __read_mostly =
+	DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+
+int perf_proc_update_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+
+	return 0;
+}
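
/*
 * Editor's note (not part of the patch): worked example of the
 * arithmetic above, assuming CONFIG_HZ=1000. With the default rate of
 * 100000 samples/sec:
 *
 *	max_samples_per_tick = DIV_ROUND_UP(100000, 1000) = 100
 *
 * Writing 25000 to the sysctl re-runs the handler and yields
 * DIV_ROUND_UP(25000, 1000) = 25 samples allowed per tick.
 */
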
71 171
72static atomic64_t perf_event_id; 172static atomic64_t perf_event_id;
73 173
@@ -75,7 +175,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
75 enum event_type_t event_type); 175 enum event_type_t event_type);
76 176
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type);
+			     enum event_type_t event_type,
+			     struct task_struct *task);
+
+static void update_context_time(struct perf_event_context *ctx);
+static u64 perf_event_time(struct perf_event *event);
79 183
80void __weak perf_event_print_debug(void) { } 184void __weak perf_event_print_debug(void) { }
81 185
@@ -89,6 +193,360 @@ static inline u64 perf_clock(void)
89 return local_clock(); 193 return local_clock();
90} 194}
91 195
196static inline struct perf_cpu_context *
197__get_cpu_context(struct perf_event_context *ctx)
198{
199 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
200}
201
202#ifdef CONFIG_CGROUP_PERF
203
204/*
205 * Must ensure cgroup is pinned (css_get) before calling
206 * this function. In other words, we cannot call this function
207 * if there is no cgroup event for the current CPU context.
208 */
209static inline struct perf_cgroup *
210perf_cgroup_from_task(struct task_struct *task)
211{
212 return container_of(task_subsys_state(task, perf_subsys_id),
213 struct perf_cgroup, css);
214}
215
216static inline bool
217perf_cgroup_match(struct perf_event *event)
218{
219 struct perf_event_context *ctx = event->ctx;
220 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
221
222 return !event->cgrp || event->cgrp == cpuctx->cgrp;
223}
224
225static inline void perf_get_cgroup(struct perf_event *event)
226{
227 css_get(&event->cgrp->css);
228}
229
230static inline void perf_put_cgroup(struct perf_event *event)
231{
232 css_put(&event->cgrp->css);
233}
234
235static inline void perf_detach_cgroup(struct perf_event *event)
236{
237 perf_put_cgroup(event);
238 event->cgrp = NULL;
239}
240
241static inline int is_cgroup_event(struct perf_event *event)
242{
243 return event->cgrp != NULL;
244}
245
246static inline u64 perf_cgroup_event_time(struct perf_event *event)
247{
248 struct perf_cgroup_info *t;
249
250 t = per_cpu_ptr(event->cgrp->info, event->cpu);
251 return t->time;
252}
253
254static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
255{
256 struct perf_cgroup_info *info;
257 u64 now;
258
259 now = perf_clock();
260
261 info = this_cpu_ptr(cgrp->info);
262
263 info->time += now - info->timestamp;
264 info->timestamp = now;
265}
266
267static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
268{
269 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
270 if (cgrp_out)
271 __update_cgrp_time(cgrp_out);
272}
273
274static inline void update_cgrp_time_from_event(struct perf_event *event)
275{
276 struct perf_cgroup *cgrp;
277
278 /*
279 * ensure we access cgroup data only when needed and
280 * when we know the cgroup is pinned (css_get)
281 */
282 if (!is_cgroup_event(event))
283 return;
284
285 cgrp = perf_cgroup_from_task(current);
286 /*
287 * Do not update time when cgroup is not active
288 */
289 if (cgrp == event->cgrp)
290 __update_cgrp_time(event->cgrp);
291}
292
293static inline void
294perf_cgroup_set_timestamp(struct task_struct *task,
295 struct perf_event_context *ctx)
296{
297 struct perf_cgroup *cgrp;
298 struct perf_cgroup_info *info;
299
300 /*
301 * ctx->lock held by caller
302 * ensure we do not access cgroup data
303 * unless we have the cgroup pinned (css_get)
304 */
305 if (!task || !ctx->nr_cgroups)
306 return;
307
308 cgrp = perf_cgroup_from_task(task);
309 info = this_cpu_ptr(cgrp->info);
310 info->timestamp = ctx->timestamp;
311}
312
313#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
314#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
315
316/*
317 * reschedule events based on the cgroup constraint of task.
318 *
319 * mode SWOUT : schedule out everything
320 * mode SWIN : schedule in based on cgroup for next
321 */
322void perf_cgroup_switch(struct task_struct *task, int mode)
323{
324 struct perf_cpu_context *cpuctx;
325 struct pmu *pmu;
326 unsigned long flags;
327
328 /*
329 * disable interrupts to avoid getting nr_cgroup
330 * changes via __perf_event_disable(). Also
331 * avoids preemption.
332 */
333 local_irq_save(flags);
334
335 /*
336 * we reschedule only in the presence of cgroup
337 * constrained events.
338 */
339 rcu_read_lock();
340
341 list_for_each_entry_rcu(pmu, &pmus, entry) {
342
343 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
344
345 perf_pmu_disable(cpuctx->ctx.pmu);
346
347 /*
348 * perf_cgroup_events says at least one
349 * context on this CPU has cgroup events.
350 *
351 * ctx->nr_cgroups reports the number of cgroup
352 * events for a context.
353 */
354 if (cpuctx->ctx.nr_cgroups > 0) {
355
356 if (mode & PERF_CGROUP_SWOUT) {
357 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
358 /*
359 * must not be done before ctxswout due
360 * to event_filter_match() in event_sched_out()
361 */
362 cpuctx->cgrp = NULL;
363 }
364
365 if (mode & PERF_CGROUP_SWIN) {
366 /* set cgrp before ctxsw in to
367 * allow event_filter_match() to not
368 * have to pass task around
369 */
370 cpuctx->cgrp = perf_cgroup_from_task(task);
371 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
372 }
373 }
374
375 perf_pmu_enable(cpuctx->ctx.pmu);
376 }
377
378 rcu_read_unlock();
379
380 local_irq_restore(flags);
381}
382
383static inline void perf_cgroup_sched_out(struct task_struct *task)
384{
385 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
386}
387
388static inline void perf_cgroup_sched_in(struct task_struct *task)
389{
390 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
391}
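
/*
 * Editor's sketch (not part of the patch): the intended driver of the
 * two wrappers above is the context-switch path. Later hunks in this
 * diff wire them up roughly as follows (prev/next are the tasks being
 * switched out/in; example_ctxsw is a hypothetical name):
 */
static void example_ctxsw(struct task_struct *prev, struct task_struct *next)
{
	if (atomic_read(&__get_cpu_var(perf_cgroup_events))) {
		perf_cgroup_sched_out(prev);	/* PERF_CGROUP_SWOUT */
		perf_cgroup_sched_in(next);	/* PERF_CGROUP_SWIN */
	}
}
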
392
393static inline int perf_cgroup_connect(int fd, struct perf_event *event,
394 struct perf_event_attr *attr,
395 struct perf_event *group_leader)
396{
397 struct perf_cgroup *cgrp;
398 struct cgroup_subsys_state *css;
399 struct file *file;
400 int ret = 0, fput_needed;
401
402 file = fget_light(fd, &fput_needed);
403 if (!file)
404 return -EBADF;
405
406 css = cgroup_css_from_dir(file, perf_subsys_id);
407 if (IS_ERR(css)) {
408 ret = PTR_ERR(css);
409 goto out;
410 }
411
412 cgrp = container_of(css, struct perf_cgroup, css);
413 event->cgrp = cgrp;
414
415 /* must be done before we fput() the file */
416 perf_get_cgroup(event);
417
418 /*
419 * all events in a group must monitor
420 * the same cgroup because a task belongs
421 * to only one perf cgroup at a time
422 */
423 if (group_leader && group_leader->cgrp != cgrp) {
424 perf_detach_cgroup(event);
425 ret = -EINVAL;
426 }
427out:
428 fput_light(file, fput_needed);
429 return ret;
430}
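
/*
 * Editor's sketch (not part of the patch): what perf_cgroup_connect()
 * enables from userspace. With PERF_FLAG_PID_CGROUP, the "pid" argument
 * of perf_event_open() carries an fd to the cgroup directory and "cpu"
 * picks the CPU to monitor, as validated later in this diff.
 * open_cgroup_counter is a hypothetical helper.
 */
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_cgroup_counter(const char *cgrp_dir, int cpu)
{
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_HARDWARE,
		.config	= PERF_COUNT_HW_CPU_CYCLES,
		.size	= sizeof(attr),
	};
	int cgrp_fd, event_fd;

	cgrp_fd = open(cgrp_dir, O_RDONLY);
	if (cgrp_fd < 0)
		return -1;
	/* pid = cgroup dir fd, cpu must be >= 0, no group leader */
	event_fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, cpu,
			   -1, PERF_FLAG_PID_CGROUP);
	close(cgrp_fd);	/* the kernel holds its own cgroup reference */
	return event_fd;
}
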
431
432static inline void
433perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
434{
435 struct perf_cgroup_info *t;
436 t = per_cpu_ptr(event->cgrp->info, event->cpu);
437 event->shadow_ctx_time = now - t->timestamp;
438}
439
440static inline void
441perf_cgroup_defer_enabled(struct perf_event *event)
442{
443 /*
444 * when the current task's perf cgroup does not match
445 * the event's, we need to remember to call the
446 * perf_mark_enable() function the first time a task with
447 * a matching perf cgroup is scheduled in.
448 */
449 if (is_cgroup_event(event) && !perf_cgroup_match(event))
450 event->cgrp_defer_enabled = 1;
451}
452
453static inline void
454perf_cgroup_mark_enabled(struct perf_event *event,
455 struct perf_event_context *ctx)
456{
457 struct perf_event *sub;
458 u64 tstamp = perf_event_time(event);
459
460 if (!event->cgrp_defer_enabled)
461 return;
462
463 event->cgrp_defer_enabled = 0;
464
465 event->tstamp_enabled = tstamp - event->total_time_enabled;
466 list_for_each_entry(sub, &event->sibling_list, group_entry) {
467 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
468 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
469 sub->cgrp_defer_enabled = 0;
470 }
471 }
472}
473#else /* !CONFIG_CGROUP_PERF */
474
475static inline bool
476perf_cgroup_match(struct perf_event *event)
477{
478 return true;
479}
480
481static inline void perf_detach_cgroup(struct perf_event *event)
482{}
483
484static inline int is_cgroup_event(struct perf_event *event)
485{
486 return 0;
487}
488
489static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
490{
491 return 0;
492}
493
494static inline void update_cgrp_time_from_event(struct perf_event *event)
495{
496}
497
498static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
499{
500}
501
502static inline void perf_cgroup_sched_out(struct task_struct *task)
503{
504}
505
506static inline void perf_cgroup_sched_in(struct task_struct *task)
507{
508}
509
510static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
511 struct perf_event_attr *attr,
512 struct perf_event *group_leader)
513{
514 return -EINVAL;
515}
516
517static inline void
518perf_cgroup_set_timestamp(struct task_struct *task,
519 struct perf_event_context *ctx)
520{
521}
522
523void
524perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
525{
526}
527
528static inline void
529perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
530{
531}
532
533static inline u64 perf_cgroup_event_time(struct perf_event *event)
534{
535 return 0;
536}
537
538static inline void
539perf_cgroup_defer_enabled(struct perf_event *event)
540{
541}
542
543static inline void
544perf_cgroup_mark_enabled(struct perf_event *event,
545 struct perf_event_context *ctx)
546{
547}
548#endif
549
92void perf_pmu_disable(struct pmu *pmu) 550void perf_pmu_disable(struct pmu *pmu)
93{ 551{
94 int *count = this_cpu_ptr(pmu->pmu_disable_count); 552 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -254,7 +712,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
254 raw_spin_lock_irqsave(&ctx->lock, flags); 712 raw_spin_lock_irqsave(&ctx->lock, flags);
255 --ctx->pin_count; 713 --ctx->pin_count;
256 raw_spin_unlock_irqrestore(&ctx->lock, flags); 714 raw_spin_unlock_irqrestore(&ctx->lock, flags);
257 put_ctx(ctx);
258} 715}
259 716
260/* 717/*
@@ -271,6 +728,10 @@ static void update_context_time(struct perf_event_context *ctx)
271static u64 perf_event_time(struct perf_event *event) 728static u64 perf_event_time(struct perf_event *event)
272{ 729{
273 struct perf_event_context *ctx = event->ctx; 730 struct perf_event_context *ctx = event->ctx;
731
732 if (is_cgroup_event(event))
733 return perf_cgroup_event_time(event);
734
274 return ctx ? ctx->time : 0; 735 return ctx ? ctx->time : 0;
275} 736}
276 737
@@ -285,9 +746,20 @@ static void update_event_times(struct perf_event *event)
285 if (event->state < PERF_EVENT_STATE_INACTIVE || 746 if (event->state < PERF_EVENT_STATE_INACTIVE ||
286 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 747 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
287 return; 748 return;
-
-	if (ctx->is_active)
+	/*
+	 * in cgroup mode, time_enabled represents
+	 * the time the event was enabled AND active
+	 * tasks were in the monitored cgroup. This is
+	 * independent of the activity of the context as
+	 * there may be a mix of cgroup and non-cgroup events.
+	 *
+	 * That is why we treat cgroup events differently
+	 * here.
+	 */
+	if (is_cgroup_event(event))
 		run_end = perf_event_time(event);
+	else if (ctx->is_active)
+		run_end = ctx->time;
 	else
 		run_end = event->tstamp_stopped;
 
293 765
@@ -299,6 +771,7 @@ static void update_event_times(struct perf_event *event)
299 run_end = perf_event_time(event); 771 run_end = perf_event_time(event);
300 772
301 event->total_time_running = run_end - event->tstamp_running; 773 event->total_time_running = run_end - event->tstamp_running;
774
302} 775}
303 776
304/* 777/*
@@ -347,6 +820,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
347 list_add_tail(&event->group_entry, list); 820 list_add_tail(&event->group_entry, list);
348 } 821 }
349 822
823 if (is_cgroup_event(event))
824 ctx->nr_cgroups++;
825
350 list_add_rcu(&event->event_entry, &ctx->event_list); 826 list_add_rcu(&event->event_entry, &ctx->event_list);
351 if (!ctx->nr_events) 827 if (!ctx->nr_events)
352 perf_pmu_rotate_start(ctx->pmu); 828 perf_pmu_rotate_start(ctx->pmu);
@@ -473,6 +949,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
473 949
474 event->attach_state &= ~PERF_ATTACH_CONTEXT; 950 event->attach_state &= ~PERF_ATTACH_CONTEXT;
475 951
952 if (is_cgroup_event(event))
953 ctx->nr_cgroups--;
954
476 ctx->nr_events--; 955 ctx->nr_events--;
477 if (event->attr.inherit_stat) 956 if (event->attr.inherit_stat)
478 ctx->nr_stat--; 957 ctx->nr_stat--;
@@ -544,7 +1023,8 @@ out:
544static inline int 1023static inline int
545event_filter_match(struct perf_event *event) 1024event_filter_match(struct perf_event *event)
546{ 1025{
-	return event->cpu == -1 || event->cpu == smp_processor_id();
+	return (event->cpu == -1 || event->cpu == smp_processor_id())
+	    && perf_cgroup_match(event);
548} 1028}
549 1029
550static void 1030static void
@@ -562,7 +1042,7 @@ event_sched_out(struct perf_event *event,
562 */ 1042 */
563 if (event->state == PERF_EVENT_STATE_INACTIVE 1043 if (event->state == PERF_EVENT_STATE_INACTIVE
564 && !event_filter_match(event)) { 1044 && !event_filter_match(event)) {
-		delta = ctx->time - event->tstamp_stopped;
+		delta = tstamp - event->tstamp_stopped;
566 event->tstamp_running += delta; 1046 event->tstamp_running += delta;
567 event->tstamp_stopped = tstamp; 1047 event->tstamp_stopped = tstamp;
568 } 1048 }
@@ -606,47 +1086,30 @@ group_sched_out(struct perf_event *group_event,
606 cpuctx->exclusive = 0; 1086 cpuctx->exclusive = 0;
607} 1087}
608 1088
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
-	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
 /*
  * Cross CPU call to remove a performance event
  *
  * We disable the event on the hardware level first. After that we
  * remove it from the context list.
  */
-static void __perf_event_remove_from_context(void *info)
+static int __perf_remove_from_context(void *info)
 {
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
-	/*
-	 * If this is a task context, we need to check whether it is
-	 * the current task context of this cpu. If not it has been
-	 * scheduled out before the smp call arrived.
-	 */
-	if (ctx->task && cpuctx->task_ctx != ctx)
-		return;
-
 	raw_spin_lock(&ctx->lock);
-
 	event_sched_out(event, cpuctx, ctx);
-
 	list_del_event(event, ctx);
-
 	raw_spin_unlock(&ctx->lock);
+
+	return 0;
 }
 
 
 /*
  * Remove the event from a task's (or a CPU's) list of events.
  *
- * Must be called with ctx->mutex held.
- *
  * CPU events are removed with a smp call. For task events we only
  * call when the task is on a CPU.
  *
@@ -657,49 +1120,48 @@ static void __perf_event_remove_from_context(void *info)
  * When called from perf_event_exit_task, it's OK because the
  * context has been detached from its task.
  */
-static void perf_event_remove_from_context(struct perf_event *event)
+static void perf_remove_from_context(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
 	struct task_struct *task = ctx->task;
 
+	lockdep_assert_held(&ctx->mutex);
+
 	if (!task) {
 		/*
 		 * Per cpu events are removed via an smp call and
 		 * the removal is always successful.
 		 */
-		smp_call_function_single(event->cpu,
-					 __perf_event_remove_from_context,
-					 event, 1);
+		cpu_function_call(event->cpu, __perf_remove_from_context, event);
 		return;
 	}
 
retry:
-	task_oncpu_function_call(task, __perf_event_remove_from_context,
-				 event);
+	if (!task_function_call(task, __perf_remove_from_context, event))
+		return;
 
 	raw_spin_lock_irq(&ctx->lock);
 	/*
-	 * If the context is active we need to retry the smp call.
+	 * If we failed to find a running task, but find the context active
+	 * now that we've acquired the ctx->lock, retry.
 	 */
-	if (ctx->nr_active && !list_empty(&event->group_entry)) {
+	if (ctx->is_active) {
 		raw_spin_unlock_irq(&ctx->lock);
 		goto retry;
 	}
 
 	/*
-	 * The lock prevents that this context is scheduled in so we
-	 * can remove the event safely, if the call above did not
-	 * succeed.
+	 * Since the task isn't running, it's safe to remove the event; our
+	 * holding the ctx->lock ensures the task won't get scheduled in.
 	 */
-	if (!list_empty(&event->group_entry))
-		list_del_event(event, ctx);
+	list_del_event(event, ctx);
 	raw_spin_unlock_irq(&ctx->lock);
 }
698 1160
 /*
  * Cross CPU call to disable a performance event
  */
-static void __perf_event_disable(void *info)
+static int __perf_event_disable(void *info)
 {
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
@@ -708,9 +1170,12 @@ static void __perf_event_disable(void *info)
 	/*
 	 * If this is a per-task event, need to check whether this
 	 * event's task is the current task on this cpu.
+	 *
+	 * Can trigger due to concurrent perf_event_context_sched_out()
+	 * flipping contexts around.
 	 */
 	if (ctx->task && cpuctx->task_ctx != ctx)
-		return;
+		return -EINVAL;
 
 	raw_spin_lock(&ctx->lock);
 
@@ -720,6 +1185,7 @@ static void __perf_event_disable(void *info)
 	 */
 	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
 		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
 		update_group_times(event);
 		if (event == event->group_leader)
 			group_sched_out(event, cpuctx, ctx);
@@ -729,6 +1195,8 @@ static void __perf_event_disable(void *info)
 	}
 
 	raw_spin_unlock(&ctx->lock);
+
+	return 0;
 }
733 1201
 /*
@@ -753,13 +1221,13 @@ void perf_event_disable(struct perf_event *event)
 		/*
 		 * Disable the event on the cpu that it's on
 		 */
-		smp_call_function_single(event->cpu, __perf_event_disable,
-					 event, 1);
+		cpu_function_call(event->cpu, __perf_event_disable, event);
 		return;
 	}
 
retry:
-	task_oncpu_function_call(task, __perf_event_disable, event);
+	if (!task_function_call(task, __perf_event_disable, event))
+		return;
 
 	raw_spin_lock_irq(&ctx->lock);
 	/*
@@ -767,6 +1235,11 @@ retry:
 	 */
 	if (event->state == PERF_EVENT_STATE_ACTIVE) {
 		raw_spin_unlock_irq(&ctx->lock);
+		/*
+		 * Reload the task pointer, it might have been changed by
+		 * a concurrent perf_event_context_sched_out().
+		 */
+		task = ctx->task;
 		goto retry;
 	}
 
@@ -778,10 +1251,44 @@ retry:
 		update_group_times(event);
 		event->state = PERF_EVENT_STATE_OFF;
 	}
-
 	raw_spin_unlock_irq(&ctx->lock);
 }
784 1256
1257static void perf_set_shadow_time(struct perf_event *event,
1258 struct perf_event_context *ctx,
1259 u64 tstamp)
1260{
1261 /*
1262 * use the correct time source for the time snapshot
1263 *
1264 * We could get by without this by leveraging the
1265 * fact that to get to this function, the caller
1266 * has most likely already called update_context_time()
1267 * and update_cgrp_time_xx() and thus both timestamps
1268 * are identical (or very close). Given that tstamp is
1269 * already adjusted for cgroup, we could say that:
1270 * tstamp - ctx->timestamp
1271 * is equivalent to
1272 * tstamp - cgrp->timestamp.
1273 *
1274 * Then, in perf_output_read(), the calculation would
1275 * work with no changes because:
1276 * - event is guaranteed scheduled in
1277 * - no scheduled out in between
1278 * - thus the timestamp would be the same
1279 *
1280 * But this is a bit hairy.
1281 *
1282 * So instead, we have an explicit cgroup call to remain
1283 * within the same time source all along. We believe it
1284 * is cleaner and simpler to understand.
1285 */
1286 if (is_cgroup_event(event))
1287 perf_cgroup_set_shadow_time(event, tstamp);
1288 else
1289 event->shadow_ctx_time = tstamp - ctx->timestamp;
1290}
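
/*
 * Editor's note (not part of the patch): worked example with made-up
 * numbers. If at sched-in perf_clock() reads 1000000ns and the cgroup's
 * info->timestamp was set to 990000ns, then
 *
 *	shadow_ctx_time = 1000000 - 990000 = 10000ns
 *
 * and perf_output_read() can later subtract this constant from a raw
 * clock reading to get time relative to the (cgroup) context, without
 * caring which time source produced the timestamps.
 */
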
1291
785#define MAX_INTERRUPTS (~0ULL) 1292#define MAX_INTERRUPTS (~0ULL)
786 1293
787static void perf_log_throttle(struct perf_event *event, int enable); 1294static void perf_log_throttle(struct perf_event *event, int enable);
@@ -822,7 +1329,7 @@ event_sched_in(struct perf_event *event,
822 1329
823 event->tstamp_running += tstamp - event->tstamp_stopped; 1330 event->tstamp_running += tstamp - event->tstamp_stopped;
824 1331
-	event->shadow_ctx_time = tstamp - ctx->timestamp;
+	perf_set_shadow_time(event, ctx, tstamp);
826 1333
827 if (!is_software_event(event)) 1334 if (!is_software_event(event))
828 cpuctx->active_oncpu++; 1335 cpuctx->active_oncpu++;
@@ -943,12 +1450,15 @@ static void add_event_to_ctx(struct perf_event *event,
943 event->tstamp_stopped = tstamp; 1450 event->tstamp_stopped = tstamp;
944} 1451}
945 1452
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+					struct task_struct *tsk);
+
 /*
  * Cross CPU call to install and enable a performance event
  *
  * Must be called with ctx->mutex held
  */
-static void __perf_install_in_context(void *info)
+static int __perf_install_in_context(void *info)
 {
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
@@ -957,21 +1467,22 @@ static void __perf_install_in_context(void *info)
 	int err;
 
 	/*
-	 * If this is a task context, we need to check whether it is
-	 * the current task context of this cpu. If not it has been
-	 * scheduled out before the smp call arrived.
-	 * Or possibly this is the right context but it isn't
-	 * on this cpu because it had no events.
+	 * In case we're installing a new context to an already running task,
+	 * could also happen before perf_event_task_sched_in() on architectures
+	 * which do context switches with IRQs enabled.
 	 */
-	if (ctx->task && cpuctx->task_ctx != ctx) {
-		if (cpuctx->task_ctx || ctx->task != current)
-			return;
-		cpuctx->task_ctx = ctx;
-	}
+	if (ctx->task && !cpuctx->task_ctx)
+		perf_event_context_sched_in(ctx, ctx->task);
 
 	raw_spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	update_context_time(ctx);
+	/*
+	 * update cgrp time only if current cgrp
+	 * matches event->cgrp. Must be done before
+	 * calling add_event_to_ctx()
	 */
+	update_cgrp_time_from_event(event);
 
 	add_event_to_ctx(event, ctx);
 
@@ -1012,6 +1523,8 @@ static void __perf_install_in_context(void *info)
 
 unlock:
 	raw_spin_unlock(&ctx->lock);
+
+	return 0;
 }
1016 1529
 /*
@@ -1023,8 +1536,6 @@ unlock:
  * If the event is attached to a task which is on a CPU we use a smp
  * call to enable it in the task context. The task might have been
  * scheduled away, but we check this in the smp call again.
- *
- * Must be called with ctx->mutex held.
  */
 static void
 perf_install_in_context(struct perf_event_context *ctx,
@@ -1033,6 +1544,8 @@ perf_install_in_context(struct perf_event_context *ctx,
 {
 	struct task_struct *task = ctx->task;
 
+	lockdep_assert_held(&ctx->mutex);
+
 	event->ctx = ctx;
 
 	if (!task) {
@@ -1040,31 +1553,29 @@ perf_install_in_context(struct perf_event_context *ctx,
 		 * Per cpu events are installed via an smp call and
 		 * the install is always successful.
 		 */
-		smp_call_function_single(cpu, __perf_install_in_context,
-					 event, 1);
+		cpu_function_call(cpu, __perf_install_in_context, event);
 		return;
 	}
 
retry:
-	task_oncpu_function_call(task, __perf_install_in_context,
-				 event);
+	if (!task_function_call(task, __perf_install_in_context, event))
+		return;
 
 	raw_spin_lock_irq(&ctx->lock);
 	/*
-	 * we need to retry the smp call.
+	 * If we failed to find a running task, but find the context active
+	 * now that we've acquired the ctx->lock, retry.
 	 */
-	if (ctx->is_active && list_empty(&event->group_entry)) {
+	if (ctx->is_active) {
 		raw_spin_unlock_irq(&ctx->lock);
 		goto retry;
 	}
 
 	/*
-	 * The lock prevents that this context is scheduled in so we
-	 * can add the event safely, if it the call above did not
-	 * succeed.
+	 * Since the task isn't running, it's safe to add the event; our
+	 * holding the ctx->lock ensures the task won't get scheduled in.
 	 */
-	if (list_empty(&event->group_entry))
-		add_event_to_ctx(event, ctx);
+	add_event_to_ctx(event, ctx);
 	raw_spin_unlock_irq(&ctx->lock);
 }
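
/*
 * Editor's sketch (not part of the patch): the install, remove and
 * enable paths above now share one retry idiom. Distilled, with
 * do_cross_call standing in for __perf_install_in_context and friends
 * (modify_ctx is a hypothetical name):
 */
static void modify_ctx(struct perf_event_context *ctx,
		       int (*do_cross_call)(void *info), void *info)
{
retry:
	if (!task_function_call(ctx->task, do_cross_call, info))
		return;		/* ran on the task's CPU: done */

	raw_spin_lock_irq(&ctx->lock);
	if (ctx->is_active) {
		/* the task snuck back in; drop the lock and retry */
		raw_spin_unlock_irq(&ctx->lock);
		goto retry;
	}
	/* task not running: ctx->lock keeps it from being scheduled in */
	/* ... modify the context directly here ... */
	raw_spin_unlock_irq(&ctx->lock);
}
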
1070 1581
@@ -1093,7 +1604,7 @@ static void __perf_event_mark_enabled(struct perf_event *event,
1093/* 1604/*
1094 * Cross CPU call to enable a performance event 1605 * Cross CPU call to enable a performance event
1095 */ 1606 */
-static void __perf_event_enable(void *info)
+static int __perf_event_enable(void *info)
1097{ 1608{
1098 struct perf_event *event = info; 1609 struct perf_event *event = info;
1099 struct perf_event_context *ctx = event->ctx; 1610 struct perf_event_context *ctx = event->ctx;
@@ -1101,26 +1612,27 @@ static void __perf_event_enable(void *info)
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	int err;
 
-	/*
-	 * If this is a per-task event, need to check whether this
-	 * event's task is the current task on this cpu.
-	 */
-	if (ctx->task && cpuctx->task_ctx != ctx) {
-		if (cpuctx->task_ctx || ctx->task != current)
-			return;
-		cpuctx->task_ctx = ctx;
-	}
+	if (WARN_ON_ONCE(!ctx->is_active))
+		return -EINVAL;
 
 	raw_spin_lock(&ctx->lock);
-	ctx->is_active = 1;
 	update_context_time(ctx);
 
 	if (event->state >= PERF_EVENT_STATE_INACTIVE)
 		goto unlock;
+
+	/*
+	 * set current task's cgroup time reference point
+	 */
+	perf_cgroup_set_timestamp(current, ctx);
+
 	__perf_event_mark_enabled(event, ctx);
 
-	if (!event_filter_match(event))
+	if (!event_filter_match(event)) {
+		if (is_cgroup_event(event))
+			perf_cgroup_defer_enabled(event);
 		goto unlock;
+	}
 
 	/*
 	 * If the event is in a group and isn't the group leader,
@@ -1153,6 +1665,8 @@ static void __perf_event_enable(void *info)
 
 unlock:
 	raw_spin_unlock(&ctx->lock);
+
+	return 0;
 }
1157 1671
1158/* 1672/*
@@ -1173,8 +1687,7 @@ void perf_event_enable(struct perf_event *event)
1173 /* 1687 /*
1174 * Enable the event on the cpu that it's on 1688 * Enable the event on the cpu that it's on
1175 */ 1689 */
-	smp_call_function_single(event->cpu, __perf_event_enable,
-				 event, 1);
+	cpu_function_call(event->cpu, __perf_event_enable, event);
1178 return; 1691 return;
1179 } 1692 }
1180 1693
@@ -1193,8 +1706,15 @@ void perf_event_enable(struct perf_event *event)
 	event->state = PERF_EVENT_STATE_OFF;
 
retry:
+	if (!ctx->is_active) {
+		__perf_event_mark_enabled(event, ctx);
+		goto out;
+	}
+
 	raw_spin_unlock_irq(&ctx->lock);
-	task_oncpu_function_call(task, __perf_event_enable, event);
+
+	if (!task_function_call(task, __perf_event_enable, event))
+		return;
 
 	raw_spin_lock_irq(&ctx->lock);
 
@@ -1202,15 +1722,14 @@ retry:
 	 * If the context is active and the event is still off,
 	 * we need to retry the cross-call.
 	 */
-	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
+	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
+		/*
+		 * task could have been flipped by a concurrent
+		 * perf_event_context_sched_out()
+		 */
+		task = ctx->task;
 		goto retry;
-
-	/*
-	 * Since we have the lock this context can't be scheduled
-	 * in, so we can change the state safely.
-	 */
-	if (event->state == PERF_EVENT_STATE_OFF)
-		__perf_event_mark_enabled(event, ctx);
+	}
 
out:
 	raw_spin_unlock_irq(&ctx->lock);
@@ -1242,6 +1761,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1242 if (likely(!ctx->nr_events)) 1761 if (likely(!ctx->nr_events))
1243 goto out; 1762 goto out;
1244 update_context_time(ctx); 1763 update_context_time(ctx);
1764 update_cgrp_time_from_cpuctx(cpuctx);
1245 1765
1246 if (!ctx->nr_active) 1766 if (!ctx->nr_active)
1247 goto out; 1767 goto out;
@@ -1354,8 +1874,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1354 } 1874 }
1355} 1875}
1356 1876
-void perf_event_context_sched_out(struct task_struct *task, int ctxn,
-				  struct task_struct *next)
+static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
+					 struct task_struct *next)
1359{ 1879{
1360 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 1880 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1361 struct perf_event_context *next_ctx; 1881 struct perf_event_context *next_ctx;
@@ -1431,6 +1951,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
1431 1951
1432 for_each_task_context_nr(ctxn) 1952 for_each_task_context_nr(ctxn)
1433 perf_event_context_sched_out(task, ctxn, next); 1953 perf_event_context_sched_out(task, ctxn, next);
1954
1955 /*
1956 * if cgroup events exist on this CPU, then we need
1957 * to check if we have to switch out PMU state.
1958 * cgroup events are system-wide mode only
1959 */
1960 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1961 perf_cgroup_sched_out(task);
1434} 1962}
1435 1963
1436static void task_ctx_sched_out(struct perf_event_context *ctx, 1964static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1469,6 +1997,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1469 if (!event_filter_match(event)) 1997 if (!event_filter_match(event))
1470 continue; 1998 continue;
1471 1999
2000 /* may need to reset tstamp_enabled */
2001 if (is_cgroup_event(event))
2002 perf_cgroup_mark_enabled(event, ctx);
2003
1472 if (group_can_go_on(event, cpuctx, 1)) 2004 if (group_can_go_on(event, cpuctx, 1))
1473 group_sched_in(event, cpuctx, ctx); 2005 group_sched_in(event, cpuctx, ctx);
1474 2006
@@ -1501,6 +2033,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1501 if (!event_filter_match(event)) 2033 if (!event_filter_match(event))
1502 continue; 2034 continue;
1503 2035
2036 /* may need to reset tstamp_enabled */
2037 if (is_cgroup_event(event))
2038 perf_cgroup_mark_enabled(event, ctx);
2039
1504 if (group_can_go_on(event, cpuctx, can_add_hw)) { 2040 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1505 if (group_sched_in(event, cpuctx, ctx)) 2041 if (group_sched_in(event, cpuctx, ctx))
1506 can_add_hw = 0; 2042 can_add_hw = 0;
@@ -1511,15 +2047,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
-	     enum event_type_t event_type)
+	     enum event_type_t event_type,
+	     struct task_struct *task)
 {
+	u64 now;
+
 	raw_spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	if (likely(!ctx->nr_events))
 		goto out;
 
-	ctx->timestamp = perf_clock();
-
+	now = perf_clock();
+	ctx->timestamp = now;
+	perf_cgroup_set_timestamp(task, ctx);
 	/*
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
@@ -1536,11 +2076,12 @@ out:
1536} 2076}
1537 2077
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type)
+			     enum event_type_t event_type,
+			     struct task_struct *task)
 {
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
-	ctx_sched_in(ctx, cpuctx, event_type);
+	ctx_sched_in(ctx, cpuctx, event_type, task);
 }
1545 2086
1546static void task_ctx_sched_in(struct perf_event_context *ctx, 2087static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1548,15 +2089,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
1548{ 2089{
1549 struct perf_cpu_context *cpuctx; 2090 struct perf_cpu_context *cpuctx;
1550 2091
1551 cpuctx = __get_cpu_context(ctx); 2092 cpuctx = __get_cpu_context(ctx);
1552 if (cpuctx->task_ctx == ctx) 2093 if (cpuctx->task_ctx == ctx)
1553 return; 2094 return;
1554 2095
-	ctx_sched_in(ctx, cpuctx, event_type);
+	ctx_sched_in(ctx, cpuctx, event_type, NULL);
 	cpuctx->task_ctx = ctx;
 }
 
-void perf_event_context_sched_in(struct perf_event_context *ctx)
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+					struct task_struct *task)
1560{ 2102{
1561 struct perf_cpu_context *cpuctx; 2103 struct perf_cpu_context *cpuctx;
1562 2104
@@ -1572,9 +2114,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
1572 */ 2114 */
1573 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2115 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1574 2116
-	ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
-	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+	ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1578 2120
1579 cpuctx->task_ctx = ctx; 2121 cpuctx->task_ctx = ctx;
1580 2122
@@ -1607,8 +2149,15 @@ void __perf_event_task_sched_in(struct task_struct *task)
1607 if (likely(!ctx)) 2149 if (likely(!ctx))
1608 continue; 2150 continue;
1609 2151
-		perf_event_context_sched_in(ctx);
+		perf_event_context_sched_in(ctx, task);
 	}
+	/*
+	 * if cgroup events exist on this CPU, then we need
+	 * to check if we have to switch in PMU state.
+	 * cgroup events are system-wide mode only
+	 */
+	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+		perf_cgroup_sched_in(task);
1612} 2161}
1613 2162
1614static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2163static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -1638,7 +2187,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1638 * Reduce accuracy by one bit such that @a and @b converge 2187 * Reduce accuracy by one bit such that @a and @b converge
1639 * to a similar magnitude. 2188 * to a similar magnitude.
1640 */ 2189 */
1641#define REDUCE_FLS(a, b) \ 2190#define REDUCE_FLS(a, b) \
1642do { \ 2191do { \
1643 if (a##_fls > b##_fls) { \ 2192 if (a##_fls > b##_fls) { \
1644 a >>= 1; \ 2193 a >>= 1; \
@@ -1808,7 +2357,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1808 if (ctx) 2357 if (ctx)
1809 rotate_ctx(ctx); 2358 rotate_ctx(ctx);
1810 2359
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
1812 if (ctx) 2361 if (ctx)
1813 task_ctx_sched_in(ctx, EVENT_FLEXIBLE); 2362 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1814 2363
@@ -1887,7 +2436,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1887 2436
1888 raw_spin_unlock(&ctx->lock); 2437 raw_spin_unlock(&ctx->lock);
1889 2438
-	perf_event_context_sched_in(ctx);
+	perf_event_context_sched_in(ctx, ctx->task);
1891out: 2440out:
1892 local_irq_restore(flags); 2441 local_irq_restore(flags);
1893} 2442}
@@ -1912,8 +2461,10 @@ static void __perf_event_read(void *info)
1912 return; 2461 return;
1913 2462
 	raw_spin_lock(&ctx->lock);
-	if (ctx->is_active)
+	if (ctx->is_active) {
 		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
+	}
1917 update_event_times(event); 2468 update_event_times(event);
1918 if (event->state == PERF_EVENT_STATE_ACTIVE) 2469 if (event->state == PERF_EVENT_STATE_ACTIVE)
1919 event->pmu->read(event); 2470 event->pmu->read(event);
@@ -1944,8 +2495,10 @@ static u64 perf_event_read(struct perf_event *event)
1944 * (e.g., thread is blocked), in that case 2495 * (e.g., thread is blocked), in that case
1945 * we cannot update context time 2496 * we cannot update context time
1946 */ 2497 */
-		if (ctx->is_active)
+		if (ctx->is_active) {
 			update_context_time(ctx);
+			update_cgrp_time_from_event(event);
+		}
1949 update_event_times(event); 2502 update_event_times(event);
1950 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2503 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1951 } 2504 }
@@ -2224,6 +2777,9 @@ errout:
2224 2777
2225} 2778}
2226 2779
2780/*
2781 * Returns a matching context with refcount and pincount.
2782 */
2227static struct perf_event_context * 2783static struct perf_event_context *
2228find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) 2784find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2229{ 2785{
@@ -2248,6 +2804,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2248 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 2804 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2249 ctx = &cpuctx->ctx; 2805 ctx = &cpuctx->ctx;
2250 get_ctx(ctx); 2806 get_ctx(ctx);
2807 ++ctx->pin_count;
2251 2808
2252 return ctx; 2809 return ctx;
2253 } 2810 }
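
/*
 * Editor's sketch (not part of the patch): the pairing this hunk
 * establishes. find_get_context() now returns with both a reference and
 * a pin; the caller installs the event and then unpins. example_attach
 * is a hypothetical caller, modeled on the syscall changes later in
 * this diff:
 */
static int example_attach(struct pmu *pmu, struct task_struct *task,
			  struct perf_event *event, int cpu)
{
	struct perf_event_context *ctx;

	ctx = find_get_context(pmu, task, cpu);	/* +ref, +pin_count */
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, event, cpu);
	perf_unpin_context(ctx);	/* drop the pin, keep the ref */
	mutex_unlock(&ctx->mutex);
	return 0;
}
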
@@ -2261,6 +2818,7 @@ retry:
2261 ctx = perf_lock_task_context(task, ctxn, &flags); 2818 ctx = perf_lock_task_context(task, ctxn, &flags);
2262 if (ctx) { 2819 if (ctx) {
2263 unclone_ctx(ctx); 2820 unclone_ctx(ctx);
2821 ++ctx->pin_count;
2264 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2822 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2265 } 2823 }
2266 2824
@@ -2282,8 +2840,10 @@ retry:
2282 err = -ESRCH; 2840 err = -ESRCH;
2283 else if (task->perf_event_ctxp[ctxn]) 2841 else if (task->perf_event_ctxp[ctxn])
2284 err = -EAGAIN; 2842 err = -EAGAIN;
-	else
+	else {
+		++ctx->pin_count;
 		rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+	}
2287 mutex_unlock(&task->perf_event_mutex); 2847 mutex_unlock(&task->perf_event_mutex);
2288 2848
2289 if (unlikely(err)) { 2849 if (unlikely(err)) {
@@ -2323,7 +2883,7 @@ static void free_event(struct perf_event *event)
2323 2883
2324 if (!event->parent) { 2884 if (!event->parent) {
2325 if (event->attach_state & PERF_ATTACH_TASK) 2885 if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_dec(&perf_task_events);
+			jump_label_dec(&perf_sched_events);
2327 if (event->attr.mmap || event->attr.mmap_data) 2887 if (event->attr.mmap || event->attr.mmap_data)
2328 atomic_dec(&nr_mmap_events); 2888 atomic_dec(&nr_mmap_events);
2329 if (event->attr.comm) 2889 if (event->attr.comm)
@@ -2332,6 +2892,10 @@ static void free_event(struct perf_event *event)
2332 atomic_dec(&nr_task_events); 2892 atomic_dec(&nr_task_events);
2333 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 2893 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2334 put_callchain_buffers(); 2894 put_callchain_buffers();
2895 if (is_cgroup_event(event)) {
2896 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2897 jump_label_dec(&perf_sched_events);
2898 }
2335 } 2899 }
2336 2900
2337 if (event->buffer) { 2901 if (event->buffer) {
@@ -2339,6 +2903,9 @@ static void free_event(struct perf_event *event)
2339 event->buffer = NULL; 2903 event->buffer = NULL;
2340 } 2904 }
2341 2905
2906 if (is_cgroup_event(event))
2907 perf_detach_cgroup(event);
2908
2342 if (event->destroy) 2909 if (event->destroy)
2343 event->destroy(event); 2910 event->destroy(event);
2344 2911
@@ -4406,26 +4973,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 	if (unlikely(!is_sampling_event(event)))
 		return 0;
 
-	if (!throttle) {
-		hwc->interrupts++;
-	} else {
-		if (hwc->interrupts != MAX_INTERRUPTS) {
-			hwc->interrupts++;
-			if (HZ * hwc->interrupts >
-					(u64)sysctl_perf_event_sample_rate) {
-				hwc->interrupts = MAX_INTERRUPTS;
-				perf_log_throttle(event, 0);
-				ret = 1;
-			}
-		} else {
-			/*
-			 * Keep re-disabling events even though on the previous
-			 * pass we disabled it - just in case we raced with a
-			 * sched-in and the event got enabled again:
-			 */
-			ret = 1;
-		}
-	}
+	if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
+		if (throttle) {
+			hwc->interrupts = MAX_INTERRUPTS;
+			perf_log_throttle(event, 0);
+			ret = 1;
+		}
+	} else
+		hwc->interrupts++;
 
 	if (event->attr.freq) {
 		u64 now = perf_clock();
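
/*
 * Editor's sketch (not part of the patch): the throttle check above,
 * distilled into a standalone helper. Once a tick has seen
 * max_samples_per_tick interrupts, further samples throttle the event
 * until the timer tick unthrottles it (example_throttle_check is a
 * hypothetical name):
 */
static int example_throttle_check(struct hw_perf_event *hwc, int throttle)
{
	if (hwc->interrupts >= max_samples_per_tick) {
		if (throttle) {
			hwc->interrupts = MAX_INTERRUPTS;
			return 1;	/* caller stops the event */
		}
		return 0;
	}
	hwc->interrupts++;
	return 0;
}
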
@@ -5062,6 +5617,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5062 u64 period; 5617 u64 period;
5063 5618
5064 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 5619 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
5620
5621 if (event->state != PERF_EVENT_STATE_ACTIVE)
5622 return HRTIMER_NORESTART;
5623
5065 event->pmu->read(event); 5624 event->pmu->read(event);
5066 5625
5067 perf_sample_data_init(&data, 0); 5626 perf_sample_data_init(&data, 0);
@@ -5088,9 +5647,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
5088 if (!is_sampling_event(event)) 5647 if (!is_sampling_event(event))
5089 return; 5648 return;
5090 5649
5091 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5092 hwc->hrtimer.function = perf_swevent_hrtimer;
5093
5094 period = local64_read(&hwc->period_left); 5650 period = local64_read(&hwc->period_left);
5095 if (period) { 5651 if (period) {
5096 if (period < 0) 5652 if (period < 0)
@@ -5117,6 +5673,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5117 } 5673 }
5118} 5674}
5119 5675
5676static void perf_swevent_init_hrtimer(struct perf_event *event)
5677{
5678 struct hw_perf_event *hwc = &event->hw;
5679
5680 if (!is_sampling_event(event))
5681 return;
5682
5683 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5684 hwc->hrtimer.function = perf_swevent_hrtimer;
5685
5686 /*
5687 * Since hrtimers have a fixed rate, we can do a static freq->period
5688 * mapping and avoid the whole period adjust feedback stuff.
5689 */
5690 if (event->attr.freq) {
5691 long freq = event->attr.sample_freq;
5692
5693 event->attr.sample_period = NSEC_PER_SEC / freq;
5694 hwc->sample_period = event->attr.sample_period;
5695 local64_set(&hwc->period_left, hwc->sample_period);
5696 event->attr.freq = 0;
5697 }
5698}
5699
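
/*
 * Editor's note (not part of the patch): worked example of the static
 * freq->period mapping above. For attr.sample_freq = 4000 (4 kHz):
 *
 *	sample_period = NSEC_PER_SEC / 4000 = 250000ns
 *
 * i.e. the hrtimer simply fires every 250us. No feedback loop is needed
 * because hrtimers tick at a fixed, known rate, unlike hardware PMU
 * counters whose event rate varies with the workload.
 */
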
5120/* 5700/*
5121 * Software event: cpu wall time clock 5701 * Software event: cpu wall time clock
5122 */ 5702 */
@@ -5169,6 +5749,8 @@ static int cpu_clock_event_init(struct perf_event *event)
5169 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 5749 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5170 return -ENOENT; 5750 return -ENOENT;
5171 5751
5752 perf_swevent_init_hrtimer(event);
5753
5172 return 0; 5754 return 0;
5173} 5755}
5174 5756
@@ -5224,16 +5806,9 @@ static void task_clock_event_del(struct perf_event *event, int flags)
5224 5806
 static void task_clock_event_read(struct perf_event *event)
 {
-	u64 time;
-
-	if (!in_nmi()) {
-		update_context_time(event->ctx);
-		time = event->ctx->time;
-	} else {
-		u64 now = perf_clock();
-		u64 delta = now - event->ctx->timestamp;
-		time = event->ctx->time + delta;
-	}
+	u64 now = perf_clock();
+	u64 delta = now - event->ctx->timestamp;
+	u64 time = event->ctx->time + delta;
 
 	task_clock_event_update(event, time);
 }
@@ -5246,6 +5821,8 @@ static int task_clock_event_init(struct perf_event *event)
5246 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 5821 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5247 return -ENOENT; 5822 return -ENOENT;
5248 5823
5824 perf_swevent_init_hrtimer(event);
5825
5249 return 0; 5826 return 0;
5250} 5827}
5251 5828
@@ -5517,17 +6094,22 @@ struct pmu *perf_init_event(struct perf_event *event)
5517{ 6094{
 	struct pmu *pmu = NULL;
 	int idx;
+	int ret;
 
 	idx = srcu_read_lock(&pmus_srcu);
 
 	rcu_read_lock();
 	pmu = idr_find(&pmu_idr, event->attr.type);
 	rcu_read_unlock();
-	if (pmu)
+	if (pmu) {
+		ret = pmu->event_init(event);
+		if (ret)
+			pmu = ERR_PTR(ret);
 		goto unlock;
+	}
 
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		int ret = pmu->event_init(event);
+		ret = pmu->event_init(event);
 		if (!ret)
 			goto unlock;
 
@@ -5653,7 +6235,7 @@ done:
5653 6235
5654 if (!event->parent) { 6236 if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_inc(&perf_task_events);
+			jump_label_inc(&perf_sched_events);
5657 if (event->attr.mmap || event->attr.mmap_data) 6239 if (event->attr.mmap || event->attr.mmap_data)
5658 atomic_inc(&nr_mmap_events); 6240 atomic_inc(&nr_mmap_events);
5659 if (event->attr.comm) 6241 if (event->attr.comm)
@@ -5828,7 +6410,7 @@ SYSCALL_DEFINE5(perf_event_open,
5828 int err; 6410 int err;
5829 6411
5830 /* for future expandability... */ 6412 /* for future expandability... */
5831 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) 6413 if (flags & ~PERF_FLAG_ALL)
5832 return -EINVAL; 6414 return -EINVAL;
5833 6415
5834 err = perf_copy_attr(attr_uptr, &attr); 6416 err = perf_copy_attr(attr_uptr, &attr);
@@ -5845,6 +6427,15 @@ SYSCALL_DEFINE5(perf_event_open,
5845 return -EINVAL; 6427 return -EINVAL;
5846 } 6428 }
5847 6429
6430 /*
6431 * In cgroup mode, the pid argument is used to pass the fd
6432 * opened to the cgroup directory in cgroupfs. The cpu argument
6433 * designates the cpu on which to monitor threads from that
6434 * cgroup.
6435 */
6436 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6437 return -EINVAL;
6438
5848 event_fd = get_unused_fd_flags(O_RDWR); 6439 event_fd = get_unused_fd_flags(O_RDWR);
5849 if (event_fd < 0) 6440 if (event_fd < 0)
5850 return event_fd; 6441 return event_fd;
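
Note: in cgroup mode the pid argument of perf_event_open() carries an open fd of the cgroup directory, and cpu must name a real CPU, per the check above. A hedged userspace sketch (assumes headers that define PERF_FLAG_PID_CGROUP; the helper name is made up):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Hypothetical helper: count CPU clock for one cgroup on one CPU. */
static int open_cgroup_counter(const char *cgrp_dir, int cpu)
{
	struct perf_event_attr attr;
	int cgrp_fd, ev_fd;

	cgrp_fd = open(cgrp_dir, O_RDONLY);	/* fd of the cgroup directory */
	if (cgrp_fd < 0)
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;

	/* pid = cgroup fd, cpu != -1, as required by the new check */
	ev_fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, cpu,
			-1 /* group_fd */, PERF_FLAG_PID_CGROUP);
	close(cgrp_fd);
	return ev_fd;
}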
@@ -5862,7 +6453,7 @@ SYSCALL_DEFINE5(perf_event_open,
5862 group_leader = NULL; 6453 group_leader = NULL;
5863 } 6454 }
5864 6455
5865 if (pid != -1) { 6456 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
5866 task = find_lively_task_by_vpid(pid); 6457 task = find_lively_task_by_vpid(pid);
5867 if (IS_ERR(task)) { 6458 if (IS_ERR(task)) {
5868 err = PTR_ERR(task); 6459 err = PTR_ERR(task);
@@ -5876,6 +6467,19 @@ SYSCALL_DEFINE5(perf_event_open,
5876 goto err_task; 6467 goto err_task;
5877 } 6468 }
5878 6469
6470 if (flags & PERF_FLAG_PID_CGROUP) {
6471 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6472 if (err)
6473 goto err_alloc;
6474 /*
6475 * account one more event:
6476 * - it has a cgroup constraint on event->cpu
6477 * - it may need work on context switch
6478 */
6479 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6480 jump_label_inc(&perf_sched_events);
6481 }
6482
5879 /* 6483 /*
5880 * Special case software events and allow them to be part of 6484 * Special case software events and allow them to be part of
5881 * any hardware group. 6485 * any hardware group.
@@ -5961,10 +6565,10 @@ SYSCALL_DEFINE5(perf_event_open,
5961 struct perf_event_context *gctx = group_leader->ctx; 6565 struct perf_event_context *gctx = group_leader->ctx;
5962 6566
5963 mutex_lock(&gctx->mutex); 6567 mutex_lock(&gctx->mutex);
5964 perf_event_remove_from_context(group_leader); 6568 perf_remove_from_context(group_leader);
5965 list_for_each_entry(sibling, &group_leader->sibling_list, 6569 list_for_each_entry(sibling, &group_leader->sibling_list,
5966 group_entry) { 6570 group_entry) {
5967 perf_event_remove_from_context(sibling); 6571 perf_remove_from_context(sibling);
5968 put_ctx(gctx); 6572 put_ctx(gctx);
5969 } 6573 }
5970 mutex_unlock(&gctx->mutex); 6574 mutex_unlock(&gctx->mutex);
@@ -5987,6 +6591,7 @@ SYSCALL_DEFINE5(perf_event_open,
5987 6591
5988 perf_install_in_context(ctx, event, cpu); 6592 perf_install_in_context(ctx, event, cpu);
5989 ++ctx->generation; 6593 ++ctx->generation;
6594 perf_unpin_context(ctx);
5990 mutex_unlock(&ctx->mutex); 6595 mutex_unlock(&ctx->mutex);
5991 6596
5992 event->owner = current; 6597 event->owner = current;
@@ -6012,6 +6617,7 @@ SYSCALL_DEFINE5(perf_event_open,
6012 return event_fd; 6617 return event_fd;
6013 6618
6014err_context: 6619err_context:
6620 perf_unpin_context(ctx);
6015 put_ctx(ctx); 6621 put_ctx(ctx);
6016err_alloc: 6622err_alloc:
6017 free_event(event); 6623 free_event(event);
@@ -6062,6 +6668,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6062 mutex_lock(&ctx->mutex); 6668 mutex_lock(&ctx->mutex);
6063 perf_install_in_context(ctx, event, cpu); 6669 perf_install_in_context(ctx, event, cpu);
6064 ++ctx->generation; 6670 ++ctx->generation;
6671 perf_unpin_context(ctx);
6065 mutex_unlock(&ctx->mutex); 6672 mutex_unlock(&ctx->mutex);
6066 6673
6067 return event; 6674 return event;
@@ -6115,7 +6722,7 @@ __perf_event_exit_task(struct perf_event *child_event,
6115{ 6722{
6116 struct perf_event *parent_event; 6723 struct perf_event *parent_event;
6117 6724
6118 perf_event_remove_from_context(child_event); 6725 perf_remove_from_context(child_event);
6119 6726
6120 parent_event = child_event->parent; 6727 parent_event = child_event->parent;
6121 /* 6728 /*
@@ -6422,7 +7029,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
6422 return 0; 7029 return 0;
6423 } 7030 }
6424 7031
6425 child_ctx = child->perf_event_ctxp[ctxn]; 7032 child_ctx = child->perf_event_ctxp[ctxn];
6426 if (!child_ctx) { 7033 if (!child_ctx) {
6427 /* 7034 /*
6428 * This is executed from the parent task context, so 7035 * This is executed from the parent task context, so
@@ -6537,6 +7144,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6537 mutex_unlock(&parent_ctx->mutex); 7144 mutex_unlock(&parent_ctx->mutex);
6538 7145
6539 perf_unpin_context(parent_ctx); 7146 perf_unpin_context(parent_ctx);
7147 put_ctx(parent_ctx);
6540 7148
6541 return ret; 7149 return ret;
6542} 7150}
@@ -6606,9 +7214,9 @@ static void __perf_event_exit_context(void *__info)
6606 perf_pmu_rotate_stop(ctx->pmu); 7214 perf_pmu_rotate_stop(ctx->pmu);
6607 7215
6608 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 7216 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
6609 __perf_event_remove_from_context(event); 7217 __perf_remove_from_context(event);
6610 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 7218 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
6611 __perf_event_remove_from_context(event); 7219 __perf_remove_from_context(event);
6612} 7220}
6613 7221
6614static void perf_event_exit_cpu_context(int cpu) 7222static void perf_event_exit_cpu_context(int cpu)
@@ -6732,3 +7340,83 @@ unlock:
6732 return ret; 7340 return ret;
6733} 7341}
6734device_initcall(perf_event_sysfs_init); 7342device_initcall(perf_event_sysfs_init);
7343
7344#ifdef CONFIG_CGROUP_PERF
7345static struct cgroup_subsys_state *perf_cgroup_create(
7346 struct cgroup_subsys *ss, struct cgroup *cont)
7347{
7348 struct perf_cgroup *jc;
7349
7350 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
7351 if (!jc)
7352 return ERR_PTR(-ENOMEM);
7353
7354 jc->info = alloc_percpu(struct perf_cgroup_info);
7355 if (!jc->info) {
7356 kfree(jc);
7357 return ERR_PTR(-ENOMEM);
7358 }
7359
7360 return &jc->css;
7361}
7362
7363static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7364 struct cgroup *cont)
7365{
7366 struct perf_cgroup *jc;
7367 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7368 struct perf_cgroup, css);
7369 free_percpu(jc->info);
7370 kfree(jc);
7371}
7372
7373static int __perf_cgroup_move(void *info)
7374{
7375 struct task_struct *task = info;
7376 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7377 return 0;
7378}
7379
7380static void perf_cgroup_move(struct task_struct *task)
7381{
7382 task_function_call(task, __perf_cgroup_move, task);
7383}
7384
7385static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7386 struct cgroup *old_cgrp, struct task_struct *task,
7387 bool threadgroup)
7388{
7389 perf_cgroup_move(task);
7390 if (threadgroup) {
7391 struct task_struct *c;
7392 rcu_read_lock();
7393 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7394 perf_cgroup_move(c);
7395 }
7396 rcu_read_unlock();
7397 }
7398}
7399
7400static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7401 struct cgroup *old_cgrp, struct task_struct *task)
7402{
7403 /*
7404 * cgroup_exit() is called in the copy_process() failure path.
7405 * Ignore this case since the task hasn't run yet; this avoids
7406 * trying to poke a half-freed task state from generic code.
7407 */
7408 if (!(task->flags & PF_EXITING))
7409 return;
7410
7411 perf_cgroup_move(task);
7412}
7413
7414struct cgroup_subsys perf_subsys = {
7415 .name = "perf_event",
7416 .subsys_id = perf_subsys_id,
7417 .create = perf_cgroup_create,
7418 .destroy = perf_cgroup_destroy,
7419 .exit = perf_cgroup_exit,
7420 .attach = perf_cgroup_attach,
7421};
7422#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 05bb7173850..67fea9d25d5 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p)
176 return p->utime; 176 return p->utime;
177} 177}
178 178
179int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 179static int
180posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
180{ 181{
181 int error = check_clock(which_clock); 182 int error = check_clock(which_clock);
182 if (!error) { 183 if (!error) {
@@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
194 return error; 195 return error;
195} 196}
196 197
197int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) 198static int
199posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
198{ 200{
199 /* 201 /*
200 * You can never reset a CPU clock, but we check for other errors 202 * You can never reset a CPU clock, but we check for other errors
@@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
317} 319}
318 320
319 321
320int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 322static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
321{ 323{
322 const pid_t pid = CPUCLOCK_PID(which_clock); 324 const pid_t pid = CPUCLOCK_PID(which_clock);
323 int error = -EINVAL; 325 int error = -EINVAL;
@@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
379 * This is called from sys_timer_create() and do_cpu_nanosleep() with the 381 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
380 * new timer already all-zeros initialized. 382 * new timer already all-zeros initialized.
381 */ 383 */
382int posix_cpu_timer_create(struct k_itimer *new_timer) 384static int posix_cpu_timer_create(struct k_itimer *new_timer)
383{ 385{
384 int ret = 0; 386 int ret = 0;
385 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); 387 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
@@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
425 * If we return TIMER_RETRY, it's necessary to release the timer's lock 427 * If we return TIMER_RETRY, it's necessary to release the timer's lock
426 * and try again. (This happens when the timer is in the middle of firing.) 428 * and try again. (This happens when the timer is in the middle of firing.)
427 */ 429 */
428int posix_cpu_timer_del(struct k_itimer *timer) 430static int posix_cpu_timer_del(struct k_itimer *timer)
429{ 431{
430 struct task_struct *p = timer->it.cpu.task; 432 struct task_struct *p = timer->it.cpu.task;
431 int ret = 0; 433 int ret = 0;
@@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
665 * If we return TIMER_RETRY, it's necessary to release the timer's lock 667 * If we return TIMER_RETRY, it's necessary to release the timer's lock
666 * and try again. (This happens when the timer is in the middle of firing.) 668 * and try again. (This happens when the timer is in the middle of firing.)
667 */ 669 */
668int posix_cpu_timer_set(struct k_itimer *timer, int flags, 670static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
669 struct itimerspec *new, struct itimerspec *old) 671 struct itimerspec *new, struct itimerspec *old)
670{ 672{
671 struct task_struct *p = timer->it.cpu.task; 673 struct task_struct *p = timer->it.cpu.task;
672 union cpu_time_count old_expires, new_expires, old_incr, val; 674 union cpu_time_count old_expires, new_expires, old_incr, val;
@@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
820 return ret; 822 return ret;
821} 823}
822 824
823void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) 825static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
824{ 826{
825 union cpu_time_count now; 827 union cpu_time_count now;
826 struct task_struct *p = timer->it.cpu.task; 828 struct task_struct *p = timer->it.cpu.task;
@@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1481 return error; 1483 return error;
1482} 1484}
1483 1485
1484int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1486static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1485 struct timespec *rqtp, struct timespec __user *rmtp) 1487
1488static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1489 struct timespec *rqtp, struct timespec __user *rmtp)
1486{ 1490{
1487 struct restart_block *restart_block = 1491 struct restart_block *restart_block =
1488 &current_thread_info()->restart_block; 1492 &current_thread_info()->restart_block;
1489 struct itimerspec it; 1493 struct itimerspec it;
1490 int error; 1494 int error;
1491 1495
@@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1501 1505
1502 if (error == -ERESTART_RESTARTBLOCK) { 1506 if (error == -ERESTART_RESTARTBLOCK) {
1503 1507
1504 if (flags & TIMER_ABSTIME) 1508 if (flags & TIMER_ABSTIME)
1505 return -ERESTARTNOHAND; 1509 return -ERESTARTNOHAND;
1506 /* 1510 /*
1507 * Report back to the user the time still remaining. 1511 * Report back to the user the time still remaining.
1508 */ 1512 */
1509 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1513 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1510 return -EFAULT; 1514 return -EFAULT;
1511 1515
1512 restart_block->fn = posix_cpu_nsleep_restart; 1516 restart_block->fn = posix_cpu_nsleep_restart;
1513 restart_block->arg0 = which_clock; 1517 restart_block->nanosleep.index = which_clock;
1514 restart_block->arg1 = (unsigned long) rmtp; 1518 restart_block->nanosleep.rmtp = rmtp;
1515 restart_block->arg2 = rqtp->tv_sec; 1519 restart_block->nanosleep.expires = timespec_to_ns(rqtp);
1516 restart_block->arg3 = rqtp->tv_nsec;
1517 } 1520 }
1518 return error; 1521 return error;
1519} 1522}
1520 1523
1521long posix_cpu_nsleep_restart(struct restart_block *restart_block) 1524static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1522{ 1525{
1523 clockid_t which_clock = restart_block->arg0; 1526 clockid_t which_clock = restart_block->nanosleep.index;
1524 struct timespec __user *rmtp;
1525 struct timespec t; 1527 struct timespec t;
1526 struct itimerspec it; 1528 struct itimerspec it;
1527 int error; 1529 int error;
1528 1530
1529 rmtp = (struct timespec __user *) restart_block->arg1; 1531 t = ns_to_timespec(restart_block->nanosleep.expires);
1530 t.tv_sec = restart_block->arg2;
1531 t.tv_nsec = restart_block->arg3;
1532 1532
1533 restart_block->fn = do_no_restart_syscall;
1534 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); 1533 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1535 1534
1536 if (error == -ERESTART_RESTARTBLOCK) { 1535 if (error == -ERESTART_RESTARTBLOCK) {
1536 struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
1537 /* 1537 /*
1538 * Report back to the user the time still remaining. 1538 * Report back to the user the time still remaining.
1539 */ 1539 */
1540 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1540 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1541 return -EFAULT; 1541 return -EFAULT;
1542 1542
1543 restart_block->fn = posix_cpu_nsleep_restart; 1543 restart_block->nanosleep.expires = timespec_to_ns(&t);
1544 restart_block->arg0 = which_clock;
1545 restart_block->arg1 = (unsigned long) rmtp;
1546 restart_block->arg2 = t.tv_sec;
1547 restart_block->arg3 = t.tv_nsec;
1548 } 1544 }
1549 return error; 1545 return error;
1550 1546
1551} 1547}
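
Note: the rewritten restart path above packs the whole request into restart_block->nanosleep.expires as a single s64 via timespec_to_ns(), replacing the old arg2/arg3 pair. A minimal sketch of the round-trip (timespec_to_ns()/ns_to_timespec() are the <linux/time.h> helpers):

#include <linux/kernel.h>
#include <linux/time.h>

static void nanosleep_pack_roundtrip(void)
{
	struct timespec req = { .tv_sec = 1, .tv_nsec = 500000000 };
	s64 ns = timespec_to_ns(&req);			/* 1500000000 */
	struct timespec back = ns_to_timespec(ns);	/* {1, 500000000} */

	WARN_ON(back.tv_sec != req.tv_sec || back.tv_nsec != req.tv_nsec);
}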
1552 1548
1553
1554#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) 1549#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
1555#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) 1550#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1556 1551
@@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
1594 timer->it_clock = THREAD_CLOCK; 1589 timer->it_clock = THREAD_CLOCK;
1595 return posix_cpu_timer_create(timer); 1590 return posix_cpu_timer_create(timer);
1596} 1591}
1597static int thread_cpu_nsleep(const clockid_t which_clock, int flags, 1592
1598 struct timespec *rqtp, struct timespec __user *rmtp) 1593struct k_clock clock_posix_cpu = {
1599{ 1594 .clock_getres = posix_cpu_clock_getres,
1600 return -EINVAL; 1595 .clock_set = posix_cpu_clock_set,
1601} 1596 .clock_get = posix_cpu_clock_get,
1602static long thread_cpu_nsleep_restart(struct restart_block *restart_block) 1597 .timer_create = posix_cpu_timer_create,
1603{ 1598 .nsleep = posix_cpu_nsleep,
1604 return -EINVAL; 1599 .nsleep_restart = posix_cpu_nsleep_restart,
1605} 1600 .timer_set = posix_cpu_timer_set,
1601 .timer_del = posix_cpu_timer_del,
1602 .timer_get = posix_cpu_timer_get,
1603};
1606 1604
1607static __init int init_posix_cpu_timers(void) 1605static __init int init_posix_cpu_timers(void)
1608{ 1606{
1609 struct k_clock process = { 1607 struct k_clock process = {
1610 .clock_getres = process_cpu_clock_getres, 1608 .clock_getres = process_cpu_clock_getres,
1611 .clock_get = process_cpu_clock_get, 1609 .clock_get = process_cpu_clock_get,
1612 .clock_set = do_posix_clock_nosettime, 1610 .timer_create = process_cpu_timer_create,
1613 .timer_create = process_cpu_timer_create, 1611 .nsleep = process_cpu_nsleep,
1614 .nsleep = process_cpu_nsleep, 1612 .nsleep_restart = process_cpu_nsleep_restart,
1615 .nsleep_restart = process_cpu_nsleep_restart,
1616 }; 1613 };
1617 struct k_clock thread = { 1614 struct k_clock thread = {
1618 .clock_getres = thread_cpu_clock_getres, 1615 .clock_getres = thread_cpu_clock_getres,
1619 .clock_get = thread_cpu_clock_get, 1616 .clock_get = thread_cpu_clock_get,
1620 .clock_set = do_posix_clock_nosettime, 1617 .timer_create = thread_cpu_timer_create,
1621 .timer_create = thread_cpu_timer_create,
1622 .nsleep = thread_cpu_nsleep,
1623 .nsleep_restart = thread_cpu_nsleep_restart,
1624 }; 1618 };
1625 struct timespec ts; 1619 struct timespec ts;
1626 1620
1627 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1621 posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1628 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); 1622 posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1629 1623
1630 cputime_to_timespec(cputime_one_jiffy, &ts); 1624 cputime_to_timespec(cputime_one_jiffy, &ts);
1631 onecputick = ts.tv_nsec; 1625 onecputick = ts.tv_nsec;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 93bd2eb2bc5..4c0124919f9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -41,6 +41,7 @@
41#include <linux/init.h> 41#include <linux/init.h>
42#include <linux/compiler.h> 42#include <linux/compiler.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/posix-clock.h>
44#include <linux/posix-timers.h> 45#include <linux/posix-timers.h>
45#include <linux/syscalls.h> 46#include <linux/syscalls.h>
46#include <linux/wait.h> 47#include <linux/wait.h>
@@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock);
81#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" 82#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
82#endif 83#endif
83 84
85/*
86 * parisc wants ENOTSUP instead of EOPNOTSUPP
87 */
88#ifndef ENOTSUP
89# define ENANOSLEEP_NOTSUP EOPNOTSUPP
90#else
91# define ENANOSLEEP_NOTSUP ENOTSUP
92#endif
84 93
85/* 94/*
86 * The timer ID is turned into a timer address by idr_find(). 95 * The timer ID is turned into a timer address by idr_find().
@@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock);
94/* 103/*
95 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us 104 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
96 * to implement others. This structure defines the various 105 * to implement others. This structure defines the various
97 * clocks and allows the possibility of adding others. We 106 * clocks.
98 * provide an interface to add clocks to the table and expect
99 * the "arch" code to add at least one clock that is high
100 * resolution. Here we define the standard CLOCK_REALTIME as a
101 * 1/HZ resolution clock.
102 * 107 *
103 * RESOLUTION: Clock resolution is used to round up timer and interval 108 * RESOLUTION: Clock resolution is used to round up timer and interval
104 * times, NOT to report clock times, which are reported with as 109 * times, NOT to report clock times, which are reported with as
@@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock);
108 * necessary code is written. The standard says we should say 113 * necessary code is written. The standard says we should say
109 * something about this issue in the documentation... 114 * something about this issue in the documentation...
110 * 115 *
111 * FUNCTIONS: The CLOCKs structure defines possible functions to handle 116 * FUNCTIONS: The CLOCKs structure defines possible functions to
112 * various clock functions. For clocks that use the standard 117 * handle various clock functions.
113 * system timer code these entries should be NULL. This will
114 * allow dispatch without the overhead of indirect function
115 * calls. CLOCKS that depend on other sources (e.g. WWV or GPS)
116 * must supply functions here, even if the function just returns
117 * ENOSYS. The standard POSIX timer management code assumes the
118 * following: 1.) The k_itimer struct (sched.h) is used for the
119 * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
120 * fields are not modified by timer code.
121 * 118 *
122 * At this time all functions EXCEPT clock_nanosleep can be 119 * The standard POSIX timer management code assumes the
123 * redirected by the CLOCKS structure. Clock_nanosleep is in 120 * following: 1.) The k_itimer struct (sched.h) is used for
124 * there, but the code ignores it. 121 * the timer. 2.) The list, it_lock, it_clock, it_id and
122 * it_pid fields are not modified by timer code.
125 * 123 *
126 * Permissions: It is assumed that the clock_settime() function defined 124 * Permissions: It is assumed that the clock_settime() function defined
127 * for each clock will take care of permission checks. Some 125 * for each clock will take care of permission checks. Some
@@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
138 */ 136 */
139static int common_nsleep(const clockid_t, int flags, struct timespec *t, 137static int common_nsleep(const clockid_t, int flags, struct timespec *t,
140 struct timespec __user *rmtp); 138 struct timespec __user *rmtp);
139static int common_timer_create(struct k_itimer *new_timer);
141static void common_timer_get(struct k_itimer *, struct itimerspec *); 140static void common_timer_get(struct k_itimer *, struct itimerspec *);
142static int common_timer_set(struct k_itimer *, int, 141static int common_timer_set(struct k_itimer *, int,
143 struct itimerspec *, struct itimerspec *); 142 struct itimerspec *, struct itimerspec *);
@@ -158,76 +157,24 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
158 spin_unlock_irqrestore(&timr->it_lock, flags); 157 spin_unlock_irqrestore(&timr->it_lock, flags);
159} 158}
160 159
161/* 160/* Get clock_realtime */
162 * Call the k_clock hook function if non-null, or the default function. 161static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
163 */
164#define CLOCK_DISPATCH(clock, call, arglist) \
165 ((clock) < 0 ? posix_cpu_##call arglist : \
166 (posix_clocks[clock].call != NULL \
167 ? (*posix_clocks[clock].call) arglist : common_##call arglist))
168
169/*
170 * Default clock hook functions when the struct k_clock passed
171 * to register_posix_clock leaves a function pointer null.
172 *
173 * The function common_CALL is the default implementation for
174 * the function pointer CALL in struct k_clock.
175 */
176
177static inline int common_clock_getres(const clockid_t which_clock,
178 struct timespec *tp)
179{
180 tp->tv_sec = 0;
181 tp->tv_nsec = posix_clocks[which_clock].res;
182 return 0;
183}
184
185/*
186 * Get real time for posix timers
187 */
188static int common_clock_get(clockid_t which_clock, struct timespec *tp)
189{ 162{
190 ktime_get_real_ts(tp); 163 ktime_get_real_ts(tp);
191 return 0; 164 return 0;
192} 165}
193 166
194static inline int common_clock_set(const clockid_t which_clock, 167/* Set clock_realtime */
195 struct timespec *tp) 168static int posix_clock_realtime_set(const clockid_t which_clock,
169 const struct timespec *tp)
196{ 170{
197 return do_sys_settimeofday(tp, NULL); 171 return do_sys_settimeofday(tp, NULL);
198} 172}
199 173
200static int common_timer_create(struct k_itimer *new_timer) 174static int posix_clock_realtime_adj(const clockid_t which_clock,
201{ 175 struct timex *t)
202 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
203 return 0;
204}
205
206static int no_timer_create(struct k_itimer *new_timer)
207{
208 return -EOPNOTSUPP;
209}
210
211static int no_nsleep(const clockid_t which_clock, int flags,
212 struct timespec *tsave, struct timespec __user *rmtp)
213{
214 return -EOPNOTSUPP;
215}
216
217/*
218 * Return nonzero if we know a priori this clockid_t value is bogus.
219 */
220static inline int invalid_clockid(const clockid_t which_clock)
221{ 176{
222 if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ 177 return do_adjtimex(t);
223 return 0;
224 if ((unsigned) which_clock >= MAX_CLOCKS)
225 return 1;
226 if (posix_clocks[which_clock].clock_getres != NULL)
227 return 0;
228 if (posix_clocks[which_clock].res != 0)
229 return 0;
230 return 1;
231} 178}
232 179
233/* 180/*
@@ -240,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
240} 187}
241 188
242/* 189/*
243 * Get monotonic time for posix timers 190 * Get monotonic-raw time for posix timers
244 */ 191 */
245static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) 192static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
246{ 193{
@@ -267,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp
267 *tp = ktime_to_timespec(KTIME_LOW_RES); 214 *tp = ktime_to_timespec(KTIME_LOW_RES);
268 return 0; 215 return 0;
269} 216}
217
218static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
219{
220 get_monotonic_boottime(tp);
221 return 0;
222}
223
224
270/* 225/*
271 * Initialize everything, well, just everything in Posix clocks/timers ;) 226 * Initialize everything, well, just everything in Posix clocks/timers ;)
272 */ 227 */
273static __init int init_posix_timers(void) 228static __init int init_posix_timers(void)
274{ 229{
275 struct k_clock clock_realtime = { 230 struct k_clock clock_realtime = {
276 .clock_getres = hrtimer_get_res, 231 .clock_getres = hrtimer_get_res,
232 .clock_get = posix_clock_realtime_get,
233 .clock_set = posix_clock_realtime_set,
234 .clock_adj = posix_clock_realtime_adj,
235 .nsleep = common_nsleep,
236 .nsleep_restart = hrtimer_nanosleep_restart,
237 .timer_create = common_timer_create,
238 .timer_set = common_timer_set,
239 .timer_get = common_timer_get,
240 .timer_del = common_timer_del,
277 }; 241 };
278 struct k_clock clock_monotonic = { 242 struct k_clock clock_monotonic = {
279 .clock_getres = hrtimer_get_res, 243 .clock_getres = hrtimer_get_res,
280 .clock_get = posix_ktime_get_ts, 244 .clock_get = posix_ktime_get_ts,
281 .clock_set = do_posix_clock_nosettime, 245 .nsleep = common_nsleep,
246 .nsleep_restart = hrtimer_nanosleep_restart,
247 .timer_create = common_timer_create,
248 .timer_set = common_timer_set,
249 .timer_get = common_timer_get,
250 .timer_del = common_timer_del,
282 }; 251 };
283 struct k_clock clock_monotonic_raw = { 252 struct k_clock clock_monotonic_raw = {
284 .clock_getres = hrtimer_get_res, 253 .clock_getres = hrtimer_get_res,
285 .clock_get = posix_get_monotonic_raw, 254 .clock_get = posix_get_monotonic_raw,
286 .clock_set = do_posix_clock_nosettime,
287 .timer_create = no_timer_create,
288 .nsleep = no_nsleep,
289 }; 255 };
290 struct k_clock clock_realtime_coarse = { 256 struct k_clock clock_realtime_coarse = {
291 .clock_getres = posix_get_coarse_res, 257 .clock_getres = posix_get_coarse_res,
292 .clock_get = posix_get_realtime_coarse, 258 .clock_get = posix_get_realtime_coarse,
293 .clock_set = do_posix_clock_nosettime,
294 .timer_create = no_timer_create,
295 .nsleep = no_nsleep,
296 }; 259 };
297 struct k_clock clock_monotonic_coarse = { 260 struct k_clock clock_monotonic_coarse = {
298 .clock_getres = posix_get_coarse_res, 261 .clock_getres = posix_get_coarse_res,
299 .clock_get = posix_get_monotonic_coarse, 262 .clock_get = posix_get_monotonic_coarse,
300 .clock_set = do_posix_clock_nosettime, 263 };
301 .timer_create = no_timer_create, 264 struct k_clock clock_boottime = {
302 .nsleep = no_nsleep, 265 .clock_getres = hrtimer_get_res,
266 .clock_get = posix_get_boottime,
267 .nsleep = common_nsleep,
268 .nsleep_restart = hrtimer_nanosleep_restart,
269 .timer_create = common_timer_create,
270 .timer_set = common_timer_set,
271 .timer_get = common_timer_get,
272 .timer_del = common_timer_del,
303 }; 273 };
304 274
305 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 275 posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
306 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 276 posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
307 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); 277 posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
308 register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); 278 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
309 register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); 279 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
280 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
310 281
311 posix_timers_cache = kmem_cache_create("posix_timers_cache", 282 posix_timers_cache = kmem_cache_create("posix_timers_cache",
312 sizeof (struct k_itimer), 0, SLAB_PANIC, 283 sizeof (struct k_itimer), 0, SLAB_PANIC,
@@ -482,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event)
482 return task_pid(rtn); 453 return task_pid(rtn);
483} 454}
484 455
485void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) 456void posix_timers_register_clock(const clockid_t clock_id,
457 struct k_clock *new_clock)
486{ 458{
487 if ((unsigned) clock_id >= MAX_CLOCKS) { 459 if ((unsigned) clock_id >= MAX_CLOCKS) {
488 printk("POSIX clock register failed for clock_id %d\n", 460 printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
461 clock_id);
462 return;
463 }
464
465 if (!new_clock->clock_get) {
466 printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
467 clock_id);
468 return;
469 }
470 if (!new_clock->clock_getres) {
471 printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
489 clock_id); 472 clock_id);
490 return; 473 return;
491 } 474 }
492 475
493 posix_clocks[clock_id] = *new_clock; 476 posix_clocks[clock_id] = *new_clock;
494} 477}
495EXPORT_SYMBOL_GPL(register_posix_clock); 478EXPORT_SYMBOL_GPL(posix_timers_register_clock);
496 479
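
Note: with the checks added above, a k_clock must supply clock_get() and clock_getres() before registration succeeds; every other op may stay NULL and is rejected per syscall instead. A hypothetical registration sketch (the my_* names are placeholders, not kernel symbols):

#include <linux/posix-timers.h>
#include <linux/time.h>

static int my_clock_get(const clockid_t id, struct timespec *tp)
{
	ktime_get_ts(tp);		/* stand-in time source */
	return 0;
}

static int my_clock_getres(const clockid_t id, struct timespec *tp)
{
	tp->tv_sec = 0;
	tp->tv_nsec = NSEC_PER_SEC / HZ;
	return 0;
}

static struct k_clock my_clock = {
	.clock_get	= my_clock_get,		/* required */
	.clock_getres	= my_clock_getres,	/* required */
	/* timer_create, nsleep, ... may be left NULL */
};

/* posix_timers_register_clock(MY_CLOCKID, &my_clock); */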
497static struct k_itimer * alloc_posix_timer(void) 480static struct k_itimer * alloc_posix_timer(void)
498{ 481{
@@ -523,19 +506,39 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
523 kmem_cache_free(posix_timers_cache, tmr); 506 kmem_cache_free(posix_timers_cache, tmr);
524} 507}
525 508
509static struct k_clock *clockid_to_kclock(const clockid_t id)
510{
511 if (id < 0)
512 return (id & CLOCKFD_MASK) == CLOCKFD ?
513 &clock_posix_dynamic : &clock_posix_cpu;
514
515 if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
516 return NULL;
517 return &posix_clocks[id];
518}
519
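
Note: clockid_to_kclock() above keys off the sign of the id. Negative values encode either a CPU clock or, when the low bits equal CLOCKFD, a dynamic posix-clock fd. A sketch of the encoding (constants assumed to match posix-timers.h of this series):

#define CLOCKFD		3		/* assumption: CPUCLOCK_MAX */
#define CLOCKFD_MASK	(4 | 3)		/* CPUCLOCK_PERTHREAD_MASK | CPUCLOCK_CLOCK_MASK */

static inline int clockid_is_dynamic(clockid_t id)
{
	return id < 0 && (id & CLOCKFD_MASK) == CLOCKFD;
}

static inline clockid_t fd_to_clockid(int fd)	/* cf. FD_TO_CLOCKID() */
{
	return (~(clockid_t)fd << 3) | CLOCKFD;
}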
520static int common_timer_create(struct k_itimer *new_timer)
521{
522 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
523 return 0;
524}
525
526/* Create a POSIX.1b interval timer. */ 526/* Create a POSIX.1b interval timer. */
527 527
528SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, 528SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
529 struct sigevent __user *, timer_event_spec, 529 struct sigevent __user *, timer_event_spec,
530 timer_t __user *, created_timer_id) 530 timer_t __user *, created_timer_id)
531{ 531{
532 struct k_clock *kc = clockid_to_kclock(which_clock);
532 struct k_itimer *new_timer; 533 struct k_itimer *new_timer;
533 int error, new_timer_id; 534 int error, new_timer_id;
534 sigevent_t event; 535 sigevent_t event;
535 int it_id_set = IT_ID_NOT_SET; 536 int it_id_set = IT_ID_NOT_SET;
536 537
537 if (invalid_clockid(which_clock)) 538 if (!kc)
538 return -EINVAL; 539 return -EINVAL;
540 if (!kc->timer_create)
541 return -EOPNOTSUPP;
539 542
540 new_timer = alloc_posix_timer(); 543 new_timer = alloc_posix_timer();
541 if (unlikely(!new_timer)) 544 if (unlikely(!new_timer))
@@ -597,7 +600,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
597 goto out; 600 goto out;
598 } 601 }
599 602
600 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); 603 error = kc->timer_create(new_timer);
601 if (error) 604 if (error)
602 goto out; 605 goto out;
603 606
@@ -607,7 +610,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
607 spin_unlock_irq(&current->sighand->siglock); 610 spin_unlock_irq(&current->sighand->siglock);
608 611
609 return 0; 612 return 0;
610 /* 613 /*
611 * In the case of the timer belonging to another task, after 614 * In the case of the timer belonging to another task, after
612 * the task is unlocked, the timer is owned by the other task 615 * the task is unlocked, the timer is owned by the other task
613 * and may cease to exist at any time. Don't use or modify 616 * and may cease to exist at any time. Don't use or modify
@@ -709,22 +712,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
709SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, 712SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
710 struct itimerspec __user *, setting) 713 struct itimerspec __user *, setting)
711{ 714{
712 struct k_itimer *timr;
713 struct itimerspec cur_setting; 715 struct itimerspec cur_setting;
716 struct k_itimer *timr;
717 struct k_clock *kc;
714 unsigned long flags; 718 unsigned long flags;
719 int ret = 0;
715 720
716 timr = lock_timer(timer_id, &flags); 721 timr = lock_timer(timer_id, &flags);
717 if (!timr) 722 if (!timr)
718 return -EINVAL; 723 return -EINVAL;
719 724
720 CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); 725 kc = clockid_to_kclock(timr->it_clock);
726 if (WARN_ON_ONCE(!kc || !kc->timer_get))
727 ret = -EINVAL;
728 else
729 kc->timer_get(timr, &cur_setting);
721 730
722 unlock_timer(timr, flags); 731 unlock_timer(timr, flags);
723 732
724 if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) 733 if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
725 return -EFAULT; 734 return -EFAULT;
726 735
727 return 0; 736 return ret;
728} 737}
729 738
730/* 739/*
@@ -813,6 +822,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
813 int error = 0; 822 int error = 0;
814 unsigned long flag; 823 unsigned long flag;
815 struct itimerspec *rtn = old_setting ? &old_spec : NULL; 824 struct itimerspec *rtn = old_setting ? &old_spec : NULL;
825 struct k_clock *kc;
816 826
817 if (!new_setting) 827 if (!new_setting)
818 return -EINVAL; 828 return -EINVAL;
@@ -828,8 +838,11 @@ retry:
828 if (!timr) 838 if (!timr)
829 return -EINVAL; 839 return -EINVAL;
830 840
831 error = CLOCK_DISPATCH(timr->it_clock, timer_set, 841 kc = clockid_to_kclock(timr->it_clock);
832 (timr, flags, &new_spec, rtn)); 842 if (WARN_ON_ONCE(!kc || !kc->timer_set))
843 error = -EINVAL;
844 else
845 error = kc->timer_set(timr, flags, &new_spec, rtn);
833 846
834 unlock_timer(timr, flag); 847 unlock_timer(timr, flag);
835 if (error == TIMER_RETRY) { 848 if (error == TIMER_RETRY) {
@@ -844,7 +857,7 @@ retry:
844 return error; 857 return error;
845} 858}
846 859
847static inline int common_timer_del(struct k_itimer *timer) 860static int common_timer_del(struct k_itimer *timer)
848{ 861{
849 timer->it.real.interval.tv64 = 0; 862 timer->it.real.interval.tv64 = 0;
850 863
@@ -855,7 +868,11 @@ static inline int common_timer_del(struct k_itimer *timer)
855 868
856static inline int timer_delete_hook(struct k_itimer *timer) 869static inline int timer_delete_hook(struct k_itimer *timer)
857{ 870{
858 return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); 871 struct k_clock *kc = clockid_to_kclock(timer->it_clock);
872
873 if (WARN_ON_ONCE(!kc || !kc->timer_del))
874 return -EINVAL;
875 return kc->timer_del(timer);
859} 876}
860 877
861/* Delete a POSIX.1b interval timer. */ 878/* Delete a POSIX.1b interval timer. */
@@ -927,69 +944,76 @@ void exit_itimers(struct signal_struct *sig)
927 } 944 }
928} 945}
929 946
930/* Not available / possible... functions */
931int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
932{
933 return -EINVAL;
934}
935EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
936
937int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
938 struct timespec *t, struct timespec __user *r)
939{
940#ifndef ENOTSUP
941 return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */
942#else /* parisc does define it separately. */
943 return -ENOTSUP;
944#endif
945}
946EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
947
948SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, 947SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
949 const struct timespec __user *, tp) 948 const struct timespec __user *, tp)
950{ 949{
950 struct k_clock *kc = clockid_to_kclock(which_clock);
951 struct timespec new_tp; 951 struct timespec new_tp;
952 952
953 if (invalid_clockid(which_clock)) 953 if (!kc || !kc->clock_set)
954 return -EINVAL; 954 return -EINVAL;
955
955 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 956 if (copy_from_user(&new_tp, tp, sizeof (*tp)))
956 return -EFAULT; 957 return -EFAULT;
957 958
958 return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); 959 return kc->clock_set(which_clock, &new_tp);
959} 960}
960 961
961SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 962SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
962 struct timespec __user *,tp) 963 struct timespec __user *,tp)
963{ 964{
965 struct k_clock *kc = clockid_to_kclock(which_clock);
964 struct timespec kernel_tp; 966 struct timespec kernel_tp;
965 int error; 967 int error;
966 968
967 if (invalid_clockid(which_clock)) 969 if (!kc)
968 return -EINVAL; 970 return -EINVAL;
969 error = CLOCK_DISPATCH(which_clock, clock_get, 971
970 (which_clock, &kernel_tp)); 972 error = kc->clock_get(which_clock, &kernel_tp);
973
971 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 974 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
972 error = -EFAULT; 975 error = -EFAULT;
973 976
974 return error; 977 return error;
978}
979
980SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
981 struct timex __user *, utx)
982{
983 struct k_clock *kc = clockid_to_kclock(which_clock);
984 struct timex ktx;
985 int err;
986
987 if (!kc)
988 return -EINVAL;
989 if (!kc->clock_adj)
990 return -EOPNOTSUPP;
991
992 if (copy_from_user(&ktx, utx, sizeof(ktx)))
993 return -EFAULT;
994
995 err = kc->clock_adj(which_clock, &ktx);
996
997 if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
998 return -EFAULT;
975 999
1000 return err;
976} 1001}
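
Note: clock_adjtime above is a new syscall that applies adjtimex-style tuning to an arbitrary clock and copies the timex struct both ways. A userspace sketch (assumes a libc that wraps clock_adjtime(2); otherwise call through syscall(__NR_clock_adjtime, ...)):

#include <sys/timex.h>
#include <time.h>
#include <stdio.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* read-only query */
	int state = clock_adjtime(CLOCK_REALTIME, &tx);

	if (state < 0)
		perror("clock_adjtime");
	else
		printf("freq offset: %ld\n", tx.freq);
	return 0;
}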
977 1002
978SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, 1003SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
979 struct timespec __user *, tp) 1004 struct timespec __user *, tp)
980{ 1005{
1006 struct k_clock *kc = clockid_to_kclock(which_clock);
981 struct timespec rtn_tp; 1007 struct timespec rtn_tp;
982 int error; 1008 int error;
983 1009
984 if (invalid_clockid(which_clock)) 1010 if (!kc)
985 return -EINVAL; 1011 return -EINVAL;
986 1012
987 error = CLOCK_DISPATCH(which_clock, clock_getres, 1013 error = kc->clock_getres(which_clock, &rtn_tp);
988 (which_clock, &rtn_tp));
989 1014
990 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { 1015 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
991 error = -EFAULT; 1016 error = -EFAULT;
992 }
993 1017
994 return error; 1018 return error;
995} 1019}
@@ -1009,10 +1033,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1009 const struct timespec __user *, rqtp, 1033 const struct timespec __user *, rqtp,
1010 struct timespec __user *, rmtp) 1034 struct timespec __user *, rmtp)
1011{ 1035{
1036 struct k_clock *kc = clockid_to_kclock(which_clock);
1012 struct timespec t; 1037 struct timespec t;
1013 1038
1014 if (invalid_clockid(which_clock)) 1039 if (!kc)
1015 return -EINVAL; 1040 return -EINVAL;
1041 if (!kc->nsleep)
1042 return -ENANOSLEEP_NOTSUP;
1016 1043
1017 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 1044 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
1018 return -EFAULT; 1045 return -EFAULT;
@@ -1020,27 +1047,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1020 if (!timespec_valid(&t)) 1047 if (!timespec_valid(&t))
1021 return -EINVAL; 1048 return -EINVAL;
1022 1049
1023 return CLOCK_DISPATCH(which_clock, nsleep, 1050 return kc->nsleep(which_clock, flags, &t, rmtp);
1024 (which_clock, flags, &t, rmtp));
1025}
1026
1027/*
1028 * nanosleep_restart for monotonic and realtime clocks
1029 */
1030static int common_nsleep_restart(struct restart_block *restart_block)
1031{
1032 return hrtimer_nanosleep_restart(restart_block);
1033} 1051}
1034 1052
1035/* 1053/*
1036 * This will restart clock_nanosleep. This is required only by 1054 * This will restart clock_nanosleep. This is required only by
1037 * compat_clock_nanosleep_restart for now. 1055 * compat_clock_nanosleep_restart for now.
1038 */ 1056 */
1039long 1057long clock_nanosleep_restart(struct restart_block *restart_block)
1040clock_nanosleep_restart(struct restart_block *restart_block)
1041{ 1058{
1042 clockid_t which_clock = restart_block->arg0; 1059 clockid_t which_clock = restart_block->nanosleep.index;
1060 struct k_clock *kc = clockid_to_kclock(which_clock);
1061
1062 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
1063 return -EINVAL;
1043 1064
1044 return CLOCK_DISPATCH(which_clock, nsleep_restart, 1065 return kc->nsleep_restart(restart_block);
1045 (restart_block));
1046} 1066}
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a23a57a976d..f3240e98792 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -214,11 +214,12 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
214 * Ensure that queued callbacks are all executed. 214 * Ensure that queued callbacks are all executed.
215 * If we detect that we are nested in a RCU read-side critical 215 * If we detect that we are nested in a RCU read-side critical
216 * section, we should simply fail, otherwise we would deadlock. 216 * section, we should simply fail, otherwise we would deadlock.
217 * Note that the machinery to reliably determine whether
218 * or not we are in an RCU read-side critical section
219 * exists only in the preemptible RCU implementations
220 * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why
221 * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT.
217 */ 222 */
218#ifndef CONFIG_PREEMPT
219 WARN_ON(1);
220 return 0;
221#else
222 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 223 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
223 irqs_disabled()) { 224 irqs_disabled()) {
224 WARN_ON(1); 225 WARN_ON(1);
@@ -229,7 +230,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
229 rcu_barrier_bh(); 230 rcu_barrier_bh();
230 debug_object_free(head, &rcuhead_debug_descr); 231 debug_object_free(head, &rcuhead_debug_descr);
231 return 1; 232 return 1;
232#endif
233 default: 233 default:
234 return 0; 234 return 0;
235 } 235 }
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 015abaea962..3cb8e362e88 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -852,7 +852,7 @@ void exit_rcu(void)
852 if (t->rcu_read_lock_nesting == 0) 852 if (t->rcu_read_lock_nesting == 0)
853 return; 853 return;
854 t->rcu_read_lock_nesting = 1; 854 t->rcu_read_lock_nesting = 1;
855 rcu_read_unlock(); 855 __rcu_read_unlock();
856} 856}
857 857
858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 89613f97ff2..c224da41890 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,7 +47,6 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50#include <linux/sched.h>
51 50
52MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index ddabb54bb5c..3c7cbc2c33b 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
215 put_pid(waiter->deadlock_task_pid); 215 put_pid(waiter->deadlock_task_pid);
216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); 216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
218 TRACE_WARN_ON(waiter->task);
219 memset(waiter, 0x22, sizeof(*waiter)); 218 memset(waiter, 0x22, sizeof(*waiter));
220} 219}
221 220
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 66cb89bc5ef..5c9ccd38096 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -9,7 +9,6 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/smp_lock.h>
13#include <linux/spinlock.h> 12#include <linux/spinlock.h>
14#include <linux/sysdev.h> 13#include <linux/sysdev.h>
15#include <linux/timer.h> 14#include <linux/timer.h>
@@ -27,7 +26,6 @@ struct test_thread_data {
27 int opcode; 26 int opcode;
28 int opdata; 27 int opdata;
29 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
30 int bkl;
31 int event; 29 int event;
32 struct sys_device sysdev; 30 struct sys_device sysdev;
33}; 31};
@@ -46,9 +44,8 @@ enum test_opcodes {
46 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ 44 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
47 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ 45 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
48 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ 46 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
49 RTTEST_LOCKBKL, /* 9 Lock BKL */ 47 /* 9, 10 - reserved for BKL commemoration */
50 RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ 48 RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
51 RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
52 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ 49 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
53 RTTEST_RESET = 99, /* 99 Reset all pending operations */ 50 RTTEST_RESET = 99, /* 99 Reset all pending operations */
54}; 51};
@@ -74,13 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
74 td->mutexes[i] = 0; 71 td->mutexes[i] = 0;
75 } 72 }
76 } 73 }
77
78 if (!lockwakeup && td->bkl == 4) {
79#ifdef CONFIG_LOCK_KERNEL
80 unlock_kernel();
81#endif
82 td->bkl = 0;
83 }
84 return 0; 74 return 0;
85 75
86 case RTTEST_RESETEVENT: 76 case RTTEST_RESETEVENT:
@@ -131,25 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
131 td->mutexes[id] = 0; 121 td->mutexes[id] = 0;
132 return 0; 122 return 0;
133 123
134 case RTTEST_LOCKBKL:
135 if (td->bkl)
136 return 0;
137 td->bkl = 1;
138#ifdef CONFIG_LOCK_KERNEL
139 lock_kernel();
140#endif
141 td->bkl = 4;
142 return 0;
143
144 case RTTEST_UNLOCKBKL:
145 if (td->bkl != 4)
146 break;
147#ifdef CONFIG_LOCK_KERNEL
148 unlock_kernel();
149#endif
150 td->bkl = 0;
151 return 0;
152
153 default: 124 default:
154 break; 125 break;
155 } 126 }
@@ -196,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
196 td->event = atomic_add_return(1, &rttest_event); 167 td->event = atomic_add_return(1, &rttest_event);
197 break; 168 break;
198 169
199 case RTTEST_LOCKBKL:
200 default: 170 default:
201 break; 171 break;
202 } 172 }
@@ -229,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
229 td->event = atomic_add_return(1, &rttest_event); 199 td->event = atomic_add_return(1, &rttest_event);
230 return; 200 return;
231 201
232 case RTTEST_LOCKBKL:
233 return;
234 default: 202 default:
235 return; 203 return;
236 } 204 }
@@ -380,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
380 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
381 349
382 curr += sprintf(curr, 350 curr += sprintf(curr,
383 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", 351 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
384 td->opcode, td->event, tsk->state, 352 td->opcode, td->event, tsk->state,
385 (MAX_RT_PRIO - 1) - tsk->prio, 353 (MAX_RT_PRIO - 1) - tsk->prio,
386 (MAX_RT_PRIO - 1) - tsk->normal_prio, 354 (MAX_RT_PRIO - 1) - tsk->normal_prio,
387 tsk->pi_blocked_on, td->bkl); 355 tsk->pi_blocked_on);
388 356
389 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) 357 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
390 curr += sprintf(curr, "%d", td->mutexes[i]); 358 curr += sprintf(curr, "%d", td->mutexes[i]);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a9604815786..ab449117aaf 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -20,41 +20,34 @@
20/* 20/*
21 * lock->owner state tracking: 21 * lock->owner state tracking:
22 * 22 *
23 * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 23 * lock->owner holds the task_struct pointer of the owner. Bit 0
24 * are used to keep track of the "owner is pending" and "lock has 24 * is used to keep track of the "lock has waiters" state.
25 * waiters" state.
26 * 25 *
27 * owner bit1 bit0 26 * owner bit0
28 * NULL 0 0 lock is free (fast acquire possible) 27 * NULL 0 lock is free (fast acquire possible)
29 * NULL 0 1 invalid state 28 * NULL 1 lock is free and has waiters and the top waiter
30 * NULL 1 0 Transitional State* 29 * is going to take the lock*
31 * NULL 1 1 invalid state 30 * taskpointer 0 lock is held (fast release possible)
32 * taskpointer 0 0 lock is held (fast release possible) 31 * taskpointer 1 lock is held and has waiters**
33 * taskpointer 0 1 task is pending owner
34 * taskpointer 1 0 lock is held and has waiters
35 * taskpointer 1 1 task is pending owner and lock has more waiters
36 *
37 * Pending ownership is assigned to the top (highest priority)
38 * waiter of the lock, when the lock is released. The thread is woken
39 * up and can now take the lock. Until the lock is taken (bit 0
40 * cleared) a competing higher priority thread can steal the lock
41 * which puts the woken up thread back on the waiters list.
42 * 32 *
43 * The fast atomic compare exchange based acquire and release is only 33 * The fast atomic compare exchange based acquire and release is only
44 * possible when bit 0 and 1 of lock->owner are 0. 34 * possible when bit 0 of lock->owner is 0.
35 *
36 * (*) It also can be a transitional state when grabbing the lock
37 * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
38 * we need to set the bit0 before looking at the lock, and the owner may be
39 * NULL in this small time, hence this can be a transitional state.
45 * 40 *
46 * (*) There's a small time where the owner can be NULL and the 41 * (**) There is a small time when bit 0 is set but there are no
47 * "lock has waiters" bit is set. This can happen when grabbing the lock. 42 * waiters. This can happen when grabbing the lock in the slow path.
48 * To prevent a cmpxchg of the owner releasing the lock, we need to set this 43 * To prevent a cmpxchg of the owner releasing the lock, we need to
49 * bit before looking at the lock, hence the reason this is a transitional 44 * set this bit before looking at the lock.
50 * state.
51 */ 45 */
52 46
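
Note: with pending ownership gone, only bit 0 of lock->owner carries state, so decoding the owner is a single mask. A sketch of that decode (assumption: mirrors rt_mutex_owner() in rtmutex_common.h, renamed here to avoid claiming it is that helper):

#include <linux/rtmutex.h>

#define RT_MUTEX_HAS_WAITERS	1UL

static inline struct task_struct *rt_mutex_owner_decode(struct rt_mutex *lock)
{
	return (struct task_struct *)
		((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
}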
53static void 47static void
54rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, 48rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
55 unsigned long mask)
56{ 49{
57 unsigned long val = (unsigned long)owner | mask; 50 unsigned long val = (unsigned long)owner;
58 51
59 if (rt_mutex_has_waiters(lock)) 52 if (rt_mutex_has_waiters(lock))
60 val |= RT_MUTEX_HAS_WAITERS; 53 val |= RT_MUTEX_HAS_WAITERS;
@@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
203 * reached or the state of the chain has changed while we 196 * reached or the state of the chain has changed while we
204 * dropped the locks. 197 * dropped the locks.
205 */ 198 */
206 if (!waiter || !waiter->task) 199 if (!waiter)
207 goto out_unlock_pi; 200 goto out_unlock_pi;
208 201
209 /* 202 /*
210 * Check the orig_waiter state. After we dropped the locks, 203 * Check the orig_waiter state. After we dropped the locks,
211 * the previous owner of the lock might have released the lock 204 * the previous owner of the lock might have released the lock.
212 * and made us the pending owner:
213 */ 205 */
214 if (orig_waiter && !orig_waiter->task) 206 if (orig_waiter && !rt_mutex_owner(orig_lock))
215 goto out_unlock_pi; 207 goto out_unlock_pi;
216 208
217 /* 209 /*
@@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
254 246
255 /* Release the task */ 247 /* Release the task */
256 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 248 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
249 if (!rt_mutex_owner(lock)) {
250 /*
251 * If the requeue above changed the top waiter, then we need
252 * to wake the new top waiter up to try to get the lock.
253 */
254
255 if (top_waiter != rt_mutex_top_waiter(lock))
256 wake_up_process(rt_mutex_top_waiter(lock)->task);
257 raw_spin_unlock(&lock->wait_lock);
258 goto out_put_task;
259 }
257 put_task_struct(task); 260 put_task_struct(task);
258 261
259 /* Grab the next task */ 262 /* Grab the next task */
@@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
296} 299}
297 300
298/* 301/*
299 * Optimization: check if we can steal the lock from the
300 * assigned pending owner [which might not have taken the
301 * lock yet]:
302 */
303static inline int try_to_steal_lock(struct rt_mutex *lock,
304 struct task_struct *task)
305{
306 struct task_struct *pendowner = rt_mutex_owner(lock);
307 struct rt_mutex_waiter *next;
308 unsigned long flags;
309
310 if (!rt_mutex_owner_pending(lock))
311 return 0;
312
313 if (pendowner == task)
314 return 1;
315
316 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
317 if (task->prio >= pendowner->prio) {
318 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 0;
320 }
321
322 /*
323 * Check if a waiter is enqueued on the pending owners
324 * pi_waiters list. Remove it and readjust pending owners
325 * priority.
326 */
327 if (likely(!rt_mutex_has_waiters(lock))) {
328 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329 return 1;
330 }
331
332 /* No chain handling, pending owner is not blocked on anything: */
333 next = rt_mutex_top_waiter(lock);
334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
335 __rt_mutex_adjust_prio(pendowner);
336 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
337
338 /*
339 * We are going to steal the lock and a waiter was
340 * enqueued on the pending owners pi_waiters queue. So
341 * we have to enqueue this waiter into
342 * task->pi_waiters list. This covers the case,
343 * where task is boosted because it holds another
344 * lock and gets unboosted because the booster is
345 * interrupted, so we would delay a waiter with higher
346 * priority as task->normal_prio.
347 *
348 * Note: in the rare case of a SCHED_OTHER task changing
349 * its priority and thus stealing the lock, next->task
350 * might be task:
351 */
352 if (likely(next->task != task)) {
353 raw_spin_lock_irqsave(&task->pi_lock, flags);
354 plist_add(&next->pi_list_entry, &task->pi_waiters);
355 __rt_mutex_adjust_prio(task);
356 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
357 }
358 return 1;
359}
360
361/*
362 * Try to take an rt-mutex 302 * Try to take an rt-mutex
363 * 303 *
364 * This fails
365 * - when the lock has a real owner
366 * - when a different pending owner exists and has higher priority than current
367 *
368 * Must be called with lock->wait_lock held. 304 * Must be called with lock->wait_lock held.
305 *
306 * @lock: the lock to be acquired.
307 * @task: the task which wants to acquire the lock
 308 * @waiter: the waiter that is queued to the lock's wait list (could be NULL)
369 */ 309 */
370static int try_to_take_rt_mutex(struct rt_mutex *lock) 310static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
311 struct rt_mutex_waiter *waiter)
371{ 312{
372 /* 313 /*
373 * We have to be careful here if the atomic speedups are 314 * We have to be careful here if the atomic speedups are
@@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
390 */ 331 */
391 mark_rt_mutex_waiters(lock); 332 mark_rt_mutex_waiters(lock);
392 333
393 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) 334 if (rt_mutex_owner(lock))
394 return 0; 335 return 0;
395 336
337 /*
 338 * The task can take the lock if one of these conditions holds:
 339 * 1) there is no other waiter
 340 * 2) it has higher priority than all waiters
 341 * 3) it is itself the top waiter
342 */
343 if (rt_mutex_has_waiters(lock)) {
344 if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) {
345 if (!waiter || waiter != rt_mutex_top_waiter(lock))
346 return 0;
347 }
348 }
349
350 if (waiter || rt_mutex_has_waiters(lock)) {
351 unsigned long flags;
352 struct rt_mutex_waiter *top;
353
354 raw_spin_lock_irqsave(&task->pi_lock, flags);
355
356 /* remove the queued waiter. */
357 if (waiter) {
358 plist_del(&waiter->list_entry, &lock->wait_list);
359 task->pi_blocked_on = NULL;
360 }
361
362 /*
 363 * We have to enqueue the top waiter (if it exists) into the
 364 * task->pi_waiters list.
365 */
366 if (rt_mutex_has_waiters(lock)) {
367 top = rt_mutex_top_waiter(lock);
368 top->pi_list_entry.prio = top->list_entry.prio;
369 plist_add(&top->pi_list_entry, &task->pi_waiters);
370 }
371 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
372 }
373
396 /* We got the lock. */ 374 /* We got the lock. */
397 debug_rt_mutex_lock(lock); 375 debug_rt_mutex_lock(lock);
398 376
399 rt_mutex_set_owner(lock, current, 0); 377 rt_mutex_set_owner(lock, task);
400 378
401 rt_mutex_deadlock_account_lock(lock, current); 379 rt_mutex_deadlock_account_lock(lock, task);
402 380
403 return 1; 381 return 1;
404} 382}
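The acquisition rule above reduces to a single priority test against the top waiter. A simplified model of just that decision, assuming (as in the kernel) that a lower numeric prio value means higher priority; may_take() and struct waiter are invented for the sketch:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct waiter { int prio; };   /* invented stand-in */

/* Mirror of the three conditions: no waiter at all, strictly higher
 * priority than the top waiter, or being the top waiter itself. */
static bool may_take(int prio, const struct waiter *top,
		     const struct waiter *self)
{
	if (!top)              /* 1) nobody is waiting */
		return true;
	if (prio < top->prio)  /* 2) beats every queued waiter */
		return true;
	return self == top;    /* 3) we are the top waiter */
}

int main(void)
{
	struct waiter top = { .prio = 10 };

	printf("%d %d %d\n",
	       may_take(5, &top, NULL),    /* higher prio -> 1 */
	       may_take(20, &top, NULL),   /* lower prio  -> 0 */
	       may_take(20, &top, &top));  /* is top      -> 1 */
	return 0;
}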
@@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
436 414
437 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 415 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
438 416
417 if (!owner)
418 return 0;
419
439 if (waiter == rt_mutex_top_waiter(lock)) { 420 if (waiter == rt_mutex_top_waiter(lock)) {
440 raw_spin_lock_irqsave(&owner->pi_lock, flags); 421 raw_spin_lock_irqsave(&owner->pi_lock, flags);
441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 422 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
@@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
472/* 453/*
473 * Wake up the next waiter on the lock. 454 * Wake up the next waiter on the lock.
474 * 455 *
475 * Remove the top waiter from the current tasks waiter list and from 456 * Remove the top waiter from the current tasks waiter list and wake it up.
476 * the lock waiter list. Set it as pending owner. Then wake it up.
477 * 457 *
478 * Called with lock->wait_lock held. 458 * Called with lock->wait_lock held.
479 */ 459 */
480static void wakeup_next_waiter(struct rt_mutex *lock) 460static void wakeup_next_waiter(struct rt_mutex *lock)
481{ 461{
482 struct rt_mutex_waiter *waiter; 462 struct rt_mutex_waiter *waiter;
483 struct task_struct *pendowner;
484 unsigned long flags; 463 unsigned long flags;
485 464
486 raw_spin_lock_irqsave(&current->pi_lock, flags); 465 raw_spin_lock_irqsave(&current->pi_lock, flags);
487 466
488 waiter = rt_mutex_top_waiter(lock); 467 waiter = rt_mutex_top_waiter(lock);
489 plist_del(&waiter->list_entry, &lock->wait_list);
490 468
491 /* 469 /*
492 * Remove it from current->pi_waiters. We do not adjust a 470 * Remove it from current->pi_waiters. We do not adjust a
@@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
495 * lock->wait_lock. 473 * lock->wait_lock.
496 */ 474 */
497 plist_del(&waiter->pi_list_entry, &current->pi_waiters); 475 plist_del(&waiter->pi_list_entry, &current->pi_waiters);
498 pendowner = waiter->task;
499 waiter->task = NULL;
500 476
501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); 477 rt_mutex_set_owner(lock, NULL);
502 478
503 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 479 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
504 480
505 /* 481 wake_up_process(waiter->task);
506 * Clear the pi_blocked_on variable and enqueue a possible
507 * waiter into the pi_waiters list of the pending owner. This
508 * prevents that in case the pending owner gets unboosted a
509 * waiter with higher priority than pending-owner->normal_prio
510 * is blocked on the unboosted (pending) owner.
511 */
512 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
513
514 WARN_ON(!pendowner->pi_blocked_on);
515 WARN_ON(pendowner->pi_blocked_on != waiter);
516 WARN_ON(pendowner->pi_blocked_on->lock != lock);
517
518 pendowner->pi_blocked_on = NULL;
519
520 if (rt_mutex_has_waiters(lock)) {
521 struct rt_mutex_waiter *next;
522
523 next = rt_mutex_top_waiter(lock);
524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
525 }
526 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
527
528 wake_up_process(pendowner);
529} 482}
530 483
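The reworked wakeup no longer hands the lock to a "pending owner": it clears the owner and wakes the top waiter, which must then take the lock for itself via try_to_take_rt_mutex(). A rough pthread analogy of that no-handoff scheme; toy_lock is invented and models only the idea, not the kernel's lockless fast path:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* A toy lock with no ownership hand-off: the releaser merely wakes
 * one waiter, and the waiter must retry the acquisition itself. */
struct toy_lock {
	pthread_mutex_t m;
	pthread_cond_t cv;
	bool held;
};

static void toy_lock_acquire(struct toy_lock *l)
{
	pthread_mutex_lock(&l->m);
	while (l->held)                 /* re-check after every wakeup */
		pthread_cond_wait(&l->cv, &l->m);
	l->held = true;                 /* we took it ourselves */
	pthread_mutex_unlock(&l->m);
}

static void toy_lock_release(struct toy_lock *l)
{
	pthread_mutex_lock(&l->m);
	l->held = false;                /* no pending owner is named */
	pthread_cond_signal(&l->cv);    /* wake one waiter to compete */
	pthread_mutex_unlock(&l->m);
}

int main(void)
{
	struct toy_lock l = { PTHREAD_MUTEX_INITIALIZER,
			      PTHREAD_COND_INITIALIZER, false };

	toy_lock_acquire(&l);
	toy_lock_release(&l);
	puts("ok");
	return 0;
}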
531/* 484/*
532 * Remove a waiter from a lock 485 * Remove a waiter from a lock and give up
533 * 486 *
534 * Must be called with lock->wait_lock held 487 * Must be called with lock->wait_lock held, after
 488 * try_to_take_rt_mutex() has just failed.
535 */ 489 */
536static void remove_waiter(struct rt_mutex *lock, 490static void remove_waiter(struct rt_mutex *lock,
537 struct rt_mutex_waiter *waiter) 491 struct rt_mutex_waiter *waiter)
@@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock,
543 497
544 raw_spin_lock_irqsave(&current->pi_lock, flags); 498 raw_spin_lock_irqsave(&current->pi_lock, flags);
545 plist_del(&waiter->list_entry, &lock->wait_list); 499 plist_del(&waiter->list_entry, &lock->wait_list);
546 waiter->task = NULL;
547 current->pi_blocked_on = NULL; 500 current->pi_blocked_on = NULL;
548 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 501 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
549 502
550 if (first && owner != current) { 503 if (!owner)
504 return;
505
506 if (first) {
551 507
552 raw_spin_lock_irqsave(&owner->pi_lock, flags); 508 raw_spin_lock_irqsave(&owner->pi_lock, flags);
553 509
@@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task)
614 * or TASK_UNINTERRUPTIBLE) 570 * or TASK_UNINTERRUPTIBLE)
615 * @timeout: the pre-initialized and started timer, or NULL for none 571 * @timeout: the pre-initialized and started timer, or NULL for none
616 * @waiter: the pre-initialized rt_mutex_waiter 572 * @waiter: the pre-initialized rt_mutex_waiter
617 * @detect_deadlock: passed to task_blocks_on_rt_mutex
618 * 573 *
619 * lock->wait_lock must be held by the caller. 574 * lock->wait_lock must be held by the caller.
620 */ 575 */
621static int __sched 576static int __sched
622__rt_mutex_slowlock(struct rt_mutex *lock, int state, 577__rt_mutex_slowlock(struct rt_mutex *lock, int state,
623 struct hrtimer_sleeper *timeout, 578 struct hrtimer_sleeper *timeout,
624 struct rt_mutex_waiter *waiter, 579 struct rt_mutex_waiter *waiter)
625 int detect_deadlock)
626{ 580{
627 int ret = 0; 581 int ret = 0;
628 582
629 for (;;) { 583 for (;;) {
630 /* Try to acquire the lock: */ 584 /* Try to acquire the lock: */
631 if (try_to_take_rt_mutex(lock)) 585 if (try_to_take_rt_mutex(lock, current, waiter))
632 break; 586 break;
633 587
634 /* 588 /*
@@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
645 break; 599 break;
646 } 600 }
647 601
648 /*
649 * waiter->task is NULL the first time we come here and
650 * when we have been woken up by the previous owner
651 * but the lock got stolen by a higher prio task.
652 */
653 if (!waiter->task) {
654 ret = task_blocks_on_rt_mutex(lock, waiter, current,
655 detect_deadlock);
656 /*
657 * If we got woken up by the owner then start loop
658 * all over without going into schedule to try
659 * to get the lock now:
660 */
661 if (unlikely(!waiter->task)) {
662 /*
663 * Reset the return value. We might
664 * have returned with -EDEADLK and the
665 * owner released the lock while we
666 * were walking the pi chain.
667 */
668 ret = 0;
669 continue;
670 }
671 if (unlikely(ret))
672 break;
673 }
674
675 raw_spin_unlock(&lock->wait_lock); 602 raw_spin_unlock(&lock->wait_lock);
676 603
677 debug_rt_mutex_print_deadlock(waiter); 604 debug_rt_mutex_print_deadlock(waiter);
678 605
679 if (waiter->task) 606 schedule_rt_mutex(lock);
680 schedule_rt_mutex(lock);
681 607
682 raw_spin_lock(&lock->wait_lock); 608 raw_spin_lock(&lock->wait_lock);
683 set_current_state(state); 609 set_current_state(state);
@@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
698 int ret = 0; 624 int ret = 0;
699 625
700 debug_rt_mutex_init_waiter(&waiter); 626 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL;
702 627
703 raw_spin_lock(&lock->wait_lock); 628 raw_spin_lock(&lock->wait_lock);
704 629
705 /* Try to acquire the lock again: */ 630 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) { 631 if (try_to_take_rt_mutex(lock, current, NULL)) {
707 raw_spin_unlock(&lock->wait_lock); 632 raw_spin_unlock(&lock->wait_lock);
708 return 0; 633 return 0;
709 } 634 }
@@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
717 timeout->task = NULL; 642 timeout->task = NULL;
718 } 643 }
719 644
720 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, 645 ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
721 detect_deadlock); 646
647 if (likely(!ret))
648 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
722 649
723 set_current_state(TASK_RUNNING); 650 set_current_state(TASK_RUNNING);
724 651
725 if (unlikely(waiter.task)) 652 if (unlikely(ret))
726 remove_waiter(lock, &waiter); 653 remove_waiter(lock, &waiter);
727 654
728 /* 655 /*
@@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
737 if (unlikely(timeout)) 664 if (unlikely(timeout))
738 hrtimer_cancel(&timeout->timer); 665 hrtimer_cancel(&timeout->timer);
739 666
740 /*
741 * Readjust priority, when we did not get the lock. We might
742 * have been the pending owner and boosted. Since we did not
743 * take the lock, the PI boost has to go.
744 */
745 if (unlikely(ret))
746 rt_mutex_adjust_prio(current);
747
748 debug_rt_mutex_free_waiter(&waiter); 667 debug_rt_mutex_free_waiter(&waiter);
749 668
750 return ret; 669 return ret;
@@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
762 681
763 if (likely(rt_mutex_owner(lock) != current)) { 682 if (likely(rt_mutex_owner(lock) != current)) {
764 683
765 ret = try_to_take_rt_mutex(lock); 684 ret = try_to_take_rt_mutex(lock, current, NULL);
766 /* 685 /*
767 * try_to_take_rt_mutex() sets the lock waiters 686 * try_to_take_rt_mutex() sets the lock waiters
768 * bit unconditionally. Clean this up. 687 * bit unconditionally. Clean this up.
@@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
992{ 911{
993 __rt_mutex_init(lock, NULL); 912 __rt_mutex_init(lock, NULL);
994 debug_rt_mutex_proxy_lock(lock, proxy_owner); 913 debug_rt_mutex_proxy_lock(lock, proxy_owner);
995 rt_mutex_set_owner(lock, proxy_owner, 0); 914 rt_mutex_set_owner(lock, proxy_owner);
996 rt_mutex_deadlock_account_lock(lock, proxy_owner); 915 rt_mutex_deadlock_account_lock(lock, proxy_owner);
997} 916}
998 917
@@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1008 struct task_struct *proxy_owner) 927 struct task_struct *proxy_owner)
1009{ 928{
1010 debug_rt_mutex_proxy_unlock(lock); 929 debug_rt_mutex_proxy_unlock(lock);
1011 rt_mutex_set_owner(lock, NULL, 0); 930 rt_mutex_set_owner(lock, NULL);
1012 rt_mutex_deadlock_account_unlock(proxy_owner); 931 rt_mutex_deadlock_account_unlock(proxy_owner);
1013} 932}
1014 933
@@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1034 953
1035 raw_spin_lock(&lock->wait_lock); 954 raw_spin_lock(&lock->wait_lock);
1036 955
1037 mark_rt_mutex_waiters(lock); 956 if (try_to_take_rt_mutex(lock, task, NULL)) {
1038
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0);
1043 raw_spin_unlock(&lock->wait_lock); 957 raw_spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1; 958 return 1;
1046 } 959 }
1047 960
1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 961 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1049 962
1050 if (ret && !waiter->task) { 963 if (ret && !rt_mutex_owner(lock)) {
1051 /* 964 /*
1052 * Reset the return value. We might have 965 * Reset the return value. We might have
1053 * returned with -EDEADLK and the owner 966 * returned with -EDEADLK and the owner
@@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1056 */ 969 */
1057 ret = 0; 970 ret = 0;
1058 } 971 }
972
973 if (unlikely(ret))
974 remove_waiter(lock, waiter);
975
1059 raw_spin_unlock(&lock->wait_lock); 976 raw_spin_unlock(&lock->wait_lock);
1060 977
1061 debug_rt_mutex_print_deadlock(waiter); 978 debug_rt_mutex_print_deadlock(waiter);
@@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1110 1027
1111 set_current_state(TASK_INTERRUPTIBLE); 1028 set_current_state(TASK_INTERRUPTIBLE);
1112 1029
1113 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, 1030 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1114 detect_deadlock);
1115 1031
1116 set_current_state(TASK_RUNNING); 1032 set_current_state(TASK_RUNNING);
1117 1033
1118 if (unlikely(waiter->task)) 1034 if (unlikely(ret))
1119 remove_waiter(lock, waiter); 1035 remove_waiter(lock, waiter);
1120 1036
1121 /* 1037 /*
@@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1126 1042
1127 raw_spin_unlock(&lock->wait_lock); 1043 raw_spin_unlock(&lock->wait_lock);
1128 1044
1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been
1131 * the pending owner and boosted. Since we did not take the lock, the
1132 * PI boost has to go.
1133 */
1134 if (unlikely(ret))
1135 rt_mutex_adjust_prio(current);
1136
1137 return ret; 1045 return ret;
1138} 1046}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 97a2f81866a..53a66c85261 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p)
91/* 91/*
92 * lock->owner state tracking: 92 * lock->owner state tracking:
93 */ 93 */
94#define RT_MUTEX_OWNER_PENDING 1UL 94#define RT_MUTEX_HAS_WAITERS 1UL
95#define RT_MUTEX_HAS_WAITERS 2UL 95#define RT_MUTEX_OWNER_MASKALL 1UL
96#define RT_MUTEX_OWNER_MASKALL 3UL
97 96
98static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) 97static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
99{ 98{
@@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
101 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); 100 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
102} 101}
103 102
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{
106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108}
109
110static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
111{
112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113}
114
115/* 103/*
116 * PI-futex support (proxy locking functions, etc.): 104 * PI-futex support (proxy locking functions, etc.):
117 */ 105 */
diff --git a/kernel/sched.c b/kernel/sched.c
index 42eab5a8437..c8e40b7005c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -324,7 +324,7 @@ struct cfs_rq {
324 * 'curr' points to currently running entity on this cfs_rq. 324 * 'curr' points to currently running entity on this cfs_rq.
325 * It is set to NULL otherwise (i.e when none are currently running). 325 * It is set to NULL otherwise (i.e when none are currently running).
326 */ 326 */
327 struct sched_entity *curr, *next, *last; 327 struct sched_entity *curr, *next, *last, *skip;
328 328
329 unsigned int nr_spread_over; 329 unsigned int nr_spread_over;
330 330
@@ -606,9 +606,6 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct task_group *tg; 606 struct task_group *tg;
607 struct cgroup_subsys_state *css; 607 struct cgroup_subsys_state *css;
608 608
609 if (p->flags & PF_EXITING)
610 return &root_task_group;
611
612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 609 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
613 lockdep_is_held(&task_rq(p)->lock)); 610 lockdep_is_held(&task_rq(p)->lock));
614 tg = container_of(css, struct task_group, css); 611 tg = container_of(css, struct task_group, css);
@@ -1686,6 +1683,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1686 __release(rq2->lock); 1683 __release(rq2->lock);
1687} 1684}
1688 1685
1686#else /* CONFIG_SMP */
1687
1688/*
1689 * double_rq_lock - safely lock two runqueues
1690 *
1691 * Note that, unlike task_rq_lock, this does not disable interrupts;
1692 * you need to do so manually before calling.
1693 */
1694static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1695 __acquires(rq1->lock)
1696 __acquires(rq2->lock)
1697{
1698 BUG_ON(!irqs_disabled());
1699 BUG_ON(rq1 != rq2);
1700 raw_spin_lock(&rq1->lock);
1701 __acquire(rq2->lock); /* Fake it out ;) */
1702}
1703
1704/*
1705 * double_rq_unlock - safely unlock two runqueues
1706 *
1707 * Note this does not restore interrupts like task_rq_unlock,
1708 * you need to do so manually after calling.
1709 */
1710static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1711 __releases(rq1->lock)
1712 __releases(rq2->lock)
1713{
1714 BUG_ON(rq1 != rq2);
1715 raw_spin_unlock(&rq1->lock);
1716 __release(rq2->lock);
1717}
1718
1689#endif 1719#endif
1690 1720
1691static void calc_load_account_idle(struct rq *this_rq); 1721static void calc_load_account_idle(struct rq *this_rq);
@@ -1880,7 +1910,7 @@ void account_system_vtime(struct task_struct *curr)
1880 */ 1910 */
1881 if (hardirq_count()) 1911 if (hardirq_count())
1882 __this_cpu_add(cpu_hardirq_time, delta); 1912 __this_cpu_add(cpu_hardirq_time, delta);
1883 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) 1913 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
1884 __this_cpu_add(cpu_softirq_time, delta); 1914 __this_cpu_add(cpu_softirq_time, delta);
1885 1915
1886 irq_time_write_end(); 1916 irq_time_write_end();
@@ -1920,8 +1950,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
1920 sched_rt_avg_update(rq, irq_delta); 1950 sched_rt_avg_update(rq, irq_delta);
1921} 1951}
1922 1952
1953static int irqtime_account_hi_update(void)
1954{
1955 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1956 unsigned long flags;
1957 u64 latest_ns;
1958 int ret = 0;
1959
1960 local_irq_save(flags);
1961 latest_ns = this_cpu_read(cpu_hardirq_time);
1962 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
1963 ret = 1;
1964 local_irq_restore(flags);
1965 return ret;
1966}
1967
1968static int irqtime_account_si_update(void)
1969{
1970 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1971 unsigned long flags;
1972 u64 latest_ns;
1973 int ret = 0;
1974
1975 local_irq_save(flags);
1976 latest_ns = this_cpu_read(cpu_softirq_time);
1977 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
1978 ret = 1;
1979 local_irq_restore(flags);
1980 return ret;
1981}
1982
1923#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 1983#else /* CONFIG_IRQ_TIME_ACCOUNTING */
1924 1984
1985#define sched_clock_irqtime (0)
1986
1925static void update_rq_clock_task(struct rq *rq, s64 delta) 1987static void update_rq_clock_task(struct rq *rq, s64 delta)
1926{ 1988{
1927 rq->clock_task += delta; 1989 rq->clock_task += delta;
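irqtime_account_hi_update() and irqtime_account_si_update() just ask whether the raw per-cpu nanosecond counter has pulled ahead of what the tick-based cpustat field already accounts. A small model of that comparison, assuming a 1 ms tick (HZ=1000) purely for the example:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_TICK 1000000ULL   /* assumed 1 ms tick for the sketch */

/* Has the raw nanosecond counter pulled ahead of the ticks that
 * were already folded into the statistic? */
static int counter_needs_update(uint64_t latest_ns, uint64_t accounted_ticks)
{
	return latest_ns / NSEC_PER_TICK > accounted_ticks;
}

int main(void)
{
	printf("%d\n", counter_needs_update(5000000, 3)); /* 5 ticks > 3 -> 1 */
	printf("%d\n", counter_needs_update(2000000, 3)); /* 2 ticks < 3 -> 0 */
	return 0;
}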
@@ -2025,14 +2087,14 @@ inline int task_curr(const struct task_struct *p)
2025 2087
2026static inline void check_class_changed(struct rq *rq, struct task_struct *p, 2088static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2027 const struct sched_class *prev_class, 2089 const struct sched_class *prev_class,
2028 int oldprio, int running) 2090 int oldprio)
2029{ 2091{
2030 if (prev_class != p->sched_class) { 2092 if (prev_class != p->sched_class) {
2031 if (prev_class->switched_from) 2093 if (prev_class->switched_from)
2032 prev_class->switched_from(rq, p, running); 2094 prev_class->switched_from(rq, p);
2033 p->sched_class->switched_to(rq, p, running); 2095 p->sched_class->switched_to(rq, p);
2034 } else 2096 } else if (oldprio != p->prio)
2035 p->sched_class->prio_changed(rq, p, oldprio, running); 2097 p->sched_class->prio_changed(rq, p, oldprio);
2036} 2098}
2037 2099
2038static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 2100static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
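check_class_changed() now fires prio_changed() only when the priority actually moved, and the class-switch callbacks lost their 'running' argument. A self-contained sketch of the same dispatch shape; struct klass and the callbacks are invented stand-ins for sched_class:

#include <stdio.h>

struct klass {
	const char *name;
	void (*switched_from)(void);
	void (*switched_to)(void);
	void (*prio_changed)(int oldprio);
};

static void rt_to(void)        { puts("switched_to rt"); }
static void fair_from(void)    { puts("switched_from fair"); }
static void fair_prio(int old) { printf("prio_changed from %d\n", old); }

/* Exactly one notification fires: a class change beats a priority
 * change, and prio_changed runs only when the priority moved. */
static void check_class_changed(const struct klass *prev,
				const struct klass *cur,
				int oldprio, int prio)
{
	if (prev != cur) {
		if (prev->switched_from)
			prev->switched_from();
		cur->switched_to();
	} else if (oldprio != prio) {
		cur->prio_changed(oldprio);
	}
}

int main(void)
{
	struct klass fair = { "fair", fair_from, NULL, fair_prio };
	struct klass rt   = { "rt",   NULL, rt_to, NULL };

	check_class_changed(&fair, &rt, 120, 50);    /* class change */
	check_class_changed(&fair, &fair, 120, 120); /* no-op after this patch */
	return 0;
}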
@@ -2224,7 +2286,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2224 * yield - it could be a while. 2286 * yield - it could be a while.
2225 */ 2287 */
2226 if (unlikely(on_rq)) { 2288 if (unlikely(on_rq)) {
2227 schedule_timeout_uninterruptible(1); 2289 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2290
2291 set_current_state(TASK_UNINTERRUPTIBLE);
2292 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2228 continue; 2293 continue;
2229 } 2294 }
2230 2295
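Replacing schedule_timeout_uninterruptible(1) with a relative hrtimer sleep avoids rounding the wait up to whole jiffies. A userspace cousin of the same idea, sleeping one tick's worth of time with nanosecond resolution via clock_nanosleep; HZ is assumed to be 250 here just to size the sleep:

#include <stdio.h>
#include <time.h>

#define HZ 250   /* assumed tick rate for the example */

int main(void)
{
	struct timespec ts = { 0, 1000000000L / HZ };

	/* relative, high-resolution sleep of ~one tick */
	if (clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, NULL) == 0)
		puts("slept ~one tick with ns resolution");
	return 0;
}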
@@ -2265,27 +2330,6 @@ void kick_process(struct task_struct *p)
2265EXPORT_SYMBOL_GPL(kick_process); 2330EXPORT_SYMBOL_GPL(kick_process);
2266#endif /* CONFIG_SMP */ 2331#endif /* CONFIG_SMP */
2267 2332
2268/**
2269 * task_oncpu_function_call - call a function on the cpu on which a task runs
2270 * @p: the task to evaluate
2271 * @func: the function to be called
2272 * @info: the function call argument
2273 *
2274 * Calls the function @func when the task is currently running. This might
2275 * be on the current CPU, which just calls the function directly
2276 */
2277void task_oncpu_function_call(struct task_struct *p,
2278 void (*func) (void *info), void *info)
2279{
2280 int cpu;
2281
2282 preempt_disable();
2283 cpu = task_cpu(p);
2284 if (task_curr(p))
2285 smp_call_function_single(cpu, func, info, 1);
2286 preempt_enable();
2287}
2288
2289#ifdef CONFIG_SMP 2333#ifdef CONFIG_SMP
2290/* 2334/*
2291 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2335 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
@@ -2566,6 +2610,7 @@ static void __sched_fork(struct task_struct *p)
2566 p->se.sum_exec_runtime = 0; 2610 p->se.sum_exec_runtime = 0;
2567 p->se.prev_sum_exec_runtime = 0; 2611 p->se.prev_sum_exec_runtime = 0;
2568 p->se.nr_migrations = 0; 2612 p->se.nr_migrations = 0;
2613 p->se.vruntime = 0;
2569 2614
2570#ifdef CONFIG_SCHEDSTATS 2615#ifdef CONFIG_SCHEDSTATS
2571 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2616 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -2776,9 +2821,12 @@ static inline void
2776prepare_task_switch(struct rq *rq, struct task_struct *prev, 2821prepare_task_switch(struct rq *rq, struct task_struct *prev,
2777 struct task_struct *next) 2822 struct task_struct *next)
2778{ 2823{
2824 sched_info_switch(prev, next);
2825 perf_event_task_sched_out(prev, next);
2779 fire_sched_out_preempt_notifiers(prev, next); 2826 fire_sched_out_preempt_notifiers(prev, next);
2780 prepare_lock_switch(rq, next); 2827 prepare_lock_switch(rq, next);
2781 prepare_arch_switch(next); 2828 prepare_arch_switch(next);
2829 trace_sched_switch(prev, next);
2782} 2830}
2783 2831
2784/** 2832/**
@@ -2911,7 +2959,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2911 struct mm_struct *mm, *oldmm; 2959 struct mm_struct *mm, *oldmm;
2912 2960
2913 prepare_task_switch(rq, prev, next); 2961 prepare_task_switch(rq, prev, next);
2914 trace_sched_switch(prev, next); 2962
2915 mm = next->mm; 2963 mm = next->mm;
2916 oldmm = prev->active_mm; 2964 oldmm = prev->active_mm;
2917 /* 2965 /*
@@ -3568,6 +3616,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3568} 3616}
3569 3617
3570/* 3618/*
3619 * Account system cpu time to a process and desired cpustat field
3620 * @p: the process that the cpu time gets accounted to
3621 * @cputime: the cpu time spent in kernel space since the last update
3622 * @cputime_scaled: cputime scaled by cpu frequency
3623 * @target_cputime64: pointer to cpustat field that has to be updated
3624 */
3625static inline
3626void __account_system_time(struct task_struct *p, cputime_t cputime,
3627 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3628{
3629 cputime64_t tmp = cputime_to_cputime64(cputime);
3630
3631 /* Add system time to process. */
3632 p->stime = cputime_add(p->stime, cputime);
3633 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3634 account_group_system_time(p, cputime);
3635
3636 /* Add system time to cpustat. */
3637 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3638 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3639
3640 /* Account for system time used */
3641 acct_update_integrals(p);
3642}
3643
3644/*
3571 * Account system cpu time to a process. 3645 * Account system cpu time to a process.
3572 * @p: the process that the cpu time gets accounted to 3646 * @p: the process that the cpu time gets accounted to
3573 * @hardirq_offset: the offset to subtract from hardirq_count() 3647 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3578,36 +3652,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3578 cputime_t cputime, cputime_t cputime_scaled) 3652 cputime_t cputime, cputime_t cputime_scaled)
3579{ 3653{
3580 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3654 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3581 cputime64_t tmp; 3655 cputime64_t *target_cputime64;
3582 3656
3583 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3657 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3584 account_guest_time(p, cputime, cputime_scaled); 3658 account_guest_time(p, cputime, cputime_scaled);
3585 return; 3659 return;
3586 } 3660 }
3587 3661
3588 /* Add system time to process. */
3589 p->stime = cputime_add(p->stime, cputime);
3590 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3591 account_group_system_time(p, cputime);
3592
3593 /* Add system time to cpustat. */
3594 tmp = cputime_to_cputime64(cputime);
3595 if (hardirq_count() - hardirq_offset) 3662 if (hardirq_count() - hardirq_offset)
3596 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3663 target_cputime64 = &cpustat->irq;
3597 else if (in_serving_softirq()) 3664 else if (in_serving_softirq())
3598 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3665 target_cputime64 = &cpustat->softirq;
3599 else 3666 else
3600 cpustat->system = cputime64_add(cpustat->system, tmp); 3667 target_cputime64 = &cpustat->system;
3601 3668
3602 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 3669 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3603
3604 /* Account for system time used */
3605 acct_update_integrals(p);
3606} 3670}
3607 3671
3608/* 3672/*
3609 * Account for involuntary wait time. 3673 * Account for involuntary wait time.
3610 * @steal: the cpu time spent in involuntary wait 3674 * @cputime: the cpu time spent in involuntary wait
3611 */ 3675 */
3612void account_steal_time(cputime_t cputime) 3676void account_steal_time(cputime_t cputime)
3613{ 3677{
@@ -3635,6 +3699,73 @@ void account_idle_time(cputime_t cputime)
3635 3699
3636#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3700#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3637 3701
3702#ifdef CONFIG_IRQ_TIME_ACCOUNTING
3703/*
3704 * Account a tick to a process and cpustat
3705 * @p: the process that the cpu time gets accounted to
3706 * @user_tick: is the tick from userspace
3707 * @rq: the pointer to rq
3708 *
3709 * Tick demultiplexing follows the order
3710 * - pending hardirq update
3711 * - pending softirq update
3712 * - user_time
3713 * - idle_time
3714 * - system time
3715 * - check for guest_time
3716 * - else account as system_time
3717 *
3718 * The hardirq check is done for both system and user time, as no
3719 * timer can go off while we are serving a hardirq, so we might never
3720 * otherwise get a chance to account that time.
3721 * p->stime and friends are updated only on system time, not on hardirq
3722 * or softirq time, since those no longer count toward task exec_runtime.
3723 */
3724static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3725 struct rq *rq)
3726{
3727 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3728 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3729 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3730
3731 if (irqtime_account_hi_update()) {
3732 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3733 } else if (irqtime_account_si_update()) {
3734 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3735 } else if (this_cpu_ksoftirqd() == p) {
3736 /*
3737 * ksoftirqd time does not get accounted in cpu_softirq_time,
3738 * so we have to handle it separately here.
3739 * Also, p->stime needs to be updated for ksoftirqd.
3740 */
3741 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3742 &cpustat->softirq);
3743 } else if (user_tick) {
3744 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3745 } else if (p == rq->idle) {
3746 account_idle_time(cputime_one_jiffy);
3747 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3748 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3749 } else {
3750 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3751 &cpustat->system);
3752 }
3753}
3754
3755static void irqtime_account_idle_ticks(int ticks)
3756{
3757 int i;
3758 struct rq *rq = this_rq();
3759
3760 for (i = 0; i < ticks; i++)
3761 irqtime_account_process_tick(current, 0, rq);
3762}
3763#else /* CONFIG_IRQ_TIME_ACCOUNTING */
3764static void irqtime_account_idle_ticks(int ticks) {}
3765static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3766 struct rq *rq) {}
3767#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3768
3638/* 3769/*
3639 * Account a single tick of cpu time. 3770 * Account a single tick of cpu time.
3640 * @p: the process that the cpu time gets accounted to 3771 * @p: the process that the cpu time gets accounted to
@@ -3645,6 +3776,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
3645 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3776 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3646 struct rq *rq = this_rq(); 3777 struct rq *rq = this_rq();
3647 3778
3779 if (sched_clock_irqtime) {
3780 irqtime_account_process_tick(p, user_tick, rq);
3781 return;
3782 }
3783
3648 if (user_tick) 3784 if (user_tick)
3649 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3785 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3650 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 3786 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3670,6 +3806,12 @@ void account_steal_ticks(unsigned long ticks)
3670 */ 3806 */
3671void account_idle_ticks(unsigned long ticks) 3807void account_idle_ticks(unsigned long ticks)
3672{ 3808{
3809
3810 if (sched_clock_irqtime) {
3811 irqtime_account_idle_ticks(ticks);
3812 return;
3813 }
3814
3673 account_idle_time(jiffies_to_cputime(ticks)); 3815 account_idle_time(jiffies_to_cputime(ticks));
3674} 3816}
3675 3817
@@ -3989,9 +4131,6 @@ need_resched_nonpreemptible:
3989 rq->skip_clock_update = 0; 4131 rq->skip_clock_update = 0;
3990 4132
3991 if (likely(prev != next)) { 4133 if (likely(prev != next)) {
3992 sched_info_switch(prev, next);
3993 perf_event_task_sched_out(prev, next);
3994
3995 rq->nr_switches++; 4134 rq->nr_switches++;
3996 rq->curr = next; 4135 rq->curr = next;
3997 ++*switch_count; 4136 ++*switch_count;
@@ -4571,11 +4710,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4571 4710
4572 if (running) 4711 if (running)
4573 p->sched_class->set_curr_task(rq); 4712 p->sched_class->set_curr_task(rq);
4574 if (on_rq) { 4713 if (on_rq)
4575 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4714 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4576 4715
4577 check_class_changed(rq, p, prev_class, oldprio, running); 4716 check_class_changed(rq, p, prev_class, oldprio);
4578 }
4579 task_rq_unlock(rq, &flags); 4717 task_rq_unlock(rq, &flags);
4580} 4718}
4581 4719
@@ -4823,12 +4961,15 @@ recheck:
4823 param->sched_priority > rlim_rtprio) 4961 param->sched_priority > rlim_rtprio)
4824 return -EPERM; 4962 return -EPERM;
4825 } 4963 }
4964
4826 /* 4965 /*
4827 * Like positive nice levels, dont allow tasks to 4966 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4828 * move out of SCHED_IDLE either: 4967 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4829 */ 4968 */
4830 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 4969 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4831 return -EPERM; 4970 if (!can_nice(p, TASK_NICE(p)))
4971 return -EPERM;
4972 }
4832 4973
4833 /* can't change other user's priorities */ 4974 /* can't change other user's priorities */
4834 if (!check_same_owner(p)) 4975 if (!check_same_owner(p))
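From userspace this means a task parked in SCHED_IDLE can now climb back to SCHED_NORMAL, but only if RLIMIT_NICE would allow its nice level. A small probe using the real sched_setscheduler(2) API; whether the second call succeeds depends on the running kernel and on the process's RLIMIT_NICE, so treat the output as illustrative:

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 0 };

	/* Drop to SCHED_IDLE first... */
	if (sched_setscheduler(0, SCHED_IDLE, &sp))
		perror("SCHED_IDLE");

	/* ...then try to climb back out. */
	if (sched_setscheduler(0, SCHED_OTHER, &sp))
		printf("switch back refused: %s\n", strerror(errno));
	else
		puts("switched back to SCHED_OTHER");
	return 0;
}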
@@ -4903,11 +5044,10 @@ recheck:
4903 5044
4904 if (running) 5045 if (running)
4905 p->sched_class->set_curr_task(rq); 5046 p->sched_class->set_curr_task(rq);
4906 if (on_rq) { 5047 if (on_rq)
4907 activate_task(rq, p, 0); 5048 activate_task(rq, p, 0);
4908 5049
4909 check_class_changed(rq, p, prev_class, oldprio, running); 5050 check_class_changed(rq, p, prev_class, oldprio);
4910 }
4911 __task_rq_unlock(rq); 5051 __task_rq_unlock(rq);
4912 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5052 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4913 5053
@@ -5324,6 +5464,65 @@ void __sched yield(void)
5324} 5464}
5325EXPORT_SYMBOL(yield); 5465EXPORT_SYMBOL(yield);
5326 5466
5467/**
5468 * yield_to - yield the current processor to another thread in
5469 * your thread group, or accelerate that thread toward the
5470 * processor it's on.
5471 *
5472 * It's the caller's job to ensure that the target task struct
5473 * can't go away on us before we can do any checks.
5474 *
5475 * Returns true if we indeed boosted the target task.
5476 */
5477bool __sched yield_to(struct task_struct *p, bool preempt)
5478{
5479 struct task_struct *curr = current;
5480 struct rq *rq, *p_rq;
5481 unsigned long flags;
5482 bool yielded = 0;
5483
5484 local_irq_save(flags);
5485 rq = this_rq();
5486
5487again:
5488 p_rq = task_rq(p);
5489 double_rq_lock(rq, p_rq);
5490 while (task_rq(p) != p_rq) {
5491 double_rq_unlock(rq, p_rq);
5492 goto again;
5493 }
5494
5495 if (!curr->sched_class->yield_to_task)
5496 goto out;
5497
5498 if (curr->sched_class != p->sched_class)
5499 goto out;
5500
5501 if (task_running(p_rq, p) || p->state)
5502 goto out;
5503
5504 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5505 if (yielded) {
5506 schedstat_inc(rq, yld_count);
5507 /*
5508 * Make p's CPU reschedule; pick_next_entity takes care of
5509 * fairness.
5510 */
5511 if (preempt && rq != p_rq)
5512 resched_task(p_rq->curr);
5513 }
5514
5515out:
5516 double_rq_unlock(rq, p_rq);
5517 local_irq_restore(flags);
5518
5519 if (yielded)
5520 schedule();
5521
5522 return yielded;
5523}
5524EXPORT_SYMBOL_GPL(yield_to);
5525
5327/* 5526/*
5328 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5527 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5329 * that process accounting knows that this is a task in IO wait state. 5528 * that process accounting knows that this is a task in IO wait state.
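The yield_to() implementation above has to lock the target's runqueue while the target may migrate between runqueues, so it locks, revalidates task_rq(p), and retries on a mismatch. The same lock-and-revalidate loop in userspace form; struct obj and lock_home() are invented, and the unlocked read of o->home stands in for task_rq(p):

#include <pthread.h>
#include <stdio.h>

struct obj { pthread_mutex_t *home; };

/* The object's home lock may change while we block acquiring it,
 * so loop until what we locked is still the object's current home. */
static pthread_mutex_t *lock_home(struct obj *o)
{
	for (;;) {
		pthread_mutex_t *m = o->home;

		pthread_mutex_lock(m);
		if (o->home == m)
			return m;          /* still valid, keep it */
		pthread_mutex_unlock(m);   /* moved under us, retry */
	}
}

int main(void)
{
	pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	struct obj o = { &m };

	pthread_mutex_unlock(lock_home(&o));
	puts("ok");
	return 0;
}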
@@ -5572,7 +5771,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5572 * The idle tasks have their own, simple scheduling class: 5771 * The idle tasks have their own, simple scheduling class:
5573 */ 5772 */
5574 idle->sched_class = &idle_sched_class; 5773 idle->sched_class = &idle_sched_class;
5575 ftrace_graph_init_task(idle); 5774 ftrace_graph_init_idle_task(idle, cpu);
5576} 5775}
5577 5776
5578/* 5777/*
@@ -7797,6 +7996,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7797 INIT_LIST_HEAD(&cfs_rq->tasks); 7996 INIT_LIST_HEAD(&cfs_rq->tasks);
7798#ifdef CONFIG_FAIR_GROUP_SCHED 7997#ifdef CONFIG_FAIR_GROUP_SCHED
7799 cfs_rq->rq = rq; 7998 cfs_rq->rq = rq;
7999 /* allow initial update_cfs_load() to truncate */
8000#ifdef CONFIG_SMP
8001 cfs_rq->load_stamp = 1;
8002#endif
7800#endif 8003#endif
7801 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 8004 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7802} 8005}
@@ -8110,6 +8313,8 @@ EXPORT_SYMBOL(__might_sleep);
8110#ifdef CONFIG_MAGIC_SYSRQ 8313#ifdef CONFIG_MAGIC_SYSRQ
8111static void normalize_task(struct rq *rq, struct task_struct *p) 8314static void normalize_task(struct rq *rq, struct task_struct *p)
8112{ 8315{
8316 const struct sched_class *prev_class = p->sched_class;
8317 int old_prio = p->prio;
8113 int on_rq; 8318 int on_rq;
8114 8319
8115 on_rq = p->se.on_rq; 8320 on_rq = p->se.on_rq;
@@ -8120,6 +8325,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8120 activate_task(rq, p, 0); 8325 activate_task(rq, p, 0);
8121 resched_task(rq->curr); 8326 resched_task(rq->curr);
8122 } 8327 }
8328
8329 check_class_changed(rq, p, prev_class, old_prio);
8123} 8330}
8124 8331
8125void normalize_rt_tasks(void) 8332void normalize_rt_tasks(void)
@@ -8511,7 +8718,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8511 /* Propagate contribution to hierarchy */ 8718 /* Propagate contribution to hierarchy */
8512 raw_spin_lock_irqsave(&rq->lock, flags); 8719 raw_spin_lock_irqsave(&rq->lock, flags);
8513 for_each_sched_entity(se) 8720 for_each_sched_entity(se)
8514 update_cfs_shares(group_cfs_rq(se), 0); 8721 update_cfs_shares(group_cfs_rq(se));
8515 raw_spin_unlock_irqrestore(&rq->lock, flags); 8722 raw_spin_unlock_irqrestore(&rq->lock, flags);
8516 } 8723 }
8517 8724
@@ -8885,7 +9092,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8885} 9092}
8886 9093
8887static void 9094static void
8888cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) 9095cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
9096 struct cgroup *old_cgrp, struct task_struct *task)
8889{ 9097{
8890 /* 9098 /*
8891 * cgroup_exit() is called in the copy_process() failure path. 9099 * cgroup_exit() is called in the copy_process() failure path.
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 9fb65628315..5946ac51560 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -12,7 +12,6 @@ static atomic_t autogroup_seq_nr;
12static void __init autogroup_init(struct task_struct *init_task) 12static void __init autogroup_init(struct task_struct *init_task)
13{ 13{
14 autogroup_default.tg = &root_task_group; 14 autogroup_default.tg = &root_task_group;
15 root_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref); 15 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock); 16 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default; 17 init_task->signal->autogroup = &autogroup_default;
@@ -130,7 +129,7 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
130 129
131static inline bool task_group_is_autogroup(struct task_group *tg) 130static inline bool task_group_is_autogroup(struct task_group *tg)
132{ 131{
133 return tg != &root_task_group && tg->autogroup; 132 return !!tg->autogroup;
134} 133}
135 134
136static inline struct task_group * 135static inline struct task_group *
@@ -161,11 +160,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
161 160
162 p->signal->autogroup = autogroup_kref_get(ag); 161 p->signal->autogroup = autogroup_kref_get(ag);
163 162
163 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
164 goto out;
165
164 t = p; 166 t = p;
165 do { 167 do {
166 sched_move_task(t); 168 sched_move_task(t);
167 } while_each_thread(p, t); 169 } while_each_thread(p, t);
168 170
171out:
169 unlock_task_sighand(p, &flags); 172 unlock_task_sighand(p, &flags);
170 autogroup_kref_put(prev); 173 autogroup_kref_put(prev);
171} 174}
@@ -247,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
247{ 250{
248 struct autogroup *ag = autogroup_task_get(p); 251 struct autogroup *ag = autogroup_task_get(p);
249 252
253 if (!task_group_is_autogroup(ag->tg))
254 goto out;
255
250 down_read(&ag->lock); 256 down_read(&ag->lock);
251 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); 257 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
252 up_read(&ag->lock); 258 up_read(&ag->lock);
253 259
260out:
254 autogroup_kref_put(ag); 261 autogroup_kref_put(ag);
255} 262}
256#endif /* CONFIG_PROC_FS */ 263#endif /* CONFIG_PROC_FS */
@@ -258,9 +265,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
258#ifdef CONFIG_SCHED_DEBUG 265#ifdef CONFIG_SCHED_DEBUG
259static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
260{ 267{
261 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); 268 if (!task_group_is_autogroup(tg))
262
263 if (!enabled || !tg->autogroup)
264 return 0; 269 return 0;
265 270
266 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 271 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 7b859ffe5da..05577055cfc 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -1,6 +1,11 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3struct autogroup { 3struct autogroup {
4 /*
 5 * The reference count does not track how many threads are
 6 * attached to this autogroup right now; it counts the number
 7 * of tasks that could still use this autogroup.
8 */
4 struct kref kref; 9 struct kref kref;
5 struct task_group *tg; 10 struct task_group *tg;
6 struct rw_semaphore lock; 11 struct rw_semaphore lock;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index eb6cb8edd07..7bacd83a415 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
179 179
180 raw_spin_lock_irqsave(&rq->lock, flags); 180 raw_spin_lock_irqsave(&rq->lock, flags);
181 if (cfs_rq->rb_leftmost) 181 if (cfs_rq->rb_leftmost)
182 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 182 MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
183 last = __pick_last_entity(cfs_rq); 183 last = __pick_last_entity(cfs_rq);
184 if (last) 184 if (last)
185 max_vruntime = last->vruntime; 185 max_vruntime = last->vruntime;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c26e2df450..3f7ec9e27ee 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -69,14 +69,6 @@ static unsigned int sched_nr_latency = 8;
69unsigned int sysctl_sched_child_runs_first __read_mostly; 69unsigned int sysctl_sched_child_runs_first __read_mostly;
70 70
71/* 71/*
72 * sys_sched_yield() compat mode
73 *
74 * This option switches the agressive yield implementation of the
75 * old scheduler back on.
76 */
77unsigned int __read_mostly sysctl_sched_compat_yield;
78
79/*
80 * SCHED_OTHER wake-up granularity. 72 * SCHED_OTHER wake-up granularity.
81 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 73 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
82 * 74 *
@@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
419 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 411 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
420} 412}
421 413
422static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 414static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
423{ 415{
424 struct rb_node *left = cfs_rq->rb_leftmost; 416 struct rb_node *left = cfs_rq->rb_leftmost;
425 417
@@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
429 return rb_entry(left, struct sched_entity, run_node); 421 return rb_entry(left, struct sched_entity, run_node);
430} 422}
431 423
424static struct sched_entity *__pick_next_entity(struct sched_entity *se)
425{
426 struct rb_node *next = rb_next(&se->run_node);
427
428 if (!next)
429 return NULL;
430
431 return rb_entry(next, struct sched_entity, run_node);
432}
433
434#ifdef CONFIG_SCHED_DEBUG
432static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 435static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
433{ 436{
434 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 437 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
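After the rename, __pick_first_entity() keeps returning the cached leftmost node, while the new __pick_next_entity() means "the in-order successor of se" via rb_next(). For reference, the successor walk rb_next() performs, written here for a plain BST with parent pointers:

#include <stddef.h>
#include <stdio.h>

struct node { struct node *left, *right, *parent; int key; };

/* In-order successor in a binary search tree with parent pointers,
 * the same walk rb_next() does over the rbtree of entities. */
static struct node *successor(struct node *n)
{
	if (n->right) {
		n = n->right;
		while (n->left)
			n = n->left;
		return n;
	}
	while (n->parent && n == n->parent->right)
		n = n->parent;
	return n->parent;
}

int main(void)
{
	/* tree: 2 <- 4 -> 6 */
	struct node a = { 0 }, b = { 0 }, c = { 0 };

	a.key = 2; b.key = 4; c.key = 6;
	b.left = &a; b.right = &c;
	a.parent = &b; c.parent = &b;
	printf("succ(2)=%d succ(4)=%d\n",
	       successor(&a)->key, successor(&b)->key);
	return 0;
}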
@@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
443 * Scheduling class statistics methods: 446 * Scheduling class statistics methods:
444 */ 447 */
445 448
446#ifdef CONFIG_SCHED_DEBUG
447int sched_proc_update_handler(struct ctl_table *table, int write, 449int sched_proc_update_handler(struct ctl_table *table, int write,
448 void __user *buffer, size_t *lenp, 450 void __user *buffer, size_t *lenp,
449 loff_t *ppos) 451 loff_t *ppos)
@@ -540,7 +542,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
540} 542}
541 543
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); 544static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); 545static void update_cfs_shares(struct cfs_rq *cfs_rq);
544 546
545/* 547/*
546 * Update the current task's runtime statistics. Skip current tasks that 548 * Update the current task's runtime statistics. Skip current tasks that
@@ -733,6 +735,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
733 now - cfs_rq->load_last > 4 * period) { 735 now - cfs_rq->load_last > 4 * period) {
734 cfs_rq->load_period = 0; 736 cfs_rq->load_period = 0;
735 cfs_rq->load_avg = 0; 737 cfs_rq->load_avg = 0;
738 delta = period - 1;
736 } 739 }
737 740
738 cfs_rq->load_stamp = now; 741 cfs_rq->load_stamp = now;
@@ -763,16 +766,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
763 list_del_leaf_cfs_rq(cfs_rq); 766 list_del_leaf_cfs_rq(cfs_rq);
764} 767}
765 768
766static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, 769static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
767 long weight_delta)
768{ 770{
769 long load_weight, load, shares; 771 long load_weight, load, shares;
770 772
771 load = cfs_rq->load.weight + weight_delta; 773 load = cfs_rq->load.weight;
772 774
773 load_weight = atomic_read(&tg->load_weight); 775 load_weight = atomic_read(&tg->load_weight);
774 load_weight -= cfs_rq->load_contribution;
775 load_weight += load; 776 load_weight += load;
777 load_weight -= cfs_rq->load_contribution;
776 778
777 shares = (tg->shares * load); 779 shares = (tg->shares * load);
778 if (load_weight) 780 if (load_weight)
@@ -790,7 +792,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
790{ 792{
791 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { 793 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
792 update_cfs_load(cfs_rq, 0); 794 update_cfs_load(cfs_rq, 0);
793 update_cfs_shares(cfs_rq, 0); 795 update_cfs_shares(cfs_rq);
794 } 796 }
795} 797}
796# else /* CONFIG_SMP */ 798# else /* CONFIG_SMP */
@@ -798,8 +800,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
798{ 800{
799} 801}
800 802
801static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, 803static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
802 long weight_delta)
803{ 804{
804 return tg->shares; 805 return tg->shares;
805} 806}
@@ -824,7 +825,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
824 account_entity_enqueue(cfs_rq, se); 825 account_entity_enqueue(cfs_rq, se);
825} 826}
826 827
827static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) 828static void update_cfs_shares(struct cfs_rq *cfs_rq)
828{ 829{
829 struct task_group *tg; 830 struct task_group *tg;
830 struct sched_entity *se; 831 struct sched_entity *se;
@@ -838,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
838 if (likely(se->load.weight == tg->shares)) 839 if (likely(se->load.weight == tg->shares))
839 return; 840 return;
840#endif 841#endif
841 shares = calc_cfs_shares(cfs_rq, tg, weight_delta); 842 shares = calc_cfs_shares(cfs_rq, tg);
842 843
843 reweight_entity(cfs_rq_of(se), se, shares); 844 reweight_entity(cfs_rq_of(se), se, shares);
844} 845}
@@ -847,7 +848,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
847{ 848{
848} 849}
849 850
850static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) 851static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
851{ 852{
852} 853}
853 854
@@ -978,8 +979,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
978 */ 979 */
979 update_curr(cfs_rq); 980 update_curr(cfs_rq);
980 update_cfs_load(cfs_rq, 0); 981 update_cfs_load(cfs_rq, 0);
981 update_cfs_shares(cfs_rq, se->load.weight);
982 account_entity_enqueue(cfs_rq, se); 982 account_entity_enqueue(cfs_rq, se);
983 update_cfs_shares(cfs_rq);
983 984
984 if (flags & ENQUEUE_WAKEUP) { 985 if (flags & ENQUEUE_WAKEUP) {
985 place_entity(cfs_rq, se, 0); 986 place_entity(cfs_rq, se, 0);
@@ -996,19 +997,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
996 list_add_leaf_cfs_rq(cfs_rq); 997 list_add_leaf_cfs_rq(cfs_rq);
997} 998}
998 999
999static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1000static void __clear_buddies_last(struct sched_entity *se)
1001{
1002 for_each_sched_entity(se) {
1003 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1004 if (cfs_rq->last == se)
1005 cfs_rq->last = NULL;
1006 else
1007 break;
1008 }
1009}
1010
1011static void __clear_buddies_next(struct sched_entity *se)
1000{ 1012{
1001 if (!se || cfs_rq->last == se) 1013 for_each_sched_entity(se) {
1002 cfs_rq->last = NULL; 1014 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1015 if (cfs_rq->next == se)
1016 cfs_rq->next = NULL;
1017 else
1018 break;
1019 }
1020}
1003 1021
1004 if (!se || cfs_rq->next == se) 1022static void __clear_buddies_skip(struct sched_entity *se)
1005 cfs_rq->next = NULL; 1023{
1024 for_each_sched_entity(se) {
1025 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1026 if (cfs_rq->skip == se)
1027 cfs_rq->skip = NULL;
1028 else
1029 break;
1030 }
1006} 1031}
1007 1032
1008static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1033static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1009{ 1034{
1010 for_each_sched_entity(se) 1035 if (cfs_rq->last == se)
1011 __clear_buddies(cfs_rq_of(se), se); 1036 __clear_buddies_last(se);
1037
1038 if (cfs_rq->next == se)
1039 __clear_buddies_next(se);
1040
1041 if (cfs_rq->skip == se)
1042 __clear_buddies_skip(se);
1012} 1043}
1013 1044
1014static void 1045static void
@@ -1041,7 +1072,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1041 update_cfs_load(cfs_rq, 0); 1072 update_cfs_load(cfs_rq, 0);
1042 account_entity_dequeue(cfs_rq, se); 1073 account_entity_dequeue(cfs_rq, se);
1043 update_min_vruntime(cfs_rq); 1074 update_min_vruntime(cfs_rq);
1044 update_cfs_shares(cfs_rq, 0); 1075 update_cfs_shares(cfs_rq);
1045 1076
1046 /* 1077 /*
1047 * Normalize the entity after updating the min_vruntime because the 1078 * Normalize the entity after updating the min_vruntime because the
@@ -1084,7 +1115,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1084 return; 1115 return;
1085 1116
1086 if (cfs_rq->nr_running > 1) { 1117 if (cfs_rq->nr_running > 1) {
1087 struct sched_entity *se = __pick_next_entity(cfs_rq); 1118 struct sched_entity *se = __pick_first_entity(cfs_rq);
1088 s64 delta = curr->vruntime - se->vruntime; 1119 s64 delta = curr->vruntime - se->vruntime;
1089 1120
1090 if (delta < 0) 1121 if (delta < 0)
@@ -1128,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1128static int 1159static int
1129wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); 1160wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1130 1161
1162/*
1163 * Pick the next process, keeping these things in mind, in this order:
1164 * 1) keep things fair between processes/task groups
1165 * 2) pick the "next" process, since someone really wants that to run
1166 * 3) pick the "last" process, for cache locality
1167 * 4) do not run the "skip" process, if something else is available
1168 */
1131static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 1169static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1132{ 1170{
1133 struct sched_entity *se = __pick_next_entity(cfs_rq); 1171 struct sched_entity *se = __pick_first_entity(cfs_rq);
1134 struct sched_entity *left = se; 1172 struct sched_entity *left = se;
1135 1173
1136 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) 1174 /*
1137 se = cfs_rq->next; 1175 * Avoid running the skip buddy, if running something else can
1176 * be done without getting too unfair.
1177 */
1178 if (cfs_rq->skip == se) {
1179 struct sched_entity *second = __pick_next_entity(se);
1180 if (second && wakeup_preempt_entity(second, left) < 1)
1181 se = second;
1182 }
1138 1183
1139 /* 1184 /*
1140 * Prefer last buddy, try to return the CPU to a preempted task. 1185 * Prefer last buddy, try to return the CPU to a preempted task.
@@ -1142,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1142 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) 1187 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1143 se = cfs_rq->last; 1188 se = cfs_rq->last;
1144 1189
1190 /*
1191 * Someone really wants this to run. If it's not unfair, run it.
1192 */
1193 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1194 se = cfs_rq->next;
1195
1145 clear_buddies(cfs_rq, se); 1196 clear_buddies(cfs_rq, se);
1146 1197
1147 return se; 1198 return se;
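So the final preference order is: the skip buddy is avoided if a fair alternative exists, last is preferred for cache locality, and next wins over last because someone explicitly asked for it. A toy model of that ordering; the skip step is elided, and fair_enough() is a stand-in for wakeup_preempt_entity(), with smaller vruntime meaning "more entitled":

#include <stdio.h>

struct ent { long vruntime; const char *name; };   /* invented */

/* A buddy may displace the leftmost entity only while it is not
 * too far behind in virtual runtime ('gran' plays the role of the
 * wakeup granularity). */
static int fair_enough(const struct ent *buddy, const struct ent *left,
		       long gran)
{
	return buddy->vruntime - left->vruntime < gran;
}

static const struct ent *pick(const struct ent *left,
			      const struct ent *last,
			      const struct ent *next, long gran)
{
	const struct ent *se = left;

	/* (the skip-buddy avoidance step is elided in this toy) */
	if (last && fair_enough(last, left, gran))
		se = last;              /* 3) cache locality */
	if (next && fair_enough(next, left, gran))
		se = next;              /* 2) overrides 'last' */
	return se;
}

int main(void)
{
	struct ent left = { 100, "left" };
	struct ent last = { 105, "last" };
	struct ent next = { 104, "next" };

	printf("picked %s\n", pick(&left, &last, &next, 10)->name);
	return 0;
}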
@@ -1282,7 +1333,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1282 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1333 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1283 1334
1284 update_cfs_load(cfs_rq, 0); 1335 update_cfs_load(cfs_rq, 0);
1285 update_cfs_shares(cfs_rq, 0); 1336 update_cfs_shares(cfs_rq);
1286 } 1337 }
1287 1338
1288 hrtick_update(rq); 1339 hrtick_update(rq);
@@ -1312,58 +1363,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1312 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1363 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1313 1364
1314 update_cfs_load(cfs_rq, 0); 1365 update_cfs_load(cfs_rq, 0);
1315 update_cfs_shares(cfs_rq, 0); 1366 update_cfs_shares(cfs_rq);
1316 } 1367 }
1317 1368
1318 hrtick_update(rq); 1369 hrtick_update(rq);
1319} 1370}
1320 1371
1321/*
1322 * sched_yield() support is very simple - we dequeue and enqueue.
1323 *
1324 * If compat_yield is turned on then we requeue to the end of the tree.
1325 */
1326static void yield_task_fair(struct rq *rq)
1327{
1328 struct task_struct *curr = rq->curr;
1329 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1330 struct sched_entity *rightmost, *se = &curr->se;
1331
1332 /*
1333 * Are we the only task in the tree?
1334 */
1335 if (unlikely(cfs_rq->nr_running == 1))
1336 return;
1337
1338 clear_buddies(cfs_rq, se);
1339
1340 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
1341 update_rq_clock(rq);
1342 /*
1343 * Update run-time statistics of the 'current'.
1344 */
1345 update_curr(cfs_rq);
1346
1347 return;
1348 }
1349 /*
1350 * Find the rightmost entry in the rbtree:
1351 */
1352 rightmost = __pick_last_entity(cfs_rq);
1353 /*
1354 * Already in the rightmost position?
1355 */
1356 if (unlikely(!rightmost || entity_before(rightmost, se)))
1357 return;
1358
1359 /*
1360 * Minimally necessary key value to be last in the tree:
1361 * Upon rescheduling, sched_class::put_prev_task() will place
1362 * 'current' within the tree based on its new key value.
1363 */
1364 se->vruntime = rightmost->vruntime + 1;
1365}
1366
1367#ifdef CONFIG_SMP 1372#ifdef CONFIG_SMP
1368 1373
1369static void task_waking_fair(struct rq *rq, struct task_struct *p) 1374static void task_waking_fair(struct rq *rq, struct task_struct *p)
@@ -1834,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se)
1834 } 1839 }
1835} 1840}
1836 1841
1842static void set_skip_buddy(struct sched_entity *se)
1843{
1844 if (likely(task_of(se)->policy != SCHED_IDLE)) {
1845 for_each_sched_entity(se)
1846 cfs_rq_of(se)->skip = se;
1847 }
1848}
1849
1837/* 1850/*
1838 * Preempt the current task with a newly woken task if needed: 1851 * Preempt the current task with a newly woken task if needed:
1839 */ 1852 */
@@ -1857,16 +1870,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1857 if (test_tsk_need_resched(curr)) 1870 if (test_tsk_need_resched(curr))
1858 return; 1871 return;
1859 1872
1873 /* Idle tasks are by definition preempted by non-idle tasks. */
1874 if (unlikely(curr->policy == SCHED_IDLE) &&
1875 likely(p->policy != SCHED_IDLE))
1876 goto preempt;
1877
1860 /* 1878 /*
1861 * Batch and idle tasks do not preempt (their preemption is driven by 1879 * Batch and idle tasks do not preempt non-idle tasks (their preemption
1862 * the tick): 1880 * is driven by the tick):
1863 */ 1881 */
1864 if (unlikely(p->policy != SCHED_NORMAL)) 1882 if (unlikely(p->policy != SCHED_NORMAL))
1865 return; 1883 return;
1866 1884
1867 /* Idle tasks are by definition preempted by everybody. */
1868 if (unlikely(curr->policy == SCHED_IDLE))
1869 goto preempt;
1870 1885
1871 if (!sched_feat(WAKEUP_PREEMPT)) 1886 if (!sched_feat(WAKEUP_PREEMPT))
1872 return; 1887 return;
@@ -1932,6 +1947,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1932 } 1947 }
1933} 1948}
1934 1949
1950/*
1951 * sched_yield() is very simple
1952 *
1953 * The magic of dealing with the ->skip buddy is in pick_next_entity.
1954 */
1955static void yield_task_fair(struct rq *rq)
1956{
1957 struct task_struct *curr = rq->curr;
1958 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1959 struct sched_entity *se = &curr->se;
1960
1961 /*
1962 * Are we the only task in the tree?
1963 */
1964 if (unlikely(rq->nr_running == 1))
1965 return;
1966
1967 clear_buddies(cfs_rq, se);
1968
1969 if (curr->policy != SCHED_BATCH) {
1970 update_rq_clock(rq);
1971 /*
1972 * Update run-time statistics of the 'current'.
1973 */
1974 update_curr(cfs_rq);
1975 }
1976
1977 set_skip_buddy(se);
1978}
1979
1980static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
1981{
1982 struct sched_entity *se = &p->se;
1983
1984 if (!se->on_rq)
1985 return false;
1986
1987 /* Tell the scheduler that we'd really like pse to run next. */
1988 set_next_buddy(se);
1989
1990 yield_task_fair(rq);
1991
1992 return true;
1993}
1994
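
yield_to_task_fair() is what makes the directed yield added by this series work: it tags the target as the next buddy and then yields, letting pick_next_entity() above prefer the target. A hypothetical caller-side sketch, assuming yield_to() (the kernel/sched.c entry point that lands here) and a valid 'holder' task:

        static void wait_on_holder(struct task_struct *holder)
        {
                /*
                 * Hand our remaining slice preferentially to 'holder' so it
                 * can finish and release the resource, rather than yielding
                 * to the whole world; fall back to a plain yield on failure.
                 */
                if (!yield_to(holder, false))
                        yield();
        }
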
1935#ifdef CONFIG_SMP 1995#ifdef CONFIG_SMP
1936/************************************************** 1996/**************************************************
1937 * Fair scheduling class load-balancing methods: 1997 * Fair scheduling class load-balancing methods:
@@ -2123,7 +2183,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
2123 * We need to update shares after updating tg->load_weight in 2183 * We need to update shares after updating tg->load_weight in
2124 * order to adjust the weight of groups with long running tasks. 2184 * order to adjust the weight of groups with long running tasks.
2125 */ 2185 */
2126 update_cfs_shares(cfs_rq, 0); 2186 update_cfs_shares(cfs_rq);
2127 2187
2128 raw_spin_unlock_irqrestore(&rq->lock, flags); 2188 raw_spin_unlock_irqrestore(&rq->lock, flags);
2129 2189
@@ -2610,7 +2670,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2610 * @this_cpu: Cpu for which load balance is currently performed. 2670 * @this_cpu: Cpu for which load balance is currently performed.
2611 * @idle: Idle status of this_cpu 2671 * @idle: Idle status of this_cpu
2612 * @load_idx: Load index of sched_domain of this_cpu for load calc. 2672 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2613 * @sd_idle: Idle status of the sched_domain containing group.
2614 * @local_group: Does group contain this_cpu. 2673 * @local_group: Does group contain this_cpu.
2615 * @cpus: Set of cpus considered for load balancing. 2674 * @cpus: Set of cpus considered for load balancing.
2616 * @balance: Should we balance. 2675 * @balance: Should we balance.
@@ -2618,7 +2677,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2618 */ 2677 */
2619static inline void update_sg_lb_stats(struct sched_domain *sd, 2678static inline void update_sg_lb_stats(struct sched_domain *sd,
2620 struct sched_group *group, int this_cpu, 2679 struct sched_group *group, int this_cpu,
2621 enum cpu_idle_type idle, int load_idx, int *sd_idle, 2680 enum cpu_idle_type idle, int load_idx,
2622 int local_group, const struct cpumask *cpus, 2681 int local_group, const struct cpumask *cpus,
2623 int *balance, struct sg_lb_stats *sgs) 2682 int *balance, struct sg_lb_stats *sgs)
2624{ 2683{
@@ -2638,9 +2697,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2638 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2697 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2639 struct rq *rq = cpu_rq(i); 2698 struct rq *rq = cpu_rq(i);
2640 2699
2641 if (*sd_idle && rq->nr_running)
2642 *sd_idle = 0;
2643
2644 /* Bias balancing toward cpus of our domain */ 2700 /* Bias balancing toward cpus of our domain */
2645 if (local_group) { 2701 if (local_group) {
2646 if (idle_cpu(i) && !first_idle_cpu) { 2702 if (idle_cpu(i) && !first_idle_cpu) {
@@ -2685,7 +2741,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2685 2741
2686 /* 2742 /*
2687 * Consider the group unbalanced when the imbalance is larger 2743 * Consider the group unbalanced when the imbalance is larger
2688 * than the average weight of two tasks. 2744 * than the average weight of a task.
2689 * 2745 *
2690 * APZ: with cgroup the avg task weight can vary wildly and 2746 * APZ: with cgroup the avg task weight can vary wildly and
2691 * might not be a suitable number - should we keep a 2747 * might not be a suitable number - should we keep a
@@ -2695,7 +2751,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2695 if (sgs->sum_nr_running) 2751 if (sgs->sum_nr_running)
2696 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2752 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2697 2753
2698 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) 2754 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
2699 sgs->group_imb = 1; 2755 sgs->group_imb = 1;
2700 2756
2701 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2757 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
@@ -2755,15 +2811,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
2755 * @sd: sched_domain whose statistics are to be updated. 2811 * @sd: sched_domain whose statistics are to be updated.
2756 * @this_cpu: Cpu for which load balance is currently performed. 2812 * @this_cpu: Cpu for which load balance is currently performed.
2757 * @idle: Idle status of this_cpu 2813 * @idle: Idle status of this_cpu
2758 * @sd_idle: Idle status of the sched_domain containing sg.
2759 * @cpus: Set of cpus considered for load balancing. 2814 * @cpus: Set of cpus considered for load balancing.
2760 * @balance: Should we balance. 2815 * @balance: Should we balance.
2761 * @sds: variable to hold the statistics for this sched_domain. 2816 * @sds: variable to hold the statistics for this sched_domain.
2762 */ 2817 */
2763static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 2818static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2764 enum cpu_idle_type idle, int *sd_idle, 2819 enum cpu_idle_type idle, const struct cpumask *cpus,
2765 const struct cpumask *cpus, int *balance, 2820 int *balance, struct sd_lb_stats *sds)
2766 struct sd_lb_stats *sds)
2767{ 2821{
2768 struct sched_domain *child = sd->child; 2822 struct sched_domain *child = sd->child;
2769 struct sched_group *sg = sd->groups; 2823 struct sched_group *sg = sd->groups;
@@ -2781,7 +2835,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2781 2835
2782 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 2836 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2783 memset(&sgs, 0, sizeof(sgs)); 2837 memset(&sgs, 0, sizeof(sgs));
2784 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, 2838 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
2785 local_group, cpus, balance, &sgs); 2839 local_group, cpus, balance, &sgs);
2786 2840
2787 if (local_group && !(*balance)) 2841 if (local_group && !(*balance))
@@ -3033,7 +3087,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3033 * @imbalance: Variable which stores amount of weighted load which should 3087 * @imbalance: Variable which stores amount of weighted load which should
3034 * be moved to restore balance/put a group to idle. 3088 * be moved to restore balance/put a group to idle.
3035 * @idle: The idle status of this_cpu. 3089 * @idle: The idle status of this_cpu.
3036 * @sd_idle: The idleness of sd
3037 * @cpus: The set of CPUs under consideration for load-balancing. 3090 * @cpus: The set of CPUs under consideration for load-balancing.
3038 * @balance: Pointer to a variable indicating if this_cpu 3091 * @balance: Pointer to a variable indicating if this_cpu
3039 * is the appropriate cpu to perform load balancing at this_level. 3092 * is the appropriate cpu to perform load balancing at this_level.
@@ -3046,7 +3099,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3046static struct sched_group * 3099static struct sched_group *
3047find_busiest_group(struct sched_domain *sd, int this_cpu, 3100find_busiest_group(struct sched_domain *sd, int this_cpu,
3048 unsigned long *imbalance, enum cpu_idle_type idle, 3101 unsigned long *imbalance, enum cpu_idle_type idle,
3049 int *sd_idle, const struct cpumask *cpus, int *balance) 3102 const struct cpumask *cpus, int *balance)
3050{ 3103{
3051 struct sd_lb_stats sds; 3104 struct sd_lb_stats sds;
3052 3105
@@ -3056,22 +3109,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3056 * Compute the various statistics relevant for load balancing at 3109 * Compute the various statistics relevant for load balancing at
3057 * this level. 3110 * this level.
3058 */ 3111 */
3059 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, 3112 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
3060 balance, &sds); 3113
3061 3114 /*
3062 /* Cases where imbalance does not exist from POV of this_cpu */ 3115 * this_cpu is not the appropriate cpu to perform load balancing at
3063 /* 1) this_cpu is not the appropriate cpu to perform load balancing 3116 * this level.
3064 * at this level.
3065 * 2) There is no busy sibling group to pull from.
3066 * 3) This group is the busiest group.
3067 * 4) This group is busier than the avg busyness at this
3068 * sched_domain.
3069 * 5) The imbalance is within the specified limit.
3070 *
3071 * Note: when doing newidle balance, if the local group has excess
3072 * capacity (i.e. nr_running < group_capacity) and the busiest group
3073 * does not have any capacity, we force a load balance to pull tasks
3074 * to the local group. In this case, we skip past checks 3, 4 and 5.
3075 */ 3117 */
3076 if (!(*balance)) 3118 if (!(*balance))
3077 goto ret; 3119 goto ret;
@@ -3080,41 +3122,55 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3080 check_asym_packing(sd, &sds, this_cpu, imbalance)) 3122 check_asym_packing(sd, &sds, this_cpu, imbalance))
3081 return sds.busiest; 3123 return sds.busiest;
3082 3124
3125 /* There is no busy sibling group to pull tasks from */
3083 if (!sds.busiest || sds.busiest_nr_running == 0) 3126 if (!sds.busiest || sds.busiest_nr_running == 0)
3084 goto out_balanced; 3127 goto out_balanced;
3085 3128
3086 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 3129 /*
3130 * If the busiest group is imbalanced the below checks don't
3131 * work because they assume all things are equal, which typically
3132 * isn't true due to cpus_allowed constraints and the like.
3133 */
3134 if (sds.group_imb)
3135 goto force_balance;
3136
3137 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
3087 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 3138 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
3088 !sds.busiest_has_capacity) 3139 !sds.busiest_has_capacity)
3089 goto force_balance; 3140 goto force_balance;
3090 3141
3142 /*
3143 * If the local group is more busy than the selected busiest group
3144 * don't try and pull any tasks.
3145 */
3091 if (sds.this_load >= sds.max_load) 3146 if (sds.this_load >= sds.max_load)
3092 goto out_balanced; 3147 goto out_balanced;
3093 3148
3149 /*
3150 * Don't pull any tasks if this group is already above the domain
3151 * average load.
3152 */
3094 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; 3153 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3095
3096 if (sds.this_load >= sds.avg_load) 3154 if (sds.this_load >= sds.avg_load)
3097 goto out_balanced; 3155 goto out_balanced;
3098 3156
3099 /* 3157 if (idle == CPU_IDLE) {
3100 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
3101 * And to check for busy balance use !idle_cpu instead of
3102 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
3103 * even when they are idle.
3104 */
3105 if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
3106 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3107 goto out_balanced;
3108 } else {
3109 /* 3158 /*
3110 * This cpu is idle. If the busiest group load doesn't 3159 * This cpu is idle. If the busiest group load doesn't
3111 * have more tasks than the number of available cpu's and 3160 * have more tasks than the number of available cpu's and
3112 * there is no imbalance between this and busiest group 3161 * there is no imbalance between this and busiest group
3113 * wrt idle cpus, it is balanced. 3162 * wrt idle cpus, it is balanced.
3114 */ 3163 */
3115 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && 3164 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
3116 sds.busiest_nr_running <= sds.busiest_group_weight) 3165 sds.busiest_nr_running <= sds.busiest_group_weight)
3117 goto out_balanced; 3166 goto out_balanced;
3167 } else {
3168 /*
3169 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
3170 * imbalance_pct to be conservative.
3171 */
3172 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3173 goto out_balanced;
3118 } 3174 }
3119 3175
3120force_balance: 3176force_balance:
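
With sd_idle gone, the early-outs in find_busiest_group() reduce to a short cascade. A condensed model follows; 'struct sd_stats' is a stand-in for struct sd_lb_stats, and the idle-dependent imbalance_pct checks are collapsed into one precomputed flag, so this is illustrative only:

        struct sd_stats {                       /* stand-in for struct sd_lb_stats */
                unsigned long busiest_nr_running, group_imb;
                unsigned long this_load, max_load, avg_load;
                int this_has_capacity, busiest_has_capacity;
                int pct_check_ok;               /* outcome of the idle-dependent checks */
        };

        static int should_force_balance(const struct sd_stats *s, int newly_idle)
        {
                if (!s->busiest_nr_running)
                        return 0;       /* no busy sibling group to pull from */
                if (s->group_imb)
                        return 1;       /* cpus_allowed-style imbalance: force it */
                if (newly_idle && s->this_has_capacity && !s->busiest_has_capacity)
                        return 1;       /* SD_BALANCE_NEWIDLE trumps SMP nice */
                if (s->this_load >= s->max_load)
                        return 0;       /* we are the busiest group */
                if (s->this_load >= s->avg_load)
                        return 0;       /* already above the domain average */
                return s->pct_check_ok;
        }
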
@@ -3193,7 +3249,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3193/* Working cpumask for load_balance and load_balance_newidle. */ 3249/* Working cpumask for load_balance and load_balance_newidle. */
3194static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 3250static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3195 3251
3196static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, 3252static int need_active_balance(struct sched_domain *sd, int idle,
3197 int busiest_cpu, int this_cpu) 3253 int busiest_cpu, int this_cpu)
3198{ 3254{
3199 if (idle == CPU_NEWLY_IDLE) { 3255 if (idle == CPU_NEWLY_IDLE) {
@@ -3225,10 +3281,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
3225 * move_tasks() will succeed. ld_moved will be true and this 3281 * move_tasks() will succeed. ld_moved will be true and this
3226 * active balance code will not be triggered. 3282 * active balance code will not be triggered.
3227 */ 3283 */
3228 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3229 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3230 return 0;
3231
3232 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) 3284 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
3233 return 0; 3285 return 0;
3234 } 3286 }
@@ -3246,7 +3298,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3246 struct sched_domain *sd, enum cpu_idle_type idle, 3298 struct sched_domain *sd, enum cpu_idle_type idle,
3247 int *balance) 3299 int *balance)
3248{ 3300{
3249 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3301 int ld_moved, all_pinned = 0, active_balance = 0;
3250 struct sched_group *group; 3302 struct sched_group *group;
3251 unsigned long imbalance; 3303 unsigned long imbalance;
3252 struct rq *busiest; 3304 struct rq *busiest;
@@ -3255,20 +3307,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3255 3307
3256 cpumask_copy(cpus, cpu_active_mask); 3308 cpumask_copy(cpus, cpu_active_mask);
3257 3309
3258 /*
3259 * When power savings policy is enabled for the parent domain, idle
3260 * sibling can pick up load irrespective of busy siblings. In this case,
3261 * let the state of idle sibling percolate up as CPU_IDLE, instead of
3262 * portraying it as CPU_NOT_IDLE.
3263 */
3264 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3265 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3266 sd_idle = 1;
3267
3268 schedstat_inc(sd, lb_count[idle]); 3310 schedstat_inc(sd, lb_count[idle]);
3269 3311
3270redo: 3312redo:
3271 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3313 group = find_busiest_group(sd, this_cpu, &imbalance, idle,
3272 cpus, balance); 3314 cpus, balance);
3273 3315
3274 if (*balance == 0) 3316 if (*balance == 0)
@@ -3330,8 +3372,7 @@ redo:
3330 if (idle != CPU_NEWLY_IDLE) 3372 if (idle != CPU_NEWLY_IDLE)
3331 sd->nr_balance_failed++; 3373 sd->nr_balance_failed++;
3332 3374
3333 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), 3375 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
3334 this_cpu)) {
3335 raw_spin_lock_irqsave(&busiest->lock, flags); 3376 raw_spin_lock_irqsave(&busiest->lock, flags);
3336 3377
3337 /* don't kick the active_load_balance_cpu_stop, 3378 /* don't kick the active_load_balance_cpu_stop,
@@ -3386,10 +3427,6 @@ redo:
3386 sd->balance_interval *= 2; 3427 sd->balance_interval *= 2;
3387 } 3428 }
3388 3429
3389 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3390 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3391 ld_moved = -1;
3392
3393 goto out; 3430 goto out;
3394 3431
3395out_balanced: 3432out_balanced:
@@ -3403,11 +3440,7 @@ out_one_pinned:
3403 (sd->balance_interval < sd->max_interval)) 3440 (sd->balance_interval < sd->max_interval))
3404 sd->balance_interval *= 2; 3441 sd->balance_interval *= 2;
3405 3442
3406 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3443 ld_moved = 0;
3407 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3408 ld_moved = -1;
3409 else
3410 ld_moved = 0;
3411out: 3444out:
3412 return ld_moved; 3445 return ld_moved;
3413} 3446}
@@ -3831,8 +3864,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3831 if (load_balance(cpu, rq, sd, idle, &balance)) { 3864 if (load_balance(cpu, rq, sd, idle, &balance)) {
3832 /* 3865 /*
3833 * We've pulled tasks over so either we're no 3866 * We've pulled tasks over so either we're no
3834 * longer idle, or one of our SMT siblings is 3867 * longer idle.
3835 * not idle.
3836 */ 3868 */
3837 idle = CPU_NOT_IDLE; 3869 idle = CPU_NOT_IDLE;
3838 } 3870 }
@@ -4079,33 +4111,62 @@ static void task_fork_fair(struct task_struct *p)
4079 * Priority of the task has changed. Check to see if we preempt 4111 * Priority of the task has changed. Check to see if we preempt
4080 * the current task. 4112 * the current task.
4081 */ 4113 */
4082static void prio_changed_fair(struct rq *rq, struct task_struct *p, 4114static void
4083 int oldprio, int running) 4115prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
4084{ 4116{
4117 if (!p->se.on_rq)
4118 return;
4119
4085 /* 4120 /*
4086 * Reschedule if we are currently running on this runqueue and 4121 * Reschedule if we are currently running on this runqueue and
4087 * our priority decreased, or if we are not currently running on 4122 * our priority decreased, or if we are not currently running on
4088 * this runqueue and our priority is higher than the current's 4123 * this runqueue and our priority is higher than the current's
4089 */ 4124 */
4090 if (running) { 4125 if (rq->curr == p) {
4091 if (p->prio > oldprio) 4126 if (p->prio > oldprio)
4092 resched_task(rq->curr); 4127 resched_task(rq->curr);
4093 } else 4128 } else
4094 check_preempt_curr(rq, p, 0); 4129 check_preempt_curr(rq, p, 0);
4095} 4130}
4096 4131
4132static void switched_from_fair(struct rq *rq, struct task_struct *p)
4133{
4134 struct sched_entity *se = &p->se;
4135 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4136
4137 /*
4138 * Ensure the task's vruntime is normalized, so that when its
4139 * switched back to the fair class the enqueue_entity(.flags=0) will
4140 * do the right thing.
4141 *
4142 * If it was on_rq, then the dequeue_entity(.flags=0) will already
4143 * have normalized the vruntime; if it was !on_rq, then only when
4144 * the task is sleeping will it still have non-normalized vruntime.
4145 */
4146 if (!se->on_rq && p->state != TASK_RUNNING) {
4147 /*
4148 * Fix up our vruntime so that the current sleep doesn't
4149 * cause 'unlimited' sleep bonus.
4150 */
4151 place_entity(cfs_rq, se, 0);
4152 se->vruntime -= cfs_rq->min_vruntime;
4153 }
4154}
4155
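
The normalization above only matters because enqueue_entity(.flags=0) later adds min_vruntime back in: storing the vruntime relative to min_vruntime keeps the task's lead or lag constant no matter how far the queue advances while it is away. A tiny userspace check of that invariant, with arbitrary values:

        #include <assert.h>

        int main(void)
        {
                long min_vruntime = 1000, vruntime = 1200; /* task is 200 ahead */

                vruntime -= min_vruntime;  /* switched_from_fair(): store relative */
                min_vruntime = 5000;       /* queue advances while the task sleeps */
                vruntime += min_vruntime;  /* enqueue re-bases on the new minimum */

                assert(vruntime - min_vruntime == 200); /* lead preserved, no bonus */
                return 0;
        }
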
4097/* 4156/*
4098 * We switched to the sched_fair class. 4157 * We switched to the sched_fair class.
4099 */ 4158 */
4100static void switched_to_fair(struct rq *rq, struct task_struct *p, 4159static void switched_to_fair(struct rq *rq, struct task_struct *p)
4101 int running)
4102{ 4160{
4161 if (!p->se.on_rq)
4162 return;
4163
4103 /* 4164 /*
4104 * We were most likely switched from sched_rt, so 4165 * We were most likely switched from sched_rt, so
4105 * kick off the schedule if running, otherwise just see 4166 * kick off the schedule if running, otherwise just see
4106 * if we can still preempt the current task. 4167 * if we can still preempt the current task.
4107 */ 4168 */
4108 if (running) 4169 if (rq->curr == p)
4109 resched_task(rq->curr); 4170 resched_task(rq->curr);
4110 else 4171 else
4111 check_preempt_curr(rq, p, 0); 4172 check_preempt_curr(rq, p, 0);
@@ -4171,6 +4232,7 @@ static const struct sched_class fair_sched_class = {
4171 .enqueue_task = enqueue_task_fair, 4232 .enqueue_task = enqueue_task_fair,
4172 .dequeue_task = dequeue_task_fair, 4233 .dequeue_task = dequeue_task_fair,
4173 .yield_task = yield_task_fair, 4234 .yield_task = yield_task_fair,
4235 .yield_to_task = yield_to_task_fair,
4174 4236
4175 .check_preempt_curr = check_preempt_wakeup, 4237 .check_preempt_curr = check_preempt_wakeup,
4176 4238
@@ -4191,6 +4253,7 @@ static const struct sched_class fair_sched_class = {
4191 .task_fork = task_fork_fair, 4253 .task_fork = task_fork_fair,
4192 4254
4193 .prio_changed = prio_changed_fair, 4255 .prio_changed = prio_changed_fair,
4256 .switched_from = switched_from_fair,
4194 .switched_to = switched_to_fair, 4257 .switched_to = switched_to_fair,
4195 4258
4196 .get_rr_interval = get_rr_interval_fair, 4259 .get_rr_interval = get_rr_interval_fair,
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f402c87..c82f26c1b7c 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq)
52{ 52{
53} 53}
54 54
55static void switched_to_idle(struct rq *rq, struct task_struct *p, 55static void switched_to_idle(struct rq *rq, struct task_struct *p)
56 int running)
57{ 56{
58 /* Can this actually happen?? */ 57 BUG();
59 if (running)
60 resched_task(rq->curr);
61 else
62 check_preempt_curr(rq, p, 0);
63} 58}
64 59
65static void prio_changed_idle(struct rq *rq, struct task_struct *p, 60static void
66 int oldprio, int running) 61prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
67{ 62{
68 /* This can happen for hot plug CPUs */ 63 BUG();
69
70 /*
71 * Reschedule if we are currently running on this runqueue and
72 * our priority decreased, or if we are not currently running on
73 * this runqueue and our priority is higher than the current's
74 */
75 if (running) {
76 if (p->prio > oldprio)
77 resched_task(rq->curr);
78 } else
79 check_preempt_curr(rq, p, 0);
80} 64}
81 65
82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 66static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 01f75a5f17a..db308cb08b7 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1599,8 +1599,7 @@ static void rq_offline_rt(struct rq *rq)
1599 * When switching from the rt queue, we bring ourselves to a position 1599 * When switching from the rt queue, we bring ourselves to a position
1600 * that we might want to pull RT tasks from other runqueues. 1600 * that we might want to pull RT tasks from other runqueues.
1601 */ 1601 */
1602static void switched_from_rt(struct rq *rq, struct task_struct *p, 1602static void switched_from_rt(struct rq *rq, struct task_struct *p)
1603 int running)
1604{ 1603{
1605 /* 1604 /*
1606 * If there are other RT tasks then we will reschedule 1605 * If there are other RT tasks then we will reschedule
@@ -1609,7 +1608,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
1609 * we may need to handle the pulling of RT tasks 1608 * we may need to handle the pulling of RT tasks
1610 * now. 1609 * now.
1611 */ 1610 */
1612 if (!rq->rt.rt_nr_running) 1611 if (p->se.on_rq && !rq->rt.rt_nr_running)
1613 pull_rt_task(rq); 1612 pull_rt_task(rq);
1614} 1613}
1615 1614
@@ -1628,8 +1627,7 @@ static inline void init_sched_rt_class(void)
1628 * with RT tasks. In this case we try to push them off to 1627 * with RT tasks. In this case we try to push them off to
1629 * other runqueues. 1628 * other runqueues.
1630 */ 1629 */
1631static void switched_to_rt(struct rq *rq, struct task_struct *p, 1630static void switched_to_rt(struct rq *rq, struct task_struct *p)
1632 int running)
1633{ 1631{
1634 int check_resched = 1; 1632 int check_resched = 1;
1635 1633
@@ -1640,7 +1638,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1640 * If that current running task is also an RT task 1638 * If that current running task is also an RT task
1641 * then see if we can move to another run queue. 1639 * then see if we can move to another run queue.
1642 */ 1640 */
1643 if (!running) { 1641 if (p->se.on_rq && rq->curr != p) {
1644#ifdef CONFIG_SMP 1642#ifdef CONFIG_SMP
1645 if (rq->rt.overloaded && push_rt_task(rq) && 1643 if (rq->rt.overloaded && push_rt_task(rq) &&
1646 /* Don't resched if we changed runqueues */ 1644 /* Don't resched if we changed runqueues */
@@ -1656,10 +1654,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1656 * Priority of the task has changed. This may cause 1654 * Priority of the task has changed. This may cause
1657 * us to initiate a push or pull. 1655 * us to initiate a push or pull.
1658 */ 1656 */
1659static void prio_changed_rt(struct rq *rq, struct task_struct *p, 1657static void
1660 int oldprio, int running) 1658prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1661{ 1659{
1662 if (running) { 1660 if (!p->se.on_rq)
1661 return;
1662
1663 if (rq->curr == p) {
1663#ifdef CONFIG_SMP 1664#ifdef CONFIG_SMP
1664 /* 1665 /*
1665 * If our priority decreases while running, we 1666 * If our priority decreases while running, we
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 2bf6b47058c..84ec9bcf82d 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq)
59{ 59{
60} 60}
61 61
62static void switched_to_stop(struct rq *rq, struct task_struct *p, 62static void switched_to_stop(struct rq *rq, struct task_struct *p)
63 int running)
64{ 63{
65 BUG(); /* it's impossible to change to this class */ 64 BUG(); /* it's impossible to change to this class */
66} 65}
67 66
68static void prio_changed_stop(struct rq *rq, struct task_struct *p, 67static void
69 int oldprio, int running) 68prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
70{ 69{
71 BUG(); /* how!? what priority? */ 70 BUG(); /* how!? what priority? */
72} 71}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 68eb5efec38..56e5dec837f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat);
54 54
55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; 55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
56 56
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
@@ -311,9 +311,21 @@ void irq_enter(void)
311} 311}
312 312
313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
314# define invoke_softirq() __do_softirq() 314static inline void invoke_softirq(void)
315{
316 if (!force_irqthreads)
317 __do_softirq();
318 else
319 wakeup_softirqd();
320}
315#else 321#else
316# define invoke_softirq() do_softirq() 322static inline void invoke_softirq(void)
323{
324 if (!force_irqthreads)
325 do_softirq();
326 else
327 wakeup_softirqd();
328}
317#endif 329#endif
318 330
319/* 331/*
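
force_irqthreads is the flag behind the new "threadirqs" boot parameter; its definition lives outside this file (kernel/irq/manage.c in the same series). A plausible sketch of the wiring this hunk assumes:

        __read_mostly bool force_irqthreads;

        static int __init setup_forced_irqthreads(char *arg)
        {
                force_irqthreads = true;  /* set when booting with "threadirqs" */
                return 0;
        }
        early_param("threadirqs", setup_forced_irqthreads);
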
@@ -721,7 +733,6 @@ static int run_ksoftirqd(void * __bind_cpu)
721{ 733{
722 set_current_state(TASK_INTERRUPTIBLE); 734 set_current_state(TASK_INTERRUPTIBLE);
723 735
724 current->flags |= PF_KSOFTIRQD;
725 while (!kthread_should_stop()) { 736 while (!kthread_should_stop()) {
726 preempt_disable(); 737 preempt_disable();
727 if (!local_softirq_pending()) { 738 if (!local_softirq_pending()) {
@@ -738,7 +749,10 @@ static int run_ksoftirqd(void * __bind_cpu)
738 don't process */ 749 don't process */
739 if (cpu_is_offline((long)__bind_cpu)) 750 if (cpu_is_offline((long)__bind_cpu))
740 goto wait_to_die; 751 goto wait_to_die;
741 do_softirq(); 752 local_irq_disable();
753 if (local_softirq_pending())
754 __do_softirq();
755 local_irq_enable();
742 preempt_enable_no_resched(); 756 preempt_enable_no_resched();
743 cond_resched(); 757 cond_resched();
744 preempt_disable(); 758 preempt_disable();
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c782fe9924c..25cc41cd8f3 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -186,3 +186,8 @@ cond_syscall(sys_perf_event_open);
186/* fanotify! */ 186/* fanotify! */
187cond_syscall(sys_fanotify_init); 187cond_syscall(sys_fanotify_init);
188cond_syscall(sys_fanotify_mark); 188cond_syscall(sys_fanotify_mark);
189
190/* open by handle */
191cond_syscall(sys_name_to_handle_at);
192cond_syscall(sys_open_by_handle_at);
193cond_syscall(compat_sys_open_by_handle_at);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4eed0af5d14..51054fea5d9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -361,20 +361,13 @@ static struct ctl_table kern_table[] = {
361 .mode = 0644, 361 .mode = 0644,
362 .proc_handler = sched_rt_handler, 362 .proc_handler = sched_rt_handler,
363 }, 363 },
364 {
365 .procname = "sched_compat_yield",
366 .data = &sysctl_sched_compat_yield,
367 .maxlen = sizeof(unsigned int),
368 .mode = 0644,
369 .proc_handler = proc_dointvec,
370 },
371#ifdef CONFIG_SCHED_AUTOGROUP 364#ifdef CONFIG_SCHED_AUTOGROUP
372 { 365 {
373 .procname = "sched_autogroup_enabled", 366 .procname = "sched_autogroup_enabled",
374 .data = &sysctl_sched_autogroup_enabled, 367 .data = &sysctl_sched_autogroup_enabled,
375 .maxlen = sizeof(unsigned int), 368 .maxlen = sizeof(unsigned int),
376 .mode = 0644, 369 .mode = 0644,
377 .proc_handler = proc_dointvec, 370 .proc_handler = proc_dointvec_minmax,
378 .extra1 = &zero, 371 .extra1 = &zero,
379 .extra2 = &one, 372 .extra2 = &one,
380 }, 373 },
@@ -948,7 +941,7 @@ static struct ctl_table kern_table[] = {
948 .data = &sysctl_perf_event_sample_rate, 941 .data = &sysctl_perf_event_sample_rate,
949 .maxlen = sizeof(sysctl_perf_event_sample_rate), 942 .maxlen = sizeof(sysctl_perf_event_sample_rate),
950 .mode = 0644, 943 .mode = 0644,
951 .proc_handler = proc_dointvec, 944 .proc_handler = perf_proc_update_handler,
952 }, 945 },
953#endif 946#endif
954#ifdef CONFIG_KMEMCHECK 947#ifdef CONFIG_KMEMCHECK
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b875bedf7c9..3b8e028b960 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1321 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1321 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1322{ 1322{
1323 const struct bin_table *table = NULL; 1323 const struct bin_table *table = NULL;
1324 struct nameidata nd;
1325 struct vfsmount *mnt; 1324 struct vfsmount *mnt;
1326 struct file *file; 1325 struct file *file;
1327 ssize_t result; 1326 ssize_t result;
1328 char *pathname; 1327 char *pathname;
1329 int flags; 1328 int flags;
1330 int acc_mode;
1331 1329
1332 pathname = sysctl_getname(name, nlen, &table); 1330 pathname = sysctl_getname(name, nlen, &table);
1333 result = PTR_ERR(pathname); 1331 result = PTR_ERR(pathname);
@@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1337 /* How should the sysctl be accessed? */ 1335 /* How should the sysctl be accessed? */
1338 if (oldval && oldlen && newval && newlen) { 1336 if (oldval && oldlen && newval && newlen) {
1339 flags = O_RDWR; 1337 flags = O_RDWR;
1340 acc_mode = MAY_READ | MAY_WRITE;
1341 } else if (newval && newlen) { 1338 } else if (newval && newlen) {
1342 flags = O_WRONLY; 1339 flags = O_WRONLY;
1343 acc_mode = MAY_WRITE;
1344 } else if (oldval && oldlen) { 1340 } else if (oldval && oldlen) {
1345 flags = O_RDONLY; 1341 flags = O_RDONLY;
1346 acc_mode = MAY_READ;
1347 } else { 1342 } else {
1348 result = 0; 1343 result = 0;
1349 goto out_putname; 1344 goto out_putname;
1350 } 1345 }
1351 1346
1352 mnt = current->nsproxy->pid_ns->proc_mnt; 1347 mnt = current->nsproxy->pid_ns->proc_mnt;
1353 result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1354 if (result)
1355 goto out_putname;
1356
1357 result = may_open(&nd.path, acc_mode, flags);
1358 if (result)
1359 goto out_putpath;
1360
1361 file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
1362 result = PTR_ERR(file); 1349 result = PTR_ERR(file);
1363 if (IS_ERR(file)) 1350 if (IS_ERR(file))
1364 goto out_putname; 1351 goto out_putname;
@@ -1370,10 +1357,6 @@ out_putname:
1370 putname(pathname); 1357 putname(pathname);
1371out: 1358out:
1372 return result; 1359 return result;
1373
1374out_putpath:
1375 path_put(&nd.path);
1376 goto out_putname;
1377} 1360}
1378 1361
1379 1362
diff --git a/kernel/time.c b/kernel/time.c
index 32174359576..8e8dc6d705c 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -150,7 +150,7 @@ static inline void warp_clock(void)
150 * various programs will get confused when the clock gets warped. 150 * various programs will get confused when the clock gets warped.
151 */ 151 */
152 152
153int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) 153int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
154{ 154{
155 static int firsttime = 1; 155 static int firsttime = 1;
156 int error = 0; 156 int error = 0;
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x)
645} 645}
646 646
647/** 647/**
648 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies 648 * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
649 * 649 *
650 * @n: nsecs in u64 650 * @n: nsecs in u64
651 * 651 *
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x)
657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) 657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years 658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
659 */ 659 */
660unsigned long nsecs_to_jiffies(u64 n) 660u64 nsecs_to_jiffies64(u64 n)
661{ 661{
662#if (NSEC_PER_SEC % HZ) == 0 662#if (NSEC_PER_SEC % HZ) == 0
663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ 663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
@@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n)
674#endif 674#endif
675} 675}
676 676
677#if (BITS_PER_LONG < 64) 677/**
678u64 get_jiffies_64(void) 678 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
679 *
680 * @n: nsecs in u64
681 *
682 * Unlike {m,u}secs_to_jiffies, the input type is u64, not unsigned int.
683 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
684 * for the scheduler, not for use in device drivers to calculate timeout values.
685 *
686 * note:
687 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
688 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
689 */
690unsigned long nsecs_to_jiffies(u64 n)
679{ 691{
680 unsigned long seq; 692 return (unsigned long)nsecs_to_jiffies64(n);
681 u64 ret;
682
683 do {
684 seq = read_seqbegin(&xtime_lock);
685 ret = jiffies_64;
686 } while (read_seqretry(&xtime_lock, seq));
687 return ret;
688} 693}
689EXPORT_SYMBOL(get_jiffies_64);
690#endif
691
692EXPORT_SYMBOL(jiffies);
693 694
694/* 695/*
695 * Add two timespec values and do a safety check for overflow. 696 * Add two timespec values and do a safety check for overflow.
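
The split leaves nsecs_to_jiffies64() doing the real conversion and nsecs_to_jiffies() as a truncating wrapper for scheduler statistics. A worked example of the common-case branch, assuming HZ=1000 so that NSEC_PER_SEC/HZ is exactly 1,000,000 ns per jiffy:

        u64 j64 = nsecs_to_jiffies64(1500000);          /* 1.5 ms -> 1 jiffy, truncated */
        unsigned long j = nsecs_to_jiffies(1500000);    /* same value, narrowed */
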
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ee266620b06..b0425991e9a 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o
2 3
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d7395fdfb9f..0d74b9ba90c 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,7 +18,6 @@
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 5404a845690..b2fa506667c 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -22,8 +22,11 @@
22************************************************************************/ 22************************************************************************/
23#include <linux/clocksource.h> 23#include <linux/clocksource.h>
24#include <linux/jiffies.h> 24#include <linux/jiffies.h>
25#include <linux/module.h>
25#include <linux/init.h> 26#include <linux/init.h>
26 27
28#include "tick-internal.h"
29
27/* The Jiffies based clocksource is the lowest common 30/* The Jiffies based clocksource is the lowest common
28 * denominator clock source which should function on 31 * denominator clock source which should function on
29 * all systems. It has the same coarse resolution as 32 * all systems. It has the same coarse resolution as
@@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = {
64 .shift = JIFFIES_SHIFT, 67 .shift = JIFFIES_SHIFT,
65}; 68};
66 69
70#if (BITS_PER_LONG < 64)
71u64 get_jiffies_64(void)
72{
73 unsigned long seq;
74 u64 ret;
75
76 do {
77 seq = read_seqbegin(&xtime_lock);
78 ret = jiffies_64;
79 } while (read_seqretry(&xtime_lock, seq));
80 return ret;
81}
82EXPORT_SYMBOL(get_jiffies_64);
83#endif
84
85EXPORT_SYMBOL(jiffies);
86
67static int __init init_jiffies_clocksource(void) 87static int __init init_jiffies_clocksource(void)
68{ 88{
69 return clocksource_register(&clocksource_jiffies); 89 return clocksource_register(&clocksource_jiffies);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5c00242fa92..5f1bb8e2008 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -16,6 +16,8 @@
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h> 17#include <linux/module.h>
18 18
19#include "tick-internal.h"
20
19/* 21/*
20 * NTP timekeeping variables: 22 * NTP timekeeping variables:
21 */ 23 */
@@ -646,6 +648,17 @@ int do_adjtimex(struct timex *txc)
646 hrtimer_cancel(&leap_timer); 648 hrtimer_cancel(&leap_timer);
647 } 649 }
648 650
651 if (txc->modes & ADJ_SETOFFSET) {
652 struct timespec delta;
653 delta.tv_sec = txc->time.tv_sec;
654 delta.tv_nsec = txc->time.tv_usec;
655 if (!(txc->modes & ADJ_NANO))
656 delta.tv_nsec *= 1000;
657 result = timekeeping_inject_offset(&delta);
658 if (result)
659 return result;
660 }
661
649 getnstimeofday(&ts); 662 getnstimeofday(&ts);
650 663
651 write_seqlock_irq(&xtime_lock); 664 write_seqlock_irq(&xtime_lock);
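
From userspace, the new ADJ_SETOFFSET mode steps the clock atomically in one call, instead of the racy gettimeofday()/settimeofday() read-modify-write. A minimal sketch; it needs CAP_SYS_TIME and a libc/uapi that already defines ADJ_SETOFFSET:

        #include <sys/timex.h>
        #include <stdio.h>

        int main(void)
        {
                struct timex tx = { 0 };

                tx.modes = ADJ_SETOFFSET | ADJ_NANO;  /* tv_usec carries nanoseconds */
                tx.time.tv_sec = 0;
                tx.time.tv_usec = 2500000;            /* step the clock by +2.5 ms */

                if (adjtimex(&tx) == -1)
                        perror("adjtimex");
                return 0;
        }
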
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
new file mode 100644
index 00000000000..25028dd4fa1
--- /dev/null
+++ b/kernel/time/posix-clock.c
@@ -0,0 +1,451 @@
1/*
2 * posix-clock.c - support for dynamic clock devices
3 *
4 * Copyright (C) 2010 OMICRON electronics GmbH
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20#include <linux/device.h>
21#include <linux/file.h>
22#include <linux/mutex.h>
23#include <linux/posix-clock.h>
24#include <linux/slab.h>
25#include <linux/syscalls.h>
26#include <linux/uaccess.h>
27
28static void delete_clock(struct kref *kref);
29
30/*
31 * Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
32 */
33static struct posix_clock *get_posix_clock(struct file *fp)
34{
35 struct posix_clock *clk = fp->private_data;
36
37 mutex_lock(&clk->mutex);
38
39 if (!clk->zombie)
40 return clk;
41
42 mutex_unlock(&clk->mutex);
43
44 return NULL;
45}
46
47static void put_posix_clock(struct posix_clock *clk)
48{
49 mutex_unlock(&clk->mutex);
50}
51
52static ssize_t posix_clock_read(struct file *fp, char __user *buf,
53 size_t count, loff_t *ppos)
54{
55 struct posix_clock *clk = get_posix_clock(fp);
56 int err = -EINVAL;
57
58 if (!clk)
59 return -ENODEV;
60
61 if (clk->ops.read)
62 err = clk->ops.read(clk, fp->f_flags, buf, count);
63
64 put_posix_clock(clk);
65
66 return err;
67}
68
69static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
70{
71 struct posix_clock *clk = get_posix_clock(fp);
72 int result = 0;
73
74 if (!clk)
75 return -ENODEV;
76
77 if (clk->ops.poll)
78 result = clk->ops.poll(clk, fp, wait);
79
80 put_posix_clock(clk);
81
82 return result;
83}
84
85static int posix_clock_fasync(int fd, struct file *fp, int on)
86{
87 struct posix_clock *clk = get_posix_clock(fp);
88 int err = 0;
89
90 if (!clk)
91 return -ENODEV;
92
93 if (clk->ops.fasync)
94 err = clk->ops.fasync(clk, fd, fp, on);
95
96 put_posix_clock(clk);
97
98 return err;
99}
100
101static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma)
102{
103 struct posix_clock *clk = get_posix_clock(fp);
104 int err = -ENODEV;
105
106 if (!clk)
107 return -ENODEV;
108
109 if (clk->ops.mmap)
110 err = clk->ops.mmap(clk, vma);
111
112 put_posix_clock(clk);
113
114 return err;
115}
116
117static long posix_clock_ioctl(struct file *fp,
118 unsigned int cmd, unsigned long arg)
119{
120 struct posix_clock *clk = get_posix_clock(fp);
121 int err = -ENOTTY;
122
123 if (!clk)
124 return -ENODEV;
125
126 if (clk->ops.ioctl)
127 err = clk->ops.ioctl(clk, cmd, arg);
128
129 put_posix_clock(clk);
130
131 return err;
132}
133
134#ifdef CONFIG_COMPAT
135static long posix_clock_compat_ioctl(struct file *fp,
136 unsigned int cmd, unsigned long arg)
137{
138 struct posix_clock *clk = get_posix_clock(fp);
139 int err = -ENOTTY;
140
141 if (!clk)
142 return -ENODEV;
143
144 if (clk->ops.ioctl)
145 err = clk->ops.ioctl(clk, cmd, arg);
146
147 put_posix_clock(clk);
148
149 return err;
150}
151#endif
152
153static int posix_clock_open(struct inode *inode, struct file *fp)
154{
155 int err;
156 struct posix_clock *clk =
157 container_of(inode->i_cdev, struct posix_clock, cdev);
158
159 mutex_lock(&clk->mutex);
160
161 if (clk->zombie) {
162 err = -ENODEV;
163 goto out;
164 }
165 if (clk->ops.open)
166 err = clk->ops.open(clk, fp->f_mode);
167 else
168 err = 0;
169
170 if (!err) {
171 kref_get(&clk->kref);
172 fp->private_data = clk;
173 }
174out:
175 mutex_unlock(&clk->mutex);
176 return err;
177}
178
179static int posix_clock_release(struct inode *inode, struct file *fp)
180{
181 struct posix_clock *clk = fp->private_data;
182 int err = 0;
183
184 if (clk->ops.release)
185 err = clk->ops.release(clk);
186
187 kref_put(&clk->kref, delete_clock);
188
189 fp->private_data = NULL;
190
191 return err;
192}
193
194static const struct file_operations posix_clock_file_operations = {
195 .owner = THIS_MODULE,
196 .llseek = no_llseek,
197 .read = posix_clock_read,
198 .poll = posix_clock_poll,
199 .unlocked_ioctl = posix_clock_ioctl,
200 .open = posix_clock_open,
201 .release = posix_clock_release,
202 .fasync = posix_clock_fasync,
203 .mmap = posix_clock_mmap,
204#ifdef CONFIG_COMPAT
205 .compat_ioctl = posix_clock_compat_ioctl,
206#endif
207};
208
209int posix_clock_register(struct posix_clock *clk, dev_t devid)
210{
211 int err;
212
213 kref_init(&clk->kref);
214 mutex_init(&clk->mutex);
215
216 cdev_init(&clk->cdev, &posix_clock_file_operations);
217 clk->cdev.owner = clk->ops.owner;
218 err = cdev_add(&clk->cdev, devid, 1);
219 if (err)
220 goto no_cdev;
221
222 return err;
223no_cdev:
224 mutex_destroy(&clk->mutex);
225 return err;
226}
227EXPORT_SYMBOL_GPL(posix_clock_register);
228
229static void delete_clock(struct kref *kref)
230{
231 struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
232 mutex_destroy(&clk->mutex);
233 if (clk->release)
234 clk->release(clk);
235}
236
237void posix_clock_unregister(struct posix_clock *clk)
238{
239 cdev_del(&clk->cdev);
240
241 mutex_lock(&clk->mutex);
242 clk->zombie = true;
243 mutex_unlock(&clk->mutex);
244
245 kref_put(&clk->kref, delete_clock);
246}
247EXPORT_SYMBOL_GPL(posix_clock_unregister);
248
249struct posix_clock_desc {
250 struct file *fp;
251 struct posix_clock *clk;
252};
253
254static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
255{
256 struct file *fp = fget(CLOCKID_TO_FD(id));
257 int err = -EINVAL;
258
259 if (!fp)
260 return err;
261
262 if (fp->f_op->open != posix_clock_open || !fp->private_data)
263 goto out;
264
265 cd->fp = fp;
266 cd->clk = get_posix_clock(fp);
267
268 err = cd->clk ? 0 : -ENODEV;
269out:
270 if (err)
271 fput(fp);
272 return err;
273}
274
275static void put_clock_desc(struct posix_clock_desc *cd)
276{
277 put_posix_clock(cd->clk);
278 fput(cd->fp);
279}
280
281static int pc_clock_adjtime(clockid_t id, struct timex *tx)
282{
283 struct posix_clock_desc cd;
284 int err;
285
286 err = get_clock_desc(id, &cd);
287 if (err)
288 return err;
289
290 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
291 err = -EACCES;
292 goto out;
293 }
294
295 if (cd.clk->ops.clock_adjtime)
296 err = cd.clk->ops.clock_adjtime(cd.clk, tx);
297 else
298 err = -EOPNOTSUPP;
299out:
300 put_clock_desc(&cd);
301
302 return err;
303}
304
305static int pc_clock_gettime(clockid_t id, struct timespec *ts)
306{
307 struct posix_clock_desc cd;
308 int err;
309
310 err = get_clock_desc(id, &cd);
311 if (err)
312 return err;
313
314 if (cd.clk->ops.clock_gettime)
315 err = cd.clk->ops.clock_gettime(cd.clk, ts);
316 else
317 err = -EOPNOTSUPP;
318
319 put_clock_desc(&cd);
320
321 return err;
322}
323
324static int pc_clock_getres(clockid_t id, struct timespec *ts)
325{
326 struct posix_clock_desc cd;
327 int err;
328
329 err = get_clock_desc(id, &cd);
330 if (err)
331 return err;
332
333 if (cd.clk->ops.clock_getres)
334 err = cd.clk->ops.clock_getres(cd.clk, ts);
335 else
336 err = -EOPNOTSUPP;
337
338 put_clock_desc(&cd);
339
340 return err;
341}
342
343static int pc_clock_settime(clockid_t id, const struct timespec *ts)
344{
345 struct posix_clock_desc cd;
346 int err;
347
348 err = get_clock_desc(id, &cd);
349 if (err)
350 return err;
351
352 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
353 err = -EACCES;
354 goto out;
355 }
356
357 if (cd.clk->ops.clock_settime)
358 err = cd.clk->ops.clock_settime(cd.clk, ts);
359 else
360 err = -EOPNOTSUPP;
361out:
362 put_clock_desc(&cd);
363
364 return err;
365}
366
367static int pc_timer_create(struct k_itimer *kit)
368{
369 clockid_t id = kit->it_clock;
370 struct posix_clock_desc cd;
371 int err;
372
373 err = get_clock_desc(id, &cd);
374 if (err)
375 return err;
376
377 if (cd.clk->ops.timer_create)
378 err = cd.clk->ops.timer_create(cd.clk, kit);
379 else
380 err = -EOPNOTSUPP;
381
382 put_clock_desc(&cd);
383
384 return err;
385}
386
387static int pc_timer_delete(struct k_itimer *kit)
388{
389 clockid_t id = kit->it_clock;
390 struct posix_clock_desc cd;
391 int err;
392
393 err = get_clock_desc(id, &cd);
394 if (err)
395 return err;
396
397 if (cd.clk->ops.timer_delete)
398 err = cd.clk->ops.timer_delete(cd.clk, kit);
399 else
400 err = -EOPNOTSUPP;
401
402 put_clock_desc(&cd);
403
404 return err;
405}
406
407static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
408{
409 clockid_t id = kit->it_clock;
410 struct posix_clock_desc cd;
411
412 if (get_clock_desc(id, &cd))
413 return;
414
415 if (cd.clk->ops.timer_gettime)
416 cd.clk->ops.timer_gettime(cd.clk, kit, ts);
417
418 put_clock_desc(&cd);
419}
420
421static int pc_timer_settime(struct k_itimer *kit, int flags,
422 struct itimerspec *ts, struct itimerspec *old)
423{
424 clockid_t id = kit->it_clock;
425 struct posix_clock_desc cd;
426 int err;
427
428 err = get_clock_desc(id, &cd);
429 if (err)
430 return err;
431
432 if (cd.clk->ops.timer_settime)
433 err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
434 else
435 err = -EOPNOTSUPP;
436
437 put_clock_desc(&cd);
438
439 return err;
440}
441
442struct k_clock clock_posix_dynamic = {
443 .clock_getres = pc_clock_getres,
444 .clock_set = pc_clock_settime,
445 .clock_get = pc_clock_gettime,
446 .clock_adj = pc_clock_adjtime,
447 .timer_create = pc_timer_create,
448 .timer_set = pc_timer_settime,
449 .timer_del = pc_timer_delete,
450 .timer_get = pc_timer_gettime,
451};
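
A minimal driver-side sketch of the new interface, assuming a chardev number already reserved via alloc_chrdev_region(); 'my_devid' and the helpers are hypothetical and error paths are trimmed:

        #include <linux/module.h>
        #include <linux/posix-clock.h>
        #include <linux/time.h>

        static dev_t my_devid;  /* assumed: filled in by alloc_chrdev_region() */

        static int my_clock_gettime(struct posix_clock *pc, struct timespec *ts)
        {
                getnstimeofday(ts);  /* stand-in for reading real hardware */
                return 0;
        }

        static struct posix_clock my_clock = {
                .ops = {
                        .owner          = THIS_MODULE,
                        .clock_gettime  = my_clock_gettime,
                },
        };

        static int __init my_clock_init(void)
        {
                return posix_clock_register(&my_clock, my_devid);
        }

        static void __exit my_clock_exit(void)
        {
                posix_clock_unregister(&my_clock);  /* marks the clock zombie */
        }

        module_init(my_clock_init);
        module_exit(my_clock_exit);
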
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index a3b5aff6260..da800ffa810 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index ed228ef6f6b..119528de823 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include <asm/irq_regs.h> 22#include <asm/irq_regs.h>
24 23
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f65d3a723a6..1009b06d6f8 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
1/* 1/*
2 * tick internal variable and functions used by low/high res code 2 * tick internal variable and functions used by low/high res code
3 */ 3 */
4#include <linux/hrtimer.h>
5#include <linux/tick.h>
6
7#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
4 8
5#define TICK_DO_TIMER_NONE -1 9#define TICK_DO_TIMER_NONE -1
6#define TICK_DO_TIMER_BOOT -2 10#define TICK_DO_TIMER_BOOT -2
@@ -135,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
135{ 139{
136 return !(dev->features & CLOCK_EVT_FEAT_DUMMY); 140 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
137} 141}
142
143#endif
144
145extern void do_timer(unsigned long ticks);
146extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 5cbc101f908..2d04411a5f0 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c55ea243347..d5097c44b40 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,7 +19,6 @@
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/tick.h>
23#include <linux/module.h> 22#include <linux/module.h>
24 23
25#include <asm/irq_regs.h> 24#include <asm/irq_regs.h>
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d27c7562902..3bd7e3d5c63 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -353,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday);
353 * 353 *
354 * Sets the time of day to the new time and update NTP and notify hrtimers 354 * Sets the time of day to the new time and update NTP and notify hrtimers
355 */ 355 */
356int do_settimeofday(struct timespec *tv) 356int do_settimeofday(const struct timespec *tv)
357{ 357{
358 struct timespec ts_delta; 358 struct timespec ts_delta;
359 unsigned long flags; 359 unsigned long flags;
@@ -387,6 +387,42 @@ int do_settimeofday(struct timespec *tv)
387 387
388EXPORT_SYMBOL(do_settimeofday); 388EXPORT_SYMBOL(do_settimeofday);
389 389
390
391/**
392 * timekeeping_inject_offset - Adds or subtracts from the current time.
393 * @ts: pointer to the timespec variable containing the offset
394 *
395 * Adds or subtracts an offset value from the current time.
396 */
397int timekeeping_inject_offset(struct timespec *ts)
398{
399 unsigned long flags;
400
401 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
402 return -EINVAL;
403
404 write_seqlock_irqsave(&xtime_lock, flags);
405
406 timekeeping_forward_now();
407
408 xtime = timespec_add(xtime, *ts);
409 wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
410
411 timekeeper.ntp_error = 0;
412 ntp_clear();
413
414 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
415 timekeeper.mult);
416
417 write_sequnlock_irqrestore(&xtime_lock, flags);
418
419 /* signal hrtimers about time change */
420 clock_was_set();
421
422 return 0;
423}
424EXPORT_SYMBOL(timekeeping_inject_offset);
425
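A minimal caller sketch for the new interface (hypothetical, not part of this patch): the timespec passed in must have a normalized tv_nsec, or timekeeping_inject_offset() returns -EINVAL.

	/* Hypothetical example: step the clock forward by 1.5 seconds. */
	static int example_step_clock(void)
	{
		struct timespec offset = {
			.tv_sec  = 1,
			.tv_nsec = 500 * NSEC_PER_MSEC, /* must be < NSEC_PER_SEC */
		};

		return timekeeping_inject_offset(&offset);
	}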
390/** 426/**
391 * change_clocksource - Swaps clocksources if a new one is available 427 * change_clocksource - Swaps clocksources if a new one is available
392 * 428 *
@@ -779,7 +815,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
779 * 815 *
780 * Called from the timer interrupt, must hold a write on xtime_lock. 816 * Called from the timer interrupt, must hold a write on xtime_lock.
781 */ 817 */
782void update_wall_time(void) 818static void update_wall_time(void)
783{ 819{
784 struct clocksource *clock; 820 struct clocksource *clock;
785 cycle_t offset; 821 cycle_t offset;
@@ -871,7 +907,7 @@ void update_wall_time(void)
871 * getboottime - Return the real time of system boot. 907 * getboottime - Return the real time of system boot.
872 * @ts: pointer to the timespec to be set 908 * @ts: pointer to the timespec to be set
873 * 909 *
874 * Returns the time of day in a timespec. 910 * Returns the wall-time of boot in a timespec.
875 * 911 *
876 * This is based on the wall_to_monotonic offset and the total suspend 912 * This is based on the wall_to_monotonic offset and the total suspend
877 * time. Calls to settimeofday will affect the value returned (which 913 * time. Calls to settimeofday will affect the value returned (which
@@ -889,6 +925,55 @@ void getboottime(struct timespec *ts)
889} 925}
890EXPORT_SYMBOL_GPL(getboottime); 926EXPORT_SYMBOL_GPL(getboottime);
891 927
928
929/**
930 * get_monotonic_boottime - Returns monotonic time since boot
931 * @ts: pointer to the timespec to be set
932 *
933 * Returns the monotonic time since boot in a timespec.
934 *
 935 * This is similar to CLOCK_MONOTONIC/ktime_get_ts, but also
936 * includes the time spent in suspend.
937 */
938void get_monotonic_boottime(struct timespec *ts)
939{
940 struct timespec tomono, sleep;
941 unsigned int seq;
942 s64 nsecs;
943
944 WARN_ON(timekeeping_suspended);
945
946 do {
947 seq = read_seqbegin(&xtime_lock);
948 *ts = xtime;
949 tomono = wall_to_monotonic;
950 sleep = total_sleep_time;
951 nsecs = timekeeping_get_ns();
952
953 } while (read_seqretry(&xtime_lock, seq));
954
955 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
956 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
957}
958EXPORT_SYMBOL_GPL(get_monotonic_boottime);
959
960/**
961 * ktime_get_boottime - Returns monotonic time since boot in a ktime
962 *
963 * Returns the monotonic time since boot in a ktime
964 *
 965 * This is similar to CLOCK_MONOTONIC/ktime_get, but also
966 * includes the time spent in suspend.
967 */
968ktime_t ktime_get_boottime(void)
969{
970 struct timespec ts;
971
972 get_monotonic_boottime(&ts);
973 return timespec_to_ktime(ts);
974}
975EXPORT_SYMBOL_GPL(ktime_get_boottime);
976
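A usage sketch (hypothetical driver code, assuming only the API added above): because the returned time includes suspend, intervals computed from it stay meaningful across a suspend/resume cycle, unlike plain CLOCK_MONOTONIC.

	static ktime_t example_last_event;

	static void example_record_event(void)
	{
		example_last_event = ktime_get_boottime();
	}

	static s64 example_ns_since_event(void)
	{
		/* includes any time the system spent suspended in between */
		return ktime_to_ns(ktime_sub(ktime_get_boottime(),
					     example_last_event));
	}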
892/** 977/**
893 * monotonic_to_bootbased - Convert the monotonic time to boot based. 978 * monotonic_to_bootbased - Convert the monotonic time to boot based.
894 * @ts: pointer to the timespec to be converted 979 * @ts: pointer to the timespec to be converted
@@ -910,11 +995,6 @@ struct timespec __current_kernel_time(void)
910 return xtime; 995 return xtime;
911} 996}
912 997
913struct timespec __get_wall_to_monotonic(void)
914{
915 return wall_to_monotonic;
916}
917
918struct timespec current_kernel_time(void) 998struct timespec current_kernel_time(void)
919{ 999{
920 struct timespec now; 1000 struct timespec now;
@@ -946,3 +1026,48 @@ struct timespec get_monotonic_coarse(void)
946 now.tv_nsec + mono.tv_nsec); 1026 now.tv_nsec + mono.tv_nsec);
947 return now; 1027 return now;
948} 1028}
1029
1030/*
1031 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1032 * without sampling the sequence number in xtime_lock.
1033 * jiffies is defined in the linker script...
1034 */
1035void do_timer(unsigned long ticks)
1036{
1037 jiffies_64 += ticks;
1038 update_wall_time();
1039 calc_global_load(ticks);
1040}
1041
1042/**
1043 * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
1044 * and sleep offsets.
1045 * @xtim: pointer to timespec to be set with xtime
1046 * @wtom: pointer to timespec to be set with wall_to_monotonic
1047 * @sleep: pointer to timespec to be set with time in suspend
1048 */
1049void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1050 struct timespec *wtom, struct timespec *sleep)
1051{
1052 unsigned long seq;
1053
1054 do {
1055 seq = read_seqbegin(&xtime_lock);
1056 *xtim = xtime;
1057 *wtom = wall_to_monotonic;
1058 *sleep = total_sleep_time;
1059 } while (read_seqretry(&xtime_lock, seq));
1060}
1061
1062/**
1063 * xtime_update() - advances the timekeeping infrastructure
 1064 * @ticks: number of ticks that have elapsed since the last call.
1065 *
1066 * Must be called with interrupts disabled.
1067 */
1068void xtime_update(unsigned long ticks)
1069{
1070 write_seqlock(&xtime_lock);
1071 do_timer(ticks);
1072 write_sequnlock(&xtime_lock);
1073}
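With do_timer() and xtime_lock now owned by the timekeeping code, an architecture's periodic tick handler no longer takes the seqlock itself. A sketch of the intended call pattern (hypothetical arch code; details vary by platform):

	static irqreturn_t example_timer_interrupt(int irq, void *dev_id)
	{
		/* xtime_update() takes and releases xtime_lock internally */
		xtime_update(1);
		update_process_times(user_mode(get_irq_regs()));
		return IRQ_HANDLED;
	}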
diff --git a/kernel/timer.c b/kernel/timer.c
index d6459923d24..fd6198692b5 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {}
404 404
405static struct debug_obj_descr timer_debug_descr; 405static struct debug_obj_descr timer_debug_descr;
406 406
407static void *timer_debug_hint(void *addr)
408{
409 return ((struct timer_list *) addr)->function;
410}
411
407/* 412/*
408 * fixup_init is called when: 413 * fixup_init is called when:
409 * - an active object is initialized 414 * - an active object is initialized
@@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
477 482
478static struct debug_obj_descr timer_debug_descr = { 483static struct debug_obj_descr timer_debug_descr = {
479 .name = "timer_list", 484 .name = "timer_list",
485 .debug_hint = timer_debug_hint,
480 .fixup_init = timer_fixup_init, 486 .fixup_init = timer_fixup_init,
481 .fixup_activate = timer_fixup_activate, 487 .fixup_activate = timer_fixup_activate,
482 .fixup_free = timer_fixup_free, 488 .fixup_free = timer_fixup_free,
@@ -964,6 +970,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
964 * add_timer_on(). Upon exit the timer is not queued and the handler is 970 * add_timer_on(). Upon exit the timer is not queued and the handler is
965 * not running on any CPU. 971 * not running on any CPU.
966 * 972 *
 973 * Note: You must not hold locks that are held in interrupt context
 974 * while calling this function, even if the lock has nothing to do
 975 * with the timer in question. Here's why:
976 *
 977 * CPU0                            CPU1
 978 * ----                            ----
 979 *                                 <SOFTIRQ>
 980 *                                 call_timer_fn();
 981 *                                   base->running_timer = mytimer;
 982 * spin_lock_irq(somelock);
 983 *                                 <IRQ>
 984 *                                   spin_lock(somelock);
 985 * del_timer_sync(mytimer);
 986 *   while (base->running_timer == mytimer);
987 *
988 * Now del_timer_sync() will never return and never release somelock.
989 * The interrupt on the other CPU is waiting to grab somelock but
990 * it has interrupted the softirq that CPU0 is waiting to finish.
991 *
967 * The function returns whether it has deactivated a pending timer or not. 992 * The function returns whether it has deactivated a pending timer or not.
968 */ 993 */
969int del_timer_sync(struct timer_list *timer) 994int del_timer_sync(struct timer_list *timer)
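A sketch of the forbidden pattern described above (hypothetical driver code, shown only to illustrate the deadlock):

	static DEFINE_SPINLOCK(somelock);	/* also taken from an IRQ handler */
	static struct timer_list mytimer;	/* its callback takes somelock too */

	static void example_bad_teardown(void)
	{
		spin_lock_irq(&somelock);
		/*
		 * If an interrupt on another CPU is spinning on somelock
		 * after preempting the softirq that is running mytimer's
		 * callback, del_timer_sync() below waits forever.
		 */
		del_timer_sync(&mytimer);
		spin_unlock_irq(&somelock);
	}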
@@ -971,6 +996,10 @@ int del_timer_sync(struct timer_list *timer)
971#ifdef CONFIG_LOCKDEP 996#ifdef CONFIG_LOCKDEP
972 unsigned long flags; 997 unsigned long flags;
973 998
999 /*
1000 * If lockdep gives a backtrace here, please reference
1001 * the synchronization rules above.
1002 */
974 local_irq_save(flags); 1003 local_irq_save(flags);
975 lock_map_acquire(&timer->lockdep_map); 1004 lock_map_acquire(&timer->lockdep_map);
976 lock_map_release(&timer->lockdep_map); 1005 lock_map_release(&timer->lockdep_map);
@@ -1295,19 +1324,6 @@ void run_local_timers(void)
1295 raise_softirq(TIMER_SOFTIRQ); 1324 raise_softirq(TIMER_SOFTIRQ);
1296} 1325}
1297 1326
1298/*
1299 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1300 * without sampling the sequence number in xtime_lock.
1301 * jiffies is defined in the linker script...
1302 */
1303
1304void do_timer(unsigned long ticks)
1305{
1306 jiffies_64 += ticks;
1307 update_wall_time();
1308 calc_global_load(ticks);
1309}
1310
1311#ifdef __ARCH_WANT_SYS_ALARM 1327#ifdef __ARCH_WANT_SYS_ALARM
1312 1328
1313/* 1329/*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f3dadae8388..888b611897d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3328,7 +3328,7 @@ static int start_graph_tracing(void)
3328 /* The cpu_boot init_task->ret_stack will never be freed */ 3328 /* The cpu_boot init_task->ret_stack will never be freed */
3329 for_each_online_cpu(cpu) { 3329 for_each_online_cpu(cpu) {
3330 if (!idle_task(cpu)->ret_stack) 3330 if (!idle_task(cpu)->ret_stack)
3331 ftrace_graph_init_task(idle_task(cpu)); 3331 ftrace_graph_init_idle_task(idle_task(cpu), cpu);
3332 } 3332 }
3333 3333
3334 do { 3334 do {
@@ -3418,6 +3418,49 @@ void unregister_ftrace_graph(void)
3418 mutex_unlock(&ftrace_lock); 3418 mutex_unlock(&ftrace_lock);
3419} 3419}
3420 3420
3421static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
3422
3423static void
3424graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
3425{
3426 atomic_set(&t->tracing_graph_pause, 0);
3427 atomic_set(&t->trace_overrun, 0);
3428 t->ftrace_timestamp = 0;
 3429 /* make curr_ret_stack visible before we add the ret_stack */
3430 smp_wmb();
3431 t->ret_stack = ret_stack;
3432}
3433
3434/*
3435 * Allocate a return stack for the idle task. May be the first
3436 * time through, or it may be done by CPU hotplug online.
3437 */
3438void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
3439{
3440 t->curr_ret_stack = -1;
3441 /*
 3442 * The idle task has no parent; it either has its own
3443 * stack or no stack at all.
3444 */
3445 if (t->ret_stack)
3446 WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
3447
3448 if (ftrace_graph_active) {
3449 struct ftrace_ret_stack *ret_stack;
3450
3451 ret_stack = per_cpu(idle_ret_stack, cpu);
3452 if (!ret_stack) {
3453 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
3454 * sizeof(struct ftrace_ret_stack),
3455 GFP_KERNEL);
3456 if (!ret_stack)
3457 return;
3458 per_cpu(idle_ret_stack, cpu) = ret_stack;
3459 }
3460 graph_init_task(t, ret_stack);
3461 }
3462}
3463
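A sketch of the expected caller (hypothetical; the actual wiring lives in the scheduler/hotplug code): when a CPU comes online, its idle task is handed the cached per-cpu ret_stack before graph tracing resumes.

	static void example_cpu_online(int cpu)
	{
		/* reuses per_cpu(idle_ret_stack, cpu) if already allocated */
		ftrace_graph_init_idle_task(idle_task(cpu), cpu);
	}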
3421/* Allocate a return stack for newly created task */ 3464/* Allocate a return stack for newly created task */
3422void ftrace_graph_init_task(struct task_struct *t) 3465void ftrace_graph_init_task(struct task_struct *t)
3423{ 3466{
@@ -3433,12 +3476,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3433 GFP_KERNEL); 3476 GFP_KERNEL);
3434 if (!ret_stack) 3477 if (!ret_stack)
3435 return; 3478 return;
3436 atomic_set(&t->tracing_graph_pause, 0); 3479 graph_init_task(t, ret_stack);
3437 atomic_set(&t->trace_overrun, 0);
3438 t->ftrace_timestamp = 0;
 3439 /* make curr_ret_stack visible before we add the ret_stack */
3440 smp_wmb();
3441 t->ret_stack = ret_stack;
3442 } 3480 }
3443} 3481}
3444 3482
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bd1c35a4fbc..db7b439d23e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5,7 +5,6 @@
5 */ 5 */
6#include <linux/ring_buffer.h> 6#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h> 7#include <linux/trace_clock.h>
8#include <linux/ftrace_irq.h>
9#include <linux/spinlock.h> 8#include <linux/spinlock.h>
10#include <linux/debugfs.h> 9#include <linux/debugfs.h>
11#include <linux/uaccess.h> 10#include <linux/uaccess.h>
@@ -1429,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1429} 1428}
1430EXPORT_SYMBOL_GPL(ring_buffer_resize); 1429EXPORT_SYMBOL_GPL(ring_buffer_resize);
1431 1430
1431void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
1432{
1433 mutex_lock(&buffer->mutex);
1434 if (val)
1435 buffer->flags |= RB_FL_OVERWRITE;
1436 else
1437 buffer->flags &= ~RB_FL_OVERWRITE;
1438 mutex_unlock(&buffer->mutex);
1439}
1440EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
1441
1432static inline void * 1442static inline void *
1433__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) 1443__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
1434{ 1444{
@@ -2162,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2162 if (likely(ts >= cpu_buffer->write_stamp)) { 2172 if (likely(ts >= cpu_buffer->write_stamp)) {
2163 delta = diff; 2173 delta = diff;
2164 if (unlikely(test_time_stamp(delta))) { 2174 if (unlikely(test_time_stamp(delta))) {
2175 int local_clock_stable = 1;
2176#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2177 local_clock_stable = sched_clock_stable;
2178#endif
2165 WARN_ONCE(delta > (1ULL << 59), 2179 WARN_ONCE(delta > (1ULL << 59),
2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", 2180 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
2167 (unsigned long long)delta, 2181 (unsigned long long)delta,
2168 (unsigned long long)ts, 2182 (unsigned long long)ts,
2169 (unsigned long long)cpu_buffer->write_stamp); 2183 (unsigned long long)cpu_buffer->write_stamp,
2184 local_clock_stable ? "" :
2185 "If you just came from a suspend/resume,\n"
2186 "please switch to the trace global clock:\n"
2187 " echo global > /sys/kernel/debug/tracing/trace_clock\n");
2170 add_timestamp = 1; 2188 add_timestamp = 1;
2171 } 2189 }
2172 } 2190 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dc53ecb8058..9541c27c1cf 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,8 +41,6 @@
41#include "trace.h" 41#include "trace.h"
42#include "trace_output.h" 42#include "trace_output.h"
43 43
44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
45
46/* 44/*
47 * On boot up, the ring buffer is set to the minimum size, so that 45 * On boot up, the ring buffer is set to the minimum size, so that
48 * we do not waste memory on systems that are not using tracing. 46 * we do not waste memory on systems that are not using tracing.
@@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
340/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
341unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
342 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
343 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
344 342
345static int trace_stop_count; 343static int trace_stop_count;
346static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
@@ -425,6 +423,7 @@ static const char *trace_options[] = {
425 "sleep-time", 423 "sleep-time",
426 "graph-time", 424 "graph-time",
427 "record-cmd", 425 "record-cmd",
426 "overwrite",
428 NULL 427 NULL
429}; 428};
430 429
@@ -780,6 +779,11 @@ __acquires(kernel_lock)
780 tracing_reset_online_cpus(tr); 779 tracing_reset_online_cpus(tr);
781 780
782 current_trace = type; 781 current_trace = type;
782
783 /* If we expanded the buffers, make sure the max is expanded too */
784 if (ring_buffer_expanded && type->use_max_tr)
785 ring_buffer_resize(max_tr.buffer, trace_buf_size);
786
783 /* the test is responsible for initializing and enabling */ 787 /* the test is responsible for initializing and enabling */
784 pr_info("Testing tracer %s: ", type->name); 788 pr_info("Testing tracer %s: ", type->name);
785 ret = type->selftest(type, tr); 789 ret = type->selftest(type, tr);
@@ -792,6 +796,10 @@ __acquires(kernel_lock)
792 /* Only reset on passing, to avoid touching corrupted buffers */ 796 /* Only reset on passing, to avoid touching corrupted buffers */
793 tracing_reset_online_cpus(tr); 797 tracing_reset_online_cpus(tr);
794 798
799 /* Shrink the max buffer again */
800 if (ring_buffer_expanded && type->use_max_tr)
801 ring_buffer_resize(max_tr.buffer, 1);
802
795 printk(KERN_CONT "PASSED\n"); 803 printk(KERN_CONT "PASSED\n");
796 } 804 }
797#endif 805#endif
@@ -1102,7 +1110,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1102 1110
1103 entry->preempt_count = pc & 0xff; 1111 entry->preempt_count = pc & 0xff;
1104 entry->pid = (tsk) ? tsk->pid : 0; 1112 entry->pid = (tsk) ? tsk->pid : 0;
1105 entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
1106 entry->flags = 1113 entry->flags =
1107#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1114#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1108 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1115 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1749,10 +1756,9 @@ static void print_lat_help_header(struct seq_file *m)
1749 seq_puts(m, "# | / _----=> need-resched \n"); 1756 seq_puts(m, "# | / _----=> need-resched \n");
1750 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1757 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1751 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1758 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1752 seq_puts(m, "# |||| /_--=> lock-depth \n"); 1759 seq_puts(m, "# |||| / delay \n");
1753 seq_puts(m, "# |||||/ delay \n"); 1760 seq_puts(m, "# cmd pid ||||| time | caller \n");
1754 seq_puts(m, "# cmd pid |||||| time | caller \n"); 1761 seq_puts(m, "# \\ / ||||| \\ | / \n");
1755 seq_puts(m, "# \\ / |||||| \\ | / \n");
1756} 1762}
1757 1763
1758static void print_func_help_header(struct seq_file *m) 1764static void print_func_help_header(struct seq_file *m)
@@ -2529,6 +2535,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2529 2535
2530 if (mask == TRACE_ITER_RECORD_CMD) 2536 if (mask == TRACE_ITER_RECORD_CMD)
2531 trace_event_enable_cmd_record(enabled); 2537 trace_event_enable_cmd_record(enabled);
2538
2539 if (mask == TRACE_ITER_OVERWRITE)
2540 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2532} 2541}
2533 2542
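With this hook, the new "overwrite" option can be flipped at runtime through the existing options interface (for example, echo 0 > /sys/kernel/debug/tracing/options/overwrite), switching the ring buffer between overwriting the oldest events and dropping new events once the buffer is full.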
2534static ssize_t 2543static ssize_t
@@ -2710,6 +2719,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2710 2719
2711 mutex_lock(&trace_types_lock); 2720 mutex_lock(&trace_types_lock);
2712 if (tracer_enabled ^ val) { 2721 if (tracer_enabled ^ val) {
2722
2723 /* Only need to warn if this is used to change the state */
2724 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2725
2713 if (val) { 2726 if (val) {
2714 tracer_enabled = 1; 2727 tracer_enabled = 1;
2715 if (current_trace->start) 2728 if (current_trace->start)
@@ -4551,9 +4564,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4551__init static int tracer_alloc_buffers(void) 4564__init static int tracer_alloc_buffers(void)
4552{ 4565{
4553 int ring_buf_size; 4566 int ring_buf_size;
4567 enum ring_buffer_flags rb_flags;
4554 int i; 4568 int i;
4555 int ret = -ENOMEM; 4569 int ret = -ENOMEM;
4556 4570
4571
4557 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 4572 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
4558 goto out; 4573 goto out;
4559 4574
@@ -4566,12 +4581,13 @@ __init static int tracer_alloc_buffers(void)
4566 else 4581 else
4567 ring_buf_size = 1; 4582 ring_buf_size = 1;
4568 4583
4584 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
4585
4569 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4586 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
4570 cpumask_copy(tracing_cpumask, cpu_all_mask); 4587 cpumask_copy(tracing_cpumask, cpu_all_mask);
4571 4588
4572 /* TODO: make the number of buffers hot pluggable with CPUS */ 4589 /* TODO: make the number of buffers hot pluggable with CPUS */
4573 global_trace.buffer = ring_buffer_alloc(ring_buf_size, 4590 global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
4574 TRACE_BUFFER_FLAGS);
4575 if (!global_trace.buffer) { 4591 if (!global_trace.buffer) {
4576 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 4592 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
4577 WARN_ON(1); 4593 WARN_ON(1);
@@ -4581,7 +4597,7 @@ __init static int tracer_alloc_buffers(void)
4581 4597
4582 4598
4583#ifdef CONFIG_TRACER_MAX_TRACE 4599#ifdef CONFIG_TRACER_MAX_TRACE
4584 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); 4600 max_tr.buffer = ring_buffer_alloc(1, rb_flags);
4585 if (!max_tr.buffer) { 4601 if (!max_tr.buffer) {
4586 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4602 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4587 WARN_ON(1); 4603 WARN_ON(1);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9021f8c0c0c..5e9dfc6286d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,8 +272,8 @@ struct tracer {
272 /* If you handled the flag setting, return 0 */ 272 /* If you handled the flag setting, return 0 */
273 int (*set_flag)(u32 old_flags, u32 bit, int set); 273 int (*set_flag)(u32 old_flags, u32 bit, int set);
274 struct tracer *next; 274 struct tracer *next;
275 int print_max;
276 struct tracer_flags *flags; 275 struct tracer_flags *flags;
276 int print_max;
277 int use_max_tr; 277 int use_max_tr;
278}; 278};
279 279
@@ -606,6 +606,7 @@ enum trace_iterator_flags {
606 TRACE_ITER_SLEEP_TIME = 0x40000, 606 TRACE_ITER_SLEEP_TIME = 0x40000,
607 TRACE_ITER_GRAPH_TIME = 0x80000, 607 TRACE_ITER_GRAPH_TIME = 0x80000,
608 TRACE_ITER_RECORD_CMD = 0x100000, 608 TRACE_ITER_RECORD_CMD = 0x100000,
609 TRACE_ITER_OVERWRITE = 0x200000,
609}; 610};
610 611
611/* 612/*
@@ -661,8 +662,10 @@ struct ftrace_event_field {
661}; 662};
662 663
663struct event_filter { 664struct event_filter {
664 int n_preds; 665 int n_preds; /* Number assigned */
665 struct filter_pred **preds; 666 int a_preds; /* allocated */
667 struct filter_pred *preds;
668 struct filter_pred *root;
666 char *filter_string; 669 char *filter_string;
667}; 670};
668 671
@@ -674,11 +677,23 @@ struct event_subsystem {
674 int nr_events; 677 int nr_events;
675}; 678};
676 679
680#define FILTER_PRED_INVALID ((unsigned short)-1)
681#define FILTER_PRED_IS_RIGHT (1 << 15)
682#define FILTER_PRED_FOLD (1 << 15)
683
684/*
 685 * The maximum number of preds is limited by the size of an
 686 * unsigned short, with the two MSBs used as flags. One bit is
 687 * shared by the IS_RIGHT and FOLD flags; the other is reserved.
688 *
689 * 2^14 preds is way more than enough.
690 */
691#define MAX_FILTER_PRED 16384
692
677struct filter_pred; 693struct filter_pred;
678struct regex; 694struct regex;
679 695
680typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 696typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
681 int val1, int val2);
682 697
683typedef int (*regex_match_func)(char *str, struct regex *r, int len); 698typedef int (*regex_match_func)(char *str, struct regex *r, int len);
684 699
@@ -700,11 +715,23 @@ struct filter_pred {
700 filter_pred_fn_t fn; 715 filter_pred_fn_t fn;
701 u64 val; 716 u64 val;
702 struct regex regex; 717 struct regex regex;
703 char *field_name; 718 /*
 719 * Leaf nodes use field_name; ops is used by AND and OR
720 * nodes. The field_name is always freed when freeing a pred.
721 * We can overload field_name for ops and have it freed
722 * as well.
723 */
724 union {
725 char *field_name;
726 unsigned short *ops;
727 };
704 int offset; 728 int offset;
705 int not; 729 int not;
706 int op; 730 int op;
707 int pop_n; 731 unsigned short index;
732 unsigned short parent;
733 unsigned short left;
734 unsigned short right;
708}; 735};
709 736
710extern struct list_head ftrace_common_fields; 737extern struct list_head ftrace_common_fields;
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 6cf223764be..1516cb3ec54 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
109 */ 109 */
110#define FTRACE_CTX_FIELDS \ 110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \ 111 __field( unsigned int, prev_pid ) \
112 __field( unsigned int, next_pid ) \
113 __field( unsigned int, next_cpu ) \
112 __field( unsigned char, prev_prio ) \ 114 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \ 115 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \ 116 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \ 117 __field( unsigned char, next_state )
117 __field( unsigned int, next_cpu )
118 118
119FTRACE_ENTRY(context_switch, ctx_switch_entry, 119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120 120
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5f499e0438a..e88f74fe1d4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void)
116 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
117 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
118 __common_field(int, pid); 118 __common_field(int, pid);
119 __common_field(int, lock_depth);
120 119
121 return ret; 120 return ret;
122} 121}
@@ -326,6 +325,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
326{ 325{
327 return __ftrace_set_clr_event(NULL, system, event, set); 326 return __ftrace_set_clr_event(NULL, system, event, set);
328} 327}
328EXPORT_SYMBOL_GPL(trace_set_clr_event);
329 329
330/* 128 should be much more than enough */ 330/* 128 should be much more than enough */
331#define EVENT_BUF_SIZE 127 331#define EVENT_BUF_SIZE 127
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 36d40104b17..3249b4f77ef 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -123,9 +123,13 @@ struct filter_parse_state {
123 } operand; 123 } operand;
124}; 124};
125 125
126struct pred_stack {
127 struct filter_pred **preds;
128 int index;
129};
130
126#define DEFINE_COMPARISON_PRED(type) \ 131#define DEFINE_COMPARISON_PRED(type) \
127static int filter_pred_##type(struct filter_pred *pred, void *event, \ 132static int filter_pred_##type(struct filter_pred *pred, void *event) \
128 int val1, int val2) \
129{ \ 133{ \
130 type *addr = (type *)(event + pred->offset); \ 134 type *addr = (type *)(event + pred->offset); \
131 type val = (type)pred->val; \ 135 type val = (type)pred->val; \
@@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \
152} 156}
153 157
154#define DEFINE_EQUALITY_PRED(size) \ 158#define DEFINE_EQUALITY_PRED(size) \
155static int filter_pred_##size(struct filter_pred *pred, void *event, \ 159static int filter_pred_##size(struct filter_pred *pred, void *event) \
156 int val1, int val2) \
157{ \ 160{ \
158 u##size *addr = (u##size *)(event + pred->offset); \ 161 u##size *addr = (u##size *)(event + pred->offset); \
159 u##size val = (u##size)pred->val; \ 162 u##size val = (u##size)pred->val; \
@@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32);
178DEFINE_EQUALITY_PRED(16); 181DEFINE_EQUALITY_PRED(16);
179DEFINE_EQUALITY_PRED(8); 182DEFINE_EQUALITY_PRED(8);
180 183
181static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
182 void *event __attribute((unused)),
183 int val1, int val2)
184{
185 return val1 && val2;
186}
187
188static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
189 void *event __attribute((unused)),
190 int val1, int val2)
191{
192 return val1 || val2;
193}
194
195/* Filter predicate for fixed sized arrays of characters */ 184/* Filter predicate for fixed sized arrays of characters */
196static int filter_pred_string(struct filter_pred *pred, void *event, 185static int filter_pred_string(struct filter_pred *pred, void *event)
197 int val1, int val2)
198{ 186{
199 char *addr = (char *)(event + pred->offset); 187 char *addr = (char *)(event + pred->offset);
200 int cmp, match; 188 int cmp, match;
@@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
207} 195}
208 196
209/* Filter predicate for char * pointers */ 197/* Filter predicate for char * pointers */
210static int filter_pred_pchar(struct filter_pred *pred, void *event, 198static int filter_pred_pchar(struct filter_pred *pred, void *event)
211 int val1, int val2)
212{ 199{
213 char **addr = (char **)(event + pred->offset); 200 char **addr = (char **)(event + pred->offset);
214 int cmp, match; 201 int cmp, match;
@@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
231 * and add it to the address of the entry, and at last we have 218 * and add it to the address of the entry, and at last we have
232 * the address of the string. 219 * the address of the string.
233 */ 220 */
234static int filter_pred_strloc(struct filter_pred *pred, void *event, 221static int filter_pred_strloc(struct filter_pred *pred, void *event)
235 int val1, int val2)
236{ 222{
237 u32 str_item = *(u32 *)(event + pred->offset); 223 u32 str_item = *(u32 *)(event + pred->offset);
238 int str_loc = str_item & 0xffff; 224 int str_loc = str_item & 0xffff;
@@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
247 return match; 233 return match;
248} 234}
249 235
250static int filter_pred_none(struct filter_pred *pred, void *event, 236static int filter_pred_none(struct filter_pred *pred, void *event)
251 int val1, int val2)
252{ 237{
253 return 0; 238 return 0;
254} 239}
@@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred)
377 pred->not ^= not; 362 pred->not ^= not;
378} 363}
379 364
365enum move_type {
366 MOVE_DOWN,
367 MOVE_UP_FROM_LEFT,
368 MOVE_UP_FROM_RIGHT
369};
370
371static struct filter_pred *
372get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
373 int index, enum move_type *move)
374{
375 if (pred->parent & FILTER_PRED_IS_RIGHT)
376 *move = MOVE_UP_FROM_RIGHT;
377 else
378 *move = MOVE_UP_FROM_LEFT;
379 pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
380
381 return pred;
382}
383
384/*
 385 * A series of ANDs or ORs was found together. Instead of
 386 * climbing up and down the tree branches, an array of the
 387 * ops was made in the order they are checked. We can just move
 388 * across the array and short circuit if needed.
389 */
390static int process_ops(struct filter_pred *preds,
391 struct filter_pred *op, void *rec)
392{
393 struct filter_pred *pred;
394 int type;
395 int match;
396 int i;
397
398 /*
399 * Micro-optimization: We set type to true if op
400 * is an OR and false otherwise (AND). Then we
401 * just need to test if the match is equal to
402 * the type, and if it is, we can short circuit the
403 * rest of the checks:
404 *
405 * if ((match && op->op == OP_OR) ||
406 * (!match && op->op == OP_AND))
407 * return match;
408 */
409 type = op->op == OP_OR;
410
411 for (i = 0; i < op->val; i++) {
412 pred = &preds[op->ops[i]];
413 match = pred->fn(pred, rec);
414 if (!!match == type)
415 return match;
416 }
417 return match;
418}
419
380/* return 1 if event matches, 0 otherwise (discard) */ 420/* return 1 if event matches, 0 otherwise (discard) */
381int filter_match_preds(struct event_filter *filter, void *rec) 421int filter_match_preds(struct event_filter *filter, void *rec)
382{ 422{
383 int match, top = 0, val1 = 0, val2 = 0; 423 int match = -1;
384 int stack[MAX_FILTER_PRED]; 424 enum move_type move = MOVE_DOWN;
425 struct filter_pred *preds;
385 struct filter_pred *pred; 426 struct filter_pred *pred;
386 int i; 427 struct filter_pred *root;
428 int n_preds;
429 int done = 0;
430
431 /* no filter is considered a match */
432 if (!filter)
433 return 1;
434
435 n_preds = filter->n_preds;
436
437 if (!n_preds)
438 return 1;
439
440 /*
 441 * n_preds, root and filter->preds are protected by disabling preemption.
442 */
443 preds = rcu_dereference_sched(filter->preds);
444 root = rcu_dereference_sched(filter->root);
445 if (!root)
446 return 1;
447
448 pred = root;
387 449
388 for (i = 0; i < filter->n_preds; i++) { 450 /* match is currently meaningless */
389 pred = filter->preds[i]; 451 match = -1;
390 if (!pred->pop_n) { 452
391 match = pred->fn(pred, rec, val1, val2); 453 do {
392 stack[top++] = match; 454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
 460 /* keep going down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
393 continue; 500 continue;
394 } 501 }
395 if (pred->pop_n > top) { 502 done = 1;
396 WARN_ON_ONCE(1); 503 } while (!done);
397 return 0;
398 }
399 val1 = stack[--top];
400 val2 = stack[--top];
401 match = pred->fn(pred, rec, val1, val2);
402 stack[top++] = match;
403 }
404 504
405 return stack[--top]; 505 return match;
406} 506}
407EXPORT_SYMBOL_GPL(filter_match_preds); 507EXPORT_SYMBOL_GPL(filter_match_preds);
408 508
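An illustrative layout (hypothetical indices) for the filter "a == 1 || b == 2" as stored after this change: three preds in a flat array, linked by index instead of by pointer.

	preds[0] = { a == 1 leaf, left = FILTER_PRED_INVALID,
	             parent = 2 }
	preds[1] = { b == 2 leaf, left = FILTER_PRED_INVALID,
	             parent = 2 | FILTER_PRED_IS_RIGHT }
	preds[2] = { OP_OR, left = 0, right = 1 }   /* filter->root */

filter_match_preds() starts at the root, walks down the left side to preds[0], and uses the parent links (with the IS_RIGHT bit) to climb back up, so no recursion or explicit stack is needed at match time.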
@@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos)
414 514
415static void remove_filter_string(struct event_filter *filter) 515static void remove_filter_string(struct event_filter *filter)
416{ 516{
517 if (!filter)
518 return;
519
417 kfree(filter->filter_string); 520 kfree(filter->filter_string);
418 filter->filter_string = NULL; 521 filter->filter_string = NULL;
419} 522}
@@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps,
473 576
474void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) 577void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
475{ 578{
476 struct event_filter *filter = call->filter; 579 struct event_filter *filter;
477 580
478 mutex_lock(&event_mutex); 581 mutex_lock(&event_mutex);
582 filter = call->filter;
479 if (filter && filter->filter_string) 583 if (filter && filter->filter_string)
480 trace_seq_printf(s, "%s\n", filter->filter_string); 584 trace_seq_printf(s, "%s\n", filter->filter_string);
481 else 585 else
@@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
486void print_subsystem_event_filter(struct event_subsystem *system, 590void print_subsystem_event_filter(struct event_subsystem *system,
487 struct trace_seq *s) 591 struct trace_seq *s)
488{ 592{
489 struct event_filter *filter = system->filter; 593 struct event_filter *filter;
490 594
491 mutex_lock(&event_mutex); 595 mutex_lock(&event_mutex);
596 filter = system->filter;
492 if (filter && filter->filter_string) 597 if (filter && filter->filter_string)
493 trace_seq_printf(s, "%s\n", filter->filter_string); 598 trace_seq_printf(s, "%s\n", filter->filter_string);
494 else 599 else
@@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred)
539 pred->regex.len = 0; 644 pred->regex.len = 0;
540} 645}
541 646
542static int filter_set_pred(struct filter_pred *dest, 647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
648{
649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
650 if (!stack->preds)
651 return -ENOMEM;
652 stack->index = n_preds;
653 return 0;
654}
655
656static void __free_pred_stack(struct pred_stack *stack)
657{
658 kfree(stack->preds);
659 stack->index = 0;
660}
661
662static int __push_pred_stack(struct pred_stack *stack,
663 struct filter_pred *pred)
664{
665 int index = stack->index;
666
667 if (WARN_ON(index == 0))
668 return -ENOSPC;
669
670 stack->preds[--index] = pred;
671 stack->index = index;
672 return 0;
673}
674
675static struct filter_pred *
676__pop_pred_stack(struct pred_stack *stack)
677{
678 struct filter_pred *pred;
679 int index = stack->index;
680
681 pred = stack->preds[index++];
682 if (!pred)
683 return NULL;
684
685 stack->index = index;
686 return pred;
687}
688
689static int filter_set_pred(struct event_filter *filter,
690 int idx,
691 struct pred_stack *stack,
543 struct filter_pred *src, 692 struct filter_pred *src,
544 filter_pred_fn_t fn) 693 filter_pred_fn_t fn)
545{ 694{
695 struct filter_pred *dest = &filter->preds[idx];
696 struct filter_pred *left;
697 struct filter_pred *right;
698
546 *dest = *src; 699 *dest = *src;
547 if (src->field_name) { 700 if (src->field_name) {
548 dest->field_name = kstrdup(src->field_name, GFP_KERNEL); 701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
@@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest,
550 return -ENOMEM; 703 return -ENOMEM;
551 } 704 }
552 dest->fn = fn; 705 dest->fn = fn;
706 dest->index = idx;
553 707
554 return 0; 708 if (dest->op == OP_OR || dest->op == OP_AND) {
709 right = __pop_pred_stack(stack);
710 left = __pop_pred_stack(stack);
711 if (!left || !right)
712 return -EINVAL;
713 /*
714 * If both children can be folded
715 * and they are the same op as this op or a leaf,
716 * then this op can be folded.
717 */
718 if (left->index & FILTER_PRED_FOLD &&
719 (left->op == dest->op ||
720 left->left == FILTER_PRED_INVALID) &&
721 right->index & FILTER_PRED_FOLD &&
722 (right->op == dest->op ||
723 right->left == FILTER_PRED_INVALID))
724 dest->index |= FILTER_PRED_FOLD;
725
726 dest->left = left->index & ~FILTER_PRED_FOLD;
727 dest->right = right->index & ~FILTER_PRED_FOLD;
728 left->parent = dest->index & ~FILTER_PRED_FOLD;
729 right->parent = dest->index | FILTER_PRED_IS_RIGHT;
730 } else {
731 /*
732 * Make dest->left invalid to be used as a quick
733 * way to know this is a leaf node.
734 */
735 dest->left = FILTER_PRED_INVALID;
736
 737 /* All leaves allow folding the parent ops. */
738 dest->index |= FILTER_PRED_FOLD;
739 }
740
741 return __push_pred_stack(stack, dest);
555} 742}
556 743
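A worked trace (hypothetical) of how filter_set_pred() assembles the tree for "a == 1 && b == 2" from its postfix order [a==1, b==2, AND]:

	a==1 : leaf, pushed			stack: a==1
	b==2 : leaf, pushed			stack: a==1 b==2
	AND  : pops right (b==2), pops left (a==1),
	       links both children, pushed	stack: AND

The single entry left on the stack becomes filter->root in replace_preds(); anything else still on the stack at that point is an error.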
557static void filter_disable_preds(struct ftrace_event_call *call) 744static void __free_preds(struct event_filter *filter)
558{ 745{
559 struct event_filter *filter = call->filter;
560 int i; 746 int i;
561 747
562 call->flags &= ~TRACE_EVENT_FL_FILTERED; 748 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
751 kfree(filter->preds);
752 filter->preds = NULL;
753 }
754 filter->a_preds = 0;
563 filter->n_preds = 0; 755 filter->n_preds = 0;
564
565 for (i = 0; i < MAX_FILTER_PRED; i++)
566 filter->preds[i]->fn = filter_pred_none;
567} 756}
568 757
569static void __free_preds(struct event_filter *filter) 758static void filter_disable(struct ftrace_event_call *call)
570{ 759{
571 int i; 760 call->flags &= ~TRACE_EVENT_FL_FILTERED;
761}
572 762
763static void __free_filter(struct event_filter *filter)
764{
573 if (!filter) 765 if (!filter)
574 return; 766 return;
575 767
576 for (i = 0; i < MAX_FILTER_PRED; i++) { 768 __free_preds(filter);
577 if (filter->preds[i])
578 filter_free_pred(filter->preds[i]);
579 }
580 kfree(filter->preds);
581 kfree(filter->filter_string); 769 kfree(filter->filter_string);
582 kfree(filter); 770 kfree(filter);
583} 771}
584 772
773/*
774 * Called when destroying the ftrace_event_call.
775 * The call is being freed, so we do not need to worry about
776 * the call being currently used. This is for module code removing
777 * the tracepoints from within it.
778 */
585void destroy_preds(struct ftrace_event_call *call) 779void destroy_preds(struct ftrace_event_call *call)
586{ 780{
587 __free_preds(call->filter); 781 __free_filter(call->filter);
588 call->filter = NULL; 782 call->filter = NULL;
589 call->flags &= ~TRACE_EVENT_FL_FILTERED;
590} 783}
591 784
592static struct event_filter *__alloc_preds(void) 785static struct event_filter *__alloc_filter(void)
593{ 786{
594 struct event_filter *filter; 787 struct event_filter *filter;
788
789 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
790 return filter;
791}
792
793static int __alloc_preds(struct event_filter *filter, int n_preds)
794{
595 struct filter_pred *pred; 795 struct filter_pred *pred;
596 int i; 796 int i;
597 797
598 filter = kzalloc(sizeof(*filter), GFP_KERNEL); 798 if (filter->preds)
599 if (!filter) 799 __free_preds(filter);
600 return ERR_PTR(-ENOMEM);
601 800
602 filter->n_preds = 0; 801 filter->preds =
802 kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
603 803
604 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
605 if (!filter->preds) 804 if (!filter->preds)
606 goto oom; 805 return -ENOMEM;
607 806
608 for (i = 0; i < MAX_FILTER_PRED; i++) { 807 filter->a_preds = n_preds;
609 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 808 filter->n_preds = 0;
610 if (!pred) 809
611 goto oom; 810 for (i = 0; i < n_preds; i++) {
811 pred = &filter->preds[i];
612 pred->fn = filter_pred_none; 812 pred->fn = filter_pred_none;
613 filter->preds[i] = pred;
614 } 813 }
615 814
616 return filter;
617
618oom:
619 __free_preds(filter);
620 return ERR_PTR(-ENOMEM);
621}
622
623static int init_preds(struct ftrace_event_call *call)
624{
625 if (call->filter)
626 return 0;
627
628 call->flags &= ~TRACE_EVENT_FL_FILTERED;
629 call->filter = __alloc_preds();
630 if (IS_ERR(call->filter))
631 return PTR_ERR(call->filter);
632
633 return 0; 815 return 0;
634} 816}
635 817
636static int init_subsystem_preds(struct event_subsystem *system) 818static void filter_free_subsystem_preds(struct event_subsystem *system)
637{ 819{
638 struct ftrace_event_call *call; 820 struct ftrace_event_call *call;
639 int err;
640 821
641 list_for_each_entry(call, &ftrace_events, list) { 822 list_for_each_entry(call, &ftrace_events, list) {
642 if (strcmp(call->class->system, system->name) != 0) 823 if (strcmp(call->class->system, system->name) != 0)
643 continue; 824 continue;
644 825
645 err = init_preds(call); 826 filter_disable(call);
646 if (err) 827 remove_filter_string(call->filter);
647 return err;
648 } 828 }
649
650 return 0;
651} 829}
652 830
653static void filter_free_subsystem_preds(struct event_subsystem *system) 831static void filter_free_subsystem_filters(struct event_subsystem *system)
654{ 832{
655 struct ftrace_event_call *call; 833 struct ftrace_event_call *call;
656 834
657 list_for_each_entry(call, &ftrace_events, list) { 835 list_for_each_entry(call, &ftrace_events, list) {
658 if (strcmp(call->class->system, system->name) != 0) 836 if (strcmp(call->class->system, system->name) != 0)
659 continue; 837 continue;
660 838 __free_filter(call->filter);
661 filter_disable_preds(call); 839 call->filter = NULL;
662 remove_filter_string(call->filter);
663 } 840 }
664} 841}
665 842
@@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
667 struct ftrace_event_call *call, 844 struct ftrace_event_call *call,
668 struct event_filter *filter, 845 struct event_filter *filter,
669 struct filter_pred *pred, 846 struct filter_pred *pred,
847 struct pred_stack *stack,
670 filter_pred_fn_t fn) 848 filter_pred_fn_t fn)
671{ 849{
672 int idx, err; 850 int idx, err;
673 851
674 if (filter->n_preds == MAX_FILTER_PRED) { 852 if (WARN_ON(filter->n_preds == filter->a_preds)) {
675 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
676 return -ENOSPC; 854 return -ENOSPC;
677 } 855 }
678 856
679 idx = filter->n_preds; 857 idx = filter->n_preds;
680 filter_clear_pred(filter->preds[idx]); 858 filter_clear_pred(&filter->preds[idx]);
681 err = filter_set_pred(filter->preds[idx], pred, fn); 859 err = filter_set_pred(filter, idx, stack, pred, fn);
682 if (err) 860 if (err)
683 return err; 861 return err;
684 862
@@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
763 struct ftrace_event_call *call, 941 struct ftrace_event_call *call,
764 struct event_filter *filter, 942 struct event_filter *filter,
765 struct filter_pred *pred, 943 struct filter_pred *pred,
944 struct pred_stack *stack,
766 bool dry_run) 945 bool dry_run)
767{ 946{
768 struct ftrace_event_field *field; 947 struct ftrace_event_field *field;
@@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
770 unsigned long long val; 949 unsigned long long val;
771 int ret; 950 int ret;
772 951
773 pred->fn = filter_pred_none; 952 fn = pred->fn = filter_pred_none;
774 953
775 if (pred->op == OP_AND) { 954 if (pred->op == OP_AND)
776 pred->pop_n = 2;
777 fn = filter_pred_and;
778 goto add_pred_fn; 955 goto add_pred_fn;
779 } else if (pred->op == OP_OR) { 956 else if (pred->op == OP_OR)
780 pred->pop_n = 2;
781 fn = filter_pred_or;
782 goto add_pred_fn; 957 goto add_pred_fn;
783 }
784 958
785 field = find_event_field(call, pred->field_name); 959 field = find_event_field(call, pred->field_name);
786 if (!field) { 960 if (!field) {
@@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
829 1003
830add_pred_fn: 1004add_pred_fn:
831 if (!dry_run) 1005 if (!dry_run)
832 return filter_add_pred_fn(ps, call, filter, pred, fn); 1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
833 return 0; 1007 return 0;
834} 1008}
835 1009
@@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps)
1187 return 0; 1361 return 0;
1188} 1362}
1189 1363
1364static int count_preds(struct filter_parse_state *ps)
1365{
1366 struct postfix_elt *elt;
1367 int n_preds = 0;
1368
1369 list_for_each_entry(elt, &ps->postfix, list) {
1370 if (elt->op == OP_NONE)
1371 continue;
1372 n_preds++;
1373 }
1374
1375 return n_preds;
1376}
1377
1378/*
 1379 * The tree is walked when an event is filtered. If the tree is not
 1380 * correctly built, the walk could loop forever. Check here that the
 1381 * tree does indeed terminate.
1382 */
1383static int check_pred_tree(struct event_filter *filter,
1384 struct filter_pred *root)
1385{
1386 struct filter_pred *preds;
1387 struct filter_pred *pred;
1388 enum move_type move = MOVE_DOWN;
1389 int count = 0;
1390 int done = 0;
1391 int max;
1392
1393 /*
 1394 * The maximum number of times we can hit a node is three:
 1395 * once going down, once coming up from the left, and
 1396 * once coming up from the right. This is more than enough,
 1397 * since leaves are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1400
1401 preds = filter->preds;
1402 if (!preds)
1403 return -EINVAL;
1404 pred = root;
1405
1406 do {
1407 if (WARN_ON(count++ > max))
1408 return -EINVAL;
1409
1410 switch (move) {
1411 case MOVE_DOWN:
1412 if (pred->left != FILTER_PRED_INVALID) {
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1435
1436 /* We are fine. */
1437 return 0;
1438}
1439
1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1441{
1442 struct filter_pred *pred;
1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1446
1447 pred = root;
1448
1449 do {
1450 switch (move) {
1451 case MOVE_DOWN:
1452 if (pred->left != FILTER_PRED_INVALID) {
1453 pred = &preds[pred->left];
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1476
1477 return count;
1478}
1479
1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1481{
1482 struct filter_pred *pred;
1483 enum move_type move = MOVE_DOWN;
1484 int count = 0;
1485 int children;
1486 int done = 0;
1487
1488 /* No need to keep the fold flag */
1489 root->index &= ~FILTER_PRED_FOLD;
1490
1491 /* If the root is a leaf then do nothing */
1492 if (root->left == FILTER_PRED_INVALID)
1493 return 0;
1494
1495 /* count the children */
1496 children = count_leafs(preds, &preds[root->left]);
1497 children += count_leafs(preds, &preds[root->right]);
1498
1499 root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL);
1500 if (!root->ops)
1501 return -ENOMEM;
1502
1503 root->val = children;
1504
1505 pred = root;
1506 do {
1507 switch (move) {
1508 case MOVE_DOWN:
1509 if (pred->left != FILTER_PRED_INVALID) {
1510 pred = &preds[pred->left];
1511 continue;
1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1533
1534 return 0;
1535}
1536
1537/*
1538 * To optimize the processing of the ops, if we have several "ors" or
1539 * "ands" together, we can put them in an array and process them all
 1540 * together, speeding up the filter logic.
1541 */
1542static int fold_pred_tree(struct event_filter *filter,
1543 struct filter_pred *root)
1544{
1545 struct filter_pred *preds;
1546 struct filter_pred *pred;
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
 1563 /* Folded nodes are like leaves */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1590}
1591
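A short sketch of what folding buys (hypothetical filter): for "a == 1 && b == 2 && c == 3", all three leaves and both ANDs share the same op, so the whole expression folds into the root AND, whose ops[] array lists the three leaf indices. Matching then reduces to roughly:

	/* simplified view of process_ops() on the folded root (OP_AND) */
	for (i = 0; i < root->val; i++) {	/* root->val == 3 leaves */
		pred = &preds[root->ops[i]];
		if (!pred->fn(pred, rec))	/* AND: first miss wins */
			return 0;
	}
	return 1;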
1190static int replace_preds(struct ftrace_event_call *call, 1592static int replace_preds(struct ftrace_event_call *call,
1191 struct event_filter *filter, 1593 struct event_filter *filter,
1192 struct filter_parse_state *ps, 1594 struct filter_parse_state *ps,
@@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call,
1195{ 1597{
1196 char *operand1 = NULL, *operand2 = NULL; 1598 char *operand1 = NULL, *operand2 = NULL;
1197 struct filter_pred *pred; 1599 struct filter_pred *pred;
1600 struct filter_pred *root;
1198 struct postfix_elt *elt; 1601 struct postfix_elt *elt;
1602 struct pred_stack stack = { }; /* init to NULL */
1199 int err; 1603 int err;
1200 int n_preds = 0; 1604 int n_preds = 0;
1201 1605
1606 n_preds = count_preds(ps);
1607 if (n_preds >= MAX_FILTER_PRED) {
1608 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1609 return -ENOSPC;
1610 }
1611
1202 err = check_preds(ps); 1612 err = check_preds(ps);
1203 if (err) 1613 if (err)
1204 return err; 1614 return err;
1205 1615
1616 if (!dry_run) {
1617 err = __alloc_pred_stack(&stack, n_preds);
1618 if (err)
1619 return err;
1620 err = __alloc_preds(filter, n_preds);
1621 if (err)
1622 goto fail;
1623 }
1624
1625 n_preds = 0;
1206 list_for_each_entry(elt, &ps->postfix, list) { 1626 list_for_each_entry(elt, &ps->postfix, list) {
1207 if (elt->op == OP_NONE) { 1627 if (elt->op == OP_NONE) {
1208 if (!operand1) 1628 if (!operand1)
@@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call,
1211 operand2 = elt->operand; 1631 operand2 = elt->operand;
1212 else { 1632 else {
1213 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); 1633 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
1214 return -EINVAL; 1634 err = -EINVAL;
1635 goto fail;
1215 } 1636 }
1216 continue; 1637 continue;
1217 } 1638 }
1218 1639
1219 if (n_preds++ == MAX_FILTER_PRED) { 1640 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
1220 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 1641 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1221 return -ENOSPC; 1642 err = -ENOSPC;
1643 goto fail;
1222 } 1644 }
1223 1645
1224 if (elt->op == OP_AND || elt->op == OP_OR) { 1646 if (elt->op == OP_AND || elt->op == OP_OR) {
@@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call,
1228 1650
1229 if (!operand1 || !operand2) { 1651 if (!operand1 || !operand2) {
1230 parse_error(ps, FILT_ERR_MISSING_FIELD, 0); 1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1231 return -EINVAL; 1653 err = -EINVAL;
1654 goto fail;
1232 } 1655 }
1233 1656
1234 pred = create_pred(elt->op, operand1, operand2); 1657 pred = create_pred(elt->op, operand1, operand2);
1235add_pred: 1658add_pred:
1236 if (!pred) 1659 if (!pred) {
1237 return -ENOMEM; 1660 err = -ENOMEM;
1238 err = filter_add_pred(ps, call, filter, pred, dry_run); 1661 goto fail;
1662 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1239 filter_free_pred(pred); 1664 filter_free_pred(pred);
1240 if (err) 1665 if (err)
1241 return err; 1666 goto fail;
1242 1667
1243 operand1 = operand2 = NULL; 1668 operand1 = operand2 = NULL;
1244 } 1669 }
1245 1670
1246 return 0; 1671 if (!dry_run) {
1672 /* We should have one item left on the stack */
1673 pred = __pop_pred_stack(&stack);
1674 if (!pred)
1675 return -EINVAL;
1676 /* This item is where we start from in matching */
1677 root = pred;
1678 /* Make sure the stack is empty */
1679 pred = __pop_pred_stack(&stack);
1680 if (WARN_ON(pred)) {
1681 err = -EINVAL;
1682 filter->root = NULL;
1683 goto fail;
1684 }
1685 err = check_pred_tree(filter, root);
1686 if (err)
1687 goto fail;
1688
1689 /* Optimize the tree */
1690 err = fold_pred_tree(filter, root);
1691 if (err)
1692 goto fail;
1693
1694 /* We don't set root until we know it works */
1695 barrier();
1696 filter->root = root;
1697 }
1698
1699 err = 0;
1700fail:
1701 __free_pred_stack(&stack);
1702 return err;
1247} 1703}
1248 1704
1705struct filter_list {
1706 struct list_head list;
1707 struct event_filter *filter;
1708};
1709
1249static int replace_system_preds(struct event_subsystem *system, 1710static int replace_system_preds(struct event_subsystem *system,
1250 struct filter_parse_state *ps, 1711 struct filter_parse_state *ps,
1251 char *filter_string) 1712 char *filter_string)
1252{ 1713{
1253 struct ftrace_event_call *call; 1714 struct ftrace_event_call *call;
1715 struct filter_list *filter_item;
1716 struct filter_list *tmp;
1717 LIST_HEAD(filter_list);
1254 bool fail = true; 1718 bool fail = true;
1255 int err; 1719 int err;
1256 1720
1257 list_for_each_entry(call, &ftrace_events, list) { 1721 list_for_each_entry(call, &ftrace_events, list) {
1258 struct event_filter *filter = call->filter;
1259 1722
1260 if (strcmp(call->class->system, system->name) != 0) 1723 if (strcmp(call->class->system, system->name) != 0)
1261 continue; 1724 continue;
1262 1725
1263 /* try to see if the filter can be applied */ 1726 /*
1264 err = replace_preds(call, filter, ps, filter_string, true); 1727 * Try to see if the filter can be applied
1728 * (filter arg is ignored on dry_run)
1729 */
1730 err = replace_preds(call, NULL, ps, filter_string, true);
1265 if (err) 1731 if (err)
1732 goto fail;
1733 }
1734
1735 list_for_each_entry(call, &ftrace_events, list) {
1736 struct event_filter *filter;
1737
1738 if (strcmp(call->class->system, system->name) != 0)
1266 continue; 1739 continue;
1267 1740
1268 /* really apply the filter */ 1741 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1269 filter_disable_preds(call); 1742 if (!filter_item)
1270 err = replace_preds(call, filter, ps, filter_string, false); 1743 goto fail_mem;
1744
1745 list_add_tail(&filter_item->list, &filter_list);
1746
1747 filter_item->filter = __alloc_filter();
1748 if (!filter_item->filter)
1749 goto fail_mem;
1750 filter = filter_item->filter;
1751
 1752 /* Can only fail due to lack of memory */
1753 err = replace_filter_string(filter, filter_string);
1271 if (err) 1754 if (err)
1272 filter_disable_preds(call); 1755 goto fail_mem;
1273 else { 1756
1757 err = replace_preds(call, filter, ps, filter_string, false);
1758 if (err) {
1759 filter_disable(call);
1760 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1761 append_filter_err(ps, filter);
1762 } else
1274 call->flags |= TRACE_EVENT_FL_FILTERED; 1763 call->flags |= TRACE_EVENT_FL_FILTERED;
1275 replace_filter_string(filter, filter_string); 1764 /*
1276 } 1765 * Regardless of whether this returned an error, we still
1766 * replace the filter for the call.
1767 */
1768 filter = call->filter;
1769 call->filter = filter_item->filter;
1770 filter_item->filter = filter;
1771
1277 fail = false; 1772 fail = false;
1278 } 1773 }
1279 1774
1280 if (fail) { 1775 if (fail)
1281 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1776 goto fail;
1282 return -EINVAL; 1777
1778 /*
1779 * The calls can still be using the old filters.
1780 * Do a synchronize_sched() to ensure all calls are
1781 * done with them before we free them.
1782 */
1783 synchronize_sched();
1784 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1785 __free_filter(filter_item->filter);
1786 list_del(&filter_item->list);
1787 kfree(filter_item);
1283 } 1788 }
1284 return 0; 1789 return 0;
1790 fail:
1791 /* No call succeeded */
1792 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1793 list_del(&filter_item->list);
1794 kfree(filter_item);
1795 }
1796 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1797 return -EINVAL;
1798 fail_mem:
1799 /* If any call succeeded, we still need to sync */
1800 if (!fail)
1801 synchronize_sched();
1802 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1803 __free_filter(filter_item->filter);
1804 list_del(&filter_item->list);
1805 kfree(filter_item);
1806 }
1807 return -ENOMEM;
1285} 1808}
1286 1809
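replace_system_preds() now applies the classic two-phase pattern: validate every event with a dry run first, then allocate and commit the real filters, so a late failure can never leave the subsystem half-updated. A standalone sketch of the pattern (apply_one() and the item values are stand-ins, not the trace code's API):

#include <stdio.h>

static int apply_one(int item, int dry_run)
{
	if (item < 0)
		return -1;		/* validation failure */
	if (!dry_run)
		printf("committed %d\n", item);
	return 0;
}

int main(void)
{
	int items[] = { 1, 2, 3 };
	int i;

	for (i = 0; i < 3; i++)		/* phase 1: dry run only */
		if (apply_one(items[i], 1))
			return 1;	/* fail before anything is committed */
	for (i = 0; i < 3; i++)		/* phase 2: really apply */
		apply_one(items[i], 0);
	return 0;
}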
1287int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1810int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1288{ 1811{
1289 int err;
1290 struct filter_parse_state *ps; 1812 struct filter_parse_state *ps;
1813 struct event_filter *filter;
1814 struct event_filter *tmp;
1815 int err = 0;
1291 1816
1292 mutex_lock(&event_mutex); 1817 mutex_lock(&event_mutex);
1293 1818
1294 err = init_preds(call);
1295 if (err)
1296 goto out_unlock;
1297
1298 if (!strcmp(strstrip(filter_string), "0")) { 1819 if (!strcmp(strstrip(filter_string), "0")) {
1299 filter_disable_preds(call); 1820 filter_disable(call);
1300 remove_filter_string(call->filter); 1821 filter = call->filter;
1822 if (!filter)
1823 goto out_unlock;
1824 call->filter = NULL;
1825 /* Make sure the filter is not being used */
1826 synchronize_sched();
1827 __free_filter(filter);
1301 goto out_unlock; 1828 goto out_unlock;
1302 } 1829 }
1303 1830
@@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1306 if (!ps) 1833 if (!ps)
1307 goto out_unlock; 1834 goto out_unlock;
1308 1835
1309 filter_disable_preds(call); 1836 filter = __alloc_filter();
1310 replace_filter_string(call->filter, filter_string); 1837 if (!filter) {
1838 kfree(ps);
1839 goto out_unlock;
1840 }
1841
1842 replace_filter_string(filter, filter_string);
1311 1843
1312 parse_init(ps, filter_ops, filter_string); 1844 parse_init(ps, filter_ops, filter_string);
1313 err = filter_parse(ps); 1845 err = filter_parse(ps);
1314 if (err) { 1846 if (err) {
1315 append_filter_err(ps, call->filter); 1847 append_filter_err(ps, filter);
1316 goto out; 1848 goto out;
1317 } 1849 }
1318 1850
1319 err = replace_preds(call, call->filter, ps, filter_string, false); 1851 err = replace_preds(call, filter, ps, filter_string, false);
1320 if (err) 1852 if (err) {
1321 append_filter_err(ps, call->filter); 1853 filter_disable(call);
1322 else 1854 append_filter_err(ps, filter);
1855 } else
1323 call->flags |= TRACE_EVENT_FL_FILTERED; 1856 call->flags |= TRACE_EVENT_FL_FILTERED;
1324out: 1857out:
1858 /*
1859 * Always swap the call filter with the new filter
 1860 * even if there was an error. If the new filter failed,
 1861 * we disable the filter for the call and show the error
 1862 * string.
1863 */
1864 tmp = call->filter;
1865 call->filter = filter;
1866 if (tmp) {
1867 /* Make sure the call is done with the filter */
1868 synchronize_sched();
1869 __free_filter(tmp);
1870 }
1325 filter_opstack_clear(ps); 1871 filter_opstack_clear(ps);
1326 postfix_clear(ps); 1872 postfix_clear(ps);
1327 kfree(ps); 1873 kfree(ps);
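The swap above is the publish-then-free idiom: install the new filter, wait until no reader can still hold the old pointer, then release it. A user-space sketch under the assumption that wait_for_readers() stands in for synchronize_sched() (all names here are invented for the example):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct filter { char expr[32]; };

/* Stand-in for synchronize_sched(): in the kernel this blocks until
 * every CPU has left its preempt-disabled region, so no tracer can
 * still be walking the old filter. */
static void wait_for_readers(void) { }

static struct filter *active;	/* what the hot path reads */

int main(void)
{
	struct filter *old, *new_filter;

	active = calloc(1, sizeof(*active));
	new_filter = calloc(1, sizeof(*new_filter));
	if (!active || !new_filter)
		return 1;
	strcpy(new_filter->expr, "prio > 1");

	old = active;
	active = new_filter;	/* 1. publish the replacement */
	wait_for_readers();	/* 2. wait out concurrent readers */
	free(old);		/* 3. now the old one is unreachable */

	printf("active filter: %s\n", active->expr);
	return 0;
}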
@@ -1334,18 +1880,21 @@ out_unlock:
1334int apply_subsystem_event_filter(struct event_subsystem *system, 1880int apply_subsystem_event_filter(struct event_subsystem *system,
1335 char *filter_string) 1881 char *filter_string)
1336{ 1882{
1337 int err;
1338 struct filter_parse_state *ps; 1883 struct filter_parse_state *ps;
1884 struct event_filter *filter;
1885 int err = 0;
1339 1886
1340 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1341 1888
1342 err = init_subsystem_preds(system);
1343 if (err)
1344 goto out_unlock;
1345
1346 if (!strcmp(strstrip(filter_string), "0")) { 1889 if (!strcmp(strstrip(filter_string), "0")) {
1347 filter_free_subsystem_preds(system); 1890 filter_free_subsystem_preds(system);
1348 remove_filter_string(system->filter); 1891 remove_filter_string(system->filter);
1892 filter = system->filter;
1893 system->filter = NULL;
1894 /* Ensure all filters are no longer used */
1895 synchronize_sched();
1896 filter_free_subsystem_filters(system);
1897 __free_filter(filter);
1349 goto out_unlock; 1898 goto out_unlock;
1350 } 1899 }
1351 1900
@@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1354 if (!ps) 1903 if (!ps)
1355 goto out_unlock; 1904 goto out_unlock;
1356 1905
1357 replace_filter_string(system->filter, filter_string); 1906 filter = __alloc_filter();
1907 if (!filter)
1908 goto out;
1909
1910 replace_filter_string(filter, filter_string);
1911 /*
 1912 * No event actually uses the system filter, so
 1913 * we can free it without synchronize_sched().
1914 */
1915 __free_filter(system->filter);
1916 system->filter = filter;
1358 1917
1359 parse_init(ps, filter_ops, filter_string); 1918 parse_init(ps, filter_ops, filter_string);
1360 err = filter_parse(ps); 1919 err = filter_parse(ps);
@@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event)
1384 struct event_filter *filter = event->filter; 1943 struct event_filter *filter = event->filter;
1385 1944
1386 event->filter = NULL; 1945 event->filter = NULL;
1387 __free_preds(filter); 1946 __free_filter(filter);
1388} 1947}
1389 1948
1390int ftrace_profile_set_filter(struct perf_event *event, int event_id, 1949int ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1410 if (event->filter) 1969 if (event->filter)
1411 goto out_unlock; 1970 goto out_unlock;
1412 1971
1413 filter = __alloc_preds(); 1972 filter = __alloc_filter();
1414 if (IS_ERR(filter)) { 1973 if (!filter) {
1415 err = PTR_ERR(filter); 1974 err = -ENOMEM;
1416 goto out_unlock; 1975 goto out_unlock;
1417 } 1976 }
@@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1419 err = -ENOMEM; 1978 err = -ENOMEM;
1420 ps = kzalloc(sizeof(*ps), GFP_KERNEL); 1979 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1421 if (!ps) 1980 if (!ps)
1422 goto free_preds; 1981 goto free_filter;
1423 1982
1424 parse_init(ps, filter_ops, filter_str); 1983 parse_init(ps, filter_ops, filter_str);
1425 err = filter_parse(ps); 1984 err = filter_parse(ps);
@@ -1435,9 +1994,9 @@ free_ps:
1435 postfix_clear(ps); 1994 postfix_clear(ps);
1436 kfree(ps); 1995 kfree(ps);
1437 1996
1438free_preds: 1997free_filter:
1439 if (err) 1998 if (err)
1440 __free_preds(filter); 1999 __free_filter(filter);
1441 2000
1442out_unlock: 2001out_unlock:
1443 mutex_unlock(&event_mutex); 2002 mutex_unlock(&event_mutex);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2dec9bcde8b..8435b43b178 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
353 kfree(data); 353 kfree(data);
354} 354}
355 355
356/* Bitfield fetch function */
357struct bitfield_fetch_param {
358 struct fetch_param orig;
359 unsigned char hi_shift;
360 unsigned char low_shift;
361};
362
363#define DEFINE_FETCH_bitfield(type) \
364static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
365 void *data, void *dest) \
366{ \
367 struct bitfield_fetch_param *bprm = data; \
368 type buf = 0; \
369 call_fetch(&bprm->orig, regs, &buf); \
370 if (buf) { \
371 buf <<= bprm->hi_shift; \
372 buf >>= bprm->low_shift; \
373 } \
374 *(type *)dest = buf; \
375}
376DEFINE_BASIC_FETCH_FUNCS(bitfield)
377#define fetch_bitfield_string NULL
378#define fetch_bitfield_string_size NULL
379
380static __kprobes void
381free_bitfield_fetch_param(struct bitfield_fetch_param *data)
382{
383 /*
384 * Don't check the bitfield itself, because this must be the
385 * last fetch function.
386 */
387 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
388 free_deref_fetch_param(data->orig.data);
389 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
390 free_symbol_cache(data->orig.data);
391 kfree(data);
392}
356/* Default (unsigned long) fetch type */ 393/* Default (unsigned long) fetch type */
357#define __DEFAULT_FETCH_TYPE(t) u##t 394#define __DEFAULT_FETCH_TYPE(t) u##t
358#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 395#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -367,6 +404,7 @@ enum {
367 FETCH_MTD_memory, 404 FETCH_MTD_memory,
368 FETCH_MTD_symbol, 405 FETCH_MTD_symbol,
369 FETCH_MTD_deref, 406 FETCH_MTD_deref,
407 FETCH_MTD_bitfield,
370 FETCH_MTD_END, 408 FETCH_MTD_END,
371}; 409};
372 410
@@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \
387ASSIGN_FETCH_FUNC(memory, ftype), \ 425ASSIGN_FETCH_FUNC(memory, ftype), \
388ASSIGN_FETCH_FUNC(symbol, ftype), \ 426ASSIGN_FETCH_FUNC(symbol, ftype), \
389ASSIGN_FETCH_FUNC(deref, ftype), \ 427ASSIGN_FETCH_FUNC(deref, ftype), \
428ASSIGN_FETCH_FUNC(bitfield, ftype), \
390 } \ 429 } \
391 } 430 }
392 431
@@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type)
430 if (!type) 469 if (!type)
431 type = DEFAULT_FETCH_TYPE_STR; 470 type = DEFAULT_FETCH_TYPE_STR;
432 471
472 /* Special case: bitfield */
473 if (*type == 'b') {
474 unsigned long bs;
475 type = strchr(type, '/');
476 if (!type)
477 goto fail;
478 type++;
479 if (strict_strtoul(type, 0, &bs))
480 goto fail;
481 switch (bs) {
482 case 8:
483 return find_fetch_type("u8");
484 case 16:
485 return find_fetch_type("u16");
486 case 32:
487 return find_fetch_type("u32");
488 case 64:
489 return find_fetch_type("u64");
490 default:
491 goto fail;
492 }
493 }
494
433 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) 495 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
434 if (strcmp(type, fetch_type_table[i].name) == 0) 496 if (strcmp(type, fetch_type_table[i].name) == 0)
435 return &fetch_type_table[i]; 497 return &fetch_type_table[i];
498fail:
436 return NULL; 499 return NULL;
437} 500}
438 501
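The special case maps a bitfield type string such as "b3@5/32" to the unsigned container type named after the number following '/'. A hedged user-space equivalent (using the standard strtoul() in place of the kernel's strict_strtoul()):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const char *type = "b3@5/32";
	const char *slash = strchr(type, '/');
	unsigned long bs;

	if (!slash)
		return 1;
	bs = strtoul(slash + 1, NULL, 0);
	switch (bs) {
	case 8:  puts("u8");  break;
	case 16: puts("u16"); break;
	case 32: puts("u32"); break;	/* chosen for "b3@5/32" */
	case 64: puts("u64"); break;
	default: puts("invalid");
	}
	return 0;
}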
@@ -586,7 +649,9 @@ error:
586 649
587static void free_probe_arg(struct probe_arg *arg) 650static void free_probe_arg(struct probe_arg *arg)
588{ 651{
589 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) 652 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
653 free_bitfield_fetch_param(arg->fetch.data);
654 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
590 free_deref_fetch_param(arg->fetch.data); 655 free_deref_fetch_param(arg->fetch.data);
591 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) 656 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
592 free_symbol_cache(arg->fetch.data); 657 free_symbol_cache(arg->fetch.data);
@@ -767,16 +832,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
767 } 832 }
768 break; 833 break;
769 case '+': /* deref memory */ 834 case '+': /* deref memory */
835 arg++; /* Skip '+', because strict_strtol() rejects it. */
770 case '-': 836 case '-':
771 tmp = strchr(arg, '('); 837 tmp = strchr(arg, '(');
772 if (!tmp) 838 if (!tmp)
773 break; 839 break;
774 *tmp = '\0'; 840 *tmp = '\0';
775 ret = strict_strtol(arg + 1, 0, &offset); 841 ret = strict_strtol(arg, 0, &offset);
776 if (ret) 842 if (ret)
777 break; 843 break;
778 if (arg[0] == '-')
779 offset = -offset;
780 arg = tmp + 1; 844 arg = tmp + 1;
781 tmp = strrchr(arg, ')'); 845 tmp = strrchr(arg, ')');
782 if (tmp) { 846 if (tmp) {
@@ -807,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
807 return ret; 871 return ret;
808} 872}
809 873
874#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
875
876/* Bitfield type needs to be parsed into a fetch function */
877static int __parse_bitfield_probe_arg(const char *bf,
878 const struct fetch_type *t,
879 struct fetch_param *f)
880{
881 struct bitfield_fetch_param *bprm;
882 unsigned long bw, bo;
883 char *tail;
884
885 if (*bf != 'b')
886 return 0;
887
888 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
889 if (!bprm)
890 return -ENOMEM;
891 bprm->orig = *f;
892 f->fn = t->fetch[FETCH_MTD_bitfield];
893 f->data = (void *)bprm;
894
 895 bw = simple_strtoul(bf + 1, &tail, 0); /* non-strict parse; tail is checked below */
896 if (bw == 0 || *tail != '@')
897 return -EINVAL;
898
899 bf = tail + 1;
900 bo = simple_strtoul(bf, &tail, 0);
901 if (tail == bf || *tail != '/')
902 return -EINVAL;
903
904 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
905 bprm->low_shift = bprm->hi_shift + bo;
906 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
907}
908
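The two shifts computed above extract the field: hi_shift discards the bits above it, and low_shift (which folds the offset back in) discards the bits below. A worked standalone example for a hypothetical b3@5/32 argument:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t buf = 0x000000e0;		/* bits 5..7 set */
	unsigned bw = 3, bo = 5, sz = 32;	/* width, offset, container bits */
	unsigned hi_shift = sz - (bw + bo);	/* 32 - 8 = 24 */
	unsigned low_shift = hi_shift + bo;	/* 24 + 5 = 29 */

	buf <<= hi_shift;	/* field now occupies the top bits */
	buf >>= low_shift;	/* shift it down to bit 0 */
	printf("field = %u\n", buf);	/* prints 7 */
	return 0;
}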
810/* String length checking wrapper */ 909/* String length checking wrapper */
811static int parse_probe_arg(char *arg, struct trace_probe *tp, 910static int parse_probe_arg(char *arg, struct trace_probe *tp,
812 struct probe_arg *parg, int is_return) 911 struct probe_arg *parg, int is_return)
@@ -836,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
836 parg->offset = tp->size; 935 parg->offset = tp->size;
837 tp->size += parg->type->size; 936 tp->size += parg->type->size;
838 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); 937 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
938 if (ret >= 0 && t != NULL)
939 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
839 if (ret >= 0) { 940 if (ret >= 0) {
840 parg->fetch_size.fn = get_fetch_size_function(parg->type, 941 parg->fetch_size.fn = get_fetch_size_function(parg->type,
841 parg->fetch.fn); 942 parg->fetch.fn);
@@ -1130,7 +1231,7 @@ static int command_trace_probe(const char *buf)
1130 return ret; 1231 return ret;
1131} 1232}
1132 1233
1133#define WRITE_BUFSIZE 128 1234#define WRITE_BUFSIZE 4096
1134 1235
1135static ssize_t probes_write(struct file *file, const char __user *buffer, 1236static ssize_t probes_write(struct file *file, const char __user *buffer,
1136 size_t count, loff_t *ppos) 1237 size_t count, loff_t *ppos)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 02272baa220..456be9063c2 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -529,24 +529,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
529 * @entry: The trace entry field from the ring buffer 529 * @entry: The trace entry field from the ring buffer
530 * 530 *
531 * Prints the generic fields of irqs off, in hard or softirq, preempt 531 * Prints the generic fields of irqs off, in hard or softirq, preempt
532 * count and lock depth. 532 * count.
533 */ 533 */
534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) 534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
535{ 535{
536 int hardirq, softirq; 536 char hardsoft_irq;
537 char need_resched;
538 char irqs_off;
539 int hardirq;
540 int softirq;
537 int ret; 541 int ret;
538 542
539 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 543 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
540 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 544 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
541 545
546 irqs_off =
547 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
548 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
549 '.';
550 need_resched =
551 (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
552 hardsoft_irq =
553 (hardirq && softirq) ? 'H' :
554 hardirq ? 'h' :
555 softirq ? 's' :
556 '.';
557
542 if (!trace_seq_printf(s, "%c%c%c", 558 if (!trace_seq_printf(s, "%c%c%c",
543 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 559 irqs_off, need_resched, hardsoft_irq))
544 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
545 'X' : '.',
546 (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
547 'N' : '.',
548 (hardirq && softirq) ? 'H' :
549 hardirq ? 'h' : softirq ? 's' : '.'))
550 return 0; 560 return 0;
551 561
552 if (entry->preempt_count) 562 if (entry->preempt_count)
@@ -554,13 +564,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
554 else 564 else
555 ret = trace_seq_putc(s, '.'); 565 ret = trace_seq_putc(s, '.');
556 566
557 if (!ret) 567 return ret;
558 return 0;
559
560 if (entry->lock_depth < 0)
561 return trace_seq_putc(s, '.');
562
563 return trace_seq_printf(s, "%d", entry->lock_depth);
564} 568}
565 569
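The rewrite makes the three latency-format characters explicit: irqs-off ('d'/'X'/'.'), need-resched ('N'/'.'), and hard/soft-irq context ('H'/'h'/'s'/'.'). A standalone sketch of the same selection logic (the flag values below are assumptions for the example, not taken from the kernel headers):

#include <stdio.h>

#define TRACE_FLAG_IRQS_OFF		0x01	/* assumed values */
#define TRACE_FLAG_IRQS_NOSUPPORT	0x02
#define TRACE_FLAG_NEED_RESCHED		0x04
#define TRACE_FLAG_HARDIRQ		0x08
#define TRACE_FLAG_SOFTIRQ		0x10

int main(void)
{
	unsigned flags = TRACE_FLAG_IRQS_OFF | TRACE_FLAG_HARDIRQ;
	int hardirq = flags & TRACE_FLAG_HARDIRQ;
	int softirq = flags & TRACE_FLAG_SOFTIRQ;

	char irqs_off = (flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
			(flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.';
	char need_resched = (flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
	char hardsoft_irq = (hardirq && softirq) ? 'H' :
			    hardirq ? 'h' : softirq ? 's' : '.';

	printf("%c%c%c\n", irqs_off, need_resched, hardsoft_irq);	/* "d.h" */
	return 0;
}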
566static int 570static int
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8f758d070c4..7e62c0a1845 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
247 ctx_trace = tr; 247 ctx_trace = tr;
248} 248}
249 249
250static void stop_sched_trace(struct trace_array *tr)
251{
252 tracing_stop_sched_switch_record();
253}
254
255static int sched_switch_trace_init(struct trace_array *tr)
256{
257 ctx_trace = tr;
258 tracing_reset_online_cpus(tr);
259 tracing_start_sched_switch_record();
260 return 0;
261}
262
263static void sched_switch_trace_reset(struct trace_array *tr)
264{
265 if (sched_ref)
266 stop_sched_trace(tr);
267}
268
269static void sched_switch_trace_start(struct trace_array *tr)
270{
271 sched_stopped = 0;
272}
273
274static void sched_switch_trace_stop(struct trace_array *tr)
275{
276 sched_stopped = 1;
277}
278
279static struct tracer sched_switch_trace __read_mostly =
280{
281 .name = "sched_switch",
282 .init = sched_switch_trace_init,
283 .reset = sched_switch_trace_reset,
284 .start = sched_switch_trace_start,
285 .stop = sched_switch_trace_stop,
286 .wait_pipe = poll_wait_pipe,
287#ifdef CONFIG_FTRACE_SELFTEST
288 .selftest = trace_selftest_startup_sched_switch,
289#endif
290};
291
292__init static int init_sched_switch_trace(void)
293{
294 return register_tracer(&sched_switch_trace);
295}
296device_initcall(init_sched_switch_trace);
297
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5c9fe08d209..ee7b5a0bb9f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[];
60 60
61static struct syscall_metadata **syscalls_metadata; 61static struct syscall_metadata **syscalls_metadata;
62 62
63#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
64static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
65{
66 /*
67 * Only compare after the "sys" prefix. Archs that use
68 * syscall wrappers may have syscalls symbols aliases prefixed
69 * with "SyS" instead of "sys", leading to an unwanted
70 * mismatch.
71 */
72 return !strcmp(sym + 3, name + 3);
73}
74#endif
75
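The helper compares symbol names only past the three-character prefix, so a wrapper alias like "SyS_read" still matches the metadata name "sys_read". A small user-space demonstration:

#include <stdio.h>
#include <string.h>

static int match_sym_name(const char *sym, const char *name)
{
	return !strcmp(sym + 3, name + 3);	/* skip "sys"/"SyS" */
}

int main(void)
{
	printf("%d\n", match_sym_name("SyS_read", "sys_read"));	/* 1 */
	printf("%d\n", match_sym_name("sys_read", "sys_read"));	/* 1 */
	printf("%d\n", match_sym_name("sys_read", "sys_write"));	/* 0 */
	return 0;
}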
63static __init struct syscall_metadata * 76static __init struct syscall_metadata *
64find_syscall_meta(unsigned long syscall) 77find_syscall_meta(unsigned long syscall)
65{ 78{
@@ -72,14 +85,11 @@ find_syscall_meta(unsigned long syscall)
72 stop = __stop_syscalls_metadata; 85 stop = __stop_syscalls_metadata;
73 kallsyms_lookup(syscall, NULL, NULL, NULL, str); 86 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
74 87
88 if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
89 return NULL;
90
75 for ( ; start < stop; start++) { 91 for ( ; start < stop; start++) {
76 /* 92 if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
77 * Only compare after the "sys" prefix. Archs that use
78 * syscall wrappers may have syscalls symbols aliases prefixed
79 * with "SyS" instead of "sys", leading to an unwanted
80 * mismatch.
81 */
82 if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
83 return *start; 93 return *start;
84 } 94 }
85 return NULL; 95 return NULL;
@@ -359,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
359 int num; 369 int num;
360 370
361 num = ((struct syscall_metadata *)call->data)->syscall_nr; 371 num = ((struct syscall_metadata *)call->data)->syscall_nr;
362 if (num < 0 || num >= NR_syscalls) 372 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
363 return -ENOSYS; 373 return -ENOSYS;
364 mutex_lock(&syscall_trace_lock); 374 mutex_lock(&syscall_trace_lock);
365 if (!sys_refcount_enter) 375 if (!sys_refcount_enter)
@@ -377,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
377 int num; 387 int num;
378 388
379 num = ((struct syscall_metadata *)call->data)->syscall_nr; 389 num = ((struct syscall_metadata *)call->data)->syscall_nr;
380 if (num < 0 || num >= NR_syscalls) 390 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
381 return; 391 return;
382 mutex_lock(&syscall_trace_lock); 392 mutex_lock(&syscall_trace_lock);
383 sys_refcount_enter--; 393 sys_refcount_enter--;
@@ -393,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
393 int num; 403 int num;
394 404
395 num = ((struct syscall_metadata *)call->data)->syscall_nr; 405 num = ((struct syscall_metadata *)call->data)->syscall_nr;
396 if (num < 0 || num >= NR_syscalls) 406 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
397 return -ENOSYS; 407 return -ENOSYS;
398 mutex_lock(&syscall_trace_lock); 408 mutex_lock(&syscall_trace_lock);
399 if (!sys_refcount_exit) 409 if (!sys_refcount_exit)
@@ -411,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
411 int num; 421 int num;
412 422
413 num = ((struct syscall_metadata *)call->data)->syscall_nr; 423 num = ((struct syscall_metadata *)call->data)->syscall_nr;
414 if (num < 0 || num >= NR_syscalls) 424 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
415 return; 425 return;
416 mutex_lock(&syscall_trace_lock); 426 mutex_lock(&syscall_trace_lock);
417 sys_refcount_exit--; 427 sys_refcount_exit--;
@@ -424,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
424int init_syscall_trace(struct ftrace_event_call *call) 434int init_syscall_trace(struct ftrace_event_call *call)
425{ 435{
426 int id; 436 int id;
437 int num;
438
439 num = ((struct syscall_metadata *)call->data)->syscall_nr;
440 if (num < 0 || num >= NR_syscalls) {
441 pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
442 ((struct syscall_metadata *)call->data)->name);
443 return -ENOSYS;
444 }
427 445
428 if (set_syscall_print_fmt(call) < 0) 446 if (set_syscall_print_fmt(call) < 0)
429 return -ENOMEM; 447 return -ENOMEM;
@@ -438,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call)
438 return id; 456 return id;
439} 457}
440 458
441unsigned long __init arch_syscall_addr(int nr) 459unsigned long __init __weak arch_syscall_addr(int nr)
442{ 460{
443 return (unsigned long)sys_call_table[nr]; 461 return (unsigned long)sys_call_table[nr];
444} 462}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee6578b578a..b5fe4c00eb3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -316,6 +316,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
316 316
317static struct debug_obj_descr work_debug_descr; 317static struct debug_obj_descr work_debug_descr;
318 318
319static void *work_debug_hint(void *addr)
320{
321 return ((struct work_struct *) addr)->func;
322}
323
319/* 324/*
320 * fixup_init is called when: 325 * fixup_init is called when:
321 * - an active object is initialized 326 * - an active object is initialized
@@ -387,6 +392,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
387 392
388static struct debug_obj_descr work_debug_descr = { 393static struct debug_obj_descr work_debug_descr = {
389 .name = "work_struct", 394 .name = "work_struct",
395 .debug_hint = work_debug_hint,
390 .fixup_init = work_fixup_init, 396 .fixup_init = work_fixup_init,
391 .fixup_activate = work_fixup_activate, 397 .fixup_activate = work_fixup_activate,
392 .fixup_free = work_fixup_free, 398 .fixup_free = work_fixup_free,
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index deebcc57d4e..9d86e45086f 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -249,14 +249,17 @@ static struct debug_bucket *get_bucket(unsigned long addr)
249 249
250static void debug_print_object(struct debug_obj *obj, char *msg) 250static void debug_print_object(struct debug_obj *obj, char *msg)
251{ 251{
252 struct debug_obj_descr *descr = obj->descr;
252 static int limit; 253 static int limit;
253 254
254 if (limit < 5 && obj->descr != descr_test) { 255 if (limit < 5 && descr != descr_test) {
256 void *hint = descr->debug_hint ?
257 descr->debug_hint(obj->object) : NULL;
255 limit++; 258 limit++;
256 WARN(1, KERN_ERR "ODEBUG: %s %s (active state %u) " 259 WARN(1, KERN_ERR "ODEBUG: %s %s (active state %u) "
257 "object type: %s\n", 260 "object type: %s hint: %pS\n",
258 msg, obj_states[obj->state], obj->astate, 261 msg, obj_states[obj->state], obj->astate,
259 obj->descr->name); 262 descr->name, hint);
260 } 263 }
261 debug_objects_warnings++; 264 debug_objects_warnings++;
262} 265}
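debug_hint() lets a descriptor turn an untyped object address into something identifiable - for work items, the callback function, which %pS then resolves to a symbol. A user-space sketch of the callback shape (types and names invented for the example; %p is used since %pS is kernel-only):

#include <stdio.h>

struct work { void (*func)(void); };

static void flush_logs(void) { }

/* The debug_hint()-style callback: given the raw object address,
 * return something that identifies it - here, the work function. */
static void *work_hint(void *addr)
{
	return (void *)(unsigned long)((struct work *)addr)->func;
}

int main(void)
{
	struct work w = { flush_logs };

	printf("object hint: %p\n", work_hint(&w));
	return 0;
}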
diff --git a/lib/plist.c b/lib/plist.c
index 1471988d919..0ae7e643172 100644
--- a/lib/plist.c
+++ b/lib/plist.c
@@ -28,6 +28,8 @@
28 28
29#ifdef CONFIG_DEBUG_PI_LIST 29#ifdef CONFIG_DEBUG_PI_LIST
30 30
31static struct plist_head test_head;
32
31static void plist_check_prev_next(struct list_head *t, struct list_head *p, 33static void plist_check_prev_next(struct list_head *t, struct list_head *p,
32 struct list_head *n) 34 struct list_head *n)
33{ 35{
@@ -54,12 +56,13 @@ static void plist_check_list(struct list_head *top)
54 56
55static void plist_check_head(struct plist_head *head) 57static void plist_check_head(struct plist_head *head)
56{ 58{
57 WARN_ON(!head->rawlock && !head->spinlock); 59 WARN_ON(head != &test_head && !head->rawlock && !head->spinlock);
58 if (head->rawlock) 60 if (head->rawlock)
59 WARN_ON_SMP(!raw_spin_is_locked(head->rawlock)); 61 WARN_ON_SMP(!raw_spin_is_locked(head->rawlock));
60 if (head->spinlock) 62 if (head->spinlock)
61 WARN_ON_SMP(!spin_is_locked(head->spinlock)); 63 WARN_ON_SMP(!spin_is_locked(head->spinlock));
62 plist_check_list(&head->prio_list); 64 if (!plist_head_empty(head))
65 plist_check_list(&plist_first(head)->prio_list);
63 plist_check_list(&head->node_list); 66 plist_check_list(&head->node_list);
64} 67}
65 68
@@ -75,25 +78,33 @@ static void plist_check_head(struct plist_head *head)
75 */ 78 */
76void plist_add(struct plist_node *node, struct plist_head *head) 79void plist_add(struct plist_node *node, struct plist_head *head)
77{ 80{
78 struct plist_node *iter; 81 struct plist_node *first, *iter, *prev = NULL;
82 struct list_head *node_next = &head->node_list;
79 83
80 plist_check_head(head); 84 plist_check_head(head);
81 WARN_ON(!plist_node_empty(node)); 85 WARN_ON(!plist_node_empty(node));
86 WARN_ON(!list_empty(&node->prio_list));
87
88 if (plist_head_empty(head))
89 goto ins_node;
82 90
83 list_for_each_entry(iter, &head->prio_list, plist.prio_list) { 91 first = iter = plist_first(head);
84 if (node->prio < iter->prio) 92
85 goto lt_prio; 93 do {
86 else if (node->prio == iter->prio) { 94 if (node->prio < iter->prio) {
87 iter = list_entry(iter->plist.prio_list.next, 95 node_next = &iter->node_list;
88 struct plist_node, plist.prio_list); 96 break;
89 goto eq_prio;
90 } 97 }
91 }
92 98
93lt_prio: 99 prev = iter;
94 list_add_tail(&node->plist.prio_list, &iter->plist.prio_list); 100 iter = list_entry(iter->prio_list.next,
95eq_prio: 101 struct plist_node, prio_list);
96 list_add_tail(&node->plist.node_list, &iter->plist.node_list); 102 } while (iter != first);
103
104 if (!prev || prev->prio != node->prio)
105 list_add_tail(&node->prio_list, &iter->prio_list);
106ins_node:
107 list_add_tail(&node->node_list, node_next);
97 108
98 plist_check_head(head); 109 plist_check_head(head);
99} 110}
@@ -108,14 +119,98 @@ void plist_del(struct plist_node *node, struct plist_head *head)
108{ 119{
109 plist_check_head(head); 120 plist_check_head(head);
110 121
111 if (!list_empty(&node->plist.prio_list)) { 122 if (!list_empty(&node->prio_list)) {
112 struct plist_node *next = plist_first(&node->plist); 123 if (node->node_list.next != &head->node_list) {
124 struct plist_node *next;
125
126 next = list_entry(node->node_list.next,
127 struct plist_node, node_list);
113 128
114 list_move_tail(&next->plist.prio_list, &node->plist.prio_list); 129 /* add the next plist_node into prio_list */
115 list_del_init(&node->plist.prio_list); 130 if (list_empty(&next->prio_list))
131 list_add(&next->prio_list, &node->prio_list);
132 }
133 list_del_init(&node->prio_list);
116 } 134 }
117 135
118 list_del_init(&node->plist.node_list); 136 list_del_init(&node->node_list);
119 137
120 plist_check_head(head); 138 plist_check_head(head);
121} 139}
140
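The reworked plist keeps two intrusive lists: node_list links every node in priority order, while prio_list links only the first node of each distinct priority, which is the invariant plist_add()/plist_del() maintain above. A toy model of that invariant (an array and a flag stand in for the real list_head linkage):

#include <stdio.h>

struct tnode { int prio; int on_prio_list; };

int main(void)
{
	struct tnode n[] = { {1,0}, {1,0}, {3,0}, {7,0}, {7,0}, {7,0} };
	int i, nr = sizeof(n) / sizeof(n[0]);

	for (i = 0; i < nr; i++)
		if (i == 0 || n[i].prio != n[i - 1].prio)
			n[i].on_prio_list = 1;	/* head of its priority group */

	for (i = 0; i < nr; i++)
		printf("prio=%d%s\n", n[i].prio,
		       n[i].on_prio_list ? "  <- on prio_list" : "");
	return 0;
}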
141#ifdef CONFIG_DEBUG_PI_LIST
142#include <linux/sched.h>
143#include <linux/module.h>
144#include <linux/init.h>
145
146static struct plist_node __initdata test_node[241];
147
148static void __init plist_test_check(int nr_expect)
149{
150 struct plist_node *first, *prio_pos, *node_pos;
151
152 if (plist_head_empty(&test_head)) {
153 BUG_ON(nr_expect != 0);
154 return;
155 }
156
157 prio_pos = first = plist_first(&test_head);
158 plist_for_each(node_pos, &test_head) {
159 if (nr_expect-- < 0)
160 break;
161 if (node_pos == first)
162 continue;
163 if (node_pos->prio == prio_pos->prio) {
164 BUG_ON(!list_empty(&node_pos->prio_list));
165 continue;
166 }
167
168 BUG_ON(prio_pos->prio > node_pos->prio);
169 BUG_ON(prio_pos->prio_list.next != &node_pos->prio_list);
170 prio_pos = node_pos;
171 }
172
173 BUG_ON(nr_expect != 0);
174 BUG_ON(prio_pos->prio_list.next != &first->prio_list);
175}
176
177static int __init plist_test(void)
178{
179 int nr_expect = 0, i, loop;
180 unsigned int r = local_clock();
181
182 printk(KERN_INFO "start plist test\n");
183 plist_head_init(&test_head, NULL);
184 for (i = 0; i < ARRAY_SIZE(test_node); i++)
185 plist_node_init(test_node + i, 0);
186
187 for (loop = 0; loop < 1000; loop++) {
188 r = r * 193939 % 47629;
189 i = r % ARRAY_SIZE(test_node);
190 if (plist_node_empty(test_node + i)) {
191 r = r * 193939 % 47629;
192 test_node[i].prio = r % 99;
193 plist_add(test_node + i, &test_head);
194 nr_expect++;
195 } else {
196 plist_del(test_node + i, &test_head);
197 nr_expect--;
198 }
199 plist_test_check(nr_expect);
200 }
201
202 for (i = 0; i < ARRAY_SIZE(test_node); i++) {
203 if (plist_node_empty(test_node + i))
204 continue;
205 plist_del(test_node + i, &test_head);
206 nr_expect--;
207 plist_test_check(nr_expect);
208 }
209
210 printk(KERN_INFO "end plist test\n");
211 return 0;
212}
213
214module_init(plist_test);
215
216#endif
diff --git a/lib/rwsem.c b/lib/rwsem.c
index f236d7cd5cf..aa7c3052261 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -222,8 +222,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
222/* 222/*
223 * wait for the read lock to be granted 223 * wait for the read lock to be granted
224 */ 224 */
225asmregparm struct rw_semaphore __sched * 225struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
226rwsem_down_read_failed(struct rw_semaphore *sem)
227{ 226{
228 return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ, 227 return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ,
229 -RWSEM_ACTIVE_READ_BIAS); 228 -RWSEM_ACTIVE_READ_BIAS);
@@ -232,8 +231,7 @@ rwsem_down_read_failed(struct rw_semaphore *sem)
232/* 231/*
233 * wait for the write lock to be granted 232 * wait for the write lock to be granted
234 */ 233 */
235asmregparm struct rw_semaphore __sched * 234struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
236rwsem_down_write_failed(struct rw_semaphore *sem)
237{ 235{
238 return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE, 236 return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE,
239 -RWSEM_ACTIVE_WRITE_BIAS); 237 -RWSEM_ACTIVE_WRITE_BIAS);
@@ -243,7 +241,7 @@ rwsem_down_write_failed(struct rw_semaphore *sem)
243 * handle waking up a waiter on the semaphore 241 * handle waking up a waiter on the semaphore
244 * - up_read/up_write has decremented the active part of count if we come here 242 * - up_read/up_write has decremented the active part of count if we come here
245 */ 243 */
246asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) 244struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
247{ 245{
248 unsigned long flags; 246 unsigned long flags;
249 247
@@ -263,7 +261,7 @@ asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
263 * - caller incremented waiting part of count and discovered it still negative 261 * - caller incremented waiting part of count and discovered it still negative
264 * - just wake up any readers at the front of the queue 262 * - just wake up any readers at the front of the queue
265 */ 263 */
266asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) 264struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
267{ 265{
268 unsigned long flags; 266 unsigned long flags;
269 267
diff --git a/mm/Makefile b/mm/Makefile
index 2b1b575ae71..42a8326c3e3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -7,7 +7,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o pgtable-generic.o 8 vmalloc.o pagewalk.o pgtable-generic.o
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o \ 11 maccess.o page_alloc.o page-writeback.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
@@ -15,6 +15,12 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
15 $(mmu-y) 15 $(mmu-y)
16obj-y += init-mm.o 16obj-y += init-mm.o
17 17
18ifdef CONFIG_NO_BOOTMEM
19 obj-y += nobootmem.o
20else
21 obj-y += bootmem.o
22endif
23
18obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 24obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
19 25
20obj-$(CONFIG_BOUNCE) += bounce.o 26obj-$(CONFIG_BOUNCE) += bounce.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 13b0caa9793..07aeb89e396 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -23,6 +23,13 @@
23 23
24#include "internal.h" 24#include "internal.h"
25 25
26#ifndef CONFIG_NEED_MULTIPLE_NODES
27struct pglist_data __refdata contig_page_data = {
28 .bdata = &bootmem_node_data[0]
29};
30EXPORT_SYMBOL(contig_page_data);
31#endif
32
26unsigned long max_low_pfn; 33unsigned long max_low_pfn;
27unsigned long min_low_pfn; 34unsigned long min_low_pfn;
28unsigned long max_pfn; 35unsigned long max_pfn;
@@ -35,7 +42,6 @@ unsigned long max_pfn;
35unsigned long saved_max_pfn; 42unsigned long saved_max_pfn;
36#endif 43#endif
37 44
38#ifndef CONFIG_NO_BOOTMEM
39bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; 45bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
40 46
41static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); 47static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -146,7 +152,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
146 min_low_pfn = start; 152 min_low_pfn = start;
147 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); 153 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
148} 154}
149#endif 155
150/* 156/*
151 * free_bootmem_late - free bootmem pages directly to page allocator 157 * free_bootmem_late - free bootmem pages directly to page allocator
152 * @addr: starting address of the range 158 * @addr: starting address of the range
@@ -171,53 +177,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
171 } 177 }
172} 178}
173 179
174#ifdef CONFIG_NO_BOOTMEM
175static void __init __free_pages_memory(unsigned long start, unsigned long end)
176{
177 int i;
178 unsigned long start_aligned, end_aligned;
179 int order = ilog2(BITS_PER_LONG);
180
181 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
182 end_aligned = end & ~(BITS_PER_LONG - 1);
183
184 if (end_aligned <= start_aligned) {
185 for (i = start; i < end; i++)
186 __free_pages_bootmem(pfn_to_page(i), 0);
187
188 return;
189 }
190
191 for (i = start; i < start_aligned; i++)
192 __free_pages_bootmem(pfn_to_page(i), 0);
193
194 for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
195 __free_pages_bootmem(pfn_to_page(i), order);
196
197 for (i = end_aligned; i < end; i++)
198 __free_pages_bootmem(pfn_to_page(i), 0);
199}
200
201unsigned long __init free_all_memory_core_early(int nodeid)
202{
203 int i;
204 u64 start, end;
205 unsigned long count = 0;
206 struct range *range = NULL;
207 int nr_range;
208
209 nr_range = get_free_all_memory_range(&range, nodeid);
210
211 for (i = 0; i < nr_range; i++) {
212 start = range[i].start;
213 end = range[i].end;
214 count += end - start;
215 __free_pages_memory(start, end);
216 }
217
218 return count;
219}
220#else
221static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) 180static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
222{ 181{
223 int aligned; 182 int aligned;
@@ -278,7 +237,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
278 237
279 return count; 238 return count;
280} 239}
281#endif
282 240
283/** 241/**
284 * free_all_bootmem_node - release a node's free pages to the buddy allocator 242 * free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -289,12 +247,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
289unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 247unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
290{ 248{
291 register_page_bootmem_info_node(pgdat); 249 register_page_bootmem_info_node(pgdat);
292#ifdef CONFIG_NO_BOOTMEM
293 /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
294 return 0;
295#else
296 return free_all_bootmem_core(pgdat->bdata); 250 return free_all_bootmem_core(pgdat->bdata);
297#endif
298} 251}
299 252
300/** 253/**
@@ -304,16 +257,6 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
304 */ 257 */
305unsigned long __init free_all_bootmem(void) 258unsigned long __init free_all_bootmem(void)
306{ 259{
307#ifdef CONFIG_NO_BOOTMEM
308 /*
309 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
310 * because in some case like Node0 doesnt have RAM installed
311 * low ram will be on Node1
312 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
313 * will be used instead of only Node0 related
314 */
315 return free_all_memory_core_early(MAX_NUMNODES);
316#else
317 unsigned long total_pages = 0; 260 unsigned long total_pages = 0;
318 bootmem_data_t *bdata; 261 bootmem_data_t *bdata;
319 262
@@ -321,10 +264,8 @@ unsigned long __init free_all_bootmem(void)
321 total_pages += free_all_bootmem_core(bdata); 264 total_pages += free_all_bootmem_core(bdata);
322 265
323 return total_pages; 266 return total_pages;
324#endif
325} 267}
326 268
327#ifndef CONFIG_NO_BOOTMEM
328static void __init __free(bootmem_data_t *bdata, 269static void __init __free(bootmem_data_t *bdata,
329 unsigned long sidx, unsigned long eidx) 270 unsigned long sidx, unsigned long eidx)
330{ 271{
@@ -419,7 +360,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
419 } 360 }
420 BUG(); 361 BUG();
421} 362}
422#endif
423 363
424/** 364/**
425 * free_bootmem_node - mark a page range as usable 365 * free_bootmem_node - mark a page range as usable
@@ -434,10 +374,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
434void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 374void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
435 unsigned long size) 375 unsigned long size)
436{ 376{
437#ifdef CONFIG_NO_BOOTMEM
438 kmemleak_free_part(__va(physaddr), size);
439 memblock_x86_free_range(physaddr, physaddr + size);
440#else
441 unsigned long start, end; 377 unsigned long start, end;
442 378
443 kmemleak_free_part(__va(physaddr), size); 379 kmemleak_free_part(__va(physaddr), size);
@@ -446,7 +382,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
446 end = PFN_DOWN(physaddr + size); 382 end = PFN_DOWN(physaddr + size);
447 383
448 mark_bootmem_node(pgdat->bdata, start, end, 0, 0); 384 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
449#endif
450} 385}
451 386
452/** 387/**
@@ -460,10 +395,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
460 */ 395 */
461void __init free_bootmem(unsigned long addr, unsigned long size) 396void __init free_bootmem(unsigned long addr, unsigned long size)
462{ 397{
463#ifdef CONFIG_NO_BOOTMEM
464 kmemleak_free_part(__va(addr), size);
465 memblock_x86_free_range(addr, addr + size);
466#else
467 unsigned long start, end; 398 unsigned long start, end;
468 399
469 kmemleak_free_part(__va(addr), size); 400 kmemleak_free_part(__va(addr), size);
@@ -472,7 +403,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
472 end = PFN_DOWN(addr + size); 403 end = PFN_DOWN(addr + size);
473 404
474 mark_bootmem(start, end, 0, 0); 405 mark_bootmem(start, end, 0, 0);
475#endif
476} 406}
477 407
478/** 408/**
@@ -489,17 +419,12 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
489int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 419int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
490 unsigned long size, int flags) 420 unsigned long size, int flags)
491{ 421{
492#ifdef CONFIG_NO_BOOTMEM
493 panic("no bootmem");
494 return 0;
495#else
496 unsigned long start, end; 422 unsigned long start, end;
497 423
498 start = PFN_DOWN(physaddr); 424 start = PFN_DOWN(physaddr);
499 end = PFN_UP(physaddr + size); 425 end = PFN_UP(physaddr + size);
500 426
501 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); 427 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
502#endif
503} 428}
504 429
505/** 430/**
@@ -515,20 +440,14 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
515int __init reserve_bootmem(unsigned long addr, unsigned long size, 440int __init reserve_bootmem(unsigned long addr, unsigned long size,
516 int flags) 441 int flags)
517{ 442{
518#ifdef CONFIG_NO_BOOTMEM
519 panic("no bootmem");
520 return 0;
521#else
522 unsigned long start, end; 443 unsigned long start, end;
523 444
524 start = PFN_DOWN(addr); 445 start = PFN_DOWN(addr);
525 end = PFN_UP(addr + size); 446 end = PFN_UP(addr + size);
526 447
527 return mark_bootmem(start, end, 1, flags); 448 return mark_bootmem(start, end, 1, flags);
528#endif
529} 449}
530 450
531#ifndef CONFIG_NO_BOOTMEM
532int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, 451int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
533 int flags) 452 int flags)
534{ 453{
@@ -685,33 +604,12 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
685#endif 604#endif
686 return NULL; 605 return NULL;
687} 606}
688#endif
689 607
690static void * __init ___alloc_bootmem_nopanic(unsigned long size, 608static void * __init ___alloc_bootmem_nopanic(unsigned long size,
691 unsigned long align, 609 unsigned long align,
692 unsigned long goal, 610 unsigned long goal,
693 unsigned long limit) 611 unsigned long limit)
694{ 612{
695#ifdef CONFIG_NO_BOOTMEM
696 void *ptr;
697
698 if (WARN_ON_ONCE(slab_is_available()))
699 return kzalloc(size, GFP_NOWAIT);
700
701restart:
702
703 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
704
705 if (ptr)
706 return ptr;
707
708 if (goal != 0) {
709 goal = 0;
710 goto restart;
711 }
712
713 return NULL;
714#else
715 bootmem_data_t *bdata; 613 bootmem_data_t *bdata;
716 void *region; 614 void *region;
717 615
@@ -737,7 +635,6 @@ restart:
737 } 635 }
738 636
739 return NULL; 637 return NULL;
740#endif
741} 638}
742 639
743/** 640/**
@@ -758,10 +655,6 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
758{ 655{
759 unsigned long limit = 0; 656 unsigned long limit = 0;
760 657
761#ifdef CONFIG_NO_BOOTMEM
762 limit = -1UL;
763#endif
764
765 return ___alloc_bootmem_nopanic(size, align, goal, limit); 658 return ___alloc_bootmem_nopanic(size, align, goal, limit);
766} 659}
767 660
@@ -798,14 +691,9 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
798{ 691{
799 unsigned long limit = 0; 692 unsigned long limit = 0;
800 693
801#ifdef CONFIG_NO_BOOTMEM
802 limit = -1UL;
803#endif
804
805 return ___alloc_bootmem(size, align, goal, limit); 694 return ___alloc_bootmem(size, align, goal, limit);
806} 695}
807 696
808#ifndef CONFIG_NO_BOOTMEM
809static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, 697static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
810 unsigned long size, unsigned long align, 698 unsigned long size, unsigned long align,
811 unsigned long goal, unsigned long limit) 699 unsigned long goal, unsigned long limit)
@@ -822,7 +710,6 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
822 710
823 return ___alloc_bootmem(size, align, goal, limit); 711 return ___alloc_bootmem(size, align, goal, limit);
824} 712}
825#endif
826 713
827/** 714/**
828 * __alloc_bootmem_node - allocate boot memory from a specific node 715 * __alloc_bootmem_node - allocate boot memory from a specific node
@@ -842,24 +729,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
842void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 729void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
843 unsigned long align, unsigned long goal) 730 unsigned long align, unsigned long goal)
844{ 731{
845 void *ptr;
846
847 if (WARN_ON_ONCE(slab_is_available())) 732 if (WARN_ON_ONCE(slab_is_available()))
848 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 733 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
849 734
850#ifdef CONFIG_NO_BOOTMEM 735 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
851 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
852 goal, -1ULL);
853 if (ptr)
854 return ptr;
855
856 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
857 goal, -1ULL);
858#else
859 ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
860#endif
861
862 return ptr;
863} 736}
864 737
865void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, 738void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -880,13 +753,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
880 unsigned long new_goal; 753 unsigned long new_goal;
881 754
882 new_goal = MAX_DMA32_PFN << PAGE_SHIFT; 755 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
883#ifdef CONFIG_NO_BOOTMEM
884 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
885 new_goal, -1ULL);
886#else
887 ptr = alloc_bootmem_core(pgdat->bdata, size, align, 756 ptr = alloc_bootmem_core(pgdat->bdata, size, align,
888 new_goal, 0); 757 new_goal, 0);
889#endif
890 if (ptr) 758 if (ptr)
891 return ptr; 759 return ptr;
892 } 760 }
@@ -907,16 +775,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
907void * __init alloc_bootmem_section(unsigned long size, 775void * __init alloc_bootmem_section(unsigned long size,
908 unsigned long section_nr) 776 unsigned long section_nr)
909{ 777{
910#ifdef CONFIG_NO_BOOTMEM
911 unsigned long pfn, goal, limit;
912
913 pfn = section_nr_to_pfn(section_nr);
914 goal = pfn << PAGE_SHIFT;
915 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
916
917 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
918 SMP_CACHE_BYTES, goal, limit);
919#else
920 bootmem_data_t *bdata; 778 bootmem_data_t *bdata;
921 unsigned long pfn, goal, limit; 779 unsigned long pfn, goal, limit;
922 780
@@ -926,7 +784,6 @@ void * __init alloc_bootmem_section(unsigned long size,
926 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; 784 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
927 785
928 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); 786 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
929#endif
930} 787}
931#endif 788#endif
932 789
@@ -938,16 +795,11 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
938 if (WARN_ON_ONCE(slab_is_available())) 795 if (WARN_ON_ONCE(slab_is_available()))
939 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 796 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
940 797
941#ifdef CONFIG_NO_BOOTMEM
942 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
943 goal, -1ULL);
944#else
945 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); 798 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
946 if (ptr) 799 if (ptr)
947 return ptr; 800 return ptr;
948 801
949 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 802 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
950#endif
951 if (ptr) 803 if (ptr)
952 return ptr; 804 return ptr;
953 805
@@ -995,21 +847,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
995void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 847void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
996 unsigned long align, unsigned long goal) 848 unsigned long align, unsigned long goal)
997{ 849{
998 void *ptr;
999
1000 if (WARN_ON_ONCE(slab_is_available())) 850 if (WARN_ON_ONCE(slab_is_available()))
1001 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 851 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
1002 852
1003#ifdef CONFIG_NO_BOOTMEM 853 return ___alloc_bootmem_node(pgdat->bdata, size, align,
1004 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
1005 goal, ARCH_LOW_ADDRESS_LIMIT); 854 goal, ARCH_LOW_ADDRESS_LIMIT);
1006 if (ptr)
1007 return ptr;
1008 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
1009 goal, ARCH_LOW_ADDRESS_LIMIT);
1010#else
1011 ptr = ___alloc_bootmem_node(pgdat->bdata, size, align,
1012 goal, ARCH_LOW_ADDRESS_LIMIT);
1013#endif
1014 return ptr;
1015} 855}
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
new file mode 100644
index 00000000000..e2bdb07079c
--- /dev/null
+++ b/mm/nobootmem.c
@@ -0,0 +1,435 @@
1/*
2 * bootmem - A boot-time physical memory allocator and configurator
3 *
4 * Copyright (C) 1999 Ingo Molnar
5 * 1999 Kanoj Sarcar, SGI
6 * 2008 Johannes Weiner
7 *
8 * Access to this subsystem has to be serialized externally (which is true
9 * for the boot process anyway).
10 */
11#include <linux/init.h>
12#include <linux/pfn.h>
13#include <linux/slab.h>
14#include <linux/bootmem.h>
15#include <linux/module.h>
16#include <linux/kmemleak.h>
17#include <linux/range.h>
18#include <linux/memblock.h>
19
20#include <asm/bug.h>
21#include <asm/io.h>
22#include <asm/processor.h>
23
24#include "internal.h"
25
26#ifndef CONFIG_NEED_MULTIPLE_NODES
27struct pglist_data __refdata contig_page_data;
28EXPORT_SYMBOL(contig_page_data);
29#endif
30
31unsigned long max_low_pfn;
32unsigned long min_low_pfn;
33unsigned long max_pfn;
34
35#ifdef CONFIG_CRASH_DUMP
36/*
37 * If we have booted due to a crash, max_pfn will be a very low value. We need
38 * to know the amount of memory that the previous kernel used.
39 */
40unsigned long saved_max_pfn;
41#endif
42
43static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
44 u64 goal, u64 limit)
45{
46 void *ptr;
47 u64 addr;
48
49 if (limit > memblock.current_limit)
50 limit = memblock.current_limit;
51
52 addr = find_memory_core_early(nid, size, align, goal, limit);
53
54 if (addr == MEMBLOCK_ERROR)
55 return NULL;
56
57 ptr = phys_to_virt(addr);
58 memset(ptr, 0, size);
59 memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
60 /*
61 * The min_count is set to 0 so that bootmem allocated blocks
62 * are never reported as leaks.
63 */
64 kmemleak_alloc(ptr, size, 0, 0);
65 return ptr;
66}
67
68/*
69 * free_bootmem_late - free bootmem pages directly to page allocator
70 * @addr: starting address of the range
71 * @size: size of the range in bytes
72 *
73 * This is only useful when the bootmem allocator has already been torn
74 * down, but we are still initializing the system. Pages are given directly
75 * to the page allocator, no bootmem metadata is updated because it is gone.
76 */
77void __init free_bootmem_late(unsigned long addr, unsigned long size)
78{
79 unsigned long cursor, end;
80
81 kmemleak_free_part(__va(addr), size);
82
83 cursor = PFN_UP(addr);
84 end = PFN_DOWN(addr + size);
85
86 for (; cursor < end; cursor++) {
87 __free_pages_bootmem(pfn_to_page(cursor), 0);
88 totalram_pages++;
89 }
90}
91
92static void __init __free_pages_memory(unsigned long start, unsigned long end)
93{
94 int i;
95 unsigned long start_aligned, end_aligned;
96 int order = ilog2(BITS_PER_LONG);
97
98 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
99 end_aligned = end & ~(BITS_PER_LONG - 1);
100
101 if (end_aligned <= start_aligned) {
102 for (i = start; i < end; i++)
103 __free_pages_bootmem(pfn_to_page(i), 0);
104
105 return;
106 }
107
108 for (i = start; i < start_aligned; i++)
109 __free_pages_bootmem(pfn_to_page(i), 0);
110
111 for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
112 __free_pages_bootmem(pfn_to_page(i), order);
113
114 for (i = end_aligned; i < end; i++)
115 __free_pages_bootmem(pfn_to_page(i), 0);
116}
117
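__free_pages_memory() splits a PFN range into an unaligned head and tail, freed page-by-page, and a BITS_PER_LONG-aligned middle freed in order-ilog2(BITS_PER_LONG) chunks. A worked example of the split arithmetic (assuming BITS_PER_LONG == 64, so the middle uses order-6 chunks):

#include <stdio.h>

int main(void)
{
	unsigned long start = 5, end = 200, bpl = 64;
	unsigned long start_aligned = (start + bpl - 1) & ~(bpl - 1);	/* 64 */
	unsigned long end_aligned = end & ~(bpl - 1);			/* 192 */

	printf("order-0 head:   [%lu, %lu)\n", start, start_aligned);
	printf("order-6 middle: [%lu, %lu) in %lu-page chunks\n",
	       start_aligned, end_aligned, bpl);
	printf("order-0 tail:   [%lu, %lu)\n", end_aligned, end);
	return 0;
}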
118unsigned long __init free_all_memory_core_early(int nodeid)
119{
120 int i;
121 u64 start, end;
122 unsigned long count = 0;
123 struct range *range = NULL;
124 int nr_range;
125
126 nr_range = get_free_all_memory_range(&range, nodeid);
127
128 for (i = 0; i < nr_range; i++) {
129 start = range[i].start;
130 end = range[i].end;
131 count += end - start;
132 __free_pages_memory(start, end);
133 }
134
135 return count;
136}
137
138/**
139 * free_all_bootmem_node - release a node's free pages to the buddy allocator
140 * @pgdat: node to be released
141 *
142 * Returns the number of pages actually released.
143 */
144unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
145{
146 register_page_bootmem_info_node(pgdat);
147
148 /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
149 return 0;
150}
151
152/**
153 * free_all_bootmem - release free pages to the buddy allocator
154 *
155 * Returns the number of pages actually released.
156 */
157unsigned long __init free_all_bootmem(void)
158{
159 /*
160 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
 161 * because in some cases, e.g. when Node0 has no RAM installed,
 162 * low RAM will be on Node1.
 163 * Using MAX_NUMNODES makes sure all ranges in early_node_map[]
 164 * are used, instead of only the Node0-related ones.
165 */
166 return free_all_memory_core_early(MAX_NUMNODES);
167}
168
169/**
170 * free_bootmem_node - mark a page range as usable
171 * @pgdat: node the range resides on
172 * @physaddr: starting address of the range
173 * @size: size of the range in bytes
174 *
175 * Partial pages will be considered reserved and left as they are.
176 *
177 * The range must reside completely on the specified node.
178 */
179void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
180 unsigned long size)
181{
182 kmemleak_free_part(__va(physaddr), size);
183 memblock_x86_free_range(physaddr, physaddr + size);
184}
185
186/**
187 * free_bootmem - mark a page range as usable
188 * @addr: starting address of the range
189 * @size: size of the range in bytes
190 *
191 * Partial pages will be considered reserved and left as they are.
192 *
193 * The range must be contiguous but may span node boundaries.
194 */
195void __init free_bootmem(unsigned long addr, unsigned long size)
196{
197 kmemleak_free_part(__va(addr), size);
198 memblock_x86_free_range(addr, addr + size);
199}
200
201static void * __init ___alloc_bootmem_nopanic(unsigned long size,
202 unsigned long align,
203 unsigned long goal,
204 unsigned long limit)
205{
206 void *ptr;
207
208 if (WARN_ON_ONCE(slab_is_available()))
209 return kzalloc(size, GFP_NOWAIT);
210
211restart:
212
213 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
214
215 if (ptr)
216 return ptr;
217
218 if (goal != 0) {
219 goal = 0;
220 goto restart;
221 }
222
223 return NULL;
224}
225
226/**
227 * __alloc_bootmem_nopanic - allocate boot memory without panicking
228 * @size: size of the request in bytes
229 * @align: alignment of the region
230 * @goal: preferred starting address of the region
231 *
232 * The goal is dropped if it can not be satisfied and the allocation will
233 * fall back to memory below @goal.
234 *
235 * Allocation may happen on any node in the system.
236 *
237 * Returns NULL on failure.
238 */
239void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
240 unsigned long goal)
241{
242 unsigned long limit = -1UL;
243
244 return ___alloc_bootmem_nopanic(size, align, goal, limit);
245}
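
A hypothetical caller, sketched here for illustration (the name example_early_alloc and the 16MB goal are invented): unlike the panicking variants further down, a NULL return must be handled.

	static void __init example_early_alloc(void)
	{
		/* prefer memory from 16MB upward, but accept any range */
		void *tbl = __alloc_bootmem_nopanic(PAGE_SIZE, PAGE_SIZE,
						    16UL << 20);

		if (!tbl) {
			printk(KERN_WARNING "example: early alloc failed\n");
			return;
		}
		/* tbl is already zeroed by the allocator */
	}
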
246
247static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
248 unsigned long goal, unsigned long limit)
249{
250 void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
251
252 if (mem)
253 return mem;
254 /*
255 * Whoops, we cannot satisfy the allocation request.
256 */
257 printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
258 panic("Out of memory");
259 return NULL;
260}
261
262/**
263 * __alloc_bootmem - allocate boot memory
264 * @size: size of the request in bytes
265 * @align: alignment of the region
266 * @goal: preferred starting address of the region
267 *
268 * The goal is dropped if it can not be satisfied and the allocation will
269 * fall back to memory below @goal.
270 *
271 * Allocation may happen on any node in the system.
272 *
273 * The function panics if the request can not be satisfied.
274 */
275void * __init __alloc_bootmem(unsigned long size, unsigned long align,
276 unsigned long goal)
277{
278 unsigned long limit = -1UL;
279
280 return ___alloc_bootmem(size, align, goal, limit);
281}
282
283/**
284 * __alloc_bootmem_node - allocate boot memory from a specific node
285 * @pgdat: node to allocate from
286 * @size: size of the request in bytes
287 * @align: alignment of the region
288 * @goal: preferred starting address of the region
289 *
290 * The goal is dropped if it can not be satisfied and the allocation will
291 * fall back to memory below @goal.
292 *
293 * Allocation may fall back to any node in the system if the specified node
294 * can not hold the requested memory.
295 *
296 * The function panics if the request can not be satisfied.
297 */
298void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
299 unsigned long align, unsigned long goal)
300{
301 void *ptr;
302
303 if (WARN_ON_ONCE(slab_is_available()))
304 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
305
306 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
307 goal, -1ULL);
308 if (ptr)
309 return ptr;
310
311 return __alloc_memory_core_early(MAX_NUMNODES, size, align,
312 goal, -1ULL);
313}
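
A sketch of a per-node caller (example_tables and example_alloc_per_node are invented names). Note that although the kernel-doc above promises a panic on failure, this no-bootmem implementation returns whatever the MAX_NUMNODES fallback produces, which can be NULL, hence the defensive check in the sketch.

	static void *example_tables[MAX_NUMNODES] __initdata;

	static void __init example_alloc_per_node(void)
	{
		int nid;

		for_each_online_node(nid) {
			/* node-local if possible, any node otherwise; zeroed */
			example_tables[nid] =
				__alloc_bootmem_node(NODE_DATA(nid), PAGE_SIZE,
						     SMP_CACHE_BYTES, 0);
			BUG_ON(!example_tables[nid]);
		}
	}
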
314
315void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
316 unsigned long align, unsigned long goal)
317{
318#ifdef MAX_DMA32_PFN
319 unsigned long end_pfn;
320
321 if (WARN_ON_ONCE(slab_is_available()))
322 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
323
324	/* update goal according to MAX_DMA32_PFN */
325 end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
326
327	if (end_pfn > MAX_DMA32_PFN + (128 << (20 - PAGE_SHIFT)) &&
328 (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
329 void *ptr;
330 unsigned long new_goal;
331
332 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
333 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
334 new_goal, -1ULL);
335 if (ptr)
336 return ptr;
337 }
338#endif
339
340 return __alloc_bootmem_node(pgdat, size, align, goal);
341
342}
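
Reading the condition with concrete values makes the intent clearer. The numbers below assume x86_64 (PAGE_SHIFT == 12) and the 128MB slack the expression intends; they are an illustration, not patch text:

	/*
	 * MAX_DMA32_PFN is the 4GB boundary in pages (0x100000), and
	 * 128 << (20 - 12) = 32768 pages = 128MB of slack. So: a node
	 * extending more than 128MB past 4GB, asked for memory with a
	 * goal below 4GB, is first retried with the goal bumped to the
	 * 4GB line, keeping the scarce DMA32 range free for devices
	 * that actually need it.
	 */
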
343
344#ifdef CONFIG_SPARSEMEM
345/**
346 * alloc_bootmem_section - allocate boot memory from a specific section
347 * @size: size of the request in bytes
348 * @section_nr: sparse map section to allocate from
349 *
350 * Returns NULL on failure.
351 */
352void * __init alloc_bootmem_section(unsigned long size,
353 unsigned long section_nr)
354{
355 unsigned long pfn, goal, limit;
356
357 pfn = section_nr_to_pfn(section_nr);
358 goal = pfn << PAGE_SHIFT;
359 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
360
361 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
362 SMP_CACHE_BYTES, goal, limit);
363}
364#endif
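
The goal/limit arithmetic pins the allocation inside a single sparse section. As a worked example, assume SECTION_SIZE_BITS == 27 and PAGE_SHIFT == 12 (128MB sections of 4KB pages) and section_nr == 3; these values are illustrative, not from the patch:

	/*
	 *   pfn   = section_nr_to_pfn(3) = 3 << (27 - 12) = 0x18000
	 *   goal  = 0x18000 << 12        = 0x18000000  (384MB)
	 *   limit = (4 << 15) << 12      = 0x20000000  (512MB)
	 *
	 * so memblock may only place the buffer within section 3 itself.
	 */
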
365
366void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
367 unsigned long align, unsigned long goal)
368{
369 void *ptr;
370
371 if (WARN_ON_ONCE(slab_is_available()))
372 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
373
374 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
375 goal, -1ULL);
376 if (ptr)
377 return ptr;
378
379 return __alloc_bootmem_nopanic(size, align, goal);
380}
381
382#ifndef ARCH_LOW_ADDRESS_LIMIT
383#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
384#endif
385
386/**
387 * __alloc_bootmem_low - allocate low boot memory
388 * @size: size of the request in bytes
389 * @align: alignment of the region
390 * @goal: preferred starting address of the region
391 *
392 * The goal is dropped if it can not be satisfied and the allocation will
393 * fall back to memory below @goal.
394 *
395 * Allocation may happen on any node in the system.
396 *
397 * The function panics if the request can not be satisfied.
398 */
399void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
400 unsigned long goal)
401{
402 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
403}
404
405/**
406 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
407 * @pgdat: node to allocate from
408 * @size: size of the request in bytes
409 * @align: alignment of the region
410 * @goal: preferred starting address of the region
411 *
412 * The goal is dropped if it can not be satisfied and the allocation will
413 * fall back to memory below @goal.
414 *
415 * Allocation may fall back to any node in the system if the specified node
416 * can not hold the requested memory.
417 *
418 * The function panics if the request can not be satisfied.
419 */
420void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
421 unsigned long align, unsigned long goal)
422{
423 void *ptr;
424
425 if (WARN_ON_ONCE(slab_is_available()))
426 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
427
428 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
429 goal, ARCH_LOW_ADDRESS_LIMIT);
430 if (ptr)
431 return ptr;
432
433 return __alloc_memory_core_early(MAX_NUMNODES, size, align,
434 goal, ARCH_LOW_ADDRESS_LIMIT);
435}
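
A final hypothetical caller for the low variants (the names example_low_buf and example_alloc_low and the 64KB size are invented for this sketch). ARCH_LOW_ADDRESS_LIMIT caps the search at 4GB unless an architecture overrides it, which is what legacy-DMA users of these helpers rely on:

	static void *example_low_buf __initdata;

	static void __init example_alloc_low(void)
	{
		/* zeroed memory below ARCH_LOW_ADDRESS_LIMIT; panics on failure */
		example_low_buf = __alloc_bootmem_low(64 * 1024, PAGE_SIZE, 0);
	}
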
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cdef1d4b4e4..bd7625676a6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3699,13 +3699,45 @@ void __init free_bootmem_with_active_regions(int nid,
3699} 3699}
3700 3700
3701#ifdef CONFIG_HAVE_MEMBLOCK 3701#ifdef CONFIG_HAVE_MEMBLOCK
3702/*
3703 * Basic iterator support. Return the last range of PFNs for a node
3704 * Note: nid == MAX_NUMNODES returns last region regardless of node
3705 */
3706static int __meminit last_active_region_index_in_nid(int nid)
3707{
3708 int i;
3709
3710 for (i = nr_nodemap_entries - 1; i >= 0; i--)
3711 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
3712 return i;
3713
3714 return -1;
3715}
3716
3717/*
3718 * Basic iterator support. Return the previous active range of PFNs for a node
3719 * Note: nid == MAX_NUMNODES returns previous region regardless of node
3720 */
3721static int __meminit previous_active_region_index_in_nid(int index, int nid)
3722{
3723 for (index = index - 1; index >= 0; index--)
3724 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
3725 return index;
3726
3727 return -1;
3728}
3729
3730#define for_each_active_range_index_in_nid_reverse(i, nid) \
3731 for (i = last_active_region_index_in_nid(nid); i != -1; \
3732 i = previous_active_region_index_in_nid(i, nid))
3733
3702u64 __init find_memory_core_early(int nid, u64 size, u64 align, 3734u64 __init find_memory_core_early(int nid, u64 size, u64 align,
3703 u64 goal, u64 limit) 3735 u64 goal, u64 limit)
3704{ 3736{
3705 int i; 3737 int i;
3706 3738
3707 /* Need to go over early_node_map to find out good range for node */ 3739 /* Need to go over early_node_map to find out good range for node */
3708 for_each_active_range_index_in_nid(i, nid) { 3740 for_each_active_range_index_in_nid_reverse(i, nid) {
3709 u64 addr; 3741 u64 addr;
3710 u64 ei_start, ei_last; 3742 u64 ei_start, ei_last;
3711 u64 final_start, final_end; 3743 u64 final_start, final_end;
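
The point of the reversed walk is easiest to see with a toy memory map; the following is an illustration with invented ranges, not kernel output:

	/*
	 * With early_node_map[] = { 0-512MB (node 0), 512MB-1GB (node 0) },
	 * the old forward walk satisfied a request from the 0-512MB range
	 * first. Walking the map in reverse makes find_memory_core_early()
	 * try 512MB-1GB first, so early permanent allocations land high and
	 * the scarce low range stays free for callers that require it.
	 */
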
@@ -3748,34 +3780,6 @@ int __init add_from_early_node_map(struct range *range, int az,
3748 return nr_range; 3780 return nr_range;
3749} 3781}
3750 3782
3751#ifdef CONFIG_NO_BOOTMEM
3752void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
3753 u64 goal, u64 limit)
3754{
3755 void *ptr;
3756 u64 addr;
3757
3758 if (limit > memblock.current_limit)
3759 limit = memblock.current_limit;
3760
3761 addr = find_memory_core_early(nid, size, align, goal, limit);
3762
3763 if (addr == MEMBLOCK_ERROR)
3764 return NULL;
3765
3766 ptr = phys_to_virt(addr);
3767 memset(ptr, 0, size);
3768 memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
3769 /*
3770 * The min_count is set to 0 so that bootmem allocated blocks
3771 * are never reported as leaks.
3772 */
3773 kmemleak_alloc(ptr, size, 0, 0);
3774 return ptr;
3775}
3776#endif
3777
3778
3779void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) 3783void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3780{ 3784{
3781 int i; 3785 int i;
@@ -4809,15 +4813,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4809 dma_reserve = new_dma_reserve; 4813 dma_reserve = new_dma_reserve;
4810} 4814}
4811 4815
4812#ifndef CONFIG_NEED_MULTIPLE_NODES
4813struct pglist_data __refdata contig_page_data = {
4814#ifndef CONFIG_NO_BOOTMEM
4815 .bdata = &bootmem_node_data[0]
4816#endif
4817 };
4818EXPORT_SYMBOL(contig_page_data);
4819#endif
4820
4821void __init free_area_init(unsigned long *zones_size) 4816void __init free_area_init(unsigned long *zones_size)
4822{ 4817{
4823 free_area_init_node(0, zones_size, 4818 free_area_init_node(0, zones_size,
diff --git a/mm/shmem.c b/mm/shmem.c
index 5ee67c99060..3437b65d6d6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2144,8 +2144,10 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2144{ 2144{
2145 struct inode *inode = dentry->d_inode; 2145 struct inode *inode = dentry->d_inode;
2146 2146
2147 if (*len < 3) 2147 if (*len < 3) {
2148 *len = 3;
2148 return 255; 2149 return 255;
2150 }
2149 2151
2150 if (inode_unhashed(inode)) { 2152 if (inode_unhashed(inode)) {
2151 /* Unfortunately insert_inode_hash is not idempotent, 2153 /* Unfortunately insert_inode_hash is not idempotent,
diff --git a/net/core/scm.c b/net/core/scm.c
index bbe45445080..4c1ef026d69 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -95,7 +95,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
95 int fd = fdp[i]; 95 int fd = fdp[i];
96 struct file *file; 96 struct file *file;
97 97
98 if (fd < 0 || !(file = fget(fd))) 98 if (fd < 0 || !(file = fget_raw(fd)))
99 return -EBADF; 99 return -EBADF;
100 *fpp++ = file; 100 *fpp++ = file;
101 fpl->count++; 101 fpl->count++;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 437a99e560e..ba5b8c20849 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -850,7 +850,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
850 * Get the parent directory, calculate the hash for last 850 * Get the parent directory, calculate the hash for last
851 * component. 851 * component.
852 */ 852 */
853 err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd); 853 err = kern_path_parent(sunaddr->sun_path, &nd);
854 if (err) 854 if (err)
855 goto out_mknod_parent; 855 goto out_mknod_parent;
856 856
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index f89f83bf828..b6f4b994eb3 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -104,7 +104,7 @@ struct sock *unix_get_socket(struct file *filp)
104 /* 104 /*
105 * Socket ? 105 * Socket ?
106 */ 106 */
107 if (S_ISSOCK(inode->i_mode)) { 107 if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
108 struct socket *sock = SOCKET_I(inode); 108 struct socket *sock = SOCKET_I(inode);
109 struct sock *s = sock->sk; 109 struct sock *s = sock->sk;
110 110
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 4c0383da1c9..58848e3e392 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2654,11 +2654,6 @@ sub process {
2654 WARN("Use of volatile is usually wrong: see Documentation/volatile-considered-harmful.txt\n" . $herecurr); 2654 WARN("Use of volatile is usually wrong: see Documentation/volatile-considered-harmful.txt\n" . $herecurr);
2655 } 2655 }
2656 2656
2657# SPIN_LOCK_UNLOCKED & RW_LOCK_UNLOCKED are deprecated
2658 if ($line =~ /\b(SPIN_LOCK_UNLOCKED|RW_LOCK_UNLOCKED)/) {
2659 ERROR("Use of $1 is deprecated: see Documentation/spinlocks.txt\n" . $herecurr);
2660 }
2661
2662# warn about #if 0 2657# warn about #if 0
2663 if ($line =~ /^.\s*\#\s*if\s+0\b/) { 2658 if ($line =~ /^.\s*\#\s*if\s+0\b/) {
2664 CHK("if this code is redundant consider removing it\n" . 2659 CHK("if this code is redundant consider removing it\n" .
diff --git a/scripts/kconfig/streamline_config.pl b/scripts/kconfig/streamline_config.pl
index fd81fc33d63..a4fe923c013 100644
--- a/scripts/kconfig/streamline_config.pl
+++ b/scripts/kconfig/streamline_config.pl
@@ -1,6 +1,6 @@
1#!/usr/bin/perl -w 1#!/usr/bin/perl -w
2# 2#
3# Copywrite 2005-2009 - Steven Rostedt 3# Copyright 2005-2009 - Steven Rostedt
4# Licensed under the terms of the GNU GPL License version 2 4# Licensed under the terms of the GNU GPL License version 2
5# 5#
6# It's simple enough to figure out how this works. 6# It's simple enough to figure out how this works.
diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c
index 038b3d1e298..f9f6f52db77 100644
--- a/scripts/recordmcount.c
+++ b/scripts/recordmcount.c
@@ -206,7 +206,8 @@ static uint32_t (*w2)(uint16_t);
206static int 206static int
207is_mcounted_section_name(char const *const txtname) 207is_mcounted_section_name(char const *const txtname)
208{ 208{
209 return 0 == strcmp(".text", txtname) || 209 return 0 == strcmp(".text", txtname) ||
210 0 == strcmp(".ref.text", txtname) ||
210 0 == strcmp(".sched.text", txtname) || 211 0 == strcmp(".sched.text", txtname) ||
211 0 == strcmp(".spinlock.text", txtname) || 212 0 == strcmp(".spinlock.text", txtname) ||
212 0 == strcmp(".irqentry.text", txtname) || 213 0 == strcmp(".irqentry.text", txtname) ||
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 1d7963f4ee7..4be0deea71c 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -130,6 +130,7 @@ if ($inputfile =~ m,kernel/trace/ftrace\.o$,) {
130# Acceptable sections to record. 130# Acceptable sections to record.
131my %text_sections = ( 131my %text_sections = (
132 ".text" => 1, 132 ".text" => 1,
133 ".ref.text" => 1,
133 ".sched.text" => 1, 134 ".sched.text" => 1,
134 ".spinlock.text" => 1, 135 ".spinlock.text" => 1,
135 ".irqentry.text" => 1, 136 ".irqentry.text" => 1,
diff --git a/scripts/rt-tester/rt-tester.py b/scripts/rt-tester/rt-tester.py
index 44423b4dcb8..8c81d76959e 100644
--- a/scripts/rt-tester/rt-tester.py
+++ b/scripts/rt-tester/rt-tester.py
@@ -33,8 +33,6 @@ cmd_opcodes = {
33 "lockintnowait" : "6", 33 "lockintnowait" : "6",
34 "lockcont" : "7", 34 "lockcont" : "7",
35 "unlock" : "8", 35 "unlock" : "8",
36 "lockbkl" : "9",
37 "unlockbkl" : "10",
38 "signal" : "11", 36 "signal" : "11",
39 "resetevent" : "98", 37 "resetevent" : "98",
40 "reset" : "99", 38 "reset" : "99",
diff --git a/scripts/rt-tester/t2-l1-2rt-sameprio.tst b/scripts/rt-tester/t2-l1-2rt-sameprio.tst
index 8821f27cc8b..3710c8b2090 100644
--- a/scripts/rt-tester/t2-l1-2rt-sameprio.tst
+++ b/scripts/rt-tester/t2-l1-2rt-sameprio.tst
@@ -19,8 +19,6 @@
19# lockintnowait lock nr (0-7) 19# lockintnowait lock nr (0-7)
20# lockcont lock nr (0-7) 20# lockcont lock nr (0-7)
21# unlock lock nr (0-7) 21# unlock lock nr (0-7)
22# lockbkl lock nr (0-7)
23# unlockbkl lock nr (0-7)
24# signal 0 22# signal 0
25# reset 0 23# reset 0
26# resetevent 0 24# resetevent 0
@@ -39,9 +37,6 @@
39# blocked lock nr (0-7) 37# blocked lock nr (0-7)
40# blockedwake lock nr (0-7) 38# blockedwake lock nr (0-7)
41# unlocked lock nr (0-7) 39# unlocked lock nr (0-7)
42# lockedbkl dont care
43# blockedbkl dont care
44# unlockedbkl dont care
45# opcodeeq command opcode or number 40# opcodeeq command opcode or number
46# opcodelt number 41# opcodelt number
47# opcodegt number 42# opcodegt number
diff --git a/scripts/rt-tester/t2-l1-pi.tst b/scripts/rt-tester/t2-l1-pi.tst
index cde1f189a02..b4cc95975ad 100644
--- a/scripts/rt-tester/t2-l1-pi.tst
+++ b/scripts/rt-tester/t2-l1-pi.tst
@@ -19,8 +19,6 @@
19# lockintnowait lock nr (0-7) 19# lockintnowait lock nr (0-7)
20# lockcont lock nr (0-7) 20# lockcont lock nr (0-7)
21# unlock lock nr (0-7) 21# unlock lock nr (0-7)
22# lockbkl lock nr (0-7)
23# unlockbkl lock nr (0-7)
24# signal 0 22# signal 0
25# reset 0 23# reset 0
26# resetevent 0 24# resetevent 0
@@ -39,9 +37,6 @@
39# blocked lock nr (0-7) 37# blocked lock nr (0-7)
40# blockedwake lock nr (0-7) 38# blockedwake lock nr (0-7)
41# unlocked lock nr (0-7) 39# unlocked lock nr (0-7)
42# lockedbkl dont care
43# blockedbkl dont care
44# unlockedbkl dont care
45# opcodeeq command opcode or number 40# opcodeeq command opcode or number
46# opcodelt number 41# opcodelt number
47# opcodegt number 42# opcodegt number
diff --git a/scripts/rt-tester/t2-l1-signal.tst b/scripts/rt-tester/t2-l1-signal.tst
index 3ab0bfc4995..1b57376cc1f 100644
--- a/scripts/rt-tester/t2-l1-signal.tst
+++ b/scripts/rt-tester/t2-l1-signal.tst
@@ -19,8 +19,6 @@
19# lockintnowait lock nr (0-7) 19# lockintnowait lock nr (0-7)
20# lockcont lock nr (0-7) 20# lockcont lock nr (0-7)
21# unlock lock nr (0-7) 21# unlock lock nr (0-7)
22# lockbkl lock nr (0-7)
23# unlockbkl lock nr (0-7)
24# signal 0 22# signal 0
25# reset 0 23# reset 0
26# resetevent 0 24# resetevent 0
@@ -39,9 +37,6 @@
39# blocked lock nr (0-7) 37# blocked lock nr (0-7)
40# blockedwake lock nr (0-7) 38# blockedwake lock nr (0-7)
41# unlocked lock nr (0-7) 39# unlocked lock nr (0-7)
42# lockedbkl dont care
43# blockedbkl dont care
44# unlockedbkl dont care
45# opcodeeq command opcode or number 40# opcodeeq command opcode or number
46# opcodelt number 41# opcodelt number
47# opcodegt number 42# opcodegt number
diff --git a/scripts/rt-tester/t2-l2-2rt-deadlock.tst b/scripts/rt-tester/t2-l2-2rt-deadlock.tst
index f4b5d5d6215..68b10629b6f 100644
--- a/scripts/rt-tester/t2-l2-2rt-deadlock.tst
+++ b/scripts/rt-tester/t2-l2-2rt-deadlock.tst
@@ -19,8 +19,6 @@
19# lockintnowait lock nr (0-7) 19# lockintnowait lock nr (0-7)
20# lockcont lock nr (0-7) 20# lockcont lock nr (0-7)
21# unlock lock nr (0-7) 21# unlock lock nr (0-7)
22# lockbkl lock nr (0-7)
23# unlockbkl lock nr (0-7)
24# signal 0 22# signal 0
25# reset 0 23# reset 0
26# resetevent 0 24# resetevent 0
@@ -39,9 +37,6 @@
39# blocked lock nr (0-7) 37# blocked lock nr (0-7)
40# blockedwake lock nr (0-7) 38# blockedwake lock nr (0-7)
41# unlocked lock nr (0-7) 39# unlocked lock nr (0-7)
42# lockedbkl dont care
43# blockedbkl dont care
44# unlockedbkl dont care
45# opcodeeq command opcode or number 40# opcodeeq command opcode or number
46# opcodelt number 41# opcodelt number
47# opcodegt number 42# opcodegt number
diff --git a/scripts/rt-tester/t3-l1-pi-1rt.tst b/scripts/rt-tester/t3-l1-pi-1rt.tst
index 63440ca2cce..8e6c8b11ae5 100644
--- a/scripts/rt-tester/t3-l1-pi-1rt.tst
+++ b/scripts/rt-tester/t3-l1-pi-1rt.tst
@@ -19,8 +19,6 @@
19# lockintnowait lock nr (0-7) 19# lockintnowait lock nr (0-7)
20# lockcont lock nr (0-7) 20# lockcont lock nr (0-7)
21# unlock lock nr (0-7) 21# unlock lock nr (0-7)
22# lockbkl lock nr (0-7)
23# unlockbkl lock nr (0-7)
24# signal thread to signal (0-7) 22# signal thread to signal (0-7)
25# reset 0 23# reset 0
26# resetevent 0 24# resetevent 0
@@ -39,9 +37,6 @@
39# blocked lock nr (0-7) 37# blocked lock nr (0-7)
40# blockedwake lock nr (0-7) 38# blockedwake lock nr (0-7)
41# unlocked lock nr (0-7) 39# unlocked lock nr (0-7)
42# lockedbkl dont care
43# blockedbkl dont care
44# unlockedbkl dont care
45# opcodeeq command opcode or number 40# opcodeeq command opcode or number
46# opcodelt number 41# opcodelt number
47# opcodegt number 42# opcodegt number
diff --git a/scripts/rt-tester/t3-l1-pi-2rt.tst b/scripts/rt-tester/t3-l1-pi-2rt.tst
index e5816fe67df..69c2212fc52 100644
--- a/scripts/rt-tester/t3-l1-pi-2rt.tst
+++ b/scripts/rt-tester/t3-l1-pi-2rt.tst
@@ -19,8 +19,6 @@
19# lockintnowait lock nr (0-7) 19# lockintnowait lock nr (0-7)
20# lockcont lock nr (0-7) 20# lockcont lock nr (0-7)
21# unlock lock nr (0-7) 21# unlock lock nr (0-7)
22# lockbkl lock nr (0-7)
23# unlockbkl lock nr (0-7)
24# signal thread to signal (0-7) 22# signal thread to signal (0-7)
25# reset 0 23# reset 0
26# resetevent 0 24# resetevent 0
@@ -39,9 +37,6 @@
39# blocked lock nr (0-7) 37# blocked lock nr (0-7)
40# blockedwake lock nr (0-7) 38# blockedwake lock nr (0-7)
41# unlocked lock nr (0-7) 39# unlocked lock nr (0-7)
42# lockedbkl dont care
43# blockedbkl dont care
44# unlockedbkl dont care
45# opcodeeq command opcode or number 40# opcodeeq command opcode or number
46# opcodelt number 41# opcodelt number
47# opcodegt number 42# opcodegt number
diff --git a/scripts/rt-tester/t3-l1-pi-3rt.tst b/scripts/rt-tester/t3-l1-pi-3rt.tst
index 718b82b5d3b..9b0f1eb26a8 100644
--- a/scripts/rt-tester/t3-l1-pi-3rt.tst
+++ b/scripts/rt-tester/t3-l1-pi-3rt.tst
@@ -19,8 +19,6 @@
19# lockintnowait lock nr (0-7) 19# lockintnowait lock nr (0-7)
20# lockcont lock nr (0-7) 20# lockcont lock nr (0-7)
21# unlock lock nr (0-7) 21# unlock lock nr (0-7)
22# lockbkl lock nr (0-7)
23# unlockbkl lock nr (0-7)
24# signal thread to signal (0-7) 22# signal thread to signal (0-7)
25# reset 0 23# reset 0
26# resetevent 0 24# resetevent 0
@@ -39,9 +37,6 @@
39# blocked lock nr (0-7) 37# blocked lock nr (0-7)
40# blockedwake lock nr (0-7) 38# blockedwake lock nr (0-7)
41# unlocked lock nr (0-7) 39# unlocked lock nr (0-7)
42# lockedbkl dont care
43# blockedbkl dont care
44# unlockedbkl dont care
45# opcodeeq command opcode or number 40# opcodeeq command opcode or number
46# opcodelt number 41# opcodelt number
47# opcodegt number 42# opcodegt number
diff --git a/scripts/rt-tester/t3-l1-pi-signal.tst b/scripts/rt-tester/t3-l1-pi-signal.tst
index c6e21356349..39ec74ab06e 100644
--- a/scripts/rt-tester/t3-l1-pi-signal.tst
+++ b/scripts/rt-tester/t3-l1-pi-signal.tst
@@ -19,8 +19,6 @@
19# lockintnowait lock nr (0-7) 19# lockintnowait lock nr (0-7)
20# lockcont lock nr (0-7) 20# lockcont lock nr (0-7)
21# unlock lock nr (0-7) 21# unlock lock nr (0-7)
22# lockbkl lock nr (0-7)
23# unlockbkl lock nr (0-7)
24# signal thread to signal (0-7) 22# signal thread to signal (0-7)
25# reset 0 23# reset 0
26# resetevent 0 24# resetevent 0
@@ -39,9 +37,6 @@
39# blocked lock nr (0-7) 37# blocked lock nr (0-7)
40# blockedwake lock nr (0-7) 38# blockedwake lock nr (0-7)
41# unlocked lock nr (0-7) 39# unlocked lock nr (0-7)
42# lockedbkl dont care
43# blockedbkl dont care
44# unlockedbkl dont care
45# opcodeeq command opcode or number 40# opcodeeq command opcode or number
46# opcodelt number 41# opcodelt number
47# opcodegt number 42# opcodegt number
diff --git a/scripts/rt-tester/t3-l1-pi-steal.tst b/scripts/rt-tester/t3-l1-pi-steal.tst
index f53749d59d7..e03db7e010f 100644
--- a/scripts/rt-tester/t3-l1-pi-steal.tst
+++ b/scripts/rt-tester/t3-l1-pi-steal.tst
@@ -19,8 +19,6 @@
19# lockintnowait lock nr (0-7) 19# lockintnowait lock nr (0-7)
20# lockcont lock nr (0-7) 20# lockcont lock nr (0-7)
21# unlock lock nr (0-7) 21# unlock lock nr (0-7)
22# lockbkl lock nr (0-7)
23# unlockbkl lock nr (0-7)
24# signal thread to signal (0-7) 22# signal thread to signal (0-7)
25# reset 0 23# reset 0
26# resetevent 0 24# resetevent 0
@@ -39,9 +37,6 @@
39# blocked lock nr (0-7) 37# blocked lock nr (0-7)
40# blockedwake lock nr (0-7) 38# blockedwake lock nr (0-7)
41# unlocked lock nr (0-7) 39# unlocked lock nr (0-7)
42# lockedbkl dont care
43# blockedbkl dont care
44# unlockedbkl dont care
45# opcodeeq command opcode or number 40# opcodeeq command opcode or number
46# opcodelt number 41# opcodelt number
47# opcodegt number 42# opcodegt number
diff --git a/scripts/rt-tester/t3-l2-pi.tst b/scripts/rt-tester/t3-l2-pi.tst
index cdc3e4fd7ba..7b59100d3e4 100644
--- a/scripts/rt-tester/t3-l2-pi.tst
+++ b/scripts/rt-tester/t3-l2-pi.tst
@@ -19,8 +19,6 @@
19# lockintnowait lock nr (0-7) 19# lockintnowait lock nr (0-7)
20# lockcont lock nr (0-7) 20# lockcont lock nr (0-7)
21# unlock lock nr (0-7) 21# unlock lock nr (0-7)
22# lockbkl lock nr (0-7)
23# unlockbkl lock nr (0-7)
24# signal thread to signal (0-7) 22# signal thread to signal (0-7)
25# reset 0 23# reset 0
26# resetevent 0 24# resetevent 0
@@ -39,9 +37,6 @@
39# blocked lock nr (0-7) 37# blocked lock nr (0-7)
40# blockedwake lock nr (0-7) 38# blockedwake lock nr (0-7)
41# unlocked lock nr (0-7) 39# unlocked lock nr (0-7)
42# lockedbkl dont care
43# blockedbkl dont care
44# unlockedbkl dont care
45# opcodeeq command opcode or number 40# opcodeeq command opcode or number
46# opcodelt number 41# opcodelt number
47# opcodegt number 42# opcodegt number
diff --git a/scripts/rt-tester/t4-l2-pi-deboost.tst b/scripts/rt-tester/t4-l2-pi-deboost.tst
index baa14137f47..2f0e049d644 100644
--- a/scripts/rt-tester/t4-l2-pi-deboost.tst
+++ b/scripts/rt-tester/t4-l2-pi-deboost.tst
@@ -19,8 +19,6 @@
19# lockintnowait lock nr (0-7) 19# lockintnowait lock nr (0-7)
20# lockcont lock nr (0-7) 20# lockcont lock nr (0-7)
21# unlock lock nr (0-7) 21# unlock lock nr (0-7)
22# lockbkl lock nr (0-7)
23# unlockbkl lock nr (0-7)
24# signal thread to signal (0-7) 22# signal thread to signal (0-7)
25# reset 0 23# reset 0
26# resetevent 0 24# resetevent 0
@@ -39,9 +37,6 @@
39# blocked lock nr (0-7) 37# blocked lock nr (0-7)
40# blockedwake lock nr (0-7) 38# blockedwake lock nr (0-7)
41# unlocked lock nr (0-7) 39# unlocked lock nr (0-7)
42# lockedbkl dont care
43# blockedbkl dont care
44# unlockedbkl dont care
45# opcodeeq command opcode or number 40# opcodeeq command opcode or number
46# opcodelt number 41# opcodelt number
47# opcodegt number 42# opcodegt number
diff --git a/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst b/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst
index e6ec0c81b54..04f4034ff89 100644
--- a/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst
+++ b/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst
@@ -19,8 +19,6 @@
19# lockintnowait lock nr (0-7) 19# lockintnowait lock nr (0-7)
20# lockcont lock nr (0-7) 20# lockcont lock nr (0-7)
21# unlock lock nr (0-7) 21# unlock lock nr (0-7)
22# lockbkl lock nr (0-7)
23# unlockbkl lock nr (0-7)
24# signal thread to signal (0-7) 22# signal thread to signal (0-7)
25# reset 0 23# reset 0
26# resetevent 0 24# resetevent 0
@@ -39,9 +37,6 @@
39# blocked lock nr (0-7) 37# blocked lock nr (0-7)
40# blockedwake lock nr (0-7) 38# blockedwake lock nr (0-7)
41# unlocked lock nr (0-7) 39# unlocked lock nr (0-7)
42# lockedbkl dont care
43# blockedbkl dont care
44# unlockedbkl dont care
45# opcodeeq command opcode or number 40# opcodeeq command opcode or number
46# opcodelt number 41# opcodelt number
47# opcodegt number 42# opcodegt number
diff --git a/scripts/rt-tester/t5-l4-pi-boost-deboost.tst b/scripts/rt-tester/t5-l4-pi-boost-deboost.tst
index ca64f8bbf4b..a48a6ee29dd 100644
--- a/scripts/rt-tester/t5-l4-pi-boost-deboost.tst
+++ b/scripts/rt-tester/t5-l4-pi-boost-deboost.tst
@@ -19,8 +19,6 @@
19# lockintnowait lock nr (0-7) 19# lockintnowait lock nr (0-7)
20# lockcont lock nr (0-7) 20# lockcont lock nr (0-7)
21# unlock lock nr (0-7) 21# unlock lock nr (0-7)
22# lockbkl lock nr (0-7)
23# unlockbkl lock nr (0-7)
24# signal thread to signal (0-7) 22# signal thread to signal (0-7)
25# reset 0 23# reset 0
26# resetevent 0 24# resetevent 0
@@ -39,9 +37,6 @@
39# blocked lock nr (0-7) 37# blocked lock nr (0-7)
40# blockedwake lock nr (0-7) 38# blockedwake lock nr (0-7)
41# unlocked lock nr (0-7) 39# unlocked lock nr (0-7)
42# lockedbkl dont care
43# blockedbkl dont care
44# unlockedbkl dont care
45# opcodeeq command opcode or number 40# opcodeeq command opcode or number
46# opcodelt number 41# opcodelt number
47# opcodegt number 42# opcodegt number
diff --git a/security/commoncap.c b/security/commoncap.c
index 64c2ed9c901..dbfdaed4cc6 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -93,7 +93,7 @@ int cap_capable(struct task_struct *tsk, const struct cred *cred, int cap,
93 * Determine whether the current process may set the system clock and timezone 93 * Determine whether the current process may set the system clock and timezone
94 * information, returning 0 if permission granted, -ve if denied. 94 * information, returning 0 if permission granted, -ve if denied.
95 */ 95 */
96int cap_settime(struct timespec *ts, struct timezone *tz) 96int cap_settime(const struct timespec *ts, const struct timezone *tz)
97{ 97{
98 if (!capable(CAP_SYS_TIME)) 98 if (!capable(CAP_SYS_TIME))
99 return -EPERM; 99 return -EPERM;
diff --git a/security/security.c b/security/security.c
index 7b7308ace8c..bb33ecadcf9 100644
--- a/security/security.c
+++ b/security/security.c
@@ -201,7 +201,7 @@ int security_syslog(int type)
201 return security_ops->syslog(type); 201 return security_ops->syslog(type);
202} 202}
203 203
204int security_settime(struct timespec *ts, struct timezone *tz) 204int security_settime(const struct timespec *ts, const struct timezone *tz)
205{ 205{
206 return security_ops->settime(ts, tz); 206 return security_ops->settime(ts, tz);
207} 207}
diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index cb43289e447..416684be0ad 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -1,4 +1,3 @@
1PERF-BUILD-OPTIONS
2PERF-CFLAGS 1PERF-CFLAGS
3PERF-GUI-VARS 2PERF-GUI-VARS
4PERF-VERSION-FILE 3PERF-VERSION-FILE
diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile
index bd498d49695..4626a398836 100644
--- a/tools/perf/Documentation/Makefile
+++ b/tools/perf/Documentation/Makefile
@@ -178,8 +178,8 @@ install-pdf: pdf
178 $(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir) 178 $(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir)
179 $(INSTALL) -m 644 user-manual.pdf $(DESTDIR)$(pdfdir) 179 $(INSTALL) -m 644 user-manual.pdf $(DESTDIR)$(pdfdir)
180 180
181install-html: html 181#install-html: html
182 '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir) 182# '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
183 183
184../PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE 184../PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
185 $(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) PERF-VERSION-FILE 185 $(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) PERF-VERSION-FILE
@@ -288,15 +288,16 @@ $(patsubst %.txt,%.html,$(wildcard howto/*.txt)): %.html : %.txt
288 sed -e '1,/^$$/d' $< | $(ASCIIDOC) -b xhtml11 - >$@+ && \ 288 sed -e '1,/^$$/d' $< | $(ASCIIDOC) -b xhtml11 - >$@+ && \
289 mv $@+ $@ 289 mv $@+ $@
290 290
291install-webdoc : html 291# UNIMPLEMENTED
292 '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST) 292#install-webdoc : html
293# '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST)
293 294
294quick-install: quick-install-man 295# quick-install: quick-install-man
295 296
296quick-install-man: 297# quick-install-man:
297 '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir) 298# '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir)
298 299
299quick-install-html: 300#quick-install-html:
300 '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir) 301# '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir)
301 302
302.PHONY: .FORCE-PERF-VERSION-FILE 303.PHONY: .FORCE-PERF-VERSION-FILE
diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index 399751befee..7a527f7e9da 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -8,7 +8,7 @@ perf-list - List all symbolic event types
8SYNOPSIS 8SYNOPSIS
9-------- 9--------
10[verse] 10[verse]
11'perf list' 11'perf list' [hw|sw|cache|tracepoint|event_glob]
12 12
13DESCRIPTION 13DESCRIPTION
14----------- 14-----------
@@ -63,7 +63,26 @@ details. Some of them are referenced in the SEE ALSO section below.
63 63
64OPTIONS 64OPTIONS
65------- 65-------
66None 66
67Without options, all known events will be listed.
68
69To limit the list use:
70
71. 'hw' or 'hardware' to list hardware events such as cache-misses, etc.
72
73. 'sw' or 'software' to list software events such as context switches, etc.
74
75. 'cache' or 'hwcache' to list hardware cache events such as L1-dcache-loads, etc.
76
77. 'tracepoint' to list all tracepoint events, alternatively use
78 'subsys_glob:event_glob' to filter by tracepoint subsystems such as sched,
79 block, etc.
80
81. If none of the above is matched, it will apply the supplied glob to all
82 events, printing the ones that match.
83
84One or more types can be used at the same time, listing the events for the
85types specified.
67 86
68SEE ALSO 87SEE ALSO
69-------- 88--------
diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt
index 921de259ea1..4a26a2f3a6a 100644
--- a/tools/perf/Documentation/perf-lock.txt
+++ b/tools/perf/Documentation/perf-lock.txt
@@ -24,8 +24,8 @@ and statistics with this 'perf lock' command.
24 24
25 'perf lock report' reports statistical data. 25 'perf lock report' reports statistical data.
26 26
27OPTIONS 27COMMON OPTIONS
28------- 28--------------
29 29
30-i:: 30-i::
31--input=<file>:: 31--input=<file>::
@@ -39,6 +39,14 @@ OPTIONS
39--dump-raw-trace:: 39--dump-raw-trace::
40 Dump raw trace in ASCII. 40 Dump raw trace in ASCII.
41 41
42REPORT OPTIONS
43--------------
44
45-k::
46--key=<value>::
47 Sorting key. Possible values: acquired (default), contended,
48 wait_total, wait_max, wait_min.
49
42SEE ALSO 50SEE ALSO
43-------- 51--------
44linkperf:perf[1] 52linkperf:perf[1]
diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 86b797a35aa..02bafce4b34 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -16,7 +16,7 @@ or
16or 16or
17'perf probe' --list 17'perf probe' --list
18or 18or
19'perf probe' [options] --line='FUNC[:RLN[+NUM|:RLN2]]|SRC:ALN[+NUM|:ALN2]' 19'perf probe' [options] --line='LINE'
20or 20or
21'perf probe' [options] --vars='PROBEPOINT' 21'perf probe' [options] --vars='PROBEPOINT'
22 22
@@ -73,6 +73,17 @@ OPTIONS
73 (Only for --vars) Show external defined variables in addition to local 73 (Only for --vars) Show external defined variables in addition to local
74 variables. 74 variables.
75 75
76-F::
77--funcs::
78 Show available functions in given module or kernel.
79
80--filter=FILTER::
81 (Only for --vars and --funcs) Set filter. FILTER is a combination of glob
82 pattern, see FILTER PATTERN for detail.
83 Default FILTER is "!__k???tab_* & !__crc_*" for --vars, and "!_*"
84 for --funcs.
85 If several filters are specified, only the last filter is used.
86
76-f:: 87-f::
77--force:: 88--force::
78 Forcibly add events with existing name. 89 Forcibly add events with existing name.
@@ -117,13 +128,14 @@ LINE SYNTAX
117----------- 128-----------
118Line range is described by the following syntax. 129Line range is described by the following syntax.
119 130
120 "FUNC[:RLN[+NUM|-RLN2]]|SRC[:ALN[+NUM|-ALN2]]" 131 "FUNC[@SRC][:RLN[+NUM|-RLN2]]|SRC[:ALN[+NUM|-ALN2]]"
121 132
122FUNC specifies the function whose lines are shown. 'RLN' is the start line 133FUNC specifies the function whose lines are shown. 'RLN' is the start line
123number from the function entry line, and 'RLN2' is the end line number. As in 134number from the function entry line, and 'RLN2' is the end line number. As in
124the probe syntax, 'SRC' means the source file path, 'ALN' is the start line 135the probe syntax, 'SRC' means the source file path, 'ALN' is the start line
125number, and 'ALN2' is the end line number in the file. It is also possible to 136number, and 'ALN2' is the end line number in the file. It is also possible to
126specify how many lines to show by using 'NUM'. 137specify how many lines to show by using 'NUM'. Moreover, the 'FUNC@SRC' form is
138useful for picking out one specific function when several share the same name.
127So, "source.c:100-120" shows lines 100 to 120 in the source.c file, and "func:10+20" shows 20 lines starting from the 10th line of the func function. 139So, "source.c:100-120" shows lines 100 to 120 in the source.c file, and "func:10+20" shows 20 lines starting from the 10th line of the func function.
128 140
129LAZY MATCHING 141LAZY MATCHING
@@ -135,6 +147,14 @@ e.g.
135 147
136This provides some sort of flexibility and robustness to probe point definitions against minor code changes. For example, actual 10th line of schedule() can be moved easily by modifying schedule(), but the same line matching 'rq=cpu_rq*' may still exist in the function.) 148This provides some sort of flexibility and robustness to probe point definitions against minor code changes. For example, actual 10th line of schedule() can be moved easily by modifying schedule(), but the same line matching 'rq=cpu_rq*' may still exist in the function.)
137 149
150FILTER PATTERN
151--------------
152	The filter pattern is one or more glob patterns used to filter variables.
153	In addition, "!" specifies a filter-out rule. Several rules can be combined with "&" or "|", and grouped into a single rule with "(" ")".
154
155e.g.
156 With --filter "foo* | bar*", perf probe -V shows variables which start with "foo" or "bar".
157 With --filter "!foo* & *bar", perf probe -V shows variables which don't start with "foo" and end with "bar", like "fizzbar". But "foobar" is filtered out.
138 158
139EXAMPLES 159EXAMPLES
140-------- 160--------
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index e032716c839..5a520f82529 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -137,6 +137,17 @@ Do not update the builid cache. This saves some overhead in situations
137where the information in the perf.data file (which includes buildids) 137where the information in the perf.data file (which includes buildids)
138is sufficient. 138is sufficient.
139 139
140-G name,...::
141--cgroup name,...::
142monitor only in the container (cgroup) called "name". This option is available only
143in per-cpu mode. The cgroup filesystem must be mounted. All threads belonging to
144container "name" are monitored when they run on the monitored CPUs. Multiple cgroups
145can be provided. Each cgroup is applied to the corresponding event, i.e., first cgroup
146to first event, second cgroup to second event and so on. It is possible to provide
147an empty cgroup (monitor all the time) using, e.g., -G foo,,bar. Cgroups must have
148corresponding events, i.e., they always refer to events defined earlier on the command
149line.
150
140SEE ALSO 151SEE ALSO
141-------- 152--------
142linkperf:perf-stat[1], linkperf:perf-list[1] 153linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index b6da7affbbe..918cc38ee6d 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -83,6 +83,17 @@ This option is only valid in system-wide mode.
83print counts using a CSV-style output to make it easy to import directly into 83print counts using a CSV-style output to make it easy to import directly into
84spreadsheets. Columns are separated by the string specified in SEP. 84spreadsheets. Columns are separated by the string specified in SEP.
85 85
86-G name::
87--cgroup name::
88monitor only in the container (cgroup) called "name". This option is available only
89in per-cpu mode. The cgroup filesystem must be mounted. All threads belonging to
90container "name" are monitored when they run on the monitored CPUs. Multiple cgroups
91can be provided. Each cgroup is applied to the corresponding event, i.e., first cgroup
92to first event, second cgroup to second event and so on. It is possible to provide
93an empty cgroup (monitor all the time) using, e.g., -G foo,,bar. Cgroups must have
94corresponding events, i.e., they always refer to events defined earlier on the command
95line.
96
86EXAMPLES 97EXAMPLES
87-------- 98--------
88 99
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 7141c42e146..9b8421805c5 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -3,7 +3,7 @@ ifeq ("$(origin O)", "command line")
3endif 3endif
4 4
5# The default target of this Makefile is... 5# The default target of this Makefile is...
6all:: 6all:
7 7
8ifneq ($(OUTPUT),) 8ifneq ($(OUTPUT),)
9# check that the output directory actually exists 9# check that the output directory actually exists
@@ -11,152 +11,12 @@ OUTDIR := $(shell cd $(OUTPUT) && /bin/pwd)
11$(if $(OUTDIR),, $(error output directory "$(OUTPUT)" does not exist)) 11$(if $(OUTDIR),, $(error output directory "$(OUTPUT)" does not exist))
12endif 12endif
13 13
14# Define V=1 to have a more verbose compile. 14# Define V to have a more verbose compile.
15# Define V=2 to have an even more verbose compile.
16#
17# Define SNPRINTF_RETURNS_BOGUS if your are on a system which snprintf()
18# or vsnprintf() return -1 instead of number of characters which would
19# have been written to the final string if enough space had been available.
20#
21# Define FREAD_READS_DIRECTORIES if your are on a system which succeeds
22# when attempting to read from an fopen'ed directory.
23#
24# Define NO_OPENSSL environment variable if you do not have OpenSSL.
25# This also implies MOZILLA_SHA1.
26#
27# Define CURLDIR=/foo/bar if your curl header and library files are in
28# /foo/bar/include and /foo/bar/lib directories.
29#
30# Define EXPATDIR=/foo/bar if your expat header and library files are in
31# /foo/bar/include and /foo/bar/lib directories.
32#
33# Define NO_D_INO_IN_DIRENT if you don't have d_ino in your struct dirent.
34#
35# Define NO_D_TYPE_IN_DIRENT if your platform defines DT_UNKNOWN but lacks
36# d_type in struct dirent (latest Cygwin -- will be fixed soonish).
37#
38# Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et.al.)
39# do not support the 'size specifiers' introduced by C99, namely ll, hh,
40# j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t).
41# some C compilers supported these specifiers prior to C99 as an extension.
42#
43# Define NO_STRCASESTR if you don't have strcasestr.
44#
45# Define NO_MEMMEM if you don't have memmem.
46#
47# Define NO_STRTOUMAX if you don't have strtoumax in the C library.
48# If your compiler also does not support long long or does not have
49# strtoull, define NO_STRTOULL.
50#
51# Define NO_SETENV if you don't have setenv in the C library.
52#
53# Define NO_UNSETENV if you don't have unsetenv in the C library.
54#
55# Define NO_MKDTEMP if you don't have mkdtemp in the C library.
56#
57# Define NO_SYS_SELECT_H if you don't have sys/select.h.
58#
59# Define NO_SYMLINK_HEAD if you never want .perf/HEAD to be a symbolic link.
60# Enable it on Windows. By default, symrefs are still used.
61#
62# Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability
63# tests. These tests take up a significant amount of the total test time
64# but are not needed unless you plan to talk to SVN repos.
65#
66# Define NO_FINK if you are building on Darwin/Mac OS X, have Fink
67# installed in /sw, but don't want PERF to link against any libraries
68# installed there. If defined you may specify your own (or Fink's)
69# include directories and library directories by defining CFLAGS
70# and LDFLAGS appropriately.
71#
72# Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X,
73# have DarwinPorts installed in /opt/local, but don't want PERF to
74# link against any libraries installed there. If defined you may
75# specify your own (or DarwinPort's) include directories and
76# library directories by defining CFLAGS and LDFLAGS appropriately.
77#
78# Define PPC_SHA1 environment variable when running make to make use of
79# a bundled SHA1 routine optimized for PowerPC.
80#
81# Define ARM_SHA1 environment variable when running make to make use of
82# a bundled SHA1 routine optimized for ARM.
83#
84# Define MOZILLA_SHA1 environment variable when running make to make use of
85# a bundled SHA1 routine coming from Mozilla. It is GPL'd and should be fast
86# on non-x86 architectures (e.g. PowerPC), while the OpenSSL version (default
87# choice) has very fast version optimized for i586.
88#
89# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin).
90#
91# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin).
92#
93# Define NEEDS_SOCKET if linking with libc is not enough (SunOS,
94# Patrick Mauritz).
95#
96# Define NO_MMAP if you want to avoid mmap.
97#
98# Define NO_PTHREADS if you do not have or do not want to use Pthreads.
99#
100# Define NO_PREAD if you have a problem with pread() system call (e.g.
101# cygwin.dll before v1.5.22).
102#
103# Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is
104# generally faster on your platform than accessing the working directory.
105#
106# Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support
107# the executable mode bit, but doesn't really do so.
108#
109# Define NO_IPV6 if you lack IPv6 support and getaddrinfo().
110#
111# Define NO_SOCKADDR_STORAGE if your platform does not have struct
112# sockaddr_storage.
113#
114# Define NO_ICONV if your libc does not properly support iconv.
115#
116# Define OLD_ICONV if your library has an old iconv(), where the second
117# (input buffer pointer) parameter is declared with type (const char **).
118#
119# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
120#
121# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
122# that tells runtime paths to dynamic libraries;
123# "-Wl,-rpath=/path/lib" is used instead.
124#
125# Define USE_NSEC below if you want perf to care about sub-second file mtimes
126# and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and
127# it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely
128# randomly break unless your underlying filesystem supports those sub-second
129# times (my ext3 doesn't).
130#
131# Define USE_ST_TIMESPEC if your "struct stat" uses "st_ctimespec" instead of
132# "st_ctim"
133#
134# Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec"
135# available. This automatically turns USE_NSEC off.
136#
137# Define USE_STDEV below if you want perf to care about the underlying device
138# change being considered an inode change from the update-index perspective.
139#
140# Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks
141# field that counts the on-disk footprint in 512-byte blocks.
142# 15#
143# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8 16# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
144# 17#
145# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72. 18# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
146# 19#
147# Define NO_PERL_MAKEMAKER if you cannot use Makefiles generated by perl's
148# MakeMaker (e.g. using ActiveState under Cygwin).
149#
150# Define NO_PERL if you do not want Perl scripts or libraries at all.
151#
152# Define INTERNAL_QSORT to use Git's implementation of qsort(), which
153# is a simplified version of the merge sort used in glibc. This is
154# recommended if Git triggers O(n^2) behavior in your platform's qsort().
155#
156# Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call
157# your external grep (e.g., if your system lacks grep, if its grep is
158# broken, or spawning external process is slower than built-in grep perf has).
159#
160# Define LDFLAGS=-static to build a static binary. 20# Define LDFLAGS=-static to build a static binary.
161# 21#
162# Define EXTRA_CFLAGS=-m64 or EXTRA_CFLAGS=-m32 as appropriate for cross-builds. 22# Define EXTRA_CFLAGS=-m64 or EXTRA_CFLAGS=-m32 as appropriate for cross-builds.
@@ -167,12 +27,7 @@ $(OUTPUT)PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
167 @$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT) 27 @$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT)
168-include $(OUTPUT)PERF-VERSION-FILE 28-include $(OUTPUT)PERF-VERSION-FILE
169 29
170uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') 30uname_M := $(shell uname -m 2>/dev/null || echo not)
171uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
172uname_O := $(shell sh -c 'uname -o 2>/dev/null || echo not')
173uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
174uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
175uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
176 31
177ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \ 32ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \
178 -e s/arm.*/arm/ -e s/sa110/arm/ \ 33 -e s/arm.*/arm/ -e s/sa110/arm/ \
@@ -191,8 +46,6 @@ ifeq ($(ARCH),x86_64)
191 ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S 46 ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S
192endif 47endif
193 48
194# CFLAGS and LDFLAGS are for the users to override from the command line.
195
196# 49#
197# Include saner warnings here, which can catch bugs: 50# Include saner warnings here, which can catch bugs:
198# 51#
@@ -270,22 +123,13 @@ CC = $(CROSS_COMPILE)gcc
270AR = $(CROSS_COMPILE)ar 123AR = $(CROSS_COMPILE)ar
271RM = rm -f 124RM = rm -f
272MKDIR = mkdir 125MKDIR = mkdir
273TAR = tar
274FIND = find 126FIND = find
275INSTALL = install 127INSTALL = install
276RPMBUILD = rpmbuild
277PTHREAD_LIBS = -lpthread
278 128
279# sparse is architecture-neutral, which means that we need to tell it 129# sparse is architecture-neutral, which means that we need to tell it
280# explicitly what architecture to check for. Fix this up for yours.. 130# explicitly what architecture to check for. Fix this up for yours..
281SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__ 131SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
282 132
283ifeq ($(V), 2)
284 QUIET_STDERR = ">/dev/null"
285else
286 QUIET_STDERR = ">/dev/null 2>&1"
287endif
288
289-include feature-tests.mak 133-include feature-tests.mak
290 134
291ifeq ($(call try-cc,$(SOURCE_HELLO),-Werror -fstack-protector-all),y) 135ifeq ($(call try-cc,$(SOURCE_HELLO),-Werror -fstack-protector-all),y)
@@ -310,49 +154,37 @@ BASIC_LDFLAGS =
310 154
311# Guard against environment variables 155# Guard against environment variables
312BUILTIN_OBJS = 156BUILTIN_OBJS =
313BUILT_INS =
314COMPAT_CFLAGS =
315COMPAT_OBJS =
316LIB_H = 157LIB_H =
317LIB_OBJS = 158LIB_OBJS =
318SCRIPT_PERL = 159PYRF_OBJS =
319SCRIPT_SH = 160SCRIPT_SH =
320TEST_PROGRAMS =
321 161
322SCRIPT_SH += perf-archive.sh 162SCRIPT_SH += perf-archive.sh
323 163
324grep-libs = $(filter -l%,$(1)) 164grep-libs = $(filter -l%,$(1))
325strip-libs = $(filter-out -l%,$(1)) 165strip-libs = $(filter-out -l%,$(1))
326 166
167$(OUTPUT)python/perf.so: $(PYRF_OBJS)
168 $(QUIET_GEN)python util/setup.py --quiet build_ext --build-lib='$(OUTPUT)python' \
169 --build-temp='$(OUTPUT)python/temp'
327# 170#
328# No Perl scripts right now: 171# No Perl scripts right now:
329# 172#
330 173
331# SCRIPT_PERL += perf-add--interactive.perl 174SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH))
332
333SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \
334 $(patsubst %.perl,%,$(SCRIPT_PERL))
335
336# Empty...
337EXTRA_PROGRAMS =
338
339# ... and all the rest that could be moved out of bindir to perfexecdir
340PROGRAMS += $(EXTRA_PROGRAMS)
341 175
342# 176#
343# Single 'perf' binary right now: 177# Single 'perf' binary right now:
344# 178#
345PROGRAMS += $(OUTPUT)perf 179PROGRAMS += $(OUTPUT)perf
346 180
347# List built-in command $C whose implementation cmd_$C() is not in 181LANG_BINDINGS =
348# builtin-$C.o but is linked in as part of some other command.
349#
350 182
351# what 'all' will build and 'install' will install, in perfexecdir 183# what 'all' will build and 'install' will install, in perfexecdir
352ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS) 184ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
353 185
354# what 'all' will build but not install in perfexecdir 186# what 'all' will build but not install in perfexecdir
355OTHER_PROGRAMS = $(OUTPUT)perf$X 187OTHER_PROGRAMS = $(OUTPUT)perf
356 188
357# Set paths to tools early so that they can be used for version tests. 189# Set paths to tools early so that they can be used for version tests.
358ifndef SHELL_PATH 190ifndef SHELL_PATH
@@ -395,6 +227,7 @@ LIB_H += util/include/dwarf-regs.h
395LIB_H += util/include/asm/dwarf2.h 227LIB_H += util/include/asm/dwarf2.h
396LIB_H += util/include/asm/cpufeature.h 228LIB_H += util/include/asm/cpufeature.h
397LIB_H += perf.h 229LIB_H += perf.h
230LIB_H += util/annotate.h
398LIB_H += util/cache.h 231LIB_H += util/cache.h
399LIB_H += util/callchain.h 232LIB_H += util/callchain.h
400LIB_H += util/build-id.h 233LIB_H += util/build-id.h
@@ -402,6 +235,7 @@ LIB_H += util/debug.h
402LIB_H += util/debugfs.h 235LIB_H += util/debugfs.h
403LIB_H += util/event.h 236LIB_H += util/event.h
404LIB_H += util/evsel.h 237LIB_H += util/evsel.h
238LIB_H += util/evlist.h
405LIB_H += util/exec_cmd.h 239LIB_H += util/exec_cmd.h
406LIB_H += util/types.h 240LIB_H += util/types.h
407LIB_H += util/levenshtein.h 241LIB_H += util/levenshtein.h
@@ -416,6 +250,7 @@ LIB_H += util/help.h
416LIB_H += util/session.h 250LIB_H += util/session.h
417LIB_H += util/strbuf.h 251LIB_H += util/strbuf.h
418LIB_H += util/strlist.h 252LIB_H += util/strlist.h
253LIB_H += util/strfilter.h
419LIB_H += util/svghelper.h 254LIB_H += util/svghelper.h
420LIB_H += util/run-command.h 255LIB_H += util/run-command.h
421LIB_H += util/sigchain.h 256LIB_H += util/sigchain.h
@@ -425,21 +260,26 @@ LIB_H += util/values.h
425LIB_H += util/sort.h 260LIB_H += util/sort.h
426LIB_H += util/hist.h 261LIB_H += util/hist.h
427LIB_H += util/thread.h 262LIB_H += util/thread.h
263LIB_H += util/thread_map.h
428LIB_H += util/trace-event.h 264LIB_H += util/trace-event.h
429LIB_H += util/probe-finder.h 265LIB_H += util/probe-finder.h
430LIB_H += util/probe-event.h 266LIB_H += util/probe-event.h
431LIB_H += util/pstack.h 267LIB_H += util/pstack.h
432LIB_H += util/cpumap.h 268LIB_H += util/cpumap.h
269LIB_H += util/top.h
433LIB_H += $(ARCH_INCLUDE) 270LIB_H += $(ARCH_INCLUDE)
271LIB_H += util/cgroup.h
434 272
435LIB_OBJS += $(OUTPUT)util/abspath.o 273LIB_OBJS += $(OUTPUT)util/abspath.o
436LIB_OBJS += $(OUTPUT)util/alias.o 274LIB_OBJS += $(OUTPUT)util/alias.o
275LIB_OBJS += $(OUTPUT)util/annotate.o
437LIB_OBJS += $(OUTPUT)util/build-id.o 276LIB_OBJS += $(OUTPUT)util/build-id.o
438LIB_OBJS += $(OUTPUT)util/config.o 277LIB_OBJS += $(OUTPUT)util/config.o
439LIB_OBJS += $(OUTPUT)util/ctype.o 278LIB_OBJS += $(OUTPUT)util/ctype.o
440LIB_OBJS += $(OUTPUT)util/debugfs.o 279LIB_OBJS += $(OUTPUT)util/debugfs.o
441LIB_OBJS += $(OUTPUT)util/environment.o 280LIB_OBJS += $(OUTPUT)util/environment.o
442LIB_OBJS += $(OUTPUT)util/event.o 281LIB_OBJS += $(OUTPUT)util/event.o
282LIB_OBJS += $(OUTPUT)util/evlist.o
443LIB_OBJS += $(OUTPUT)util/evsel.o 283LIB_OBJS += $(OUTPUT)util/evsel.o
444LIB_OBJS += $(OUTPUT)util/exec_cmd.o 284LIB_OBJS += $(OUTPUT)util/exec_cmd.o
445LIB_OBJS += $(OUTPUT)util/help.o 285LIB_OBJS += $(OUTPUT)util/help.o
@@ -455,6 +295,8 @@ LIB_OBJS += $(OUTPUT)util/quote.o
455LIB_OBJS += $(OUTPUT)util/strbuf.o 295LIB_OBJS += $(OUTPUT)util/strbuf.o
456LIB_OBJS += $(OUTPUT)util/string.o 296LIB_OBJS += $(OUTPUT)util/string.o
457LIB_OBJS += $(OUTPUT)util/strlist.o 297LIB_OBJS += $(OUTPUT)util/strlist.o
298LIB_OBJS += $(OUTPUT)util/strfilter.o
299LIB_OBJS += $(OUTPUT)util/top.o
458LIB_OBJS += $(OUTPUT)util/usage.o 300LIB_OBJS += $(OUTPUT)util/usage.o
459LIB_OBJS += $(OUTPUT)util/wrapper.o 301LIB_OBJS += $(OUTPUT)util/wrapper.o
460LIB_OBJS += $(OUTPUT)util/sigchain.o 302LIB_OBJS += $(OUTPUT)util/sigchain.o
@@ -469,6 +311,7 @@ LIB_OBJS += $(OUTPUT)util/map.o
469LIB_OBJS += $(OUTPUT)util/pstack.o 311LIB_OBJS += $(OUTPUT)util/pstack.o
470LIB_OBJS += $(OUTPUT)util/session.o 312LIB_OBJS += $(OUTPUT)util/session.o
471LIB_OBJS += $(OUTPUT)util/thread.o 313LIB_OBJS += $(OUTPUT)util/thread.o
314LIB_OBJS += $(OUTPUT)util/thread_map.o
472LIB_OBJS += $(OUTPUT)util/trace-event-parse.o 315LIB_OBJS += $(OUTPUT)util/trace-event-parse.o
473LIB_OBJS += $(OUTPUT)util/trace-event-read.o 316LIB_OBJS += $(OUTPUT)util/trace-event-read.o
474LIB_OBJS += $(OUTPUT)util/trace-event-info.o 317LIB_OBJS += $(OUTPUT)util/trace-event-info.o
@@ -480,6 +323,7 @@ LIB_OBJS += $(OUTPUT)util/probe-event.o
480LIB_OBJS += $(OUTPUT)util/util.o 323LIB_OBJS += $(OUTPUT)util/util.o
481LIB_OBJS += $(OUTPUT)util/xyarray.o 324LIB_OBJS += $(OUTPUT)util/xyarray.o
482LIB_OBJS += $(OUTPUT)util/cpumap.o 325LIB_OBJS += $(OUTPUT)util/cpumap.o
326LIB_OBJS += $(OUTPUT)util/cgroup.o
483 327
484BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o 328BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o
485 329
@@ -514,6 +358,20 @@ BUILTIN_OBJS += $(OUTPUT)builtin-inject.o
514 358
515PERFLIBS = $(LIB_FILE) 359PERFLIBS = $(LIB_FILE)
516 360
361# Files needed for the python binding, perf.so
362# pyrf is just an internal name needed for all those wrappers.
363# This has to be in sync with what is in the 'sources' variable in
364# tools/perf/util/setup.py
365
366PYRF_OBJS += $(OUTPUT)util/cpumap.o
367PYRF_OBJS += $(OUTPUT)util/ctype.o
368PYRF_OBJS += $(OUTPUT)util/evlist.o
369PYRF_OBJS += $(OUTPUT)util/evsel.o
370PYRF_OBJS += $(OUTPUT)util/python.o
371PYRF_OBJS += $(OUTPUT)util/thread_map.o
372PYRF_OBJS += $(OUTPUT)util/util.o
373PYRF_OBJS += $(OUTPUT)util/xyarray.o
374
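A minimal sketch of the matching tools/perf/util/setup.py, assuming a plain distutils Extension (only the file list is taken from the PYRF_OBJS block above; the other keyword arguments are assumptions, not the shipped file):

# sketch of tools/perf/util/setup.py -- 'sources' must mirror PYRF_OBJS;
# include_dirs and the version string are assumptions for illustration
from distutils.core import setup, Extension

perf = Extension('perf',
                 sources=['util/python.c', 'util/ctype.c', 'util/evlist.c',
                          'util/evsel.c', 'util/cpumap.c', 'util/thread_map.c',
                          'util/util.c', 'util/xyarray.c'],
                 include_dirs=['util/include'])

setup(name='perf', version='0.1', ext_modules=[perf])

Presumably this is driven with build_ext --build-lib pointing at $(OUTPUT)python, mirroring the setup.py clean invocation added to the clean target further down.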
517# 375#
518# Platform specific tweaks 376# Platform specific tweaks
519# 377#
@@ -535,22 +393,6 @@ endif # NO_DWARF
535 393
536-include arch/$(ARCH)/Makefile 394-include arch/$(ARCH)/Makefile
537 395
538ifeq ($(uname_S),Darwin)
539 ifndef NO_FINK
540 ifeq ($(shell test -d /sw/lib && echo y),y)
541 BASIC_CFLAGS += -I/sw/include
542 BASIC_LDFLAGS += -L/sw/lib
543 endif
544 endif
545 ifndef NO_DARWIN_PORTS
546 ifeq ($(shell test -d /opt/local/lib && echo y),y)
547 BASIC_CFLAGS += -I/opt/local/include
548 BASIC_LDFLAGS += -L/opt/local/lib
549 endif
550 endif
551 PTHREAD_LIBS =
552endif
553
554ifneq ($(OUTPUT),) 396ifneq ($(OUTPUT),)
555 BASIC_CFLAGS += -I$(OUTPUT) 397 BASIC_CFLAGS += -I$(OUTPUT)
556endif 398endif
@@ -595,6 +437,7 @@ else
595 LIB_OBJS += $(OUTPUT)util/ui/browsers/annotate.o 437 LIB_OBJS += $(OUTPUT)util/ui/browsers/annotate.o
596 LIB_OBJS += $(OUTPUT)util/ui/browsers/hists.o 438 LIB_OBJS += $(OUTPUT)util/ui/browsers/hists.o
597 LIB_OBJS += $(OUTPUT)util/ui/browsers/map.o 439 LIB_OBJS += $(OUTPUT)util/ui/browsers/map.o
440 LIB_OBJS += $(OUTPUT)util/ui/browsers/top.o
598 LIB_OBJS += $(OUTPUT)util/ui/helpline.o 441 LIB_OBJS += $(OUTPUT)util/ui/helpline.o
599 LIB_OBJS += $(OUTPUT)util/ui/progress.o 442 LIB_OBJS += $(OUTPUT)util/ui/progress.o
600 LIB_OBJS += $(OUTPUT)util/ui/util.o 443 LIB_OBJS += $(OUTPUT)util/ui/util.o
@@ -604,6 +447,7 @@ else
604 LIB_H += util/ui/libslang.h 447 LIB_H += util/ui/libslang.h
605 LIB_H += util/ui/progress.h 448 LIB_H += util/ui/progress.h
606 LIB_H += util/ui/util.h 449 LIB_H += util/ui/util.h
450 LIB_H += util/ui/ui.h
607 endif 451 endif
608endif 452endif
609 453
@@ -635,12 +479,14 @@ else
635 PYTHON_EMBED_CCOPTS = `python-config --cflags 2>/dev/null` 479 PYTHON_EMBED_CCOPTS = `python-config --cflags 2>/dev/null`
636 FLAGS_PYTHON_EMBED=$(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS) 480 FLAGS_PYTHON_EMBED=$(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
637 ifneq ($(call try-cc,$(SOURCE_PYTHON_EMBED),$(FLAGS_PYTHON_EMBED)),y) 481 ifneq ($(call try-cc,$(SOURCE_PYTHON_EMBED),$(FLAGS_PYTHON_EMBED)),y)
482 msg := $(warning No Python.h found, install python-dev[el] to have python support in 'perf script' and to build the python bindings)
638 BASIC_CFLAGS += -DNO_LIBPYTHON 483 BASIC_CFLAGS += -DNO_LIBPYTHON
639 else 484 else
640 ALL_LDFLAGS += $(PYTHON_EMBED_LDFLAGS) 485 ALL_LDFLAGS += $(PYTHON_EMBED_LDFLAGS)
641 EXTLIBS += $(PYTHON_EMBED_LIBADD) 486 EXTLIBS += $(PYTHON_EMBED_LIBADD)
642 LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-python.o 487 LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-python.o
643 LIB_OBJS += $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o 488 LIB_OBJS += $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o
489 LANG_BINDINGS += $(OUTPUT)python/perf.so
644 endif 490 endif
645endif 491endif
646 492
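With LANG_BINDINGS built, a quick smoke test of the new python binding is just to import it; this is a hypothetical check (the wrapper classes one would expect to see listed -- cpu_map, thread_map, evlist, evsel -- are assumptions based on the PYRF_OBJS wrappers):

# run with PYTHONPATH pointing at the directory holding python/perf.so,
# e.g. PYTHONPATH=python python smoke.py
import perf

print(perf.__file__)
print([name for name in dir(perf) if not name.startswith('_')])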
@@ -690,201 +536,13 @@ else
690 endif 536 endif
691endif 537endif
692 538
693ifndef CC_LD_DYNPATH
694 ifdef NO_R_TO_GCC_LINKER
695 # Some gcc does not accept and pass -R to the linker to specify
696 # the runtime dynamic library path.
697 CC_LD_DYNPATH = -Wl,-rpath,
698 else
699 CC_LD_DYNPATH = -R
700 endif
701endif
702
703ifdef NEEDS_SOCKET
704 EXTLIBS += -lsocket
705endif
706ifdef NEEDS_NSL
707 EXTLIBS += -lnsl
708endif
709ifdef NO_D_TYPE_IN_DIRENT
710 BASIC_CFLAGS += -DNO_D_TYPE_IN_DIRENT
711endif
712ifdef NO_D_INO_IN_DIRENT
713 BASIC_CFLAGS += -DNO_D_INO_IN_DIRENT
714endif
715ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
716 BASIC_CFLAGS += -DNO_ST_BLOCKS_IN_STRUCT_STAT
717endif
718ifdef USE_NSEC
719 BASIC_CFLAGS += -DUSE_NSEC
720endif
721ifdef USE_ST_TIMESPEC
722 BASIC_CFLAGS += -DUSE_ST_TIMESPEC
723endif
724ifdef NO_NSEC
725 BASIC_CFLAGS += -DNO_NSEC
726endif
727ifdef NO_C99_FORMAT
728 BASIC_CFLAGS += -DNO_C99_FORMAT
729endif
730ifdef SNPRINTF_RETURNS_BOGUS
731 COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS
732 COMPAT_OBJS += $(OUTPUT)compat/snprintf.o
733endif
734ifdef FREAD_READS_DIRECTORIES
735 COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES
736 COMPAT_OBJS += $(OUTPUT)compat/fopen.o
737endif
738ifdef NO_SYMLINK_HEAD
739 BASIC_CFLAGS += -DNO_SYMLINK_HEAD
740endif
741ifdef NO_STRCASESTR
742 COMPAT_CFLAGS += -DNO_STRCASESTR
743 COMPAT_OBJS += $(OUTPUT)compat/strcasestr.o
744endif
745ifdef NO_STRTOUMAX
746 COMPAT_CFLAGS += -DNO_STRTOUMAX
747 COMPAT_OBJS += $(OUTPUT)compat/strtoumax.o
748endif
749ifdef NO_STRTOULL
750 COMPAT_CFLAGS += -DNO_STRTOULL
751endif
752ifdef NO_SETENV
753 COMPAT_CFLAGS += -DNO_SETENV
754 COMPAT_OBJS += $(OUTPUT)compat/setenv.o
755endif
756ifdef NO_MKDTEMP
757 COMPAT_CFLAGS += -DNO_MKDTEMP
758 COMPAT_OBJS += $(OUTPUT)compat/mkdtemp.o
759endif
760ifdef NO_UNSETENV
761 COMPAT_CFLAGS += -DNO_UNSETENV
762 COMPAT_OBJS += $(OUTPUT)compat/unsetenv.o
763endif
764ifdef NO_SYS_SELECT_H
765 BASIC_CFLAGS += -DNO_SYS_SELECT_H
766endif
767ifdef NO_MMAP
768 COMPAT_CFLAGS += -DNO_MMAP
769 COMPAT_OBJS += $(OUTPUT)compat/mmap.o
770else
771 ifdef USE_WIN32_MMAP
772 COMPAT_CFLAGS += -DUSE_WIN32_MMAP
773 COMPAT_OBJS += $(OUTPUT)compat/win32mmap.o
774 endif
775endif
776ifdef NO_PREAD
777 COMPAT_CFLAGS += -DNO_PREAD
778 COMPAT_OBJS += $(OUTPUT)compat/pread.o
779endif
780ifdef NO_FAST_WORKING_DIRECTORY
781 BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY
782endif
783ifdef NO_TRUSTABLE_FILEMODE
784 BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE
785endif
786ifdef NO_IPV6
787 BASIC_CFLAGS += -DNO_IPV6
788endif
789ifdef NO_UINTMAX_T
790 BASIC_CFLAGS += -Duintmax_t=uint32_t
791endif
792ifdef NO_SOCKADDR_STORAGE
793ifdef NO_IPV6
794 BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in
795else
796 BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in6
797endif
798endif
799ifdef NO_INET_NTOP
800 LIB_OBJS += $(OUTPUT)compat/inet_ntop.o
801endif
802ifdef NO_INET_PTON
803 LIB_OBJS += $(OUTPUT)compat/inet_pton.o
804endif
805
806ifdef NO_ICONV
807 BASIC_CFLAGS += -DNO_ICONV
808endif
809
810ifdef OLD_ICONV
811 BASIC_CFLAGS += -DOLD_ICONV
812endif
813
814ifdef NO_DEFLATE_BOUND
815 BASIC_CFLAGS += -DNO_DEFLATE_BOUND
816endif
817
818ifdef PPC_SHA1
819 SHA1_HEADER = "ppc/sha1.h"
820 LIB_OBJS += $(OUTPUT)ppc/sha1.o ppc/sha1ppc.o
821else
822ifdef ARM_SHA1
823 SHA1_HEADER = "arm/sha1.h"
824 LIB_OBJS += $(OUTPUT)arm/sha1.o $(OUTPUT)arm/sha1_arm.o
825else
826ifdef MOZILLA_SHA1
827 SHA1_HEADER = "mozilla-sha1/sha1.h"
828 LIB_OBJS += $(OUTPUT)mozilla-sha1/sha1.o
829else
830 SHA1_HEADER = <openssl/sha.h>
831 EXTLIBS += $(LIB_4_CRYPTO)
832endif
833endif
834endif
835ifdef NO_PERL_MAKEMAKER
836 export NO_PERL_MAKEMAKER
837endif
838ifdef NO_HSTRERROR
839 COMPAT_CFLAGS += -DNO_HSTRERROR
840 COMPAT_OBJS += $(OUTPUT)compat/hstrerror.o
841endif
842ifdef NO_MEMMEM
843 COMPAT_CFLAGS += -DNO_MEMMEM
844 COMPAT_OBJS += $(OUTPUT)compat/memmem.o
845endif
846ifdef INTERNAL_QSORT
847 COMPAT_CFLAGS += -DINTERNAL_QSORT
848 COMPAT_OBJS += $(OUTPUT)compat/qsort.o
849endif
850ifdef RUNTIME_PREFIX
851 COMPAT_CFLAGS += -DRUNTIME_PREFIX
852endif
853
854ifdef DIR_HAS_BSD_GROUP_SEMANTICS
855 COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS
856endif
857ifdef NO_EXTERNAL_GREP
858 BASIC_CFLAGS += -DNO_EXTERNAL_GREP
859endif
860
861ifeq ($(PERL_PATH),)
862NO_PERL=NoThanks
863endif
864
865QUIET_SUBDIR0 = +$(MAKE) -C # space to separate -C and subdir
866QUIET_SUBDIR1 =
867
868ifneq ($(findstring $(MAKEFLAGS),w),w)
869PRINT_DIR = --no-print-directory
870else # "make -w"
871NO_SUBDIR = :
872endif
873
874ifneq ($(findstring $(MAKEFLAGS),s),s) 539ifneq ($(findstring $(MAKEFLAGS),s),s)
875ifndef V 540ifndef V
876 QUIET_CC = @echo ' ' CC $@; 541 QUIET_CC = @echo ' ' CC $@;
877 QUIET_AR = @echo ' ' AR $@; 542 QUIET_AR = @echo ' ' AR $@;
878 QUIET_LINK = @echo ' ' LINK $@; 543 QUIET_LINK = @echo ' ' LINK $@;
879 QUIET_MKDIR = @echo ' ' MKDIR $@; 544 QUIET_MKDIR = @echo ' ' MKDIR $@;
880 QUIET_BUILT_IN = @echo ' ' BUILTIN $@;
881 QUIET_GEN = @echo ' ' GEN $@; 545 QUIET_GEN = @echo ' ' GEN $@;
882 QUIET_SUBDIR0 = +@subdir=
883 QUIET_SUBDIR1 = ;$(NO_SUBDIR) echo ' ' SUBDIR $$subdir; \
884 $(MAKE) $(PRINT_DIR) -C $$subdir
885 export V
886 export QUIET_GEN
887 export QUIET_BUILT_IN
888endif 546endif
889endif 547endif
890 548
@@ -894,7 +552,6 @@ endif
894 552
895# Shell quote (do not use $(call) to accommodate ancient setups); 553# Shell quote (do not use $(call) to accommodate ancient setups);
896 554
897SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
898ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG)) 555ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG))
899 556
900DESTDIR_SQ = $(subst ','\'',$(DESTDIR)) 557DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
@@ -908,46 +565,36 @@ htmldir_SQ = $(subst ','\'',$(htmldir))
908prefix_SQ = $(subst ','\'',$(prefix)) 565prefix_SQ = $(subst ','\'',$(prefix))
909 566
910SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH)) 567SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
911PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
912 568
913LIBS = -Wl,--whole-archive $(PERFLIBS) -Wl,--no-whole-archive $(EXTLIBS) 569LIBS = -Wl,--whole-archive $(PERFLIBS) -Wl,--no-whole-archive $(EXTLIBS)
914 570
915BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
916 $(COMPAT_CFLAGS)
917LIB_OBJS += $(COMPAT_OBJS)
918
919ALL_CFLAGS += $(BASIC_CFLAGS) 571ALL_CFLAGS += $(BASIC_CFLAGS)
920ALL_CFLAGS += $(ARCH_CFLAGS) 572ALL_CFLAGS += $(ARCH_CFLAGS)
921ALL_LDFLAGS += $(BASIC_LDFLAGS) 573ALL_LDFLAGS += $(BASIC_LDFLAGS)
922 574
923export TAR INSTALL DESTDIR SHELL_PATH 575export INSTALL SHELL_PATH
924 576
925 577
926### Build rules 578### Build rules
927 579
928SHELL = $(SHELL_PATH) 580SHELL = $(SHELL_PATH)
929 581
930all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) $(OUTPUT)PERF-BUILD-OPTIONS 582all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS)
931ifneq (,$X)
932 $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
933endif
934
935all::
936 583
937please_set_SHELL_PATH_to_a_more_modern_shell: 584please_set_SHELL_PATH_to_a_more_modern_shell:
938 @$$(:) 585 @$$(:)
939 586
940shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell 587shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
941 588
942strip: $(PROGRAMS) $(OUTPUT)perf$X 589strip: $(PROGRAMS) $(OUTPUT)perf
943 $(STRIP) $(STRIP_OPTS) $(PROGRAMS) $(OUTPUT)perf$X 590 $(STRIP) $(STRIP_OPTS) $(PROGRAMS) $(OUTPUT)perf
944 591
945$(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS 592$(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
946 $(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \ 593 $(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \
947 '-DPERF_HTML_PATH="$(htmldir_SQ)"' \ 594 '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
948 $(ALL_CFLAGS) -c $(filter %.c,$^) -o $@ 595 $(ALL_CFLAGS) -c $(filter %.c,$^) -o $@
949 596
950$(OUTPUT)perf$X: $(OUTPUT)perf.o $(BUILTIN_OBJS) $(PERFLIBS) 597$(OUTPUT)perf: $(OUTPUT)perf.o $(BUILTIN_OBJS) $(PERFLIBS)
951 $(QUIET_LINK)$(CC) $(ALL_CFLAGS) $(ALL_LDFLAGS) $(OUTPUT)perf.o \ 598 $(QUIET_LINK)$(CC) $(ALL_CFLAGS) $(ALL_LDFLAGS) $(OUTPUT)perf.o \
952 $(BUILTIN_OBJS) $(LIBS) -o $@ 599 $(BUILTIN_OBJS) $(LIBS) -o $@
953 600
@@ -963,39 +610,17 @@ $(OUTPUT)builtin-timechart.o: builtin-timechart.c $(OUTPUT)common-cmds.h $(OUTPU
963 '-DPERF_MAN_PATH="$(mandir_SQ)"' \ 610 '-DPERF_MAN_PATH="$(mandir_SQ)"' \
964 '-DPERF_INFO_PATH="$(infodir_SQ)"' $< 611 '-DPERF_INFO_PATH="$(infodir_SQ)"' $<
965 612
966$(BUILT_INS): $(OUTPUT)perf$X
967 $(QUIET_BUILT_IN)$(RM) $@ && \
968 ln perf$X $@ 2>/dev/null || \
969 ln -s perf$X $@ 2>/dev/null || \
970 cp perf$X $@
971
972$(OUTPUT)common-cmds.h: util/generate-cmdlist.sh command-list.txt 613$(OUTPUT)common-cmds.h: util/generate-cmdlist.sh command-list.txt
973 614
974$(OUTPUT)common-cmds.h: $(wildcard Documentation/perf-*.txt) 615$(OUTPUT)common-cmds.h: $(wildcard Documentation/perf-*.txt)
975 $(QUIET_GEN). util/generate-cmdlist.sh > $@+ && mv $@+ $@ 616 $(QUIET_GEN). util/generate-cmdlist.sh > $@+ && mv $@+ $@
976 617
977$(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh 618$(SCRIPTS) : % : %.sh
978 $(QUIET_GEN)$(RM) $(OUTPUT)$@ $(OUTPUT)$@+ && \ 619 $(QUIET_GEN)$(INSTALL) '$@.sh' '$(OUTPUT)$@'
979 sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
980 -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \
981 -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
982 -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
983 -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
984 $@.sh > $(OUTPUT)$@+ && \
985 chmod +x $(OUTPUT)$@+ && \
986 mv $(OUTPUT)$@+ $(OUTPUT)$@
987
988configure: configure.ac
989 $(QUIET_GEN)$(RM) $@ $<+ && \
990 sed -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
991 $< > $<+ && \
992 autoconf -o $@ $<+ && \
993 $(RM) $<+
994 620
995# These can record PERF_VERSION 621# These can record PERF_VERSION
996$(OUTPUT)perf.o perf.spec \ 622$(OUTPUT)perf.o perf.spec \
997 $(patsubst %.sh,%,$(SCRIPT_SH)) \ 623 $(SCRIPTS) \
998 $(patsubst %.perl,%,$(SCRIPT_PERL)) \
999 : $(OUTPUT)PERF-VERSION-FILE 624 : $(OUTPUT)PERF-VERSION-FILE
1000 625
1001$(OUTPUT)%.o: %.c $(OUTPUT)PERF-CFLAGS 626$(OUTPUT)%.o: %.c $(OUTPUT)PERF-CFLAGS
@@ -1012,9 +637,6 @@ $(OUTPUT)util/exec_cmd.o: util/exec_cmd.c $(OUTPUT)PERF-CFLAGS
1012 '-DPREFIX="$(prefix_SQ)"' \ 637 '-DPREFIX="$(prefix_SQ)"' \
1013 $< 638 $<
1014 639
1015$(OUTPUT)builtin-init-db.o: builtin-init-db.c $(OUTPUT)PERF-CFLAGS
1016 $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
1017
1018$(OUTPUT)util/config.o: util/config.c $(OUTPUT)PERF-CFLAGS 640$(OUTPUT)util/config.o: util/config.c $(OUTPUT)PERF-CFLAGS
1019 $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $< 641 $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
1020 642
@@ -1024,6 +646,9 @@ $(OUTPUT)util/ui/browser.o: util/ui/browser.c $(OUTPUT)PERF-CFLAGS
1024$(OUTPUT)util/ui/browsers/annotate.o: util/ui/browsers/annotate.c $(OUTPUT)PERF-CFLAGS 646$(OUTPUT)util/ui/browsers/annotate.o: util/ui/browsers/annotate.c $(OUTPUT)PERF-CFLAGS
1025 $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DENABLE_SLFUTURE_CONST $< 647 $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DENABLE_SLFUTURE_CONST $<
1026 648
649$(OUTPUT)util/ui/browsers/top.o: util/ui/browsers/top.c $(OUTPUT)PERF-CFLAGS
650 $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DENABLE_SLFUTURE_CONST $<
651
1027$(OUTPUT)util/ui/browsers/hists.o: util/ui/browsers/hists.c $(OUTPUT)PERF-CFLAGS 652$(OUTPUT)util/ui/browsers/hists.o: util/ui/browsers/hists.c $(OUTPUT)PERF-CFLAGS
1028 $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DENABLE_SLFUTURE_CONST $< 653 $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DENABLE_SLFUTURE_CONST $<
1029 654
@@ -1045,12 +670,11 @@ $(OUTPUT)util/scripting-engines/trace-event-python.o: util/scripting-engines/tra
1045$(OUTPUT)scripts/python/Perf-Trace-Util/Context.o: scripts/python/Perf-Trace-Util/Context.c $(OUTPUT)PERF-CFLAGS 670$(OUTPUT)scripts/python/Perf-Trace-Util/Context.o: scripts/python/Perf-Trace-Util/Context.c $(OUTPUT)PERF-CFLAGS
1046 $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs $< 671 $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs $<
1047 672
1048$(OUTPUT)perf-%$X: %.o $(PERFLIBS) 673$(OUTPUT)perf-%: %.o $(PERFLIBS)
1049 $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS) 674 $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
1050 675
1051$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H) 676$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
1052$(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h) 677$(patsubst perf-%,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
1053builtin-revert.o wt-status.o: wt-status.h
1054 678
1055# we compile into subdirectories. if the target directory is not the source directory, they might not exist. So 679# we compile into subdirectories. if the target directory is not the source directory, they might not exist. So
1056# we make the various files depend on their directories. 680# we make the various files depend on their directories.
@@ -1063,6 +687,36 @@ $(sort $(dir $(DIRECTORY_DEPS))):
1063$(LIB_FILE): $(LIB_OBJS) 687$(LIB_FILE): $(LIB_OBJS)
1064 $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS) 688 $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
1065 689
690help:
691 @echo 'Perf make targets:'
692 @echo ' doc - make *all* documentation (see below)'
693 @echo ' man - make manpage documentation (access with man <foo>)'
694 @echo ' html - make html documentation'
695 @echo ' info - make GNU info documentation (access with info <foo>)'
696 @echo ' pdf - make pdf documentation'
697 @echo ' TAGS - use etags to make tag information for source browsing'
698 @echo ' tags - use ctags to make tag information for source browsing'
699 @echo ' cscope - use cscope to make interactive browsing database'
700 @echo ''
701 @echo 'Perf install targets:'
702 @echo ' NOTE: documentation build requires asciidoc, xmlto packages to be installed'
703 @echo ' HINT: use "make prefix=<path> <install target>" to install to a particular'
704 @echo ' path like make prefix=/usr/local install install-doc'
705 @echo ' install - install compiled binaries'
706 @echo ' install-doc - install *all* documentation'
707 @echo ' install-man - install manpage documentation'
708 @echo ' install-html - install html documentation'
709 @echo ' install-info - install GNU info documentation'
710 @echo ' install-pdf - install pdf documentation'
711 @echo ''
712 @echo ' quick-install-doc - alias for quick-install-man'
713 @echo ' quick-install-man - install the documentation quickly'
714 @echo ' quick-install-html - install the html documentation quickly'
715 @echo ''
716 @echo 'Perf maintainer targets:'
717 @echo ' distclean - alias to clean'
718 @echo ' clean - clean all binary objects and build output'
719
1066doc: 720doc:
1067 $(MAKE) -C Documentation all 721 $(MAKE) -C Documentation all
1068 722
@@ -1101,30 +755,12 @@ $(OUTPUT)PERF-CFLAGS: .FORCE-PERF-CFLAGS
1101 echo "$$FLAGS" >$(OUTPUT)PERF-CFLAGS; \ 755 echo "$$FLAGS" >$(OUTPUT)PERF-CFLAGS; \
1102 fi 756 fi
1103 757
1104# We need to apply sq twice, once to protect from the shell
1105# that runs $(OUTPUT)PERF-BUILD-OPTIONS, and then again to protect it
1106# and the first level quoting from the shell that runs "echo".
1107$(OUTPUT)PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
1108 @echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
1109 @echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
1110 @echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
1111 @echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
1112
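The double quoting the removed comment describes is a general shell-nesting trick; here is a small Python illustration of it, with shlex.quote standing in for the Makefile's $(subst ','\'',...) idiom (this snippet is not from the patch):

# each shell evaluation strips one level of quoting, so a value that must
# survive two shells (the rule body, then the later "echo") is quoted twice
import shlex

value = "it's a path"
once = shlex.quote(value)    # survives the shell running the rule body
twice = shlex.quote(once)    # survives the shell that echoes it back
print(once)
print(twice)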
1113### Testing rules 758### Testing rules
1114 759
1115#
1116# None right now:
1117#
1118# TEST_PROGRAMS += test-something$X
1119
1120all:: $(TEST_PROGRAMS)
1121
1122# GNU make supports exporting all variables by "export" without parameters. 760# GNU make supports exporting all variables by "export" without parameters.
1123# However, the environment gets quite big, and some programs have problems 761# However, the environment gets quite big, and some programs have problems
1124# with that. 762# with that.
1125 763
1126export NO_SVN_TESTS
1127
1128check: $(OUTPUT)common-cmds.h 764check: $(OUTPUT)common-cmds.h
1129 if sparse; \ 765 if sparse; \
1130 then \ 766 then \
@@ -1133,33 +769,21 @@ check: $(OUTPUT)common-cmds.h
1133 sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \ 769 sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
1134 done; \ 770 done; \
1135 else \ 771 else \
1136 echo 2>&1 "Did you mean 'make test'?"; \
1137 exit 1; \ 772 exit 1; \
1138 fi 773 fi
1139 774
1140remove-dashes:
1141 ./fixup-builtins $(BUILT_INS) $(PROGRAMS) $(SCRIPTS)
1142
1143### Installation rules 775### Installation rules
1144 776
1145ifneq ($(filter /%,$(firstword $(template_dir))),)
1146template_instdir = $(template_dir)
1147else
1148template_instdir = $(prefix)/$(template_dir)
1149endif
1150export template_instdir
1151
1152ifneq ($(filter /%,$(firstword $(perfexecdir))),) 777ifneq ($(filter /%,$(firstword $(perfexecdir))),)
1153perfexec_instdir = $(perfexecdir) 778perfexec_instdir = $(perfexecdir)
1154else 779else
1155perfexec_instdir = $(prefix)/$(perfexecdir) 780perfexec_instdir = $(prefix)/$(perfexecdir)
1156endif 781endif
1157perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir)) 782perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir))
1158export perfexec_instdir
1159 783
1160install: all 784install: all
1161 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)' 785 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
1162 $(INSTALL) $(OUTPUT)perf$X '$(DESTDIR_SQ)$(bindir_SQ)' 786 $(INSTALL) $(OUTPUT)perf '$(DESTDIR_SQ)$(bindir_SQ)'
1163 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace' 787 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'
1164 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/bin' 788 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/bin'
1165 $(INSTALL) $(OUTPUT)perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' 789 $(INSTALL) $(OUTPUT)perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
@@ -1172,14 +796,6 @@ install: all
1172 $(INSTALL) scripts/python/*.py -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python' 796 $(INSTALL) scripts/python/*.py -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python'
1173 $(INSTALL) scripts/python/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/bin' 797 $(INSTALL) scripts/python/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/bin'
1174 798
1175ifdef BUILT_INS
1176 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
1177 $(INSTALL) $(BUILT_INS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
1178ifneq (,$X)
1179 $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) $(OUTPUT)perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
1180endif
1181endif
1182
1183install-doc: 799install-doc:
1184 $(MAKE) -C Documentation install 800 $(MAKE) -C Documentation install
1185 801
@@ -1204,104 +820,17 @@ quick-install-man:
1204quick-install-html: 820quick-install-html:
1205 $(MAKE) -C Documentation quick-install-html 821 $(MAKE) -C Documentation quick-install-html
1206 822
1207
1208### Maintainer's dist rules
1209#
1210# None right now
1211#
1212#
1213# perf.spec: perf.spec.in
1214# sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
1215# mv $@+ $@
1216#
1217# PERF_TARNAME=perf-$(PERF_VERSION)
1218# dist: perf.spec perf-archive$(X) configure
1219# ./perf-archive --format=tar \
1220# --prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
1221# @mkdir -p $(PERF_TARNAME)
1222# @cp perf.spec configure $(PERF_TARNAME)
1223# @echo $(PERF_VERSION) > $(PERF_TARNAME)/version
1224# $(TAR) rf $(PERF_TARNAME).tar \
1225# $(PERF_TARNAME)/perf.spec \
1226# $(PERF_TARNAME)/configure \
1227# $(PERF_TARNAME)/version
1228# @$(RM) -r $(PERF_TARNAME)
1229# gzip -f -9 $(PERF_TARNAME).tar
1230#
1231# htmldocs = perf-htmldocs-$(PERF_VERSION)
1232# manpages = perf-manpages-$(PERF_VERSION)
1233# dist-doc:
1234# $(RM) -r .doc-tmp-dir
1235# mkdir .doc-tmp-dir
1236# $(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
1237# cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
1238# gzip -n -9 -f $(htmldocs).tar
1239# :
1240# $(RM) -r .doc-tmp-dir
1241# mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
1242# $(MAKE) -C Documentation DESTDIR=./ \
1243# man1dir=../.doc-tmp-dir/man1 \
1244# man5dir=../.doc-tmp-dir/man5 \
1245# man7dir=../.doc-tmp-dir/man7 \
1246# install
1247# cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
1248# gzip -n -9 -f $(manpages).tar
1249# $(RM) -r .doc-tmp-dir
1250#
1251# rpm: dist
1252# $(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
1253
1254### Cleaning rules 823### Cleaning rules
1255 824
1256distclean: clean
1257# $(RM) configure
1258
1259clean: 825clean:
1260 $(RM) *.o */*.o */*/*.o */*/*/*.o $(LIB_FILE) 826 $(RM) $(OUTPUT){*.o,*/*.o,*/*/*.o,*/*/*/*.o,$(LIB_FILE),perf-archive}
1261 $(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X 827 $(RM) $(ALL_PROGRAMS) perf
1262 $(RM) $(TEST_PROGRAMS)
1263 $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* 828 $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope*
1264 $(RM) -r autom4te.cache
1265 $(RM) config.log config.mak.autogen config.mak.append config.status config.cache
1266 $(RM) -r $(PERF_TARNAME) .doc-tmp-dir
1267 $(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
1268 $(RM) $(htmldocs).tar.gz $(manpages).tar.gz
1269 $(MAKE) -C Documentation/ clean 829 $(MAKE) -C Documentation/ clean
1270 $(RM) $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)PERF-CFLAGS $(OUTPUT)PERF-BUILD-OPTIONS 830 $(RM) $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)PERF-CFLAGS
831 @python util/setup.py clean --build-lib='$(OUTPUT)python' \
832 --build-temp='$(OUTPUT)python/temp'
1271 833
1272.PHONY: all install clean strip 834.PHONY: all install clean strip
1273.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell 835.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
1274.PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS 836.PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
1275.PHONY: .FORCE-PERF-BUILD-OPTIONS
1276
1277### Make sure built-ins do not have dups and listed in perf.c
1278#
1279check-builtins::
1280 ./check-builtins.sh
1281
1282### Test suite coverage testing
1283#
1284# None right now
1285#
1286# .PHONY: coverage coverage-clean coverage-build coverage-report
1287#
1288# coverage:
1289# $(MAKE) coverage-build
1290# $(MAKE) coverage-report
1291#
1292# coverage-clean:
1293# rm -f *.gcda *.gcno
1294#
1295# COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
1296# COVERAGE_LDFLAGS = $(CFLAGS) -O0 -lgcov
1297#
1298# coverage-build: coverage-clean
1299# $(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
1300# $(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
1301# -j1 test
1302#
1303# coverage-report:
1304# gcov -b *.c */*.c
1305# grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
1306# | sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
1307# | tee coverage-untested-functions
diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c
index d9ab3ce446a..0c7454f8b8a 100644
--- a/tools/perf/bench/sched-pipe.c
+++ b/tools/perf/bench/sched-pipe.c
@@ -55,7 +55,7 @@ int bench_sched_pipe(int argc, const char **argv,
55 * discarding returned value of read(), write() 55 * discarding returned value of read(), write()
56 * causes error in building environment for perf 56 * causes error in building environment for perf
57 */ 57 */
58 int ret, wait_stat; 58 int __used ret, wait_stat;
59 pid_t pid, retpid; 59 pid_t pid, retpid;
60 60
61 argc = parse_options(argc, argv, options, 61 argc = parse_options(argc, argv, options,
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 8879463807e..695de4b5ae6 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -9,6 +9,7 @@
9 9
10#include "util/util.h" 10#include "util/util.h"
11 11
12#include "util/util.h"
12#include "util/color.h" 13#include "util/color.h"
13#include <linux/list.h> 14#include <linux/list.h>
14#include "util/cache.h" 15#include "util/cache.h"
@@ -18,6 +19,9 @@
18#include "perf.h" 19#include "perf.h"
19#include "util/debug.h" 20#include "util/debug.h"
20 21
22#include "util/evlist.h"
23#include "util/evsel.h"
24#include "util/annotate.h"
21#include "util/event.h" 25#include "util/event.h"
22#include "util/parse-options.h" 26#include "util/parse-options.h"
23#include "util/parse-events.h" 27#include "util/parse-events.h"
@@ -36,9 +40,13 @@ static bool print_line;
36 40
37static const char *sym_hist_filter; 41static const char *sym_hist_filter;
38 42
39static int hists__add_entry(struct hists *self, struct addr_location *al) 43static int perf_evlist__add_sample(struct perf_evlist *evlist,
44 struct perf_sample *sample,
45 struct addr_location *al)
40{ 46{
47 struct perf_evsel *evsel;
41 struct hist_entry *he; 48 struct hist_entry *he;
49 int ret;
42 50
43 if (sym_hist_filter != NULL && 51 if (sym_hist_filter != NULL &&
44 (al->sym == NULL || strcmp(sym_hist_filter, al->sym->name) != 0)) { 52 (al->sym == NULL || strcmp(sym_hist_filter, al->sym->name) != 0)) {
@@ -51,25 +59,51 @@ static int hists__add_entry(struct hists *self, struct addr_location *al)
51 return 0; 59 return 0;
52 } 60 }
53 61
54 he = __hists__add_entry(self, al, NULL, 1); 62 evsel = perf_evlist__id2evsel(evlist, sample->id);
63 if (evsel == NULL) {
64 /*
65 * FIXME: Propagate this back, but at least we're in a builtin,
66 * where exit() is allowed. ;-)
67 */
68 ui__warning("Invalid %s file, contains samples with id not in "
69 "its header!\n", input_name);
70 exit_browser(0);
71 exit(1);
72 }
73
74 he = __hists__add_entry(&evsel->hists, al, NULL, 1);
55 if (he == NULL) 75 if (he == NULL)
56 return -ENOMEM; 76 return -ENOMEM;
57 77
58 return hist_entry__inc_addr_samples(he, al->addr); 78 ret = 0;
79 if (he->ms.sym != NULL) {
80 struct annotation *notes = symbol__annotation(he->ms.sym);
81 if (notes->src == NULL &&
82 symbol__alloc_hist(he->ms.sym, evlist->nr_entries) < 0)
83 return -ENOMEM;
84
85 ret = hist_entry__inc_addr_samples(he, evsel->idx, al->addr);
86 }
87
88 evsel->hists.stats.total_period += sample->period;
89 hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
90 return ret;
59} 91}
60 92
61static int process_sample_event(event_t *event, struct sample_data *sample, 93static int process_sample_event(union perf_event *event,
94 struct perf_sample *sample,
62 struct perf_session *session) 95 struct perf_session *session)
63{ 96{
64 struct addr_location al; 97 struct addr_location al;
65 98
66 if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) { 99 if (perf_event__preprocess_sample(event, session, &al, sample,
100 symbol__annotate_init) < 0) {
67 pr_warning("problem processing %d event, skipping it.\n", 101 pr_warning("problem processing %d event, skipping it.\n",
68 event->header.type); 102 event->header.type);
69 return -1; 103 return -1;
70 } 104 }
71 105
72 if (!al.filtered && hists__add_entry(&session->hists, &al)) { 106 if (!al.filtered && perf_evlist__add_sample(session->evlist, sample, &al)) {
73 pr_warning("problem incrementing symbol count, " 107 pr_warning("problem incrementing symbol count, "
74 "skipping event\n"); 108 "skipping event\n");
75 return -1; 109 return -1;
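Conceptually, the perf_evlist__id2evsel() lookup used above is an id-to-event dictionary plus one histogram per event; a Python model of the same flow (the names here are illustrative, not the perf API):

# conceptual model: samples carry an id, the evlist maps ids to events,
# and each event accumulates hits in its own histogram
from collections import Counter, defaultdict

id_to_evsel = {1: "cycles", 2: "instructions"}  # filled while reading the header
hists = defaultdict(Counter)                    # event name -> address -> hits

def add_sample(sample_id, addr):
    evsel = id_to_evsel.get(sample_id)
    if evsel is None:
        # the ui__warning() path above: a sample id missing from the header
        raise ValueError("sample id %d not in header" % sample_id)
    hists[evsel][addr] += 1

add_sample(1, 0xffffffff81000000)
print(dict(hists))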
@@ -78,261 +112,26 @@ static int process_sample_event(event_t *event, struct sample_data *sample,
78 return 0; 112 return 0;
79} 113}
80 114
81static int objdump_line__print(struct objdump_line *self, 115static int hist_entry__tty_annotate(struct hist_entry *he, int evidx)
82 struct list_head *head,
83 struct hist_entry *he, u64 len)
84{
85 struct symbol *sym = he->ms.sym;
86 static const char *prev_line;
87 static const char *prev_color;
88
89 if (self->offset != -1) {
90 const char *path = NULL;
91 unsigned int hits = 0;
92 double percent = 0.0;
93 const char *color;
94 struct sym_priv *priv = symbol__priv(sym);
95 struct sym_ext *sym_ext = priv->ext;
96 struct sym_hist *h = priv->hist;
97 s64 offset = self->offset;
98 struct objdump_line *next = objdump__get_next_ip_line(head, self);
99
100 while (offset < (s64)len &&
101 (next == NULL || offset < next->offset)) {
102 if (sym_ext) {
103 if (path == NULL)
104 path = sym_ext[offset].path;
105 percent += sym_ext[offset].percent;
106 } else
107 hits += h->ip[offset];
108
109 ++offset;
110 }
111
112 if (sym_ext == NULL && h->sum)
113 percent = 100.0 * hits / h->sum;
114
115 color = get_percent_color(percent);
116
117 /*
118 * Also color the filename and line if needed, with
119 * the same color than the percentage. Don't print it
120 * twice for close colored ip with the same filename:line
121 */
122 if (path) {
123 if (!prev_line || strcmp(prev_line, path)
124 || color != prev_color) {
125 color_fprintf(stdout, color, " %s", path);
126 prev_line = path;
127 prev_color = color;
128 }
129 }
130
131 color_fprintf(stdout, color, " %7.2f", percent);
132 printf(" : ");
133 color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", self->line);
134 } else {
135 if (!*self->line)
136 printf(" :\n");
137 else
138 printf(" : %s\n", self->line);
139 }
140
141 return 0;
142}
143
144static struct rb_root root_sym_ext;
145
146static void insert_source_line(struct sym_ext *sym_ext)
147{
148 struct sym_ext *iter;
149 struct rb_node **p = &root_sym_ext.rb_node;
150 struct rb_node *parent = NULL;
151
152 while (*p != NULL) {
153 parent = *p;
154 iter = rb_entry(parent, struct sym_ext, node);
155
156 if (sym_ext->percent > iter->percent)
157 p = &(*p)->rb_left;
158 else
159 p = &(*p)->rb_right;
160 }
161
162 rb_link_node(&sym_ext->node, parent, p);
163 rb_insert_color(&sym_ext->node, &root_sym_ext);
164}
165
166static void free_source_line(struct hist_entry *he, int len)
167{
168 struct sym_priv *priv = symbol__priv(he->ms.sym);
169 struct sym_ext *sym_ext = priv->ext;
170 int i;
171
172 if (!sym_ext)
173 return;
174
175 for (i = 0; i < len; i++)
176 free(sym_ext[i].path);
177 free(sym_ext);
178
179 priv->ext = NULL;
180 root_sym_ext = RB_ROOT;
181}
182
183/* Get the filename:line for the colored entries */
184static void
185get_source_line(struct hist_entry *he, int len, const char *filename)
186{
187 struct symbol *sym = he->ms.sym;
188 u64 start;
189 int i;
190 char cmd[PATH_MAX * 2];
191 struct sym_ext *sym_ext;
192 struct sym_priv *priv = symbol__priv(sym);
193 struct sym_hist *h = priv->hist;
194
195 if (!h->sum)
196 return;
197
198 sym_ext = priv->ext = calloc(len, sizeof(struct sym_ext));
199 if (!priv->ext)
200 return;
201
202 start = he->ms.map->unmap_ip(he->ms.map, sym->start);
203
204 for (i = 0; i < len; i++) {
205 char *path = NULL;
206 size_t line_len;
207 u64 offset;
208 FILE *fp;
209
210 sym_ext[i].percent = 100.0 * h->ip[i] / h->sum;
211 if (sym_ext[i].percent <= 0.5)
212 continue;
213
214 offset = start + i;
215 sprintf(cmd, "addr2line -e %s %016" PRIx64, filename, offset);
216 fp = popen(cmd, "r");
217 if (!fp)
218 continue;
219
220 if (getline(&path, &line_len, fp) < 0 || !line_len)
221 goto next;
222
223 sym_ext[i].path = malloc(sizeof(char) * line_len + 1);
224 if (!sym_ext[i].path)
225 goto next;
226
227 strcpy(sym_ext[i].path, path);
228 insert_source_line(&sym_ext[i]);
229
230 next:
231 pclose(fp);
232 }
233}
234
235static void print_summary(const char *filename)
236{
237 struct sym_ext *sym_ext;
238 struct rb_node *node;
239
240 printf("\nSorted summary for file %s\n", filename);
241 printf("----------------------------------------------\n\n");
242
243 if (RB_EMPTY_ROOT(&root_sym_ext)) {
244 printf(" Nothing higher than %1.1f%%\n", MIN_GREEN);
245 return;
246 }
247
248 node = rb_first(&root_sym_ext);
249 while (node) {
250 double percent;
251 const char *color;
252 char *path;
253
254 sym_ext = rb_entry(node, struct sym_ext, node);
255 percent = sym_ext->percent;
256 color = get_percent_color(percent);
257 path = sym_ext->path;
258
259 color_fprintf(stdout, color, " %7.2f %s", percent, path);
260 node = rb_next(node);
261 }
262}
263
264static void hist_entry__print_hits(struct hist_entry *self)
265{
266 struct symbol *sym = self->ms.sym;
267 struct sym_priv *priv = symbol__priv(sym);
268 struct sym_hist *h = priv->hist;
269 u64 len = sym->end - sym->start, offset;
270
271 for (offset = 0; offset < len; ++offset)
272 if (h->ip[offset] != 0)
273 printf("%*" PRIx64 ": %" PRIu64 "\n", BITS_PER_LONG / 2,
274 sym->start + offset, h->ip[offset]);
275 printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->sum", h->sum);
276}
277
278static int hist_entry__tty_annotate(struct hist_entry *he)
279{ 116{
280 struct map *map = he->ms.map; 117 return symbol__tty_annotate(he->ms.sym, he->ms.map, evidx,
281 struct dso *dso = map->dso; 118 print_line, full_paths, 0, 0);
282 struct symbol *sym = he->ms.sym;
283 const char *filename = dso->long_name, *d_filename;
284 u64 len;
285 LIST_HEAD(head);
286 struct objdump_line *pos, *n;
287
288 if (hist_entry__annotate(he, &head, 0) < 0)
289 return -1;
290
291 if (full_paths)
292 d_filename = filename;
293 else
294 d_filename = basename(filename);
295
296 len = sym->end - sym->start;
297
298 if (print_line) {
299 get_source_line(he, len, filename);
300 print_summary(filename);
301 }
302
303 printf("\n\n------------------------------------------------\n");
304 printf(" Percent | Source code & Disassembly of %s\n", d_filename);
305 printf("------------------------------------------------\n");
306
307 if (verbose)
308 hist_entry__print_hits(he);
309
310 list_for_each_entry_safe(pos, n, &head, node) {
311 objdump_line__print(pos, &head, he, len);
312 list_del(&pos->node);
313 objdump_line__free(pos);
314 }
315
316 if (print_line)
317 free_source_line(he, len);
318
319 return 0;
320} 119}
321 120
322static void hists__find_annotations(struct hists *self) 121static void hists__find_annotations(struct hists *self, int evidx)
323{ 122{
324 struct rb_node *nd = rb_first(&self->entries), *next; 123 struct rb_node *nd = rb_first(&self->entries), *next;
325 int key = KEY_RIGHT; 124 int key = KEY_RIGHT;
326 125
327 while (nd) { 126 while (nd) {
328 struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node); 127 struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node);
329 struct sym_priv *priv; 128 struct annotation *notes;
330 129
331 if (he->ms.sym == NULL || he->ms.map->dso->annotate_warned) 130 if (he->ms.sym == NULL || he->ms.map->dso->annotate_warned)
332 goto find_next; 131 goto find_next;
333 132
334 priv = symbol__priv(he->ms.sym); 133 notes = symbol__annotation(he->ms.sym);
335 if (priv->hist == NULL) { 134 if (notes->src == NULL) {
336find_next: 135find_next:
337 if (key == KEY_LEFT) 136 if (key == KEY_LEFT)
338 nd = rb_prev(nd); 137 nd = rb_prev(nd);
@@ -342,7 +141,7 @@ find_next:
342 } 141 }
343 142
344 if (use_browser > 0) { 143 if (use_browser > 0) {
345 key = hist_entry__tui_annotate(he); 144 key = hist_entry__tui_annotate(he, evidx);
346 switch (key) { 145 switch (key) {
347 case KEY_RIGHT: 146 case KEY_RIGHT:
348 next = rb_next(nd); 147 next = rb_next(nd);
@@ -357,24 +156,24 @@ find_next:
357 if (next != NULL) 156 if (next != NULL)
358 nd = next; 157 nd = next;
359 } else { 158 } else {
360 hist_entry__tty_annotate(he); 159 hist_entry__tty_annotate(he, evidx);
361 nd = rb_next(nd); 160 nd = rb_next(nd);
362 /* 161 /*
363 * Since we have a hist_entry per IP for the same 162 * Since we have a hist_entry per IP for the same
364 * symbol, free he->ms.sym->hist to signal we already 163 * symbol, free he->ms.sym->src to signal we already
365 * processed this symbol. 164 * processed this symbol.
366 */ 165 */
367 free(priv->hist); 166 free(notes->src);
368 priv->hist = NULL; 167 notes->src = NULL;
369 } 168 }
370 } 169 }
371} 170}
372 171
373static struct perf_event_ops event_ops = { 172static struct perf_event_ops event_ops = {
374 .sample = process_sample_event, 173 .sample = process_sample_event,
375 .mmap = event__process_mmap, 174 .mmap = perf_event__process_mmap,
376 .comm = event__process_comm, 175 .comm = perf_event__process_comm,
377 .fork = event__process_task, 176 .fork = perf_event__process_task,
378 .ordered_samples = true, 177 .ordered_samples = true,
379 .ordering_requires_timestamps = true, 178 .ordering_requires_timestamps = true,
380}; 179};
@@ -383,6 +182,8 @@ static int __cmd_annotate(void)
383{ 182{
384 int ret; 183 int ret;
385 struct perf_session *session; 184 struct perf_session *session;
185 struct perf_evsel *pos;
186 u64 total_nr_samples;
386 187
387 session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops); 188 session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops);
388 if (session == NULL) 189 if (session == NULL)
@@ -403,12 +204,36 @@ static int __cmd_annotate(void)
403 if (verbose > 2) 204 if (verbose > 2)
404 perf_session__fprintf_dsos(session, stdout); 205 perf_session__fprintf_dsos(session, stdout);
405 206
406 hists__collapse_resort(&session->hists); 207 total_nr_samples = 0;
407 hists__output_resort(&session->hists); 208 list_for_each_entry(pos, &session->evlist->entries, node) {
408 hists__find_annotations(&session->hists); 209 struct hists *hists = &pos->hists;
409out_delete: 210 u32 nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE];
410 perf_session__delete(session); 211
212 if (nr_samples > 0) {
213 total_nr_samples += nr_samples;
214 hists__collapse_resort(hists);
215 hists__output_resort(hists);
216 hists__find_annotations(hists, pos->idx);
217 }
218 }
411 219
220 if (total_nr_samples == 0) {
221 ui__warning("The %s file has no samples!\n", input_name);
222 goto out_delete;
223 }
224out_delete:
225 /*
226 * Speed up the exit process, for large files this can
227 * take quite a while.
228 *
229 * XXX Enable this when using valgrind or if we ever
230 * librarize this command.
231 *
232 * Also experiment with obstacks to see how much speed
233 * up we'll get here.
234 *
235 * perf_session__delete(session);
236 */
412 return ret; 237 return ret;
413} 238}
414 239
@@ -451,9 +276,9 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used)
451 else if (use_tui) 276 else if (use_tui)
452 use_browser = 1; 277 use_browser = 1;
453 278
454 setup_browser(); 279 setup_browser(true);
455 280
456 symbol_conf.priv_size = sizeof(struct sym_priv); 281 symbol_conf.priv_size = sizeof(struct annotation);
457 symbol_conf.try_vmlinux_path = true; 282 symbol_conf.try_vmlinux_path = true;
458 283
459 if (symbol__init() < 0) 284 if (symbol__init() < 0)
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 3153e492dbc..6b7d91160ec 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -30,13 +30,13 @@ static int hists__add_entry(struct hists *self,
30 return -ENOMEM; 30 return -ENOMEM;
31} 31}
32 32
33static int diff__process_sample_event(event_t *event, 33static int diff__process_sample_event(union perf_event *event,
34 struct sample_data *sample, 34 struct perf_sample *sample,
35 struct perf_session *session) 35 struct perf_session *session)
36{ 36{
37 struct addr_location al; 37 struct addr_location al;
38 38
39 if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) { 39 if (perf_event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
40 pr_warning("problem processing %d event, skipping it.\n", 40 pr_warning("problem processing %d event, skipping it.\n",
41 event->header.type); 41 event->header.type);
42 return -1; 42 return -1;
@@ -56,11 +56,11 @@ static int diff__process_sample_event(event_t *event,
56 56
57static struct perf_event_ops event_ops = { 57static struct perf_event_ops event_ops = {
58 .sample = diff__process_sample_event, 58 .sample = diff__process_sample_event,
59 .mmap = event__process_mmap, 59 .mmap = perf_event__process_mmap,
60 .comm = event__process_comm, 60 .comm = perf_event__process_comm,
61 .exit = event__process_task, 61 .exit = perf_event__process_task,
62 .fork = event__process_task, 62 .fork = perf_event__process_task,
63 .lost = event__process_lost, 63 .lost = perf_event__process_lost,
64 .ordered_samples = true, 64 .ordered_samples = true,
65 .ordering_requires_timestamps = true, 65 .ordering_requires_timestamps = true,
66}; 66};
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 0c78ffa7bf6..e29f04ed339 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -16,8 +16,8 @@
16static char const *input_name = "-"; 16static char const *input_name = "-";
17static bool inject_build_ids; 17static bool inject_build_ids;
18 18
19static int event__repipe_synth(event_t *event, 19static int perf_event__repipe_synth(union perf_event *event,
20 struct perf_session *session __used) 20 struct perf_session *session __used)
21{ 21{
22 uint32_t size; 22 uint32_t size;
23 void *buf = event; 23 void *buf = event;
@@ -36,41 +36,44 @@ static int event__repipe_synth(event_t *event,
36 return 0; 36 return 0;
37} 37}
38 38
39static int event__repipe(event_t *event, struct sample_data *sample __used, 39static int perf_event__repipe(union perf_event *event,
40 struct perf_session *session) 40 struct perf_sample *sample __used,
41 struct perf_session *session)
41{ 42{
42 return event__repipe_synth(event, session); 43 return perf_event__repipe_synth(event, session);
43} 44}
44 45
45static int event__repipe_mmap(event_t *self, struct sample_data *sample, 46static int perf_event__repipe_mmap(union perf_event *event,
46 struct perf_session *session) 47 struct perf_sample *sample,
48 struct perf_session *session)
47{ 49{
48 int err; 50 int err;
49 51
50 err = event__process_mmap(self, sample, session); 52 err = perf_event__process_mmap(event, sample, session);
51 event__repipe(self, sample, session); 53 perf_event__repipe(event, sample, session);
52 54
53 return err; 55 return err;
54} 56}
55 57
56static int event__repipe_task(event_t *self, struct sample_data *sample, 58static int perf_event__repipe_task(union perf_event *event,
57 struct perf_session *session) 59 struct perf_sample *sample,
60 struct perf_session *session)
58{ 61{
59 int err; 62 int err;
60 63
61 err = event__process_task(self, sample, session); 64 err = perf_event__process_task(event, sample, session);
62 event__repipe(self, sample, session); 65 perf_event__repipe(event, sample, session);
63 66
64 return err; 67 return err;
65} 68}
66 69
67static int event__repipe_tracing_data(event_t *self, 70static int perf_event__repipe_tracing_data(union perf_event *event,
68 struct perf_session *session) 71 struct perf_session *session)
69{ 72{
70 int err; 73 int err;
71 74
72 event__repipe_synth(self, session); 75 perf_event__repipe_synth(event, session);
73 err = event__process_tracing_data(self, session); 76 err = perf_event__process_tracing_data(event, session);
74 77
75 return err; 78 return err;
76} 79}
@@ -109,8 +112,8 @@ static int dso__inject_build_id(struct dso *self, struct perf_session *session)
109 if (self->kernel) 112 if (self->kernel)
110 misc = PERF_RECORD_MISC_KERNEL; 113 misc = PERF_RECORD_MISC_KERNEL;
111 114
112 err = event__synthesize_build_id(self, misc, event__repipe, 115 err = perf_event__synthesize_build_id(self, misc, perf_event__repipe,
113 machine, session); 116 machine, session);
114 if (err) { 117 if (err) {
115 pr_err("Can't synthesize build_id event for %s\n", self->long_name); 118 pr_err("Can't synthesize build_id event for %s\n", self->long_name);
116 return -1; 119 return -1;
@@ -119,8 +122,9 @@ static int dso__inject_build_id(struct dso *self, struct perf_session *session)
119 return 0; 122 return 0;
120} 123}
121 124
122static int event__inject_buildid(event_t *event, struct sample_data *sample, 125static int perf_event__inject_buildid(union perf_event *event,
123 struct perf_session *session) 126 struct perf_sample *sample,
127 struct perf_session *session)
124{ 128{
125 struct addr_location al; 129 struct addr_location al;
126 struct thread *thread; 130 struct thread *thread;
@@ -155,24 +159,24 @@ static int event__inject_buildid(event_t *event, struct sample_data *sample,
155 } 159 }
156 160
157repipe: 161repipe:
158 event__repipe(event, sample, session); 162 perf_event__repipe(event, sample, session);
159 return 0; 163 return 0;
160} 164}
161 165
162struct perf_event_ops inject_ops = { 166struct perf_event_ops inject_ops = {
163 .sample = event__repipe, 167 .sample = perf_event__repipe,
164 .mmap = event__repipe, 168 .mmap = perf_event__repipe,
165 .comm = event__repipe, 169 .comm = perf_event__repipe,
166 .fork = event__repipe, 170 .fork = perf_event__repipe,
167 .exit = event__repipe, 171 .exit = perf_event__repipe,
168 .lost = event__repipe, 172 .lost = perf_event__repipe,
169 .read = event__repipe, 173 .read = perf_event__repipe,
170 .throttle = event__repipe, 174 .throttle = perf_event__repipe,
171 .unthrottle = event__repipe, 175 .unthrottle = perf_event__repipe,
172 .attr = event__repipe_synth, 176 .attr = perf_event__repipe_synth,
173 .event_type = event__repipe_synth, 177 .event_type = perf_event__repipe_synth,
174 .tracing_data = event__repipe_synth, 178 .tracing_data = perf_event__repipe_synth,
175 .build_id = event__repipe_synth, 179 .build_id = perf_event__repipe_synth,
176}; 180};
177 181
178extern volatile int session_done; 182extern volatile int session_done;
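The inject_ops table maps every event type to a repipe handler, so the tool is a pass-through filter that rewrites only what it must; a conceptual sketch of that shape (plain Python, not the perf API):

# conceptual model of builtin-inject: every record is repiped to stdout,
# and an optional transform (e.g. build-id injection) rewrites some first
import sys

def inject(transform=None):
    for record in sys.stdin.buffer:          # one record per line, for the sketch
        if transform is not None:
            record = transform(record)
        sys.stdout.buffer.write(record)

if __name__ == "__main__":
    inject()    # no transform: a pure repipe, like perf_event__repipe()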
@@ -190,10 +194,10 @@ static int __cmd_inject(void)
190 signal(SIGINT, sig_handler); 194 signal(SIGINT, sig_handler);
191 195
192 if (inject_build_ids) { 196 if (inject_build_ids) {
193 inject_ops.sample = event__inject_buildid; 197 inject_ops.sample = perf_event__inject_buildid;
194 inject_ops.mmap = event__repipe_mmap; 198 inject_ops.mmap = perf_event__repipe_mmap;
195 inject_ops.fork = event__repipe_task; 199 inject_ops.fork = perf_event__repipe_task;
196 inject_ops.tracing_data = event__repipe_tracing_data; 200 inject_ops.tracing_data = perf_event__repipe_tracing_data;
197 } 201 }
198 202
199 session = perf_session__new(input_name, O_RDONLY, false, true, &inject_ops); 203 session = perf_session__new(input_name, O_RDONLY, false, true, &inject_ops);
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index d97256d6598..7f618f4e7b7 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -275,9 +275,8 @@ static void process_free_event(void *data,
275 s_alloc->alloc_cpu = -1; 275 s_alloc->alloc_cpu = -1;
276} 276}
277 277
278static void 278static void process_raw_event(union perf_event *raw_event __used, void *data,
279process_raw_event(event_t *raw_event __used, void *data, 279 int cpu, u64 timestamp, struct thread *thread)
280 int cpu, u64 timestamp, struct thread *thread)
281{ 280{
282 struct event *event; 281 struct event *event;
283 int type; 282 int type;
@@ -304,7 +303,8 @@ process_raw_event(event_t *raw_event __used, void *data,
304 } 303 }
305} 304}
306 305
307static int process_sample_event(event_t *event, struct sample_data *sample, 306static int process_sample_event(union perf_event *event,
307 struct perf_sample *sample,
308 struct perf_session *session) 308 struct perf_session *session)
309{ 309{
310 struct thread *thread = perf_session__findnew(session, event->ip.pid); 310 struct thread *thread = perf_session__findnew(session, event->ip.pid);
@@ -325,7 +325,7 @@ static int process_sample_event(event_t *event, struct sample_data *sample,
325 325
326static struct perf_event_ops event_ops = { 326static struct perf_event_ops event_ops = {
327 .sample = process_sample_event, 327 .sample = process_sample_event,
328 .comm = event__process_comm, 328 .comm = perf_event__process_comm,
329 .ordered_samples = true, 329 .ordered_samples = true,
330}; 330};
331 331
diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c
index d88c6961274..6313b6eb3eb 100644
--- a/tools/perf/builtin-list.c
+++ b/tools/perf/builtin-list.c
@@ -5,6 +5,7 @@
5 * 5 *
6 * Copyright (C) 2009, Thomas Gleixner <tglx@linutronix.de> 6 * Copyright (C) 2009, Thomas Gleixner <tglx@linutronix.de>
7 * Copyright (C) 2008-2009, Red Hat Inc, Ingo Molnar <mingo@redhat.com> 7 * Copyright (C) 2008-2009, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
8 * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
8 */ 9 */
9#include "builtin.h" 10#include "builtin.h"
10 11
@@ -13,9 +14,47 @@
13#include "util/parse-events.h" 14#include "util/parse-events.h"
14#include "util/cache.h" 15#include "util/cache.h"
15 16
16int cmd_list(int argc __used, const char **argv __used, const char *prefix __used) 17int cmd_list(int argc, const char **argv, const char *prefix __used)
17{ 18{
18 setup_pager(); 19 setup_pager();
19 print_events(); 20
21 if (argc == 1)
22 print_events(NULL);
23 else {
24 int i;
25
26 for (i = 1; i < argc; ++i) {
27 if (i > 1)
28 putchar('\n');
29 if (strncmp(argv[i], "tracepoint", 10) == 0)
30 print_tracepoint_events(NULL, NULL);
31 else if (strcmp(argv[i], "hw") == 0 ||
32 strcmp(argv[i], "hardware") == 0)
33 print_events_type(PERF_TYPE_HARDWARE);
34 else if (strcmp(argv[i], "sw") == 0 ||
35 strcmp(argv[i], "software") == 0)
36 print_events_type(PERF_TYPE_SOFTWARE);
37 else if (strcmp(argv[i], "cache") == 0 ||
38 strcmp(argv[i], "hwcache") == 0)
39 print_hwcache_events(NULL);
40 else {
41 char *sep = strchr(argv[i], ':'), *s;
42 int sep_idx;
43
44 if (sep == NULL) {
45 print_events(argv[i]);
46 continue;
47 }
48 sep_idx = sep - argv[i];
49 s = strdup(argv[i]);
50 if (s == NULL)
51 return -1;
52
53 s[sep_idx] = '\0';
54 print_tracepoint_events(s, s + sep_idx + 1);
55 free(s);
56 }
57 }
58 }
20 return 0; 59 return 0;
21} 60}
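Restated compactly, the new cmd_list() accepts event-class keywords and "subsys:event" globs; an illustrative Python mirror of the dispatch (argument spellings taken from the C above):

# illustrative mirror of the new cmd_list() argument handling
def classify(arg):
    if arg.startswith("tracepoint"):
        return ("tracepoints", None, None)
    if arg in ("hw", "hardware"):
        return ("hardware", None, None)
    if arg in ("sw", "software"):
        return ("software", None, None)
    if arg in ("cache", "hwcache"):
        return ("hwcache", None, None)
    if ":" in arg:                     # "subsys:event" narrows the tracepoint list
        subsys, _, event = arg.partition(":")
        return ("tracepoints", subsys, event)
    return ("glob", arg, None)         # anything else is matched as a name glob

for arg in ("hw", "kmem:kmalloc", "sched"):
    print(arg, "->", classify(arg))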
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 2b36defc5d7..2e93f99b148 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -834,14 +834,14 @@ static void dump_info(void)
834 die("Unknown type of information\n"); 834 die("Unknown type of information\n");
835} 835}
836 836
837static int process_sample_event(event_t *self, struct sample_data *sample, 837static int process_sample_event(union perf_event *event, struct perf_sample *sample,
838 struct perf_session *s) 838 struct perf_session *s)
839{ 839{
840 struct thread *thread = perf_session__findnew(s, sample->tid); 840 struct thread *thread = perf_session__findnew(s, sample->tid);
841 841
842 if (thread == NULL) { 842 if (thread == NULL) {
843 pr_debug("problem processing %d event, skipping it.\n", 843 pr_debug("problem processing %d event, skipping it.\n",
844 self->header.type); 844 event->header.type);
845 return -1; 845 return -1;
846 } 846 }
847 847
@@ -852,7 +852,7 @@ static int process_sample_event(event_t *self, struct sample_data *sample,
852 852
853static struct perf_event_ops eops = { 853static struct perf_event_ops eops = {
854 .sample = process_sample_event, 854 .sample = process_sample_event,
855 .comm = event__process_comm, 855 .comm = perf_event__process_comm,
856 .ordered_samples = true, 856 .ordered_samples = true,
857}; 857};
858 858
@@ -893,7 +893,7 @@ static const char * const report_usage[] = {
 
 static const struct option report_options[] = {
 	OPT_STRING('k', "key", &sort_key, "acquired",
-		   "key for sorting"),
+		   "key for sorting (acquired / contended / wait_total / wait_max / wait_min)"),
 	/* TODO: type */
 	OPT_END()
 };
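
The widened help string above enumerates the sort keys that -k/--key accepts. A usage sketch (hypothetical invocation, not part of this patch):

    $ perf lock report -k wait_max    # sort the contention report by longest wait
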
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index add163c9f0e..2c0e64d0b4a 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -36,6 +36,7 @@
 #include "builtin.h"
 #include "util/util.h"
 #include "util/strlist.h"
+#include "util/strfilter.h"
 #include "util/symbol.h"
 #include "util/debug.h"
 #include "util/debugfs.h"
@@ -43,6 +44,8 @@
 #include "util/probe-finder.h"
 #include "util/probe-event.h"
 
+#define DEFAULT_VAR_FILTER "!__k???tab_* & !__crc_*"
+#define DEFAULT_FUNC_FILTER "!_*"
 #define MAX_PATH_LEN 256
 
 /* Session management structure */
@@ -52,6 +55,7 @@ static struct {
 	bool show_lines;
 	bool show_vars;
 	bool show_ext_vars;
+	bool show_funcs;
 	bool mod_events;
 	int nevents;
 	struct perf_probe_event events[MAX_PROBES];
@@ -59,6 +63,7 @@ static struct {
 	struct line_range line_range;
 	const char *target_module;
 	int max_probe_points;
+	struct strfilter *filter;
 } params;
 
 /* Parse an event definition. Note that any error must die. */
@@ -157,6 +162,27 @@ static int opt_show_vars(const struct option *opt __used,
 }
 #endif
 
+static int opt_set_filter(const struct option *opt __used,
+			  const char *str, int unset __used)
+{
+	const char *err;
+
+	if (str) {
+		pr_debug2("Set filter: %s\n", str);
+		if (params.filter)
+			strfilter__delete(params.filter);
+		params.filter = strfilter__new(str, &err);
+		if (!params.filter) {
+			pr_err("Filter parse error at %td.\n", err - str + 1);
+			pr_err("Source: \"%s\"\n", str);
+			pr_err("         %*c\n", (int)(err - str + 1), '^');
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static const char * const probe_usage[] = {
 	"perf probe [<options>] 'PROBEDEF' ['PROBEDEF' ...]",
 	"perf probe [<options>] --add 'PROBEDEF' [--add 'PROBEDEF' ...]",
@@ -221,6 +247,13 @@ static const struct option options[] = {
 	OPT__DRY_RUN(&probe_event_dry_run),
 	OPT_INTEGER('\0', "max-probes", &params.max_probe_points,
 		   "Set how many probe points can be found for a probe."),
+	OPT_BOOLEAN('F', "funcs", &params.show_funcs,
+		    "Show potential probe-able functions."),
+	OPT_CALLBACK('\0', "filter", NULL,
+		     "[!]FILTER", "Set a filter (with --vars/funcs only)\n"
+		     "\t\t\t(default: \"" DEFAULT_VAR_FILTER "\" for --vars,\n"
+		     "\t\t\t \"" DEFAULT_FUNC_FILTER "\" for --funcs)",
+		     opt_set_filter),
 	OPT_END()
 };
 
@@ -246,7 +279,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 	params.max_probe_points = MAX_PROBES;
 
 	if ((!params.nevents && !params.dellist && !params.list_events &&
-	     !params.show_lines))
+	     !params.show_lines && !params.show_funcs))
 		usage_with_options(probe_usage, options);
 
 	/*
@@ -267,12 +300,41 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 			pr_err(" Error: Don't use --list with --vars.\n");
 			usage_with_options(probe_usage, options);
 		}
+		if (params.show_funcs) {
+			pr_err(" Error: Don't use --list with --funcs.\n");
+			usage_with_options(probe_usage, options);
+		}
 		ret = show_perf_probe_events();
 		if (ret < 0)
 			pr_err(" Error: Failed to show event list. (%d)\n",
 			       ret);
 		return ret;
 	}
+	if (params.show_funcs) {
+		if (params.nevents != 0 || params.dellist) {
+			pr_err(" Error: Don't use --funcs with"
+			       " --add/--del.\n");
+			usage_with_options(probe_usage, options);
+		}
+		if (params.show_lines) {
+			pr_err(" Error: Don't use --funcs with --line.\n");
+			usage_with_options(probe_usage, options);
+		}
+		if (params.show_vars) {
+			pr_err(" Error: Don't use --funcs with --vars.\n");
+			usage_with_options(probe_usage, options);
+		}
+		if (!params.filter)
+			params.filter = strfilter__new(DEFAULT_FUNC_FILTER,
+						       NULL);
+		ret = show_available_funcs(params.target_module,
+					   params.filter);
+		strfilter__delete(params.filter);
+		if (ret < 0)
+			pr_err(" Error: Failed to show functions."
+			       " (%d)\n", ret);
+		return ret;
+	}
 
 #ifdef DWARF_SUPPORT
 	if (params.show_lines) {
@@ -297,10 +359,16 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 			       " --add/--del.\n");
 			usage_with_options(probe_usage, options);
 		}
+		if (!params.filter)
+			params.filter = strfilter__new(DEFAULT_VAR_FILTER,
+						       NULL);
+
 		ret = show_available_vars(params.events, params.nevents,
 					  params.max_probe_points,
 					  params.target_module,
+					  params.filter,
 					  params.show_ext_vars);
+		strfilter__delete(params.filter);
 		if (ret < 0)
 			pr_err(" Error: Failed to show vars. (%d)\n", ret);
 		return ret;
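
The filters installed by opt_set_filter() above are strfilter expressions: glob patterns that may be negated with '!' and combined with '&', as in the defaults ("!__k???tab_* & !__crc_*" hides export-table and CRC bookkeeping symbols for --vars, "!_*" hides underscore-prefixed internals for --funcs). A usage sketch (hypothetical invocations, not part of this patch):

    $ perf probe --funcs --filter 'sched_*'            # probe-able functions matching sched_*
    $ perf probe --vars do_sys_open --filter '!__k???tab_*'
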
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 60cac6f92e8..6febcc168a8 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -18,11 +18,13 @@
 
 #include "util/header.h"
 #include "util/event.h"
+#include "util/evlist.h"
 #include "util/evsel.h"
 #include "util/debug.h"
 #include "util/session.h"
 #include "util/symbol.h"
 #include "util/cpumap.h"
+#include "util/thread_map.h"
 
 #include <unistd.h>
 #include <sched.h>
@@ -37,16 +39,14 @@ enum write_mode_t {
 
 static u64 user_interval = ULLONG_MAX;
 static u64 default_interval = 0;
-static u64 sample_type;
 
-static struct cpu_map *cpus;
 static unsigned int page_size;
 static unsigned int mmap_pages = 128;
 static unsigned int user_freq = UINT_MAX;
 static int freq = 1000;
 static int output;
 static int pipe_output = 0;
-static const char *output_name = "perf.data";
+static const char *output_name = NULL;
 static int group = 0;
 static int realtime_prio = 0;
 static bool nodelay = false;
@@ -55,7 +55,6 @@ static bool sample_id_all_avail = true;
 static bool system_wide = false;
 static pid_t target_pid = -1;
 static pid_t target_tid = -1;
-static struct thread_map *threads;
 static pid_t child_pid = -1;
 static bool no_inherit = false;
 static enum write_mode_t write_mode = WRITE_FORCE;
@@ -66,51 +65,17 @@ static bool sample_address = false;
 static bool sample_time = false;
 static bool no_buildid = false;
 static bool no_buildid_cache = false;
+static struct perf_evlist *evsel_list;
 
 static long samples = 0;
 static u64 bytes_written = 0;
 
-static struct pollfd *event_array;
-
-static int nr_poll = 0;
-static int nr_cpu = 0;
-
 static int file_new = 1;
 static off_t post_processing_offset;
 
 static struct perf_session *session;
 static const char *cpu_list;
 
-struct mmap_data {
-	void *base;
-	unsigned int mask;
-	unsigned int prev;
-};
-
-static struct mmap_data mmap_array[MAX_NR_CPUS];
-
-static unsigned long mmap_read_head(struct mmap_data *md)
-{
-	struct perf_event_mmap_page *pc = md->base;
-	long head;
-
-	head = pc->data_head;
-	rmb();
-
-	return head;
-}
-
-static void mmap_write_tail(struct mmap_data *md, unsigned long tail)
-{
-	struct perf_event_mmap_page *pc = md->base;
-
-	/*
-	 * ensure all reads are done before we write the tail out.
-	 */
-	/* mb(); */
-	pc->data_tail = tail;
-}
-
 static void advance_output(size_t size)
 {
 	bytes_written += size;
@@ -131,42 +96,26 @@ static void write_output(void *buf, size_t size)
 	}
 }
 
-static int process_synthesized_event(event_t *event,
-				     struct sample_data *sample __used,
+static int process_synthesized_event(union perf_event *event,
+				     struct perf_sample *sample __used,
 				     struct perf_session *self __used)
 {
 	write_output(event, event->header.size);
 	return 0;
 }
 
-static void mmap_read(struct mmap_data *md)
+static void mmap_read(struct perf_mmap *md)
 {
-	unsigned int head = mmap_read_head(md);
+	unsigned int head = perf_mmap__read_head(md);
 	unsigned int old = md->prev;
 	unsigned char *data = md->base + page_size;
 	unsigned long size;
 	void *buf;
-	int diff;
 
-	/*
-	 * If we're further behind than half the buffer, there's a chance
-	 * the writer will bite our tail and mess up the samples under us.
-	 *
-	 * If we somehow ended up ahead of the head, we got messed up.
-	 *
-	 * In either case, truncate and restart at head.
-	 */
-	diff = head - old;
-	if (diff < 0) {
-		fprintf(stderr, "WARNING: failed to keep up with mmap data\n");
-		/*
-		 * head points to a known good entry, start there.
-		 */
-		old = head;
-	}
+	if (old == head)
+		return;
 
-	if (old != head)
-		samples++;
+	samples++;
 
 	size = head - old;
 
@@ -185,7 +134,7 @@ static void mmap_read(struct mmap_data *md)
 	write_output(buf, size);
 
 	md->prev = old;
-	mmap_write_tail(md, old);
+	perf_mmap__write_tail(md, old);
 }
 
 static volatile int done = 0;
@@ -209,53 +158,10 @@ static void sig_atexit(void)
 		kill(getpid(), signr);
 }
 
-static int group_fd;
-
-static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int nr)
-{
-	struct perf_header_attr *h_attr;
-
-	if (nr < session->header.attrs) {
-		h_attr = session->header.attr[nr];
-	} else {
-		h_attr = perf_header_attr__new(a);
-		if (h_attr != NULL)
-			if (perf_header__add_attr(&session->header, h_attr) < 0) {
-				perf_header_attr__delete(h_attr);
-				h_attr = NULL;
-			}
-	}
-
-	return h_attr;
-}
-
-static void create_counter(struct perf_evsel *evsel, int cpu)
+static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
 {
-	char *filter = evsel->filter;
 	struct perf_event_attr *attr = &evsel->attr;
-	struct perf_header_attr *h_attr;
 	int track = !evsel->idx; /* only the first counter needs these */
-	int thread_index;
-	int ret;
-	struct {
-		u64 count;
-		u64 time_enabled;
-		u64 time_running;
-		u64 id;
-	} read_data;
-	/*
-	 * Check if parse_single_tracepoint_event has already asked for
-	 * PERF_SAMPLE_TIME.
-	 *
-	 * XXX this is kludgy but short term fix for problems introduced by
-	 * eac23d1c that broke 'perf script' by having different sample_types
-	 * when using multiple tracepoint events when we use a perf binary
-	 * that tries to use sample_id_all on an older kernel.
-	 *
-	 * We need to move counter creation to perf_session, support
-	 * different sample_types, etc.
-	 */
-	bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
 
 	attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
 			    PERF_FORMAT_TOTAL_TIME_RUNNING |
@@ -263,7 +169,7 @@ static void create_counter(struct perf_evsel *evsel, int cpu)
 
 	attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
 
-	if (nr_counters > 1)
+	if (evlist->nr_entries > 1)
 		attr->sample_type |= PERF_SAMPLE_ID;
 
 	/*
@@ -315,19 +221,58 @@ static void create_counter(struct perf_evsel *evsel, int cpu)
 
 	attr->mmap = track;
 	attr->comm = track;
-	attr->inherit = !no_inherit;
+
 	if (target_pid == -1 && target_tid == -1 && !system_wide) {
 		attr->disabled = 1;
 		attr->enable_on_exec = 1;
 	}
-retry_sample_id:
-	attr->sample_id_all = sample_id_all_avail ? 1 : 0;
+}
 
-	for (thread_index = 0; thread_index < threads->nr; thread_index++) {
-try_again:
-		FD(evsel, nr_cpu, thread_index) = sys_perf_event_open(attr, threads->map[thread_index], cpu, group_fd, 0);
+static bool perf_evlist__equal(struct perf_evlist *evlist,
+			       struct perf_evlist *other)
+{
+	struct perf_evsel *pos, *pair;
+
+	if (evlist->nr_entries != other->nr_entries)
+		return false;
+
+	pair = list_entry(other->entries.next, struct perf_evsel, node);
 
-		if (FD(evsel, nr_cpu, thread_index) < 0) {
+	list_for_each_entry(pos, &evlist->entries, node) {
+		if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr) != 0))
+			return false;
+		pair = list_entry(pair->node.next, struct perf_evsel, node);
+	}
+
+	return true;
+}
+
+static void open_counters(struct perf_evlist *evlist)
+{
+	struct perf_evsel *pos;
+
+	list_for_each_entry(pos, &evlist->entries, node) {
+		struct perf_event_attr *attr = &pos->attr;
+		/*
+		 * Check if parse_single_tracepoint_event has already asked for
+		 * PERF_SAMPLE_TIME.
+		 *
+		 * XXX this is kludgy but short term fix for problems introduced by
+		 * eac23d1c that broke 'perf script' by having different sample_types
+		 * when using multiple tracepoint events when we use a perf binary
+		 * that tries to use sample_id_all on an older kernel.
+		 *
+		 * We need to move counter creation to perf_session, support
+		 * different sample_types, etc.
+		 */
+		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
+
+		config_attr(pos, evlist);
+retry_sample_id:
+		attr->sample_id_all = sample_id_all_avail ? 1 : 0;
+try_again:
+		if (perf_evsel__open(pos, evlist->cpus, evlist->threads, group,
+				     !no_inherit) < 0) {
 			int err = errno;
 
 			if (err == EPERM || err == EACCES)
@@ -364,7 +309,7 @@ try_again:
 			}
 			printf("\n");
 			error("sys_perf_event_open() syscall returned with %d (%s).  /bin/dmesg may provide additional information.\n",
-			      FD(evsel, nr_cpu, thread_index), strerror(err));
+			      err, strerror(err));
 
 #if defined(__i386__) || defined(__x86_64__)
 			if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
@@ -375,90 +320,28 @@ try_again:
 #endif
 
 			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
-			exit(-1);
 		}
+	}
 
-		h_attr = get_header_attr(attr, evsel->idx);
-		if (h_attr == NULL)
-			die("nomem\n");
+	if (perf_evlist__set_filters(evlist)) {
+		error("failed to set filter with %d (%s)\n", errno,
+		      strerror(errno));
+		exit(-1);
+	}
 
-		if (!file_new) {
-			if (memcmp(&h_attr->attr, attr, sizeof(*attr))) {
-				fprintf(stderr, "incompatible append\n");
-				exit(-1);
-			}
-		}
+	if (perf_evlist__mmap(evlist, mmap_pages, false) < 0)
+		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 
-		if (read(FD(evsel, nr_cpu, thread_index), &read_data, sizeof(read_data)) == -1) {
-			perror("Unable to read perf file descriptor");
+	if (file_new)
+		session->evlist = evlist;
+	else {
+		if (!perf_evlist__equal(session->evlist, evlist)) {
+			fprintf(stderr, "incompatible append\n");
 			exit(-1);
 		}
+	}
 
-		if (perf_header_attr__add_id(h_attr, read_data.id) < 0) {
-			pr_warning("Not enough memory to add id\n");
-			exit(-1);
-		}
-
-		assert(FD(evsel, nr_cpu, thread_index) >= 0);
-		fcntl(FD(evsel, nr_cpu, thread_index), F_SETFL, O_NONBLOCK);
-
-		/*
-		 * First counter acts as the group leader:
-		 */
-		if (group && group_fd == -1)
-			group_fd = FD(evsel, nr_cpu, thread_index);
-
-		if (evsel->idx || thread_index) {
-			struct perf_evsel *first;
-			first = list_entry(evsel_list.next, struct perf_evsel, node);
-			ret = ioctl(FD(evsel, nr_cpu, thread_index),
-				    PERF_EVENT_IOC_SET_OUTPUT,
-				    FD(first, nr_cpu, 0));
-			if (ret) {
-				error("failed to set output: %d (%s)\n", errno,
-				      strerror(errno));
-				exit(-1);
-			}
-		} else {
-			mmap_array[nr_cpu].prev = 0;
-			mmap_array[nr_cpu].mask = mmap_pages*page_size - 1;
-			mmap_array[nr_cpu].base = mmap(NULL, (mmap_pages+1)*page_size,
-				PROT_READ | PROT_WRITE, MAP_SHARED, FD(evsel, nr_cpu, thread_index), 0);
-			if (mmap_array[nr_cpu].base == MAP_FAILED) {
-				error("failed to mmap with %d (%s)\n", errno, strerror(errno));
-				exit(-1);
-			}
-
-			event_array[nr_poll].fd = FD(evsel, nr_cpu, thread_index);
-			event_array[nr_poll].events = POLLIN;
-			nr_poll++;
-		}
-
-		if (filter != NULL) {
-			ret = ioctl(FD(evsel, nr_cpu, thread_index),
-				    PERF_EVENT_IOC_SET_FILTER, filter);
-			if (ret) {
-				error("failed to set filter with %d (%s)\n", errno,
-				      strerror(errno));
-				exit(-1);
-			}
-		}
-	}
-
-	if (!sample_type)
-		sample_type = attr->sample_type;
-}
-
-static void open_counters(int cpu)
-{
-	struct perf_evsel *pos;
-
-	group_fd = -1;
-
-	list_for_each_entry(pos, &evsel_list, node)
-		create_counter(pos, cpu);
-
-	nr_cpu++;
+	perf_session__update_sample_type(session);
 }
 
 static int process_buildids(void)
@@ -481,14 +364,14 @@ static void atexit_header(void)
 
 		if (!no_buildid)
 			process_buildids();
-		perf_header__write(&session->header, output, true);
+		perf_session__write_header(session, evsel_list, output, true);
 		perf_session__delete(session);
-		perf_evsel_list__delete();
+		perf_evlist__delete(evsel_list);
 		symbol__exit();
 	}
 }
 
-static void event__synthesize_guest_os(struct machine *machine, void *data)
+static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
 {
 	int err;
 	struct perf_session *psession = data;
@@ -504,8 +387,8 @@ static void event__synthesize_guest_os(struct machine *machine, void *data)
 	 *method is used to avoid symbol missing when the first addr is
 	 *in module instead of in guest kernel.
 	 */
-	err = event__synthesize_modules(process_synthesized_event,
-					psession, machine);
+	err = perf_event__synthesize_modules(process_synthesized_event,
+					     psession, machine);
 	if (err < 0)
 		pr_err("Couldn't record guest kernel [%d]'s reference"
 		       " relocation symbol.\n", machine->pid);
@@ -514,11 +397,12 @@ static void event__synthesize_guest_os(struct machine *machine, void *data)
 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
 	 * have no _text sometimes.
 	 */
-	err = event__synthesize_kernel_mmap(process_synthesized_event,
-					    psession, machine, "_text");
+	err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
+						 psession, machine, "_text");
 	if (err < 0)
-		err = event__synthesize_kernel_mmap(process_synthesized_event,
-						    psession, machine, "_stext");
+		err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
+							 psession, machine,
+							 "_stext");
 	if (err < 0)
 		pr_err("Couldn't record guest kernel [%d]'s reference"
 		       " relocation symbol.\n", machine->pid);
@@ -533,9 +417,9 @@ static void mmap_read_all(void)
 {
 	int i;
 
-	for (i = 0; i < nr_cpu; i++) {
-		if (mmap_array[i].base)
-			mmap_read(&mmap_array[i]);
+	for (i = 0; i < evsel_list->cpus->nr; i++) {
+		if (evsel_list->mmap[i].base)
+			mmap_read(&evsel_list->mmap[i]);
 	}
 
 	if (perf_header__has_feat(&session->header, HEADER_TRACE_INFO))
@@ -566,18 +450,26 @@ static int __cmd_record(int argc, const char **argv)
 		exit(-1);
 	}
 
-	if (!strcmp(output_name, "-"))
-		pipe_output = 1;
-	else if (!stat(output_name, &st) && st.st_size) {
-		if (write_mode == WRITE_FORCE) {
-			char oldname[PATH_MAX];
-			snprintf(oldname, sizeof(oldname), "%s.old",
-				 output_name);
-			unlink(oldname);
-			rename(output_name, oldname);
+	if (!output_name) {
+		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
+			pipe_output = 1;
+		else
+			output_name = "perf.data";
+	}
+	if (output_name) {
+		if (!strcmp(output_name, "-"))
+			pipe_output = 1;
+		else if (!stat(output_name, &st) && st.st_size) {
+			if (write_mode == WRITE_FORCE) {
+				char oldname[PATH_MAX];
+				snprintf(oldname, sizeof(oldname), "%s.old",
+					 output_name);
+				unlink(oldname);
+				rename(output_name, oldname);
+			}
+		} else if (write_mode == WRITE_APPEND) {
+			write_mode = WRITE_FORCE;
 		}
-	} else if (write_mode == WRITE_APPEND) {
-		write_mode = WRITE_FORCE;
 	}
 
 	flags = O_CREAT|O_RDWR;
@@ -606,19 +498,14 @@ static int __cmd_record(int argc, const char **argv)
 		perf_header__set_feat(&session->header, HEADER_BUILD_ID);
 
 	if (!file_new) {
-		err = perf_header__read(session, output);
+		err = perf_session__read_header(session, output);
 		if (err < 0)
 			goto out_delete_session;
 	}
 
-	if (have_tracepoints(&evsel_list))
+	if (have_tracepoints(&evsel_list->entries))
 		perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
 
-	/*
-	 * perf_session__delete(session) will be called at atexit_header()
-	 */
-	atexit(atexit_header);
-
 	if (forks) {
 		child_pid = fork();
 		if (child_pid < 0) {
@@ -659,7 +546,7 @@ static int __cmd_record(int argc, const char **argv)
 		}
 
 		if (!system_wide && target_tid == -1 && target_pid == -1)
-			threads->map[0] = child_pid;
+			evsel_list->threads->map[0] = child_pid;
 
 		close(child_ready_pipe[1]);
 		close(go_pipe[0]);
@@ -673,46 +560,42 @@ static int __cmd_record(int argc, const char **argv)
 		close(child_ready_pipe[0]);
 	}
 
-	if (!system_wide && no_inherit && !cpu_list) {
-		open_counters(-1);
-	} else {
-		for (i = 0; i < cpus->nr; i++)
-			open_counters(cpus->map[i]);
-	}
+	open_counters(evsel_list);
 
-	perf_session__set_sample_type(session, sample_type);
+	/*
+	 * perf_session__delete(session) will be called at atexit_header()
+	 */
+	atexit(atexit_header);
 
 	if (pipe_output) {
 		err = perf_header__write_pipe(output);
 		if (err < 0)
 			return err;
 	} else if (file_new) {
-		err = perf_header__write(&session->header, output, false);
+		err = perf_session__write_header(session, evsel_list,
+						 output, false);
 		if (err < 0)
 			return err;
 	}
 
 	post_processing_offset = lseek(output, 0, SEEK_CUR);
 
-	perf_session__set_sample_id_all(session, sample_id_all_avail);
-
 	if (pipe_output) {
-		err = event__synthesize_attrs(&session->header,
-					      process_synthesized_event,
-					      session);
+		err = perf_session__synthesize_attrs(session,
+						     process_synthesized_event);
 		if (err < 0) {
 			pr_err("Couldn't synthesize attrs.\n");
 			return err;
 		}
 
-		err = event__synthesize_event_types(process_synthesized_event,
-						    session);
+		err = perf_event__synthesize_event_types(process_synthesized_event,
+							 session);
 		if (err < 0) {
 			pr_err("Couldn't synthesize event_types.\n");
 			return err;
 		}
 
-		if (have_tracepoints(&evsel_list)) {
+		if (have_tracepoints(&evsel_list->entries)) {
 			/*
 			 * FIXME err <= 0 here actually means that
 			 * there were no tracepoints so its not really
@@ -721,9 +604,9 @@ static int __cmd_record(int argc, const char **argv)
 			 * return this more properly and also
 			 * propagate errors that now are calling die()
 			 */
-			err = event__synthesize_tracing_data(output, &evsel_list,
-							     process_synthesized_event,
-							     session);
+			err = perf_event__synthesize_tracing_data(output, evsel_list,
+								  process_synthesized_event,
+								  session);
 			if (err <= 0) {
 				pr_err("Couldn't record tracing data.\n");
 				return err;
@@ -738,31 +621,34 @@ static int __cmd_record(int argc, const char **argv)
 		return -1;
 	}
 
-	err = event__synthesize_kernel_mmap(process_synthesized_event,
-					    session, machine, "_text");
+	err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
+						 session, machine, "_text");
 	if (err < 0)
-		err = event__synthesize_kernel_mmap(process_synthesized_event,
-						    session, machine, "_stext");
+		err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
+							 session, machine, "_stext");
 	if (err < 0)
 		pr_err("Couldn't record kernel reference relocation symbol\n"
 		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
 		       "Check /proc/kallsyms permission or run as root.\n");
 
-	err = event__synthesize_modules(process_synthesized_event,
-					session, machine);
+	err = perf_event__synthesize_modules(process_synthesized_event,
+					     session, machine);
 	if (err < 0)
 		pr_err("Couldn't record kernel module information.\n"
 		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
 		       "Check /proc/modules permission or run as root.\n");
 
 	if (perf_guest)
-		perf_session__process_machines(session, event__synthesize_guest_os);
+		perf_session__process_machines(session,
+					       perf_event__synthesize_guest_os);
 
 	if (!system_wide)
-		event__synthesize_thread_map(threads, process_synthesized_event,
-					     session);
+		perf_event__synthesize_thread_map(evsel_list->threads,
+						  process_synthesized_event,
+						  session);
 	else
-		event__synthesize_threads(process_synthesized_event, session);
+		perf_event__synthesize_threads(process_synthesized_event,
+					       session);
 
 	if (realtime_prio) {
 		struct sched_param param;
@@ -789,17 +675,17 @@ static int __cmd_record(int argc, const char **argv)
 		if (hits == samples) {
 			if (done)
 				break;
-			err = poll(event_array, nr_poll, -1);
+			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
 			waking++;
 		}
 
 		if (done) {
-			for (i = 0; i < nr_cpu; i++) {
+			for (i = 0; i < evsel_list->cpus->nr; i++) {
 				struct perf_evsel *pos;
 
-				list_for_each_entry(pos, &evsel_list, node) {
+				list_for_each_entry(pos, &evsel_list->entries, node) {
 					for (thread = 0;
-					     thread < threads->nr;
+					     thread < evsel_list->threads->nr;
 					     thread++)
 						ioctl(FD(pos, i, thread),
 						      PERF_EVENT_IOC_DISABLE);
@@ -838,10 +724,10 @@ static const char * const record_usage[] = {
 static bool force, append_file;
 
 const struct option record_options[] = {
-	OPT_CALLBACK('e', "event", NULL, "event",
+	OPT_CALLBACK('e', "event", &evsel_list, "event",
 		     "event selector. use 'perf list' to list available events",
 		     parse_events),
-	OPT_CALLBACK(0, "filter", NULL, "filter",
+	OPT_CALLBACK(0, "filter", &evsel_list, "filter",
 		     "event filter", parse_filter),
 	OPT_INTEGER('p', "pid", &target_pid,
 		    "record events on existing process id"),
@@ -884,6 +770,9 @@ const struct option record_options[] = {
 		    "do not update the buildid cache"),
 	OPT_BOOLEAN('B', "no-buildid", &no_buildid,
 		    "do not collect buildids in perf.data"),
+	OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
+		     "monitor event in cgroup name only",
+		     parse_cgroups),
 	OPT_END()
 };
 
@@ -892,6 +781,10 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 	int err = -ENOMEM;
 	struct perf_evsel *pos;
 
+	evsel_list = perf_evlist__new(NULL, NULL);
+	if (evsel_list == NULL)
+		return -ENOMEM;
+
 	argc = parse_options(argc, argv, record_options, record_usage,
 			    PARSE_OPT_STOP_AT_NON_OPTION);
 	if (!argc && target_pid == -1 && target_tid == -1 &&
@@ -908,12 +801,19 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 		write_mode = WRITE_FORCE;
 	}
 
+	if (nr_cgroups && !system_wide) {
+		fprintf(stderr, "cgroup monitoring only available in"
+			" system-wide mode\n");
+		usage_with_options(record_usage, record_options);
+	}
+
 	symbol__init();
 
 	if (no_buildid_cache || no_buildid)
 		disable_buildid_cache();
 
-	if (list_empty(&evsel_list) && perf_evsel_list__create_default() < 0) {
+	if (evsel_list->nr_entries == 0 &&
+	    perf_evlist__add_default(evsel_list) < 0) {
 		pr_err("Not enough memory for event selector list\n");
 		goto out_symbol_exit;
 	}
@@ -921,27 +821,19 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 	if (target_pid != -1)
 		target_tid = target_pid;
 
-	threads = thread_map__new(target_pid, target_tid);
-	if (threads == NULL) {
-		pr_err("Problems finding threads of monitor\n");
+	if (perf_evlist__create_maps(evsel_list, target_pid,
+				     target_tid, cpu_list) < 0)
 		usage_with_options(record_usage, record_options);
-	}
 
-	cpus = cpu_map__new(cpu_list);
-	if (cpus == NULL) {
-		perror("failed to parse CPUs map");
-		return -1;
-	}
-
-	list_for_each_entry(pos, &evsel_list, node) {
-		if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
+	list_for_each_entry(pos, &evsel_list->entries, node) {
+		if (perf_evsel__alloc_fd(pos, evsel_list->cpus->nr,
+					 evsel_list->threads->nr) < 0)
 			goto out_free_fd;
 		if (perf_header__push_event(pos->attr.config, event_name(pos)))
 			goto out_free_fd;
 	}
-	event_array = malloc((sizeof(struct pollfd) * MAX_NR_CPUS *
-			      MAX_COUNTERS * threads->nr));
-	if (!event_array)
+
+	if (perf_evlist__alloc_pollfd(evsel_list) < 0)
 		goto out_free_fd;
 
 	if (user_interval != ULLONG_MAX)
@@ -959,16 +851,12 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 	} else {
 		fprintf(stderr, "frequency and count are zero, aborting\n");
 		err = -EINVAL;
-		goto out_free_event_array;
+		goto out_free_fd;
 	}
 
 	err = __cmd_record(argc, argv);
-
-out_free_event_array:
-	free(event_array);
 out_free_fd:
-	thread_map__delete(threads);
-	threads = NULL;
+	perf_evlist__delete_maps(evsel_list);
 out_symbol_exit:
 	symbol__exit();
 	return err;
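
The per-cpu mmap bookkeeping deleted above moves into perf_evlist, but the ring-buffer protocol the removed helpers implemented is unchanged: the kernel advances data_head in the shared perf_event_mmap_page, the reader loads it behind a read barrier, consumes the bytes, and stores data_tail back so the kernel knows the space is free. A minimal sketch of that protocol, mirroring the removed helpers and assuming the perf_event_mmap_page layout from the kernel ABI (perf_mmap__read_head()/perf_mmap__write_tail() now provide this):

    static unsigned long read_head(struct perf_event_mmap_page *pc)
    {
            unsigned long head = pc->data_head;   /* advanced by the kernel */
            rmb();     /* order the head load before reading sample bytes */
            return head;
    }

    static void write_tail(struct perf_event_mmap_page *pc, unsigned long tail)
    {
            /* tell the kernel the data up to 'tail' has been consumed */
            pc->data_tail = tail;
    }
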
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index c27e31f289e..b1b82009ab9 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -9,6 +9,7 @@
 
 #include "util/util.h"
 
+#include "util/annotate.h"
 #include "util/color.h"
 #include <linux/list.h>
 #include "util/cache.h"
@@ -20,6 +21,8 @@
 
 #include "perf.h"
 #include "util/debug.h"
+#include "util/evlist.h"
+#include "util/evsel.h"
 #include "util/header.h"
 #include "util/session.h"
 
@@ -43,120 +46,79 @@ static const char default_pretty_printing_style[] = "normal";
 static const char *pretty_printing_style = default_pretty_printing_style;
 
 static char callchain_default_opt[] = "fractal,0.5";
+static symbol_filter_t annotate_init;
 
-static struct hists *perf_session__hists_findnew(struct perf_session *self,
-						 u64 event_stream, u32 type,
-						 u64 config)
-{
-	struct rb_node **p = &self->hists_tree.rb_node;
-	struct rb_node *parent = NULL;
-	struct hists *iter, *new;
-
-	while (*p != NULL) {
-		parent = *p;
-		iter = rb_entry(parent, struct hists, rb_node);
-		if (iter->config == config)
-			return iter;
-
-
-		if (config > iter->config)
-			p = &(*p)->rb_right;
-		else
-			p = &(*p)->rb_left;
-	}
-
-	new = malloc(sizeof(struct hists));
-	if (new == NULL)
-		return NULL;
-	memset(new, 0, sizeof(struct hists));
-	new->event_stream = event_stream;
-	new->config = config;
-	new->type = type;
-	rb_link_node(&new->rb_node, parent, p);
-	rb_insert_color(&new->rb_node, &self->hists_tree);
-	return new;
-}
-
-static int perf_session__add_hist_entry(struct perf_session *self,
+static int perf_session__add_hist_entry(struct perf_session *session,
 					struct addr_location *al,
-					struct sample_data *data)
+					struct perf_sample *sample)
 {
-	struct map_symbol *syms = NULL;
 	struct symbol *parent = NULL;
-	int err = -ENOMEM;
+	int err = 0;
 	struct hist_entry *he;
-	struct hists *hists;
-	struct perf_event_attr *attr;
-
-	if ((sort__has_parent || symbol_conf.use_callchain) && data->callchain) {
-		syms = perf_session__resolve_callchain(self, al->thread,
-						       data->callchain, &parent);
-		if (syms == NULL)
-			return -ENOMEM;
+	struct perf_evsel *evsel;
+
+	if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) {
+		err = perf_session__resolve_callchain(session, al->thread,
+						      sample->callchain, &parent);
+		if (err)
+			return err;
 	}
 
-	attr = perf_header__find_attr(data->id, &self->header);
-	if (attr)
-		hists = perf_session__hists_findnew(self, data->id, attr->type, attr->config);
-	else
-		hists = perf_session__hists_findnew(self, data->id, 0, 0);
-	if (hists == NULL)
-		goto out_free_syms;
-	he = __hists__add_entry(hists, al, parent, data->period);
+	evsel = perf_evlist__id2evsel(session->evlist, sample->id);
+	if (evsel == NULL) {
+		/*
+		 * FIXME: Propagate this back, but at least we're in a builtin,
+		 * where exit() is allowed. ;-)
+		 */
+		ui__warning("Invalid %s file, contains samples with id %" PRIu64 " not in "
+			    "its header!\n", input_name, sample->id);
+		exit_browser(0);
+		exit(1);
+	}
+
+	he = __hists__add_entry(&evsel->hists, al, parent, sample->period);
 	if (he == NULL)
-		goto out_free_syms;
-	err = 0;
+		return -ENOMEM;
+
 	if (symbol_conf.use_callchain) {
-		err = callchain_append(he->callchain, data->callchain, syms,
-				       data->period);
+		err = callchain_append(he->callchain, &session->callchain_cursor,
+				       sample->period);
 		if (err)
-			goto out_free_syms;
+			return err;
 	}
 	/*
 	 * Only in the newt browser we are doing integrated annotation,
 	 * so we don't allocated the extra space needed because the stdio
 	 * code will not use it.
 	 */
-	if (use_browser > 0)
-		err = hist_entry__inc_addr_samples(he, al->addr);
-out_free_syms:
-	free(syms);
-	return err;
-}
+	if (al->sym != NULL && use_browser > 0) {
+		struct annotation *notes = symbol__annotation(he->ms.sym);
 
-static int add_event_total(struct perf_session *session,
-			   struct sample_data *data,
-			   struct perf_event_attr *attr)
-{
-	struct hists *hists;
+		assert(evsel != NULL);
 
-	if (attr)
-		hists = perf_session__hists_findnew(session, data->id,
-						    attr->type, attr->config);
-	else
-		hists = perf_session__hists_findnew(session, data->id, 0, 0);
+		err = -ENOMEM;
+		if (notes->src == NULL &&
+		    symbol__alloc_hist(he->ms.sym, session->evlist->nr_entries) < 0)
+			goto out;
 
-	if (!hists)
-		return -ENOMEM;
+		err = hist_entry__inc_addr_samples(he, evsel->idx, al->addr);
+	}
 
-	hists->stats.total_period += data->period;
-	/*
-	 * FIXME: add_event_total should be moved from here to
-	 * perf_session__process_event so that the proper hist is passed to
-	 * the event_op methods.
-	 */
-	hists__inc_nr_events(hists, PERF_RECORD_SAMPLE);
-	session->hists.stats.total_period += data->period;
-	return 0;
+	evsel->hists.stats.total_period += sample->period;
+	hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
+out:
+	return err;
 }
 
-static int process_sample_event(event_t *event, struct sample_data *sample,
+
+static int process_sample_event(union perf_event *event,
+				struct perf_sample *sample,
 				struct perf_session *session)
 {
 	struct addr_location al;
-	struct perf_event_attr *attr;
 
-	if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
+	if (perf_event__preprocess_sample(event, session, &al, sample,
+					  annotate_init) < 0) {
 		fprintf(stderr, "problem processing %d event, skipping it.\n",
 			event->header.type);
 		return -1;
@@ -170,26 +132,17 @@ static int process_sample_event(event_t *event, struct sample_data *sample,
 		return -1;
 	}
 
-	attr = perf_header__find_attr(sample->id, &session->header);
-
-	if (add_event_total(session, sample, attr)) {
-		pr_debug("problem adding event period\n");
-		return -1;
-	}
-
 	return 0;
 }
 
-static int process_read_event(event_t *event, struct sample_data *sample __used,
-			      struct perf_session *session __used)
+static int process_read_event(union perf_event *event,
+			      struct perf_sample *sample __used,
+			      struct perf_session *session)
 {
-	struct perf_event_attr *attr;
-
-	attr = perf_header__find_attr(event->read.id, &session->header);
-
+	struct perf_evsel *evsel = perf_evlist__id2evsel(session->evlist,
+							 event->read.id);
 	if (show_threads) {
-		const char *name = attr ? __event_name(attr->type, attr->config)
-					: "unknown";
+		const char *name = evsel ? event_name(evsel) : "unknown";
 		perf_read_values_add_value(&show_threads_values,
 					   event->read.pid, event->read.tid,
 					   event->read.id,
@@ -198,7 +151,7 @@ static int process_read_event(event_t *event, struct sample_data *sample __used,
 	}
 
 	dump_printf(": %d %d %s %" PRIu64 "\n", event->read.pid, event->read.tid,
-		    attr ? __event_name(attr->type, attr->config) : "FAIL",
+		    evsel ? event_name(evsel) : "FAIL",
 		    event->read.value);
 
 	return 0;
@@ -222,7 +175,7 @@ static int perf_session__setup_sample_type(struct perf_session *self)
 	} else if (!dont_use_callchains && callchain_param.mode != CHAIN_NONE &&
 		   !symbol_conf.use_callchain) {
 			symbol_conf.use_callchain = true;
-			if (register_callchain_param(&callchain_param) < 0) {
+			if (callchain_register_param(&callchain_param) < 0) {
 				fprintf(stderr, "Can't register callchain"
 					" params\n");
 				return -EINVAL;
@@ -233,17 +186,17 @@ static int perf_session__setup_sample_type(struct perf_session *self)
 }
 
 static struct perf_event_ops event_ops = {
 	.sample = process_sample_event,
-	.mmap = event__process_mmap,
-	.comm = event__process_comm,
-	.exit = event__process_task,
-	.fork = event__process_task,
-	.lost = event__process_lost,
+	.mmap = perf_event__process_mmap,
+	.comm = perf_event__process_comm,
+	.exit = perf_event__process_task,
+	.fork = perf_event__process_task,
+	.lost = perf_event__process_lost,
 	.read = process_read_event,
-	.attr = event__process_attr,
-	.event_type = event__process_event_type,
-	.tracing_data = event__process_tracing_data,
-	.build_id = event__process_build_id,
+	.attr = perf_event__process_attr,
+	.event_type = perf_event__process_event_type,
+	.tracing_data = perf_event__process_tracing_data,
+	.build_id = perf_event__process_build_id,
 	.ordered_samples = true,
 	.ordering_requires_timestamps = true,
 };
@@ -269,21 +222,21 @@ static size_t hists__fprintf_nr_sample_events(struct hists *self,
 	return ret + fprintf(fp, "\n#\n");
 }
 
-static int hists__tty_browse_tree(struct rb_root *tree, const char *help)
+static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
+					 const char *help)
 {
-	struct rb_node *next = rb_first(tree);
+	struct perf_evsel *pos;
 
-	while (next) {
-		struct hists *hists = rb_entry(next, struct hists, rb_node);
+	list_for_each_entry(pos, &evlist->entries, node) {
+		struct hists *hists = &pos->hists;
 		const char *evname = NULL;
 
 		if (rb_first(&hists->entries) != rb_last(&hists->entries))
-			evname = __event_name(hists->type, hists->config);
+			evname = event_name(pos);
 
 		hists__fprintf_nr_sample_events(hists, evname, stdout);
 		hists__fprintf(hists, NULL, false, stdout);
 		fprintf(stdout, "\n\n");
-		next = rb_next(&hists->rb_node);
 	}
 
 	if (sort_order == default_sort_order &&
@@ -304,8 +257,9 @@ static int hists__tty_browse_tree(struct rb_root *tree, const char *help)
 static int __cmd_report(void)
 {
 	int ret = -EINVAL;
+	u64 nr_samples;
 	struct perf_session *session;
-	struct rb_node *next;
+	struct perf_evsel *pos;
 	const char *help = "For a higher level overview, try: perf report --sort comm,dso";
 
 	signal(SIGINT, sig_handler);
@@ -336,20 +290,24 @@ static int __cmd_report(void)
 	if (verbose > 2)
 		perf_session__fprintf_dsos(session, stdout);
 
-	next = rb_first(&session->hists_tree);
-	while (next) {
-		struct hists *hists;
+	nr_samples = 0;
+	list_for_each_entry(pos, &session->evlist->entries, node) {
+		struct hists *hists = &pos->hists;
 
-		hists = rb_entry(next, struct hists, rb_node);
 		hists__collapse_resort(hists);
 		hists__output_resort(hists);
-		next = rb_next(&hists->rb_node);
+		nr_samples += hists->stats.nr_events[PERF_RECORD_SAMPLE];
+	}
+
+	if (nr_samples == 0) {
+		ui__warning("The %s file has no samples!\n", input_name);
+		goto out_delete;
 	}
 
 	if (use_browser > 0)
-		hists__tui_browse_tree(&session->hists_tree, help);
+		perf_evlist__tui_browse_hists(session->evlist, help);
 	else
-		hists__tty_browse_tree(&session->hists_tree, help);
+		perf_evlist__tty_browse_hists(session->evlist, help);
 
 out_delete:
 	/*
@@ -424,7 +382,7 @@ parse_callchain_opt(const struct option *opt __used, const char *arg,
 	if (tok2)
 		callchain_param.print_limit = strtod(tok2, &endptr);
 setup:
-	if (register_callchain_param(&callchain_param) < 0) {
+	if (callchain_register_param(&callchain_param) < 0) {
 		fprintf(stderr, "Can't register callchain params\n");
 		return -1;
 	}
@@ -498,7 +456,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 		use_browser = 1;
 
 	if (strcmp(input_name, "-") != 0)
-		setup_browser();
+		setup_browser(true);
 	else
 		use_browser = 0;
 	/*
@@ -507,7 +465,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 	 * implementation.
 	 */
 	if (use_browser > 0) {
-		symbol_conf.priv_size = sizeof(struct sym_priv);
+		symbol_conf.priv_size = sizeof(struct annotation);
+		annotate_init = symbol__annotate_init;
 		/*
 		 * For searching by name on the "Browse map details".
 		 * providing it only in verbose mode not to bloat too
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 29acb894e03..a32f411faea 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -369,11 +369,6 @@ static void
 process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom)
 {
 	int ret = 0;
-	u64 now;
-	long long delta;
-
-	now = get_nsecs();
-	delta = start_time + atom->timestamp - now;
 
 	switch (atom->type) {
 	case SCHED_EVENT_RUN:
@@ -562,7 +557,7 @@ static void wait_for_tasks(void)
 
 static void run_one_test(void)
 {
-	u64 T0, T1, delta, avg_delta, fluct, std_dev;
+	u64 T0, T1, delta, avg_delta, fluct;
 
 	T0 = get_nsecs();
 	wait_for_tasks();
@@ -578,7 +573,6 @@ static void run_one_test(void)
 	else
 		fluct = delta - avg_delta;
 	sum_fluct += fluct;
-	std_dev = sum_fluct / nr_runs / sqrt(nr_runs);
 	if (!run_avg)
 		run_avg = delta;
 	run_avg = (run_avg*9 + delta)/10;
@@ -799,7 +793,7 @@ replay_switch_event(struct trace_switch_event *switch_event,
 		    u64 timestamp,
 		    struct thread *thread __used)
 {
-	struct task_desc *prev, *next;
+	struct task_desc *prev, __used *next;
 	u64 timestamp0;
 	s64 delta;
 
@@ -1404,7 +1398,7 @@ map_switch_event(struct trace_switch_event *switch_event,
 		 u64 timestamp,
 		 struct thread *thread __used)
 {
-	struct thread *sched_out, *sched_in;
+	struct thread *sched_out __used, *sched_in;
 	int new_shortname;
 	u64 timestamp0;
 	s64 delta;
@@ -1580,9 +1574,9 @@ process_sched_migrate_task_event(void *data, struct perf_session *session,
 					 event, cpu, timestamp, thread);
 }
 
-static void
-process_raw_event(event_t *raw_event __used, struct perf_session *session,
-		  void *data, int cpu, u64 timestamp, struct thread *thread)
+static void process_raw_event(union perf_event *raw_event __used,
+			      struct perf_session *session, void *data, int cpu,
+			      u64 timestamp, struct thread *thread)
 {
 	struct event *event;
 	int type;
@@ -1607,7 +1601,8 @@ process_raw_event(event_t *raw_event __used, struct perf_session *session,
 		process_sched_migrate_task_event(data, session, event, cpu, timestamp, thread);
 }
 
-static int process_sample_event(event_t *event, struct sample_data *sample,
+static int process_sample_event(union perf_event *event,
+				struct perf_sample *sample,
 				struct perf_session *session)
 {
 	struct thread *thread;
@@ -1635,9 +1630,9 @@ static int process_sample_event(event_t *event, struct sample_data *sample,
 
 static struct perf_event_ops event_ops = {
 	.sample = process_sample_event,
-	.comm = event__process_comm,
-	.lost = event__process_lost,
-	.fork = event__process_task,
+	.comm = perf_event__process_comm,
+	.lost = perf_event__process_lost,
+	.fork = perf_event__process_task,
 	.ordered_samples = true,
 };
 
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index b766c2a9ac9..5f40df635dc 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -63,7 +63,8 @@ static int cleanup_scripting(void)
 
 static char const *input_name = "perf.data";
 
-static int process_sample_event(event_t *event, struct sample_data *sample,
+static int process_sample_event(union perf_event *event,
+				struct perf_sample *sample,
 				struct perf_session *session)
 {
 	struct thread *thread = perf_session__findnew(session, event->ip.pid);
@@ -100,14 +101,14 @@ static int process_sample_event(event_t *event, struct sample_data *sample,
100} 101}
101 102
102static struct perf_event_ops event_ops = { 103static struct perf_event_ops event_ops = {
103 .sample = process_sample_event, 104 .sample = process_sample_event,
104 .comm = event__process_comm, 105 .comm = perf_event__process_comm,
105 .attr = event__process_attr, 106 .attr = perf_event__process_attr,
106 .event_type = event__process_event_type, 107 .event_type = perf_event__process_event_type,
107 .tracing_data = event__process_tracing_data, 108 .tracing_data = perf_event__process_tracing_data,
108 .build_id = event__process_build_id, 109 .build_id = perf_event__process_build_id,
109 .ordering_requires_timestamps = true,
110 .ordered_samples = true, 110 .ordered_samples = true,
111 .ordering_requires_timestamps = true,
111}; 112};
112 113
113extern volatile int session_done; 114extern volatile int session_done;
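Note the flag pairing that builtin-script.c keeps adjacent: .ordered_samples asks the session layer to queue events and flush them in timestamp order, while .ordering_requires_timestamps declares that the tool cannot silently fall back when a perf.data file was recorded without timestamps. A sketch of the assumed intent (the authoritative check lives in util/session.c of this tree and is not reproduced here):

	/* Assumption inferred from the flag names, not quoted source: */
	if (ops->ordered_samples && !session->sample_id_all) {
		if (ops->ordering_requires_timestamps)
			dump_printf("no timestamps, falling back to unordered processing\n");
		ops->ordered_samples = false;
	}
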
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a482a191a0c..21c02522249 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -43,11 +43,13 @@
43#include "util/parse-options.h" 43#include "util/parse-options.h"
44#include "util/parse-events.h" 44#include "util/parse-events.h"
45#include "util/event.h" 45#include "util/event.h"
46#include "util/evlist.h"
46#include "util/evsel.h" 47#include "util/evsel.h"
47#include "util/debug.h" 48#include "util/debug.h"
48#include "util/header.h" 49#include "util/header.h"
49#include "util/cpumap.h" 50#include "util/cpumap.h"
50#include "util/thread.h" 51#include "util/thread.h"
52#include "util/thread_map.h"
51 53
52#include <sys/prctl.h> 54#include <sys/prctl.h>
53#include <math.h> 55#include <math.h>
@@ -71,8 +73,9 @@ static struct perf_event_attr default_attrs[] = {
71 73
72}; 74};
73 75
76struct perf_evlist *evsel_list;
77
74static bool system_wide = false; 78static bool system_wide = false;
75static struct cpu_map *cpus;
76static int run_idx = 0; 79static int run_idx = 0;
77 80
78static int run_count = 1; 81static int run_count = 1;
@@ -81,7 +84,6 @@ static bool scale = true;
81static bool no_aggr = false; 84static bool no_aggr = false;
82static pid_t target_pid = -1; 85static pid_t target_pid = -1;
83static pid_t target_tid = -1; 86static pid_t target_tid = -1;
84static struct thread_map *threads;
85static pid_t child_pid = -1; 87static pid_t child_pid = -1;
86static bool null_run = false; 88static bool null_run = false;
87static bool big_num = true; 89static bool big_num = true;
@@ -166,7 +168,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel)
166 PERF_FORMAT_TOTAL_TIME_RUNNING; 168 PERF_FORMAT_TOTAL_TIME_RUNNING;
167 169
168 if (system_wide) 170 if (system_wide)
169 return perf_evsel__open_per_cpu(evsel, cpus); 171 return perf_evsel__open_per_cpu(evsel, evsel_list->cpus, false, false);
170 172
171 attr->inherit = !no_inherit; 173 attr->inherit = !no_inherit;
172 if (target_pid == -1 && target_tid == -1) { 174 if (target_pid == -1 && target_tid == -1) {
@@ -174,7 +176,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel)
174 attr->enable_on_exec = 1; 176 attr->enable_on_exec = 1;
175 } 177 }
176 178
177 return perf_evsel__open_per_thread(evsel, threads); 179 return perf_evsel__open_per_thread(evsel, evsel_list->threads, false, false);
178} 180}
179 181
180/* 182/*
@@ -199,7 +201,8 @@ static int read_counter_aggr(struct perf_evsel *counter)
199 u64 *count = counter->counts->aggr.values; 201 u64 *count = counter->counts->aggr.values;
200 int i; 202 int i;
201 203
202 if (__perf_evsel__read(counter, cpus->nr, threads->nr, scale) < 0) 204 if (__perf_evsel__read(counter, evsel_list->cpus->nr,
205 evsel_list->threads->nr, scale) < 0)
203 return -1; 206 return -1;
204 207
205 for (i = 0; i < 3; i++) 208 for (i = 0; i < 3; i++)
@@ -232,7 +235,7 @@ static int read_counter(struct perf_evsel *counter)
232 u64 *count; 235 u64 *count;
233 int cpu; 236 int cpu;
234 237
235 for (cpu = 0; cpu < cpus->nr; cpu++) { 238 for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) {
236 if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0) 239 if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
237 return -1; 240 return -1;
238 241
@@ -297,7 +300,7 @@ static int run_perf_stat(int argc __used, const char **argv)
297 } 300 }
298 301
299 if (target_tid == -1 && target_pid == -1 && !system_wide) 302 if (target_tid == -1 && target_pid == -1 && !system_wide)
300 threads->map[0] = child_pid; 303 evsel_list->threads->map[0] = child_pid;
301 304
302 /* 305 /*
303 * Wait for the child to be ready to exec. 306 * Wait for the child to be ready to exec.
@@ -309,7 +312,7 @@ static int run_perf_stat(int argc __used, const char **argv)
309 close(child_ready_pipe[0]); 312 close(child_ready_pipe[0]);
310 } 313 }
311 314
312 list_for_each_entry(counter, &evsel_list, node) { 315 list_for_each_entry(counter, &evsel_list->entries, node) {
313 if (create_perf_stat_counter(counter) < 0) { 316 if (create_perf_stat_counter(counter) < 0) {
314 if (errno == -EPERM || errno == -EACCES) { 317 if (errno == -EPERM || errno == -EACCES) {
315 error("You may not have permission to collect %sstats.\n" 318 error("You may not have permission to collect %sstats.\n"
@@ -347,14 +350,15 @@ static int run_perf_stat(int argc __used, const char **argv)
347 update_stats(&walltime_nsecs_stats, t1 - t0); 350 update_stats(&walltime_nsecs_stats, t1 - t0);
348 351
349 if (no_aggr) { 352 if (no_aggr) {
350 list_for_each_entry(counter, &evsel_list, node) { 353 list_for_each_entry(counter, &evsel_list->entries, node) {
351 read_counter(counter); 354 read_counter(counter);
352 perf_evsel__close_fd(counter, cpus->nr, 1); 355 perf_evsel__close_fd(counter, evsel_list->cpus->nr, 1);
353 } 356 }
354 } else { 357 } else {
355 list_for_each_entry(counter, &evsel_list, node) { 358 list_for_each_entry(counter, &evsel_list->entries, node) {
356 read_counter_aggr(counter); 359 read_counter_aggr(counter);
357 perf_evsel__close_fd(counter, cpus->nr, threads->nr); 360 perf_evsel__close_fd(counter, evsel_list->cpus->nr,
361 evsel_list->threads->nr);
358 } 362 }
359 } 363 }
360 364
@@ -382,10 +386,13 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
382 if (no_aggr) 386 if (no_aggr)
383 sprintf(cpustr, "CPU%*d%s", 387 sprintf(cpustr, "CPU%*d%s",
384 csv_output ? 0 : -4, 388 csv_output ? 0 : -4,
385 cpus->map[cpu], csv_sep); 389 evsel_list->cpus->map[cpu], csv_sep);
386 390
387 fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(evsel)); 391 fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(evsel));
388 392
393 if (evsel->cgrp)
394 fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name);
395
389 if (csv_output) 396 if (csv_output)
390 return; 397 return;
391 398
@@ -410,12 +417,15 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
410 if (no_aggr) 417 if (no_aggr)
411 sprintf(cpustr, "CPU%*d%s", 418 sprintf(cpustr, "CPU%*d%s",
412 csv_output ? 0 : -4, 419 csv_output ? 0 : -4,
413 cpus->map[cpu], csv_sep); 420 evsel_list->cpus->map[cpu], csv_sep);
414 else 421 else
415 cpu = 0; 422 cpu = 0;
416 423
417 fprintf(stderr, fmt, cpustr, avg, csv_sep, event_name(evsel)); 424 fprintf(stderr, fmt, cpustr, avg, csv_sep, event_name(evsel));
418 425
426 if (evsel->cgrp)
427 fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name);
428
419 if (csv_output) 429 if (csv_output)
420 return; 430 return;
421 431
@@ -456,9 +466,17 @@ static void print_counter_aggr(struct perf_evsel *counter)
456 int scaled = counter->counts->scaled; 466 int scaled = counter->counts->scaled;
457 467
458 if (scaled == -1) { 468 if (scaled == -1) {
459 fprintf(stderr, "%*s%s%-24s\n", 469 fprintf(stderr, "%*s%s%*s",
460 csv_output ? 0 : 18, 470 csv_output ? 0 : 18,
461 "<not counted>", csv_sep, event_name(counter)); 471 "<not counted>",
472 csv_sep,
473 csv_output ? 0 : -24,
474 event_name(counter));
475
476 if (counter->cgrp)
477 fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name);
478
479 fputc('\n', stderr);
462 return; 480 return;
463 } 481 }
464 482
@@ -483,7 +501,6 @@ static void print_counter_aggr(struct perf_evsel *counter)
483 fprintf(stderr, " (scaled from %.2f%%)", 501 fprintf(stderr, " (scaled from %.2f%%)",
484 100 * avg_running / avg_enabled); 502 100 * avg_running / avg_enabled);
485 } 503 }
486
487 fprintf(stderr, "\n"); 504 fprintf(stderr, "\n");
488} 505}
489 506
@@ -496,19 +513,23 @@ static void print_counter(struct perf_evsel *counter)
496 u64 ena, run, val; 513 u64 ena, run, val;
497 int cpu; 514 int cpu;
498 515
499 for (cpu = 0; cpu < cpus->nr; cpu++) { 516 for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) {
500 val = counter->counts->cpu[cpu].val; 517 val = counter->counts->cpu[cpu].val;
501 ena = counter->counts->cpu[cpu].ena; 518 ena = counter->counts->cpu[cpu].ena;
502 run = counter->counts->cpu[cpu].run; 519 run = counter->counts->cpu[cpu].run;
503 if (run == 0 || ena == 0) { 520 if (run == 0 || ena == 0) {
504 fprintf(stderr, "CPU%*d%s%*s%s%-24s", 521 fprintf(stderr, "CPU%*d%s%*s%s%*s",
505 csv_output ? 0 : -4, 522 csv_output ? 0 : -4,
506 cpus->map[cpu], csv_sep, 523 evsel_list->cpus->map[cpu], csv_sep,
507 csv_output ? 0 : 18, 524 csv_output ? 0 : 18,
508 "<not counted>", csv_sep, 525 "<not counted>", csv_sep,
526 csv_output ? 0 : -24,
509 event_name(counter)); 527 event_name(counter));
510 528
511 fprintf(stderr, "\n"); 529 if (counter->cgrp)
530 fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name);
531
532 fputc('\n', stderr);
512 continue; 533 continue;
513 } 534 }
514 535
@@ -525,7 +546,7 @@ static void print_counter(struct perf_evsel *counter)
525 100.0 * run / ena); 546 100.0 * run / ena);
526 } 547 }
527 } 548 }
528 fprintf(stderr, "\n"); 549 fputc('\n', stderr);
529 } 550 }
530} 551}
531 552
@@ -555,10 +576,10 @@ static void print_stat(int argc, const char **argv)
555 } 576 }
556 577
557 if (no_aggr) { 578 if (no_aggr) {
558 list_for_each_entry(counter, &evsel_list, node) 579 list_for_each_entry(counter, &evsel_list->entries, node)
559 print_counter(counter); 580 print_counter(counter);
560 } else { 581 } else {
561 list_for_each_entry(counter, &evsel_list, node) 582 list_for_each_entry(counter, &evsel_list->entries, node)
562 print_counter_aggr(counter); 583 print_counter_aggr(counter);
563 } 584 }
564 585
@@ -610,7 +631,7 @@ static int stat__set_big_num(const struct option *opt __used,
610} 631}
611 632
612static const struct option options[] = { 633static const struct option options[] = {
613 OPT_CALLBACK('e', "event", NULL, "event", 634 OPT_CALLBACK('e', "event", &evsel_list, "event",
614 "event selector. use 'perf list' to list available events", 635 "event selector. use 'perf list' to list available events",
615 parse_events), 636 parse_events),
616 OPT_BOOLEAN('i', "no-inherit", &no_inherit, 637 OPT_BOOLEAN('i', "no-inherit", &no_inherit,
@@ -638,6 +659,9 @@ static const struct option options[] = {
638 "disable CPU count aggregation"), 659 "disable CPU count aggregation"),
639 OPT_STRING('x', "field-separator", &csv_sep, "separator", 660 OPT_STRING('x', "field-separator", &csv_sep, "separator",
640 "print counts with custom separator"), 661 "print counts with custom separator"),
662 OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
663 "monitor event in cgroup name only",
664 parse_cgroups),
641 OPT_END() 665 OPT_END()
642}; 666};
643 667
@@ -648,6 +672,10 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
648 672
649 setlocale(LC_ALL, ""); 673 setlocale(LC_ALL, "");
650 674
675 evsel_list = perf_evlist__new(NULL, NULL);
676 if (evsel_list == NULL)
677 return -ENOMEM;
678
651 argc = parse_options(argc, argv, options, stat_usage, 679 argc = parse_options(argc, argv, options, stat_usage,
652 PARSE_OPT_STOP_AT_NON_OPTION); 680 PARSE_OPT_STOP_AT_NON_OPTION);
653 681
@@ -674,49 +702,50 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
674 if (run_count <= 0) 702 if (run_count <= 0)
675 usage_with_options(stat_usage, options); 703 usage_with_options(stat_usage, options);
676 704
677 /* no_aggr is for system-wide only */ 705 /* no_aggr, cgroup are for system-wide only */
678 if (no_aggr && !system_wide) 706 if ((no_aggr || nr_cgroups) && !system_wide) {
707 fprintf(stderr, "both cgroup and no-aggregation "
708 "modes only available in system-wide mode\n");
709
679 usage_with_options(stat_usage, options); 710 usage_with_options(stat_usage, options);
711 }
680 712
681 /* Set attrs and nr_counters if no event is selected and !null_run */ 713 /* Set attrs and nr_counters if no event is selected and !null_run */
682 if (!null_run && !nr_counters) { 714 if (!null_run && !evsel_list->nr_entries) {
683 size_t c; 715 size_t c;
684 716
685 nr_counters = ARRAY_SIZE(default_attrs);
686
687 for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) { 717 for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) {
688 pos = perf_evsel__new(&default_attrs[c], 718 pos = perf_evsel__new(&default_attrs[c], c);
689 nr_counters);
690 if (pos == NULL) 719 if (pos == NULL)
691 goto out; 720 goto out;
692 list_add(&pos->node, &evsel_list); 721 perf_evlist__add(evsel_list, pos);
693 } 722 }
694 } 723 }
695 724
696 if (target_pid != -1) 725 if (target_pid != -1)
697 target_tid = target_pid; 726 target_tid = target_pid;
698 727
699 threads = thread_map__new(target_pid, target_tid); 728 evsel_list->threads = thread_map__new(target_pid, target_tid);
700 if (threads == NULL) { 729 if (evsel_list->threads == NULL) {
701 pr_err("Problems finding threads of monitor\n"); 730 pr_err("Problems finding threads of monitor\n");
702 usage_with_options(stat_usage, options); 731 usage_with_options(stat_usage, options);
703 } 732 }
704 733
705 if (system_wide) 734 if (system_wide)
706 cpus = cpu_map__new(cpu_list); 735 evsel_list->cpus = cpu_map__new(cpu_list);
707 else 736 else
708 cpus = cpu_map__dummy_new(); 737 evsel_list->cpus = cpu_map__dummy_new();
709 738
710 if (cpus == NULL) { 739 if (evsel_list->cpus == NULL) {
711 perror("failed to parse CPUs map"); 740 perror("failed to parse CPUs map");
712 usage_with_options(stat_usage, options); 741 usage_with_options(stat_usage, options);
713 return -1; 742 return -1;
714 } 743 }
715 744
716 list_for_each_entry(pos, &evsel_list, node) { 745 list_for_each_entry(pos, &evsel_list->entries, node) {
717 if (perf_evsel__alloc_stat_priv(pos) < 0 || 746 if (perf_evsel__alloc_stat_priv(pos) < 0 ||
718 perf_evsel__alloc_counts(pos, cpus->nr) < 0 || 747 perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0 ||
719 perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0) 748 perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, evsel_list->threads->nr) < 0)
720 goto out_free_fd; 749 goto out_free_fd;
721 } 750 }
722 751
@@ -741,11 +770,10 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
741 if (status != -1) 770 if (status != -1)
742 print_stat(argc, argv); 771 print_stat(argc, argv);
743out_free_fd: 772out_free_fd:
744 list_for_each_entry(pos, &evsel_list, node) 773 list_for_each_entry(pos, &evsel_list->entries, node)
745 perf_evsel__free_stat_priv(pos); 774 perf_evsel__free_stat_priv(pos);
746 perf_evsel_list__delete(); 775 perf_evlist__delete_maps(evsel_list);
747out: 776out:
748 thread_map__delete(threads); 777 perf_evlist__delete(evsel_list);
749 threads = NULL;
750 return status; 778 return status;
751} 779}
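The net effect on builtin-stat.c: the file-scope cpus and threads globals and the old evsel_list list head collapse into a single struct perf_evlist that owns the CPU map, the thread map and the event list. Condensed from the hunks above, with error handling elided, the new lifecycle is:

	struct perf_evlist *evsel_list = perf_evlist__new(NULL, NULL);
	struct perf_evsel *counter;

	evsel_list->threads = thread_map__new(target_pid, target_tid);
	evsel_list->cpus    = system_wide ? cpu_map__new(cpu_list)
					  : cpu_map__dummy_new();

	list_for_each_entry(counter, &evsel_list->entries, node)
		create_perf_stat_counter(counter); /* opens per cpu or per thread */

	/* ... run the workload, read_counter{,_aggr}(), print_stat() ... */

	perf_evlist__delete_maps(evsel_list); /* drops the cpu and thread maps */
	perf_evlist__delete(evsel_list);      /* frees the evsels and the list */
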
diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 5dcdba653d7..1b2106c58f6 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -7,10 +7,11 @@
7 7
8#include "util/cache.h" 8#include "util/cache.h"
9#include "util/debug.h" 9#include "util/debug.h"
10#include "util/evlist.h"
10#include "util/parse-options.h" 11#include "util/parse-options.h"
11#include "util/session.h" 12#include "util/parse-events.h"
12#include "util/symbol.h" 13#include "util/symbol.h"
13#include "util/thread.h" 14#include "util/thread_map.h"
14 15
15static long page_size; 16static long page_size;
16 17
@@ -238,14 +239,14 @@ out:
238#include "util/evsel.h" 239#include "util/evsel.h"
239#include <sys/types.h> 240#include <sys/types.h>
240 241
241static int trace_event__id(const char *event_name) 242static int trace_event__id(const char *evname)
242{ 243{
243 char *filename; 244 char *filename;
244 int err = -1, fd; 245 int err = -1, fd;
245 246
246 if (asprintf(&filename, 247 if (asprintf(&filename,
247 "/sys/kernel/debug/tracing/events/syscalls/%s/id", 248 "/sys/kernel/debug/tracing/events/syscalls/%s/id",
248 event_name) < 0) 249 evname) < 0)
249 return -1; 250 return -1;
250 251
251 fd = open(filename, O_RDONLY); 252 fd = open(filename, O_RDONLY);
@@ -289,7 +290,7 @@ static int test__open_syscall_event(void)
289 goto out_thread_map_delete; 290 goto out_thread_map_delete;
290 } 291 }
291 292
292 if (perf_evsel__open_per_thread(evsel, threads) < 0) { 293 if (perf_evsel__open_per_thread(evsel, threads, false, false) < 0) {
293 pr_debug("failed to open counter: %s, " 294 pr_debug("failed to open counter: %s, "
294 "tweak /proc/sys/kernel/perf_event_paranoid?\n", 295 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
295 strerror(errno)); 296 strerror(errno));
@@ -347,9 +348,9 @@ static int test__open_syscall_event_on_all_cpus(void)
347 } 348 }
348 349
349 cpus = cpu_map__new(NULL); 350 cpus = cpu_map__new(NULL);
350 if (threads == NULL) { 351 if (cpus == NULL) {
351 pr_debug("thread_map__new\n"); 352 pr_debug("cpu_map__new\n");
352 return -1; 353 goto out_thread_map_delete;
353 } 354 }
354 355
355 356
@@ -364,7 +365,7 @@ static int test__open_syscall_event_on_all_cpus(void)
364 goto out_thread_map_delete; 365 goto out_thread_map_delete;
365 } 366 }
366 367
367 if (perf_evsel__open(evsel, cpus, threads) < 0) { 368 if (perf_evsel__open(evsel, cpus, threads, false, false) < 0) {
368 pr_debug("failed to open counter: %s, " 369 pr_debug("failed to open counter: %s, "
369 "tweak /proc/sys/kernel/perf_event_paranoid?\n", 370 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
370 strerror(errno)); 371 strerror(errno));
@@ -408,6 +409,8 @@ static int test__open_syscall_event_on_all_cpus(void)
408 goto out_close_fd; 409 goto out_close_fd;
409 } 410 }
410 411
412 err = 0;
413
411 for (cpu = 0; cpu < cpus->nr; ++cpu) { 414 for (cpu = 0; cpu < cpus->nr; ++cpu) {
412 unsigned int expected; 415 unsigned int expected;
413 416
@@ -416,18 +419,18 @@ static int test__open_syscall_event_on_all_cpus(void)
416 419
417 if (perf_evsel__read_on_cpu(evsel, cpu, 0) < 0) { 420 if (perf_evsel__read_on_cpu(evsel, cpu, 0) < 0) {
418 pr_debug("perf_evsel__open_read_on_cpu\n"); 421 pr_debug("perf_evsel__open_read_on_cpu\n");
419 goto out_close_fd; 422 err = -1;
423 break;
420 } 424 }
421 425
422 expected = nr_open_calls + cpu; 426 expected = nr_open_calls + cpu;
423 if (evsel->counts->cpu[cpu].val != expected) { 427 if (evsel->counts->cpu[cpu].val != expected) {
424 pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls on cpu %d, got %" PRIu64 "\n", 428 pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls on cpu %d, got %" PRIu64 "\n",
425 expected, cpus->map[cpu], evsel->counts->cpu[cpu].val); 429 expected, cpus->map[cpu], evsel->counts->cpu[cpu].val);
426 goto out_close_fd; 430 err = -1;
427 } 431 }
428 } 432 }
429 433
430 err = 0;
431out_close_fd: 434out_close_fd:
432 perf_evsel__close_fd(evsel, 1, threads->nr); 435 perf_evsel__close_fd(evsel, 1, threads->nr);
433out_evsel_delete: 436out_evsel_delete:
@@ -437,6 +440,159 @@ out_thread_map_delete:
437 return err; 440 return err;
438} 441}
439 442
443/*
444 * This test will generate random numbers of calls to some getpid syscalls,
445 * then establish an mmap for a group of events that are created to monitor
446 * the syscalls.
447 *
448 * It will receive the events, using mmap, use its PERF_SAMPLE_ID generated
449 * sample.id field to map back to its respective perf_evsel instance.
450 *
451 * Then it checks if the number of syscalls reported as perf events by
452 * the kernel corresponds to the number of syscalls made.
453 */
454static int test__basic_mmap(void)
455{
456 int err = -1;
457 union perf_event *event;
458 struct thread_map *threads;
459 struct cpu_map *cpus;
460 struct perf_evlist *evlist;
461 struct perf_event_attr attr = {
462 .type = PERF_TYPE_TRACEPOINT,
463 .read_format = PERF_FORMAT_ID,
464 .sample_type = PERF_SAMPLE_ID,
465 .watermark = 0,
466 };
467 cpu_set_t cpu_set;
468 const char *syscall_names[] = { "getsid", "getppid", "getpgrp",
469 "getpgid", };
470 pid_t (*syscalls[])(void) = { (void *)getsid, getppid, getpgrp,
471 (void*)getpgid };
472#define nsyscalls ARRAY_SIZE(syscall_names)
473 int ids[nsyscalls];
474 unsigned int nr_events[nsyscalls],
475 expected_nr_events[nsyscalls], i, j;
476 struct perf_evsel *evsels[nsyscalls], *evsel;
477
478 for (i = 0; i < nsyscalls; ++i) {
479 char name[64];
480
481 snprintf(name, sizeof(name), "sys_enter_%s", syscall_names[i]);
482 ids[i] = trace_event__id(name);
483 if (ids[i] < 0) {
484 pr_debug("Is debugfs mounted on /sys/kernel/debug?\n");
485 return -1;
486 }
487 nr_events[i] = 0;
488 expected_nr_events[i] = random() % 257;
489 }
490
491 threads = thread_map__new(-1, getpid());
492 if (threads == NULL) {
493 pr_debug("thread_map__new\n");
494 return -1;
495 }
496
497 cpus = cpu_map__new(NULL);
498 if (cpus == NULL) {
499 pr_debug("cpu_map__new\n");
500 goto out_free_threads;
501 }
502
503 CPU_ZERO(&cpu_set);
504 CPU_SET(cpus->map[0], &cpu_set);
505 sched_setaffinity(0, sizeof(cpu_set), &cpu_set);
506 if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) {
507 pr_debug("sched_setaffinity() failed on CPU %d: %s ",
508 cpus->map[0], strerror(errno));
509 goto out_free_cpus;
510 }
511
512 evlist = perf_evlist__new(cpus, threads);
513 if (evlist == NULL) {
514 pr_debug("perf_evlist__new\n");
515 goto out_free_cpus;
516 }
517
518 /* anonymous union fields, can't be initialized above */
519 attr.wakeup_events = 1;
520 attr.sample_period = 1;
521
522 for (i = 0; i < nsyscalls; ++i) {
523 attr.config = ids[i];
524 evsels[i] = perf_evsel__new(&attr, i);
525 if (evsels[i] == NULL) {
526 pr_debug("perf_evsel__new\n");
527 goto out_free_evlist;
528 }
529
530 perf_evlist__add(evlist, evsels[i]);
531
532 if (perf_evsel__open(evsels[i], cpus, threads, false, false) < 0) {
533 pr_debug("failed to open counter: %s, "
534 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
535 strerror(errno));
536 goto out_close_fd;
537 }
538 }
539
540 if (perf_evlist__mmap(evlist, 128, true) < 0) {
541 pr_debug("failed to mmap events: %d (%s)\n", errno,
542 strerror(errno));
543 goto out_close_fd;
544 }
545
546 for (i = 0; i < nsyscalls; ++i)
547 for (j = 0; j < expected_nr_events[i]; ++j) {
548 int foo = syscalls[i]();
549 ++foo;
550 }
551
552 while ((event = perf_evlist__read_on_cpu(evlist, 0)) != NULL) {
553 struct perf_sample sample;
554
555 if (event->header.type != PERF_RECORD_SAMPLE) {
556 pr_debug("unexpected %s event\n",
557 perf_event__name(event->header.type));
558 goto out_munmap;
559 }
560
561 perf_event__parse_sample(event, attr.sample_type, false, &sample);
562 evsel = perf_evlist__id2evsel(evlist, sample.id);
563 if (evsel == NULL) {
564 pr_debug("event with id %" PRIu64
565 " doesn't map to an evsel\n", sample.id);
566 goto out_munmap;
567 }
568 nr_events[evsel->idx]++;
569 }
570
571 list_for_each_entry(evsel, &evlist->entries, node) {
572 if (nr_events[evsel->idx] != expected_nr_events[evsel->idx]) {
573 pr_debug("expected %d %s events, got %d\n",
574 expected_nr_events[evsel->idx],
575 event_name(evsel), nr_events[evsel->idx]);
576 goto out_munmap;
577 }
578 }
579
580 err = 0;
581out_munmap:
582 perf_evlist__munmap(evlist);
583out_close_fd:
584 for (i = 0; i < nsyscalls; ++i)
585 perf_evsel__close_fd(evsels[i], 1, threads->nr);
586out_free_evlist:
587 perf_evlist__delete(evlist);
588out_free_cpus:
589 cpu_map__delete(cpus);
590out_free_threads:
591 thread_map__delete(threads);
592 return err;
593#undef nsyscalls
594}
595
440static struct test { 596static struct test {
441 const char *desc; 597 const char *desc;
442 int (*func)(void); 598 int (*func)(void);
@@ -454,6 +610,10 @@ static struct test {
454 .func = test__open_syscall_event_on_all_cpus, 610 .func = test__open_syscall_event_on_all_cpus,
455 }, 611 },
456 { 612 {
613 .desc = "read samples using the mmap interface",
614 .func = test__basic_mmap,
615 },
616 {
457 .func = NULL, 617 .func = NULL,
458 }, 618 },
459}; 619};
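The new test__basic_mmap case doubles as a compact reference for the mmap read path. Two details worth calling out: attr.wakeup_events and attr.sample_period are assigned after the designated initializer because they live in anonymous unions inside perf_event_attr (as the in-line comment notes), and every PERF_RECORD_SAMPLE carries a PERF_SAMPLE_ID that perf_evlist__id2evsel() resolves back to the evsel that produced it. The core loop, condensed from the hunk above:

	while ((event = perf_evlist__read_on_cpu(evlist, 0)) != NULL) {
		struct perf_sample sample;

		if (event->header.type != PERF_RECORD_SAMPLE)
			break; /* the test treats anything else as a failure */

		perf_event__parse_sample(event, attr.sample_type, false, &sample);
		evsel = perf_evlist__id2evsel(evlist, sample.id);
		if (evsel != NULL)
			nr_events[evsel->idx]++; /* tally per originating event */
	}
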
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 0ace786e83e..67c0459dc32 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -273,21 +273,24 @@ static int cpus_cstate_state[MAX_CPUS];
273static u64 cpus_pstate_start_times[MAX_CPUS]; 273static u64 cpus_pstate_start_times[MAX_CPUS];
274static u64 cpus_pstate_state[MAX_CPUS]; 274static u64 cpus_pstate_state[MAX_CPUS];
275 275
276static int process_comm_event(event_t *event, struct sample_data *sample __used, 276static int process_comm_event(union perf_event *event,
277 struct perf_sample *sample __used,
277 struct perf_session *session __used) 278 struct perf_session *session __used)
278{ 279{
279 pid_set_comm(event->comm.tid, event->comm.comm); 280 pid_set_comm(event->comm.tid, event->comm.comm);
280 return 0; 281 return 0;
281} 282}
282 283
283static int process_fork_event(event_t *event, struct sample_data *sample __used, 284static int process_fork_event(union perf_event *event,
285 struct perf_sample *sample __used,
284 struct perf_session *session __used) 286 struct perf_session *session __used)
285{ 287{
286 pid_fork(event->fork.pid, event->fork.ppid, event->fork.time); 288 pid_fork(event->fork.pid, event->fork.ppid, event->fork.time);
287 return 0; 289 return 0;
288} 290}
289 291
290static int process_exit_event(event_t *event, struct sample_data *sample __used, 292static int process_exit_event(union perf_event *event,
293 struct perf_sample *sample __used,
291 struct perf_session *session __used) 294 struct perf_session *session __used)
292{ 295{
293 pid_exit(event->fork.pid, event->fork.time); 296 pid_exit(event->fork.pid, event->fork.time);
@@ -483,8 +486,8 @@ static void sched_switch(int cpu, u64 timestamp, struct trace_entry *te)
483} 486}
484 487
485 488
486static int process_sample_event(event_t *event __used, 489static int process_sample_event(union perf_event *event __used,
487 struct sample_data *sample, 490 struct perf_sample *sample,
488 struct perf_session *session) 491 struct perf_session *session)
489{ 492{
490 struct trace_entry *te; 493 struct trace_entry *te;
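The timechart handlers above also show what the union spelling buys: each callback indexes the member matching the record type it registered for (event->comm, event->fork), and a common header comes first so the type can be dispatched before any payload is touched. An abridged sketch of the assumed layout; the authoritative definition is this tree's util/event.h:

	union perf_event {
		struct perf_event_header header; /* type, misc, size */
		struct ip_event          ip;     /* PERF_RECORD_SAMPLE */
		struct comm_event        comm;   /* PERF_RECORD_COMM */
		struct fork_event        fork;   /* PERF_RECORD_FORK and _EXIT */
		/* ... mmap, lost, read, attr, build_id, ... */
	};
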
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 5a29d9cd948..80c9e062bd5 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -20,11 +20,16 @@
20 20
21#include "perf.h" 21#include "perf.h"
22 22
23#include "util/annotate.h"
24#include "util/cache.h"
23#include "util/color.h" 25#include "util/color.h"
26#include "util/evlist.h"
24#include "util/evsel.h" 27#include "util/evsel.h"
25#include "util/session.h" 28#include "util/session.h"
26#include "util/symbol.h" 29#include "util/symbol.h"
27#include "util/thread.h" 30#include "util/thread.h"
31#include "util/thread_map.h"
32#include "util/top.h"
28#include "util/util.h" 33#include "util/util.h"
29#include <linux/rbtree.h> 34#include <linux/rbtree.h>
30#include "util/parse-options.h" 35#include "util/parse-options.h"
@@ -45,7 +50,6 @@
45#include <errno.h> 50#include <errno.h>
46#include <time.h> 51#include <time.h>
47#include <sched.h> 52#include <sched.h>
48#include <pthread.h>
49 53
50#include <sys/syscall.h> 54#include <sys/syscall.h>
51#include <sys/ioctl.h> 55#include <sys/ioctl.h>
@@ -60,85 +64,42 @@
60 64
61#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) 65#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
62 66
67static struct perf_top top = {
68 .count_filter = 5,
69 .delay_secs = 2,
70 .display_weighted = -1,
71 .target_pid = -1,
72 .target_tid = -1,
73 .active_symbols = LIST_HEAD_INIT(top.active_symbols),
74 .active_symbols_lock = PTHREAD_MUTEX_INITIALIZER,
75 .active_symbols_cond = PTHREAD_COND_INITIALIZER,
76 .freq = 1000, /* 1 KHz */
77};
78
63static bool system_wide = false; 79static bool system_wide = false;
64 80
65static int default_interval = 0; 81static bool use_tui, use_stdio;
66 82
67static int count_filter = 5; 83static int default_interval = 0;
68static int print_entries;
69 84
70static int target_pid = -1;
71static int target_tid = -1;
72static struct thread_map *threads;
73static bool inherit = false; 85static bool inherit = false;
74static struct cpu_map *cpus;
75static int realtime_prio = 0; 86static int realtime_prio = 0;
76static bool group = false; 87static bool group = false;
77static unsigned int page_size; 88static unsigned int page_size;
78static unsigned int mmap_pages = 16; 89static unsigned int mmap_pages = 128;
79static int freq = 1000; /* 1 KHz */
80 90
81static int delay_secs = 2;
82static bool zero = false;
83static bool dump_symtab = false; 91static bool dump_symtab = false;
84 92
85static bool hide_kernel_symbols = false;
86static bool hide_user_symbols = false;
87static struct winsize winsize; 93static struct winsize winsize;
88 94
89/*
90 * Source
91 */
92
93struct source_line {
94 u64 eip;
95 unsigned long count[MAX_COUNTERS];
96 char *line;
97 struct source_line *next;
98};
99
100static const char *sym_filter = NULL; 95static const char *sym_filter = NULL;
101struct sym_entry *sym_filter_entry = NULL;
102struct sym_entry *sym_filter_entry_sched = NULL; 96struct sym_entry *sym_filter_entry_sched = NULL;
103static int sym_pcnt_filter = 5; 97static int sym_pcnt_filter = 5;
104static int sym_counter = 0;
105static struct perf_evsel *sym_evsel = NULL;
106static int display_weighted = -1;
107static const char *cpu_list;
108
109/*
110 * Symbols
111 */
112
113struct sym_entry_source {
114 struct source_line *source;
115 struct source_line *lines;
116 struct source_line **lines_tail;
117 pthread_mutex_t lock;
118};
119
120struct sym_entry {
121 struct rb_node rb_node;
122 struct list_head node;
123 unsigned long snap_count;
124 double weight;
125 int skip;
126 u16 name_len;
127 u8 origin;
128 struct map *map;
129 struct sym_entry_source *src;
130 unsigned long count[0];
131};
132 98
133/* 99/*
134 * Source functions 100 * Source functions
135 */ 101 */
136 102
137static inline struct symbol *sym_entry__symbol(struct sym_entry *self)
138{
139 return ((void *)self) + symbol_conf.priv_size;
140}
141
142void get_term_dimensions(struct winsize *ws) 103void get_term_dimensions(struct winsize *ws)
143{ 104{
144 char *s = getenv("LINES"); 105 char *s = getenv("LINES");
@@ -163,10 +124,10 @@ void get_term_dimensions(struct winsize *ws)
163 124
164static void update_print_entries(struct winsize *ws) 125static void update_print_entries(struct winsize *ws)
165{ 126{
166 print_entries = ws->ws_row; 127 top.print_entries = ws->ws_row;
167 128
168 if (print_entries > 9) 129 if (top.print_entries > 9)
169 print_entries -= 9; 130 top.print_entries -= 9;
170} 131}
171 132
172static void sig_winch_handler(int sig __used) 133static void sig_winch_handler(int sig __used)
@@ -178,12 +139,9 @@ static void sig_winch_handler(int sig __used)
178static int parse_source(struct sym_entry *syme) 139static int parse_source(struct sym_entry *syme)
179{ 140{
180 struct symbol *sym; 141 struct symbol *sym;
181 struct sym_entry_source *source; 142 struct annotation *notes;
182 struct map *map; 143 struct map *map;
183 FILE *file; 144 int err = -1;
184 char command[PATH_MAX*2];
185 const char *path;
186 u64 len;
187 145
188 if (!syme) 146 if (!syme)
189 return -1; 147 return -1;
@@ -194,411 +152,137 @@ static int parse_source(struct sym_entry *syme)
194 /* 152 /*
195 * We can't annotate with just /proc/kallsyms 153 * We can't annotate with just /proc/kallsyms
196 */ 154 */
197 if (map->dso->origin == DSO__ORIG_KERNEL) 155 if (map->dso->origin == DSO__ORIG_KERNEL) {
156 pr_err("Can't annotate %s: No vmlinux file was found in the "
157 "path\n", sym->name);
158 sleep(1);
198 return -1; 159 return -1;
199
200 if (syme->src == NULL) {
201 syme->src = zalloc(sizeof(*source));
202 if (syme->src == NULL)
203 return -1;
204 pthread_mutex_init(&syme->src->lock, NULL);
205 } 160 }
206 161
207 source = syme->src; 162 notes = symbol__annotation(sym);
208 163 if (notes->src != NULL) {
209 if (source->lines) { 164 pthread_mutex_lock(&notes->lock);
210 pthread_mutex_lock(&source->lock);
211 goto out_assign; 165 goto out_assign;
212 } 166 }
213 path = map->dso->long_name;
214
215 len = sym->end - sym->start;
216
217 sprintf(command,
218 "objdump --start-address=%#0*" PRIx64 " --stop-address=%#0*" PRIx64 " -dS %s",
219 BITS_PER_LONG / 4, map__rip_2objdump(map, sym->start),
220 BITS_PER_LONG / 4, map__rip_2objdump(map, sym->end), path);
221
222 file = popen(command, "r");
223 if (!file)
224 return -1;
225
226 pthread_mutex_lock(&source->lock);
227 source->lines_tail = &source->lines;
228 while (!feof(file)) {
229 struct source_line *src;
230 size_t dummy = 0;
231 char *c, *sep;
232 167
233 src = malloc(sizeof(struct source_line)); 168 pthread_mutex_lock(&notes->lock);
234 assert(src != NULL);
235 memset(src, 0, sizeof(struct source_line));
236 169
237 if (getline(&src->line, &dummy, file) < 0) 170 if (symbol__alloc_hist(sym, top.evlist->nr_entries) < 0) {
238 break; 171 pthread_mutex_unlock(&notes->lock);
239 if (!src->line) 172 pr_err("Not enough memory for annotating '%s' symbol!\n",
240 break; 173 sym->name);
241 174 sleep(1);
242 c = strchr(src->line, '\n'); 175 return err;
243 if (c)
244 *c = 0;
245
246 src->next = NULL;
247 *source->lines_tail = src;
248 source->lines_tail = &src->next;
249
250 src->eip = strtoull(src->line, &sep, 16);
251 if (*sep == ':')
252 src->eip = map__objdump_2ip(map, src->eip);
253 else /* this line has no ip info (e.g. source line) */
254 src->eip = 0;
255 } 176 }
256 pclose(file); 177
178 err = symbol__annotate(sym, syme->map, 0);
179 if (err == 0) {
257out_assign: 180out_assign:
258 sym_filter_entry = syme; 181 top.sym_filter_entry = syme;
259 pthread_mutex_unlock(&source->lock); 182 }
260 return 0; 183
184 pthread_mutex_unlock(&notes->lock);
185 return err;
261} 186}
262 187
263static void __zero_source_counters(struct sym_entry *syme) 188static void __zero_source_counters(struct sym_entry *syme)
264{ 189{
265 int i; 190 struct symbol *sym = sym_entry__symbol(syme);
266 struct source_line *line; 191 symbol__annotate_zero_histograms(sym);
267
268 line = syme->src->lines;
269 while (line) {
270 for (i = 0; i < nr_counters; i++)
271 line->count[i] = 0;
272 line = line->next;
273 }
274} 192}
275 193
276static void record_precise_ip(struct sym_entry *syme, int counter, u64 ip) 194static void record_precise_ip(struct sym_entry *syme, int counter, u64 ip)
277{ 195{
278 struct source_line *line; 196 struct annotation *notes;
279 197 struct symbol *sym;
280 if (syme != sym_filter_entry)
281 return;
282 198
283 if (pthread_mutex_trylock(&syme->src->lock)) 199 if (syme != top.sym_filter_entry)
284 return; 200 return;
285 201
286 if (syme->src == NULL || syme->src->source == NULL) 202 sym = sym_entry__symbol(syme);
287 goto out_unlock; 203 notes = symbol__annotation(sym);
288
289 for (line = syme->src->lines; line; line = line->next) {
290 /* skip lines without IP info */
291 if (line->eip == 0)
292 continue;
293 if (line->eip == ip) {
294 line->count[counter]++;
295 break;
296 }
297 if (line->eip > ip)
298 break;
299 }
300out_unlock:
301 pthread_mutex_unlock(&syme->src->lock);
302}
303
304#define PATTERN_LEN (BITS_PER_LONG / 4 + 2)
305
306static void lookup_sym_source(struct sym_entry *syme)
307{
308 struct symbol *symbol = sym_entry__symbol(syme);
309 struct source_line *line;
310 char pattern[PATTERN_LEN + 1];
311
312 sprintf(pattern, "%0*" PRIx64 " <", BITS_PER_LONG / 4,
313 map__rip_2objdump(syme->map, symbol->start));
314
315 pthread_mutex_lock(&syme->src->lock);
316 for (line = syme->src->lines; line; line = line->next) {
317 if (memcmp(line->line, pattern, PATTERN_LEN) == 0) {
318 syme->src->source = line;
319 break;
320 }
321 }
322 pthread_mutex_unlock(&syme->src->lock);
323}
324 204
325static void show_lines(struct source_line *queue, int count, int total) 205 if (pthread_mutex_trylock(&notes->lock))
326{ 206 return;
327 int i;
328 struct source_line *line;
329 207
330 line = queue; 208 ip = syme->map->map_ip(syme->map, ip);
331 for (i = 0; i < count; i++) { 209 symbol__inc_addr_samples(sym, syme->map, counter, ip);
332 float pcnt = 100.0*(float)line->count[sym_counter]/(float)total;
333 210
334 printf("%8li %4.1f%%\t%s\n", line->count[sym_counter], pcnt, line->line); 211 pthread_mutex_unlock(&notes->lock);
335 line = line->next;
336 }
337} 212}
338 213
339#define TRACE_COUNT 3
340
341static void show_details(struct sym_entry *syme) 214static void show_details(struct sym_entry *syme)
342{ 215{
216 struct annotation *notes;
343 struct symbol *symbol; 217 struct symbol *symbol;
344 struct source_line *line; 218 int more;
345 struct source_line *line_queue = NULL;
346 int displayed = 0;
347 int line_queue_count = 0, total = 0, more = 0;
348 219
349 if (!syme) 220 if (!syme)
350 return; 221 return;
351 222
352 if (!syme->src->source)
353 lookup_sym_source(syme);
354
355 if (!syme->src->source)
356 return;
357
358 symbol = sym_entry__symbol(syme); 223 symbol = sym_entry__symbol(syme);
359 printf("Showing %s for %s\n", event_name(sym_evsel), symbol->name); 224 notes = symbol__annotation(symbol);
360 printf(" Events Pcnt (>=%d%%)\n", sym_pcnt_filter);
361
362 pthread_mutex_lock(&syme->src->lock);
363 line = syme->src->source;
364 while (line) {
365 total += line->count[sym_counter];
366 line = line->next;
367 }
368
369 line = syme->src->source;
370 while (line) {
371 float pcnt = 0.0;
372
373 if (!line_queue_count)
374 line_queue = line;
375 line_queue_count++;
376
377 if (line->count[sym_counter])
378 pcnt = 100.0 * line->count[sym_counter] / (float)total;
379 if (pcnt >= (float)sym_pcnt_filter) {
380 if (displayed <= print_entries)
381 show_lines(line_queue, line_queue_count, total);
382 else more++;
383 displayed += line_queue_count;
384 line_queue_count = 0;
385 line_queue = NULL;
386 } else if (line_queue_count > TRACE_COUNT) {
387 line_queue = line_queue->next;
388 line_queue_count--;
389 }
390
391 line->count[sym_counter] = zero ? 0 : line->count[sym_counter] * 7 / 8;
392 line = line->next;
393 }
394 pthread_mutex_unlock(&syme->src->lock);
395 if (more)
396 printf("%d lines not displayed, maybe increase display entries [e]\n", more);
397}
398 225
399/* 226 pthread_mutex_lock(&notes->lock);
400 * Symbols will be added here in event__process_sample and will get out
401 * after decayed.
402 */
403static LIST_HEAD(active_symbols);
404static pthread_mutex_t active_symbols_lock = PTHREAD_MUTEX_INITIALIZER;
405
406/*
407 * Ordering weight: count-1 * count-2 * ... / count-n
408 */
409static double sym_weight(const struct sym_entry *sym)
410{
411 double weight = sym->snap_count;
412 int counter;
413
414 if (!display_weighted)
415 return weight;
416 227
417 for (counter = 1; counter < nr_counters-1; counter++) 228 if (notes->src == NULL)
418 weight *= sym->count[counter]; 229 goto out_unlock;
419 230
420 weight /= (sym->count[counter] + 1); 231 printf("Showing %s for %s\n", event_name(top.sym_evsel), symbol->name);
232 printf(" Events Pcnt (>=%d%%)\n", sym_pcnt_filter);
421 233
422 return weight; 234 more = symbol__annotate_printf(symbol, syme->map, top.sym_evsel->idx,
235 0, sym_pcnt_filter, top.print_entries, 4);
236 if (top.zero)
237 symbol__annotate_zero_histogram(symbol, top.sym_evsel->idx);
238 else
239 symbol__annotate_decay_histogram(symbol, top.sym_evsel->idx);
240 if (more != 0)
241 printf("%d lines not displayed, maybe increase display entries [e]\n", more);
242out_unlock:
243 pthread_mutex_unlock(&notes->lock);
423} 244}
424 245
425static long samples;
426static long kernel_samples, us_samples;
427static long exact_samples;
428static long guest_us_samples, guest_kernel_samples;
429static const char CONSOLE_CLEAR[] = ""; 246static const char CONSOLE_CLEAR[] = "";
430 247
431static void __list_insert_active_sym(struct sym_entry *syme) 248static void __list_insert_active_sym(struct sym_entry *syme)
432{ 249{
433 list_add(&syme->node, &active_symbols); 250 list_add(&syme->node, &top.active_symbols);
434}
435
436static void list_remove_active_sym(struct sym_entry *syme)
437{
438 pthread_mutex_lock(&active_symbols_lock);
439 list_del_init(&syme->node);
440 pthread_mutex_unlock(&active_symbols_lock);
441} 251}
442 252
443static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se) 253static void print_sym_table(struct perf_session *session)
444{ 254{
445 struct rb_node **p = &tree->rb_node; 255 char bf[160];
446 struct rb_node *parent = NULL; 256 int printed = 0;
447 struct sym_entry *iter;
448
449 while (*p != NULL) {
450 parent = *p;
451 iter = rb_entry(parent, struct sym_entry, rb_node);
452
453 if (se->weight > iter->weight)
454 p = &(*p)->rb_left;
455 else
456 p = &(*p)->rb_right;
457 }
458
459 rb_link_node(&se->rb_node, parent, p);
460 rb_insert_color(&se->rb_node, tree);
461}
462
463static void print_sym_table(void)
464{
465 int printed = 0, j;
466 struct perf_evsel *counter;
467 int snap = !display_weighted ? sym_counter : 0;
468 float samples_per_sec = samples/delay_secs;
469 float ksamples_per_sec = kernel_samples/delay_secs;
470 float us_samples_per_sec = (us_samples)/delay_secs;
471 float guest_kernel_samples_per_sec = (guest_kernel_samples)/delay_secs;
472 float guest_us_samples_per_sec = (guest_us_samples)/delay_secs;
473 float esamples_percent = (100.0*exact_samples)/samples;
474 float sum_ksamples = 0.0;
475 struct sym_entry *syme, *n;
476 struct rb_root tmp = RB_ROOT;
477 struct rb_node *nd; 257 struct rb_node *nd;
478 int sym_width = 0, dso_width = 0, dso_short_width = 0; 258 struct sym_entry *syme;
259 struct rb_root tmp = RB_ROOT;
479 const int win_width = winsize.ws_col - 1; 260 const int win_width = winsize.ws_col - 1;
480 261 int sym_width, dso_width, dso_short_width;
481 samples = us_samples = kernel_samples = exact_samples = 0; 262 float sum_ksamples = perf_top__decay_samples(&top, &tmp);
482 guest_kernel_samples = guest_us_samples = 0;
483
484 /* Sort the active symbols */
485 pthread_mutex_lock(&active_symbols_lock);
486 syme = list_entry(active_symbols.next, struct sym_entry, node);
487 pthread_mutex_unlock(&active_symbols_lock);
488
489 list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
490 syme->snap_count = syme->count[snap];
491 if (syme->snap_count != 0) {
492
493 if ((hide_user_symbols &&
494 syme->origin == PERF_RECORD_MISC_USER) ||
495 (hide_kernel_symbols &&
496 syme->origin == PERF_RECORD_MISC_KERNEL)) {
497 list_remove_active_sym(syme);
498 continue;
499 }
500 syme->weight = sym_weight(syme);
501 rb_insert_active_sym(&tmp, syme);
502 sum_ksamples += syme->snap_count;
503
504 for (j = 0; j < nr_counters; j++)
505 syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8;
506 } else
507 list_remove_active_sym(syme);
508 }
509 263
510 puts(CONSOLE_CLEAR); 264 puts(CONSOLE_CLEAR);
511 265
512 printf("%-*.*s\n", win_width, win_width, graph_dotted_line); 266 perf_top__header_snprintf(&top, bf, sizeof(bf));
513 if (!perf_guest) { 267 printf("%s\n", bf);
514 printf(" PerfTop:%8.0f irqs/sec kernel:%4.1f%%"
515 " exact: %4.1f%% [",
516 samples_per_sec,
517 100.0 - (100.0 * ((samples_per_sec - ksamples_per_sec) /
518 samples_per_sec)),
519 esamples_percent);
520 } else {
521 printf(" PerfTop:%8.0f irqs/sec kernel:%4.1f%% us:%4.1f%%"
522 " guest kernel:%4.1f%% guest us:%4.1f%%"
523 " exact: %4.1f%% [",
524 samples_per_sec,
525 100.0 - (100.0 * ((samples_per_sec-ksamples_per_sec) /
526 samples_per_sec)),
527 100.0 - (100.0 * ((samples_per_sec-us_samples_per_sec) /
528 samples_per_sec)),
529 100.0 - (100.0 * ((samples_per_sec -
530 guest_kernel_samples_per_sec) /
531 samples_per_sec)),
532 100.0 - (100.0 * ((samples_per_sec -
533 guest_us_samples_per_sec) /
534 samples_per_sec)),
535 esamples_percent);
536 }
537
538 if (nr_counters == 1 || !display_weighted) {
539 struct perf_evsel *first;
540 first = list_entry(evsel_list.next, struct perf_evsel, node);
541 printf("%" PRIu64, (uint64_t)first->attr.sample_period);
542 if (freq)
543 printf("Hz ");
544 else
545 printf(" ");
546 }
547
548 if (!display_weighted)
549 printf("%s", event_name(sym_evsel));
550 else list_for_each_entry(counter, &evsel_list, node) {
551 if (counter->idx)
552 printf("/");
553
554 printf("%s", event_name(counter));
555 }
556 268
557 printf( "], "); 269 perf_top__reset_sample_counters(&top);
558
559 if (target_pid != -1)
560 printf(" (target_pid: %d", target_pid);
561 else if (target_tid != -1)
562 printf(" (target_tid: %d", target_tid);
563 else
564 printf(" (all");
565
566 if (cpu_list)
567 printf(", CPU%s: %s)\n", cpus->nr > 1 ? "s" : "", cpu_list);
568 else {
569 if (target_tid != -1)
570 printf(")\n");
571 else
572 printf(", %d CPU%s)\n", cpus->nr, cpus->nr > 1 ? "s" : "");
573 }
574 270
575 printf("%-*.*s\n", win_width, win_width, graph_dotted_line); 271 printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
576 272
577 if (sym_filter_entry) { 273 if (session->hists.stats.total_lost != 0) {
578 show_details(sym_filter_entry); 274 color_fprintf(stdout, PERF_COLOR_RED, "WARNING:");
579 return; 275 printf(" LOST %" PRIu64 " events, Check IO/CPU overload\n",
276 session->hists.stats.total_lost);
580 } 277 }
581 278
582 /* 279 if (top.sym_filter_entry) {
583 * Find the longest symbol name that will be displayed 280 show_details(top.sym_filter_entry);
584 */ 281 return;
585 for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
586 syme = rb_entry(nd, struct sym_entry, rb_node);
587 if (++printed > print_entries ||
588 (int)syme->snap_count < count_filter)
589 continue;
590
591 if (syme->map->dso->long_name_len > dso_width)
592 dso_width = syme->map->dso->long_name_len;
593
594 if (syme->map->dso->short_name_len > dso_short_width)
595 dso_short_width = syme->map->dso->short_name_len;
596
597 if (syme->name_len > sym_width)
598 sym_width = syme->name_len;
599 } 282 }
600 283
601 printed = 0; 284 perf_top__find_widths(&top, &tmp, &dso_width, &dso_short_width,
285 &sym_width);
602 286
603 if (sym_width + dso_width > winsize.ws_col - 29) { 287 if (sym_width + dso_width > winsize.ws_col - 29) {
604 dso_width = dso_short_width; 288 dso_width = dso_short_width;
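The large hunk above retires builtin-top.c's private objdump pipeline (struct source_line lists, a popen()ed objdump command, one hand-rolled lock per symbol) in favor of the shared annotation API from util/annotate.h, while the scattered display globals move into the single struct perf_top initialized earlier in the file. Condensed from the rewritten parse_source(), the new flow is roughly:

	struct annotation *notes = symbol__annotation(sym);
	int err = 0;

	pthread_mutex_lock(&notes->lock);
	if (notes->src == NULL) {
		/* one histogram per evsel, then run objdump once */
		if (symbol__alloc_hist(sym, top.evlist->nr_entries) < 0)
			err = -1;
		else
			err = symbol__annotate(sym, syme->map, 0);
	}
	if (err == 0)
		top.sym_filter_entry = syme; /* feeds the details view */
	pthread_mutex_unlock(&notes->lock);
	return err;
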
@@ -606,7 +290,7 @@ static void print_sym_table(void)
606 sym_width = winsize.ws_col - dso_width - 29; 290 sym_width = winsize.ws_col - dso_width - 29;
607 } 291 }
608 putchar('\n'); 292 putchar('\n');
609 if (nr_counters == 1) 293 if (top.evlist->nr_entries == 1)
610 printf(" samples pcnt"); 294 printf(" samples pcnt");
611 else 295 else
612 printf(" weight samples pcnt"); 296 printf(" weight samples pcnt");
@@ -615,7 +299,7 @@ static void print_sym_table(void)
615 printf(" RIP "); 299 printf(" RIP ");
616 printf(" %-*.*s DSO\n", sym_width, sym_width, "function"); 300 printf(" %-*.*s DSO\n", sym_width, sym_width, "function");
617 printf(" %s _______ _____", 301 printf(" %s _______ _____",
618 nr_counters == 1 ? " " : "______"); 302 top.evlist->nr_entries == 1 ? " " : "______");
619 if (verbose) 303 if (verbose)
620 printf(" ________________"); 304 printf(" ________________");
621 printf(" %-*.*s", sym_width, sym_width, graph_line); 305 printf(" %-*.*s", sym_width, sym_width, graph_line);
@@ -628,13 +312,14 @@ static void print_sym_table(void)
628 312
629 syme = rb_entry(nd, struct sym_entry, rb_node); 313 syme = rb_entry(nd, struct sym_entry, rb_node);
630 sym = sym_entry__symbol(syme); 314 sym = sym_entry__symbol(syme);
631 if (++printed > print_entries || (int)syme->snap_count < count_filter) 315 if (++printed > top.print_entries ||
316 (int)syme->snap_count < top.count_filter)
632 continue; 317 continue;
633 318
634 pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) / 319 pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) /
635 sum_ksamples)); 320 sum_ksamples));
636 321
637 if (nr_counters == 1 || !display_weighted) 322 if (top.evlist->nr_entries == 1 || !top.display_weighted)
638 printf("%20.2f ", syme->weight); 323 printf("%20.2f ", syme->weight);
639 else 324 else
640 printf("%9.1f %10ld ", syme->weight, syme->snap_count); 325 printf("%9.1f %10ld ", syme->weight, syme->snap_count);
@@ -693,10 +378,8 @@ static void prompt_symbol(struct sym_entry **target, const char *msg)
693 378
694 /* zero counters of active symbol */ 379 /* zero counters of active symbol */
695 if (syme) { 380 if (syme) {
696 pthread_mutex_lock(&syme->src->lock);
697 __zero_source_counters(syme); 381 __zero_source_counters(syme);
698 *target = NULL; 382 *target = NULL;
699 pthread_mutex_unlock(&syme->src->lock);
700 } 383 }
701 384
702 fprintf(stdout, "\n%s: ", msg); 385 fprintf(stdout, "\n%s: ", msg);
@@ -707,11 +390,11 @@ static void prompt_symbol(struct sym_entry **target, const char *msg)
707 if (p) 390 if (p)
708 *p = 0; 391 *p = 0;
709 392
710 pthread_mutex_lock(&active_symbols_lock); 393 pthread_mutex_lock(&top.active_symbols_lock);
711 syme = list_entry(active_symbols.next, struct sym_entry, node); 394 syme = list_entry(top.active_symbols.next, struct sym_entry, node);
712 pthread_mutex_unlock(&active_symbols_lock); 395 pthread_mutex_unlock(&top.active_symbols_lock);
713 396
714 list_for_each_entry_safe_from(syme, n, &active_symbols, node) { 397 list_for_each_entry_safe_from(syme, n, &top.active_symbols, node) {
715 struct symbol *sym = sym_entry__symbol(syme); 398 struct symbol *sym = sym_entry__symbol(syme);
716 399
717 if (!strcmp(buf, sym->name)) { 400 if (!strcmp(buf, sym->name)) {
@@ -735,34 +418,34 @@ static void print_mapped_keys(void)
735{ 418{
736 char *name = NULL; 419 char *name = NULL;
737 420
738 if (sym_filter_entry) { 421 if (top.sym_filter_entry) {
739 struct symbol *sym = sym_entry__symbol(sym_filter_entry); 422 struct symbol *sym = sym_entry__symbol(top.sym_filter_entry);
740 name = sym->name; 423 name = sym->name;
741 } 424 }
742 425
743 fprintf(stdout, "\nMapped keys:\n"); 426 fprintf(stdout, "\nMapped keys:\n");
744 fprintf(stdout, "\t[d] display refresh delay. \t(%d)\n", delay_secs); 427 fprintf(stdout, "\t[d] display refresh delay. \t(%d)\n", top.delay_secs);
745 fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", print_entries); 428 fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", top.print_entries);
746 429
747 if (nr_counters > 1) 430 if (top.evlist->nr_entries > 1)
748 fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(sym_evsel)); 431 fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(top.sym_evsel));
749 432
750 fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", count_filter); 433 fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", top.count_filter);
751 434
752 fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", sym_pcnt_filter); 435 fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", sym_pcnt_filter);
753 fprintf(stdout, "\t[s] annotate symbol. \t(%s)\n", name?: "NULL"); 436 fprintf(stdout, "\t[s] annotate symbol. \t(%s)\n", name?: "NULL");
754 fprintf(stdout, "\t[S] stop annotation.\n"); 437 fprintf(stdout, "\t[S] stop annotation.\n");
755 438
756 if (nr_counters > 1) 439 if (top.evlist->nr_entries > 1)
757 fprintf(stdout, "\t[w] toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0); 440 fprintf(stdout, "\t[w] toggle display weighted/count[E]r. \t(%d)\n", top.display_weighted ? 1 : 0);
758 441
759 fprintf(stdout, 442 fprintf(stdout,
760 "\t[K] hide kernel_symbols symbols. \t(%s)\n", 443 "\t[K] hide kernel_symbols symbols. \t(%s)\n",
761 hide_kernel_symbols ? "yes" : "no"); 444 top.hide_kernel_symbols ? "yes" : "no");
762 fprintf(stdout, 445 fprintf(stdout,
763 "\t[U] hide user symbols. \t(%s)\n", 446 "\t[U] hide user symbols. \t(%s)\n",
764 hide_user_symbols ? "yes" : "no"); 447 top.hide_user_symbols ? "yes" : "no");
765 fprintf(stdout, "\t[z] toggle sample zeroing. \t(%d)\n", zero ? 1 : 0); 448 fprintf(stdout, "\t[z] toggle sample zeroing. \t(%d)\n", top.zero ? 1 : 0);
766 fprintf(stdout, "\t[qQ] quit.\n"); 449 fprintf(stdout, "\t[qQ] quit.\n");
767} 450}
768 451
@@ -783,7 +466,7 @@ static int key_mapped(int c)
783 return 1; 466 return 1;
784 case 'E': 467 case 'E':
785 case 'w': 468 case 'w':
786 return nr_counters > 1 ? 1 : 0; 469 return top.evlist->nr_entries > 1 ? 1 : 0;
787 default: 470 default:
788 break; 471 break;
789 } 472 }
@@ -818,47 +501,47 @@ static void handle_keypress(struct perf_session *session, int c)
818 501
819 switch (c) { 502 switch (c) {
820 case 'd': 503 case 'd':
821 prompt_integer(&delay_secs, "Enter display delay"); 504 prompt_integer(&top.delay_secs, "Enter display delay");
822 if (delay_secs < 1) 505 if (top.delay_secs < 1)
823 delay_secs = 1; 506 top.delay_secs = 1;
824 break; 507 break;
825 case 'e': 508 case 'e':
826 prompt_integer(&print_entries, "Enter display entries (lines)"); 509 prompt_integer(&top.print_entries, "Enter display entries (lines)");
827 if (print_entries == 0) { 510 if (top.print_entries == 0) {
828 sig_winch_handler(SIGWINCH); 511 sig_winch_handler(SIGWINCH);
829 signal(SIGWINCH, sig_winch_handler); 512 signal(SIGWINCH, sig_winch_handler);
830 } else 513 } else
831 signal(SIGWINCH, SIG_DFL); 514 signal(SIGWINCH, SIG_DFL);
832 break; 515 break;
833 case 'E': 516 case 'E':
834 if (nr_counters > 1) { 517 if (top.evlist->nr_entries > 1) {
835 fprintf(stderr, "\nAvailable events:"); 518 fprintf(stderr, "\nAvailable events:");
836 519
837 list_for_each_entry(sym_evsel, &evsel_list, node) 520 list_for_each_entry(top.sym_evsel, &top.evlist->entries, node)
838 fprintf(stderr, "\n\t%d %s", sym_evsel->idx, event_name(sym_evsel)); 521 fprintf(stderr, "\n\t%d %s", top.sym_evsel->idx, event_name(top.sym_evsel));
839 522
840 prompt_integer(&sym_counter, "Enter details event counter"); 523 prompt_integer(&top.sym_counter, "Enter details event counter");
841 524
842 if (sym_counter >= nr_counters) { 525 if (top.sym_counter >= top.evlist->nr_entries) {
843 sym_evsel = list_entry(evsel_list.next, struct perf_evsel, node); 526 top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node);
844 sym_counter = 0; 527 top.sym_counter = 0;
845 fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(sym_evsel)); 528 fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(top.sym_evsel));
846 sleep(1); 529 sleep(1);
847 break; 530 break;
848 } 531 }
849 list_for_each_entry(sym_evsel, &evsel_list, node) 532 list_for_each_entry(top.sym_evsel, &top.evlist->entries, node)
850 if (sym_evsel->idx == sym_counter) 533 if (top.sym_evsel->idx == top.sym_counter)
851 break; 534 break;
852 } else sym_counter = 0; 535 } else top.sym_counter = 0;
853 break; 536 break;
854 case 'f': 537 case 'f':
855 prompt_integer(&count_filter, "Enter display event count filter"); 538 prompt_integer(&top.count_filter, "Enter display event count filter");
856 break; 539 break;
857 case 'F': 540 case 'F':
858 prompt_percent(&sym_pcnt_filter, "Enter details display event filter (percent)"); 541 prompt_percent(&sym_pcnt_filter, "Enter details display event filter (percent)");
859 break; 542 break;
860 case 'K': 543 case 'K':
861 hide_kernel_symbols = !hide_kernel_symbols; 544 top.hide_kernel_symbols = !top.hide_kernel_symbols;
862 break; 545 break;
863 case 'q': 546 case 'q':
864 case 'Q': 547 case 'Q':
@@ -867,34 +550,50 @@ static void handle_keypress(struct perf_session *session, int c)
867 perf_session__fprintf_dsos(session, stderr); 550 perf_session__fprintf_dsos(session, stderr);
868 exit(0); 551 exit(0);
869 case 's': 552 case 's':
870 prompt_symbol(&sym_filter_entry, "Enter details symbol"); 553 prompt_symbol(&top.sym_filter_entry, "Enter details symbol");
871 break; 554 break;
872 case 'S': 555 case 'S':
873 if (!sym_filter_entry) 556 if (!top.sym_filter_entry)
874 break; 557 break;
875 else { 558 else {
876 struct sym_entry *syme = sym_filter_entry; 559 struct sym_entry *syme = top.sym_filter_entry;
877 560
878 pthread_mutex_lock(&syme->src->lock); 561 top.sym_filter_entry = NULL;
879 sym_filter_entry = NULL;
880 __zero_source_counters(syme); 562 __zero_source_counters(syme);
881 pthread_mutex_unlock(&syme->src->lock);
882 } 563 }
883 break; 564 break;
884 case 'U': 565 case 'U':
885 hide_user_symbols = !hide_user_symbols; 566 top.hide_user_symbols = !top.hide_user_symbols;
886 break; 567 break;
887 case 'w': 568 case 'w':
888 display_weighted = ~display_weighted; 569 top.display_weighted = ~top.display_weighted;
889 break; 570 break;
890 case 'z': 571 case 'z':
891 zero = !zero; 572 top.zero = !top.zero;
892 break; 573 break;
893 default: 574 default:
894 break; 575 break;
895 } 576 }
896} 577}
897 578
579static void *display_thread_tui(void *arg __used)
580{
581 int err = 0;
582 pthread_mutex_lock(&top.active_symbols_lock);
583 while (list_empty(&top.active_symbols)) {
584 err = pthread_cond_wait(&top.active_symbols_cond,
585 &top.active_symbols_lock);
586 if (err)
587 break;
588 }
589 pthread_mutex_unlock(&top.active_symbols_lock);
590 if (!err)
591 perf_top__tui_browser(&top);
592 exit_browser(0);
593 exit(0);
594 return NULL;
595}
596
898static void *display_thread(void *arg __used) 597static void *display_thread(void *arg __used)
899{ 598{
900 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; 599 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
@@ -909,13 +608,13 @@ static void *display_thread(void *arg __used)
909 tc.c_cc[VTIME] = 0; 608 tc.c_cc[VTIME] = 0;
910 609
911repeat: 610repeat:
912 delay_msecs = delay_secs * 1000; 611 delay_msecs = top.delay_secs * 1000;
913 tcsetattr(0, TCSANOW, &tc); 612 tcsetattr(0, TCSANOW, &tc);
914 /* trash return */ 613 /* trash return */
915 getc(stdin); 614 getc(stdin);
916 615
917 do { 616 do {
918 print_sym_table(); 617 print_sym_table(session);
919 } while (!poll(&stdin_poll, 1, delay_msecs) == 1); 618 } while (!poll(&stdin_poll, 1, delay_msecs) == 1);
920 619
921 c = getc(stdin); 620 c = getc(stdin);
@@ -930,6 +629,7 @@ repeat:
930/* Tag samples to be skipped. */ 629/* Tag samples to be skipped. */
931static const char *skip_symbols[] = { 630static const char *skip_symbols[] = {
932 "default_idle", 631 "default_idle",
632 "native_safe_halt",
933 "cpu_idle", 633 "cpu_idle",
934 "enter_idle", 634 "enter_idle",
935 "exit_idle", 635 "exit_idle",
@@ -965,9 +665,9 @@ static int symbol_filter(struct map *map, struct symbol *sym)
965 665
966 syme = symbol__priv(sym); 666 syme = symbol__priv(sym);
967 syme->map = map; 667 syme->map = map;
968 syme->src = NULL; 668 symbol__annotate_init(map, sym);
969 669
970 if (!sym_filter_entry && sym_filter && !strcmp(name, sym_filter)) { 670 if (!top.sym_filter_entry && sym_filter && !strcmp(name, sym_filter)) {
971 /* schedule initial sym_filter_entry setup */ 671 /* schedule initial sym_filter_entry setup */
972 sym_filter_entry_sched = syme; 672 sym_filter_entry_sched = syme;
973 sym_filter = NULL; 673 sym_filter = NULL;
@@ -980,44 +680,40 @@ static int symbol_filter(struct map *map, struct symbol *sym)
980 } 680 }
981 } 681 }
982 682
983 if (!syme->skip)
984 syme->name_len = strlen(sym->name);
985
986 return 0; 683 return 0;
987} 684}
988 685
989static void event__process_sample(const event_t *self, 686static void perf_event__process_sample(const union perf_event *event,
990 struct sample_data *sample, 687 struct perf_sample *sample,
991 struct perf_session *session, 688 struct perf_session *session)
992 struct perf_evsel *evsel)
993{ 689{
994 u64 ip = self->ip.ip; 690 u64 ip = event->ip.ip;
995 struct sym_entry *syme; 691 struct sym_entry *syme;
996 struct addr_location al; 692 struct addr_location al;
997 struct machine *machine; 693 struct machine *machine;
998 u8 origin = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; 694 u8 origin = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
999 695
1000 ++samples; 696 ++top.samples;
1001 697
1002 switch (origin) { 698 switch (origin) {
1003 case PERF_RECORD_MISC_USER: 699 case PERF_RECORD_MISC_USER:
1004 ++us_samples; 700 ++top.us_samples;
1005 if (hide_user_symbols) 701 if (top.hide_user_symbols)
1006 return; 702 return;
1007 machine = perf_session__find_host_machine(session); 703 machine = perf_session__find_host_machine(session);
1008 break; 704 break;
1009 case PERF_RECORD_MISC_KERNEL: 705 case PERF_RECORD_MISC_KERNEL:
1010 ++kernel_samples; 706 ++top.kernel_samples;
1011 if (hide_kernel_symbols) 707 if (top.hide_kernel_symbols)
1012 return; 708 return;
1013 machine = perf_session__find_host_machine(session); 709 machine = perf_session__find_host_machine(session);
1014 break; 710 break;
1015 case PERF_RECORD_MISC_GUEST_KERNEL: 711 case PERF_RECORD_MISC_GUEST_KERNEL:
1016 ++guest_kernel_samples; 712 ++top.guest_kernel_samples;
1017 machine = perf_session__find_machine(session, self->ip.pid); 713 machine = perf_session__find_machine(session, event->ip.pid);
1018 break; 714 break;
1019 case PERF_RECORD_MISC_GUEST_USER: 715 case PERF_RECORD_MISC_GUEST_USER:
1020 ++guest_us_samples; 716 ++top.guest_us_samples;
1021 /* 717 /*
1022 * TODO: we don't process guest user from host side 718 * TODO: we don't process guest user from host side
1023 * except simple counting. 719 * except simple counting.
@@ -1029,15 +725,15 @@ static void event__process_sample(const event_t *self,
1029 725
1030 if (!machine && perf_guest) { 726 if (!machine && perf_guest) {
1031 pr_err("Can't find guest [%d]'s kernel information\n", 727 pr_err("Can't find guest [%d]'s kernel information\n",
1032 self->ip.pid); 728 event->ip.pid);
1033 return; 729 return;
1034 } 730 }
1035 731
1036 if (self->header.misc & PERF_RECORD_MISC_EXACT_IP) 732 if (event->header.misc & PERF_RECORD_MISC_EXACT_IP)
1037 exact_samples++; 733 top.exact_samples++;
1038 734
1039 if (event__preprocess_sample(self, session, &al, sample, 735 if (perf_event__preprocess_sample(event, session, &al, sample,
1040 symbol_filter) < 0 || 736 symbol_filter) < 0 ||
1041 al.filtered) 737 al.filtered)
1042 return; 738 return;
1043 739
@@ -1055,8 +751,9 @@ static void event__process_sample(const event_t *self,
1055 */ 751 */
1056 if (al.map == machine->vmlinux_maps[MAP__FUNCTION] && 752 if (al.map == machine->vmlinux_maps[MAP__FUNCTION] &&
1057 RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) { 753 RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) {
1058 pr_err("The %s file can't be used\n", 754 ui__warning("The %s file can't be used\n",
1059 symbol_conf.vmlinux_name); 755 symbol_conf.vmlinux_name);
756 exit_browser(0);
1060 exit(1); 757 exit(1);
1061 } 758 }
1062 759
@@ -1065,13 +762,13 @@ static void event__process_sample(const event_t *self,
1065 762
1066 /* let's see, whether we need to install initial sym_filter_entry */ 763 /* let's see, whether we need to install initial sym_filter_entry */
1067 if (sym_filter_entry_sched) { 764 if (sym_filter_entry_sched) {
1068 sym_filter_entry = sym_filter_entry_sched; 765 top.sym_filter_entry = sym_filter_entry_sched;
1069 sym_filter_entry_sched = NULL; 766 sym_filter_entry_sched = NULL;
1070 if (parse_source(sym_filter_entry) < 0) { 767 if (parse_source(top.sym_filter_entry) < 0) {
1071 struct symbol *sym = sym_entry__symbol(sym_filter_entry); 768 struct symbol *sym = sym_entry__symbol(top.sym_filter_entry);
1072 769
1073 pr_err("Can't annotate %s", sym->name); 770 pr_err("Can't annotate %s", sym->name);
1074 if (sym_filter_entry->map->dso->origin == DSO__ORIG_KERNEL) { 771 if (top.sym_filter_entry->map->dso->origin == DSO__ORIG_KERNEL) {
1075 pr_err(": No vmlinux file was found in the path:\n"); 772 pr_err(": No vmlinux file was found in the path:\n");
1076 machine__fprintf_vmlinux_path(machine, stderr); 773 machine__fprintf_vmlinux_path(machine, stderr);
1077 } else 774 } else
@@ -1082,166 +779,73 @@ static void event__process_sample(const event_t *self,
1082 779
1083 syme = symbol__priv(al.sym); 780 syme = symbol__priv(al.sym);
1084 if (!syme->skip) { 781 if (!syme->skip) {
1085 syme->count[evsel->idx]++; 782 struct perf_evsel *evsel;
783
1086 syme->origin = origin; 784 syme->origin = origin;
785 evsel = perf_evlist__id2evsel(top.evlist, sample->id);
786 assert(evsel != NULL);
787 syme->count[evsel->idx]++;
1087 record_precise_ip(syme, evsel->idx, ip); 788 record_precise_ip(syme, evsel->idx, ip);
1088 pthread_mutex_lock(&active_symbols_lock); 789 pthread_mutex_lock(&top.active_symbols_lock);
1089 if (list_empty(&syme->node) || !syme->node.next) 790 if (list_empty(&syme->node) || !syme->node.next) {
791 static bool first = true;
1090 __list_insert_active_sym(syme); 792 __list_insert_active_sym(syme);
1091 pthread_mutex_unlock(&active_symbols_lock); 793 if (first) {
794 pthread_cond_broadcast(&top.active_symbols_cond);
795 first = false;
796 }
797 }
798 pthread_mutex_unlock(&top.active_symbols_lock);
1092 } 799 }
1093} 800}
1094 801
1095struct mmap_data { 802static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu)
1096 void *base;
1097 int mask;
1098 unsigned int prev;
1099};
1100
1101static int perf_evsel__alloc_mmap_per_thread(struct perf_evsel *evsel,
1102 int ncpus, int nthreads)
1103{
1104 evsel->priv = xyarray__new(ncpus, nthreads, sizeof(struct mmap_data));
1105 return evsel->priv != NULL ? 0 : -ENOMEM;
1106}
1107
1108static void perf_evsel__free_mmap(struct perf_evsel *evsel)
1109{
1110 xyarray__delete(evsel->priv);
1111 evsel->priv = NULL;
1112}
1113
1114static unsigned int mmap_read_head(struct mmap_data *md)
1115{
1116 struct perf_event_mmap_page *pc = md->base;
1117 int head;
1118
1119 head = pc->data_head;
1120 rmb();
1121
1122 return head;
1123}
1124
1125static void perf_session__mmap_read_counter(struct perf_session *self,
1126 struct perf_evsel *evsel,
1127 int cpu, int thread_idx)
1128{ 803{
1129 struct xyarray *mmap_array = evsel->priv; 804 struct perf_sample sample;
1130 struct mmap_data *md = xyarray__entry(mmap_array, cpu, thread_idx); 805 union perf_event *event;
1131 unsigned int head = mmap_read_head(md);
1132 unsigned int old = md->prev;
1133 unsigned char *data = md->base + page_size;
1134 struct sample_data sample;
1135 int diff;
1136
1137 /*
1138 * If we're further behind than half the buffer, there's a chance
1139 * the writer will bite our tail and mess up the samples under us.
1140 *
1141 * If we somehow ended up ahead of the head, we got messed up.
1142 *
1143 * In either case, truncate and restart at head.
1144 */
1145 diff = head - old;
1146 if (diff > md->mask / 2 || diff < 0) {
1147 fprintf(stderr, "WARNING: failed to keep up with mmap data.\n");
1148
1149 /*
1150 * head points to a known good entry, start there.
1151 */
1152 old = head;
1153 }
1154
1155 for (; old != head;) {
1156 event_t *event = (event_t *)&data[old & md->mask];
1157
1158 event_t event_copy;
1159 806
1160 size_t size = event->header.size; 807 while ((event = perf_evlist__read_on_cpu(top.evlist, cpu)) != NULL) {
808 perf_session__parse_sample(self, event, &sample);
1161 809
1162 /*
1163 * Event straddles the mmap boundary -- header should always
1164 * be inside due to u64 alignment of output.
1165 */
1166 if ((old & md->mask) + size != ((old + size) & md->mask)) {
1167 unsigned int offset = old;
1168 unsigned int len = min(sizeof(*event), size), cpy;
1169 void *dst = &event_copy;
1170
1171 do {
1172 cpy = min(md->mask + 1 - (offset & md->mask), len);
1173 memcpy(dst, &data[offset & md->mask], cpy);
1174 offset += cpy;
1175 dst += cpy;
1176 len -= cpy;
1177 } while (len);
1178
1179 event = &event_copy;
1180 }
1181
1182 event__parse_sample(event, self, &sample);
1183 if (event->header.type == PERF_RECORD_SAMPLE) 810 if (event->header.type == PERF_RECORD_SAMPLE)
1184 event__process_sample(event, &sample, self, evsel); 811 perf_event__process_sample(event, &sample, self);
1185 else 812 else
1186 event__process(event, &sample, self); 813 perf_event__process(event, &sample, self);
1187 old += size;
1188 } 814 }
1189
1190 md->prev = old;
1191} 815}
1192 816
1193static struct pollfd *event_array;
1194
1195static void perf_session__mmap_read(struct perf_session *self) 817static void perf_session__mmap_read(struct perf_session *self)
1196{ 818{
1197 struct perf_evsel *counter; 819 int i;
1198 int i, thread_index;
1199
1200 for (i = 0; i < cpus->nr; i++) {
1201 list_for_each_entry(counter, &evsel_list, node) {
1202 for (thread_index = 0;
1203 thread_index < threads->nr;
1204 thread_index++) {
1205 perf_session__mmap_read_counter(self,
1206 counter, i, thread_index);
1207 }
1208 }
1209 }
1210}
1211 820
1212int nr_poll; 821 for (i = 0; i < top.evlist->cpus->nr; i++)
1213int group_fd; 822 perf_session__mmap_read_cpu(self, i);
823}
1214 824
1215static void start_counter(int i, struct perf_evsel *evsel) 825static void start_counters(struct perf_evlist *evlist)
1216{ 826{
1217 struct xyarray *mmap_array = evsel->priv; 827 struct perf_evsel *counter;
1218 struct mmap_data *mm;
1219 struct perf_event_attr *attr;
1220 int cpu = -1;
1221 int thread_index;
1222
1223 if (target_tid == -1)
1224 cpu = cpus->map[i];
1225 828
1226 attr = &evsel->attr; 829 list_for_each_entry(counter, &evlist->entries, node) {
830 struct perf_event_attr *attr = &counter->attr;
1227 831
1228 attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; 832 attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
1229 833
1230 if (freq) { 834 if (top.freq) {
1231 attr->sample_type |= PERF_SAMPLE_PERIOD; 835 attr->sample_type |= PERF_SAMPLE_PERIOD;
1232 attr->freq = 1; 836 attr->freq = 1;
1233 attr->sample_freq = freq; 837 attr->sample_freq = top.freq;
1234 } 838 }
1235 839
1236 attr->inherit = (cpu < 0) && inherit; 840 if (evlist->nr_entries > 1) {
1237 attr->mmap = 1; 841 attr->sample_type |= PERF_SAMPLE_ID;
842 attr->read_format |= PERF_FORMAT_ID;
843 }
1238 844
1239 for (thread_index = 0; thread_index < threads->nr; thread_index++) { 845 attr->mmap = 1;
1240try_again: 846try_again:
1241 FD(evsel, i, thread_index) = sys_perf_event_open(attr, 847 if (perf_evsel__open(counter, top.evlist->cpus,
1242 threads->map[thread_index], cpu, group_fd, 0); 848 top.evlist->threads, group, inherit) < 0) {
1243
1244 if (FD(evsel, i, thread_index) < 0) {
1245 int err = errno; 849 int err = errno;
1246 850
1247 if (err == EPERM || err == EACCES) 851 if (err == EPERM || err == EACCES)
@@ -1253,8 +857,8 @@ try_again:
1253 * based cpu-clock-tick sw counter, which 857 * based cpu-clock-tick sw counter, which
1254 * is always available even if no PMU support: 858 * is always available even if no PMU support:
1255 */ 859 */
1256 if (attr->type == PERF_TYPE_HARDWARE 860 if (attr->type == PERF_TYPE_HARDWARE &&
1257 && attr->config == PERF_COUNT_HW_CPU_CYCLES) { 861 attr->config == PERF_COUNT_HW_CPU_CYCLES) {
1258 862
1259 if (verbose) 863 if (verbose)
1260 warning(" ... trying to fall back to cpu-clock-ticks\n"); 864 warning(" ... trying to fall back to cpu-clock-ticks\n");
@@ -1264,39 +868,22 @@ try_again:
1264 goto try_again; 868 goto try_again;
1265 } 869 }
1266 printf("\n"); 870 printf("\n");
1267 error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n", 871 error("sys_perf_event_open() syscall returned with %d "
1268 FD(evsel, i, thread_index), strerror(err)); 872 "(%s). /bin/dmesg may provide additional information.\n",
873 err, strerror(err));
1269 die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); 874 die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
1270 exit(-1); 875 exit(-1);
1271 } 876 }
1272 assert(FD(evsel, i, thread_index) >= 0);
1273 fcntl(FD(evsel, i, thread_index), F_SETFL, O_NONBLOCK);
1274
1275 /*
1276 * First counter acts as the group leader:
1277 */
1278 if (group && group_fd == -1)
1279 group_fd = FD(evsel, i, thread_index);
1280
1281 event_array[nr_poll].fd = FD(evsel, i, thread_index);
1282 event_array[nr_poll].events = POLLIN;
1283 nr_poll++;
1284
1285 mm = xyarray__entry(mmap_array, i, thread_index);
1286 mm->prev = 0;
1287 mm->mask = mmap_pages*page_size - 1;
1288 mm->base = mmap(NULL, (mmap_pages+1)*page_size,
1289 PROT_READ, MAP_SHARED, FD(evsel, i, thread_index), 0);
1290 if (mm->base == MAP_FAILED)
1291 die("failed to mmap with %d (%s)\n", errno, strerror(errno));
1292 } 877 }
878
879 if (perf_evlist__mmap(evlist, mmap_pages, false) < 0)
880 die("failed to mmap with %d (%s)\n", errno, strerror(errno));
1293} 881}
1294 882
1295static int __cmd_top(void) 883static int __cmd_top(void)
1296{ 884{
1297 pthread_t thread; 885 pthread_t thread;
1298 struct perf_evsel *counter; 886 int ret __used;
1299 int i, ret;
1300 /* 887 /*
1301 * FIXME: perf_session__new should allow passing a O_MMAP, so that all this 888 * FIXME: perf_session__new should allow passing a O_MMAP, so that all this
1302 * mmap reading, etc is encapsulated in it. Use O_WRONLY for now. 889 * mmap reading, etc is encapsulated in it. Use O_WRONLY for now.
@@ -1305,23 +892,23 @@ static int __cmd_top(void)
1305 if (session == NULL) 892 if (session == NULL)
1306 return -ENOMEM; 893 return -ENOMEM;
1307 894
1308 if (target_tid != -1) 895 if (top.target_tid != -1)
1309 event__synthesize_thread_map(threads, event__process, session); 896 perf_event__synthesize_thread_map(top.evlist->threads,
897 perf_event__process, session);
1310 else 898 else
1311 event__synthesize_threads(event__process, session); 899 perf_event__synthesize_threads(perf_event__process, session);
1312 900
1313 for (i = 0; i < cpus->nr; i++) { 901 start_counters(top.evlist);
1314 group_fd = -1; 902 session->evlist = top.evlist;
1315 list_for_each_entry(counter, &evsel_list, node) 903 perf_session__update_sample_type(session);
1316 start_counter(i, counter);
1317 }
1318 904
1319 /* Wait for a minimal set of events before starting the snapshot */ 905 /* Wait for a minimal set of events before starting the snapshot */
1320 poll(&event_array[0], nr_poll, 100); 906 poll(top.evlist->pollfd, top.evlist->nr_fds, 100);
1321 907
1322 perf_session__mmap_read(session); 908 perf_session__mmap_read(session);
1323 909
1324 if (pthread_create(&thread, NULL, display_thread, session)) { 910 if (pthread_create(&thread, NULL, (use_browser > 0 ? display_thread_tui :
911 display_thread), session)) {
1325 printf("Could not create display thread.\n"); 912 printf("Could not create display thread.\n");
1326 exit(-1); 913 exit(-1);
1327 } 914 }
@@ -1337,12 +924,12 @@ static int __cmd_top(void)
1337 } 924 }
1338 925
1339 while (1) { 926 while (1) {
1340 int hits = samples; 927 u64 hits = top.samples;
1341 928
1342 perf_session__mmap_read(session); 929 perf_session__mmap_read(session);
1343 930
1344 if (hits == samples) 931 if (hits == top.samples)
1345 ret = poll(event_array, nr_poll, 100); 932 ret = poll(top.evlist->pollfd, top.evlist->nr_fds, 100);
1346 } 933 }
1347 934
1348 return 0; 935 return 0;
@@ -1354,31 +941,31 @@ static const char * const top_usage[] = {
1354}; 941};
1355 942
1356static const struct option options[] = { 943static const struct option options[] = {
1357 OPT_CALLBACK('e', "event", NULL, "event", 944 OPT_CALLBACK('e', "event", &top.evlist, "event",
1358 "event selector. use 'perf list' to list available events", 945 "event selector. use 'perf list' to list available events",
1359 parse_events), 946 parse_events),
1360 OPT_INTEGER('c', "count", &default_interval, 947 OPT_INTEGER('c', "count", &default_interval,
1361 "event period to sample"), 948 "event period to sample"),
1362 OPT_INTEGER('p', "pid", &target_pid, 949 OPT_INTEGER('p', "pid", &top.target_pid,
1363 "profile events on existing process id"), 950 "profile events on existing process id"),
1364 OPT_INTEGER('t', "tid", &target_tid, 951 OPT_INTEGER('t', "tid", &top.target_tid,
1365 "profile events on existing thread id"), 952 "profile events on existing thread id"),
1366 OPT_BOOLEAN('a', "all-cpus", &system_wide, 953 OPT_BOOLEAN('a', "all-cpus", &system_wide,
1367 "system-wide collection from all CPUs"), 954 "system-wide collection from all CPUs"),
1368 OPT_STRING('C', "cpu", &cpu_list, "cpu", 955 OPT_STRING('C', "cpu", &top.cpu_list, "cpu",
1369 "list of cpus to monitor"), 956 "list of cpus to monitor"),
1370 OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, 957 OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
1371 "file", "vmlinux pathname"), 958 "file", "vmlinux pathname"),
1372 OPT_BOOLEAN('K', "hide_kernel_symbols", &hide_kernel_symbols, 959 OPT_BOOLEAN('K', "hide_kernel_symbols", &top.hide_kernel_symbols,
1373 "hide kernel symbols"), 960 "hide kernel symbols"),
1374 OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"), 961 OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"),
1375 OPT_INTEGER('r', "realtime", &realtime_prio, 962 OPT_INTEGER('r', "realtime", &realtime_prio,
1376 "collect data with this RT SCHED_FIFO priority"), 963 "collect data with this RT SCHED_FIFO priority"),
1377 OPT_INTEGER('d', "delay", &delay_secs, 964 OPT_INTEGER('d', "delay", &top.delay_secs,
1378 "number of seconds to delay between refreshes"), 965 "number of seconds to delay between refreshes"),
1379 OPT_BOOLEAN('D', "dump-symtab", &dump_symtab, 966 OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
1380 "dump the symbol table used for profiling"), 967 "dump the symbol table used for profiling"),
1381 OPT_INTEGER('f', "count-filter", &count_filter, 968 OPT_INTEGER('f', "count-filter", &top.count_filter,
1382 "only display functions with more events than this"), 969 "only display functions with more events than this"),
1383 OPT_BOOLEAN('g', "group", &group, 970 OPT_BOOLEAN('g', "group", &group,
1384 "put the counters into a counter group"), 971 "put the counters into a counter group"),
@@ -1386,14 +973,16 @@ static const struct option options[] = {
1386 "child tasks inherit counters"), 973 "child tasks inherit counters"),
1387 OPT_STRING('s', "sym-annotate", &sym_filter, "symbol name", 974 OPT_STRING('s', "sym-annotate", &sym_filter, "symbol name",
1388 "symbol to annotate"), 975 "symbol to annotate"),
1389 OPT_BOOLEAN('z', "zero", &zero, 976 OPT_BOOLEAN('z', "zero", &top.zero,
1390 "zero history across updates"), 977 "zero history across updates"),
1391 OPT_INTEGER('F', "freq", &freq, 978 OPT_INTEGER('F', "freq", &top.freq,
1392 "profile at this frequency"), 979 "profile at this frequency"),
1393 OPT_INTEGER('E', "entries", &print_entries, 980 OPT_INTEGER('E', "entries", &top.print_entries,
1394 "display this many functions"), 981 "display this many functions"),
1395 OPT_BOOLEAN('U', "hide_user_symbols", &hide_user_symbols, 982 OPT_BOOLEAN('U', "hide_user_symbols", &top.hide_user_symbols,
1396 "hide user symbols"), 983 "hide user symbols"),
984 OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"),
985 OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"),
1397 OPT_INCR('v', "verbose", &verbose, 986 OPT_INCR('v', "verbose", &verbose,
1398 "be more verbose (show counter open errors, etc)"), 987 "be more verbose (show counter open errors, etc)"),
1399 OPT_END() 988 OPT_END()
@@ -1404,64 +993,68 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
1404 struct perf_evsel *pos; 993 struct perf_evsel *pos;
1405 int status = -ENOMEM; 994 int status = -ENOMEM;
1406 995
996 top.evlist = perf_evlist__new(NULL, NULL);
997 if (top.evlist == NULL)
998 return -ENOMEM;
999
1407 page_size = sysconf(_SC_PAGE_SIZE); 1000 page_size = sysconf(_SC_PAGE_SIZE);
1408 1001
1409 argc = parse_options(argc, argv, options, top_usage, 0); 1002 argc = parse_options(argc, argv, options, top_usage, 0);
1410 if (argc) 1003 if (argc)
1411 usage_with_options(top_usage, options); 1004 usage_with_options(top_usage, options);
1412 1005
1413 if (target_pid != -1) 1006 /*
1414 target_tid = target_pid; 1007 * XXX For now start disabled, only using TUI if explicitly asked for.
1008 * Change that when handle_keys equivalent gets written, live annotation
1009 * done, etc.
1010 */
1011 use_browser = 0;
1415 1012
1416 threads = thread_map__new(target_pid, target_tid); 1013 if (use_stdio)
1417 if (threads == NULL) { 1014 use_browser = 0;
1418 pr_err("Problems finding threads of monitor\n"); 1015 else if (use_tui)
1419 usage_with_options(top_usage, options); 1016 use_browser = 1;
1420 }
1421 1017
1422 event_array = malloc((sizeof(struct pollfd) * 1018 setup_browser(false);
1423 MAX_NR_CPUS * MAX_COUNTERS * threads->nr));
1424 if (!event_array)
1425 return -ENOMEM;
1426 1019
1427 /* CPU and PID are mutually exclusive */ 1020 /* CPU and PID are mutually exclusive */
1428 if (target_tid > 0 && cpu_list) { 1021 if (top.target_tid > 0 && top.cpu_list) {
1429 printf("WARNING: PID switch overriding CPU\n"); 1022 printf("WARNING: PID switch overriding CPU\n");
1430 sleep(1); 1023 sleep(1);
1431 cpu_list = NULL; 1024 top.cpu_list = NULL;
1432 } 1025 }
1433 1026
1434 if (!nr_counters && perf_evsel_list__create_default() < 0) { 1027 if (top.target_pid != -1)
1028 top.target_tid = top.target_pid;
1029
1030 if (perf_evlist__create_maps(top.evlist, top.target_pid,
1031 top.target_tid, top.cpu_list) < 0)
1032 usage_with_options(top_usage, options);
1033
1034 if (!top.evlist->nr_entries &&
1035 perf_evlist__add_default(top.evlist) < 0) {
1435 pr_err("Not enough memory for event selector list\n"); 1036 pr_err("Not enough memory for event selector list\n");
1436 return -ENOMEM; 1037 return -ENOMEM;
1437 } 1038 }
1438 1039
1439 if (delay_secs < 1) 1040 if (top.delay_secs < 1)
1440 delay_secs = 1; 1041 top.delay_secs = 1;
1441 1042
1442 /* 1043 /*
1443 * User specified count overrides default frequency. 1044 * User specified count overrides default frequency.
1444 */ 1045 */
1445 if (default_interval) 1046 if (default_interval)
1446 freq = 0; 1047 top.freq = 0;
1447 else if (freq) { 1048 else if (top.freq) {
1448 default_interval = freq; 1049 default_interval = top.freq;
1449 } else { 1050 } else {
1450 fprintf(stderr, "frequency and count are zero, aborting\n"); 1051 fprintf(stderr, "frequency and count are zero, aborting\n");
1451 exit(EXIT_FAILURE); 1052 exit(EXIT_FAILURE);
1452 } 1053 }
1453 1054
1454 if (target_tid != -1) 1055 list_for_each_entry(pos, &top.evlist->entries, node) {
1455 cpus = cpu_map__dummy_new(); 1056 if (perf_evsel__alloc_fd(pos, top.evlist->cpus->nr,
1456 else 1057 top.evlist->threads->nr) < 0)
1457 cpus = cpu_map__new(cpu_list);
1458
1459 if (cpus == NULL)
1460 usage_with_options(top_usage, options);
1461
1462 list_for_each_entry(pos, &evsel_list, node) {
1463 if (perf_evsel__alloc_mmap_per_thread(pos, cpus->nr, threads->nr) < 0 ||
1464 perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
1465 goto out_free_fd; 1058 goto out_free_fd;
1466 /* 1059 /*
1467 * Fill in the ones not specifically initialized via -c: 1060 * Fill in the ones not specifically initialized via -c:
@@ -1472,26 +1065,28 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
1472 pos->attr.sample_period = default_interval; 1065 pos->attr.sample_period = default_interval;
1473 } 1066 }
1474 1067
1475 sym_evsel = list_entry(evsel_list.next, struct perf_evsel, node); 1068 if (perf_evlist__alloc_pollfd(top.evlist) < 0 ||
1069 perf_evlist__alloc_mmap(top.evlist) < 0)
1070 goto out_free_fd;
1071
1072 top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node);
1476 1073
1477 symbol_conf.priv_size = (sizeof(struct sym_entry) + 1074 symbol_conf.priv_size = (sizeof(struct sym_entry) + sizeof(struct annotation) +
1478 (nr_counters + 1) * sizeof(unsigned long)); 1075 (top.evlist->nr_entries + 1) * sizeof(unsigned long));
1479 1076
1480 symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL); 1077 symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
1481 if (symbol__init() < 0) 1078 if (symbol__init() < 0)
1482 return -1; 1079 return -1;
1483 1080
1484 get_term_dimensions(&winsize); 1081 get_term_dimensions(&winsize);
1485 if (print_entries == 0) { 1082 if (top.print_entries == 0) {
1486 update_print_entries(&winsize); 1083 update_print_entries(&winsize);
1487 signal(SIGWINCH, sig_winch_handler); 1084 signal(SIGWINCH, sig_winch_handler);
1488 } 1085 }
1489 1086
1490 status = __cmd_top(); 1087 status = __cmd_top();
1491out_free_fd: 1088out_free_fd:
1492 list_for_each_entry(pos, &evsel_list, node) 1089 perf_evlist__delete(top.evlist);
1493 perf_evsel__free_mmap(pos);
1494 perf_evsel_list__delete();
1495 1090
1496 return status; 1091 return status;
1497} 1092}
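
The builtin-top.c side of this commit retires the hand-rolled per-counter bookkeeping (struct mmap_data, event_array, group_fd) in favour of the shared perf_evlist layer. Below is a minimal sketch of the setup path the new code converges on, written against the in-tree util/evlist.h API used in the hunks above; error paths are collapsed and it is not a standalone program, so treat the exact signatures as illustrative rather than authoritative:

	#include <errno.h>
	#include "util/evlist.h"	/* perf_evlist__new() and friends */
	#include "util/evsel.h"

	static int evlist_setup(int target_pid, int target_tid,
				const char *cpu_list, unsigned int mmap_pages)
	{
		struct perf_evlist *evlist = perf_evlist__new(NULL, NULL);
		struct perf_evsel *counter;

		if (evlist == NULL)
			return -ENOMEM;

		/* Nothing requested with -e: fall back to the default event. */
		if (!evlist->nr_entries && perf_evlist__add_default(evlist) < 0)
			return -ENOMEM;

		/* Builds evlist->cpus and evlist->threads from pid/tid/cpu list. */
		if (perf_evlist__create_maps(evlist, target_pid,
					     target_tid, cpu_list) < 0)
			return -1;

		list_for_each_entry(counter, &evlist->entries, node)
			if (perf_evsel__open(counter, evlist->cpus,
					     evlist->threads, false, false) < 0)
				return -errno;

		/* One ring buffer per cpu; also fills evlist->pollfd for poll(). */
		return perf_evlist__mmap(evlist, mmap_pages, false);
	}
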
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 95aaf565c70..a5fc660c1f1 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -94,6 +94,32 @@ void get_term_dimensions(struct winsize *ws);
94#include "util/types.h" 94#include "util/types.h"
95#include <stdbool.h> 95#include <stdbool.h>
96 96
97struct perf_mmap {
98 void *base;
99 int mask;
100 unsigned int prev;
101};
102
103static inline unsigned int perf_mmap__read_head(struct perf_mmap *mm)
104{
105 struct perf_event_mmap_page *pc = mm->base;
106 int head = pc->data_head;
107 rmb();
108 return head;
109}
110
111static inline void perf_mmap__write_tail(struct perf_mmap *md,
112 unsigned long tail)
113{
114 struct perf_event_mmap_page *pc = md->base;
115
116 /*
117 * ensure all reads are done before we write the tail out.
118 */
119 /* mb(); */
120 pc->data_tail = tail;
121}
122
97/* 123/*
98 * prctl(PR_TASK_PERF_EVENTS_DISABLE) will (cheaply) disable all 124 * prctl(PR_TASK_PERF_EVENTS_DISABLE) will (cheaply) disable all
99 * counters in the current task. 125 * counters in the current task.
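
The two helpers added to perf.h are the user-space half of the perf mmap ring-buffer protocol: the kernel publishes data_head, the rmb() in perf_mmap__read_head() orders that load against the subsequent event reads, and writing data_tail back tells the kernel which bytes may be overwritten (note the full barrier before the tail store is still commented out at this point in the series). A hedged consumer sketch built on them follows; the wrap-around copy and the overflow backoff kept by the old builtin-top reader are elided:

	#include <stddef.h>
	#include "perf.h"		/* struct perf_mmap helpers above */
	#include "util/event.h"		/* union perf_event */

	static void perf_mmap__consume(struct perf_mmap *md, size_t page_size)
	{
		unsigned int head = perf_mmap__read_head(md);	/* load + rmb() */
		unsigned int old = md->prev;
		/* The data area starts one page past the control page. */
		unsigned char *data = (unsigned char *)md->base + page_size;

		while (old != head) {
			union perf_event *event =
				(union perf_event *)&data[old & md->mask];

			/* ... process the event; copy it out first if it
			 *     straddles the end of the buffer ... */
			old += event->header.size;
		}

		md->prev = old;
		perf_mmap__write_tail(md, old);	/* free the space for the kernel */
	}
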
diff --git a/tools/perf/python/twatch.py b/tools/perf/python/twatch.py
new file mode 100755
index 00000000000..df638c438a9
--- /dev/null
+++ b/tools/perf/python/twatch.py
@@ -0,0 +1,41 @@
1#! /usr/bin/python
2# -*- python -*-
3# -*- coding: utf-8 -*-
4# twatch - Experimental use of the perf python interface
5# Copyright (C) 2011 Arnaldo Carvalho de Melo <acme@redhat.com>
6#
7# This application is free software; you can redistribute it and/or
8# modify it under the terms of the GNU General Public License
9# as published by the Free Software Foundation; version 2.
10#
11# This application is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14# General Public License for more details.
15
16import perf
17
18def main():
19 cpus = perf.cpu_map()
20 threads = perf.thread_map()
21 evsel = perf.evsel(task = 1, comm = 1, mmap = 0,
22 wakeup_events = 1, sample_period = 1,
23 sample_id_all = 1,
24 sample_type = perf.SAMPLE_PERIOD | perf.SAMPLE_TID | perf.SAMPLE_CPU | perf.SAMPLE_TID)
25 evsel.open(cpus = cpus, threads = threads);
26 evlist = perf.evlist(cpus, threads)
27 evlist.add(evsel)
28 evlist.mmap()
29 while True:
30 evlist.poll(timeout = -1)
31 for cpu in cpus:
32 event = evlist.read_on_cpu(cpu)
33 if not event:
34 continue
35 print "cpu: %2d, pid: %4d, tid: %4d" % (event.sample_cpu,
36 event.sample_pid,
37 event.sample_tid),
38 print event
39
40if __name__ == '__main__':
41 main()
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
new file mode 100644
index 00000000000..0d0830c98cd
--- /dev/null
+++ b/tools/perf/util/annotate.c
@@ -0,0 +1,605 @@
1/*
2 * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
3 *
4 * Parts came from builtin-annotate.c, see those files for further
5 * copyright notes.
6 *
7 * Released under the GPL v2. (and only v2, not any later version)
8 */
9
10#include "util.h"
11#include "build-id.h"
12#include "color.h"
13#include "cache.h"
14#include "symbol.h"
15#include "debug.h"
16#include "annotate.h"
17#include <pthread.h>
18
19int symbol__annotate_init(struct map *map __used, struct symbol *sym)
20{
21 struct annotation *notes = symbol__annotation(sym);
22 pthread_mutex_init(&notes->lock, NULL);
23 return 0;
24}
25
26int symbol__alloc_hist(struct symbol *sym, int nevents)
27{
28 struct annotation *notes = symbol__annotation(sym);
29 size_t sizeof_sym_hist = (sizeof(struct sym_hist) +
30 (sym->end - sym->start) * sizeof(u64));
31
32 notes->src = zalloc(sizeof(*notes->src) + nevents * sizeof_sym_hist);
33 if (notes->src == NULL)
34 return -1;
35 notes->src->sizeof_sym_hist = sizeof_sym_hist;
36 notes->src->nr_histograms = nevents;
37 INIT_LIST_HEAD(&notes->src->source);
38 return 0;
39}
40
41void symbol__annotate_zero_histograms(struct symbol *sym)
42{
43 struct annotation *notes = symbol__annotation(sym);
44
45 pthread_mutex_lock(&notes->lock);
46 if (notes->src != NULL)
47 memset(notes->src->histograms, 0,
48 notes->src->nr_histograms * notes->src->sizeof_sym_hist);
49 pthread_mutex_unlock(&notes->lock);
50}
51
52int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
53 int evidx, u64 addr)
54{
55 unsigned offset;
56 struct annotation *notes;
57 struct sym_hist *h;
58
59 notes = symbol__annotation(sym);
60 if (notes->src == NULL)
61 return -ENOMEM;
62
63 pr_debug3("%s: addr=%#" PRIx64 "\n", __func__, map->unmap_ip(map, addr));
64
65 if (addr >= sym->end)
66 return 0;
67
68 offset = addr - sym->start;
69 h = annotation__histogram(notes, evidx);
70 h->sum++;
71 h->addr[offset]++;
72
73 pr_debug3("%#" PRIx64 " %s: period++ [addr: %#" PRIx64 ", %#" PRIx64
74 ", evidx=%d] => %" PRIu64 "\n", sym->start, sym->name,
75 addr, addr - sym->start, evidx, h->addr[offset]);
76 return 0;
77}
78
79static struct objdump_line *objdump_line__new(s64 offset, char *line, size_t privsize)
80{
81 struct objdump_line *self = malloc(sizeof(*self) + privsize);
82
83 if (self != NULL) {
84 self->offset = offset;
85 self->line = line;
86 }
87
88 return self;
89}
90
91void objdump_line__free(struct objdump_line *self)
92{
93 free(self->line);
94 free(self);
95}
96
97static void objdump__add_line(struct list_head *head, struct objdump_line *line)
98{
99 list_add_tail(&line->node, head);
100}
101
102struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
103 struct objdump_line *pos)
104{
105 list_for_each_entry_continue(pos, head, node)
106 if (pos->offset >= 0)
107 return pos;
108
109 return NULL;
110}
111
112static int objdump_line__print(struct objdump_line *oline, struct symbol *sym,
113 int evidx, u64 len, int min_pcnt,
114 int printed, int max_lines,
115 struct objdump_line *queue)
116{
117 static const char *prev_line;
118 static const char *prev_color;
119
120 if (oline->offset != -1) {
121 const char *path = NULL;
122 unsigned int hits = 0;
123 double percent = 0.0;
124 const char *color;
125 struct annotation *notes = symbol__annotation(sym);
126 struct source_line *src_line = notes->src->lines;
127 struct sym_hist *h = annotation__histogram(notes, evidx);
128 s64 offset = oline->offset;
129 struct objdump_line *next;
130
131 next = objdump__get_next_ip_line(&notes->src->source, oline);
132
133 while (offset < (s64)len &&
134 (next == NULL || offset < next->offset)) {
135 if (src_line) {
136 if (path == NULL)
137 path = src_line[offset].path;
138 percent += src_line[offset].percent;
139 } else
140 hits += h->addr[offset];
141
142 ++offset;
143 }
144
145 if (src_line == NULL && h->sum)
146 percent = 100.0 * hits / h->sum;
147
148 if (percent < min_pcnt)
149 return -1;
150
151 if (max_lines && printed >= max_lines)
152 return 1;
153
154 if (queue != NULL) {
155 list_for_each_entry_from(queue, &notes->src->source, node) {
156 if (queue == oline)
157 break;
158 objdump_line__print(queue, sym, evidx, len,
159 0, 0, 1, NULL);
160 }
161 }
162
163 color = get_percent_color(percent);
164
165 /*
166 * Also color the filename and line if needed, with
167 * the same color as the percentage. Don't print it 168 * twice for consecutive addresses with the same filename:line
169 */
170 if (path) {
171 if (!prev_line || strcmp(prev_line, path)
172 || color != prev_color) {
173 color_fprintf(stdout, color, " %s", path);
174 prev_line = path;
175 prev_color = color;
176 }
177 }
178
179 color_fprintf(stdout, color, " %7.2f", percent);
180 printf(" : ");
181 color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", oline->line);
182 } else if (max_lines && printed >= max_lines)
183 return 1;
184 else {
185 if (queue)
186 return -1;
187
188 if (!*oline->line)
189 printf(" :\n");
190 else
191 printf(" : %s\n", oline->line);
192 }
193
194 return 0;
195}
196
197static int symbol__parse_objdump_line(struct symbol *sym, struct map *map,
198 FILE *file, size_t privsize)
199{
200 struct annotation *notes = symbol__annotation(sym);
201 struct objdump_line *objdump_line;
202 char *line = NULL, *tmp, *tmp2, *c;
203 size_t line_len;
204 s64 line_ip, offset = -1;
205
206 if (getline(&line, &line_len, file) < 0)
207 return -1;
208
209 if (!line)
210 return -1;
211
212 while (line_len != 0 && isspace(line[line_len - 1]))
213 line[--line_len] = '\0';
214
215 c = strchr(line, '\n');
216 if (c)
217 *c = 0;
218
219 line_ip = -1;
220
221 /*
222 * Strip leading spaces:
223 */
224 tmp = line;
225 while (*tmp) {
226 if (*tmp != ' ')
227 break;
228 tmp++;
229 }
230
231 if (*tmp) {
232 /*
233 * Parse hex addresses followed by ':'
234 */
235 line_ip = strtoull(tmp, &tmp2, 16);
236 if (*tmp2 != ':' || tmp == tmp2 || tmp2[1] == '\0')
237 line_ip = -1;
238 }
239
240 if (line_ip != -1) {
241 u64 start = map__rip_2objdump(map, sym->start),
242 end = map__rip_2objdump(map, sym->end);
243
244 offset = line_ip - start;
245 if (offset < 0 || (u64)line_ip > end)
246 offset = -1;
247 }
248
249 objdump_line = objdump_line__new(offset, line, privsize);
250 if (objdump_line == NULL) {
251 free(line);
252 return -1;
253 }
254 objdump__add_line(&notes->src->source, objdump_line);
255
256 return 0;
257}
258
259int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize)
260{
261 struct dso *dso = map->dso;
262 char *filename = dso__build_id_filename(dso, NULL, 0);
263 bool free_filename = true;
264 char command[PATH_MAX * 2];
265 FILE *file;
266 int err = 0;
267 char symfs_filename[PATH_MAX];
268
269 if (filename) {
270 snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
271 symbol_conf.symfs, filename);
272 }
273
274 if (filename == NULL) {
275 if (dso->has_build_id) {
276 pr_err("Can't annotate %s: not enough memory\n",
277 sym->name);
278 return -ENOMEM;
279 }
280 goto fallback;
281 } else if (readlink(symfs_filename, command, sizeof(command)) < 0 ||
282 strstr(command, "[kernel.kallsyms]") ||
283 access(symfs_filename, R_OK)) {
284 free(filename);
285fallback:
286 /*
287 * If we don't have build-ids or the build-id file isn't in the
288 * cache, or is just a kallsyms file, well, let's hope that this
289 * DSO is the same as when 'perf record' ran.
290 */
291 filename = dso->long_name;
292 snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
293 symbol_conf.symfs, filename);
294 free_filename = false;
295 }
296
297 if (dso->origin == DSO__ORIG_KERNEL) {
298 char bf[BUILD_ID_SIZE * 2 + 16] = " with build id ";
299 char *build_id_msg = NULL;
300
301 if (dso->annotate_warned)
302 goto out_free_filename;
303
304 if (dso->has_build_id) {
305 build_id__sprintf(dso->build_id,
306 sizeof(dso->build_id), bf + 15);
307 build_id_msg = bf;
308 }
309 err = -ENOENT;
310 dso->annotate_warned = 1;
311 pr_err("Can't annotate %s: No vmlinux file%s was found in the "
312 "path.\nPlease use 'perf buildid-cache -av vmlinux' or "
313 "--vmlinux vmlinux.\n",
314 sym->name, build_id_msg ?: "");
315 goto out_free_filename;
316 }
317
318 pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__,
319 filename, sym->name, map->unmap_ip(map, sym->start),
320 map->unmap_ip(map, sym->end));
321
322 pr_debug("annotating [%p] %30s : [%p] %30s\n",
323 dso, dso->long_name, sym, sym->name);
324
325 snprintf(command, sizeof(command),
326 "objdump --start-address=0x%016" PRIx64
327 " --stop-address=0x%016" PRIx64 " -dS -C %s|grep -v %s|expand",
328 map__rip_2objdump(map, sym->start),
329 map__rip_2objdump(map, sym->end),
330 symfs_filename, filename);
331
332 pr_debug("Executing: %s\n", command);
333
334 file = popen(command, "r");
335 if (!file)
336 goto out_free_filename;
337
338 while (!feof(file))
339 if (symbol__parse_objdump_line(sym, map, file, privsize) < 0)
340 break;
341
342 pclose(file);
343out_free_filename:
344 if (free_filename)
345 free(filename);
346 return err;
347}
348
349static void insert_source_line(struct rb_root *root, struct source_line *src_line)
350{
351 struct source_line *iter;
352 struct rb_node **p = &root->rb_node;
353 struct rb_node *parent = NULL;
354
355 while (*p != NULL) {
356 parent = *p;
357 iter = rb_entry(parent, struct source_line, node);
358
359 if (src_line->percent > iter->percent)
360 p = &(*p)->rb_left;
361 else
362 p = &(*p)->rb_right;
363 }
364
365 rb_link_node(&src_line->node, parent, p);
366 rb_insert_color(&src_line->node, root);
367}
368
369static void symbol__free_source_line(struct symbol *sym, int len)
370{
371 struct annotation *notes = symbol__annotation(sym);
372 struct source_line *src_line = notes->src->lines;
373 int i;
374
375 for (i = 0; i < len; i++)
376 free(src_line[i].path);
377
378 free(src_line);
379 notes->src->lines = NULL;
380}
381
382/* Get the filename:line for the colored entries */
383static int symbol__get_source_line(struct symbol *sym, struct map *map,
384 int evidx, struct rb_root *root, int len,
385 const char *filename)
386{
387 u64 start;
388 int i;
389 char cmd[PATH_MAX * 2];
390 struct source_line *src_line;
391 struct annotation *notes = symbol__annotation(sym);
392 struct sym_hist *h = annotation__histogram(notes, evidx);
393
394 if (!h->sum)
395 return 0;
396
397 src_line = notes->src->lines = calloc(len, sizeof(struct source_line));
398 if (!notes->src->lines)
399 return -1;
400
401 start = map->unmap_ip(map, sym->start);
402
403 for (i = 0; i < len; i++) {
404 char *path = NULL;
405 size_t line_len;
406 u64 offset;
407 FILE *fp;
408
409 src_line[i].percent = 100.0 * h->addr[i] / h->sum;
410 if (src_line[i].percent <= 0.5)
411 continue;
412
413 offset = start + i;
414 sprintf(cmd, "addr2line -e %s %016" PRIx64, filename, offset);
415 fp = popen(cmd, "r");
416 if (!fp)
417 continue;
418
419 if (getline(&path, &line_len, fp) < 0 || !line_len)
420 goto next;
421
422 src_line[i].path = malloc(sizeof(char) * line_len + 1);
423 if (!src_line[i].path)
424 goto next;
425
426 strcpy(src_line[i].path, path);
427 insert_source_line(root, &src_line[i]);
428
429 next:
430 pclose(fp);
431 }
432
433 return 0;
434}
435
436static void print_summary(struct rb_root *root, const char *filename)
437{
438 struct source_line *src_line;
439 struct rb_node *node;
440
441 printf("\nSorted summary for file %s\n", filename);
442 printf("----------------------------------------------\n\n");
443
444 if (RB_EMPTY_ROOT(root)) {
445 printf(" Nothing higher than %1.1f%%\n", MIN_GREEN);
446 return;
447 }
448
449 node = rb_first(root);
450 while (node) {
451 double percent;
452 const char *color;
453 char *path;
454
455 src_line = rb_entry(node, struct source_line, node);
456 percent = src_line->percent;
457 color = get_percent_color(percent);
458 path = src_line->path;
459
460 color_fprintf(stdout, color, " %7.2f %s", percent, path);
461 node = rb_next(node);
462 }
463}
464
465static void symbol__annotate_hits(struct symbol *sym, int evidx)
466{
467 struct annotation *notes = symbol__annotation(sym);
468 struct sym_hist *h = annotation__histogram(notes, evidx);
469 u64 len = sym->end - sym->start, offset;
470
471 for (offset = 0; offset < len; ++offset)
472 if (h->addr[offset] != 0)
473 printf("%*" PRIx64 ": %" PRIu64 "\n", BITS_PER_LONG / 2,
474 sym->start + offset, h->addr[offset]);
475 printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->sum", h->sum);
476}
477
478int symbol__annotate_printf(struct symbol *sym, struct map *map, int evidx,
479 bool full_paths, int min_pcnt, int max_lines,
480 int context)
481{
482 struct dso *dso = map->dso;
483 const char *filename = dso->long_name, *d_filename;
484 struct annotation *notes = symbol__annotation(sym);
485 struct objdump_line *pos, *queue = NULL;
486 int printed = 2, queue_len = 0;
487 int more = 0;
488 u64 len;
489
490 if (full_paths)
491 d_filename = filename;
492 else
493 d_filename = basename(filename);
494
495 len = sym->end - sym->start;
496
497 printf(" Percent | Source code & Disassembly of %s\n", d_filename);
498 printf("------------------------------------------------\n");
499
500 if (verbose)
501 symbol__annotate_hits(sym, evidx);
502
503 list_for_each_entry(pos, &notes->src->source, node) {
504 if (context && queue == NULL) {
505 queue = pos;
506 queue_len = 0;
507 }
508
509 switch (objdump_line__print(pos, sym, evidx, len, min_pcnt,
510 printed, max_lines, queue)) {
511 case 0:
512 ++printed;
513 if (context) {
514 printed += queue_len;
515 queue = NULL;
516 queue_len = 0;
517 }
518 break;
519 case 1:
520 /* filtered by max_lines */
521 ++more;
522 break;
523 case -1:
524 default:
525 /*
526 * Filtered by min_pcnt or non IP lines when
527 * context != 0
528 */
529 if (!context)
530 break;
531 if (queue_len == context)
532 queue = list_entry(queue->node.next, typeof(*queue), node);
533 else
534 ++queue_len;
535 break;
536 }
537 }
538
539 return more;
540}
541
542void symbol__annotate_zero_histogram(struct symbol *sym, int evidx)
543{
544 struct annotation *notes = symbol__annotation(sym);
545 struct sym_hist *h = annotation__histogram(notes, evidx);
546
547 memset(h, 0, notes->src->sizeof_sym_hist);
548}
549
550void symbol__annotate_decay_histogram(struct symbol *sym, int evidx)
551{
552 struct annotation *notes = symbol__annotation(sym);
553 struct sym_hist *h = annotation__histogram(notes, evidx);
554 struct objdump_line *pos;
555 int len = sym->end - sym->start;
556
557 h->sum = 0;
558
559 list_for_each_entry(pos, &notes->src->source, node) {
560 if (pos->offset != -1 && pos->offset < len) {
561 h->addr[pos->offset] = h->addr[pos->offset] * 7 / 8;
562 h->sum += h->addr[pos->offset];
563 }
564 }
565}
566
567void objdump_line_list__purge(struct list_head *head)
568{
569 struct objdump_line *pos, *n;
570
571 list_for_each_entry_safe(pos, n, head, node) {
572 list_del(&pos->node);
573 objdump_line__free(pos);
574 }
575}
576
577int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
578 bool print_lines, bool full_paths, int min_pcnt,
579 int max_lines)
580{
581 struct dso *dso = map->dso;
582 const char *filename = dso->long_name;
583 struct rb_root source_line = RB_ROOT;
584 u64 len;
585
586 if (symbol__annotate(sym, map, 0) < 0)
587 return -1;
588
589 len = sym->end - sym->start;
590
591 if (print_lines) {
592 symbol__get_source_line(sym, map, evidx, &source_line,
593 len, filename);
594 print_summary(&source_line, filename);
595 }
596
597 symbol__annotate_printf(sym, map, evidx, full_paths,
598 min_pcnt, max_lines, 0);
599 if (print_lines)
600 symbol__free_source_line(sym, len);
601
602 objdump_line_list__purge(&symbol__annotation(sym)->src->source);
603
604 return 0;
605}
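
symbol__annotate_decay_histogram() above ages each address bucket by 7/8 on every refresh, so counts decay exponentially between updates. A self-contained check of that integer arithmetic (plain C, no perf headers needed) shows a bucket drops below half its value after six idle refreshes, since (7/8)^6 ≈ 0.45:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long hits = 1024;
		int refresh;

		for (refresh = 1; refresh <= 6; refresh++) {
			hits = hits * 7 / 8;	/* same math as the decay above */
			printf("after refresh %d: %llu hits\n", refresh, hits);
		}
		return 0;	/* prints 896, 784, 686, 600, 525, 459 */
	}
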
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
new file mode 100644
index 00000000000..c2c28689680
--- /dev/null
+++ b/tools/perf/util/annotate.h
@@ -0,0 +1,103 @@
1#ifndef __PERF_ANNOTATE_H
2#define __PERF_ANNOTATE_H
3
4#include <stdbool.h>
5#include "types.h"
6#include "symbol.h"
7#include <linux/list.h>
8#include <linux/rbtree.h>
9
10struct objdump_line {
11 struct list_head node;
12 s64 offset;
13 char *line;
14};
15
16void objdump_line__free(struct objdump_line *self);
17struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
18 struct objdump_line *pos);
19
20struct sym_hist {
21 u64 sum;
22 u64 addr[0];
23};
24
25struct source_line {
26 struct rb_node node;
27 double percent;
28 char *path;
29};
30
31/** struct annotated_source - symbols with hits have this attached, as in struct sannotation
32 *
33 * @histogram: Array of addr hit histograms per event being monitored
34 * @lines: If 'print_lines' is specified, per source code line percentages
35 * @source: source parsed from objdump -dS
36 *
37 * lines is allocated, percentages calculated and all sorted by percentage
38 * when the annotation is about to be presented, so the percentages are for
39 * one of the entries in the histogram array, i.e. for the event/counter being
40 * presented. It is deallocated right after symbol__{tui,tty,etc}_annotate
41 * returns.
42 */
43struct annotated_source {
44 struct list_head source;
45 struct source_line *lines;
46 int nr_histograms;
47 int sizeof_sym_hist;
48 struct sym_hist histograms[0];
49};
50
51struct annotation {
52 pthread_mutex_t lock;
53 struct annotated_source *src;
54};
55
56struct sannotation {
57 struct annotation annotation;
58 struct symbol symbol;
59};
60
61static inline struct sym_hist *annotation__histogram(struct annotation *notes, int idx)
62{
63 return (((void *)&notes->src->histograms) +
64 (notes->src->sizeof_sym_hist * idx));
65}
66
67static inline struct annotation *symbol__annotation(struct symbol *sym)
68{
69 struct sannotation *a = container_of(sym, struct sannotation, symbol);
70 return &a->annotation;
71}
72
73int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
74 int evidx, u64 addr);
75int symbol__alloc_hist(struct symbol *sym, int nevents);
76void symbol__annotate_zero_histograms(struct symbol *sym);
77
78int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize);
79int symbol__annotate_init(struct map *map __used, struct symbol *sym);
80int symbol__annotate_printf(struct symbol *sym, struct map *map, int evidx,
81 bool full_paths, int min_pcnt, int max_lines,
82 int context);
83void symbol__annotate_zero_histogram(struct symbol *sym, int evidx);
84void symbol__annotate_decay_histogram(struct symbol *sym, int evidx);
85void objdump_line_list__purge(struct list_head *head);
86
87int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
88 bool print_lines, bool full_paths, int min_pcnt,
89 int max_lines);
90
91#ifdef NO_NEWT_SUPPORT
92static inline int symbol__tui_annotate(struct symbol *sym __used,
93 struct map *map __used,
94 int evidx __used, int refresh __used)
95{
96 return 0;
97}
98#else
99int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx,
100 int refresh);
101#endif
102
103#endif /* __PERF_ANNOTATE_H */
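
symbol__annotation() works because perf top sizes symbol_conf.priv_size so that a struct annotation is laid out immediately before every struct symbol, exactly the struct sannotation shape declared above; container_of() then walks back from the bare symbol pointer to its private data. A standalone demonstration with stand-in types (the real structs carry more fields):

	#include <stddef.h>
	#include <stdlib.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct symbol { unsigned long start, end; };	/* stand-in */
	struct annotation { void *src; };		/* lock omitted */
	struct sannotation {
		struct annotation annotation;
		struct symbol symbol;
	};

	int main(void)
	{
		struct sannotation *a = calloc(1, sizeof(*a));
		struct symbol *sym;

		if (a == NULL)
			return 1;
		sym = &a->symbol;

		/* Recover the private data from the bare symbol pointer. */
		return container_of(sym, struct sannotation, symbol) == a ? 0 : 1;
	}
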
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index deffb8c9607..31f934af986 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -14,8 +14,8 @@
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include "debug.h" 15#include "debug.h"
16 16
17static int build_id__mark_dso_hit(event_t *event, 17static int build_id__mark_dso_hit(union perf_event *event,
18 struct sample_data *sample __used, 18 struct perf_sample *sample __used,
19 struct perf_session *session) 19 struct perf_session *session)
20{ 20{
21 struct addr_location al; 21 struct addr_location al;
@@ -37,13 +37,14 @@ static int build_id__mark_dso_hit(event_t *event,
37 return 0; 37 return 0;
38} 38}
39 39
40static int event__exit_del_thread(event_t *self, struct sample_data *sample __used, 40static int perf_event__exit_del_thread(union perf_event *event,
41 struct perf_session *session) 41 struct perf_sample *sample __used,
42 struct perf_session *session)
42{ 43{
43 struct thread *thread = perf_session__findnew(session, self->fork.tid); 44 struct thread *thread = perf_session__findnew(session, event->fork.tid);
44 45
45 dump_printf("(%d:%d):(%d:%d)\n", self->fork.pid, self->fork.tid, 46 dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid,
46 self->fork.ppid, self->fork.ptid); 47 event->fork.ppid, event->fork.ptid);
47 48
48 if (thread) { 49 if (thread) {
49 rb_erase(&thread->rb_node, &session->threads); 50 rb_erase(&thread->rb_node, &session->threads);
@@ -56,9 +57,9 @@ static int event__exit_del_thread(event_t *self, struct sample_data *sample __us
56 57
57struct perf_event_ops build_id__mark_dso_hit_ops = { 58struct perf_event_ops build_id__mark_dso_hit_ops = {
58 .sample = build_id__mark_dso_hit, 59 .sample = build_id__mark_dso_hit,
59 .mmap = event__process_mmap, 60 .mmap = perf_event__process_mmap,
60 .fork = event__process_task, 61 .fork = perf_event__process_task,
61 .exit = event__exit_del_thread, 62 .exit = perf_event__exit_del_thread,
62}; 63};
63 64
64char *dso__build_id_filename(struct dso *self, char *bf, size_t size) 65char *dso__build_id_filename(struct dso *self, char *bf, size_t size)
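
build_id__mark_dso_hit_ops above shows the perf_event_ops pattern this tree uses to dispatch stream events: fill in one callback per record type and let the session iterate the stream. A hedged sketch of a custom ops table reusing the handlers renamed in this hunk; the count_exit handler itself is hypothetical:

	#include "util/session.h"	/* struct perf_event_ops (this tree) */
	#include "util/event.h"
	#include "util/debug.h"		/* dump_printf() */

	static int count_exit(union perf_event *event,
			      struct perf_sample *sample __used,
			      struct perf_session *session __used)
	{
		/* fork/exit records carry the task ids in event->fork */
		dump_printf("exit (%d:%d)\n", event->fork.pid, event->fork.tid);
		return 0;
	}

	static struct perf_event_ops count_exit_ops = {
		.fork = perf_event__process_task,
		.mmap = perf_event__process_mmap,
		.exit = count_exit,
	};
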
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
index a7729797fd9..fc5e5a09d5b 100644
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -34,13 +34,14 @@ extern int pager_use_color;
34extern int use_browser; 34extern int use_browser;
35 35
36#ifdef NO_NEWT_SUPPORT 36#ifdef NO_NEWT_SUPPORT
37static inline void setup_browser(void) 37static inline void setup_browser(bool fallback_to_pager)
38{ 38{
39 setup_pager(); 39 if (fallback_to_pager)
40 setup_pager();
40} 41}
41static inline void exit_browser(bool wait_for_ok __used) {} 42static inline void exit_browser(bool wait_for_ok __used) {}
42#else 43#else
43void setup_browser(void); 44void setup_browser(bool fallback_to_pager);
44void exit_browser(bool wait_for_ok); 45void exit_browser(bool wait_for_ok);
45#endif 46#endif
46 47
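
The new fallback_to_pager flag exists because the two kinds of callers want different things: one-shot tools like perf report can page their output, while perf top redraws on a timer and must keep stdout raw. Illustrative call sites, not verbatim from the tree:

	#include "util/cache.h"

	static void one_shot_tool(void) { setup_browser(true); }   /* page output */
	static void live_tool(void)     { setup_browser(false); }  /* no pager */
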
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index e12d539417b..9f7106a8d9a 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2009-2010, Frederic Weisbecker <fweisbec@gmail.com> 2 * Copyright (C) 2009-2011, Frederic Weisbecker <fweisbec@gmail.com>
3 * 3 *
4 * Handle the callchains from the stream in an ad-hoc radix tree and then 4 * Handle the callchains from the stream in an ad-hoc radix tree and then
5 * sort them in an rbtree. 5 * sort them in an rbtree.
@@ -18,7 +18,8 @@
18#include "util.h" 18#include "util.h"
19#include "callchain.h" 19#include "callchain.h"
20 20
21bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event) 21bool ip_callchain__valid(struct ip_callchain *chain,
22 const union perf_event *event)
22{ 23{
23 unsigned int chain_size = event->header.size; 24 unsigned int chain_size = event->header.size;
24 chain_size -= (unsigned long)&event->ip.__more_data - (unsigned long)event; 25 chain_size -= (unsigned long)&event->ip.__more_data - (unsigned long)event;
@@ -26,10 +27,10 @@ bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event)
26} 27}
27 28
28#define chain_for_each_child(child, parent) \ 29#define chain_for_each_child(child, parent) \
29 list_for_each_entry(child, &parent->children, brothers) 30 list_for_each_entry(child, &parent->children, siblings)
30 31
31#define chain_for_each_child_safe(child, next, parent) \ 32#define chain_for_each_child_safe(child, next, parent) \
32 list_for_each_entry_safe(child, next, &parent->children, brothers) 33 list_for_each_entry_safe(child, next, &parent->children, siblings)
33 34
34static void 35static void
35rb_insert_callchain(struct rb_root *root, struct callchain_node *chain, 36rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
@@ -38,14 +39,14 @@ rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
38 struct rb_node **p = &root->rb_node; 39 struct rb_node **p = &root->rb_node;
39 struct rb_node *parent = NULL; 40 struct rb_node *parent = NULL;
40 struct callchain_node *rnode; 41 struct callchain_node *rnode;
41 u64 chain_cumul = cumul_hits(chain); 42 u64 chain_cumul = callchain_cumul_hits(chain);
42 43
43 while (*p) { 44 while (*p) {
44 u64 rnode_cumul; 45 u64 rnode_cumul;
45 46
46 parent = *p; 47 parent = *p;
47 rnode = rb_entry(parent, struct callchain_node, rb_node); 48 rnode = rb_entry(parent, struct callchain_node, rb_node);
48 rnode_cumul = cumul_hits(rnode); 49 rnode_cumul = callchain_cumul_hits(rnode);
49 50
50 switch (mode) { 51 switch (mode) {
51 case CHAIN_FLAT: 52 case CHAIN_FLAT:
@@ -104,7 +105,7 @@ static void __sort_chain_graph_abs(struct callchain_node *node,
104 105
105 chain_for_each_child(child, node) { 106 chain_for_each_child(child, node) {
106 __sort_chain_graph_abs(child, min_hit); 107 __sort_chain_graph_abs(child, min_hit);
107 if (cumul_hits(child) >= min_hit) 108 if (callchain_cumul_hits(child) >= min_hit)
108 rb_insert_callchain(&node->rb_root, child, 109 rb_insert_callchain(&node->rb_root, child,
109 CHAIN_GRAPH_ABS); 110 CHAIN_GRAPH_ABS);
110 } 111 }
@@ -129,7 +130,7 @@ static void __sort_chain_graph_rel(struct callchain_node *node,
129 130
130 chain_for_each_child(child, node) { 131 chain_for_each_child(child, node) {
131 __sort_chain_graph_rel(child, min_percent); 132 __sort_chain_graph_rel(child, min_percent);
132 if (cumul_hits(child) >= min_hit) 133 if (callchain_cumul_hits(child) >= min_hit)
133 rb_insert_callchain(&node->rb_root, child, 134 rb_insert_callchain(&node->rb_root, child,
134 CHAIN_GRAPH_REL); 135 CHAIN_GRAPH_REL);
135 } 136 }
@@ -143,7 +144,7 @@ sort_chain_graph_rel(struct rb_root *rb_root, struct callchain_root *chain_root,
143 rb_root->rb_node = chain_root->node.rb_root.rb_node; 144 rb_root->rb_node = chain_root->node.rb_root.rb_node;
144} 145}
145 146
146int register_callchain_param(struct callchain_param *param) 147int callchain_register_param(struct callchain_param *param)
147{ 148{
148 switch (param->mode) { 149 switch (param->mode) {
149 case CHAIN_GRAPH_ABS: 150 case CHAIN_GRAPH_ABS:
@@ -189,32 +190,27 @@ create_child(struct callchain_node *parent, bool inherit_children)
189 chain_for_each_child(next, new) 190 chain_for_each_child(next, new)
190 next->parent = new; 191 next->parent = new;
191 } 192 }
192 list_add_tail(&new->brothers, &parent->children); 193 list_add_tail(&new->siblings, &parent->children);
193 194
194 return new; 195 return new;
195} 196}
196 197
197 198
198struct resolved_ip {
199 u64 ip;
200 struct map_symbol ms;
201};
202
203struct resolved_chain {
204 u64 nr;
205 struct resolved_ip ips[0];
206};
207
208
209/* 199/*
210 * Fill the node with callchain values 200 * Fill the node with callchain values
211 */ 201 */
212static void 202static void
213fill_node(struct callchain_node *node, struct resolved_chain *chain, int start) 203fill_node(struct callchain_node *node, struct callchain_cursor *cursor)
214{ 204{
215 unsigned int i; 205 struct callchain_cursor_node *cursor_node;
206
207 node->val_nr = cursor->nr - cursor->pos;
208 if (!node->val_nr)
209 pr_warning("Warning: empty node in callchain tree\n");
216 210
217 for (i = start; i < chain->nr; i++) { 211 cursor_node = callchain_cursor_current(cursor);
212
213 while (cursor_node) {
218 struct callchain_list *call; 214 struct callchain_list *call;
219 215
220 call = zalloc(sizeof(*call)); 216 call = zalloc(sizeof(*call));
@@ -222,23 +218,25 @@ fill_node(struct callchain_node *node, struct resolved_chain *chain, int start)
222 perror("not enough memory for the code path tree"); 218 perror("not enough memory for the code path tree");
223 return; 219 return;
224 } 220 }
225 call->ip = chain->ips[i].ip; 221 call->ip = cursor_node->ip;
226 call->ms = chain->ips[i].ms; 222 call->ms.sym = cursor_node->sym;
223 call->ms.map = cursor_node->map;
227 list_add_tail(&call->list, &node->val); 224 list_add_tail(&call->list, &node->val);
225
226 callchain_cursor_advance(cursor);
227 cursor_node = callchain_cursor_current(cursor);
228 } 228 }
229 node->val_nr = chain->nr - start;
230 if (!node->val_nr)
231 pr_warning("Warning: empty node in callchain tree\n");
232} 229}
233 230
234static void 231static void
235add_child(struct callchain_node *parent, struct resolved_chain *chain, 232add_child(struct callchain_node *parent,
236 int start, u64 period) 233 struct callchain_cursor *cursor,
234 u64 period)
237{ 235{
238 struct callchain_node *new; 236 struct callchain_node *new;
239 237
240 new = create_child(parent, false); 238 new = create_child(parent, false);
241 fill_node(new, chain, start); 239 fill_node(new, cursor);
242 240
243 new->children_hit = 0; 241 new->children_hit = 0;
244 new->hit = period; 242 new->hit = period;
@@ -250,9 +248,10 @@ add_child(struct callchain_node *parent, struct resolved_chain *chain,
250 * Then create another child to host the given callchain of new branch 248 * Then create another child to host the given callchain of new branch
251 */ 249 */
252static void 250static void
253split_add_child(struct callchain_node *parent, struct resolved_chain *chain, 251split_add_child(struct callchain_node *parent,
254 struct callchain_list *to_split, int idx_parents, int idx_local, 252 struct callchain_cursor *cursor,
255 u64 period) 253 struct callchain_list *to_split,
254 u64 idx_parents, u64 idx_local, u64 period)
256{ 255{
257 struct callchain_node *new; 256 struct callchain_node *new;
258 struct list_head *old_tail; 257 struct list_head *old_tail;
@@ -272,14 +271,14 @@ split_add_child(struct callchain_node *parent, struct resolved_chain *chain,
272 /* split the hits */ 271 /* split the hits */
273 new->hit = parent->hit; 272 new->hit = parent->hit;
274 new->children_hit = parent->children_hit; 273 new->children_hit = parent->children_hit;
275 parent->children_hit = cumul_hits(new); 274 parent->children_hit = callchain_cumul_hits(new);
276 new->val_nr = parent->val_nr - idx_local; 275 new->val_nr = parent->val_nr - idx_local;
277 parent->val_nr = idx_local; 276 parent->val_nr = idx_local;
278 277
279 /* create a new child for the new branch if any */ 278 /* create a new child for the new branch if any */
280 if (idx_total < chain->nr) { 279 if (idx_total < cursor->nr) {
281 parent->hit = 0; 280 parent->hit = 0;
282 add_child(parent, chain, idx_total, period); 281 add_child(parent, cursor, period);
283 parent->children_hit += period; 282 parent->children_hit += period;
284 } else { 283 } else {
285 parent->hit = period; 284 parent->hit = period;
@@ -287,36 +286,41 @@ split_add_child(struct callchain_node *parent, struct resolved_chain *chain,
287} 286}
288 287
289static int 288static int
290append_chain(struct callchain_node *root, struct resolved_chain *chain, 289append_chain(struct callchain_node *root,
291 unsigned int start, u64 period); 290 struct callchain_cursor *cursor,
291 u64 period);
292 292
293static void 293static void
294append_chain_children(struct callchain_node *root, struct resolved_chain *chain, 294append_chain_children(struct callchain_node *root,
295 unsigned int start, u64 period) 295 struct callchain_cursor *cursor,
296 u64 period)
296{ 297{
297 struct callchain_node *rnode; 298 struct callchain_node *rnode;
298 299
299 /* lookup in children */ 300 /* lookup in children */
300 chain_for_each_child(rnode, root) { 301 chain_for_each_child(rnode, root) {
301 unsigned int ret = append_chain(rnode, chain, start, period); 302 unsigned int ret = append_chain(rnode, cursor, period);
302 303
303 if (!ret) 304 if (!ret)
304 goto inc_children_hit; 305 goto inc_children_hit;
305 } 306 }
306 /* nothing in children, add to the current node */ 307 /* nothing in children, add to the current node */
307 add_child(root, chain, start, period); 308 add_child(root, cursor, period);
308 309
309inc_children_hit: 310inc_children_hit:
310 root->children_hit += period; 311 root->children_hit += period;
311} 312}
312 313
313static int 314static int
314append_chain(struct callchain_node *root, struct resolved_chain *chain, 315append_chain(struct callchain_node *root,
315 unsigned int start, u64 period) 316 struct callchain_cursor *cursor,
317 u64 period)
316{ 318{
319 struct callchain_cursor_node *curr_snap = cursor->curr;
317 struct callchain_list *cnode; 320 struct callchain_list *cnode;
318 unsigned int i = start; 321 u64 start = cursor->pos;
319 bool found = false; 322 bool found = false;
323 u64 matches;
320 324
321 /* 325 /*
322 * Lookup in the current node 326 * Lookup in the current node
@@ -324,141 +328,134 @@ append_chain(struct callchain_node *root, struct resolved_chain *chain,
324 * anywhere inside a function. 328 * anywhere inside a function.
325 */ 329 */
326 list_for_each_entry(cnode, &root->val, list) { 330 list_for_each_entry(cnode, &root->val, list) {
331 struct callchain_cursor_node *node;
327 struct symbol *sym; 332 struct symbol *sym;
328 333
329 if (i == chain->nr) 334 node = callchain_cursor_current(cursor);
335 if (!node)
330 break; 336 break;
331 337
332 sym = chain->ips[i].ms.sym; 338 sym = node->sym;
333 339
334 if (cnode->ms.sym && sym) { 340 if (cnode->ms.sym && sym) {
335 if (cnode->ms.sym->start != sym->start) 341 if (cnode->ms.sym->start != sym->start)
336 break; 342 break;
337 } else if (cnode->ip != chain->ips[i].ip) 343 } else if (cnode->ip != node->ip)
338 break; 344 break;
339 345
340 if (!found) 346 if (!found)
341 found = true; 347 found = true;
342 i++; 348
349 callchain_cursor_advance(cursor);
343 } 350 }
344 351
345 /* no match: rely on the parent */ 352 /* no match: rely on the parent */
346 if (!found) 353 if (!found) {
354 cursor->curr = curr_snap;
355 cursor->pos = start;
347 return -1; 356 return -1;
357 }
358
359 matches = cursor->pos - start;
348 360
349 /* we match only a part of the node. Split it and add the new chain */ 361 /* we match only a part of the node. Split it and add the new chain */
350 if (i - start < root->val_nr) { 362 if (matches < root->val_nr) {
351 split_add_child(root, chain, cnode, start, i - start, period); 363 split_add_child(root, cursor, cnode, start, matches, period);
352 return 0; 364 return 0;
353 } 365 }
354 366
355 /* we match 100% of the path, increment the hit */ 367 /* we match 100% of the path, increment the hit */
356 if (i - start == root->val_nr && i == chain->nr) { 368 if (matches == root->val_nr && cursor->pos == cursor->nr) {
357 root->hit += period; 369 root->hit += period;
358 return 0; 370 return 0;
359 } 371 }
360 372
361 /* We match the node and still have a part remaining */ 373 /* We match the node and still have a part remaining */
362 append_chain_children(root, chain, i, period); 374 append_chain_children(root, cursor, period);
363 375
364 return 0; 376 return 0;
365} 377}
366 378
367static void filter_context(struct ip_callchain *old, struct resolved_chain *new, 379int callchain_append(struct callchain_root *root,
368 struct map_symbol *syms) 380 struct callchain_cursor *cursor,
369{ 381 u64 period)
370 int i, j = 0;
371
372 for (i = 0; i < (int)old->nr; i++) {
373 if (old->ips[i] >= PERF_CONTEXT_MAX)
374 continue;
375
376 new->ips[j].ip = old->ips[i];
377 new->ips[j].ms = syms[i];
378 j++;
379 }
380
381 new->nr = j;
382}
383
384
385int callchain_append(struct callchain_root *root, struct ip_callchain *chain,
386 struct map_symbol *syms, u64 period)
387{ 382{
388 struct resolved_chain *filtered; 383 if (!cursor->nr)
389
390 if (!chain->nr)
391 return 0; 384 return 0;
392 385
393 filtered = zalloc(sizeof(*filtered) + 386 callchain_cursor_commit(cursor);
394 chain->nr * sizeof(struct resolved_ip));
395 if (!filtered)
396 return -ENOMEM;
397
398 filter_context(chain, filtered, syms);
399
400 if (!filtered->nr)
401 goto end;
402 387
403 append_chain_children(&root->node, filtered, 0, period); 388 append_chain_children(&root->node, cursor, period);
404 389
405 if (filtered->nr > root->max_depth) 390 if (cursor->nr > root->max_depth)
406 root->max_depth = filtered->nr; 391 root->max_depth = cursor->nr;
407end:
408 free(filtered);
409 392
410 return 0; 393 return 0;
411} 394}
412 395
413static int 396static int
414merge_chain_branch(struct callchain_node *dst, struct callchain_node *src, 397merge_chain_branch(struct callchain_cursor *cursor,
415 struct resolved_chain *chain) 398 struct callchain_node *dst, struct callchain_node *src)
416{ 399{
400 struct callchain_cursor_node **old_last = cursor->last;
417 struct callchain_node *child, *next_child; 401 struct callchain_node *child, *next_child;
418 struct callchain_list *list, *next_list; 402 struct callchain_list *list, *next_list;
419 int old_pos = chain->nr; 403 int old_pos = cursor->nr;
420 int err = 0; 404 int err = 0;
421 405
422 list_for_each_entry_safe(list, next_list, &src->val, list) { 406 list_for_each_entry_safe(list, next_list, &src->val, list) {
423 chain->ips[chain->nr].ip = list->ip; 407 callchain_cursor_append(cursor, list->ip,
424 chain->ips[chain->nr].ms = list->ms; 408 list->ms.map, list->ms.sym);
425 chain->nr++;
426 list_del(&list->list); 409 list_del(&list->list);
427 free(list); 410 free(list);
428 } 411 }
429 412
430 if (src->hit) 413 if (src->hit) {
431 append_chain_children(dst, chain, 0, src->hit); 414 callchain_cursor_commit(cursor);
415 append_chain_children(dst, cursor, src->hit);
416 }
432 417
433 chain_for_each_child_safe(child, next_child, src) { 418 chain_for_each_child_safe(child, next_child, src) {
434 err = merge_chain_branch(dst, child, chain); 419 err = merge_chain_branch(cursor, dst, child);
435 if (err) 420 if (err)
436 break; 421 break;
437 422
438 list_del(&child->brothers); 423 list_del(&child->siblings);
439 free(child); 424 free(child);
440 } 425 }
441 426
442 chain->nr = old_pos; 427 cursor->nr = old_pos;
428 cursor->last = old_last;
443 429
444 return err; 430 return err;
445} 431}
446 432
447int callchain_merge(struct callchain_root *dst, struct callchain_root *src) 433int callchain_merge(struct callchain_cursor *cursor,
434 struct callchain_root *dst, struct callchain_root *src)
435{
436 return merge_chain_branch(cursor, &dst->node, &src->node);
437}
438
439int callchain_cursor_append(struct callchain_cursor *cursor,
440 u64 ip, struct map *map, struct symbol *sym)
448{ 441{
449 struct resolved_chain *chain; 442 struct callchain_cursor_node *node = *cursor->last;
450 int err;
451 443
452 chain = malloc(sizeof(*chain) + 444 if (!node) {
453 src->max_depth * sizeof(struct resolved_ip)); 445 node = calloc(1, sizeof(*node));
454 if (!chain) 446 if (!node)
455 return -ENOMEM; 447 return -ENOMEM;
456 448
457 chain->nr = 0; 449 *cursor->last = node;
450 }
458 451
459 err = merge_chain_branch(&dst->node, &src->node, chain); 452 node->ip = ip;
453 node->map = map;
454 node->sym = sym;
460 455
461 free(chain); 456 cursor->nr++;
462 457
463 return err; 458 cursor->last = &node->next;
459
460 return 0;
464} 461}
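The cursor replaces the old resolved_chain scratch buffer: a caller feeds resolved frames one at a time, then callchain_append() commits the cursor and consumes it. A minimal sketch of that call sequence, using only the functions added in this diff; the frames array, nr_frames and period variables are illustrative stand-ins for a resolved sample:

    struct callchain_cursor cursor = { .first = NULL };
    struct callchain_root root;
    unsigned int i;

    callchain_init(&root);

    /* write phase: one cursor entry per resolved frame */
    callchain_cursor_reset(&cursor);
    for (i = 0; i < nr_frames; i++)
            callchain_cursor_append(&cursor, frames[i].ip,
                                    frames[i].map, frames[i].sym);

    /* read phase: callchain_append() commits and walks the cursor */
    callchain_append(&root, &cursor, period);

Resetting instead of freeing is the point of the design: the next sample reuses the nodes allocated for this one.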
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index c15fb8c24ad..1a79df9f739 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -16,7 +16,7 @@ enum chain_mode {
16 16
17struct callchain_node { 17struct callchain_node {
18 struct callchain_node *parent; 18 struct callchain_node *parent;
19 struct list_head brothers; 19 struct list_head siblings;
20 struct list_head children; 20 struct list_head children;
21 struct list_head val; 21 struct list_head val;
22 struct rb_node rb_node; /* to sort nodes in an rbtree */ 22 struct rb_node rb_node; /* to sort nodes in an rbtree */
@@ -49,9 +49,30 @@ struct callchain_list {
49 struct list_head list; 49 struct list_head list;
50}; 50};
51 51
52/*
53 * A callchain cursor is a singly linked list that
54 * lets one feed a callchain progressively.
55 * It keeps persistent allocated entries to minimize
56 * allocations.
57 */
58struct callchain_cursor_node {
59 u64 ip;
60 struct map *map;
61 struct symbol *sym;
62 struct callchain_cursor_node *next;
63};
64
65struct callchain_cursor {
66 u64 nr;
67 struct callchain_cursor_node *first;
68 struct callchain_cursor_node **last;
69 u64 pos;
70 struct callchain_cursor_node *curr;
71};
72
52static inline void callchain_init(struct callchain_root *root) 73static inline void callchain_init(struct callchain_root *root)
53{ 74{
54 INIT_LIST_HEAD(&root->node.brothers); 75 INIT_LIST_HEAD(&root->node.siblings);
55 INIT_LIST_HEAD(&root->node.children); 76 INIT_LIST_HEAD(&root->node.children);
56 INIT_LIST_HEAD(&root->node.val); 77 INIT_LIST_HEAD(&root->node.val);
57 78
@@ -61,15 +82,54 @@ static inline void callchain_init(struct callchain_root *root)
61 root->max_depth = 0; 82 root->max_depth = 0;
62} 83}
63 84
64static inline u64 cumul_hits(struct callchain_node *node) 85static inline u64 callchain_cumul_hits(struct callchain_node *node)
65{ 86{
66 return node->hit + node->children_hit; 87 return node->hit + node->children_hit;
67} 88}
68 89
69int register_callchain_param(struct callchain_param *param); 90int callchain_register_param(struct callchain_param *param);
70int callchain_append(struct callchain_root *root, struct ip_callchain *chain, 91int callchain_append(struct callchain_root *root,
71 struct map_symbol *syms, u64 period); 92 struct callchain_cursor *cursor,
72int callchain_merge(struct callchain_root *dst, struct callchain_root *src); 93 u64 period);
94
95int callchain_merge(struct callchain_cursor *cursor,
96 struct callchain_root *dst, struct callchain_root *src);
97
98bool ip_callchain__valid(struct ip_callchain *chain,
99 const union perf_event *event);
100/*
101 * Initialize a cursor before adding entries to it, but keep
102 * the previously allocated entries as a cache.
103 */
104static inline void callchain_cursor_reset(struct callchain_cursor *cursor)
105{
106 cursor->nr = 0;
107 cursor->last = &cursor->first;
108}
109
110int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip,
111 struct map *map, struct symbol *sym);
73 112
74bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event); 113/* Close a cursor writing session. Initialize for the reader */
114static inline void callchain_cursor_commit(struct callchain_cursor *cursor)
115{
116 cursor->curr = cursor->first;
117 cursor->pos = 0;
118}
119
120/* Cursor reading iteration helpers */
121static inline struct callchain_cursor_node *
122callchain_cursor_current(struct callchain_cursor *cursor)
123{
124 if (cursor->pos == cursor->nr)
125 return NULL;
126
127 return cursor->curr;
128}
129
130static inline void callchain_cursor_advance(struct callchain_cursor *cursor)
131{
132 cursor->curr = cursor->curr->next;
133 cursor->pos++;
134}
75#endif /* __PERF_CALLCHAIN_H */ 135#endif /* __PERF_CALLCHAIN_H */
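Note how "last" above is a pointer to a pointer: appends are O(1), and callchain_cursor_reset() can rewind to the head without freeing, so nodes allocated for earlier samples are recycled. A self-contained toy of that pattern, with simplified types rather than the perf structs:

    #include <stdio.h>
    #include <stdlib.h>

    struct node { int val; struct node *next; };

    struct cursor {
            unsigned long nr;       /* entries valid in this round */
            struct node *first;
            struct node **last;     /* where the next append lands */
    };

    static void cursor_reset(struct cursor *c)
    {
            c->nr = 0;
            c->last = &c->first;    /* rewind; keep old nodes as a cache */
    }

    static int cursor_append(struct cursor *c, int val)
    {
            struct node *n = *c->last;

            if (!n) {               /* no cached node here: allocate one */
                    n = calloc(1, sizeof(*n));
                    if (!n)
                            return -1;
                    *c->last = n;
            }
            n->val = val;           /* cached or fresh, just overwrite */
            c->nr++;
            c->last = &n->next;
            return 0;
    }

    int main(void)
    {
            struct cursor c = { .first = NULL };
            int round, i;

            for (round = 0; round < 2; round++) {
                    cursor_reset(&c);   /* round two reuses round one's nodes */
                    for (i = 0; i < 3; i++)
                            cursor_append(&c, round * 10 + i);
            }
            printf("%lu entries, first value %d\n", c.nr, c.first->val);
            return 0;
    }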
diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
new file mode 100644
index 00000000000..9fea7553522
--- /dev/null
+++ b/tools/perf/util/cgroup.c
@@ -0,0 +1,178 @@
1#include "util.h"
2#include "../perf.h"
3#include "parse-options.h"
4#include "evsel.h"
5#include "cgroup.h"
6#include "debugfs.h" /* MAX_PATH, STR() */
7#include "evlist.h"
8
9int nr_cgroups;
10
11static int
12cgroupfs_find_mountpoint(char *buf, size_t maxlen)
13{
14 FILE *fp;
15 char mountpoint[MAX_PATH+1], tokens[MAX_PATH+1], type[MAX_PATH+1];
16 char *token, *saved_ptr;
17 int found = 0;
18
19 fp = fopen("/proc/mounts", "r");
20 if (!fp)
21 return -1;
22
23 /*
24 * in order to handle split hierarchy, we need to scan /proc/mounts
25 * and inspect every cgroupfs mount point to find one that has
26 * the perf_event subsystem
27 */
28 while (fscanf(fp, "%*s %"STR(MAX_PATH)"s %"STR(MAX_PATH)"s %"
29 STR(MAX_PATH)"s %*d %*d\n",
30 mountpoint, type, tokens) == 3) {
31
32 if (!strcmp(type, "cgroup")) {
33
34 token = strtok_r(tokens, ",", &saved_ptr);
35
36 while (token != NULL) {
37 if (!strcmp(token, "perf_event")) {
38 found = 1;
39 break;
40 }
41 token = strtok_r(NULL, ",", &saved_ptr);
42 }
43 }
44 if (found)
45 break;
46 }
47 fclose(fp);
48 if (!found)
49 return -1;
50
51 if (strlen(mountpoint) < maxlen) {
52 strcpy(buf, mountpoint);
53 return 0;
54 }
55 return -1;
56}
57
58static int open_cgroup(char *name)
59{
60 char path[MAX_PATH+1];
61 char mnt[MAX_PATH+1];
62 int fd;
63
64
65 if (cgroupfs_find_mountpoint(mnt, MAX_PATH+1))
66 return -1;
67
68 snprintf(path, MAX_PATH, "%s/%s", mnt, name);
69
70 fd = open(path, O_RDONLY);
71 if (fd == -1)
72 fprintf(stderr, "no access to cgroup %s\n", path);
73
74 return fd;
75}
76
77static int add_cgroup(struct perf_evlist *evlist, char *str)
78{
79 struct perf_evsel *counter;
80 struct cgroup_sel *cgrp = NULL;
81 int n;
82 /*
83 * check if cgrp is already defined; if so, reuse it
84 */
85 list_for_each_entry(counter, &evlist->entries, node) {
86 cgrp = counter->cgrp;
87 if (!cgrp)
88 continue;
89 if (!strcmp(cgrp->name, str))
90 break;
91
92 cgrp = NULL;
93 }
94
95 if (!cgrp) {
96 cgrp = zalloc(sizeof(*cgrp));
97 if (!cgrp)
98 return -1;
99
100 cgrp->name = str;
101
102 cgrp->fd = open_cgroup(str);
103 if (cgrp->fd == -1) {
104 free(cgrp);
105 return -1;
106 }
107 }
108
109 /*
110 * find the corresponding event:
111 * when adding cgroup N, we need to find event N
112 */
113 n = 0;
114 list_for_each_entry(counter, &evlist->entries, node) {
115 if (n == nr_cgroups)
116 goto found;
117 n++;
118 }
119 if (cgrp->refcnt == 0)
120 free(cgrp);
121
122 return -1;
123found:
124 cgrp->refcnt++;
125 counter->cgrp = cgrp;
126 return 0;
127}
128
129void close_cgroup(struct cgroup_sel *cgrp)
130{
131 if (!cgrp)
132 return;
133
134 /* XXX: not reentrant */
135 if (--cgrp->refcnt == 0) {
136 close(cgrp->fd);
137 free(cgrp->name);
138 free(cgrp);
139 }
140}
141
142int parse_cgroups(const struct option *opt __used, const char *str,
143 int unset __used)
144{
145 struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
146 const char *p, *e, *eos = str + strlen(str);
147 char *s;
148 int ret;
149
150 if (list_empty(&evlist->entries)) {
151 fprintf(stderr, "must define events before cgroups\n");
152 return -1;
153 }
154
155 for (;;) {
156 p = strchr(str, ',');
157 e = p ? p : eos;
158
159 /* allow empty cgroups, i.e., skip */
160 if (e - str) {
161 /* strndup() NUL-terminates the copy */
162 s = strndup(str, e - str);
163 if (!s)
164 return -1;
165 ret = add_cgroup(evlist, s);
166 if (ret) {
167 free(s);
168 return -1;
169 }
170 }
171 /* nr_cgroups is increased even for empty cgroups */
172 nr_cgroups++;
173 if (!p)
174 break;
175 str = p+1;
176 }
177 return 0;
178}
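Since cgroup controllers may be mounted as split hierarchies, cgroupfs_find_mountpoint() cannot assume one canonical mount; it scans /proc/mounts and tokenizes the option string of every cgroup mount. A standalone, Linux-only sketch of the same scan, with plain buffer sizes in place of the MAX_PATH/STR() macros pulled from debugfs.h:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char mountpoint[4096], type[4096], opts[4096];
            char *token, *saved;
            FILE *fp = fopen("/proc/mounts", "r");

            if (!fp)
                    return 1;

            /* /proc/mounts fields: device mountpoint type options dump pass */
            while (fscanf(fp, "%*s %4095s %4095s %4095s %*d %*d\n",
                          mountpoint, type, opts) == 3) {
                    if (strcmp(type, "cgroup"))
                            continue;
                    /* the options list the subsystems bound to this mount */
                    for (token = strtok_r(opts, ",", &saved); token;
                         token = strtok_r(NULL, ",", &saved)) {
                            if (!strcmp(token, "perf_event")) {
                                    printf("perf_event cgroup at %s\n",
                                           mountpoint);
                                    fclose(fp);
                                    return 0;
                            }
                    }
            }
            fclose(fp);
            return 1;
    }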
diff --git a/tools/perf/util/cgroup.h b/tools/perf/util/cgroup.h
new file mode 100644
index 00000000000..89acd6debdc
--- /dev/null
+++ b/tools/perf/util/cgroup.h
@@ -0,0 +1,17 @@
1#ifndef __CGROUP_H__
2#define __CGROUP_H__
3
4struct option;
5
6struct cgroup_sel {
7 char *name;
8 int fd;
9 int refcnt;
10};
11
12
13extern int nr_cgroups; /* number of explicit cgroups defined */
14extern void close_cgroup(struct cgroup_sel *cgrp);
15extern int parse_cgroups(const struct option *opt, const char *str, int unset);
16
17#endif /* __CGROUP_H__ */
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 3ccaa104338..6893eec693a 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -177,3 +177,8 @@ struct cpu_map *cpu_map__dummy_new(void)
177 177
178 return cpus; 178 return cpus;
179} 179}
180
181void cpu_map__delete(struct cpu_map *map)
182{
183 free(map);
184}
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index f7a4f42f630..072c0a37479 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -8,6 +8,6 @@ struct cpu_map {
8 8
9struct cpu_map *cpu_map__new(const char *cpu_list); 9struct cpu_map *cpu_map__new(const char *cpu_list);
10struct cpu_map *cpu_map__dummy_new(void); 10struct cpu_map *cpu_map__dummy_new(void);
11void *cpu_map__delete(struct cpu_map *map); 11void cpu_map__delete(struct cpu_map *map);
12 12
13#endif /* __PERF_CPUMAP_H */ 13#endif /* __PERF_CPUMAP_H */
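The header change above fixes cpu_map__delete() to return void, matching the free()-style destructor just added to cpumap.c. A short usage sketch, assuming only the two prototypes above; the "0-3" cpu list string is merely an example:

    struct cpu_map *cpus = cpu_map__new("0-3");

    if (cpus == NULL)
            return -1;
    /* ... consume the map ... */
    cpu_map__delete(cpus);      /* frees the map, returns nothing */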
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
index 01bbe8ecec3..d4536a9e0d8 100644
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -57,7 +57,7 @@ void ui__warning(const char *format, ...)
57} 57}
58#endif 58#endif
59 59
60void trace_event(event_t *event) 60void trace_event(union perf_event *event)
61{ 61{
62 unsigned char *raw_event = (void *)event; 62 unsigned char *raw_event = (void *)event;
63 const char *color = PERF_COLOR_BLUE; 63 const char *color = PERF_COLOR_BLUE;
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index ca35fd66b5d..93516cf4682 100644
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -9,7 +9,7 @@ extern int verbose;
9extern bool quiet, dump_trace; 9extern bool quiet, dump_trace;
10 10
11int dump_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2))); 11int dump_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
12void trace_event(event_t *event); 12void trace_event(union perf_event *event);
13 13
14struct ui_progress; 14struct ui_progress;
15 15
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 50d0a931497..2b15c362ef5 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -6,8 +6,9 @@
6#include "string.h" 6#include "string.h"
7#include "strlist.h" 7#include "strlist.h"
8#include "thread.h" 8#include "thread.h"
9#include "thread_map.h"
9 10
10static const char *event__name[] = { 11static const char *perf_event__names[] = {
11 [0] = "TOTAL", 12 [0] = "TOTAL",
12 [PERF_RECORD_MMAP] = "MMAP", 13 [PERF_RECORD_MMAP] = "MMAP",
13 [PERF_RECORD_LOST] = "LOST", 14 [PERF_RECORD_LOST] = "LOST",
@@ -25,16 +26,16 @@ static const char *event__name[] = {
25 [PERF_RECORD_FINISHED_ROUND] = "FINISHED_ROUND", 26 [PERF_RECORD_FINISHED_ROUND] = "FINISHED_ROUND",
26}; 27};
27 28
28const char *event__get_event_name(unsigned int id) 29const char *perf_event__name(unsigned int id)
29{ 30{
30 if (id >= ARRAY_SIZE(event__name)) 31 if (id >= ARRAY_SIZE(perf_event__names))
31 return "INVALID"; 32 return "INVALID";
32 if (!event__name[id]) 33 if (!perf_event__names[id])
33 return "UNKNOWN"; 34 return "UNKNOWN";
34 return event__name[id]; 35 return perf_event__names[id];
35} 36}
36 37
37static struct sample_data synth_sample = { 38static struct perf_sample synth_sample = {
38 .pid = -1, 39 .pid = -1,
39 .tid = -1, 40 .tid = -1,
40 .time = -1, 41 .time = -1,
@@ -43,9 +44,9 @@ static struct sample_data synth_sample = {
43 .period = 1, 44 .period = 1,
44}; 45};
45 46
46static pid_t event__synthesize_comm(event_t *event, pid_t pid, int full, 47static pid_t perf_event__synthesize_comm(union perf_event *event, pid_t pid,
47 event__handler_t process, 48 int full, perf_event__handler_t process,
48 struct perf_session *session) 49 struct perf_session *session)
49{ 50{
50 char filename[PATH_MAX]; 51 char filename[PATH_MAX];
51 char bf[BUFSIZ]; 52 char bf[BUFSIZ];
@@ -126,9 +127,10 @@ out:
126 return tgid; 127 return tgid;
127} 128}
128 129
129static int event__synthesize_mmap_events(event_t *event, pid_t pid, pid_t tgid, 130static int perf_event__synthesize_mmap_events(union perf_event *event,
130 event__handler_t process, 131 pid_t pid, pid_t tgid,
131 struct perf_session *session) 132 perf_event__handler_t process,
133 struct perf_session *session)
132{ 134{
133 char filename[PATH_MAX]; 135 char filename[PATH_MAX];
134 FILE *fp; 136 FILE *fp;
@@ -199,14 +201,14 @@ static int event__synthesize_mmap_events(event_t *event, pid_t pid, pid_t tgid,
199 return 0; 201 return 0;
200} 202}
201 203
202int event__synthesize_modules(event__handler_t process, 204int perf_event__synthesize_modules(perf_event__handler_t process,
203 struct perf_session *session, 205 struct perf_session *session,
204 struct machine *machine) 206 struct machine *machine)
205{ 207{
206 struct rb_node *nd; 208 struct rb_node *nd;
207 struct map_groups *kmaps = &machine->kmaps; 209 struct map_groups *kmaps = &machine->kmaps;
208 event_t *event = zalloc(sizeof(event->mmap) + session->id_hdr_size); 210 union perf_event *event = zalloc((sizeof(event->mmap) +
209 211 session->id_hdr_size));
210 if (event == NULL) { 212 if (event == NULL) {
211 pr_debug("Not enough memory synthesizing mmap event " 213 pr_debug("Not enough memory synthesizing mmap event "
212 "for kernel modules\n"); 214 "for kernel modules\n");
@@ -251,23 +253,24 @@ int event__synthesize_modules(event__handler_t process,
251 return 0; 253 return 0;
252} 254}
253 255
254static int __event__synthesize_thread(event_t *comm_event, event_t *mmap_event, 256static int __event__synthesize_thread(union perf_event *comm_event,
255 pid_t pid, event__handler_t process, 257 union perf_event *mmap_event,
258 pid_t pid, perf_event__handler_t process,
256 struct perf_session *session) 259 struct perf_session *session)
257{ 260{
258 pid_t tgid = event__synthesize_comm(comm_event, pid, 1, process, 261 pid_t tgid = perf_event__synthesize_comm(comm_event, pid, 1, process,
259 session); 262 session);
260 if (tgid == -1) 263 if (tgid == -1)
261 return -1; 264 return -1;
262 return event__synthesize_mmap_events(mmap_event, pid, tgid, 265 return perf_event__synthesize_mmap_events(mmap_event, pid, tgid,
263 process, session); 266 process, session);
264} 267}
265 268
266int event__synthesize_thread_map(struct thread_map *threads, 269int perf_event__synthesize_thread_map(struct thread_map *threads,
267 event__handler_t process, 270 perf_event__handler_t process,
268 struct perf_session *session) 271 struct perf_session *session)
269{ 272{
270 event_t *comm_event, *mmap_event; 273 union perf_event *comm_event, *mmap_event;
271 int err = -1, thread; 274 int err = -1, thread;
272 275
273 comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size); 276 comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size);
@@ -294,12 +297,12 @@ out:
294 return err; 297 return err;
295} 298}
296 299
297int event__synthesize_threads(event__handler_t process, 300int perf_event__synthesize_threads(perf_event__handler_t process,
298 struct perf_session *session) 301 struct perf_session *session)
299{ 302{
300 DIR *proc; 303 DIR *proc;
301 struct dirent dirent, *next; 304 struct dirent dirent, *next;
302 event_t *comm_event, *mmap_event; 305 union perf_event *comm_event, *mmap_event;
303 int err = -1; 306 int err = -1;
304 307
305 comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size); 308 comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size);
@@ -357,10 +360,10 @@ static int find_symbol_cb(void *arg, const char *name, char type,
357 return 1; 360 return 1;
358} 361}
359 362
360int event__synthesize_kernel_mmap(event__handler_t process, 363int perf_event__synthesize_kernel_mmap(perf_event__handler_t process,
361 struct perf_session *session, 364 struct perf_session *session,
362 struct machine *machine, 365 struct machine *machine,
363 const char *symbol_name) 366 const char *symbol_name)
364{ 367{
365 size_t size; 368 size_t size;
366 const char *filename, *mmap_name; 369 const char *filename, *mmap_name;
@@ -374,8 +377,8 @@ int event__synthesize_kernel_mmap(event__handler_t process,
374 * kernels. 377 * kernels.
375 */ 378 */
376 struct process_symbol_args args = { .name = symbol_name, }; 379 struct process_symbol_args args = { .name = symbol_name, };
377 event_t *event = zalloc(sizeof(event->mmap) + session->id_hdr_size); 380 union perf_event *event = zalloc((sizeof(event->mmap) +
378 381 session->id_hdr_size));
379 if (event == NULL) { 382 if (event == NULL) {
380 pr_debug("Not enough memory synthesizing mmap event " 383 pr_debug("Not enough memory synthesizing mmap event "
381 "for kernel modules\n"); 384 "for kernel modules\n");
@@ -421,42 +424,15 @@ int event__synthesize_kernel_mmap(event__handler_t process,
421 return err; 424 return err;
422} 425}
423 426
424static void thread__comm_adjust(struct thread *self, struct hists *hists) 427int perf_event__process_comm(union perf_event *event,
425{ 428 struct perf_sample *sample __used,
426 char *comm = self->comm; 429 struct perf_session *session)
427
428 if (!symbol_conf.col_width_list_str && !symbol_conf.field_sep &&
429 (!symbol_conf.comm_list ||
430 strlist__has_entry(symbol_conf.comm_list, comm))) {
431 u16 slen = strlen(comm);
432
433 if (hists__new_col_len(hists, HISTC_COMM, slen))
434 hists__set_col_len(hists, HISTC_THREAD, slen + 6);
435 }
436}
437
438static int thread__set_comm_adjust(struct thread *self, const char *comm,
439 struct hists *hists)
440{ 430{
441 int ret = thread__set_comm(self, comm); 431 struct thread *thread = perf_session__findnew(session, event->comm.tid);
442
443 if (ret)
444 return ret;
445
446 thread__comm_adjust(self, hists);
447 432
448 return 0; 433 dump_printf(": %s:%d\n", event->comm.comm, event->comm.tid);
449}
450 434
451int event__process_comm(event_t *self, struct sample_data *sample __used, 435 if (thread == NULL || thread__set_comm(thread, event->comm.comm)) {
452 struct perf_session *session)
453{
454 struct thread *thread = perf_session__findnew(session, self->comm.tid);
455
456 dump_printf(": %s:%d\n", self->comm.comm, self->comm.tid);
457
458 if (thread == NULL || thread__set_comm_adjust(thread, self->comm.comm,
459 &session->hists)) {
460 dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n"); 436 dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
461 return -1; 437 return -1;
462 } 438 }
@@ -464,19 +440,21 @@ int event__process_comm(event_t *self, struct sample_data *sample __used,
464 return 0; 440 return 0;
465} 441}
466 442
467int event__process_lost(event_t *self, struct sample_data *sample __used, 443int perf_event__process_lost(union perf_event *event,
468 struct perf_session *session) 444 struct perf_sample *sample __used,
445 struct perf_session *session)
469{ 446{
470 dump_printf(": id:%" PRIu64 ": lost:%" PRIu64 "\n", 447 dump_printf(": id:%" PRIu64 ": lost:%" PRIu64 "\n",
471 self->lost.id, self->lost.lost); 448 event->lost.id, event->lost.lost);
472 session->hists.stats.total_lost += self->lost.lost; 449 session->hists.stats.total_lost += event->lost.lost;
473 return 0; 450 return 0;
474} 451}
475 452
476static void event_set_kernel_mmap_len(struct map **maps, event_t *self) 453static void perf_event__set_kernel_mmap_len(union perf_event *event,
454 struct map **maps)
477{ 455{
478 maps[MAP__FUNCTION]->start = self->mmap.start; 456 maps[MAP__FUNCTION]->start = event->mmap.start;
479 maps[MAP__FUNCTION]->end = self->mmap.start + self->mmap.len; 457 maps[MAP__FUNCTION]->end = event->mmap.start + event->mmap.len;
480 /* 458 /*
481 * Be a bit paranoid here, some perf.data file came with 459 * Be a bit paranoid here, some perf.data file came with
482 * a zero sized synthesized MMAP event for the kernel. 460 * a zero sized synthesized MMAP event for the kernel.
@@ -485,8 +463,8 @@ static void event_set_kernel_mmap_len(struct map **maps, event_t *self)
485 maps[MAP__FUNCTION]->end = ~0ULL; 463 maps[MAP__FUNCTION]->end = ~0ULL;
486} 464}
487 465
488static int event__process_kernel_mmap(event_t *self, 466static int perf_event__process_kernel_mmap(union perf_event *event,
489 struct perf_session *session) 467 struct perf_session *session)
490{ 468{
491 struct map *map; 469 struct map *map;
492 char kmmap_prefix[PATH_MAX]; 470 char kmmap_prefix[PATH_MAX];
@@ -494,9 +472,9 @@ static int event__process_kernel_mmap(event_t *self,
494 enum dso_kernel_type kernel_type; 472 enum dso_kernel_type kernel_type;
495 bool is_kernel_mmap; 473 bool is_kernel_mmap;
496 474
497 machine = perf_session__findnew_machine(session, self->mmap.pid); 475 machine = perf_session__findnew_machine(session, event->mmap.pid);
498 if (!machine) { 476 if (!machine) {
499 pr_err("Can't find id %d's machine\n", self->mmap.pid); 477 pr_err("Can't find id %d's machine\n", event->mmap.pid);
500 goto out_problem; 478 goto out_problem;
501 } 479 }
502 480
@@ -506,17 +484,17 @@ static int event__process_kernel_mmap(event_t *self,
506 else 484 else
507 kernel_type = DSO_TYPE_GUEST_KERNEL; 485 kernel_type = DSO_TYPE_GUEST_KERNEL;
508 486
509 is_kernel_mmap = memcmp(self->mmap.filename, 487 is_kernel_mmap = memcmp(event->mmap.filename,
510 kmmap_prefix, 488 kmmap_prefix,
511 strlen(kmmap_prefix)) == 0; 489 strlen(kmmap_prefix)) == 0;
512 if (self->mmap.filename[0] == '/' || 490 if (event->mmap.filename[0] == '/' ||
513 (!is_kernel_mmap && self->mmap.filename[0] == '[')) { 491 (!is_kernel_mmap && event->mmap.filename[0] == '[')) {
514 492
515 char short_module_name[1024]; 493 char short_module_name[1024];
516 char *name, *dot; 494 char *name, *dot;
517 495
518 if (self->mmap.filename[0] == '/') { 496 if (event->mmap.filename[0] == '/') {
519 name = strrchr(self->mmap.filename, '/'); 497 name = strrchr(event->mmap.filename, '/');
520 if (name == NULL) 498 if (name == NULL)
521 goto out_problem; 499 goto out_problem;
522 500
@@ -528,10 +506,10 @@ static int event__process_kernel_mmap(event_t *self,
528 "[%.*s]", (int)(dot - name), name); 506 "[%.*s]", (int)(dot - name), name);
529 strxfrchar(short_module_name, '-', '_'); 507 strxfrchar(short_module_name, '-', '_');
530 } else 508 } else
531 strcpy(short_module_name, self->mmap.filename); 509 strcpy(short_module_name, event->mmap.filename);
532 510
533 map = machine__new_module(machine, self->mmap.start, 511 map = machine__new_module(machine, event->mmap.start,
534 self->mmap.filename); 512 event->mmap.filename);
535 if (map == NULL) 513 if (map == NULL)
536 goto out_problem; 514 goto out_problem;
537 515
@@ -541,9 +519,9 @@ static int event__process_kernel_mmap(event_t *self,
541 519
542 map->dso->short_name = name; 520 map->dso->short_name = name;
543 map->dso->sname_alloc = 1; 521 map->dso->sname_alloc = 1;
544 map->end = map->start + self->mmap.len; 522 map->end = map->start + event->mmap.len;
545 } else if (is_kernel_mmap) { 523 } else if (is_kernel_mmap) {
546 const char *symbol_name = (self->mmap.filename + 524 const char *symbol_name = (event->mmap.filename +
547 strlen(kmmap_prefix)); 525 strlen(kmmap_prefix));
548 /* 526 /*
549 * Should be there already, from the build-id table in 527 * Should be there already, from the build-id table in
@@ -558,10 +536,10 @@ static int event__process_kernel_mmap(event_t *self,
558 if (__machine__create_kernel_maps(machine, kernel) < 0) 536 if (__machine__create_kernel_maps(machine, kernel) < 0)
559 goto out_problem; 537 goto out_problem;
560 538
561 event_set_kernel_mmap_len(machine->vmlinux_maps, self); 539 perf_event__set_kernel_mmap_len(event, machine->vmlinux_maps);
562 perf_session__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps, 540 perf_session__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps,
563 symbol_name, 541 symbol_name,
564 self->mmap.pgoff); 542 event->mmap.pgoff);
565 if (machine__is_default_guest(machine)) { 543 if (machine__is_default_guest(machine)) {
566 /* 544 /*
567 * preload dso of guest kernel and modules 545 * preload dso of guest kernel and modules
@@ -575,22 +553,23 @@ out_problem:
575 return -1; 553 return -1;
576} 554}
577 555
578int event__process_mmap(event_t *self, struct sample_data *sample __used, 556int perf_event__process_mmap(union perf_event *event,
579 struct perf_session *session) 557 struct perf_sample *sample __used,
558 struct perf_session *session)
580{ 559{
581 struct machine *machine; 560 struct machine *machine;
582 struct thread *thread; 561 struct thread *thread;
583 struct map *map; 562 struct map *map;
584 u8 cpumode = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; 563 u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
585 int ret = 0; 564 int ret = 0;
586 565
587 dump_printf(" %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %s\n", 566 dump_printf(" %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %s\n",
588 self->mmap.pid, self->mmap.tid, self->mmap.start, 567 event->mmap.pid, event->mmap.tid, event->mmap.start,
589 self->mmap.len, self->mmap.pgoff, self->mmap.filename); 568 event->mmap.len, event->mmap.pgoff, event->mmap.filename);
590 569
591 if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL || 570 if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL ||
592 cpumode == PERF_RECORD_MISC_KERNEL) { 571 cpumode == PERF_RECORD_MISC_KERNEL) {
593 ret = event__process_kernel_mmap(self, session); 572 ret = perf_event__process_kernel_mmap(event, session);
594 if (ret < 0) 573 if (ret < 0)
595 goto out_problem; 574 goto out_problem;
596 return 0; 575 return 0;
@@ -599,12 +578,12 @@ int event__process_mmap(event_t *self, struct sample_data *sample __used,
599 machine = perf_session__find_host_machine(session); 578 machine = perf_session__find_host_machine(session);
600 if (machine == NULL) 579 if (machine == NULL)
601 goto out_problem; 580 goto out_problem;
602 thread = perf_session__findnew(session, self->mmap.pid); 581 thread = perf_session__findnew(session, event->mmap.pid);
603 if (thread == NULL) 582 if (thread == NULL)
604 goto out_problem; 583 goto out_problem;
605 map = map__new(&machine->user_dsos, self->mmap.start, 584 map = map__new(&machine->user_dsos, event->mmap.start,
606 self->mmap.len, self->mmap.pgoff, 585 event->mmap.len, event->mmap.pgoff,
607 self->mmap.pid, self->mmap.filename, 586 event->mmap.pid, event->mmap.filename,
608 MAP__FUNCTION); 587 MAP__FUNCTION);
609 if (map == NULL) 588 if (map == NULL)
610 goto out_problem; 589 goto out_problem;
@@ -617,16 +596,17 @@ out_problem:
617 return 0; 596 return 0;
618} 597}
619 598
620int event__process_task(event_t *self, struct sample_data *sample __used, 599int perf_event__process_task(union perf_event *event,
621 struct perf_session *session) 600 struct perf_sample *sample __used,
601 struct perf_session *session)
622{ 602{
623 struct thread *thread = perf_session__findnew(session, self->fork.tid); 603 struct thread *thread = perf_session__findnew(session, event->fork.tid);
624 struct thread *parent = perf_session__findnew(session, self->fork.ptid); 604 struct thread *parent = perf_session__findnew(session, event->fork.ptid);
625 605
626 dump_printf("(%d:%d):(%d:%d)\n", self->fork.pid, self->fork.tid, 606 dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid,
627 self->fork.ppid, self->fork.ptid); 607 event->fork.ppid, event->fork.ptid);
628 608
629 if (self->header.type == PERF_RECORD_EXIT) { 609 if (event->header.type == PERF_RECORD_EXIT) {
630 perf_session__remove_thread(session, thread); 610 perf_session__remove_thread(session, thread);
631 return 0; 611 return 0;
632 } 612 }
@@ -640,20 +620,22 @@ int event__process_task(event_t *self, struct sample_data *sample __used,
640 return 0; 620 return 0;
641} 621}
642 622
643int event__process(event_t *event, struct sample_data *sample, 623int perf_event__process(union perf_event *event, struct perf_sample *sample,
644 struct perf_session *session) 624 struct perf_session *session)
645{ 625{
646 switch (event->header.type) { 626 switch (event->header.type) {
647 case PERF_RECORD_COMM: 627 case PERF_RECORD_COMM:
648 event__process_comm(event, sample, session); 628 perf_event__process_comm(event, sample, session);
649 break; 629 break;
650 case PERF_RECORD_MMAP: 630 case PERF_RECORD_MMAP:
651 event__process_mmap(event, sample, session); 631 perf_event__process_mmap(event, sample, session);
652 break; 632 break;
653 case PERF_RECORD_FORK: 633 case PERF_RECORD_FORK:
654 case PERF_RECORD_EXIT: 634 case PERF_RECORD_EXIT:
655 event__process_task(event, sample, session); 635 perf_event__process_task(event, sample, session);
656 break; 636 break;
637 case PERF_RECORD_LOST:
638 perf_event__process_lost(event, sample, session);
657 default: 639 default:
658 break; 640 break;
659 } 641 }
@@ -750,24 +732,14 @@ void thread__find_addr_location(struct thread *self,
750 al->sym = NULL; 732 al->sym = NULL;
751} 733}
752 734
753static void dso__calc_col_width(struct dso *self, struct hists *hists) 735int perf_event__preprocess_sample(const union perf_event *event,
754{ 736 struct perf_session *session,
755 if (!symbol_conf.col_width_list_str && !symbol_conf.field_sep && 737 struct addr_location *al,
756 (!symbol_conf.dso_list || 738 struct perf_sample *sample,
757 strlist__has_entry(symbol_conf.dso_list, self->name))) { 739 symbol_filter_t filter)
758 u16 slen = dso__name_len(self);
759 hists__new_col_len(hists, HISTC_DSO, slen);
760 }
761
762 self->slen_calculated = 1;
763}
764
765int event__preprocess_sample(const event_t *self, struct perf_session *session,
766 struct addr_location *al, struct sample_data *data,
767 symbol_filter_t filter)
768{ 740{
769 u8 cpumode = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; 741 u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
770 struct thread *thread = perf_session__findnew(session, self->ip.pid); 742 struct thread *thread = perf_session__findnew(session, event->ip.pid);
771 743
772 if (thread == NULL) 744 if (thread == NULL)
773 return -1; 745 return -1;
@@ -789,12 +761,12 @@ int event__preprocess_sample(const event_t *self, struct perf_session *session,
789 machine__create_kernel_maps(&session->host_machine); 761 machine__create_kernel_maps(&session->host_machine);
790 762
791 thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION, 763 thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION,
792 self->ip.pid, self->ip.ip, al); 764 event->ip.pid, event->ip.ip, al);
793 dump_printf(" ...... dso: %s\n", 765 dump_printf(" ...... dso: %s\n",
794 al->map ? al->map->dso->long_name : 766 al->map ? al->map->dso->long_name :
795 al->level == 'H' ? "[hypervisor]" : "<not found>"); 767 al->level == 'H' ? "[hypervisor]" : "<not found>");
796 al->sym = NULL; 768 al->sym = NULL;
797 al->cpu = data->cpu; 769 al->cpu = sample->cpu;
798 770
799 if (al->map) { 771 if (al->map) {
800 if (symbol_conf.dso_list && 772 if (symbol_conf.dso_list &&
@@ -805,23 +777,8 @@ int event__preprocess_sample(const event_t *self, struct perf_session *session,
805 strlist__has_entry(symbol_conf.dso_list, 777 strlist__has_entry(symbol_conf.dso_list,
806 al->map->dso->long_name))))) 778 al->map->dso->long_name)))))
807 goto out_filtered; 779 goto out_filtered;
808 /*
809 * We have to do this here as we may have a dso with no symbol
810 * hit that has a name longer than the ones with symbols
811 * sampled.
812 */
813 if (!sort_dso.elide && !al->map->dso->slen_calculated)
814 dso__calc_col_width(al->map->dso, &session->hists);
815 780
816 al->sym = map__find_symbol(al->map, al->addr, filter); 781 al->sym = map__find_symbol(al->map, al->addr, filter);
817 } else {
818 const unsigned int unresolved_col_width = BITS_PER_LONG / 4;
819
820 if (hists__col_len(&session->hists, HISTC_DSO) < unresolved_col_width &&
821 !symbol_conf.col_width_list_str && !symbol_conf.field_sep &&
822 !symbol_conf.dso_list)
823 hists__set_col_len(&session->hists, HISTC_DSO,
824 unresolved_col_width);
825 } 782 }
826 783
827 if (symbol_conf.sym_list && al->sym && 784 if (symbol_conf.sym_list && al->sym &&
@@ -834,128 +791,3 @@ out_filtered:
834 al->filtered = true; 791 al->filtered = true;
835 return 0; 792 return 0;
836} 793}
837
838static int event__parse_id_sample(const event_t *event,
839 struct perf_session *session,
840 struct sample_data *sample)
841{
842 const u64 *array;
843 u64 type;
844
845 sample->cpu = sample->pid = sample->tid = -1;
846 sample->stream_id = sample->id = sample->time = -1ULL;
847
848 if (!session->sample_id_all)
849 return 0;
850
851 array = event->sample.array;
852 array += ((event->header.size -
853 sizeof(event->header)) / sizeof(u64)) - 1;
854 type = session->sample_type;
855
856 if (type & PERF_SAMPLE_CPU) {
857 u32 *p = (u32 *)array;
858 sample->cpu = *p;
859 array--;
860 }
861
862 if (type & PERF_SAMPLE_STREAM_ID) {
863 sample->stream_id = *array;
864 array--;
865 }
866
867 if (type & PERF_SAMPLE_ID) {
868 sample->id = *array;
869 array--;
870 }
871
872 if (type & PERF_SAMPLE_TIME) {
873 sample->time = *array;
874 array--;
875 }
876
877 if (type & PERF_SAMPLE_TID) {
878 u32 *p = (u32 *)array;
879 sample->pid = p[0];
880 sample->tid = p[1];
881 }
882
883 return 0;
884}
885
886int event__parse_sample(const event_t *event, struct perf_session *session,
887 struct sample_data *data)
888{
889 const u64 *array;
890 u64 type;
891
892 if (event->header.type != PERF_RECORD_SAMPLE)
893 return event__parse_id_sample(event, session, data);
894
895 array = event->sample.array;
896 type = session->sample_type;
897
898 if (type & PERF_SAMPLE_IP) {
899 data->ip = event->ip.ip;
900 array++;
901 }
902
903 if (type & PERF_SAMPLE_TID) {
904 u32 *p = (u32 *)array;
905 data->pid = p[0];
906 data->tid = p[1];
907 array++;
908 }
909
910 if (type & PERF_SAMPLE_TIME) {
911 data->time = *array;
912 array++;
913 }
914
915 if (type & PERF_SAMPLE_ADDR) {
916 data->addr = *array;
917 array++;
918 }
919
920 data->id = -1ULL;
921 if (type & PERF_SAMPLE_ID) {
922 data->id = *array;
923 array++;
924 }
925
926 if (type & PERF_SAMPLE_STREAM_ID) {
927 data->stream_id = *array;
928 array++;
929 }
930
931 if (type & PERF_SAMPLE_CPU) {
932 u32 *p = (u32 *)array;
933 data->cpu = *p;
934 array++;
935 } else
936 data->cpu = -1;
937
938 if (type & PERF_SAMPLE_PERIOD) {
939 data->period = *array;
940 array++;
941 }
942
943 if (type & PERF_SAMPLE_READ) {
944 pr_debug("PERF_SAMPLE_READ is unsupported for now\n");
945 return -1;
946 }
947
948 if (type & PERF_SAMPLE_CALLCHAIN) {
949 data->callchain = (struct ip_callchain *)array;
950 array += 1 + data->callchain->nr;
951 }
952
953 if (type & PERF_SAMPLE_RAW) {
954 u32 *p = (u32 *)array;
955 data->raw_size = *p;
956 p++;
957 data->raw_data = p;
958 }
959
960 return 0;
961}
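The sample parsers removed above live on elsewhere (note the new perf_event__parse_sample() prototype in event.h below, which takes the sample type explicitly instead of a session). The id-sample path is the subtle one: for non-sample events the id fields are appended at the tail, so the parser starts at the last u64 and walks backwards, testing the PERF_SAMPLE_* bits in reverse of the forward layout. A reduced sketch of that backward walk, assuming the kernel's u32/u64 typedefs, the PERF_SAMPLE_* flags from linux/perf_event.h, and the perf_sample struct from event.h:

    /*
     * Toy trailer reader: for non-sample events the id fields sit at the
     * tail, so parse from the last u64 backwards, in reverse bit order.
     */
    static void parse_id_trailer(const u64 *end, u64 type,
                                 struct perf_sample *sample)
    {
            const u64 *array = end - 1;     /* last u64 of the event */

            if (type & PERF_SAMPLE_CPU) {
                    const u32 *p = (const u32 *)array;
                    sample->cpu = *p;
                    array--;
            }
            if (type & PERF_SAMPLE_STREAM_ID)
                    sample->stream_id = *array--;
            if (type & PERF_SAMPLE_ID)
                    sample->id = *array--;
            if (type & PERF_SAMPLE_TIME)
                    sample->time = *array--;
            if (type & PERF_SAMPLE_TID) {
                    const u32 *p = (const u32 *)array;
                    sample->pid = p[0];
                    sample->tid = p[1];
            }
    }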
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index cc7b52f9b49..9c35170fb37 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -61,7 +61,7 @@ struct sample_event {
61 u64 array[]; 61 u64 array[];
62}; 62};
63 63
64struct sample_data { 64struct perf_sample {
65 u64 ip; 65 u64 ip;
66 u32 pid, tid; 66 u32 pid, tid;
67 u64 time; 67 u64 time;
@@ -117,7 +117,7 @@ struct tracing_data_event {
117 u32 size; 117 u32 size;
118}; 118};
119 119
120typedef union event_union { 120union perf_event {
121 struct perf_event_header header; 121 struct perf_event_header header;
122 struct ip_event ip; 122 struct ip_event ip;
123 struct mmap_event mmap; 123 struct mmap_event mmap;
@@ -130,50 +130,54 @@ typedef union event_union {
130 struct event_type_event event_type; 130 struct event_type_event event_type;
131 struct tracing_data_event tracing_data; 131 struct tracing_data_event tracing_data;
132 struct build_id_event build_id; 132 struct build_id_event build_id;
133} event_t; 133};
134 134
135void event__print_totals(void); 135void perf_event__print_totals(void);
136 136
137struct perf_session; 137struct perf_session;
138struct thread_map; 138struct thread_map;
139 139
140typedef int (*event__handler_synth_t)(event_t *event, 140typedef int (*perf_event__handler_synth_t)(union perf_event *event,
141 struct perf_session *session);
142typedef int (*perf_event__handler_t)(union perf_event *event,
143 struct perf_sample *sample,
141 struct perf_session *session); 144 struct perf_session *session);
142typedef int (*event__handler_t)(event_t *event, struct sample_data *sample, 145
143 struct perf_session *session); 146int perf_event__synthesize_thread_map(struct thread_map *threads,
144 147 perf_event__handler_t process,
145int event__synthesize_thread_map(struct thread_map *threads, 148 struct perf_session *session);
146 event__handler_t process, 149int perf_event__synthesize_threads(perf_event__handler_t process,
147 struct perf_session *session); 150 struct perf_session *session);
148int event__synthesize_threads(event__handler_t process, 151int perf_event__synthesize_kernel_mmap(perf_event__handler_t process,
149 struct perf_session *session); 152 struct perf_session *session,
150int event__synthesize_kernel_mmap(event__handler_t process, 153 struct machine *machine,
151 struct perf_session *session, 154 const char *symbol_name);
152 struct machine *machine, 155
153 const char *symbol_name); 156int perf_event__synthesize_modules(perf_event__handler_t process,
154 157 struct perf_session *session,
155int event__synthesize_modules(event__handler_t process, 158 struct machine *machine);
156 struct perf_session *session, 159
157 struct machine *machine); 160int perf_event__process_comm(union perf_event *event, struct perf_sample *sample,
158 161 struct perf_session *session);
159int event__process_comm(event_t *self, struct sample_data *sample, 162int perf_event__process_lost(union perf_event *event, struct perf_sample *sample,
160 struct perf_session *session); 163 struct perf_session *session);
161int event__process_lost(event_t *self, struct sample_data *sample, 164int perf_event__process_mmap(union perf_event *event, struct perf_sample *sample,
162 struct perf_session *session); 165 struct perf_session *session);
163int event__process_mmap(event_t *self, struct sample_data *sample, 166int perf_event__process_task(union perf_event *event, struct perf_sample *sample,
164 struct perf_session *session); 167 struct perf_session *session);
165int event__process_task(event_t *self, struct sample_data *sample, 168int perf_event__process(union perf_event *event, struct perf_sample *sample,
166 struct perf_session *session); 169 struct perf_session *session);
167int event__process(event_t *event, struct sample_data *sample,
168 struct perf_session *session);
169 170
170struct addr_location; 171struct addr_location;
171int event__preprocess_sample(const event_t *self, struct perf_session *session, 172int perf_event__preprocess_sample(const union perf_event *self,
172 struct addr_location *al, struct sample_data *data, 173 struct perf_session *session,
173 symbol_filter_t filter); 174 struct addr_location *al,
174int event__parse_sample(const event_t *event, struct perf_session *session, 175 struct perf_sample *sample,
175 struct sample_data *sample); 176 symbol_filter_t filter);
177
178const char *perf_event__name(unsigned int id);
176 179
177const char *event__get_event_name(unsigned int id); 180int perf_event__parse_sample(const union perf_event *event, u64 type,
181 bool sample_id_all, struct perf_sample *sample);
178 182
179#endif /* __PERF_RECORD_H */ 183#endif /* __PERF_RECORD_H */
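All the renamed synthesize/process entry points now share the perf_event__handler_t shape above. A minimal callback sketch, only to show the expected signature; the name count_mmaps and its counter are illustrative, not part of this commit:

    static int count_mmaps(union perf_event *event,
                           struct perf_sample *sample __used,
                           struct perf_session *session __used)
    {
            static int nr_mmap;     /* illustrative bookkeeping only */

            if (event->header.type == PERF_RECORD_MMAP)
                    nr_mmap++;

            return 0;
    }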
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
new file mode 100644
index 00000000000..d852cefa20d
--- /dev/null
+++ b/tools/perf/util/evlist.c
@@ -0,0 +1,394 @@
1/*
2 * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
3 *
4 * Parts came from builtin-{top,stat,record}.c, see those files for further
5 * copyright notes.
6 *
7 * Released under the GPL v2. (and only v2, not any later version)
8 */
9#include <poll.h>
10#include "cpumap.h"
11#include "thread_map.h"
12#include "evlist.h"
13#include "evsel.h"
14#include "util.h"
15
16#include <sys/mman.h>
17
18#include <linux/bitops.h>
19#include <linux/hash.h>
20
21#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
22#define SID(e, x, y) xyarray__entry(e->sample_id, x, y)
23
24void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus,
25 struct thread_map *threads)
26{
27 int i;
28
29 for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i)
30 INIT_HLIST_HEAD(&evlist->heads[i]);
31 INIT_LIST_HEAD(&evlist->entries);
32 perf_evlist__set_maps(evlist, cpus, threads);
33}
34
35struct perf_evlist *perf_evlist__new(struct cpu_map *cpus,
36 struct thread_map *threads)
37{
38 struct perf_evlist *evlist = zalloc(sizeof(*evlist));
39
40 if (evlist != NULL)
41 perf_evlist__init(evlist, cpus, threads);
42
43 return evlist;
44}
45
46static void perf_evlist__purge(struct perf_evlist *evlist)
47{
48 struct perf_evsel *pos, *n;
49
50 list_for_each_entry_safe(pos, n, &evlist->entries, node) {
51 list_del_init(&pos->node);
52 perf_evsel__delete(pos);
53 }
54
55 evlist->nr_entries = 0;
56}
57
58void perf_evlist__exit(struct perf_evlist *evlist)
59{
60 free(evlist->mmap);
61 free(evlist->pollfd);
62 evlist->mmap = NULL;
63 evlist->pollfd = NULL;
64}
65
66void perf_evlist__delete(struct perf_evlist *evlist)
67{
68 perf_evlist__purge(evlist);
69 perf_evlist__exit(evlist);
70 free(evlist);
71}
72
73void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry)
74{
75 list_add_tail(&entry->node, &evlist->entries);
76 ++evlist->nr_entries;
77}
78
79int perf_evlist__add_default(struct perf_evlist *evlist)
80{
81 struct perf_event_attr attr = {
82 .type = PERF_TYPE_HARDWARE,
83 .config = PERF_COUNT_HW_CPU_CYCLES,
84 };
85 struct perf_evsel *evsel = perf_evsel__new(&attr, 0);
86
87 if (evsel == NULL)
88 return -ENOMEM;
89
90 perf_evlist__add(evlist, evsel);
91 return 0;
92}
93
94int perf_evlist__alloc_pollfd(struct perf_evlist *evlist)
95{
96 int nfds = evlist->cpus->nr * evlist->threads->nr * evlist->nr_entries;
97 evlist->pollfd = malloc(sizeof(struct pollfd) * nfds);
98 return evlist->pollfd != NULL ? 0 : -ENOMEM;
99}
100
101void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd)
102{
103 fcntl(fd, F_SETFL, O_NONBLOCK);
104 evlist->pollfd[evlist->nr_fds].fd = fd;
105 evlist->pollfd[evlist->nr_fds].events = POLLIN;
106 evlist->nr_fds++;
107}
108
109static void perf_evlist__id_hash(struct perf_evlist *evlist,
110 struct perf_evsel *evsel,
111 int cpu, int thread, u64 id)
112{
113 int hash;
114 struct perf_sample_id *sid = SID(evsel, cpu, thread);
115
116 sid->id = id;
117 sid->evsel = evsel;
118 hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS);
119 hlist_add_head(&sid->node, &evlist->heads[hash]);
120}
121
122void perf_evlist__id_add(struct perf_evlist *evlist, struct perf_evsel *evsel,
123 int cpu, int thread, u64 id)
124{
125 perf_evlist__id_hash(evlist, evsel, cpu, thread, id);
126 evsel->id[evsel->ids++] = id;
127}
128
129static int perf_evlist__id_add_fd(struct perf_evlist *evlist,
130 struct perf_evsel *evsel,
131 int cpu, int thread, int fd)
132{
133 u64 read_data[4] = { 0, };
134 int id_idx = 1; /* The first entry is the counter value */
135
136 if (!(evsel->attr.read_format & PERF_FORMAT_ID) ||
137 read(fd, &read_data, sizeof(read_data)) == -1)
138 return -1;
139
140 if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
141 ++id_idx;
142 if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
143 ++id_idx;
144
145 perf_evlist__id_add(evlist, evsel, cpu, thread, read_data[id_idx]);
146 return 0;
147}
148
149struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id)
150{
151 struct hlist_head *head;
152 struct hlist_node *pos;
153 struct perf_sample_id *sid;
154 int hash;
155
156 if (evlist->nr_entries == 1)
157 return list_entry(evlist->entries.next, struct perf_evsel, node);
158
159 hash = hash_64(id, PERF_EVLIST__HLIST_BITS);
160 head = &evlist->heads[hash];
161
162 hlist_for_each_entry(sid, pos, head, node)
163 if (sid->id == id)
164 return sid->evsel;
165 return NULL;
166}
167
168union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu)
169{
170 /* XXX Move this to perf.c, making it generally available */
171 unsigned int page_size = sysconf(_SC_PAGE_SIZE);
172 struct perf_mmap *md = &evlist->mmap[cpu];
173 unsigned int head = perf_mmap__read_head(md);
174 unsigned int old = md->prev;
175 unsigned char *data = md->base + page_size;
176 union perf_event *event = NULL;
177
178 if (evlist->overwrite) {
179 /*
180 * If we're further behind than half the buffer, there's a chance
181 * the writer will bite our tail and mess up the samples under us.
182 *
183 * If we somehow ended up ahead of the head, we got messed up.
184 *
185 * In either case, truncate and restart at head.
186 */
187 int diff = head - old;
188 if (diff > md->mask / 2 || diff < 0) {
189 fprintf(stderr, "WARNING: failed to keep up with mmap data.\n");
190
191 /*
192 * head points to a known good entry, start there.
193 */
194 old = head;
195 }
196 }
197
198 if (old != head) {
199 size_t size;
200
201 event = (union perf_event *)&data[old & md->mask];
202 size = event->header.size;
203
204 /*
205 * Event straddles the mmap boundary -- header should always
206 * be inside due to u64 alignment of output.
207 */
208 if ((old & md->mask) + size != ((old + size) & md->mask)) {
209 unsigned int offset = old;
210 unsigned int len = min(sizeof(*event), size), cpy;
211 void *dst = &evlist->event_copy;
212
213 do {
214 cpy = min(md->mask + 1 - (offset & md->mask), len);
215 memcpy(dst, &data[offset & md->mask], cpy);
216 offset += cpy;
217 dst += cpy;
218 len -= cpy;
219 } while (len);
220
221 event = &evlist->event_copy;
222 }
223
224 old += size;
225 }
226
227 md->prev = old;
228
229 if (!evlist->overwrite)
230 perf_mmap__write_tail(md, old);
231
232 return event;
233}
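/*
 * [Editor's sketch, not part of the patch] Draining one cpu's ring: each
 * call returns the next event, or NULL once old == head. With
 * overwrite == false the helper also advances the kernel-visible tail,
 * i.e. it does the perf_mmap__write_tail() dance for the caller.
 * handle_event() is a hypothetical callback:
 */
static void drain_cpu(struct perf_evlist *evlist, int cpu)
{
	union perf_event *event;

	while ((event = perf_evlist__read_on_cpu(evlist, cpu)) != NULL) {
		/* `event` points into the ring (or into evlist->event_copy
		 * when it wrapped), so consume it before the next call. */
		handle_event(event);
	}
}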
234
235void perf_evlist__munmap(struct perf_evlist *evlist)
236{
237 int cpu;
238
239 for (cpu = 0; cpu < evlist->cpus->nr; cpu++) {
240 if (evlist->mmap[cpu].base != NULL) {
241 munmap(evlist->mmap[cpu].base, evlist->mmap_len);
242 evlist->mmap[cpu].base = NULL;
243 }
244 }
245}
246
247int perf_evlist__alloc_mmap(struct perf_evlist *evlist)
248{
249 evlist->mmap = zalloc(evlist->cpus->nr * sizeof(struct perf_mmap));
250 return evlist->mmap != NULL ? 0 : -ENOMEM;
251}
252
253static int __perf_evlist__mmap(struct perf_evlist *evlist, int cpu, int prot,
254 int mask, int fd)
255{
256 evlist->mmap[cpu].prev = 0;
257 evlist->mmap[cpu].mask = mask;
258 evlist->mmap[cpu].base = mmap(NULL, evlist->mmap_len, prot,
259 MAP_SHARED, fd, 0);
260 if (evlist->mmap[cpu].base == MAP_FAILED)
261 return -1;
262
263 perf_evlist__add_pollfd(evlist, fd);
264 return 0;
265}
266
267/** perf_evlist__mmap - Create per cpu maps to receive events
268 *
269 * @evlist - list of events
270 * @pages - map length in pages
271 * @overwrite - overwrite older events?
272 *
273 * If overwrite is false the user needs to signal event consumption using:
274 *
275 * struct perf_mmap *m = &evlist->mmap[cpu];
276 * unsigned int head = perf_mmap__read_head(m);
277 *
278 * perf_mmap__write_tail(m, head)
279 *
280 * Using perf_evlist__read_on_cpu does this automatically.
281 */
282int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite)
283{
284 unsigned int page_size = sysconf(_SC_PAGE_SIZE);
285 int mask = pages * page_size - 1, cpu;
286 struct perf_evsel *first_evsel, *evsel;
287 const struct cpu_map *cpus = evlist->cpus;
288 const struct thread_map *threads = evlist->threads;
289 int thread, prot = PROT_READ | (overwrite ? 0 : PROT_WRITE);
290
291 if (evlist->mmap == NULL && perf_evlist__alloc_mmap(evlist) < 0)
292 return -ENOMEM;
293
294 if (evlist->pollfd == NULL && perf_evlist__alloc_pollfd(evlist) < 0)
295 return -ENOMEM;
296
297 evlist->overwrite = overwrite;
298 evlist->mmap_len = (pages + 1) * page_size;
299 first_evsel = list_entry(evlist->entries.next, struct perf_evsel, node);
300
301 list_for_each_entry(evsel, &evlist->entries, node) {
302 if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
303 evsel->sample_id == NULL &&
304 perf_evsel__alloc_id(evsel, cpus->nr, threads->nr) < 0)
305 return -ENOMEM;
306
307 for (cpu = 0; cpu < cpus->nr; cpu++) {
308 for (thread = 0; thread < threads->nr; thread++) {
309 int fd = FD(evsel, cpu, thread);
310
311 if (evsel->idx || thread) {
312 if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT,
313 FD(first_evsel, cpu, 0)) != 0)
314 goto out_unmap;
315 } else if (__perf_evlist__mmap(evlist, cpu, prot, mask, fd) < 0)
316 goto out_unmap;
317
318 if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
319 perf_evlist__id_add_fd(evlist, evsel, cpu, thread, fd) < 0)
320 goto out_unmap;
321 }
322 }
323 }
324
325 return 0;
326
327out_unmap:
328 for (cpu = 0; cpu < cpus->nr; cpu++) {
329 if (evlist->mmap[cpu].base != NULL) {
330 munmap(evlist->mmap[cpu].base, evlist->mmap_len);
331 evlist->mmap[cpu].base = NULL;
332 }
333 }
334 return -1;
335}
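/*
 * [Editor's note, not part of the patch] `pages` must be a power of two for
 * the mask arithmetic above to wrap offsets correctly. Worked example with
 * pages = 128 and 4 KiB pages:
 *
 *	mask     = 128 * 4096 - 1 = 0x7ffff
 *	mmap_len = (128 + 1) * 4096      (128 data pages plus 1 control page
 *	                                  holding the head/tail counters)
 *
 * Also note the fan-in: only the first event on each cpu gets a ring of its
 * own; every other fd is redirected into it via PERF_EVENT_IOC_SET_OUTPUT,
 * so there is one ring per cpu, not one per event.
 */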
336
337int perf_evlist__create_maps(struct perf_evlist *evlist, pid_t target_pid,
338 pid_t target_tid, const char *cpu_list)
339{
340 evlist->threads = thread_map__new(target_pid, target_tid);
341
342 if (evlist->threads == NULL)
343 return -1;
344
345 if (target_tid != -1)
346 evlist->cpus = cpu_map__dummy_new();
347 else
348 evlist->cpus = cpu_map__new(cpu_list);
349
350 if (evlist->cpus == NULL)
351 goto out_delete_threads;
352
353 return 0;
354
355out_delete_threads:
356 thread_map__delete(evlist->threads);
357 return -1;
358}
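/*
 * [Editor's sketch, not part of the patch] The two monitoring modes this
 * helper supports, as a caller would select them:
 *
 *	perf_evlist__create_maps(evlist, -1, 12345, NULL);   existing thread:
 *	                                  dummy one-entry cpu map, tid 12345
 *	perf_evlist__create_maps(evlist, -1, -1, "0-3");     system wide on
 *	                                  cpus 0-3
 *
 * The "NULL cpu_list means all online cpus" reading of cpu_map__new() is
 * its usual convention, assumed here rather than shown in this patch.
 */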
359
360void perf_evlist__delete_maps(struct perf_evlist *evlist)
361{
362 cpu_map__delete(evlist->cpus);
363 thread_map__delete(evlist->threads);
364 evlist->cpus = NULL;
365 evlist->threads = NULL;
366}
367
368int perf_evlist__set_filters(struct perf_evlist *evlist)
369{
370 const struct thread_map *threads = evlist->threads;
371 const struct cpu_map *cpus = evlist->cpus;
372 struct perf_evsel *evsel;
373 char *filter;
374 int thread;
375 int cpu;
376 int err;
377 int fd;
378
379 list_for_each_entry(evsel, &evlist->entries, node) {
380 filter = evsel->filter;
381 if (!filter)
382 continue;
383 for (cpu = 0; cpu < cpus->nr; cpu++) {
384 for (thread = 0; thread < threads->nr; thread++) {
385 fd = FD(evsel, cpu, thread);
386 err = ioctl(fd, PERF_EVENT_IOC_SET_FILTER, filter);
387 if (err)
388 return err;
389 }
390 }
391 }
392
393 return 0;
394}
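/*
 * [Editor's sketch, not part of the patch] evsel->filter holds a tracepoint
 * filter expression (see Documentation/trace/events.txt) that a tool would
 * set before calling perf_evlist__set_filters(), e.g.:
 *
 *	evsel->filter = strdup("common_pid != 0");
 *
 * The loop above then installs it on every per-cpu/per-thread fd with
 * PERF_EVENT_IOC_SET_FILTER.
 */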
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
new file mode 100644
index 00000000000..8b1cb7a4c5f
--- /dev/null
+++ b/tools/perf/util/evlist.h
@@ -0,0 +1,68 @@
1#ifndef __PERF_EVLIST_H
2#define __PERF_EVLIST_H 1
3
4#include <linux/list.h>
5#include "../perf.h"
6#include "event.h"
7
8struct pollfd;
9struct thread_map;
10struct cpu_map;
11
12#define PERF_EVLIST__HLIST_BITS 8
13#define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS)
14
15struct perf_evlist {
16 struct list_head entries;
17 struct hlist_head heads[PERF_EVLIST__HLIST_SIZE];
18 int nr_entries;
19 int nr_fds;
20 int mmap_len;
21 bool overwrite;
22 union perf_event event_copy;
23 struct perf_mmap *mmap;
24 struct pollfd *pollfd;
25 struct thread_map *threads;
26 struct cpu_map *cpus;
27};
28
29struct perf_evsel;
30
31struct perf_evlist *perf_evlist__new(struct cpu_map *cpus,
32 struct thread_map *threads);
33void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus,
34 struct thread_map *threads);
35void perf_evlist__exit(struct perf_evlist *evlist);
36void perf_evlist__delete(struct perf_evlist *evlist);
37
38void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
39int perf_evlist__add_default(struct perf_evlist *evlist);
40
41void perf_evlist__id_add(struct perf_evlist *evlist, struct perf_evsel *evsel,
42 int cpu, int thread, u64 id);
43
44int perf_evlist__alloc_pollfd(struct perf_evlist *evlist);
45void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd);
46
47struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id);
48
49union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu);
50
51int perf_evlist__alloc_mmap(struct perf_evlist *evlist);
52int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite);
53void perf_evlist__munmap(struct perf_evlist *evlist);
54
55static inline void perf_evlist__set_maps(struct perf_evlist *evlist,
56 struct cpu_map *cpus,
57 struct thread_map *threads)
58{
59 evlist->cpus = cpus;
60 evlist->threads = threads;
61}
62
63int perf_evlist__create_maps(struct perf_evlist *evlist, pid_t target_pid,
64 pid_t target_tid, const char *cpu_list);
65void perf_evlist__delete_maps(struct perf_evlist *evlist);
66int perf_evlist__set_filters(struct perf_evlist *evlist);
67
68#endif /* __PERF_EVLIST_H */
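/*
 * [Editor's sketch, not part of the patch] How the pieces of this new API
 * chain together in a minimal record-style tool. Error paths are collapsed
 * and the event processing is elided; everything called here is declared in
 * this header or in evsel.h:
 */
static int record_sketch(void)
{
	struct perf_evlist *evlist = perf_evlist__new(NULL, NULL);
	struct perf_evsel *evsel;
	int cpu;

	if (evlist == NULL || perf_evlist__add_default(evlist) < 0)
		return -1;

	/* system wide: target_tid == -1 makes create_maps parse the cpu list */
	if (perf_evlist__create_maps(evlist, -1, -1, NULL) < 0)
		return -1;

	list_for_each_entry(evsel, &evlist->entries, node)
		if (perf_evsel__open(evsel, evlist->cpus, evlist->threads,
				     false, false) < 0)
			return -1;

	if (perf_evlist__mmap(evlist, 128, false) < 0)	/* 128 data pages */
		return -1;

	for (cpu = 0; cpu < evlist->cpus->nr; cpu++) {
		union perf_event *event;

		while ((event = perf_evlist__read_on_cpu(evlist, cpu)) != NULL)
			; /* ...write the event out... */
	}

	perf_evlist__munmap(evlist);
	perf_evlist__delete_maps(evlist);
	perf_evlist__delete(evlist);
	return 0;
}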
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index d8575d31ee6..662596afd7f 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1,20 +1,34 @@
1/*
2 * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
3 *
4 * Parts came from builtin-{top,stat,record}.c, see those files for further
5 * copyright notes.
6 *
7 * Released under the GPL v2. (and only v2, not any later version)
8 */
9
1#include "evsel.h" 10#include "evsel.h"
2#include "../perf.h" 11#include "evlist.h"
3#include "util.h" 12#include "util.h"
4#include "cpumap.h" 13#include "cpumap.h"
5#include "thread.h" 14#include "thread_map.h"
6 15
7#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) 16#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
8 17
18void perf_evsel__init(struct perf_evsel *evsel,
19 struct perf_event_attr *attr, int idx)
20{
21 evsel->idx = idx;
22 evsel->attr = *attr;
23 INIT_LIST_HEAD(&evsel->node);
24}
25
9struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx) 26struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx)
10{ 27{
11 struct perf_evsel *evsel = zalloc(sizeof(*evsel)); 28 struct perf_evsel *evsel = zalloc(sizeof(*evsel));
12 29
13 if (evsel != NULL) { 30 if (evsel != NULL)
14 evsel->idx = idx; 31 perf_evsel__init(evsel, attr, idx);
15 evsel->attr = *attr;
16 INIT_LIST_HEAD(&evsel->node);
17 }
18 32
19 return evsel; 33 return evsel;
20} 34}
@@ -25,6 +39,22 @@ int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
25 return evsel->fd != NULL ? 0 : -ENOMEM; 39 return evsel->fd != NULL ? 0 : -ENOMEM;
26} 40}
27 41
42int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads)
43{
44 evsel->sample_id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id));
45 if (evsel->sample_id == NULL)
46 return -ENOMEM;
47
48 evsel->id = zalloc(ncpus * nthreads * sizeof(u64));
49 if (evsel->id == NULL) {
50 xyarray__delete(evsel->sample_id);
51 evsel->sample_id = NULL;
52 return -ENOMEM;
53 }
54
55 return 0;
56}
57
28int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus) 58int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus)
29{ 59{
30 evsel->counts = zalloc((sizeof(*evsel->counts) + 60 evsel->counts = zalloc((sizeof(*evsel->counts) +
@@ -38,6 +68,14 @@ void perf_evsel__free_fd(struct perf_evsel *evsel)
38 evsel->fd = NULL; 68 evsel->fd = NULL;
39} 69}
40 70
71void perf_evsel__free_id(struct perf_evsel *evsel)
72{
73 xyarray__delete(evsel->sample_id);
74 evsel->sample_id = NULL;
75 free(evsel->id);
76 evsel->id = NULL;
77}
78
41void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads) 79void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
42{ 80{
43 int cpu, thread; 81 int cpu, thread;
@@ -49,10 +87,19 @@ void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
49 } 87 }
50} 88}
51 89
52void perf_evsel__delete(struct perf_evsel *evsel) 90void perf_evsel__exit(struct perf_evsel *evsel)
53{ 91{
54 assert(list_empty(&evsel->node)); 92 assert(list_empty(&evsel->node));
55 xyarray__delete(evsel->fd); 93 xyarray__delete(evsel->fd);
94 xyarray__delete(evsel->sample_id);
95 free(evsel->id);
96}
97
98void perf_evsel__delete(struct perf_evsel *evsel)
99{
100 perf_evsel__exit(evsel);
101 close_cgroup(evsel->cgrp);
102 free(evsel->name);
56 free(evsel); 103 free(evsel);
57} 104}
58 105
@@ -128,21 +175,51 @@ int __perf_evsel__read(struct perf_evsel *evsel,
128} 175}
129 176
130static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, 177static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
131 struct thread_map *threads) 178 struct thread_map *threads, bool group, bool inherit)
132{ 179{
133 int cpu, thread; 180 int cpu, thread;
181 unsigned long flags = 0;
182 int pid = -1;
134 183
135 if (evsel->fd == NULL && 184 if (evsel->fd == NULL &&
136 perf_evsel__alloc_fd(evsel, cpus->nr, threads->nr) < 0) 185 perf_evsel__alloc_fd(evsel, cpus->nr, threads->nr) < 0)
137 return -1; 186 return -1;
138 187
188 if (evsel->cgrp) {
189 flags = PERF_FLAG_PID_CGROUP;
190 pid = evsel->cgrp->fd;
191 }
192
139 for (cpu = 0; cpu < cpus->nr; cpu++) { 193 for (cpu = 0; cpu < cpus->nr; cpu++) {
194 int group_fd = -1;
195 /*
196 * Don't allow mmap() of inherited per-task counters. This
197 * would create a performance issue due to all children writing
198 * to the same buffer.
199 *
200 * FIXME:
201 * Proper fix is not to pass 'inherit' to perf_evsel__open*,
202 * but a 'flags' parameter, with 'group' folded there as well,
203 * then introduce a PERF_O_{MMAP,GROUP,INHERIT} enum, and if
204 * O_MMAP is set, emit a warning if cpu < 0 and O_INHERIT is
205 * set. Let's go for the minimal fix first, though.
206 */
207 evsel->attr.inherit = (cpus->map[cpu] >= 0) && inherit;
208
140 for (thread = 0; thread < threads->nr; thread++) { 209 for (thread = 0; thread < threads->nr; thread++) {
210
211 if (!evsel->cgrp)
212 pid = threads->map[thread];
213
141 FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr, 214 FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr,
142 threads->map[thread], 215 pid,
143 cpus->map[cpu], -1, 0); 216 cpus->map[cpu],
217 group_fd, flags);
144 if (FD(evsel, cpu, thread) < 0) 218 if (FD(evsel, cpu, thread) < 0)
145 goto out_close; 219 goto out_close;
220
221 if (group && group_fd == -1)
222 group_fd = FD(evsel, cpu, thread);
146 } 223 }
147 } 224 }
148 225
@@ -175,10 +252,9 @@ static struct {
175 .threads = { -1, }, 252 .threads = { -1, },
176}; 253};
177 254
178int perf_evsel__open(struct perf_evsel *evsel, 255int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
179 struct cpu_map *cpus, struct thread_map *threads) 256 struct thread_map *threads, bool group, bool inherit)
180{ 257{
181
182 if (cpus == NULL) { 258 if (cpus == NULL) {
183 /* Work around old compiler warnings about strict aliasing */ 259 /* Work around old compiler warnings about strict aliasing */
184 cpus = &empty_cpu_map.map; 260 cpus = &empty_cpu_map.map;
@@ -187,15 +263,135 @@ int perf_evsel__open(struct perf_evsel *evsel,
187 if (threads == NULL) 263 if (threads == NULL)
188 threads = &empty_thread_map.map; 264 threads = &empty_thread_map.map;
189 265
190 return __perf_evsel__open(evsel, cpus, threads); 266 return __perf_evsel__open(evsel, cpus, threads, group, inherit);
191} 267}
192 268
193int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus) 269int perf_evsel__open_per_cpu(struct perf_evsel *evsel,
270 struct cpu_map *cpus, bool group, bool inherit)
194{ 271{
195 return __perf_evsel__open(evsel, cpus, &empty_thread_map.map); 272 return __perf_evsel__open(evsel, cpus, &empty_thread_map.map, group, inherit);
273}
274
275int perf_evsel__open_per_thread(struct perf_evsel *evsel,
276 struct thread_map *threads, bool group, bool inherit)
277{
278 return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group, inherit);
279}
280
281static int perf_event__parse_id_sample(const union perf_event *event, u64 type,
282 struct perf_sample *sample)
283{
284 const u64 *array = event->sample.array;
285
286 array += ((event->header.size -
287 sizeof(event->header)) / sizeof(u64)) - 1;
288
289 if (type & PERF_SAMPLE_CPU) {
290 u32 *p = (u32 *)array;
291 sample->cpu = *p;
292 array--;
293 }
294
295 if (type & PERF_SAMPLE_STREAM_ID) {
296 sample->stream_id = *array;
297 array--;
298 }
299
300 if (type & PERF_SAMPLE_ID) {
301 sample->id = *array;
302 array--;
303 }
304
305 if (type & PERF_SAMPLE_TIME) {
306 sample->time = *array;
307 array--;
308 }
309
310 if (type & PERF_SAMPLE_TID) {
311 u32 *p = (u32 *)array;
312 sample->pid = p[0];
313 sample->tid = p[1];
314 }
315
316 return 0;
196} 317}
197 318
198int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads) 319int perf_event__parse_sample(const union perf_event *event, u64 type,
320 bool sample_id_all, struct perf_sample *data)
199{ 321{
200 return __perf_evsel__open(evsel, &empty_cpu_map.map, threads); 322 const u64 *array;
323
324 data->cpu = data->pid = data->tid = -1;
325 data->stream_id = data->id = data->time = -1ULL;
326
327 if (event->header.type != PERF_RECORD_SAMPLE) {
328 if (!sample_id_all)
329 return 0;
330 return perf_event__parse_id_sample(event, type, data);
331 }
332
333 array = event->sample.array;
334
335 if (type & PERF_SAMPLE_IP) {
336 data->ip = event->ip.ip;
337 array++;
338 }
339
340 if (type & PERF_SAMPLE_TID) {
341 u32 *p = (u32 *)array;
342 data->pid = p[0];
343 data->tid = p[1];
344 array++;
345 }
346
347 if (type & PERF_SAMPLE_TIME) {
348 data->time = *array;
349 array++;
350 }
351
352 if (type & PERF_SAMPLE_ADDR) {
353 data->addr = *array;
354 array++;
355 }
356
357 data->id = -1ULL;
358 if (type & PERF_SAMPLE_ID) {
359 data->id = *array;
360 array++;
361 }
362
363 if (type & PERF_SAMPLE_STREAM_ID) {
364 data->stream_id = *array;
365 array++;
366 }
367
368 if (type & PERF_SAMPLE_CPU) {
369 u32 *p = (u32 *)array;
370 data->cpu = *p;
371 array++;
372 }
373
374 if (type & PERF_SAMPLE_PERIOD) {
375 data->period = *array;
376 array++;
377 }
378
379 if (type & PERF_SAMPLE_READ) {
380 fprintf(stderr, "PERF_SAMPLE_READ is unsupported for now\n");
381 return -1;
382 }
383
384 if (type & PERF_SAMPLE_CALLCHAIN) {
385 data->callchain = (struct ip_callchain *)array;
386 array += 1 + data->callchain->nr;
387 }
388
389 if (type & PERF_SAMPLE_RAW) {
390 u32 *p = (u32 *)array;
391 data->raw_size = *p;
392 p++;
393 data->raw_data = p;
394 }
395
396 return 0;
201} 397}
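/*
 * [Editor's note, not part of the patch] A worked layout for a common
 * sample_type of IP | TID | TIME | ID | CPU | PERIOD. After the event
 * header, perf_event__parse_sample() walks this u64 array forwards:
 *
 *	array[0]  ip
 *	array[1]  { u32 pid, tid }
 *	array[2]  time
 *	array[3]  id
 *	array[4]  { u32 cpu, pad }
 *	array[5]  period
 *
 * For non-sample records, when sample_id_all is set, the subset of the
 * TID/TIME/ID/STREAM_ID/CPU fields that is enabled is appended at the tail
 * of the record instead, which is why perf_event__parse_id_sample() starts
 * at the last u64 and walks backwards (array--) in the reverse order.
 */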
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index b2d755fe88a..6710ab53834 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -6,6 +6,8 @@
6#include "../../../include/linux/perf_event.h" 6#include "../../../include/linux/perf_event.h"
7#include "types.h" 7#include "types.h"
8#include "xyarray.h" 8#include "xyarray.h"
9#include "cgroup.h"
10#include "hist.h"
9 11
10struct perf_counts_values { 12struct perf_counts_values {
11 union { 13 union {
@@ -24,31 +26,66 @@ struct perf_counts {
24 struct perf_counts_values cpu[]; 26 struct perf_counts_values cpu[];
25}; 27};
26 28
29struct perf_evsel;
30
31/*
32 * Per fd, to map back from PERF_SAMPLE_ID to evsel; only used when there is
33 * more than one entry in the evlist.
34 */
35struct perf_sample_id {
36 struct hlist_node node;
37 u64 id;
38 struct perf_evsel *evsel;
39};
40
41/** struct perf_evsel - event selector
42 *
43 * @name - Can be set to retain the original event name passed by the user,
44 * so that when showing results in tools such as 'perf stat', we
45 * show the name used, not some alias.
46 */
27struct perf_evsel { 47struct perf_evsel {
28 struct list_head node; 48 struct list_head node;
29 struct perf_event_attr attr; 49 struct perf_event_attr attr;
30 char *filter; 50 char *filter;
31 struct xyarray *fd; 51 struct xyarray *fd;
52 struct xyarray *sample_id;
53 u64 *id;
32 struct perf_counts *counts; 54 struct perf_counts *counts;
33 int idx; 55 int idx;
34 void *priv; 56 int ids;
57 struct hists hists;
58 char *name;
59 union {
60 void *priv;
61 off_t id_offset;
62 };
63 struct cgroup_sel *cgrp;
35}; 64};
36 65
37struct cpu_map; 66struct cpu_map;
38struct thread_map; 67struct thread_map;
68struct perf_evlist;
39 69
40struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx); 70struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx);
71void perf_evsel__init(struct perf_evsel *evsel,
72 struct perf_event_attr *attr, int idx);
73void perf_evsel__exit(struct perf_evsel *evsel);
41void perf_evsel__delete(struct perf_evsel *evsel); 74void perf_evsel__delete(struct perf_evsel *evsel);
42 75
43int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads); 76int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
77int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads);
44int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus); 78int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus);
45void perf_evsel__free_fd(struct perf_evsel *evsel); 79void perf_evsel__free_fd(struct perf_evsel *evsel);
80void perf_evsel__free_id(struct perf_evsel *evsel);
46void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads); 81void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
47 82
48int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus); 83int perf_evsel__open_per_cpu(struct perf_evsel *evsel,
49int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads); 84 struct cpu_map *cpus, bool group, bool inherit);
50int perf_evsel__open(struct perf_evsel *evsel, 85int perf_evsel__open_per_thread(struct perf_evsel *evsel,
51 struct cpu_map *cpus, struct thread_map *threads); 86 struct thread_map *threads, bool group, bool inherit);
87int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
88 struct thread_map *threads, bool group, bool inherit);
52 89
53#define perf_evsel__match(evsel, t, c) \ 90#define perf_evsel__match(evsel, t, c) \
54 (evsel->attr.type == PERF_TYPE_##t && \ 91 (evsel->attr.type == PERF_TYPE_##t && \
diff --git a/tools/perf/util/exec_cmd.c b/tools/perf/util/exec_cmd.c
index 67eeff57156..7adf4ad15d8 100644
--- a/tools/perf/util/exec_cmd.c
+++ b/tools/perf/util/exec_cmd.c
@@ -11,31 +11,12 @@ static const char *argv0_path;
11 11
12const char *system_path(const char *path) 12const char *system_path(const char *path)
13{ 13{
14#ifdef RUNTIME_PREFIX
15 static const char *prefix;
16#else
17 static const char *prefix = PREFIX; 14 static const char *prefix = PREFIX;
18#endif
19 struct strbuf d = STRBUF_INIT; 15 struct strbuf d = STRBUF_INIT;
20 16
21 if (is_absolute_path(path)) 17 if (is_absolute_path(path))
22 return path; 18 return path;
23 19
24#ifdef RUNTIME_PREFIX
25 assert(argv0_path);
26 assert(is_absolute_path(argv0_path));
27
28 if (!prefix &&
29 !(prefix = strip_path_suffix(argv0_path, PERF_EXEC_PATH)) &&
30 !(prefix = strip_path_suffix(argv0_path, BINDIR)) &&
31 !(prefix = strip_path_suffix(argv0_path, "perf"))) {
32 prefix = PREFIX;
33 fprintf(stderr, "RUNTIME_PREFIX requested, "
34 "but prefix computation failed. "
35 "Using static fallback '%s'.\n", prefix);
36 }
37#endif
38
39 strbuf_addf(&d, "%s/%s", prefix, path); 20 strbuf_addf(&d, "%s/%s", prefix, path);
40 path = strbuf_detach(&d, NULL); 21 path = strbuf_detach(&d, NULL);
41 return path; 22 return path;
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 0866bcdb5e8..e5230c0ef95 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -8,6 +8,8 @@
8#include <linux/list.h> 8#include <linux/list.h>
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10 10
11#include "evlist.h"
12#include "evsel.h"
11#include "util.h" 13#include "util.h"
12#include "header.h" 14#include "header.h"
13#include "../perf.h" 15#include "../perf.h"
@@ -18,89 +20,6 @@
18 20
19static bool no_buildid_cache = false; 21static bool no_buildid_cache = false;
20 22
21/*
22 * Create new perf.data header attribute:
23 */
24struct perf_header_attr *perf_header_attr__new(struct perf_event_attr *attr)
25{
26 struct perf_header_attr *self = malloc(sizeof(*self));
27
28 if (self != NULL) {
29 self->attr = *attr;
30 self->ids = 0;
31 self->size = 1;
32 self->id = malloc(sizeof(u64));
33 if (self->id == NULL) {
34 free(self);
35 self = NULL;
36 }
37 }
38
39 return self;
40}
41
42void perf_header_attr__delete(struct perf_header_attr *self)
43{
44 free(self->id);
45 free(self);
46}
47
48int perf_header_attr__add_id(struct perf_header_attr *self, u64 id)
49{
50 int pos = self->ids;
51
52 self->ids++;
53 if (self->ids > self->size) {
54 int nsize = self->size * 2;
55 u64 *nid = realloc(self->id, nsize * sizeof(u64));
56
57 if (nid == NULL)
58 return -1;
59
60 self->size = nsize;
61 self->id = nid;
62 }
63 self->id[pos] = id;
64 return 0;
65}
66
67int perf_header__init(struct perf_header *self)
68{
69 self->size = 1;
70 self->attr = malloc(sizeof(void *));
71 return self->attr == NULL ? -ENOMEM : 0;
72}
73
74void perf_header__exit(struct perf_header *self)
75{
76 int i;
77 for (i = 0; i < self->attrs; ++i)
78 perf_header_attr__delete(self->attr[i]);
79 free(self->attr);
80}
81
82int perf_header__add_attr(struct perf_header *self,
83 struct perf_header_attr *attr)
84{
85 if (self->frozen)
86 return -1;
87
88 if (self->attrs == self->size) {
89 int nsize = self->size * 2;
90 struct perf_header_attr **nattr;
91
92 nattr = realloc(self->attr, nsize * sizeof(void *));
93 if (nattr == NULL)
94 return -1;
95
96 self->size = nsize;
97 self->attr = nattr;
98 }
99
100 self->attr[self->attrs++] = attr;
101 return 0;
102}
103
104static int event_count; 23static int event_count;
105static struct perf_trace_event_type *events; 24static struct perf_trace_event_type *events;
106 25
@@ -147,19 +66,19 @@ struct perf_file_attr {
147 struct perf_file_section ids; 66 struct perf_file_section ids;
148}; 67};
149 68
150void perf_header__set_feat(struct perf_header *self, int feat) 69void perf_header__set_feat(struct perf_header *header, int feat)
151{ 70{
152 set_bit(feat, self->adds_features); 71 set_bit(feat, header->adds_features);
153} 72}
154 73
155void perf_header__clear_feat(struct perf_header *self, int feat) 74void perf_header__clear_feat(struct perf_header *header, int feat)
156{ 75{
157 clear_bit(feat, self->adds_features); 76 clear_bit(feat, header->adds_features);
158} 77}
159 78
160bool perf_header__has_feat(const struct perf_header *self, int feat) 79bool perf_header__has_feat(const struct perf_header *header, int feat)
161{ 80{
162 return test_bit(feat, self->adds_features); 81 return test_bit(feat, header->adds_features);
163} 82}
164 83
165static int do_write(int fd, const void *buf, size_t size) 84static int do_write(int fd, const void *buf, size_t size)
@@ -228,22 +147,22 @@ static int __dsos__write_buildid_table(struct list_head *head, pid_t pid,
228 return 0; 147 return 0;
229} 148}
230 149
231static int machine__write_buildid_table(struct machine *self, int fd) 150static int machine__write_buildid_table(struct machine *machine, int fd)
232{ 151{
233 int err; 152 int err;
234 u16 kmisc = PERF_RECORD_MISC_KERNEL, 153 u16 kmisc = PERF_RECORD_MISC_KERNEL,
235 umisc = PERF_RECORD_MISC_USER; 154 umisc = PERF_RECORD_MISC_USER;
236 155
237 if (!machine__is_host(self)) { 156 if (!machine__is_host(machine)) {
238 kmisc = PERF_RECORD_MISC_GUEST_KERNEL; 157 kmisc = PERF_RECORD_MISC_GUEST_KERNEL;
239 umisc = PERF_RECORD_MISC_GUEST_USER; 158 umisc = PERF_RECORD_MISC_GUEST_USER;
240 } 159 }
241 160
242 err = __dsos__write_buildid_table(&self->kernel_dsos, self->pid, 161 err = __dsos__write_buildid_table(&machine->kernel_dsos, machine->pid,
243 kmisc, fd); 162 kmisc, fd);
244 if (err == 0) 163 if (err == 0)
245 err = __dsos__write_buildid_table(&self->user_dsos, 164 err = __dsos__write_buildid_table(&machine->user_dsos,
246 self->pid, umisc, fd); 165 machine->pid, umisc, fd);
247 return err; 166 return err;
248} 167}
249 168
@@ -366,12 +285,12 @@ out_free:
366 return err; 285 return err;
367} 286}
368 287
369static int dso__cache_build_id(struct dso *self, const char *debugdir) 288static int dso__cache_build_id(struct dso *dso, const char *debugdir)
370{ 289{
371 bool is_kallsyms = self->kernel && self->long_name[0] != '/'; 290 bool is_kallsyms = dso->kernel && dso->long_name[0] != '/';
372 291
373 return build_id_cache__add_b(self->build_id, sizeof(self->build_id), 292 return build_id_cache__add_b(dso->build_id, sizeof(dso->build_id),
374 self->long_name, debugdir, is_kallsyms); 293 dso->long_name, debugdir, is_kallsyms);
375} 294}
376 295
377static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir) 296static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir)
@@ -386,14 +305,14 @@ static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir)
386 return err; 305 return err;
387} 306}
388 307
389static int machine__cache_build_ids(struct machine *self, const char *debugdir) 308static int machine__cache_build_ids(struct machine *machine, const char *debugdir)
390{ 309{
391 int ret = __dsos__cache_build_ids(&self->kernel_dsos, debugdir); 310 int ret = __dsos__cache_build_ids(&machine->kernel_dsos, debugdir);
392 ret |= __dsos__cache_build_ids(&self->user_dsos, debugdir); 311 ret |= __dsos__cache_build_ids(&machine->user_dsos, debugdir);
393 return ret; 312 return ret;
394} 313}
395 314
396static int perf_session__cache_build_ids(struct perf_session *self) 315static int perf_session__cache_build_ids(struct perf_session *session)
397{ 316{
398 struct rb_node *nd; 317 struct rb_node *nd;
399 int ret; 318 int ret;
@@ -404,28 +323,28 @@ static int perf_session__cache_build_ids(struct perf_session *self)
404 if (mkdir(debugdir, 0755) != 0 && errno != EEXIST) 323 if (mkdir(debugdir, 0755) != 0 && errno != EEXIST)
405 return -1; 324 return -1;
406 325
407 ret = machine__cache_build_ids(&self->host_machine, debugdir); 326 ret = machine__cache_build_ids(&session->host_machine, debugdir);
408 327
409 for (nd = rb_first(&self->machines); nd; nd = rb_next(nd)) { 328 for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
410 struct machine *pos = rb_entry(nd, struct machine, rb_node); 329 struct machine *pos = rb_entry(nd, struct machine, rb_node);
411 ret |= machine__cache_build_ids(pos, debugdir); 330 ret |= machine__cache_build_ids(pos, debugdir);
412 } 331 }
413 return ret ? -1 : 0; 332 return ret ? -1 : 0;
414} 333}
415 334
416static bool machine__read_build_ids(struct machine *self, bool with_hits) 335static bool machine__read_build_ids(struct machine *machine, bool with_hits)
417{ 336{
418 bool ret = __dsos__read_build_ids(&self->kernel_dsos, with_hits); 337 bool ret = __dsos__read_build_ids(&machine->kernel_dsos, with_hits);
419 ret |= __dsos__read_build_ids(&self->user_dsos, with_hits); 338 ret |= __dsos__read_build_ids(&machine->user_dsos, with_hits);
420 return ret; 339 return ret;
421} 340}
422 341
423static bool perf_session__read_build_ids(struct perf_session *self, bool with_hits) 342static bool perf_session__read_build_ids(struct perf_session *session, bool with_hits)
424{ 343{
425 struct rb_node *nd; 344 struct rb_node *nd;
426 bool ret = machine__read_build_ids(&self->host_machine, with_hits); 345 bool ret = machine__read_build_ids(&session->host_machine, with_hits);
427 346
428 for (nd = rb_first(&self->machines); nd; nd = rb_next(nd)) { 347 for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
429 struct machine *pos = rb_entry(nd, struct machine, rb_node); 348 struct machine *pos = rb_entry(nd, struct machine, rb_node);
430 ret |= machine__read_build_ids(pos, with_hits); 349 ret |= machine__read_build_ids(pos, with_hits);
431 } 350 }
@@ -433,7 +352,8 @@ static bool perf_session__read_build_ids(struct perf_session *self, bool with_hi
433 return ret; 352 return ret;
434} 353}
435 354
436static int perf_header__adds_write(struct perf_header *self, int fd) 355static int perf_header__adds_write(struct perf_header *header,
356 struct perf_evlist *evlist, int fd)
437{ 357{
438 int nr_sections; 358 int nr_sections;
439 struct perf_session *session; 359 struct perf_session *session;
@@ -442,13 +362,13 @@ static int perf_header__adds_write(struct perf_header *self, int fd)
442 u64 sec_start; 362 u64 sec_start;
443 int idx = 0, err; 363 int idx = 0, err;
444 364
445 session = container_of(self, struct perf_session, header); 365 session = container_of(header, struct perf_session, header);
446 366
447 if (perf_header__has_feat(self, HEADER_BUILD_ID) && 367 if (perf_header__has_feat(header, HEADER_BUILD_ID) &&
448 !perf_session__read_build_ids(session, true)) 368 !perf_session__read_build_ids(session, true))
449 perf_header__clear_feat(self, HEADER_BUILD_ID); 369 perf_header__clear_feat(header, HEADER_BUILD_ID);
450 370
451 nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS); 371 nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS);
452 if (!nr_sections) 372 if (!nr_sections)
453 return 0; 373 return 0;
454 374
@@ -458,28 +378,28 @@ static int perf_header__adds_write(struct perf_header *self, int fd)
458 378
459 sec_size = sizeof(*feat_sec) * nr_sections; 379 sec_size = sizeof(*feat_sec) * nr_sections;
460 380
461 sec_start = self->data_offset + self->data_size; 381 sec_start = header->data_offset + header->data_size;
462 lseek(fd, sec_start + sec_size, SEEK_SET); 382 lseek(fd, sec_start + sec_size, SEEK_SET);
463 383
464 if (perf_header__has_feat(self, HEADER_TRACE_INFO)) { 384 if (perf_header__has_feat(header, HEADER_TRACE_INFO)) {
465 struct perf_file_section *trace_sec; 385 struct perf_file_section *trace_sec;
466 386
467 trace_sec = &feat_sec[idx++]; 387 trace_sec = &feat_sec[idx++];
468 388
469 /* Write trace info */ 389 /* Write trace info */
470 trace_sec->offset = lseek(fd, 0, SEEK_CUR); 390 trace_sec->offset = lseek(fd, 0, SEEK_CUR);
471 read_tracing_data(fd, &evsel_list); 391 read_tracing_data(fd, &evlist->entries);
472 trace_sec->size = lseek(fd, 0, SEEK_CUR) - trace_sec->offset; 392 trace_sec->size = lseek(fd, 0, SEEK_CUR) - trace_sec->offset;
473 } 393 }
474 394
475 if (perf_header__has_feat(self, HEADER_BUILD_ID)) { 395 if (perf_header__has_feat(header, HEADER_BUILD_ID)) {
476 struct perf_file_section *buildid_sec; 396 struct perf_file_section *buildid_sec;
477 397
478 buildid_sec = &feat_sec[idx++]; 398 buildid_sec = &feat_sec[idx++];
479 399
480 /* Write build-ids */ 400 /* Write build-ids */
481 buildid_sec->offset = lseek(fd, 0, SEEK_CUR); 401 buildid_sec->offset = lseek(fd, 0, SEEK_CUR);
482 err = dsos__write_buildid_table(self, fd); 402 err = dsos__write_buildid_table(header, fd);
483 if (err < 0) { 403 if (err < 0) {
484 pr_debug("failed to write buildid table\n"); 404 pr_debug("failed to write buildid table\n");
485 goto out_free; 405 goto out_free;
@@ -518,32 +438,41 @@ int perf_header__write_pipe(int fd)
518 return 0; 438 return 0;
519} 439}
520 440
521int perf_header__write(struct perf_header *self, int fd, bool at_exit) 441int perf_session__write_header(struct perf_session *session,
442 struct perf_evlist *evlist,
443 int fd, bool at_exit)
522{ 444{
523 struct perf_file_header f_header; 445 struct perf_file_header f_header;
524 struct perf_file_attr f_attr; 446 struct perf_file_attr f_attr;
525 struct perf_header_attr *attr; 447 struct perf_header *header = &session->header;
526 int i, err; 448 struct perf_evsel *attr, *pair = NULL;
449 int err;
527 450
528 lseek(fd, sizeof(f_header), SEEK_SET); 451 lseek(fd, sizeof(f_header), SEEK_SET);
529 452
530 for (i = 0; i < self->attrs; i++) { 453 if (session->evlist != evlist)
531 attr = self->attr[i]; 454 pair = list_entry(session->evlist->entries.next, struct perf_evsel, node);
532 455
456 list_for_each_entry(attr, &evlist->entries, node) {
533 attr->id_offset = lseek(fd, 0, SEEK_CUR); 457 attr->id_offset = lseek(fd, 0, SEEK_CUR);
534 err = do_write(fd, attr->id, attr->ids * sizeof(u64)); 458 err = do_write(fd, attr->id, attr->ids * sizeof(u64));
535 if (err < 0) { 459 if (err < 0) {
460out_err_write:
536 pr_debug("failed to write perf header\n"); 461 pr_debug("failed to write perf header\n");
537 return err; 462 return err;
538 } 463 }
464 if (session->evlist != evlist) {
465 err = do_write(fd, pair->id, pair->ids * sizeof(u64));
466 if (err < 0)
467 goto out_err_write;
468 attr->ids += pair->ids;
469 pair = list_entry(pair->node.next, struct perf_evsel, node);
470 }
539 } 471 }
540 472
473 header->attr_offset = lseek(fd, 0, SEEK_CUR);
541 474
542 self->attr_offset = lseek(fd, 0, SEEK_CUR); 475 list_for_each_entry(attr, &evlist->entries, node) {
543
544 for (i = 0; i < self->attrs; i++) {
545 attr = self->attr[i];
546
547 f_attr = (struct perf_file_attr){ 476 f_attr = (struct perf_file_attr){
548 .attr = attr->attr, 477 .attr = attr->attr,
549 .ids = { 478 .ids = {
@@ -558,20 +487,20 @@ int perf_header__write(struct perf_header *self, int fd, bool at_exit)
558 } 487 }
559 } 488 }
560 489
561 self->event_offset = lseek(fd, 0, SEEK_CUR); 490 header->event_offset = lseek(fd, 0, SEEK_CUR);
562 self->event_size = event_count * sizeof(struct perf_trace_event_type); 491 header->event_size = event_count * sizeof(struct perf_trace_event_type);
563 if (events) { 492 if (events) {
564 err = do_write(fd, events, self->event_size); 493 err = do_write(fd, events, header->event_size);
565 if (err < 0) { 494 if (err < 0) {
566 pr_debug("failed to write perf header events\n"); 495 pr_debug("failed to write perf header events\n");
567 return err; 496 return err;
568 } 497 }
569 } 498 }
570 499
571 self->data_offset = lseek(fd, 0, SEEK_CUR); 500 header->data_offset = lseek(fd, 0, SEEK_CUR);
572 501
573 if (at_exit) { 502 if (at_exit) {
574 err = perf_header__adds_write(self, fd); 503 err = perf_header__adds_write(header, evlist, fd);
575 if (err < 0) 504 if (err < 0)
576 return err; 505 return err;
577 } 506 }
@@ -581,20 +510,20 @@ int perf_header__write(struct perf_header *self, int fd, bool at_exit)
581 .size = sizeof(f_header), 510 .size = sizeof(f_header),
582 .attr_size = sizeof(f_attr), 511 .attr_size = sizeof(f_attr),
583 .attrs = { 512 .attrs = {
584 .offset = self->attr_offset, 513 .offset = header->attr_offset,
585 .size = self->attrs * sizeof(f_attr), 514 .size = evlist->nr_entries * sizeof(f_attr),
586 }, 515 },
587 .data = { 516 .data = {
588 .offset = self->data_offset, 517 .offset = header->data_offset,
589 .size = self->data_size, 518 .size = header->data_size,
590 }, 519 },
591 .event_types = { 520 .event_types = {
592 .offset = self->event_offset, 521 .offset = header->event_offset,
593 .size = self->event_size, 522 .size = header->event_size,
594 }, 523 },
595 }; 524 };
596 525
597 memcpy(&f_header.adds_features, &self->adds_features, sizeof(self->adds_features)); 526 memcpy(&f_header.adds_features, &header->adds_features, sizeof(header->adds_features));
598 527
599 lseek(fd, 0, SEEK_SET); 528 lseek(fd, 0, SEEK_SET);
600 err = do_write(fd, &f_header, sizeof(f_header)); 529 err = do_write(fd, &f_header, sizeof(f_header));
@@ -602,26 +531,26 @@ int perf_header__write(struct perf_header *self, int fd, bool at_exit)
602 pr_debug("failed to write perf header\n"); 531 pr_debug("failed to write perf header\n");
603 return err; 532 return err;
604 } 533 }
605 lseek(fd, self->data_offset + self->data_size, SEEK_SET); 534 lseek(fd, header->data_offset + header->data_size, SEEK_SET);
606 535
607 self->frozen = 1; 536 header->frozen = 1;
608 return 0; 537 return 0;
609} 538}
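/*
 * [Editor's note, not part of the patch] The on-disk perf.data layout that
 * perf_session__write_header() produces, in write order:
 *
 *	per-event id arrays        <- each attr->id_offset recorded here
 *	attr table (f_attr[])      <- f_header.attrs.{offset,size}
 *	event type table           <- f_header.event_types
 *	sample data                <- f_header.data (size filled in elsewhere)
 *	feature sections           <- perf_header__adds_write(), at_exit only
 *	file header (f_header)     <- rewritten last, at offset 0
 */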
610 539
611static int perf_header__getbuffer64(struct perf_header *self, 540static int perf_header__getbuffer64(struct perf_header *header,
612 int fd, void *buf, size_t size) 541 int fd, void *buf, size_t size)
613{ 542{
614 if (readn(fd, buf, size) <= 0) 543 if (readn(fd, buf, size) <= 0)
615 return -1; 544 return -1;
616 545
617 if (self->needs_swap) 546 if (header->needs_swap)
618 mem_bswap_64(buf, size); 547 mem_bswap_64(buf, size);
619 548
620 return 0; 549 return 0;
621} 550}
622 551
623int perf_header__process_sections(struct perf_header *self, int fd, 552int perf_header__process_sections(struct perf_header *header, int fd,
624 int (*process)(struct perf_file_section *self, 553 int (*process)(struct perf_file_section *section,
625 struct perf_header *ph, 554 struct perf_header *ph,
626 int feat, int fd)) 555 int feat, int fd))
627{ 556{
@@ -631,7 +560,7 @@ int perf_header__process_sections(struct perf_header *self, int fd,
631 int idx = 0; 560 int idx = 0;
632 int err = -1, feat = 1; 561 int err = -1, feat = 1;
633 562
634 nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS); 563 nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS);
635 if (!nr_sections) 564 if (!nr_sections)
636 return 0; 565 return 0;
637 566
@@ -641,17 +570,17 @@ int perf_header__process_sections(struct perf_header *self, int fd,
641 570
642 sec_size = sizeof(*feat_sec) * nr_sections; 571 sec_size = sizeof(*feat_sec) * nr_sections;
643 572
644 lseek(fd, self->data_offset + self->data_size, SEEK_SET); 573 lseek(fd, header->data_offset + header->data_size, SEEK_SET);
645 574
646 if (perf_header__getbuffer64(self, fd, feat_sec, sec_size)) 575 if (perf_header__getbuffer64(header, fd, feat_sec, sec_size))
647 goto out_free; 576 goto out_free;
648 577
649 err = 0; 578 err = 0;
650 while (idx < nr_sections && feat < HEADER_LAST_FEATURE) { 579 while (idx < nr_sections && feat < HEADER_LAST_FEATURE) {
651 if (perf_header__has_feat(self, feat)) { 580 if (perf_header__has_feat(header, feat)) {
652 struct perf_file_section *sec = &feat_sec[idx++]; 581 struct perf_file_section *sec = &feat_sec[idx++];
653 582
654 err = process(sec, self, feat, fd); 583 err = process(sec, header, feat, fd);
655 if (err < 0) 584 if (err < 0)
656 break; 585 break;
657 } 586 }
@@ -662,35 +591,35 @@ out_free:
662 return err; 591 return err;
663} 592}
664 593
665int perf_file_header__read(struct perf_file_header *self, 594int perf_file_header__read(struct perf_file_header *header,
666 struct perf_header *ph, int fd) 595 struct perf_header *ph, int fd)
667{ 596{
668 lseek(fd, 0, SEEK_SET); 597 lseek(fd, 0, SEEK_SET);
669 598
670 if (readn(fd, self, sizeof(*self)) <= 0 || 599 if (readn(fd, header, sizeof(*header)) <= 0 ||
671 memcmp(&self->magic, __perf_magic, sizeof(self->magic))) 600 memcmp(&header->magic, __perf_magic, sizeof(header->magic)))
672 return -1; 601 return -1;
673 602
674 if (self->attr_size != sizeof(struct perf_file_attr)) { 603 if (header->attr_size != sizeof(struct perf_file_attr)) {
675 u64 attr_size = bswap_64(self->attr_size); 604 u64 attr_size = bswap_64(header->attr_size);
676 605
677 if (attr_size != sizeof(struct perf_file_attr)) 606 if (attr_size != sizeof(struct perf_file_attr))
678 return -1; 607 return -1;
679 608
680 mem_bswap_64(self, offsetof(struct perf_file_header, 609 mem_bswap_64(header, offsetof(struct perf_file_header,
681 adds_features)); 610 adds_features));
682 ph->needs_swap = true; 611 ph->needs_swap = true;
683 } 612 }
684 613
685 if (self->size != sizeof(*self)) { 614 if (header->size != sizeof(*header)) {
686 /* Support the previous format */ 615 /* Support the previous format */
687 if (self->size == offsetof(typeof(*self), adds_features)) 616 if (header->size == offsetof(typeof(*header), adds_features))
688 bitmap_zero(self->adds_features, HEADER_FEAT_BITS); 617 bitmap_zero(header->adds_features, HEADER_FEAT_BITS);
689 else 618 else
690 return -1; 619 return -1;
691 } 620 }
692 621
693 memcpy(&ph->adds_features, &self->adds_features, 622 memcpy(&ph->adds_features, &header->adds_features,
694 sizeof(ph->adds_features)); 623 sizeof(ph->adds_features));
695 /* 624 /*
696 * FIXME: hack that assumes that if we need swap the perf.data file 625 * FIXME: hack that assumes that if we need swap the perf.data file
@@ -704,10 +633,10 @@ int perf_file_header__read(struct perf_file_header *self,
704 perf_header__set_feat(ph, HEADER_BUILD_ID); 633 perf_header__set_feat(ph, HEADER_BUILD_ID);
705 } 634 }
706 635
707 ph->event_offset = self->event_types.offset; 636 ph->event_offset = header->event_types.offset;
708 ph->event_size = self->event_types.size; 637 ph->event_size = header->event_types.size;
709 ph->data_offset = self->data.offset; 638 ph->data_offset = header->data.offset;
710 ph->data_size = self->data.size; 639 ph->data_size = header->data.size;
711 return 0; 640 return 0;
712} 641}
713 642
@@ -766,11 +695,10 @@ out:
766 return err; 695 return err;
767} 696}
768 697
769static int perf_header__read_build_ids(struct perf_header *self, 698static int perf_header__read_build_ids(struct perf_header *header,
770 int input, u64 offset, u64 size) 699 int input, u64 offset, u64 size)
771{ 700{
772 struct perf_session *session = container_of(self, 701 struct perf_session *session = container_of(header, struct perf_session, header);
773 struct perf_session, header);
774 struct build_id_event bev; 702 struct build_id_event bev;
775 char filename[PATH_MAX]; 703 char filename[PATH_MAX];
776 u64 limit = offset + size; 704 u64 limit = offset + size;
@@ -782,7 +710,7 @@ static int perf_header__read_build_ids(struct perf_header *self,
782 if (read(input, &bev, sizeof(bev)) != sizeof(bev)) 710 if (read(input, &bev, sizeof(bev)) != sizeof(bev))
783 goto out; 711 goto out;
784 712
785 if (self->needs_swap) 713 if (header->needs_swap)
786 perf_event_header__bswap(&bev.header); 714 perf_event_header__bswap(&bev.header);
787 715
788 len = bev.header.size - sizeof(bev); 716 len = bev.header.size - sizeof(bev);
@@ -798,13 +726,13 @@ out:
798 return err; 726 return err;
799} 727}
800 728
801static int perf_file_section__process(struct perf_file_section *self, 729static int perf_file_section__process(struct perf_file_section *section,
802 struct perf_header *ph, 730 struct perf_header *ph,
803 int feat, int fd) 731 int feat, int fd)
804{ 732{
805 if (lseek(fd, self->offset, SEEK_SET) == (off_t)-1) { 733 if (lseek(fd, section->offset, SEEK_SET) == (off_t)-1) {
806 pr_debug("Failed to lseek to %" PRIu64 " offset for feature " 734 pr_debug("Failed to lseek to %" PRIu64 " offset for feature "
807 "%d, continuing...\n", self->offset, feat); 735 "%d, continuing...\n", section->offset, feat);
808 return 0; 736 return 0;
809 } 737 }
810 738
@@ -814,7 +742,7 @@ static int perf_file_section__process(struct perf_file_section *self,
814 break; 742 break;
815 743
816 case HEADER_BUILD_ID: 744 case HEADER_BUILD_ID:
817 if (perf_header__read_build_ids(ph, fd, self->offset, self->size)) 745 if (perf_header__read_build_ids(ph, fd, section->offset, section->size))
818 pr_debug("Failed to read buildids, continuing...\n"); 746 pr_debug("Failed to read buildids, continuing...\n");
819 break; 747 break;
820 default: 748 default:
@@ -824,21 +752,21 @@ static int perf_file_section__process(struct perf_file_section *self,
824 return 0; 752 return 0;
825} 753}
826 754
827static int perf_file_header__read_pipe(struct perf_pipe_file_header *self, 755static int perf_file_header__read_pipe(struct perf_pipe_file_header *header,
828 struct perf_header *ph, int fd, 756 struct perf_header *ph, int fd,
829 bool repipe) 757 bool repipe)
830{ 758{
831 if (readn(fd, self, sizeof(*self)) <= 0 || 759 if (readn(fd, header, sizeof(*header)) <= 0 ||
832 memcmp(&self->magic, __perf_magic, sizeof(self->magic))) 760 memcmp(&header->magic, __perf_magic, sizeof(header->magic)))
833 return -1; 761 return -1;
834 762
835 if (repipe && do_write(STDOUT_FILENO, self, sizeof(*self)) < 0) 763 if (repipe && do_write(STDOUT_FILENO, header, sizeof(*header)) < 0)
836 return -1; 764 return -1;
837 765
838 if (self->size != sizeof(*self)) { 766 if (header->size != sizeof(*header)) {
839 u64 size = bswap_64(self->size); 767 u64 size = bswap_64(header->size);
840 768
841 if (size != sizeof(*self)) 769 if (size != sizeof(*header))
842 return -1; 770 return -1;
843 771
844 ph->needs_swap = true; 772 ph->needs_swap = true;
@@ -849,10 +777,10 @@ static int perf_file_header__read_pipe(struct perf_pipe_file_header *self,
849 777
850static int perf_header__read_pipe(struct perf_session *session, int fd) 778static int perf_header__read_pipe(struct perf_session *session, int fd)
851{ 779{
852 struct perf_header *self = &session->header; 780 struct perf_header *header = &session->header;
853 struct perf_pipe_file_header f_header; 781 struct perf_pipe_file_header f_header;
854 782
855 if (perf_file_header__read_pipe(&f_header, self, fd, 783 if (perf_file_header__read_pipe(&f_header, header, fd,
856 session->repipe) < 0) { 784 session->repipe) < 0) {
857 pr_debug("incompatible file format\n"); 785 pr_debug("incompatible file format\n");
858 return -EINVAL; 786 return -EINVAL;
@@ -863,18 +791,22 @@ static int perf_header__read_pipe(struct perf_session *session, int fd)
863 return 0; 791 return 0;
864} 792}
865 793
866int perf_header__read(struct perf_session *session, int fd) 794int perf_session__read_header(struct perf_session *session, int fd)
867{ 795{
868 struct perf_header *self = &session->header; 796 struct perf_header *header = &session->header;
869 struct perf_file_header f_header; 797 struct perf_file_header f_header;
870 struct perf_file_attr f_attr; 798 struct perf_file_attr f_attr;
871 u64 f_id; 799 u64 f_id;
872 int nr_attrs, nr_ids, i, j; 800 int nr_attrs, nr_ids, i, j;
873 801
802 session->evlist = perf_evlist__new(NULL, NULL);
803 if (session->evlist == NULL)
804 return -ENOMEM;
805
874 if (session->fd_pipe) 806 if (session->fd_pipe)
875 return perf_header__read_pipe(session, fd); 807 return perf_header__read_pipe(session, fd);
876 808
877 if (perf_file_header__read(&f_header, self, fd) < 0) { 809 if (perf_file_header__read(&f_header, header, fd) < 0) {
878 pr_debug("incompatible file format\n"); 810 pr_debug("incompatible file format\n");
879 return -EINVAL; 811 return -EINVAL;
880 } 812 }
@@ -883,33 +815,39 @@ int perf_header__read(struct perf_session *session, int fd)
883 lseek(fd, f_header.attrs.offset, SEEK_SET); 815 lseek(fd, f_header.attrs.offset, SEEK_SET);
884 816
885 for (i = 0; i < nr_attrs; i++) { 817 for (i = 0; i < nr_attrs; i++) {
886 struct perf_header_attr *attr; 818 struct perf_evsel *evsel;
887 off_t tmp; 819 off_t tmp;
888 820
889 if (perf_header__getbuffer64(self, fd, &f_attr, sizeof(f_attr))) 821 if (perf_header__getbuffer64(header, fd, &f_attr, sizeof(f_attr)))
890 goto out_errno; 822 goto out_errno;
891 823
892 tmp = lseek(fd, 0, SEEK_CUR); 824 tmp = lseek(fd, 0, SEEK_CUR);
825 evsel = perf_evsel__new(&f_attr.attr, i);
893 826
894 attr = perf_header_attr__new(&f_attr.attr); 827 if (evsel == NULL)
895 if (attr == NULL) 828 goto out_delete_evlist;
896 return -ENOMEM; 829 /*
830 * Add it to the evlist first, so that if perf_evsel__alloc_id() fails
831 * this entry gets purged too at perf_evlist__delete().
832 */
833 perf_evlist__add(session->evlist, evsel);
897 834
898 nr_ids = f_attr.ids.size / sizeof(u64); 835 nr_ids = f_attr.ids.size / sizeof(u64);
836 /*
837 * We don't have the cpu and thread maps on the header, so
838 * for allocating the perf_sample_id table we fake 1 cpu and
839 * nr_ids threads.
840 */
841 if (perf_evsel__alloc_id(evsel, 1, nr_ids))
842 goto out_delete_evlist;
843
899 lseek(fd, f_attr.ids.offset, SEEK_SET); 844 lseek(fd, f_attr.ids.offset, SEEK_SET);
900 845
901 for (j = 0; j < nr_ids; j++) { 846 for (j = 0; j < nr_ids; j++) {
902 if (perf_header__getbuffer64(self, fd, &f_id, sizeof(f_id))) 847 if (perf_header__getbuffer64(header, fd, &f_id, sizeof(f_id)))
903 goto out_errno; 848 goto out_errno;
904 849
905 if (perf_header_attr__add_id(attr, f_id) < 0) { 850 perf_evlist__id_add(session->evlist, evsel, 0, j, f_id);
906 perf_header_attr__delete(attr);
907 return -ENOMEM;
908 }
909 }
910 if (perf_header__add_attr(self, attr) < 0) {
911 perf_header_attr__delete(attr);
912 return -ENOMEM;
913 } 851 }
914 852
915 lseek(fd, tmp, SEEK_SET); 853 lseek(fd, tmp, SEEK_SET);
@@ -920,93 +858,63 @@ int perf_header__read(struct perf_session *session, int fd)
920 events = malloc(f_header.event_types.size); 858 events = malloc(f_header.event_types.size);
921 if (events == NULL) 859 if (events == NULL)
922 return -ENOMEM; 860 return -ENOMEM;
923 if (perf_header__getbuffer64(self, fd, events, 861 if (perf_header__getbuffer64(header, fd, events,
924 f_header.event_types.size)) 862 f_header.event_types.size))
925 goto out_errno; 863 goto out_errno;
926 event_count = f_header.event_types.size / sizeof(struct perf_trace_event_type); 864 event_count = f_header.event_types.size / sizeof(struct perf_trace_event_type);
927 } 865 }
928 866
929 perf_header__process_sections(self, fd, perf_file_section__process); 867 perf_header__process_sections(header, fd, perf_file_section__process);
930 868
931 lseek(fd, self->data_offset, SEEK_SET); 869 lseek(fd, header->data_offset, SEEK_SET);
932 870
933 self->frozen = 1; 871 header->frozen = 1;
934 return 0; 872 return 0;
935out_errno: 873out_errno:
936 return -errno; 874 return -errno;
875
876out_delete_evlist:
877 perf_evlist__delete(session->evlist);
878 session->evlist = NULL;
879 return -ENOMEM;
937} 880}
938 881
939u64 perf_header__sample_type(struct perf_header *header) 882u64 perf_evlist__sample_type(struct perf_evlist *evlist)
940{ 883{
884 struct perf_evsel *pos;
941 u64 type = 0; 885 u64 type = 0;
942 int i;
943
944 for (i = 0; i < header->attrs; i++) {
945 struct perf_header_attr *attr = header->attr[i];
946 886
887 list_for_each_entry(pos, &evlist->entries, node) {
947 if (!type) 888 if (!type)
948 type = attr->attr.sample_type; 889 type = pos->attr.sample_type;
949 else if (type != attr->attr.sample_type) 890 else if (type != pos->attr.sample_type)
950 die("non matching sample_type"); 891 die("non matching sample_type");
951 } 892 }
952 893
953 return type; 894 return type;
954} 895}
955 896
956bool perf_header__sample_id_all(const struct perf_header *header) 897bool perf_evlist__sample_id_all(const struct perf_evlist *evlist)
957{ 898{
958 bool value = false, first = true; 899 bool value = false, first = true;
959 int i; 900 struct perf_evsel *pos;
960
961 for (i = 0; i < header->attrs; i++) {
962 struct perf_header_attr *attr = header->attr[i];
963 901
902 list_for_each_entry(pos, &evlist->entries, node) {
964 if (first) { 903 if (first) {
965 value = attr->attr.sample_id_all; 904 value = pos->attr.sample_id_all;
966 first = false; 905 first = false;
967 } else if (value != attr->attr.sample_id_all) 906 } else if (value != pos->attr.sample_id_all)
968 die("non matching sample_id_all"); 907 die("non matching sample_id_all");
969 } 908 }
970 909
971 return value; 910 return value;
972} 911}
973 912
974struct perf_event_attr * 913int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
975perf_header__find_attr(u64 id, struct perf_header *header) 914 perf_event__handler_t process,
976{ 915 struct perf_session *session)
977 int i;
978
979 /*
980 * We set id to -1 if the data file doesn't contain sample
981 * ids. This can happen when the data file contains one type
982 * of event and in that case, the header can still store the
983 * event attribute information. Check for this and avoid
984 * walking through the entire list of ids which may be large.
985 */
986 if (id == -1ULL) {
987 if (header->attrs > 0)
988 return &header->attr[0]->attr;
989 return NULL;
990 }
991
992 for (i = 0; i < header->attrs; i++) {
993 struct perf_header_attr *attr = header->attr[i];
994 int j;
995
996 for (j = 0; j < attr->ids; j++) {
997 if (attr->id[j] == id)
998 return &attr->attr;
999 }
1000 }
1001
1002 return NULL;
1003}
1004
1005int event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
1006 event__handler_t process,
1007 struct perf_session *session)
1008{ 916{
1009 event_t *ev; 917 union perf_event *ev;
1010 size_t size; 918 size_t size;
1011 int err; 919 int err;
1012 920
@@ -1033,17 +941,15 @@ int event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
1033 return err; 941 return err;
1034} 942}
1035 943
1036int event__synthesize_attrs(struct perf_header *self, event__handler_t process, 944int perf_session__synthesize_attrs(struct perf_session *session,
1037 struct perf_session *session) 945 perf_event__handler_t process)
1038{ 946{
1039 struct perf_header_attr *attr; 947 struct perf_evsel *attr;
1040 int i, err = 0; 948 int err = 0;
1041
1042 for (i = 0; i < self->attrs; i++) {
1043 attr = self->attr[i];
1044 949
1045 err = event__synthesize_attr(&attr->attr, attr->ids, attr->id, 950 list_for_each_entry(attr, &session->evlist->entries, node) {
1046 process, session); 951 err = perf_event__synthesize_attr(&attr->attr, attr->ids,
952 attr->id, process, session);
1047 if (err) { 953 if (err) {
1048 pr_debug("failed to create perf header attribute\n"); 954 pr_debug("failed to create perf header attribute\n");
1049 return err; 955 return err;
@@ -1053,29 +959,39 @@ int event__synthesize_attrs(struct perf_header *self, event__handler_t process,
1053 return err; 959 return err;
1054} 960}
1055 961
1056int event__process_attr(event_t *self, struct perf_session *session) 962int perf_event__process_attr(union perf_event *event,
963 struct perf_session *session)
1057{ 964{
1058 struct perf_header_attr *attr;
1059 unsigned int i, ids, n_ids; 965 unsigned int i, ids, n_ids;
966 struct perf_evsel *evsel;
1060 967
1061 attr = perf_header_attr__new(&self->attr.attr); 968 if (session->evlist == NULL) {
1062 if (attr == NULL) 969 session->evlist = perf_evlist__new(NULL, NULL);
970 if (session->evlist == NULL)
971 return -ENOMEM;
972 }
973
974 evsel = perf_evsel__new(&event->attr.attr,
975 session->evlist->nr_entries);
976 if (evsel == NULL)
1063 return -ENOMEM; 977 return -ENOMEM;
1064 978
1065 ids = self->header.size; 979 perf_evlist__add(session->evlist, evsel);
1066 ids -= (void *)&self->attr.id - (void *)self; 980
981 ids = event->header.size;
982 ids -= (void *)&event->attr.id - (void *)event;
1067 n_ids = ids / sizeof(u64); 983 n_ids = ids / sizeof(u64);
984 /*
985 * We don't have the cpu and thread maps on the header, so
986 * for allocating the perf_sample_id table we fake 1 cpu and
987 * n_ids threads.
988 */
989 if (perf_evsel__alloc_id(evsel, 1, n_ids))
990 return -ENOMEM;
1068 991
1069 for (i = 0; i < n_ids; i++) { 992 for (i = 0; i < n_ids; i++) {
1070 if (perf_header_attr__add_id(attr, self->attr.id[i]) < 0) { 993 perf_evlist__id_add(session->evlist, evsel, 0, i,
1071 perf_header_attr__delete(attr); 994 event->attr.id[i]);
1072 return -ENOMEM;
1073 }
1074 }
1075
1076 if (perf_header__add_attr(&session->header, attr) < 0) {
1077 perf_header_attr__delete(attr);
1078 return -ENOMEM;
1079 } 995 }
1080 996
1081 perf_session__update_sample_type(session); 997 perf_session__update_sample_type(session);
@@ -1083,11 +999,11 @@ int event__process_attr(event_t *self, struct perf_session *session)
1083 return 0; 999 return 0;
1084} 1000}
1085 1001
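
The id count in the function above is recovered purely from the record
layout: everything in the event past the offset of attr.id[] is an array of
u64 sample ids. A hedged restatement of that arithmetic, names as in the
patch:

	size_t ids_bytes = event->header.size -
			   ((void *)&event->attr.id - (void *)event);
	unsigned int n_ids = ids_bytes / sizeof(u64);
	/* one perf_sample_id slot per id; with no cpu/thread maps in the
	 * header the table is sized as 1 cpu x n_ids threads */
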
1086int event__synthesize_event_type(u64 event_id, char *name, 1002int perf_event__synthesize_event_type(u64 event_id, char *name,
1087 event__handler_t process, 1003 perf_event__handler_t process,
1088 struct perf_session *session) 1004 struct perf_session *session)
1089{ 1005{
1090 event_t ev; 1006 union perf_event ev;
1091 size_t size = 0; 1007 size_t size = 0;
1092 int err = 0; 1008 int err = 0;
1093 1009
@@ -1108,8 +1024,8 @@ int event__synthesize_event_type(u64 event_id, char *name,
1108 return err; 1024 return err;
1109} 1025}
1110 1026
1111int event__synthesize_event_types(event__handler_t process, 1027int perf_event__synthesize_event_types(perf_event__handler_t process,
1112 struct perf_session *session) 1028 struct perf_session *session)
1113{ 1029{
1114 struct perf_trace_event_type *type; 1030 struct perf_trace_event_type *type;
1115 int i, err = 0; 1031 int i, err = 0;
@@ -1117,8 +1033,9 @@ int event__synthesize_event_types(event__handler_t process,
1117 for (i = 0; i < event_count; i++) { 1033 for (i = 0; i < event_count; i++) {
1118 type = &events[i]; 1034 type = &events[i];
1119 1035
1120 err = event__synthesize_event_type(type->event_id, type->name, 1036 err = perf_event__synthesize_event_type(type->event_id,
1121 process, session); 1037 type->name, process,
1038 session);
1122 if (err) { 1039 if (err) {
1123 pr_debug("failed to create perf header event type\n"); 1040 pr_debug("failed to create perf header event type\n");
1124 return err; 1041 return err;
@@ -1128,28 +1045,28 @@ int event__synthesize_event_types(event__handler_t process,
1128 return err; 1045 return err;
1129} 1046}
1130 1047
1131int event__process_event_type(event_t *self, 1048int perf_event__process_event_type(union perf_event *event,
1132 struct perf_session *session __unused) 1049 struct perf_session *session __unused)
1133{ 1050{
1134 if (perf_header__push_event(self->event_type.event_type.event_id, 1051 if (perf_header__push_event(event->event_type.event_type.event_id,
1135 self->event_type.event_type.name) < 0) 1052 event->event_type.event_type.name) < 0)
1136 return -ENOMEM; 1053 return -ENOMEM;
1137 1054
1138 return 0; 1055 return 0;
1139} 1056}
1140 1057
1141int event__synthesize_tracing_data(int fd, struct list_head *pattrs, 1058int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
1142 event__handler_t process, 1059 perf_event__handler_t process,
1143 struct perf_session *session __unused) 1060 struct perf_session *session __unused)
1144{ 1061{
1145 event_t ev; 1062 union perf_event ev;
1146 ssize_t size = 0, aligned_size = 0, padding; 1063 ssize_t size = 0, aligned_size = 0, padding;
1147 int err = 0; 1064 int err __used = 0;
1148 1065
1149 memset(&ev, 0, sizeof(ev)); 1066 memset(&ev, 0, sizeof(ev));
1150 1067
1151 ev.tracing_data.header.type = PERF_RECORD_HEADER_TRACING_DATA; 1068 ev.tracing_data.header.type = PERF_RECORD_HEADER_TRACING_DATA;
1152 size = read_tracing_data_size(fd, pattrs); 1069 size = read_tracing_data_size(fd, &evlist->entries);
1153 if (size <= 0) 1070 if (size <= 0)
1154 return size; 1071 return size;
1155 aligned_size = ALIGN(size, sizeof(u64)); 1072 aligned_size = ALIGN(size, sizeof(u64));
@@ -1159,16 +1076,16 @@ int event__synthesize_tracing_data(int fd, struct list_head *pattrs,
1159 1076
1160 process(&ev, NULL, session); 1077 process(&ev, NULL, session);
1161 1078
1162 err = read_tracing_data(fd, pattrs); 1079 err = read_tracing_data(fd, &evlist->entries);
1163 write_padded(fd, NULL, 0, padding); 1080 write_padded(fd, NULL, 0, padding);
1164 1081
1165 return aligned_size; 1082 return aligned_size;
1166} 1083}
1167 1084
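
The tracing-data record keeps the event stream u64-aligned: the raw size is
rounded up with ALIGN() and the difference is written out as zero padding.
A small self-contained check of that arithmetic (ALIGN as in the kernel
headers; valid for power-of-two alignments):

	#include <assert.h>
	#include <stdint.h>

	#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

	int main(void)
	{
		long size = 13;		/* raw tracing data bytes */
		long aligned = ALIGN(size, (long)sizeof(uint64_t));

		assert(aligned == 16 && aligned - size == 3); /* 3 pad bytes */
		return 0;
	}
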
1168int event__process_tracing_data(event_t *self, 1085int perf_event__process_tracing_data(union perf_event *event,
1169 struct perf_session *session) 1086 struct perf_session *session)
1170{ 1087{
1171 ssize_t size_read, padding, size = self->tracing_data.size; 1088 ssize_t size_read, padding, size = event->tracing_data.size;
1172 off_t offset = lseek(session->fd, 0, SEEK_CUR); 1089 off_t offset = lseek(session->fd, 0, SEEK_CUR);
1173 char buf[BUFSIZ]; 1090 char buf[BUFSIZ];
1174 1091
@@ -1194,12 +1111,12 @@ int event__process_tracing_data(event_t *self,
1194 return size_read + padding; 1111 return size_read + padding;
1195} 1112}
1196 1113
1197int event__synthesize_build_id(struct dso *pos, u16 misc, 1114int perf_event__synthesize_build_id(struct dso *pos, u16 misc,
1198 event__handler_t process, 1115 perf_event__handler_t process,
1199 struct machine *machine, 1116 struct machine *machine,
1200 struct perf_session *session) 1117 struct perf_session *session)
1201{ 1118{
1202 event_t ev; 1119 union perf_event ev;
1203 size_t len; 1120 size_t len;
1204 int err = 0; 1121 int err = 0;
1205 1122
@@ -1222,11 +1139,11 @@ int event__synthesize_build_id(struct dso *pos, u16 misc,
1222 return err; 1139 return err;
1223} 1140}
1224 1141
1225int event__process_build_id(event_t *self, 1142int perf_event__process_build_id(union perf_event *event,
1226 struct perf_session *session) 1143 struct perf_session *session)
1227{ 1144{
1228 __event_process_build_id(&self->build_id, 1145 __event_process_build_id(&event->build_id,
1229 self->build_id.filename, 1146 event->build_id.filename,
1230 session); 1147 session);
1231 return 0; 1148 return 0;
1232} 1149}
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 33f16be7b72..456661d7f10 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -9,13 +9,6 @@
9 9
10#include <linux/bitmap.h> 10#include <linux/bitmap.h>
11 11
12struct perf_header_attr {
13 struct perf_event_attr attr;
14 int ids, size;
15 u64 *id;
16 off_t id_offset;
17};
18
19enum { 12enum {
20 HEADER_TRACE_INFO = 1, 13 HEADER_TRACE_INFO = 1,
21 HEADER_BUILD_ID, 14 HEADER_BUILD_ID,
@@ -46,14 +39,12 @@ struct perf_pipe_file_header {
46 39
47struct perf_header; 40struct perf_header;
48 41
49int perf_file_header__read(struct perf_file_header *self, 42int perf_file_header__read(struct perf_file_header *header,
50 struct perf_header *ph, int fd); 43 struct perf_header *ph, int fd);
51 44
52struct perf_header { 45struct perf_header {
53 int frozen; 46 int frozen;
54 int attrs, size;
55 bool needs_swap; 47 bool needs_swap;
56 struct perf_header_attr **attr;
57 s64 attr_offset; 48 s64 attr_offset;
58 u64 data_offset; 49 u64 data_offset;
59 u64 data_size; 50 u64 data_size;
@@ -62,34 +53,25 @@ struct perf_header {
62 DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS); 53 DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS);
63}; 54};
64 55
65int perf_header__init(struct perf_header *self); 56struct perf_evlist;
66void perf_header__exit(struct perf_header *self);
67 57
68int perf_header__read(struct perf_session *session, int fd); 58int perf_session__read_header(struct perf_session *session, int fd);
69int perf_header__write(struct perf_header *self, int fd, bool at_exit); 59int perf_session__write_header(struct perf_session *session,
60 struct perf_evlist *evlist,
61 int fd, bool at_exit);
70int perf_header__write_pipe(int fd); 62int perf_header__write_pipe(int fd);
71 63
72int perf_header__add_attr(struct perf_header *self,
73 struct perf_header_attr *attr);
74
75int perf_header__push_event(u64 id, const char *name); 64int perf_header__push_event(u64 id, const char *name);
76char *perf_header__find_event(u64 id); 65char *perf_header__find_event(u64 id);
77 66
78struct perf_header_attr *perf_header_attr__new(struct perf_event_attr *attr); 67u64 perf_evlist__sample_type(struct perf_evlist *evlist);
79void perf_header_attr__delete(struct perf_header_attr *self); 68bool perf_evlist__sample_id_all(const struct perf_evlist *evlist);
69void perf_header__set_feat(struct perf_header *header, int feat);
70void perf_header__clear_feat(struct perf_header *header, int feat);
71bool perf_header__has_feat(const struct perf_header *header, int feat);
80 72
81int perf_header_attr__add_id(struct perf_header_attr *self, u64 id); 73int perf_header__process_sections(struct perf_header *header, int fd,
82 74 int (*process)(struct perf_file_section *section,
83u64 perf_header__sample_type(struct perf_header *header);
84bool perf_header__sample_id_all(const struct perf_header *header);
85struct perf_event_attr *
86perf_header__find_attr(u64 id, struct perf_header *header);
87void perf_header__set_feat(struct perf_header *self, int feat);
88void perf_header__clear_feat(struct perf_header *self, int feat);
89bool perf_header__has_feat(const struct perf_header *self, int feat);
90
91int perf_header__process_sections(struct perf_header *self, int fd,
92 int (*process)(struct perf_file_section *self,
93 struct perf_header *ph, 75 struct perf_header *ph,
94 int feat, int fd)); 76 int feat, int fd));
95 77
@@ -97,32 +79,31 @@ int build_id_cache__add_s(const char *sbuild_id, const char *debugdir,
97 const char *name, bool is_kallsyms); 79 const char *name, bool is_kallsyms);
98int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir); 80int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir);
99 81
100int event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, 82int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
101 event__handler_t process, 83 perf_event__handler_t process,
102 struct perf_session *session);
103int event__synthesize_attrs(struct perf_header *self,
104 event__handler_t process,
105 struct perf_session *session);
106int event__process_attr(event_t *self, struct perf_session *session);
107
108int event__synthesize_event_type(u64 event_id, char *name,
109 event__handler_t process,
110 struct perf_session *session);
111int event__synthesize_event_types(event__handler_t process,
112 struct perf_session *session);
113int event__process_event_type(event_t *self,
114 struct perf_session *session);
115
116int event__synthesize_tracing_data(int fd, struct list_head *pattrs,
117 event__handler_t process,
118 struct perf_session *session);
119int event__process_tracing_data(event_t *self,
120 struct perf_session *session); 84 struct perf_session *session);
85int perf_session__synthesize_attrs(struct perf_session *session,
86 perf_event__handler_t process);
87int perf_event__process_attr(union perf_event *event, struct perf_session *session);
88
89int perf_event__synthesize_event_type(u64 event_id, char *name,
90 perf_event__handler_t process,
91 struct perf_session *session);
92int perf_event__synthesize_event_types(perf_event__handler_t process,
93 struct perf_session *session);
94int perf_event__process_event_type(union perf_event *event,
95 struct perf_session *session);
121 96
122int event__synthesize_build_id(struct dso *pos, u16 misc, 97int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
123 event__handler_t process, 98 perf_event__handler_t process,
124 struct machine *machine, 99 struct perf_session *session);
125 struct perf_session *session); 100int perf_event__process_tracing_data(union perf_event *event,
126int event__process_build_id(event_t *self, struct perf_session *session); 101 struct perf_session *session);
127 102
103int perf_event__synthesize_build_id(struct dso *pos, u16 misc,
104 perf_event__handler_t process,
105 struct machine *machine,
106 struct perf_session *session);
107int perf_event__process_build_id(union perf_event *event,
108 struct perf_session *session);
128#endif /* __PERF_HEADER_H */ 109#endif /* __PERF_HEADER_H */
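
A hedged usage sketch of the renamed synthesis entry points, assuming a
session whose evlist is already populated; the callback name below is
illustrative only, not part of this patch:

	int err = perf_session__synthesize_attrs(session,
						 process_synthesized_event);
	if (err < 0)
		pr_err("Couldn't synthesize attrs.\n");
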
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index df51560f16f..627a02e03c5 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -1,3 +1,4 @@
1#include "annotate.h"
1#include "util.h" 2#include "util.h"
2#include "build-id.h" 3#include "build-id.h"
3#include "hist.h" 4#include "hist.h"
@@ -49,6 +50,15 @@ static void hists__calc_col_len(struct hists *self, struct hist_entry *h)
49 50
50 if (h->ms.sym) 51 if (h->ms.sym)
51 hists__new_col_len(self, HISTC_SYMBOL, h->ms.sym->namelen); 52 hists__new_col_len(self, HISTC_SYMBOL, h->ms.sym->namelen);
53 else {
54 const unsigned int unresolved_col_width = BITS_PER_LONG / 4;
55
56 if (hists__col_len(self, HISTC_DSO) < unresolved_col_width &&
57 !symbol_conf.col_width_list_str && !symbol_conf.field_sep &&
58 !symbol_conf.dso_list)
59 hists__set_col_len(self, HISTC_DSO,
60 unresolved_col_width);
61 }
52 62
53 len = thread__comm_len(h->thread); 63 len = thread__comm_len(h->thread);
54 if (hists__new_col_len(self, HISTC_COMM, len)) 64 if (hists__new_col_len(self, HISTC_COMM, len))
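
The unresolved_col_width added above is the printed width of a raw address:
one column per hex nibble, hence BITS_PER_LONG / 4. A trivial standalone
check, assuming an LP64 host:

	#include <stdio.h>

	int main(void)
	{
		int bits_per_long = 8 * (int)sizeof(long);

		printf("%d columns\n", bits_per_long / 4); /* 16 on LP64 */
		return 0;
	}
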
@@ -211,7 +221,9 @@ void hist_entry__free(struct hist_entry *he)
211 * collapse the histogram 221 * collapse the histogram
212 */ 222 */
213 223
214static bool collapse__insert_entry(struct rb_root *root, struct hist_entry *he) 224static bool hists__collapse_insert_entry(struct hists *self,
225 struct rb_root *root,
226 struct hist_entry *he)
215{ 227{
216 struct rb_node **p = &root->rb_node; 228 struct rb_node **p = &root->rb_node;
217 struct rb_node *parent = NULL; 229 struct rb_node *parent = NULL;
@@ -226,8 +238,11 @@ static bool collapse__insert_entry(struct rb_root *root, struct hist_entry *he)
226 238
227 if (!cmp) { 239 if (!cmp) {
228 iter->period += he->period; 240 iter->period += he->period;
229 if (symbol_conf.use_callchain) 241 if (symbol_conf.use_callchain) {
230 callchain_merge(iter->callchain, he->callchain); 242 callchain_cursor_reset(&self->callchain_cursor);
243 callchain_merge(&self->callchain_cursor, iter->callchain,
244 he->callchain);
245 }
231 hist_entry__free(he); 246 hist_entry__free(he);
232 return false; 247 return false;
233 } 248 }
@@ -262,7 +277,7 @@ void hists__collapse_resort(struct hists *self)
262 next = rb_next(&n->rb_node); 277 next = rb_next(&n->rb_node);
263 278
264 rb_erase(&n->rb_node, &self->entries); 279 rb_erase(&n->rb_node, &self->entries);
265 if (collapse__insert_entry(&tmp, n)) 280 if (hists__collapse_insert_entry(self, &tmp, n))
266 hists__inc_nr_entries(self, n); 281 hists__inc_nr_entries(self, n);
267 } 282 }
268 283
@@ -425,7 +440,7 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
425 u64 cumul; 440 u64 cumul;
426 441
427 child = rb_entry(node, struct callchain_node, rb_node); 442 child = rb_entry(node, struct callchain_node, rb_node);
428 cumul = cumul_hits(child); 443 cumul = callchain_cumul_hits(child);
429 remaining -= cumul; 444 remaining -= cumul;
430 445
431 /* 446 /*
@@ -947,225 +962,14 @@ void hists__filter_by_thread(struct hists *self, const struct thread *thread)
947 } 962 }
948} 963}
949 964
950static int symbol__alloc_hist(struct symbol *self) 965int hist_entry__inc_addr_samples(struct hist_entry *he, int evidx, u64 ip)
951{
952 struct sym_priv *priv = symbol__priv(self);
953 const int size = (sizeof(*priv->hist) +
954 (self->end - self->start) * sizeof(u64));
955
956 priv->hist = zalloc(size);
957 return priv->hist == NULL ? -1 : 0;
958}
959
960int hist_entry__inc_addr_samples(struct hist_entry *self, u64 ip)
961{
962 unsigned int sym_size, offset;
963 struct symbol *sym = self->ms.sym;
964 struct sym_priv *priv;
965 struct sym_hist *h;
966
967 if (!sym || !self->ms.map)
968 return 0;
969
970 priv = symbol__priv(sym);
971 if (priv->hist == NULL && symbol__alloc_hist(sym) < 0)
972 return -ENOMEM;
973
974 sym_size = sym->end - sym->start;
975 offset = ip - sym->start;
976
977 pr_debug3("%s: ip=%#" PRIx64 "\n", __func__, self->ms.map->unmap_ip(self->ms.map, ip));
978
979 if (offset >= sym_size)
980 return 0;
981
982 h = priv->hist;
983 h->sum++;
984 h->ip[offset]++;
985
986 pr_debug3("%#" PRIx64 " %s: period++ [ip: %#" PRIx64 ", %#" PRIx64
987 "] => %" PRIu64 "\n", self->ms.sym->start, self->ms.sym->name,
988 ip, ip - self->ms.sym->start, h->ip[offset]);
989 return 0;
990}
991
992static struct objdump_line *objdump_line__new(s64 offset, char *line, size_t privsize)
993{
994 struct objdump_line *self = malloc(sizeof(*self) + privsize);
995
996 if (self != NULL) {
997 self->offset = offset;
998 self->line = line;
999 }
1000
1001 return self;
1002}
1003
1004void objdump_line__free(struct objdump_line *self)
1005{
1006 free(self->line);
1007 free(self);
1008}
1009
1010static void objdump__add_line(struct list_head *head, struct objdump_line *line)
1011{
1012 list_add_tail(&line->node, head);
1013}
1014
1015struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
1016 struct objdump_line *pos)
1017{
1018 list_for_each_entry_continue(pos, head, node)
1019 if (pos->offset >= 0)
1020 return pos;
1021
1022 return NULL;
1023}
1024
1025static int hist_entry__parse_objdump_line(struct hist_entry *self, FILE *file,
1026 struct list_head *head, size_t privsize)
1027{ 966{
1028 struct symbol *sym = self->ms.sym; 967 return symbol__inc_addr_samples(he->ms.sym, he->ms.map, evidx, ip);
1029 struct objdump_line *objdump_line;
1030 char *line = NULL, *tmp, *tmp2, *c;
1031 size_t line_len;
1032 s64 line_ip, offset = -1;
1033
1034 if (getline(&line, &line_len, file) < 0)
1035 return -1;
1036
1037 if (!line)
1038 return -1;
1039
1040 while (line_len != 0 && isspace(line[line_len - 1]))
1041 line[--line_len] = '\0';
1042
1043 c = strchr(line, '\n');
1044 if (c)
1045 *c = 0;
1046
1047 line_ip = -1;
1048
1049 /*
1050 * Strip leading spaces:
1051 */
1052 tmp = line;
1053 while (*tmp) {
1054 if (*tmp != ' ')
1055 break;
1056 tmp++;
1057 }
1058
1059 if (*tmp) {
1060 /*
1061 * Parse hexa addresses followed by ':'
1062 */
1063 line_ip = strtoull(tmp, &tmp2, 16);
1064 if (*tmp2 != ':' || tmp == tmp2 || tmp2[1] == '\0')
1065 line_ip = -1;
1066 }
1067
1068 if (line_ip != -1) {
1069 u64 start = map__rip_2objdump(self->ms.map, sym->start),
1070 end = map__rip_2objdump(self->ms.map, sym->end);
1071
1072 offset = line_ip - start;
1073 if (offset < 0 || (u64)line_ip > end)
1074 offset = -1;
1075 }
1076
1077 objdump_line = objdump_line__new(offset, line, privsize);
1078 if (objdump_line == NULL) {
1079 free(line);
1080 return -1;
1081 }
1082 objdump__add_line(head, objdump_line);
1083
1084 return 0;
1085} 968}
1086 969
1087int hist_entry__annotate(struct hist_entry *self, struct list_head *head, 970int hist_entry__annotate(struct hist_entry *he, size_t privsize)
1088 size_t privsize)
1089{ 971{
1090 struct symbol *sym = self->ms.sym; 972 return symbol__annotate(he->ms.sym, he->ms.map, privsize);
1091 struct map *map = self->ms.map;
1092 struct dso *dso = map->dso;
1093 char *filename = dso__build_id_filename(dso, NULL, 0);
1094 bool free_filename = true;
1095 char command[PATH_MAX * 2];
1096 FILE *file;
1097 int err = 0;
1098 u64 len;
1099 char symfs_filename[PATH_MAX];
1100
1101 if (filename) {
1102 snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
1103 symbol_conf.symfs, filename);
1104 }
1105
1106 if (filename == NULL) {
1107 if (dso->has_build_id) {
1108 pr_err("Can't annotate %s: not enough memory\n",
1109 sym->name);
1110 return -ENOMEM;
1111 }
1112 goto fallback;
1113 } else if (readlink(symfs_filename, command, sizeof(command)) < 0 ||
1114 strstr(command, "[kernel.kallsyms]") ||
1115 access(symfs_filename, R_OK)) {
1116 free(filename);
1117fallback:
1118 /*
1119 * If we don't have build-ids or the build-id file isn't in the
1120 * cache, or is just a kallsyms file, well, lets hope that this
1121 * DSO is the same as when 'perf record' ran.
1122 */
1123 filename = dso->long_name;
1124 snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
1125 symbol_conf.symfs, filename);
1126 free_filename = false;
1127 }
1128
1129 if (dso->origin == DSO__ORIG_KERNEL) {
1130 if (dso->annotate_warned)
1131 goto out_free_filename;
1132 err = -ENOENT;
1133 dso->annotate_warned = 1;
1134 pr_err("Can't annotate %s: No vmlinux file was found in the "
1135 "path\n", sym->name);
1136 goto out_free_filename;
1137 }
1138
1139 pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__,
1140 filename, sym->name, map->unmap_ip(map, sym->start),
1141 map->unmap_ip(map, sym->end));
1142
1143 len = sym->end - sym->start;
1144
1145 pr_debug("annotating [%p] %30s : [%p] %30s\n",
1146 dso, dso->long_name, sym, sym->name);
1147
1148 snprintf(command, sizeof(command),
1149 "objdump --start-address=0x%016" PRIx64 " --stop-address=0x%016" PRIx64 " -dS -C %s|grep -v %s|expand",
1150 map__rip_2objdump(map, sym->start),
1151 map__rip_2objdump(map, sym->end),
1152 symfs_filename, filename);
1153
1154 pr_debug("Executing: %s\n", command);
1155
1156 file = popen(command, "r");
1157 if (!file)
1158 goto out_free_filename;
1159
1160 while (!feof(file))
1161 if (hist_entry__parse_objdump_line(self, file, head, privsize) < 0)
1162 break;
1163
1164 pclose(file);
1165out_free_filename:
1166 if (free_filename)
1167 free(filename);
1168 return err;
1169} 973}
1170 974
1171void hists__inc_nr_events(struct hists *self, u32 type) 975void hists__inc_nr_events(struct hists *self, u32 type)
@@ -1180,8 +984,12 @@ size_t hists__fprintf_nr_events(struct hists *self, FILE *fp)
1180 size_t ret = 0; 984 size_t ret = 0;
1181 985
1182 for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) { 986 for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) {
1183 const char *name = event__get_event_name(i); 987 const char *name;
988
989 if (self->stats.nr_events[i] == 0)
990 continue;
1184 991
992 name = perf_event__name(i);
1185 if (!strcmp(name, "UNKNOWN")) 993 if (!strcmp(name, "UNKNOWN"))
1186 continue; 994 continue;
1187 995
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index ee789856a8c..cb6858a2f9a 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -9,33 +9,6 @@ extern struct callchain_param callchain_param;
9struct hist_entry; 9struct hist_entry;
10struct addr_location; 10struct addr_location;
11struct symbol; 11struct symbol;
12struct rb_root;
13
14struct objdump_line {
15 struct list_head node;
16 s64 offset;
17 char *line;
18};
19
20void objdump_line__free(struct objdump_line *self);
21struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
22 struct objdump_line *pos);
23
24struct sym_hist {
25 u64 sum;
26 u64 ip[0];
27};
28
29struct sym_ext {
30 struct rb_node node;
31 double percent;
32 char *path;
33};
34
35struct sym_priv {
36 struct sym_hist *hist;
37 struct sym_ext *ext;
38};
39 12
40/* 13/*
41 * The kernel collects the number of events it couldn't send in a stretch and 14 * The kernel collects the number of events it couldn't send in a stretch and
@@ -69,14 +42,13 @@ enum hist_column {
69}; 42};
70 43
71struct hists { 44struct hists {
72 struct rb_node rb_node;
73 struct rb_root entries; 45 struct rb_root entries;
74 u64 nr_entries; 46 u64 nr_entries;
75 struct events_stats stats; 47 struct events_stats stats;
76 u64 config;
77 u64 event_stream; 48 u64 event_stream;
78 u32 type;
79 u16 col_len[HISTC_NR_COLS]; 49 u16 col_len[HISTC_NR_COLS];
50 /* Best would be to reuse the session callchain cursor */
51 struct callchain_cursor callchain_cursor;
80}; 52};
81 53
82struct hist_entry *__hists__add_entry(struct hists *self, 54struct hist_entry *__hists__add_entry(struct hists *self,
@@ -102,9 +74,8 @@ size_t hists__fprintf_nr_events(struct hists *self, FILE *fp);
102size_t hists__fprintf(struct hists *self, struct hists *pair, 74size_t hists__fprintf(struct hists *self, struct hists *pair,
103 bool show_displacement, FILE *fp); 75 bool show_displacement, FILE *fp);
104 76
105int hist_entry__inc_addr_samples(struct hist_entry *self, u64 ip); 77int hist_entry__inc_addr_samples(struct hist_entry *self, int evidx, u64 addr);
106int hist_entry__annotate(struct hist_entry *self, struct list_head *head, 78int hist_entry__annotate(struct hist_entry *self, size_t privsize);
107 size_t privsize);
108 79
109void hists__filter_by_dso(struct hists *self, const struct dso *dso); 80void hists__filter_by_dso(struct hists *self, const struct dso *dso);
110void hists__filter_by_thread(struct hists *self, const struct thread *thread); 81void hists__filter_by_thread(struct hists *self, const struct thread *thread);
@@ -113,21 +84,18 @@ u16 hists__col_len(struct hists *self, enum hist_column col);
113void hists__set_col_len(struct hists *self, enum hist_column col, u16 len); 84void hists__set_col_len(struct hists *self, enum hist_column col, u16 len);
114bool hists__new_col_len(struct hists *self, enum hist_column col, u16 len); 85bool hists__new_col_len(struct hists *self, enum hist_column col, u16 len);
115 86
116#ifdef NO_NEWT_SUPPORT 87struct perf_evlist;
117static inline int hists__browse(struct hists *self __used,
118 const char *helpline __used,
119 const char *ev_name __used)
120{
121 return 0;
122}
123 88
124static inline int hists__tui_browse_tree(struct rb_root *self __used, 89#ifdef NO_NEWT_SUPPORT
125 const char *help __used) 90static inline
91int perf_evlist__tui_browse_hists(struct perf_evlist *evlist __used,
92 const char *help __used)
126{ 93{
127 return 0; 94 return 0;
128} 95}
129 96
130static inline int hist_entry__tui_annotate(struct hist_entry *self __used) 97static inline int hist_entry__tui_annotate(struct hist_entry *self __used,
98 int evidx __used)
131{ 99{
132 return 0; 100 return 0;
133} 101}
@@ -135,14 +103,12 @@ static inline int hist_entry__tui_annotate(struct hist_entry *self __used)
135#define KEY_RIGHT -2 103#define KEY_RIGHT -2
136#else 104#else
137#include <newt.h> 105#include <newt.h>
138int hists__browse(struct hists *self, const char *helpline, 106int hist_entry__tui_annotate(struct hist_entry *self, int evidx);
139 const char *ev_name);
140int hist_entry__tui_annotate(struct hist_entry *self);
141 107
142#define KEY_LEFT NEWT_KEY_LEFT 108#define KEY_LEFT NEWT_KEY_LEFT
143#define KEY_RIGHT NEWT_KEY_RIGHT 109#define KEY_RIGHT NEWT_KEY_RIGHT
144 110
145int hists__tui_browse_tree(struct rb_root *self, const char *help); 111int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help);
146#endif 112#endif
147 113
148unsigned int hists__sort_list_width(struct hists *self); 114unsigned int hists__sort_list_width(struct hists *self);
diff --git a/tools/perf/util/include/linux/list.h b/tools/perf/util/include/linux/list.h
index f5ca26e53fb..356c7e467b8 100644
--- a/tools/perf/util/include/linux/list.h
+++ b/tools/perf/util/include/linux/list.h
@@ -1,3 +1,4 @@
1#include <linux/kernel.h>
1#include "../../../../include/linux/list.h" 2#include "../../../../include/linux/list.h"
2 3
3#ifndef PERF_LIST_H 4#ifndef PERF_LIST_H
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 135f69baf96..54a7e2634d5 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1,6 +1,7 @@
1#include "../../../include/linux/hw_breakpoint.h" 1#include "../../../include/linux/hw_breakpoint.h"
2#include "util.h" 2#include "util.h"
3#include "../perf.h" 3#include "../perf.h"
4#include "evlist.h"
4#include "evsel.h" 5#include "evsel.h"
5#include "parse-options.h" 6#include "parse-options.h"
6#include "parse-events.h" 7#include "parse-events.h"
@@ -11,10 +12,6 @@
11#include "header.h" 12#include "header.h"
12#include "debugfs.h" 13#include "debugfs.h"
13 14
14int nr_counters;
15
16LIST_HEAD(evsel_list);
17
18struct event_symbol { 15struct event_symbol {
19 u8 type; 16 u8 type;
20 u64 config; 17 u64 config;
@@ -271,6 +268,9 @@ const char *event_name(struct perf_evsel *evsel)
271 u64 config = evsel->attr.config; 268 u64 config = evsel->attr.config;
272 int type = evsel->attr.type; 269 int type = evsel->attr.type;
273 270
271 if (evsel->name)
272 return evsel->name;
273
274 return __event_name(type, config); 274 return __event_name(type, config);
275} 275}
276 276
@@ -449,8 +449,8 @@ parse_single_tracepoint_event(char *sys_name,
449/* sys + ':' + event + ':' + flags*/ 449/* sys + ':' + event + ':' + flags*/
450#define MAX_EVOPT_LEN (MAX_EVENT_LENGTH * 2 + 2 + 128) 450#define MAX_EVOPT_LEN (MAX_EVENT_LENGTH * 2 + 2 + 128)
451static enum event_result 451static enum event_result
452parse_multiple_tracepoint_event(char *sys_name, const char *evt_exp, 452parse_multiple_tracepoint_event(const struct option *opt, char *sys_name,
453 char *flags) 453 const char *evt_exp, char *flags)
454{ 454{
455 char evt_path[MAXPATHLEN]; 455 char evt_path[MAXPATHLEN];
456 struct dirent *evt_ent; 456 struct dirent *evt_ent;
@@ -483,15 +483,16 @@ parse_multiple_tracepoint_event(char *sys_name, const char *evt_exp,
483 if (len < 0) 483 if (len < 0)
484 return EVT_FAILED; 484 return EVT_FAILED;
485 485
486 if (parse_events(NULL, event_opt, 0)) 486 if (parse_events(opt, event_opt, 0))
487 return EVT_FAILED; 487 return EVT_FAILED;
488 } 488 }
489 489
490 return EVT_HANDLED_ALL; 490 return EVT_HANDLED_ALL;
491} 491}
492 492
493static enum event_result parse_tracepoint_event(const char **strp, 493static enum event_result
494 struct perf_event_attr *attr) 494parse_tracepoint_event(const struct option *opt, const char **strp,
495 struct perf_event_attr *attr)
495{ 496{
496 const char *evt_name; 497 const char *evt_name;
497 char *flags = NULL, *comma_loc; 498 char *flags = NULL, *comma_loc;
@@ -530,7 +531,7 @@ static enum event_result parse_tracepoint_event(const char **strp,
530 return EVT_FAILED; 531 return EVT_FAILED;
531 if (strpbrk(evt_name, "*?")) { 532 if (strpbrk(evt_name, "*?")) {
532 *strp += strlen(sys_name) + evt_length + 1; /* 1 == the ':' */ 533 *strp += strlen(sys_name) + evt_length + 1; /* 1 == the ':' */
533 return parse_multiple_tracepoint_event(sys_name, evt_name, 534 return parse_multiple_tracepoint_event(opt, sys_name, evt_name,
534 flags); 535 flags);
535 } else { 536 } else {
536 return parse_single_tracepoint_event(sys_name, evt_name, 537 return parse_single_tracepoint_event(sys_name, evt_name,
@@ -740,11 +741,12 @@ parse_event_modifier(const char **strp, struct perf_event_attr *attr)
740 * Symbolic names are (almost) exactly matched. 741 * Symbolic names are (almost) exactly matched.
741 */ 742 */
742static enum event_result 743static enum event_result
743parse_event_symbols(const char **str, struct perf_event_attr *attr) 744parse_event_symbols(const struct option *opt, const char **str,
745 struct perf_event_attr *attr)
744{ 746{
745 enum event_result ret; 747 enum event_result ret;
746 748
747 ret = parse_tracepoint_event(str, attr); 749 ret = parse_tracepoint_event(opt, str, attr);
748 if (ret != EVT_FAILED) 750 if (ret != EVT_FAILED)
749 goto modifier; 751 goto modifier;
750 752
@@ -778,14 +780,17 @@ modifier:
778 return ret; 780 return ret;
779} 781}
780 782
781int parse_events(const struct option *opt __used, const char *str, int unset __used) 783int parse_events(const struct option *opt, const char *str, int unset __used)
782{ 784{
785 struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
783 struct perf_event_attr attr; 786 struct perf_event_attr attr;
784 enum event_result ret; 787 enum event_result ret;
788 const char *ostr;
785 789
786 for (;;) { 790 for (;;) {
791 ostr = str;
787 memset(&attr, 0, sizeof(attr)); 792 memset(&attr, 0, sizeof(attr));
788 ret = parse_event_symbols(&str, &attr); 793 ret = parse_event_symbols(opt, &str, &attr);
789 if (ret == EVT_FAILED) 794 if (ret == EVT_FAILED)
790 return -1; 795 return -1;
791 796
@@ -794,12 +799,15 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u
794 799
795 if (ret != EVT_HANDLED_ALL) { 800 if (ret != EVT_HANDLED_ALL) {
796 struct perf_evsel *evsel; 801 struct perf_evsel *evsel;
797 evsel = perf_evsel__new(&attr, 802 evsel = perf_evsel__new(&attr, evlist->nr_entries);
798 nr_counters);
799 if (evsel == NULL) 803 if (evsel == NULL)
800 return -1; 804 return -1;
801 list_add_tail(&evsel->node, &evsel_list); 805 perf_evlist__add(evlist, evsel);
802 ++nr_counters; 806
807 evsel->name = calloc(str - ostr + 1, 1);
808 if (!evsel->name)
809 return -1;
810 strncpy(evsel->name, ostr, str - ostr);
803 } 811 }
804 812
805 if (*str == 0) 813 if (*str == 0)
@@ -813,13 +821,14 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u
813 return 0; 821 return 0;
814} 822}
815 823
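
The evsel->name copy above leans on calloc() zero-filling: allocating
str - ostr + 1 bytes and strncpy()ing exactly str - ostr leaves the
terminating NUL in place. A standalone illustration of the idiom (the
sample event string is made up):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int main(void)
	{
		const char *ostr = "cycles:u,instructions";
		const char *str  = ostr + 8;	/* parser stopped past "cycles:u" */
		char *name = calloc(str - ostr + 1, 1);

		if (name == NULL)
			return 1;
		strncpy(name, ostr, str - ostr);
		printf("%s\n", name);		/* -> cycles:u */
		free(name);
		return 0;
	}
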
816int parse_filter(const struct option *opt __used, const char *str, 824int parse_filter(const struct option *opt, const char *str,
817 int unset __used) 825 int unset __used)
818{ 826{
827 struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
819 struct perf_evsel *last = NULL; 828 struct perf_evsel *last = NULL;
820 829
821 if (!list_empty(&evsel_list)) 830 if (evlist->nr_entries > 0)
822 last = list_entry(evsel_list.prev, struct perf_evsel, node); 831 last = list_entry(evlist->entries.prev, struct perf_evsel, node);
823 832
824 if (last == NULL || last->attr.type != PERF_TYPE_TRACEPOINT) { 833 if (last == NULL || last->attr.type != PERF_TYPE_TRACEPOINT) {
825 fprintf(stderr, 834 fprintf(stderr,
@@ -849,7 +858,7 @@ static const char * const event_type_descriptors[] = {
849 * Print the events from <debugfs_mount_point>/tracing/events 858 * Print the events from <debugfs_mount_point>/tracing/events
850 */ 859 */
851 860
852static void print_tracepoint_events(void) 861void print_tracepoint_events(const char *subsys_glob, const char *event_glob)
853{ 862{
854 DIR *sys_dir, *evt_dir; 863 DIR *sys_dir, *evt_dir;
855 struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent; 864 struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
@@ -864,6 +873,9 @@ static void print_tracepoint_events(void)
864 return; 873 return;
865 874
866 for_each_subsystem(sys_dir, sys_dirent, sys_next) { 875 for_each_subsystem(sys_dir, sys_dirent, sys_next) {
876 if (subsys_glob != NULL &&
877 !strglobmatch(sys_dirent.d_name, subsys_glob))
878 continue;
867 879
868 snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path, 880 snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path,
869 sys_dirent.d_name); 881 sys_dirent.d_name);
@@ -872,6 +884,10 @@ static void print_tracepoint_events(void)
872 continue; 884 continue;
873 885
874 for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) { 886 for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) {
887 if (event_glob != NULL &&
888 !strglobmatch(evt_dirent.d_name, event_glob))
889 continue;
890
875 snprintf(evt_path, MAXPATHLEN, "%s:%s", 891 snprintf(evt_path, MAXPATHLEN, "%s:%s",
876 sys_dirent.d_name, evt_dirent.d_name); 892 sys_dirent.d_name, evt_dirent.d_name);
877 printf(" %-42s [%s]\n", evt_path, 893 printf(" %-42s [%s]\n", evt_path,
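
Both added checks use the existing strglobmatch() helper from
tools/perf/util/string.c, so 'perf list' can now take shell-style globs. A
hedged fragment of the filter shape, with made-up sample names:

	/* e.g. the user asked for "sched:sched_sw*" */
	if (subsys_glob != NULL && !strglobmatch("sched", subsys_glob))
		continue;	/* whole subsystem skipped */
	if (event_glob != NULL && !strglobmatch("sched_switch", event_glob))
		continue;	/* single event skipped */
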
@@ -923,13 +939,61 @@ int is_valid_tracepoint(const char *event_string)
923 return 0; 939 return 0;
924} 940}
925 941
942void print_events_type(u8 type)
943{
944 struct event_symbol *syms = event_symbols;
945 unsigned int i;
946 char name[64];
947
948 for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) {
949 if (type != syms->type)
950 continue;
951
952 if (strlen(syms->alias))
953 snprintf(name, sizeof(name), "%s OR %s",
954 syms->symbol, syms->alias);
955 else
956 snprintf(name, sizeof(name), "%s", syms->symbol);
957
958 printf(" %-42s [%s]\n", name,
959 event_type_descriptors[type]);
960 }
961}
962
963int print_hwcache_events(const char *event_glob)
964{
965 unsigned int type, op, i, printed = 0;
966
967 for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) {
968 for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) {
969 /* skip invalid cache type */
970 if (!is_cache_op_valid(type, op))
971 continue;
972
973 for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
974 char *name = event_cache_name(type, op, i);
975
976 if (event_glob != NULL &&
977 !strglobmatch(name, event_glob))
978 continue;
979
980 printf(" %-42s [%s]\n", name,
981 event_type_descriptors[PERF_TYPE_HW_CACHE]);
982 ++printed;
983 }
984 }
985 }
986
987 return printed;
988}
989
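
print_hwcache_events() enumerates the type x op x result cross product and
relies on is_cache_op_valid() to drop impossible combinations (for instance
instruction-cache writes). A rough standalone count, assuming the
perf_event.h values of this era (6 cache types, 3 ops, 2 results -- an
assumption worth checking against the header):

	#include <stdio.h>

	int main(void)
	{
		int types = 6, ops = 3, results = 2;	/* assumed values */

		printf("%d names before validity filtering\n",
		       types * ops * results);		/* 36 */
		return 0;
	}
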
926/* 990/*
927 * Print the help text for the event symbols: 991 * Print the help text for the event symbols:
928 */ 992 */
929void print_events(void) 993void print_events(const char *event_glob)
930{ 994{
931 struct event_symbol *syms = event_symbols; 995 struct event_symbol *syms = event_symbols;
932 unsigned int i, type, op, prev_type = -1; 996 unsigned int i, type, prev_type = -1, printed = 0, ntypes_printed = 0;
933 char name[40]; 997 char name[40];
934 998
935 printf("\n"); 999 printf("\n");
@@ -938,8 +1002,16 @@ void print_events(void)
938 for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) { 1002 for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) {
939 type = syms->type; 1003 type = syms->type;
940 1004
941 if (type != prev_type) 1005 if (type != prev_type && printed) {
942 printf("\n"); 1006 printf("\n");
1007 printed = 0;
1008 ntypes_printed++;
1009 }
1010
1011 if (event_glob != NULL &&
1012 !(strglobmatch(syms->symbol, event_glob) ||
1013 (syms->alias && strglobmatch(syms->alias, event_glob))))
1014 continue;
943 1015
944 if (strlen(syms->alias)) 1016 if (strlen(syms->alias))
945 sprintf(name, "%s OR %s", syms->symbol, syms->alias); 1017 sprintf(name, "%s OR %s", syms->symbol, syms->alias);
@@ -949,22 +1021,17 @@ void print_events(void)
949 event_type_descriptors[type]); 1021 event_type_descriptors[type]);
950 1022
951 prev_type = type; 1023 prev_type = type;
1024 ++printed;
952 } 1025 }
953 1026
954 printf("\n"); 1027 if (ntypes_printed) {
955 for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) { 1028 printed = 0;
956 for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) { 1029 printf("\n");
957 /* skip invalid cache type */
958 if (!is_cache_op_valid(type, op))
959 continue;
960
961 for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
962 printf(" %-42s [%s]\n",
963 event_cache_name(type, op, i),
964 event_type_descriptors[PERF_TYPE_HW_CACHE]);
965 }
966 }
967 } 1030 }
1031 print_hwcache_events(event_glob);
1032
1033 if (event_glob != NULL)
1034 return;
968 1035
969 printf("\n"); 1036 printf("\n");
970 printf(" %-42s [%s]\n", 1037 printf(" %-42s [%s]\n",
@@ -977,37 +1044,7 @@ void print_events(void)
977 event_type_descriptors[PERF_TYPE_BREAKPOINT]); 1044 event_type_descriptors[PERF_TYPE_BREAKPOINT]);
978 printf("\n"); 1045 printf("\n");
979 1046
980 print_tracepoint_events(); 1047 print_tracepoint_events(NULL, NULL);
981 1048
982 exit(129); 1049 exit(129);
983} 1050}
984
985int perf_evsel_list__create_default(void)
986{
987 struct perf_evsel *evsel;
988 struct perf_event_attr attr;
989
990 memset(&attr, 0, sizeof(attr));
991 attr.type = PERF_TYPE_HARDWARE;
992 attr.config = PERF_COUNT_HW_CPU_CYCLES;
993
994 evsel = perf_evsel__new(&attr, 0);
995
996 if (evsel == NULL)
997 return -ENOMEM;
998
999 list_add(&evsel->node, &evsel_list);
1000 ++nr_counters;
1001 return 0;
1002}
1003
1004void perf_evsel_list__delete(void)
1005{
1006 struct perf_evsel *pos, *n;
1007
1008 list_for_each_entry_safe(pos, n, &evsel_list, node) {
1009 list_del_init(&pos->node);
1010 perf_evsel__delete(pos);
1011 }
1012 nr_counters = 0;
1013}
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 458e3ecf17a..212f88e07a9 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -9,11 +9,6 @@
9struct list_head; 9struct list_head;
10struct perf_evsel; 10struct perf_evsel;
11 11
12extern struct list_head evsel_list;
13
14int perf_evsel_list__create_default(void);
15void perf_evsel_list__delete(void);
16
17struct option; 12struct option;
18 13
19struct tracepoint_path { 14struct tracepoint_path {
@@ -25,8 +20,6 @@ struct tracepoint_path {
25extern struct tracepoint_path *tracepoint_id_to_path(u64 config); 20extern struct tracepoint_path *tracepoint_id_to_path(u64 config);
26extern bool have_tracepoints(struct list_head *evlist); 21extern bool have_tracepoints(struct list_head *evlist);
27 22
28extern int nr_counters;
29
30const char *event_name(struct perf_evsel *event); 23const char *event_name(struct perf_evsel *event);
31extern const char *__event_name(int type, u64 config); 24extern const char *__event_name(int type, u64 config);
32 25
@@ -35,7 +28,10 @@ extern int parse_filter(const struct option *opt, const char *str, int unset);
35 28
36#define EVENTS_HELP_MAX (128*1024) 29#define EVENTS_HELP_MAX (128*1024)
37 30
38extern void print_events(void); 31void print_events(const char *event_glob);
32void print_events_type(u8 type);
33void print_tracepoint_events(const char *subsys_glob, const char *event_glob);
34int print_hwcache_events(const char *event_glob);
39extern int is_valid_tracepoint(const char *event_string); 35extern int is_valid_tracepoint(const char *event_string);
40 36
41extern char debugfs_path[]; 37extern char debugfs_path[];
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 6e29d9c9dcc..5ddee66020a 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -31,6 +31,7 @@
31#include <string.h> 31#include <string.h>
32#include <stdarg.h> 32#include <stdarg.h>
33#include <limits.h> 33#include <limits.h>
34#include <elf.h>
34 35
35#undef _GNU_SOURCE 36#undef _GNU_SOURCE
36#include "util.h" 37#include "util.h"
@@ -111,7 +112,25 @@ static struct symbol *__find_kernel_function_by_name(const char *name,
111 NULL); 112 NULL);
112} 113}
113 114
114const char *kernel_get_module_path(const char *module) 115static struct map *kernel_get_module_map(const char *module)
116{
117 struct rb_node *nd;
118 struct map_groups *grp = &machine.kmaps;
119
120 if (!module)
121 module = "kernel";
122
123 for (nd = rb_first(&grp->maps[MAP__FUNCTION]); nd; nd = rb_next(nd)) {
124 struct map *pos = rb_entry(nd, struct map, rb_node);
125 if (strncmp(pos->dso->short_name + 1, module,
126 pos->dso->short_name_len - 2) == 0) {
127 return pos;
128 }
129 }
130 return NULL;
131}
132
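
The strncmp() window in kernel_get_module_map() exists because perf stores
a module dso's short_name in brackets (e.g. "[ext4]"); skipping one byte
and comparing short_name_len - 2 characters strips both. A standalone
illustration:

	#include <assert.h>
	#include <string.h>

	int main(void)
	{
		const char *short_name = "[ext4]";	/* dso->short_name style */
		int len = (int)strlen(short_name);	/* dso->short_name_len */

		assert(strncmp(short_name + 1, "ext4", len - 2) == 0);
		return 0;
	}
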
133static struct dso *kernel_get_module_dso(const char *module)
115{ 134{
116 struct dso *dso; 135 struct dso *dso;
117 struct map *map; 136 struct map *map;
@@ -141,7 +160,13 @@ const char *kernel_get_module_path(const char *module)
141 } 160 }
142 } 161 }
143found: 162found:
144 return dso->long_name; 163 return dso;
164}
165
166const char *kernel_get_module_path(const char *module)
167{
168 struct dso *dso = kernel_get_module_dso(module);
169 return (dso) ? dso->long_name : NULL;
145} 170}
146 171
147#ifdef DWARF_SUPPORT 172#ifdef DWARF_SUPPORT
@@ -384,7 +409,7 @@ int show_line_range(struct line_range *lr, const char *module)
384 setup_pager(); 409 setup_pager();
385 410
386 if (lr->function) 411 if (lr->function)
387 fprintf(stdout, "<%s:%d>\n", lr->function, 412 fprintf(stdout, "<%s@%s:%d>\n", lr->function, lr->path,
388 lr->start - lr->offset); 413 lr->start - lr->offset);
389 else 414 else
390 fprintf(stdout, "<%s:%d>\n", lr->path, lr->start); 415 fprintf(stdout, "<%s:%d>\n", lr->path, lr->start);
@@ -426,12 +451,14 @@ end:
426} 451}
427 452
428static int show_available_vars_at(int fd, struct perf_probe_event *pev, 453static int show_available_vars_at(int fd, struct perf_probe_event *pev,
429 int max_vls, bool externs) 454 int max_vls, struct strfilter *_filter,
455 bool externs)
430{ 456{
431 char *buf; 457 char *buf;
432 int ret, i; 458 int ret, i, nvars;
433 struct str_node *node; 459 struct str_node *node;
434 struct variable_list *vls = NULL, *vl; 460 struct variable_list *vls = NULL, *vl;
461 const char *var;
435 462
436 buf = synthesize_perf_probe_point(&pev->point); 463 buf = synthesize_perf_probe_point(&pev->point);
437 if (!buf) 464 if (!buf)
@@ -439,36 +466,45 @@ static int show_available_vars_at(int fd, struct perf_probe_event *pev,
439 pr_debug("Searching variables at %s\n", buf); 466 pr_debug("Searching variables at %s\n", buf);
440 467
441 ret = find_available_vars_at(fd, pev, &vls, max_vls, externs); 468 ret = find_available_vars_at(fd, pev, &vls, max_vls, externs);
442 if (ret > 0) { 469 if (ret <= 0) {
443 /* Some variables were found */ 470 pr_err("Failed to find variables at %s (%d)\n", buf, ret);
444 fprintf(stdout, "Available variables at %s\n", buf); 471 goto end;
445 for (i = 0; i < ret; i++) { 472 }
446 vl = &vls[i]; 473 /* Some variables are found */
447 /* 474 fprintf(stdout, "Available variables at %s\n", buf);
448 * A probe point might be converted to 475 for (i = 0; i < ret; i++) {
449 * several trace points. 476 vl = &vls[i];
450 */ 477 /*
451 fprintf(stdout, "\t@<%s+%lu>\n", vl->point.symbol, 478 * A probe point might be converted to
452 vl->point.offset); 479 * several trace points.
453 free(vl->point.symbol); 480 */
454 if (vl->vars) { 481 fprintf(stdout, "\t@<%s+%lu>\n", vl->point.symbol,
455 strlist__for_each(node, vl->vars) 482 vl->point.offset);
483 free(vl->point.symbol);
484 nvars = 0;
485 if (vl->vars) {
486 strlist__for_each(node, vl->vars) {
487 var = strchr(node->s, '\t') + 1;
488 if (strfilter__compare(_filter, var)) {
456 fprintf(stdout, "\t\t%s\n", node->s); 489 fprintf(stdout, "\t\t%s\n", node->s);
457 strlist__delete(vl->vars); 490 nvars++;
458 } else 491 }
459 fprintf(stdout, "(No variables)\n"); 492 }
493 strlist__delete(vl->vars);
460 } 494 }
461 free(vls); 495 if (nvars == 0)
462 } else 496 fprintf(stdout, "\t\t(No matched variables)\n");
463 pr_err("Failed to find variables at %s (%d)\n", buf, ret); 497 }
464 498 free(vls);
499end:
465 free(buf); 500 free(buf);
466 return ret; 501 return ret;
467} 502}
468 503
469/* Show available variables on given probe point */ 504/* Show available variables on given probe point */
470int show_available_vars(struct perf_probe_event *pevs, int npevs, 505int show_available_vars(struct perf_probe_event *pevs, int npevs,
471 int max_vls, const char *module, bool externs) 506 int max_vls, const char *module,
507 struct strfilter *_filter, bool externs)
472{ 508{
473 int i, fd, ret = 0; 509 int i, fd, ret = 0;
474 510
@@ -485,7 +521,8 @@ int show_available_vars(struct perf_probe_event *pevs, int npevs,
485 setup_pager(); 521 setup_pager();
486 522
487 for (i = 0; i < npevs && ret >= 0; i++) 523 for (i = 0; i < npevs && ret >= 0; i++)
488 ret = show_available_vars_at(fd, &pevs[i], max_vls, externs); 524 ret = show_available_vars_at(fd, &pevs[i], max_vls, _filter,
525 externs);
489 526
490 close(fd); 527 close(fd);
491 return ret; 528 return ret;
@@ -531,7 +568,9 @@ int show_line_range(struct line_range *lr __unused, const char *module __unused)
531 568
532int show_available_vars(struct perf_probe_event *pevs __unused, 569int show_available_vars(struct perf_probe_event *pevs __unused,
533 int npevs __unused, int max_vls __unused, 570 int npevs __unused, int max_vls __unused,
534 const char *module __unused, bool externs __unused) 571 const char *module __unused,
572 struct strfilter *filter __unused,
573 bool externs __unused)
535{ 574{
536 pr_warning("Debuginfo-analysis is not supported.\n"); 575 pr_warning("Debuginfo-analysis is not supported.\n");
537 return -ENOSYS; 576 return -ENOSYS;
@@ -556,11 +595,11 @@ static int parse_line_num(char **ptr, int *val, const char *what)
556 * The line range syntax is described by: 595 * The line range syntax is described by:
557 * 596 *
558 * SRC[:SLN[+NUM|-ELN]] 597 * SRC[:SLN[+NUM|-ELN]]
559 * FNC[:SLN[+NUM|-ELN]] 598 * FNC[@SRC][:SLN[+NUM|-ELN]]
560 */ 599 */
561int parse_line_range_desc(const char *arg, struct line_range *lr) 600int parse_line_range_desc(const char *arg, struct line_range *lr)
562{ 601{
563 char *range, *name = strdup(arg); 602 char *range, *file, *name = strdup(arg);
564 int err; 603 int err;
565 604
566 if (!name) 605 if (!name)
@@ -610,7 +649,16 @@ int parse_line_range_desc(const char *arg, struct line_range *lr)
610 } 649 }
611 } 650 }
612 651
613 if (strchr(name, '.')) 652 file = strchr(name, '@');
653 if (file) {
654 *file = '\0';
655 lr->file = strdup(++file);
656 if (lr->file == NULL) {
657 err = -ENOMEM;
658 goto err;
659 }
660 lr->function = name;
661 } else if (strchr(name, '.'))
614 lr->file = name; 662 lr->file = name;
615 else 663 else
616 lr->function = name; 664 lr->function = name;
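
With the new FNC[@SRC] form, a range such as schedule@kernel/sched.c:5
names both the function and the file it lives in, which disambiguates
functions defined in several files. A minimal standalone demo of the '@'
split:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char name[] = "schedule@kernel/sched.c";
		char *file = strchr(name, '@');

		if (file != NULL) {
			*file++ = '\0';		/* name now holds the function */
			printf("function=%s file=%s\n", name, file);
		}
		return 0;
	}
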
@@ -1784,9 +1832,12 @@ int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
1784 } 1832 }
1785 1833
1786 /* Loop 2: add all events */ 1834 /* Loop 2: add all events */
1787 for (i = 0; i < npevs && ret >= 0; i++) 1835 for (i = 0; i < npevs; i++) {
1788 ret = __add_probe_trace_events(pkgs[i].pev, pkgs[i].tevs, 1836 ret = __add_probe_trace_events(pkgs[i].pev, pkgs[i].tevs,
1789 pkgs[i].ntevs, force_add); 1837 pkgs[i].ntevs, force_add);
1838 if (ret < 0)
1839 break;
1840 }
1790end: 1841end:
1791 /* Loop 3: cleanup and free trace events */ 1842 /* Loop 3: cleanup and free trace events */
1792 for (i = 0; i < npevs; i++) { 1843 for (i = 0; i < npevs; i++) {
@@ -1912,4 +1963,46 @@ int del_perf_probe_events(struct strlist *dellist)
1912 1963
1913 return ret; 1964 return ret;
1914} 1965}
1966/* TODO: don't use a global variable for filter ... */
1967static struct strfilter *available_func_filter;
1968
1969/*
1970 * If a symbol corresponds to a function with global binding and
1971 * matches filter return 0. For all others return 1.
1972 */
1973static int filter_available_functions(struct map *map __unused,
1974 struct symbol *sym)
1975{
1976 if (sym->binding == STB_GLOBAL &&
1977 strfilter__compare(available_func_filter, sym->name))
1978 return 0;
1979 return 1;
1980}
1981
1982int show_available_funcs(const char *module, struct strfilter *_filter)
1983{
1984 struct map *map;
1985 int ret;
1986
1987 setup_pager();
1988
1989 ret = init_vmlinux();
1990 if (ret < 0)
1991 return ret;
1915 1992
1993 map = kernel_get_module_map(module);
1994 if (!map) {
1995 pr_err("Failed to find %s map.\n", (module) ? : "kernel");
1996 return -EINVAL;
1997 }
1998 available_func_filter = _filter;
1999 if (map__load(map, filter_available_functions)) {
2000 pr_err("Failed to load map.\n");
2001 return -EINVAL;
2002 }
2003 if (!dso__sorted_by_name(map->dso, map->type))
2004 dso__sort_by_name(map->dso, map->type);
2005
2006 dso__fprintf_symbols_by_name(map->dso, map->type, stdout);
2007 return 0;
2008}
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 5accbedfea3..3434fc9d79d 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -3,6 +3,7 @@
3 3
4#include <stdbool.h> 4#include <stdbool.h>
5#include "strlist.h" 5#include "strlist.h"
6#include "strfilter.h"
6 7
7extern bool probe_event_dry_run; 8extern bool probe_event_dry_run;
8 9
@@ -126,7 +127,8 @@ extern int show_perf_probe_events(void);
126extern int show_line_range(struct line_range *lr, const char *module); 127extern int show_line_range(struct line_range *lr, const char *module);
127extern int show_available_vars(struct perf_probe_event *pevs, int npevs, 128extern int show_available_vars(struct perf_probe_event *pevs, int npevs,
128 int max_probe_points, const char *module, 129 int max_probe_points, const char *module,
129 bool externs); 130 struct strfilter *filter, bool externs);
131extern int show_available_funcs(const char *module, struct strfilter *filter);
130 132
131 133
132/* Maximum index number of event-name postfix */ 134/* Maximum index number of event-name postfix */
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index ab83b6ac5d6..194f9e2a328 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -33,6 +33,7 @@
33#include <ctype.h> 33#include <ctype.h>
34#include <dwarf-regs.h> 34#include <dwarf-regs.h>
35 35
36#include <linux/bitops.h>
36#include "event.h" 37#include "event.h"
37#include "debug.h" 38#include "debug.h"
38#include "util.h" 39#include "util.h"
@@ -280,6 +281,19 @@ static bool die_compare_name(Dwarf_Die *dw_die, const char *tname)
280 return name ? (strcmp(tname, name) == 0) : false; 281 return name ? (strcmp(tname, name) == 0) : false;
281} 282}
282 283
284/* Get callsite line number of inline-function instance */
285static int die_get_call_lineno(Dwarf_Die *in_die)
286{
287 Dwarf_Attribute attr;
288 Dwarf_Word ret;
289
290 if (!dwarf_attr(in_die, DW_AT_call_line, &attr))
291 return -ENOENT;
292
293 dwarf_formudata(&attr, &ret);
294 return (int)ret;
295}
296
283/* Get type die */ 297/* Get type die */
284static Dwarf_Die *die_get_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem) 298static Dwarf_Die *die_get_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem)
285{ 299{
@@ -320,13 +334,23 @@ static Dwarf_Die *die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem)
320 return vr_die; 334 return vr_die;
321} 335}
322 336
323static bool die_is_signed_type(Dwarf_Die *tp_die) 337static int die_get_attr_udata(Dwarf_Die *tp_die, unsigned int attr_name,
338 Dwarf_Word *result)
324{ 339{
325 Dwarf_Attribute attr; 340 Dwarf_Attribute attr;
341
342 if (dwarf_attr(tp_die, attr_name, &attr) == NULL ||
343 dwarf_formudata(&attr, result) != 0)
344 return -ENOENT;
345
346 return 0;
347}
348
349static bool die_is_signed_type(Dwarf_Die *tp_die)
350{
326 Dwarf_Word ret; 351 Dwarf_Word ret;
327 352
328 if (dwarf_attr(tp_die, DW_AT_encoding, &attr) == NULL || 353 if (die_get_attr_udata(tp_die, DW_AT_encoding, &ret))
329 dwarf_formudata(&attr, &ret) != 0)
330 return false; 354 return false;
331 355
332 return (ret == DW_ATE_signed_char || ret == DW_ATE_signed || 356 return (ret == DW_ATE_signed_char || ret == DW_ATE_signed ||
@@ -335,11 +359,29 @@ static bool die_is_signed_type(Dwarf_Die *tp_die)
335 359
336static int die_get_byte_size(Dwarf_Die *tp_die) 360static int die_get_byte_size(Dwarf_Die *tp_die)
337{ 361{
338 Dwarf_Attribute attr;
339 Dwarf_Word ret; 362 Dwarf_Word ret;
340 363
341 if (dwarf_attr(tp_die, DW_AT_byte_size, &attr) == NULL || 364 if (die_get_attr_udata(tp_die, DW_AT_byte_size, &ret))
342 dwarf_formudata(&attr, &ret) != 0) 365 return 0;
366
367 return (int)ret;
368}
369
370static int die_get_bit_size(Dwarf_Die *tp_die)
371{
372 Dwarf_Word ret;
373
374 if (die_get_attr_udata(tp_die, DW_AT_bit_size, &ret))
375 return 0;
376
377 return (int)ret;
378}
379
380static int die_get_bit_offset(Dwarf_Die *tp_die)
381{
382 Dwarf_Word ret;
383
384 if (die_get_attr_udata(tp_die, DW_AT_bit_offset, &ret))
343 return 0; 385 return 0;
344 386
345 return (int)ret; 387 return (int)ret;
@@ -458,6 +500,151 @@ static Dwarf_Die *die_find_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr,
458 return die_find_child(sp_die, __die_find_inline_cb, &addr, die_mem); 500 return die_find_child(sp_die, __die_find_inline_cb, &addr, die_mem);
459} 501}
460 502
 503/* Walker on lines (Note: line numbers are not sorted) */
504typedef int (* line_walk_handler_t) (const char *fname, int lineno,
505 Dwarf_Addr addr, void *data);
506
507struct __line_walk_param {
508 const char *fname;
509 line_walk_handler_t handler;
510 void *data;
511 int retval;
512};
513
514static int __die_walk_funclines_cb(Dwarf_Die *in_die, void *data)
515{
516 struct __line_walk_param *lw = data;
517 Dwarf_Addr addr;
518 int lineno;
519
520 if (dwarf_tag(in_die) == DW_TAG_inlined_subroutine) {
521 lineno = die_get_call_lineno(in_die);
522 if (lineno > 0 && dwarf_entrypc(in_die, &addr) == 0) {
523 lw->retval = lw->handler(lw->fname, lineno, addr,
524 lw->data);
525 if (lw->retval != 0)
526 return DIE_FIND_CB_FOUND;
527 }
528 }
529 return DIE_FIND_CB_SIBLING;
530}
531
532/* Walk on lines of blocks included in given DIE */
533static int __die_walk_funclines(Dwarf_Die *sp_die,
534 line_walk_handler_t handler, void *data)
535{
536 struct __line_walk_param lw = {
537 .handler = handler,
538 .data = data,
539 .retval = 0,
540 };
541 Dwarf_Die die_mem;
542 Dwarf_Addr addr;
543 int lineno;
544
545 /* Handle function declaration line */
546 lw.fname = dwarf_decl_file(sp_die);
547 if (lw.fname && dwarf_decl_line(sp_die, &lineno) == 0 &&
548 dwarf_entrypc(sp_die, &addr) == 0) {
549 lw.retval = handler(lw.fname, lineno, addr, data);
550 if (lw.retval != 0)
551 goto done;
552 }
553 die_find_child(sp_die, __die_walk_funclines_cb, &lw, &die_mem);
554done:
555 return lw.retval;
556}
557
558static int __die_walk_culines_cb(Dwarf_Die *sp_die, void *data)
559{
560 struct __line_walk_param *lw = data;
561
562 lw->retval = __die_walk_funclines(sp_die, lw->handler, lw->data);
563 if (lw->retval != 0)
564 return DWARF_CB_ABORT;
565
566 return DWARF_CB_OK;
567}
568
569/*
 570 * Walk the lines inside the given PDIE. If the PDIE is a subprogram, walk
 571 * only the lines inside that subprogram; otherwise PDIE must be a CU DIE.
572 */
573static int die_walk_lines(Dwarf_Die *pdie, line_walk_handler_t handler,
574 void *data)
575{
576 Dwarf_Lines *lines;
577 Dwarf_Line *line;
578 Dwarf_Addr addr;
579 const char *fname;
580 int lineno, ret = 0;
581 Dwarf_Die die_mem, *cu_die;
582 size_t nlines, i;
583
584 /* Get the CU die */
585 if (dwarf_tag(pdie) == DW_TAG_subprogram)
586 cu_die = dwarf_diecu(pdie, &die_mem, NULL, NULL);
587 else
588 cu_die = pdie;
589 if (!cu_die) {
590 pr_debug2("Failed to get CU from subprogram\n");
591 return -EINVAL;
592 }
593
594 /* Get lines list in the CU */
595 if (dwarf_getsrclines(cu_die, &lines, &nlines) != 0) {
596 pr_debug2("Failed to get source lines on this CU.\n");
597 return -ENOENT;
598 }
599 pr_debug2("Get %zd lines from this CU\n", nlines);
600
601 /* Walk on the lines on lines list */
602 for (i = 0; i < nlines; i++) {
603 line = dwarf_onesrcline(lines, i);
604 if (line == NULL ||
605 dwarf_lineno(line, &lineno) != 0 ||
606 dwarf_lineaddr(line, &addr) != 0) {
607 pr_debug2("Failed to get line info. "
608 "Possible error in debuginfo.\n");
609 continue;
610 }
611 /* Filter lines based on address */
612 if (pdie != cu_die)
613 /*
614 * Address filtering
615 * The line must be included in the given function, and
616 * no inlined block may include it.
617 */
618 if (!dwarf_haspc(pdie, addr) ||
619 die_find_inlinefunc(pdie, addr, &die_mem))
620 continue;
621 /* Get source line */
622 fname = dwarf_linesrc(line, NULL, NULL);
623
624 ret = handler(fname, lineno, addr, data);
625 if (ret != 0)
626 return ret;
627 }
628
629 /*
630 * DWARF lines don't include function declarations and inlined
631 * subroutines. We have to check the functions list or the given function.
632 */
633 if (pdie != cu_die)
634 ret = __die_walk_funclines(pdie, handler, data);
635 else {
636 struct __line_walk_param param = {
637 .handler = handler,
638 .data = data,
639 .retval = 0,
640 };
641 dwarf_getfuncs(cu_die, __die_walk_culines_cb, &param, 0);
642 ret = param.retval;
643 }
644
645 return ret;
646}
647
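
A caller plugs into die_walk_lines() through the line_walk_handler_t callback; returning non-zero from the handler stops the walk. A hedged sketch (the handler below is hypothetical, not part of this patch):

static int show_line_cb(const char *fname, int lineno,
                        Dwarf_Addr addr, void *data __used)
{
        printf("%s:%d\t%#jx\n", fname, lineno, (uintmax_t)addr);
        return 0;       /* 0 = keep walking */
}

        /* walk every line of a CU; pass a subprogram DIE to narrow it: */
        ret = die_walk_lines(&pf->cu_die, show_line_cb, NULL);
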
461struct __find_variable_param { 648struct __find_variable_param {
462 const char *name; 649 const char *name;
463 Dwarf_Addr addr; 650 Dwarf_Addr addr;
@@ -669,6 +856,8 @@ static_var:
669 return 0; 856 return 0;
670} 857}
671 858
859#define BYTES_TO_BITS(nb) ((nb) * BITS_PER_LONG / sizeof(long))
860
672static int convert_variable_type(Dwarf_Die *vr_die, 861static int convert_variable_type(Dwarf_Die *vr_die,
673 struct probe_trace_arg *tvar, 862 struct probe_trace_arg *tvar,
674 const char *cast) 863 const char *cast)
@@ -685,6 +874,14 @@ static int convert_variable_type(Dwarf_Die *vr_die,
685 return (tvar->type == NULL) ? -ENOMEM : 0; 874 return (tvar->type == NULL) ? -ENOMEM : 0;
686 } 875 }
687 876
877 if (die_get_bit_size(vr_die) != 0) {
878 /* This is a bitfield */
879 ret = snprintf(buf, 16, "b%d@%d/%zd", die_get_bit_size(vr_die),
880 die_get_bit_offset(vr_die),
881 BYTES_TO_BITS(die_get_byte_size(vr_die)));
882 goto formatted;
883 }
884
688 if (die_get_real_type(vr_die, &type) == NULL) { 885 if (die_get_real_type(vr_die, &type) == NULL) {
689 pr_warning("Failed to get a type information of %s.\n", 886 pr_warning("Failed to get a type information of %s.\n",
690 dwarf_diename(vr_die)); 887 dwarf_diename(vr_die));
@@ -729,29 +926,31 @@ static int convert_variable_type(Dwarf_Die *vr_die,
729 return (tvar->type == NULL) ? -ENOMEM : 0; 926 return (tvar->type == NULL) ? -ENOMEM : 0;
730 } 927 }
731 928
732 ret = die_get_byte_size(&type) * 8; 929 ret = BYTES_TO_BITS(die_get_byte_size(&type));
733 if (ret) { 930 if (!ret)
734 /* Check the bitwidth */ 931 /* No size ... try to use default type */
735 if (ret > MAX_BASIC_TYPE_BITS) { 932 return 0;
736 pr_info("%s exceeds max-bitwidth."
737 " Cut down to %d bits.\n",
738 dwarf_diename(&type), MAX_BASIC_TYPE_BITS);
739 ret = MAX_BASIC_TYPE_BITS;
740 }
741 933
742 ret = snprintf(buf, 16, "%c%d", 934 /* Check the bitwidth */
743 die_is_signed_type(&type) ? 's' : 'u', ret); 935 if (ret > MAX_BASIC_TYPE_BITS) {
744 if (ret < 0 || ret >= 16) { 936 pr_info("%s exceeds max-bitwidth. Cut down to %d bits.\n",
745 if (ret >= 16) 937 dwarf_diename(&type), MAX_BASIC_TYPE_BITS);
746 ret = -E2BIG; 938 ret = MAX_BASIC_TYPE_BITS;
747 pr_warning("Failed to convert variable type: %s\n", 939 }
748 strerror(-ret)); 940 ret = snprintf(buf, 16, "%c%d",
749 return ret; 941 die_is_signed_type(&type) ? 's' : 'u', ret);
750 } 942
751 tvar->type = strdup(buf); 943formatted:
752 if (tvar->type == NULL) 944 if (ret < 0 || ret >= 16) {
753 return -ENOMEM; 945 if (ret >= 16)
946 ret = -E2BIG;
947 pr_warning("Failed to convert variable type: %s\n",
948 strerror(-ret));
949 return ret;
754 } 950 }
951 tvar->type = strdup(buf);
952 if (tvar->type == NULL)
953 return -ENOMEM;
755 return 0; 954 return 0;
756} 955}
757 956
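
Both paths above meet at the same 16-byte buffer: bitfields are encoded as "b<bits>@<bit-offset>/<container-bits>" (the patch's "b%d@%d/%zd" format), plain integers as "s<bits>" or "u<bits>". A self-contained sketch of the arithmetic with hypothetical values (BITS_PER_LONG is redefined locally, and values are cast to int, just to keep the example standalone):

#include <stdio.h>

#define BITS_PER_LONG           (8 * (int)sizeof(long))
#define BYTES_TO_BITS(nb)       ((nb) * BITS_PER_LONG / sizeof(long))

int main(void)
{
        char buf[16];

        /* a 3-bit field at bit offset 5 in a 4-byte container */
        snprintf(buf, sizeof(buf), "b%d@%d/%d", 3, 5, (int)BYTES_TO_BITS(4));
        printf("%s\n", buf);    /* prints "b3@5/32" */

        /* a plain signed 4-byte integer */
        snprintf(buf, sizeof(buf), "%c%d", 's', (int)BYTES_TO_BITS(4));
        printf("%s\n", buf);    /* prints "s32" */
        return 0;
}
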
@@ -1050,157 +1249,102 @@ static int call_probe_finder(Dwarf_Die *sp_die, struct probe_finder *pf)
1050 return ret; 1249 return ret;
1051} 1250}
1052 1251
1053/* Find probe point from its line number */ 1252static int probe_point_line_walker(const char *fname, int lineno,
1054static int find_probe_point_by_line(struct probe_finder *pf) 1253 Dwarf_Addr addr, void *data)
1055{ 1254{
1056 Dwarf_Lines *lines; 1255 struct probe_finder *pf = data;
1057 Dwarf_Line *line; 1256 int ret;
1058 size_t nlines, i;
1059 Dwarf_Addr addr;
1060 int lineno;
1061 int ret = 0;
1062
1063 if (dwarf_getsrclines(&pf->cu_die, &lines, &nlines) != 0) {
1064 pr_warning("No source lines found.\n");
1065 return -ENOENT;
1066 }
1067 1257
1068 for (i = 0; i < nlines && ret == 0; i++) { 1258 if (lineno != pf->lno || strtailcmp(fname, pf->fname) != 0)
1069 line = dwarf_onesrcline(lines, i); 1259 return 0;
1070 if (dwarf_lineno(line, &lineno) != 0 ||
1071 lineno != pf->lno)
1072 continue;
1073 1260
1074 /* TODO: Get fileno from line, but how? */ 1261 pf->addr = addr;
1075 if (strtailcmp(dwarf_linesrc(line, NULL, NULL), pf->fname) != 0) 1262 ret = call_probe_finder(NULL, pf);
1076 continue;
1077 1263
1078 if (dwarf_lineaddr(line, &addr) != 0) { 1264 /* Continue if no error, because the line may be in an inlined function */
1079 pr_warning("Failed to get the address of the line.\n"); 1265 return ret < 0 ? ret : 0;
1080 return -ENOENT; 1266}
1081 }
1082 pr_debug("Probe line found: line[%d]:%d addr:0x%jx\n",
1083 (int)i, lineno, (uintmax_t)addr);
1084 pf->addr = addr;
1085 1267
1086 ret = call_probe_finder(NULL, pf); 1268/* Find probe point from its line number */
1087 /* Continuing, because target line might be inlined. */ 1269static int find_probe_point_by_line(struct probe_finder *pf)
1088 } 1270{
1089 return ret; 1271 return die_walk_lines(&pf->cu_die, probe_point_line_walker, pf);
1090} 1272}
1091 1273
1092/* Find lines which match lazy pattern */ 1274/* Find lines which match lazy pattern */
1093static int find_lazy_match_lines(struct list_head *head, 1275static int find_lazy_match_lines(struct list_head *head,
1094 const char *fname, const char *pat) 1276 const char *fname, const char *pat)
1095{ 1277{
1096 char *fbuf, *p1, *p2; 1278 FILE *fp;
1097 int fd, line, nlines = -1; 1279 char *line = NULL;
1098 struct stat st; 1280 size_t line_len;
1099 1281 ssize_t len;
1100 fd = open(fname, O_RDONLY); 1282 int count = 0, linenum = 1;
1101 if (fd < 0) { 1283
1102 pr_warning("Failed to open %s: %s\n", fname, strerror(-fd)); 1284 fp = fopen(fname, "r");
1285 if (!fp) {
1286 pr_warning("Failed to open %s: %s\n", fname, strerror(errno));
1103 return -errno; 1287 return -errno;
1104 } 1288 }
1105 1289
1106 if (fstat(fd, &st) < 0) { 1290 while ((len = getline(&line, &line_len, fp)) > 0) {
1107 pr_warning("Failed to get the size of %s: %s\n", 1291
1108 fname, strerror(errno)); 1292 if (line[len - 1] == '\n')
1109 nlines = -errno; 1293 line[len - 1] = '\0';
1110 goto out_close; 1294
1111 } 1295 if (strlazymatch(line, pat)) {
1112 1296 line_list__add_line(head, linenum);
1113 nlines = -ENOMEM; 1297 count++;
1114 fbuf = malloc(st.st_size + 2);
1115 if (fbuf == NULL)
1116 goto out_close;
1117 if (read(fd, fbuf, st.st_size) < 0) {
1118 pr_warning("Failed to read %s: %s\n", fname, strerror(errno));
1119 nlines = -errno;
1120 goto out_free_fbuf;
1121 }
1122 fbuf[st.st_size] = '\n'; /* Dummy line */
1123 fbuf[st.st_size + 1] = '\0';
1124 p1 = fbuf;
1125 line = 1;
1126 nlines = 0;
1127 while ((p2 = strchr(p1, '\n')) != NULL) {
1128 *p2 = '\0';
1129 if (strlazymatch(p1, pat)) {
1130 line_list__add_line(head, line);
1131 nlines++;
1132 } 1298 }
1133 line++; 1299 linenum++;
1134 p1 = p2 + 1;
1135 } 1300 }
1136out_free_fbuf: 1301
1137 free(fbuf); 1302 if (ferror(fp))
1138out_close: 1303 count = -errno;
1139 close(fd); 1304 free(line);
1140 return nlines; 1305 fclose(fp);
1306
1307 if (count == 0)
1308 pr_debug("No matched lines found in %s.\n", fname);
1309 return count;
1310}
1311
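
The rewritten matcher streams the file with getline(3) instead of the old fstat()+read() slurp, so there is no size-dependent allocation. A standalone sketch of the same loop, with strstr() standing in for strlazymatch() (perf's lazy matcher, defined elsewhere in util/):

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int count_matching_lines(const char *fname, const char *pat)
{
        FILE *fp = fopen(fname, "r");
        char *line = NULL;
        size_t len = 0;
        int count = 0;

        if (!fp)
                return -1;
        while (getline(&line, &len, fp) > 0)
                if (strstr(line, pat))  /* stand-in for strlazymatch() */
                        count++;
        free(line);
        fclose(fp);
        return count;
}
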
1312static int probe_point_lazy_walker(const char *fname, int lineno,
1313 Dwarf_Addr addr, void *data)
1314{
1315 struct probe_finder *pf = data;
1316 int ret;
1317
1318 if (!line_list__has_line(&pf->lcache, lineno) ||
1319 strtailcmp(fname, pf->fname) != 0)
1320 return 0;
1321
1322 pr_debug("Probe line found: line:%d addr:0x%llx\n",
1323 lineno, (unsigned long long)addr);
1324 pf->addr = addr;
1325 ret = call_probe_finder(NULL, pf);
1326
1327 /*
1328 * Continue if no error, because the lazy pattern may match
1329 * other lines
1330 */
1331 return ret < 0 ? ret : 0;
1141} 1332}
1142 1333
1143/* Find probe points from lazy pattern */ 1334/* Find probe points from lazy pattern */
1144static int find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf) 1335static int find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
1145{ 1336{
1146 Dwarf_Lines *lines;
1147 Dwarf_Line *line;
1148 size_t nlines, i;
1149 Dwarf_Addr addr;
1150 Dwarf_Die die_mem;
1151 int lineno;
1152 int ret = 0; 1337 int ret = 0;
1153 1338
1154 if (list_empty(&pf->lcache)) { 1339 if (list_empty(&pf->lcache)) {
1155 /* Matching lazy line pattern */ 1340 /* Matching lazy line pattern */
1156 ret = find_lazy_match_lines(&pf->lcache, pf->fname, 1341 ret = find_lazy_match_lines(&pf->lcache, pf->fname,
1157 pf->pev->point.lazy_line); 1342 pf->pev->point.lazy_line);
1158 if (ret == 0) { 1343 if (ret <= 0)
1159 pr_debug("No matched lines found in %s.\n", pf->fname);
1160 return 0;
1161 } else if (ret < 0)
1162 return ret; 1344 return ret;
1163 } 1345 }
1164 1346
1165 if (dwarf_getsrclines(&pf->cu_die, &lines, &nlines) != 0) { 1347 return die_walk_lines(sp_die, probe_point_lazy_walker, pf);
1166 pr_warning("No source lines found.\n");
1167 return -ENOENT;
1168 }
1169
1170 for (i = 0; i < nlines && ret >= 0; i++) {
1171 line = dwarf_onesrcline(lines, i);
1172
1173 if (dwarf_lineno(line, &lineno) != 0 ||
1174 !line_list__has_line(&pf->lcache, lineno))
1175 continue;
1176
1177 /* TODO: Get fileno from line, but how? */
1178 if (strtailcmp(dwarf_linesrc(line, NULL, NULL), pf->fname) != 0)
1179 continue;
1180
1181 if (dwarf_lineaddr(line, &addr) != 0) {
1182 pr_debug("Failed to get the address of line %d.\n",
1183 lineno);
1184 continue;
1185 }
1186 if (sp_die) {
1187 /* Address filtering 1: does sp_die include addr? */
1188 if (!dwarf_haspc(sp_die, addr))
1189 continue;
1190 /* Address filtering 2: No child include addr? */
1191 if (die_find_inlinefunc(sp_die, addr, &die_mem))
1192 continue;
1193 }
1194
1195 pr_debug("Probe line found: line[%d]:%d addr:0x%llx\n",
1196 (int)i, lineno, (unsigned long long)addr);
1197 pf->addr = addr;
1198
1199 ret = call_probe_finder(sp_die, pf);
1200 /* Continuing, because target line might be inlined. */
1201 }
1202 /* TODO: deallocate lines, but how? */
1203 return ret;
1204} 1348}
1205 1349
1206/* Callback parameter with return value */ 1350/* Callback parameter with return value */
@@ -1318,8 +1462,7 @@ static int find_probes(int fd, struct probe_finder *pf)
1318 off = 0; 1462 off = 0;
1319 line_list__init(&pf->lcache); 1463 line_list__init(&pf->lcache);
1320 /* Loop on CUs (Compilation Unit) */ 1464 /* Loop on CUs (Compilation Unit) */
1321 while (!dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL) && 1465 while (!dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL)) {
1322 ret >= 0) {
1323 /* Get the DIE(Debugging Information Entry) of this CU */ 1466 /* Get the DIE(Debugging Information Entry) of this CU */
1324 diep = dwarf_offdie(dbg, off + cuhl, &pf->cu_die); 1467 diep = dwarf_offdie(dbg, off + cuhl, &pf->cu_die);
1325 if (!diep) 1468 if (!diep)
@@ -1340,6 +1483,8 @@ static int find_probes(int fd, struct probe_finder *pf)
1340 pf->lno = pp->line; 1483 pf->lno = pp->line;
1341 ret = find_probe_point_by_line(pf); 1484 ret = find_probe_point_by_line(pf);
1342 } 1485 }
1486 if (ret < 0)
1487 break;
1343 } 1488 }
1344 off = noff; 1489 off = noff;
1345 } 1490 }
@@ -1644,91 +1789,28 @@ static int line_range_add_line(const char *src, unsigned int lineno,
1644 return line_list__add_line(&lr->line_list, lineno); 1789 return line_list__add_line(&lr->line_list, lineno);
1645} 1790}
1646 1791
1647/* Search function declaration lines */ 1792static int line_range_walk_cb(const char *fname, int lineno,
1648static int line_range_funcdecl_cb(Dwarf_Die *sp_die, void *data) 1793 Dwarf_Addr addr __used,
1794 void *data)
1649{ 1795{
1650 struct dwarf_callback_param *param = data; 1796 struct line_finder *lf = data;
1651 struct line_finder *lf = param->data;
1652 const char *src;
1653 int lineno;
1654 1797
1655 src = dwarf_decl_file(sp_die); 1798 if ((strtailcmp(fname, lf->fname) != 0) ||
1656 if (src && strtailcmp(src, lf->fname) != 0)
1657 return DWARF_CB_OK;
1658
1659 if (dwarf_decl_line(sp_die, &lineno) != 0 ||
1660 (lf->lno_s > lineno || lf->lno_e < lineno)) 1799 (lf->lno_s > lineno || lf->lno_e < lineno))
1661 return DWARF_CB_OK; 1800 return 0;
1662 1801
1663 param->retval = line_range_add_line(src, lineno, lf->lr); 1802 if (line_range_add_line(fname, lineno, lf->lr) < 0)
1664 if (param->retval < 0) 1803 return -EINVAL;
1665 return DWARF_CB_ABORT;
1666 return DWARF_CB_OK;
1667}
1668 1804
1669static int find_line_range_func_decl_lines(struct line_finder *lf) 1805 return 0;
1670{
1671 struct dwarf_callback_param param = {.data = (void *)lf, .retval = 0};
1672 dwarf_getfuncs(&lf->cu_die, line_range_funcdecl_cb, &param, 0);
1673 return param.retval;
1674} 1806}
1675 1807
1676/* Find line range from its line number */ 1808/* Find line range from its line number */
1677static int find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf) 1809static int find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
1678{ 1810{
1679 Dwarf_Lines *lines; 1811 int ret;
1680 Dwarf_Line *line;
1681 size_t nlines, i;
1682 Dwarf_Addr addr;
1683 int lineno, ret = 0;
1684 const char *src;
1685 Dwarf_Die die_mem;
1686
1687 line_list__init(&lf->lr->line_list);
1688 if (dwarf_getsrclines(&lf->cu_die, &lines, &nlines) != 0) {
1689 pr_warning("No source lines found.\n");
1690 return -ENOENT;
1691 }
1692
1693 /* Search probable lines on lines list */
1694 for (i = 0; i < nlines; i++) {
1695 line = dwarf_onesrcline(lines, i);
1696 if (dwarf_lineno(line, &lineno) != 0 ||
1697 (lf->lno_s > lineno || lf->lno_e < lineno))
1698 continue;
1699
1700 if (sp_die) {
1701 /* Address filtering 1: does sp_die include addr? */
1702 if (dwarf_lineaddr(line, &addr) != 0 ||
1703 !dwarf_haspc(sp_die, addr))
1704 continue;
1705
1706 /* Address filtering 2: No child include addr? */
1707 if (die_find_inlinefunc(sp_die, addr, &die_mem))
1708 continue;
1709 }
1710
1711 /* TODO: Get fileno from line, but how? */
1712 src = dwarf_linesrc(line, NULL, NULL);
1713 if (strtailcmp(src, lf->fname) != 0)
1714 continue;
1715
1716 ret = line_range_add_line(src, lineno, lf->lr);
1717 if (ret < 0)
1718 return ret;
1719 }
1720 1812
1721 /* 1813 ret = die_walk_lines(sp_die ?: &lf->cu_die, line_range_walk_cb, lf);
1722 * Dwarf lines doesn't include function declarations. We have to
1723 * check functions list or given function.
1724 */
1725 if (sp_die) {
1726 src = dwarf_decl_file(sp_die);
1727 if (src && dwarf_decl_line(sp_die, &lineno) == 0 &&
1728 (lf->lno_s <= lineno && lf->lno_e >= lineno))
1729 ret = line_range_add_line(src, lineno, lf->lr);
1730 } else
1731 ret = find_line_range_func_decl_lines(lf);
1732 1814
1733 /* Update status */ 1815 /* Update status */
1734 if (ret >= 0) 1816 if (ret >= 0)
@@ -1758,9 +1840,6 @@ static int line_range_search_cb(Dwarf_Die *sp_die, void *data)
1758 struct line_finder *lf = param->data; 1840 struct line_finder *lf = param->data;
1759 struct line_range *lr = lf->lr; 1841 struct line_range *lr = lf->lr;
1760 1842
1761 pr_debug("find (%llx) %s\n",
1762 (unsigned long long)dwarf_dieoffset(sp_die),
1763 dwarf_diename(sp_die));
1764 if (dwarf_tag(sp_die) == DW_TAG_subprogram && 1843 if (dwarf_tag(sp_die) == DW_TAG_subprogram &&
1765 die_compare_name(sp_die, lr->function)) { 1844 die_compare_name(sp_die, lr->function)) {
1766 lf->fname = dwarf_decl_file(sp_die); 1845 lf->fname = dwarf_decl_file(sp_die);
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
new file mode 100644
index 00000000000..a9f2d7e1204
--- /dev/null
+++ b/tools/perf/util/python.c
@@ -0,0 +1,896 @@
1#include <Python.h>
2#include <structmember.h>
3#include <inttypes.h>
4#include <poll.h>
5#include "evlist.h"
6#include "evsel.h"
7#include "event.h"
8#include "cpumap.h"
9#include "thread_map.h"
10
11/* Define PyVarObject_HEAD_INIT for Python 2.5 */
12#ifndef PyVarObject_HEAD_INIT
13# define PyVarObject_HEAD_INIT(type, size) PyObject_HEAD_INIT(type) size,
14#endif
15
16struct throttle_event {
17 struct perf_event_header header;
18 u64 time;
19 u64 id;
20 u64 stream_id;
21};
22
23PyMODINIT_FUNC initperf(void);
24
25#define member_def(type, member, ptype, help) \
26 { #member, ptype, \
27 offsetof(struct pyrf_event, event) + offsetof(struct type, member), \
28 0, help }
29
30#define sample_member_def(name, member, ptype, help) \
31 { #name, ptype, \
32 offsetof(struct pyrf_event, sample) + offsetof(struct perf_sample, member), \
33 0, help }
34
35struct pyrf_event {
36 PyObject_HEAD
37 struct perf_sample sample;
38 union perf_event event;
39};
40
41#define sample_members \
42 sample_member_def(sample_ip, ip, T_ULONGLONG, "event ip"), \
43 sample_member_def(sample_pid, pid, T_INT, "event pid"), \
44 sample_member_def(sample_tid, tid, T_INT, "event tid"), \
45 sample_member_def(sample_time, time, T_ULONGLONG, "event timestamp"), \
46 sample_member_def(sample_addr, addr, T_ULONGLONG, "event addr"), \
47 sample_member_def(sample_id, id, T_ULONGLONG, "event id"), \
48 sample_member_def(sample_stream_id, stream_id, T_ULONGLONG, "event stream id"), \
49 sample_member_def(sample_period, period, T_ULONGLONG, "event period"), \
50 sample_member_def(sample_cpu, cpu, T_UINT, "event cpu"),
51
52static char pyrf_mmap_event__doc[] = PyDoc_STR("perf mmap event object.");
53
54static PyMemberDef pyrf_mmap_event__members[] = {
55 sample_members
56 member_def(perf_event_header, type, T_UINT, "event type"),
57 member_def(mmap_event, pid, T_UINT, "event pid"),
58 member_def(mmap_event, tid, T_UINT, "event tid"),
59 member_def(mmap_event, start, T_ULONGLONG, "start of the map"),
60 member_def(mmap_event, len, T_ULONGLONG, "map length"),
61 member_def(mmap_event, pgoff, T_ULONGLONG, "page offset"),
62 member_def(mmap_event, filename, T_STRING_INPLACE, "backing store"),
63 { .name = NULL, },
64};
65
66static PyObject *pyrf_mmap_event__repr(struct pyrf_event *pevent)
67{
68 PyObject *ret;
69 char *s;
70
71 if (asprintf(&s, "{ type: mmap, pid: %u, tid: %u, start: %#" PRIx64 ", "
72 "length: %#" PRIx64 ", offset: %#" PRIx64 ", "
73 "filename: %s }",
74 pevent->event.mmap.pid, pevent->event.mmap.tid,
75 pevent->event.mmap.start, pevent->event.mmap.len,
76 pevent->event.mmap.pgoff, pevent->event.mmap.filename) < 0) {
77 ret = PyErr_NoMemory();
78 } else {
79 ret = PyString_FromString(s);
80 free(s);
81 }
82 return ret;
83}
84
85static PyTypeObject pyrf_mmap_event__type = {
86 PyVarObject_HEAD_INIT(NULL, 0)
87 .tp_name = "perf.mmap_event",
88 .tp_basicsize = sizeof(struct pyrf_event),
89 .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
90 .tp_doc = pyrf_mmap_event__doc,
91 .tp_members = pyrf_mmap_event__members,
92 .tp_repr = (reprfunc)pyrf_mmap_event__repr,
93};
94
95static char pyrf_task_event__doc[] = PyDoc_STR("perf task (fork/exit) event object.");
96
97static PyMemberDef pyrf_task_event__members[] = {
98 sample_members
99 member_def(perf_event_header, type, T_UINT, "event type"),
100 member_def(fork_event, pid, T_UINT, "event pid"),
101 member_def(fork_event, ppid, T_UINT, "event ppid"),
102 member_def(fork_event, tid, T_UINT, "event tid"),
103 member_def(fork_event, ptid, T_UINT, "event ptid"),
104 member_def(fork_event, time, T_ULONGLONG, "timestamp"),
105 { .name = NULL, },
106};
107
108static PyObject *pyrf_task_event__repr(struct pyrf_event *pevent)
109{
110 return PyString_FromFormat("{ type: %s, pid: %u, ppid: %u, tid: %u, "
111 "ptid: %u, time: %" PRIu64 "}",
112 pevent->event.header.type == PERF_RECORD_FORK ? "fork" : "exit",
113 pevent->event.fork.pid,
114 pevent->event.fork.ppid,
115 pevent->event.fork.tid,
116 pevent->event.fork.ptid,
117 pevent->event.fork.time);
118}
119
120static PyTypeObject pyrf_task_event__type = {
121 PyVarObject_HEAD_INIT(NULL, 0)
122 .tp_name = "perf.task_event",
123 .tp_basicsize = sizeof(struct pyrf_event),
124 .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
125 .tp_doc = pyrf_task_event__doc,
126 .tp_members = pyrf_task_event__members,
127 .tp_repr = (reprfunc)pyrf_task_event__repr,
128};
129
130static char pyrf_comm_event__doc[] = PyDoc_STR("perf comm event object.");
131
132static PyMemberDef pyrf_comm_event__members[] = {
133 sample_members
134 member_def(perf_event_header, type, T_UINT, "event type"),
135 member_def(comm_event, pid, T_UINT, "event pid"),
136 member_def(comm_event, tid, T_UINT, "event tid"),
137 member_def(comm_event, comm, T_STRING_INPLACE, "process name"),
138 { .name = NULL, },
139};
140
141static PyObject *pyrf_comm_event__repr(struct pyrf_event *pevent)
142{
143 return PyString_FromFormat("{ type: comm, pid: %u, tid: %u, comm: %s }",
144 pevent->event.comm.pid,
145 pevent->event.comm.tid,
146 pevent->event.comm.comm);
147}
148
149static PyTypeObject pyrf_comm_event__type = {
150 PyVarObject_HEAD_INIT(NULL, 0)
151 .tp_name = "perf.comm_event",
152 .tp_basicsize = sizeof(struct pyrf_event),
153 .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
154 .tp_doc = pyrf_comm_event__doc,
155 .tp_members = pyrf_comm_event__members,
156 .tp_repr = (reprfunc)pyrf_comm_event__repr,
157};
158
159static char pyrf_throttle_event__doc[] = PyDoc_STR("perf throttle event object.");
160
161static PyMemberDef pyrf_throttle_event__members[] = {
162 sample_members
163 member_def(perf_event_header, type, T_UINT, "event type"),
164 member_def(throttle_event, time, T_ULONGLONG, "timestamp"),
165 member_def(throttle_event, id, T_ULONGLONG, "event id"),
166 member_def(throttle_event, stream_id, T_ULONGLONG, "event stream id"),
167 { .name = NULL, },
168};
169
170static PyObject *pyrf_throttle_event__repr(struct pyrf_event *pevent)
171{
172 struct throttle_event *te = (struct throttle_event *)(&pevent->event.header + 1);
173
174 return PyString_FromFormat("{ type: %sthrottle, time: %" PRIu64 ", id: %" PRIu64
175 ", stream_id: %" PRIu64 " }",
176 pevent->event.header.type == PERF_RECORD_THROTTLE ? "" : "un",
177 te->time, te->id, te->stream_id);
178}
179
180static PyTypeObject pyrf_throttle_event__type = {
181 PyVarObject_HEAD_INIT(NULL, 0)
182 .tp_name = "perf.throttle_event",
183 .tp_basicsize = sizeof(struct pyrf_event),
184 .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
185 .tp_doc = pyrf_throttle_event__doc,
186 .tp_members = pyrf_throttle_event__members,
187 .tp_repr = (reprfunc)pyrf_throttle_event__repr,
188};
189
190static int pyrf_event__setup_types(void)
191{
192 int err;
193 pyrf_mmap_event__type.tp_new =
194 pyrf_task_event__type.tp_new =
195 pyrf_comm_event__type.tp_new =
196 pyrf_throttle_event__type.tp_new = PyType_GenericNew;
197 err = PyType_Ready(&pyrf_mmap_event__type);
198 if (err < 0)
199 goto out;
200 err = PyType_Ready(&pyrf_task_event__type);
201 if (err < 0)
202 goto out;
203 err = PyType_Ready(&pyrf_comm_event__type);
204 if (err < 0)
205 goto out;
206 err = PyType_Ready(&pyrf_throttle_event__type);
207 if (err < 0)
208 goto out;
209out:
210 return err;
211}
212
213static PyTypeObject *pyrf_event__type[] = {
214 [PERF_RECORD_MMAP] = &pyrf_mmap_event__type,
215 [PERF_RECORD_LOST] = &pyrf_mmap_event__type,
216 [PERF_RECORD_COMM] = &pyrf_comm_event__type,
217 [PERF_RECORD_EXIT] = &pyrf_task_event__type,
218 [PERF_RECORD_THROTTLE] = &pyrf_throttle_event__type,
219 [PERF_RECORD_UNTHROTTLE] = &pyrf_throttle_event__type,
220 [PERF_RECORD_FORK] = &pyrf_task_event__type,
221 [PERF_RECORD_READ] = &pyrf_mmap_event__type,
222 [PERF_RECORD_SAMPLE] = &pyrf_mmap_event__type,
223};
224
225static PyObject *pyrf_event__new(union perf_event *event)
226{
227 struct pyrf_event *pevent;
228 PyTypeObject *ptype;
229
230 if (event->header.type < PERF_RECORD_MMAP ||
231 event->header.type > PERF_RECORD_SAMPLE)
232 return NULL;
233
234 ptype = pyrf_event__type[event->header.type];
235 pevent = PyObject_New(struct pyrf_event, ptype);
236 if (pevent != NULL)
237 memcpy(&pevent->event, event, event->header.size);
238 return (PyObject *)pevent;
239}
240
241struct pyrf_cpu_map {
242 PyObject_HEAD
243
244 struct cpu_map *cpus;
245};
246
247static int pyrf_cpu_map__init(struct pyrf_cpu_map *pcpus,
248 PyObject *args, PyObject *kwargs)
249{
250 static char *kwlist[] = { "cpustr", NULL, NULL, };
251 char *cpustr = NULL;
252
253 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|s",
254 kwlist, &cpustr))
255 return -1;
256
257 pcpus->cpus = cpu_map__new(cpustr);
258 if (pcpus->cpus == NULL)
259 return -1;
260 return 0;
261}
262
263static void pyrf_cpu_map__delete(struct pyrf_cpu_map *pcpus)
264{
265 cpu_map__delete(pcpus->cpus);
266 pcpus->ob_type->tp_free((PyObject*)pcpus);
267}
268
269static Py_ssize_t pyrf_cpu_map__length(PyObject *obj)
270{
271 struct pyrf_cpu_map *pcpus = (void *)obj;
272
273 return pcpus->cpus->nr;
274}
275
276static PyObject *pyrf_cpu_map__item(PyObject *obj, Py_ssize_t i)
277{
278 struct pyrf_cpu_map *pcpus = (void *)obj;
279
280 if (i >= pcpus->cpus->nr)
281 return NULL;
282
283 return Py_BuildValue("i", pcpus->cpus->map[i]);
284}
285
286static PySequenceMethods pyrf_cpu_map__sequence_methods = {
287 .sq_length = pyrf_cpu_map__length,
288 .sq_item = pyrf_cpu_map__item,
289};
290
291static char pyrf_cpu_map__doc[] = PyDoc_STR("cpu map object.");
292
293static PyTypeObject pyrf_cpu_map__type = {
294 PyVarObject_HEAD_INIT(NULL, 0)
295 .tp_name = "perf.cpu_map",
296 .tp_basicsize = sizeof(struct pyrf_cpu_map),
297 .tp_dealloc = (destructor)pyrf_cpu_map__delete,
298 .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
299 .tp_doc = pyrf_cpu_map__doc,
300 .tp_as_sequence = &pyrf_cpu_map__sequence_methods,
301 .tp_init = (initproc)pyrf_cpu_map__init,
302};
303
304static int pyrf_cpu_map__setup_types(void)
305{
306 pyrf_cpu_map__type.tp_new = PyType_GenericNew;
307 return PyType_Ready(&pyrf_cpu_map__type);
308}
309
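
Filling in sq_length and sq_item is what makes len(cpus) and cpus[i] work on the Python side; the same slots are reachable from C through the abstract sequence API. A hedged sketch (assumes pcpus already points at a constructed perf.cpu_map object):

        Py_ssize_t n = PySequence_Size(pcpus);          /* -> sq_length */
        PyObject *first = PySequence_GetItem(pcpus, 0); /* -> sq_item */

        /* ... use n and first, then drop the new reference: */
        Py_XDECREF(first);
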
310struct pyrf_thread_map {
311 PyObject_HEAD
312
313 struct thread_map *threads;
314};
315
316static int pyrf_thread_map__init(struct pyrf_thread_map *pthreads,
317 PyObject *args, PyObject *kwargs)
318{
319 static char *kwlist[] = { "pid", "tid", NULL, NULL, };
320 int pid = -1, tid = -1;
321
322 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ii",
323 kwlist, &pid, &tid))
324 return -1;
325
326 pthreads->threads = thread_map__new(pid, tid);
327 if (pthreads->threads == NULL)
328 return -1;
329 return 0;
330}
331
332static void pyrf_thread_map__delete(struct pyrf_thread_map *pthreads)
333{
334 thread_map__delete(pthreads->threads);
335 pthreads->ob_type->tp_free((PyObject*)pthreads);
336}
337
338static Py_ssize_t pyrf_thread_map__length(PyObject *obj)
339{
340 struct pyrf_thread_map *pthreads = (void *)obj;
341
342 return pthreads->threads->nr;
343}
344
345static PyObject *pyrf_thread_map__item(PyObject *obj, Py_ssize_t i)
346{
347 struct pyrf_thread_map *pthreads = (void *)obj;
348
349 if (i >= pthreads->threads->nr)
350 return NULL;
351
352 return Py_BuildValue("i", pthreads->threads->map[i]);
353}
354
355static PySequenceMethods pyrf_thread_map__sequence_methods = {
356 .sq_length = pyrf_thread_map__length,
357 .sq_item = pyrf_thread_map__item,
358};
359
360static char pyrf_thread_map__doc[] = PyDoc_STR("thread map object.");
361
362static PyTypeObject pyrf_thread_map__type = {
363 PyVarObject_HEAD_INIT(NULL, 0)
364 .tp_name = "perf.thread_map",
365 .tp_basicsize = sizeof(struct pyrf_thread_map),
366 .tp_dealloc = (destructor)pyrf_thread_map__delete,
367 .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
368 .tp_doc = pyrf_thread_map__doc,
369 .tp_as_sequence = &pyrf_thread_map__sequence_methods,
370 .tp_init = (initproc)pyrf_thread_map__init,
371};
372
373static int pyrf_thread_map__setup_types(void)
374{
375 pyrf_thread_map__type.tp_new = PyType_GenericNew;
376 return PyType_Ready(&pyrf_thread_map__type);
377}
378
379struct pyrf_evsel {
380 PyObject_HEAD
381
382 struct perf_evsel evsel;
383};
384
385static int pyrf_evsel__init(struct pyrf_evsel *pevsel,
386 PyObject *args, PyObject *kwargs)
387{
388 struct perf_event_attr attr = {
389 .type = PERF_TYPE_HARDWARE,
390 .config = PERF_COUNT_HW_CPU_CYCLES,
391 .sample_type = PERF_SAMPLE_PERIOD | PERF_SAMPLE_TID,
392 };
393 static char *kwlist[] = {
394 "type",
395 "config",
396 "sample_freq",
397 "sample_period",
398 "sample_type",
399 "read_format",
400 "disabled",
401 "inherit",
402 "pinned",
403 "exclusive",
404 "exclude_user",
405 "exclude_kernel",
406 "exclude_hv",
407 "exclude_idle",
408 "mmap",
409 "comm",
410 "freq",
411 "inherit_stat",
412 "enable_on_exec",
413 "task",
414 "watermark",
415 "precise_ip",
416 "mmap_data",
417 "sample_id_all",
418 "wakeup_events",
419 "bp_type",
420 "bp_addr",
421 "bp_len", NULL, NULL, };
422 u64 sample_period = 0;
423 u32 disabled = 0,
424 inherit = 0,
425 pinned = 0,
426 exclusive = 0,
427 exclude_user = 0,
428 exclude_kernel = 0,
429 exclude_hv = 0,
430 exclude_idle = 0,
431 mmap = 0,
432 comm = 0,
433 freq = 1,
434 inherit_stat = 0,
435 enable_on_exec = 0,
436 task = 0,
437 watermark = 0,
438 precise_ip = 0,
439 mmap_data = 0,
440 sample_id_all = 1;
441 int idx = 0;
442
443 if (!PyArg_ParseTupleAndKeywords(args, kwargs,
444 "|iKiKKiiiiiiiiiiiiiiiiiiiiiKK", kwlist,
445 &attr.type, &attr.config, &attr.sample_freq,
446 &sample_period, &attr.sample_type,
447 &attr.read_format, &disabled, &inherit,
448 &pinned, &exclusive, &exclude_user,
449 &exclude_kernel, &exclude_hv, &exclude_idle,
450 &mmap, &comm, &freq, &inherit_stat,
451 &enable_on_exec, &task, &watermark,
452 &precise_ip, &mmap_data, &sample_id_all,
453 &attr.wakeup_events, &attr.bp_type,
454 &attr.bp_addr, &attr.bp_len, &idx))
455 return -1;
456
457 /* sample_period and sample_freq share a union in perf_event_attr */
458 if (sample_period != 0) {
459 if (attr.sample_freq != 0)
460 return -1; /* FIXME: throw right exception */
461 attr.sample_period = sample_period;
462 }
463
464 /* Bitfields */
465 attr.disabled = disabled;
466 attr.inherit = inherit;
467 attr.pinned = pinned;
468 attr.exclusive = exclusive;
469 attr.exclude_user = exclude_user;
470 attr.exclude_kernel = exclude_kernel;
471 attr.exclude_hv = exclude_hv;
472 attr.exclude_idle = exclude_idle;
473 attr.mmap = mmap;
474 attr.comm = comm;
475 attr.freq = freq;
476 attr.inherit_stat = inherit_stat;
477 attr.enable_on_exec = enable_on_exec;
478 attr.task = task;
479 attr.watermark = watermark;
480 attr.precise_ip = precise_ip;
481 attr.mmap_data = mmap_data;
482 attr.sample_id_all = sample_id_all;
483
484 perf_evsel__init(&pevsel->evsel, &attr, idx);
485 return 0;
486}
487
488static void pyrf_evsel__delete(struct pyrf_evsel *pevsel)
489{
490 perf_evsel__exit(&pevsel->evsel);
491 pevsel->ob_type->tp_free((PyObject*)pevsel);
492}
493
494static PyObject *pyrf_evsel__open(struct pyrf_evsel *pevsel,
495 PyObject *args, PyObject *kwargs)
496{
497 struct perf_evsel *evsel = &pevsel->evsel;
498 struct cpu_map *cpus = NULL;
499 struct thread_map *threads = NULL;
500 PyObject *pcpus = NULL, *pthreads = NULL;
501 int group = 0, overwrite = 0;
502 static char *kwlist[] = {"cpus", "threads", "group", "overwrite", NULL, NULL};
503
504 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|OOii", kwlist,
505 &pcpus, &pthreads, &group, &overwrite))
506 return NULL;
507
508 if (pthreads != NULL)
509 threads = ((struct pyrf_thread_map *)pthreads)->threads;
510
511 if (pcpus != NULL)
512 cpus = ((struct pyrf_cpu_map *)pcpus)->cpus;
513
514 if (perf_evsel__open(evsel, cpus, threads, group, overwrite) < 0) {
515 PyErr_SetFromErrno(PyExc_OSError);
516 return NULL;
517 }
518
519 Py_INCREF(Py_None);
520 return Py_None;
521}
522
523static PyMethodDef pyrf_evsel__methods[] = {
524 {
525 .ml_name = "open",
526 .ml_meth = (PyCFunction)pyrf_evsel__open,
527 .ml_flags = METH_VARARGS | METH_KEYWORDS,
528 .ml_doc = PyDoc_STR("open the event selector file descriptor table.")
529 },
530 { .ml_name = NULL, }
531};
532
533static char pyrf_evsel__doc[] = PyDoc_STR("perf event selector object.");
534
535static PyTypeObject pyrf_evsel__type = {
536 PyVarObject_HEAD_INIT(NULL, 0)
537 .tp_name = "perf.evsel",
538 .tp_basicsize = sizeof(struct pyrf_evsel),
539 .tp_dealloc = (destructor)pyrf_evsel__delete,
540 .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
541 .tp_doc = pyrf_evsel__doc,
542 .tp_methods = pyrf_evsel__methods,
543 .tp_init = (initproc)pyrf_evsel__init,
544};
545
546static int pyrf_evsel__setup_types(void)
547{
548 pyrf_evsel__type.tp_new = PyType_GenericNew;
549 return PyType_Ready(&pyrf_evsel__type);
550}
551
552struct pyrf_evlist {
553 PyObject_HEAD
554
555 struct perf_evlist evlist;
556};
557
558static int pyrf_evlist__init(struct pyrf_evlist *pevlist,
559 PyObject *args, PyObject *kwargs __used)
560{
561 PyObject *pcpus = NULL, *pthreads = NULL;
562 struct cpu_map *cpus;
563 struct thread_map *threads;
564
565 if (!PyArg_ParseTuple(args, "OO", &pcpus, &pthreads))
566 return -1;
567
568 threads = ((struct pyrf_thread_map *)pthreads)->threads;
569 cpus = ((struct pyrf_cpu_map *)pcpus)->cpus;
570 perf_evlist__init(&pevlist->evlist, cpus, threads);
571 return 0;
572}
573
574static void pyrf_evlist__delete(struct pyrf_evlist *pevlist)
575{
576 perf_evlist__exit(&pevlist->evlist);
577 pevlist->ob_type->tp_free((PyObject*)pevlist);
578}
579
580static PyObject *pyrf_evlist__mmap(struct pyrf_evlist *pevlist,
581 PyObject *args, PyObject *kwargs)
582{
583 struct perf_evlist *evlist = &pevlist->evlist;
584 static char *kwlist[] = {"pages", "overwrite",
585 NULL, NULL};
586 int pages = 128, overwrite = false;
587
588 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ii", kwlist,
589 &pages, &overwrite))
590 return NULL;
591
592 if (perf_evlist__mmap(evlist, pages, overwrite) < 0) {
593 PyErr_SetFromErrno(PyExc_OSError);
594 return NULL;
595 }
596
597 Py_INCREF(Py_None);
598 return Py_None;
599}
600
601static PyObject *pyrf_evlist__poll(struct pyrf_evlist *pevlist,
602 PyObject *args, PyObject *kwargs)
603{
604 struct perf_evlist *evlist = &pevlist->evlist;
605 static char *kwlist[] = {"timeout", NULL, NULL};
606 int timeout = -1, n;
607
608 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|i", kwlist, &timeout))
609 return NULL;
610
611 n = poll(evlist->pollfd, evlist->nr_fds, timeout);
612 if (n < 0) {
613 PyErr_SetFromErrno(PyExc_OSError);
614 return NULL;
615 }
616
617 return Py_BuildValue("i", n);
618}
619
620static PyObject *pyrf_evlist__get_pollfd(struct pyrf_evlist *pevlist,
621 PyObject *args __used, PyObject *kwargs __used)
622{
623 struct perf_evlist *evlist = &pevlist->evlist;
624 PyObject *list = PyList_New(0);
625 int i;
626
627 for (i = 0; i < evlist->nr_fds; ++i) {
628 PyObject *file;
629 FILE *fp = fdopen(evlist->pollfd[i].fd, "r");
630
631 if (fp == NULL)
632 goto free_list;
633
634 file = PyFile_FromFile(fp, "perf", "r", NULL);
635 if (file == NULL)
636 goto free_list;
637
638 if (PyList_Append(list, file) != 0) {
639 Py_DECREF(file);
640 goto free_list;
641 }
642
643 Py_DECREF(file);
644 }
645
646 return list;
647free_list:
648 return PyErr_NoMemory();
649}
650
651
652static PyObject *pyrf_evlist__add(struct pyrf_evlist *pevlist,
653 PyObject *args, PyObject *kwargs __used)
654{
655 struct perf_evlist *evlist = &pevlist->evlist;
656 PyObject *pevsel;
657 struct perf_evsel *evsel;
658
659 if (!PyArg_ParseTuple(args, "O", &pevsel))
660 return NULL;
661
662 Py_INCREF(pevsel);
663 evsel = &((struct pyrf_evsel *)pevsel)->evsel;
664 evsel->idx = evlist->nr_entries;
665 perf_evlist__add(evlist, evsel);
666
667 return Py_BuildValue("i", evlist->nr_entries);
668}
669
670static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist,
671 PyObject *args, PyObject *kwargs)
672{
673 struct perf_evlist *evlist = &pevlist->evlist;
674 union perf_event *event;
675 int sample_id_all = 1, cpu;
676 static char *kwlist[] = {"sample_id_all", NULL, NULL};
677
678 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|i", kwlist,
679 &cpu, &sample_id_all))
680 return NULL;
681
682 event = perf_evlist__read_on_cpu(evlist, cpu);
683 if (event != NULL) {
684 struct perf_evsel *first;
685 PyObject *pyevent = pyrf_event__new(event);
686 struct pyrf_event *pevent = (struct pyrf_event *)pyevent;
687
688 if (pyevent == NULL)
689 return PyErr_NoMemory();
690
691 first = list_entry(evlist->entries.next, struct perf_evsel, node);
692 perf_event__parse_sample(event, first->attr.sample_type, sample_id_all,
693 &pevent->sample);
694 return pyevent;
695 }
696
697 Py_INCREF(Py_None);
698 return Py_None;
699}
700
701static PyMethodDef pyrf_evlist__methods[] = {
702 {
703 .ml_name = "mmap",
704 .ml_meth = (PyCFunction)pyrf_evlist__mmap,
705 .ml_flags = METH_VARARGS | METH_KEYWORDS,
706 .ml_doc = PyDoc_STR("mmap the file descriptor table.")
707 },
708 {
709 .ml_name = "poll",
710 .ml_meth = (PyCFunction)pyrf_evlist__poll,
711 .ml_flags = METH_VARARGS | METH_KEYWORDS,
712 .ml_doc = PyDoc_STR("poll the file descriptor table.")
713 },
714 {
715 .ml_name = "get_pollfd",
716 .ml_meth = (PyCFunction)pyrf_evlist__get_pollfd,
717 .ml_flags = METH_VARARGS | METH_KEYWORDS,
718 .ml_doc = PyDoc_STR("get the poll file descriptor table.")
719 },
720 {
721 .ml_name = "add",
722 .ml_meth = (PyCFunction)pyrf_evlist__add,
723 .ml_flags = METH_VARARGS | METH_KEYWORDS,
724 .ml_doc = PyDoc_STR("adds an event selector to the list.")
725 },
726 {
727 .ml_name = "read_on_cpu",
728 .ml_meth = (PyCFunction)pyrf_evlist__read_on_cpu,
729 .ml_flags = METH_VARARGS | METH_KEYWORDS,
730 .ml_doc = PyDoc_STR("reads an event.")
731 },
732 { .ml_name = NULL, }
733};
734
735static Py_ssize_t pyrf_evlist__length(PyObject *obj)
736{
737 struct pyrf_evlist *pevlist = (void *)obj;
738
739 return pevlist->evlist.nr_entries;
740}
741
742static PyObject *pyrf_evlist__item(PyObject *obj, Py_ssize_t i)
743{
744 struct pyrf_evlist *pevlist = (void *)obj;
745 struct perf_evsel *pos;
746
747 if (i >= pevlist->evlist.nr_entries)
748 return NULL;
749
750 list_for_each_entry(pos, &pevlist->evlist.entries, node)
751 if (i-- == 0)
752 break;
753
754 return Py_BuildValue("O", container_of(pos, struct pyrf_evsel, evsel));
755}
756
757static PySequenceMethods pyrf_evlist__sequence_methods = {
758 .sq_length = pyrf_evlist__length,
759 .sq_item = pyrf_evlist__item,
760};
761
762static char pyrf_evlist__doc[] = PyDoc_STR("perf event selector list object.");
763
764static PyTypeObject pyrf_evlist__type = {
765 PyVarObject_HEAD_INIT(NULL, 0)
766 .tp_name = "perf.evlist",
767 .tp_basicsize = sizeof(struct pyrf_evlist),
768 .tp_dealloc = (destructor)pyrf_evlist__delete,
769 .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
770 .tp_as_sequence = &pyrf_evlist__sequence_methods,
771 .tp_doc = pyrf_evlist__doc,
772 .tp_methods = pyrf_evlist__methods,
773 .tp_init = (initproc)pyrf_evlist__init,
774};
775
776static int pyrf_evlist__setup_types(void)
777{
778 pyrf_evlist__type.tp_new = PyType_GenericNew;
779 return PyType_Ready(&pyrf_evlist__type);
780}
781
782static struct {
783 const char *name;
784 int value;
785} perf__constants[] = {
786 { "TYPE_HARDWARE", PERF_TYPE_HARDWARE },
787 { "TYPE_SOFTWARE", PERF_TYPE_SOFTWARE },
788 { "TYPE_TRACEPOINT", PERF_TYPE_TRACEPOINT },
789 { "TYPE_HW_CACHE", PERF_TYPE_HW_CACHE },
790 { "TYPE_RAW", PERF_TYPE_RAW },
791 { "TYPE_BREAKPOINT", PERF_TYPE_BREAKPOINT },
792
793 { "COUNT_HW_CPU_CYCLES", PERF_COUNT_HW_CPU_CYCLES },
794 { "COUNT_HW_INSTRUCTIONS", PERF_COUNT_HW_INSTRUCTIONS },
795 { "COUNT_HW_CACHE_REFERENCES", PERF_COUNT_HW_CACHE_REFERENCES },
796 { "COUNT_HW_CACHE_MISSES", PERF_COUNT_HW_CACHE_MISSES },
797 { "COUNT_HW_BRANCH_INSTRUCTIONS", PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
798 { "COUNT_HW_BRANCH_MISSES", PERF_COUNT_HW_BRANCH_MISSES },
799 { "COUNT_HW_BUS_CYCLES", PERF_COUNT_HW_BUS_CYCLES },
800 { "COUNT_HW_CACHE_L1D", PERF_COUNT_HW_CACHE_L1D },
801 { "COUNT_HW_CACHE_L1I", PERF_COUNT_HW_CACHE_L1I },
802 { "COUNT_HW_CACHE_LL", PERF_COUNT_HW_CACHE_LL },
803 { "COUNT_HW_CACHE_DTLB", PERF_COUNT_HW_CACHE_DTLB },
804 { "COUNT_HW_CACHE_ITLB", PERF_COUNT_HW_CACHE_ITLB },
805 { "COUNT_HW_CACHE_BPU", PERF_COUNT_HW_CACHE_BPU },
806 { "COUNT_HW_CACHE_OP_READ", PERF_COUNT_HW_CACHE_OP_READ },
807 { "COUNT_HW_CACHE_OP_WRITE", PERF_COUNT_HW_CACHE_OP_WRITE },
808 { "COUNT_HW_CACHE_OP_PREFETCH", PERF_COUNT_HW_CACHE_OP_PREFETCH },
809 { "COUNT_HW_CACHE_RESULT_ACCESS", PERF_COUNT_HW_CACHE_RESULT_ACCESS },
810 { "COUNT_HW_CACHE_RESULT_MISS", PERF_COUNT_HW_CACHE_RESULT_MISS },
811
812 { "COUNT_SW_CPU_CLOCK", PERF_COUNT_SW_CPU_CLOCK },
813 { "COUNT_SW_TASK_CLOCK", PERF_COUNT_SW_TASK_CLOCK },
814 { "COUNT_SW_PAGE_FAULTS", PERF_COUNT_SW_PAGE_FAULTS },
815 { "COUNT_SW_CONTEXT_SWITCHES", PERF_COUNT_SW_CONTEXT_SWITCHES },
816 { "COUNT_SW_CPU_MIGRATIONS", PERF_COUNT_SW_CPU_MIGRATIONS },
817 { "COUNT_SW_PAGE_FAULTS_MIN", PERF_COUNT_SW_PAGE_FAULTS_MIN },
818 { "COUNT_SW_PAGE_FAULTS_MAJ", PERF_COUNT_SW_PAGE_FAULTS_MAJ },
819 { "COUNT_SW_ALIGNMENT_FAULTS", PERF_COUNT_SW_ALIGNMENT_FAULTS },
820 { "COUNT_SW_EMULATION_FAULTS", PERF_COUNT_SW_EMULATION_FAULTS },
821
822 { "SAMPLE_IP", PERF_SAMPLE_IP },
823 { "SAMPLE_TID", PERF_SAMPLE_TID },
824 { "SAMPLE_TIME", PERF_SAMPLE_TIME },
825 { "SAMPLE_ADDR", PERF_SAMPLE_ADDR },
826 { "SAMPLE_READ", PERF_SAMPLE_READ },
827 { "SAMPLE_CALLCHAIN", PERF_SAMPLE_CALLCHAIN },
828 { "SAMPLE_ID", PERF_SAMPLE_ID },
829 { "SAMPLE_CPU", PERF_SAMPLE_CPU },
830 { "SAMPLE_PERIOD", PERF_SAMPLE_PERIOD },
831 { "SAMPLE_STREAM_ID", PERF_SAMPLE_STREAM_ID },
832 { "SAMPLE_RAW", PERF_SAMPLE_RAW },
833
834 { "FORMAT_TOTAL_TIME_ENABLED", PERF_FORMAT_TOTAL_TIME_ENABLED },
835 { "FORMAT_TOTAL_TIME_RUNNING", PERF_FORMAT_TOTAL_TIME_RUNNING },
836 { "FORMAT_ID", PERF_FORMAT_ID },
837 { "FORMAT_GROUP", PERF_FORMAT_GROUP },
838
839 { "RECORD_MMAP", PERF_RECORD_MMAP },
840 { "RECORD_LOST", PERF_RECORD_LOST },
841 { "RECORD_COMM", PERF_RECORD_COMM },
842 { "RECORD_EXIT", PERF_RECORD_EXIT },
843 { "RECORD_THROTTLE", PERF_RECORD_THROTTLE },
844 { "RECORD_UNTHROTTLE", PERF_RECORD_UNTHROTTLE },
845 { "RECORD_FORK", PERF_RECORD_FORK },
846 { "RECORD_READ", PERF_RECORD_READ },
847 { "RECORD_SAMPLE", PERF_RECORD_SAMPLE },
848 { .name = NULL, },
849};
850
851static PyMethodDef perf__methods[] = {
852 { .ml_name = NULL, }
853};
854
855PyMODINIT_FUNC initperf(void)
856{
857 PyObject *obj;
858 int i;
859 PyObject *dict, *module = Py_InitModule("perf", perf__methods);
860
861 if (module == NULL ||
862 pyrf_event__setup_types() < 0 ||
863 pyrf_evlist__setup_types() < 0 ||
864 pyrf_evsel__setup_types() < 0 ||
865 pyrf_thread_map__setup_types() < 0 ||
866 pyrf_cpu_map__setup_types() < 0)
867 return;
868
869 Py_INCREF(&pyrf_evlist__type);
870 PyModule_AddObject(module, "evlist", (PyObject*)&pyrf_evlist__type);
871
872 Py_INCREF(&pyrf_evsel__type);
873 PyModule_AddObject(module, "evsel", (PyObject*)&pyrf_evsel__type);
874
875 Py_INCREF(&pyrf_thread_map__type);
876 PyModule_AddObject(module, "thread_map", (PyObject*)&pyrf_thread_map__type);
877
878 Py_INCREF(&pyrf_cpu_map__type);
879 PyModule_AddObject(module, "cpu_map", (PyObject*)&pyrf_cpu_map__type);
880
881 dict = PyModule_GetDict(module);
882 if (dict == NULL)
883 goto error;
884
885 for (i = 0; perf__constants[i].name != NULL; i++) {
886 obj = PyInt_FromLong(perf__constants[i].value);
887 if (obj == NULL)
888 goto error;
889 PyDict_SetItemString(dict, perf__constants[i].name, obj);
890 Py_DECREF(obj);
891 }
892
893error:
894 if (PyErr_Occurred())
895 PyErr_SetString(PyExc_ImportError, "perf: Init failed!");
896}
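
A hypothetical embedding smoke test for the module above (assumes the object file is linked into the test binary; on a stock build the module would instead be imported as a shared object, and open() can still fail without adequate permissions):

#include <Python.h>

PyMODINIT_FUNC initperf(void);  /* from tools/perf/util/python.c above */

int main(void)
{
        Py_Initialize();
        initperf();             /* registers the built-in "perf" module */
        PyRun_SimpleString(
                "import perf\n"
                "cpus = perf.cpu_map()\n"
                "threads = perf.thread_map()\n"
                "evsel = perf.evsel(task = 1, comm = 1)\n"
                "evsel.open(cpus = cpus, threads = threads)\n");
        Py_Finalize();
        return 0;
}
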
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index c6d99334bdf..2040b853852 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -248,8 +248,7 @@ static void python_process_event(int cpu, void *data,
248 context = PyCObject_FromVoidPtr(scripting_context, NULL); 248 context = PyCObject_FromVoidPtr(scripting_context, NULL);
249 249
250 PyTuple_SetItem(t, n++, PyString_FromString(handler_name)); 250 PyTuple_SetItem(t, n++, PyString_FromString(handler_name));
251 PyTuple_SetItem(t, n++, 251 PyTuple_SetItem(t, n++, context);
252 PyCObject_FromVoidPtr(scripting_context, NULL));
253 252
254 if (handler) { 253 if (handler) {
255 PyTuple_SetItem(t, n++, PyInt_FromLong(cpu)); 254 PyTuple_SetItem(t, n++, PyInt_FromLong(cpu));
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 105f00bfd55..f26639fa0fb 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -7,6 +7,8 @@
7#include <sys/types.h> 7#include <sys/types.h>
8#include <sys/mman.h> 8#include <sys/mman.h>
9 9
10#include "evlist.h"
11#include "evsel.h"
10#include "session.h" 12#include "session.h"
11#include "sort.h" 13#include "sort.h"
12#include "util.h" 14#include "util.h"
@@ -19,7 +21,7 @@ static int perf_session__open(struct perf_session *self, bool force)
19 self->fd_pipe = true; 21 self->fd_pipe = true;
20 self->fd = STDIN_FILENO; 22 self->fd = STDIN_FILENO;
21 23
22 if (perf_header__read(self, self->fd) < 0) 24 if (perf_session__read_header(self, self->fd) < 0)
23 pr_err("incompatible file format"); 25 pr_err("incompatible file format");
24 26
25 return 0; 27 return 0;
@@ -51,7 +53,7 @@ static int perf_session__open(struct perf_session *self, bool force)
51 goto out_close; 53 goto out_close;
52 } 54 }
53 55
54 if (perf_header__read(self, self->fd) < 0) { 56 if (perf_session__read_header(self, self->fd) < 0) {
55 pr_err("incompatible file format"); 57 pr_err("incompatible file format");
56 goto out_close; 58 goto out_close;
57 } 59 }
@@ -67,7 +69,7 @@ out_close:
67 69
68static void perf_session__id_header_size(struct perf_session *session) 70static void perf_session__id_header_size(struct perf_session *session)
69{ 71{
70 struct sample_data *data; 72 struct perf_sample *data;
71 u64 sample_type = session->sample_type; 73 u64 sample_type = session->sample_type;
72 u16 size = 0; 74 u16 size = 0;
73 75
@@ -92,21 +94,10 @@ out:
92 session->id_hdr_size = size; 94 session->id_hdr_size = size;
93} 95}
94 96
95void perf_session__set_sample_id_all(struct perf_session *session, bool value)
96{
97 session->sample_id_all = value;
98 perf_session__id_header_size(session);
99}
100
101void perf_session__set_sample_type(struct perf_session *session, u64 type)
102{
103 session->sample_type = type;
104}
105
106void perf_session__update_sample_type(struct perf_session *self) 97void perf_session__update_sample_type(struct perf_session *self)
107{ 98{
108 self->sample_type = perf_header__sample_type(&self->header); 99 self->sample_type = perf_evlist__sample_type(self->evlist);
109 self->sample_id_all = perf_header__sample_id_all(&self->header); 100 self->sample_id_all = perf_evlist__sample_id_all(self->evlist);
110 perf_session__id_header_size(self); 101 perf_session__id_header_size(self);
111} 102}
112 103
@@ -135,13 +126,9 @@ struct perf_session *perf_session__new(const char *filename, int mode,
135 if (self == NULL) 126 if (self == NULL)
136 goto out; 127 goto out;
137 128
138 if (perf_header__init(&self->header) < 0)
139 goto out_free;
140
141 memcpy(self->filename, filename, len); 129 memcpy(self->filename, filename, len);
142 self->threads = RB_ROOT; 130 self->threads = RB_ROOT;
143 INIT_LIST_HEAD(&self->dead_threads); 131 INIT_LIST_HEAD(&self->dead_threads);
144 self->hists_tree = RB_ROOT;
145 self->last_match = NULL; 132 self->last_match = NULL;
146 /* 133 /*
147 * On 64bit we can mmap the data file in one go. No need for tiny mmap 134 * On 64bit we can mmap the data file in one go. No need for tiny mmap
@@ -162,17 +149,16 @@ struct perf_session *perf_session__new(const char *filename, int mode,
162 if (mode == O_RDONLY) { 149 if (mode == O_RDONLY) {
163 if (perf_session__open(self, force) < 0) 150 if (perf_session__open(self, force) < 0)
164 goto out_delete; 151 goto out_delete;
152 perf_session__update_sample_type(self);
165 } else if (mode == O_WRONLY) { 153 } else if (mode == O_WRONLY) {
166 /* 154 /*
167 * In O_RDONLY mode this will be performed when reading the 155 * In O_RDONLY mode this will be performed when reading the
168 * kernel MMAP event, in event__process_mmap(). 156 * kernel MMAP event, in perf_event__process_mmap().
169 */ 157 */
170 if (perf_session__create_kernel_maps(self) < 0) 158 if (perf_session__create_kernel_maps(self) < 0)
171 goto out_delete; 159 goto out_delete;
172 } 160 }
173 161
174 perf_session__update_sample_type(self);
175
176 if (ops && ops->ordering_requires_timestamps && 162 if (ops && ops->ordering_requires_timestamps &&
177 ops->ordered_samples && !self->sample_id_all) { 163 ops->ordered_samples && !self->sample_id_all) {
178 dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n"); 164 dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n");
@@ -181,9 +167,6 @@ struct perf_session *perf_session__new(const char *filename, int mode,
181 167
182out: 168out:
183 return self; 169 return self;
184out_free:
185 free(self);
186 return NULL;
187out_delete: 170out_delete:
188 perf_session__delete(self); 171 perf_session__delete(self);
189 return NULL; 172 return NULL;
@@ -214,7 +197,6 @@ static void perf_session__delete_threads(struct perf_session *self)
214 197
215void perf_session__delete(struct perf_session *self) 198void perf_session__delete(struct perf_session *self)
216{ 199{
217 perf_header__exit(&self->header);
218 perf_session__destroy_kernel_maps(self); 200 perf_session__destroy_kernel_maps(self);
219 perf_session__delete_dead_threads(self); 201 perf_session__delete_dead_threads(self);
220 perf_session__delete_threads(self); 202 perf_session__delete_threads(self);
@@ -242,17 +224,16 @@ static bool symbol__match_parent_regex(struct symbol *sym)
242 return 0; 224 return 0;
243} 225}
244 226
245struct map_symbol *perf_session__resolve_callchain(struct perf_session *self, 227int perf_session__resolve_callchain(struct perf_session *self,
246 struct thread *thread, 228 struct thread *thread,
247 struct ip_callchain *chain, 229 struct ip_callchain *chain,
248 struct symbol **parent) 230 struct symbol **parent)
249{ 231{
250 u8 cpumode = PERF_RECORD_MISC_USER; 232 u8 cpumode = PERF_RECORD_MISC_USER;
251 unsigned int i; 233 unsigned int i;
252 struct map_symbol *syms = calloc(chain->nr, sizeof(*syms)); 234 int err;
253 235
254 if (!syms) 236 callchain_cursor_reset(&self->callchain_cursor);
255 return NULL;
256 237
257 for (i = 0; i < chain->nr; i++) { 238 for (i = 0; i < chain->nr; i++) {
258 u64 ip = chain->ips[i]; 239 u64 ip = chain->ips[i];
@@ -281,30 +262,33 @@ struct map_symbol *perf_session__resolve_callchain(struct perf_session *self,
281 *parent = al.sym; 262 *parent = al.sym;
282 if (!symbol_conf.use_callchain) 263 if (!symbol_conf.use_callchain)
283 break; 264 break;
284 syms[i].map = al.map;
285 syms[i].sym = al.sym;
286 } 265 }
266
267 err = callchain_cursor_append(&self->callchain_cursor,
268 ip, al.map, al.sym);
269 if (err)
270 return err;
287 } 271 }
288 272
289 return syms; 273 return 0;
290} 274}
291 275
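
perf_session__resolve_callchain() now appends each resolved frame to session->callchain_cursor instead of returning a freshly calloc()'d map_symbol array, so callers no longer own an allocation. A hedged sketch of draining the cursor, assuming the companion cursor helpers introduced alongside this change (callchain_cursor_commit/current/advance are not shown in this diff):

        struct callchain_cursor *cursor = &self->callchain_cursor;
        struct callchain_cursor_node *node;

        callchain_cursor_commit(cursor);        /* rewind for reading */
        while ((node = callchain_cursor_current(cursor)) != NULL) {
                /* node->ip, node->map, node->sym describe one frame */
                callchain_cursor_advance(cursor);
        }
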
292static int process_event_synth_stub(event_t *event __used, 276static int process_event_synth_stub(union perf_event *event __used,
293 struct perf_session *session __used) 277 struct perf_session *session __used)
294{ 278{
295 dump_printf(": unhandled!\n"); 279 dump_printf(": unhandled!\n");
296 return 0; 280 return 0;
297} 281}
298 282
299static int process_event_stub(event_t *event __used, 283static int process_event_stub(union perf_event *event __used,
300 struct sample_data *sample __used, 284 struct perf_sample *sample __used,
301 struct perf_session *session __used) 285 struct perf_session *session __used)
302{ 286{
303 dump_printf(": unhandled!\n"); 287 dump_printf(": unhandled!\n");
304 return 0; 288 return 0;
305} 289}
306 290
307static int process_finished_round_stub(event_t *event __used, 291static int process_finished_round_stub(union perf_event *event __used,
308 struct perf_session *session __used, 292 struct perf_session *session __used,
309 struct perf_event_ops *ops __used) 293 struct perf_event_ops *ops __used)
310{ 294{
@@ -312,7 +296,7 @@ static int process_finished_round_stub(event_t *event __used,
312 return 0; 296 return 0;
313} 297}
314 298
315static int process_finished_round(event_t *event, 299static int process_finished_round(union perf_event *event,
316 struct perf_session *session, 300 struct perf_session *session,
317 struct perf_event_ops *ops); 301 struct perf_event_ops *ops);
318 302
@@ -329,7 +313,7 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler)
329 if (handler->exit == NULL) 313 if (handler->exit == NULL)
330 handler->exit = process_event_stub; 314 handler->exit = process_event_stub;
331 if (handler->lost == NULL) 315 if (handler->lost == NULL)
332 handler->lost = event__process_lost; 316 handler->lost = perf_event__process_lost;
333 if (handler->read == NULL) 317 if (handler->read == NULL)
334 handler->read = process_event_stub; 318 handler->read = process_event_stub;
335 if (handler->throttle == NULL) 319 if (handler->throttle == NULL)
@@ -363,98 +347,98 @@ void mem_bswap_64(void *src, int byte_size)
363 } 347 }
364} 348}
365 349
366static void event__all64_swap(event_t *self) 350static void perf_event__all64_swap(union perf_event *event)
367{ 351{
368 struct perf_event_header *hdr = &self->header; 352 struct perf_event_header *hdr = &event->header;
369 mem_bswap_64(hdr + 1, self->header.size - sizeof(*hdr)); 353 mem_bswap_64(hdr + 1, event->header.size - sizeof(*hdr));
370} 354}
371 355
372static void event__comm_swap(event_t *self) 356static void perf_event__comm_swap(union perf_event *event)
373{ 357{
374 self->comm.pid = bswap_32(self->comm.pid); 358 event->comm.pid = bswap_32(event->comm.pid);
375 self->comm.tid = bswap_32(self->comm.tid); 359 event->comm.tid = bswap_32(event->comm.tid);
376} 360}
377 361
378static void event__mmap_swap(event_t *self) 362static void perf_event__mmap_swap(union perf_event *event)
379{ 363{
380 self->mmap.pid = bswap_32(self->mmap.pid); 364 event->mmap.pid = bswap_32(event->mmap.pid);
381 self->mmap.tid = bswap_32(self->mmap.tid); 365 event->mmap.tid = bswap_32(event->mmap.tid);
382 self->mmap.start = bswap_64(self->mmap.start); 366 event->mmap.start = bswap_64(event->mmap.start);
383 self->mmap.len = bswap_64(self->mmap.len); 367 event->mmap.len = bswap_64(event->mmap.len);
384 self->mmap.pgoff = bswap_64(self->mmap.pgoff); 368 event->mmap.pgoff = bswap_64(event->mmap.pgoff);
385} 369}
386 370
387static void event__task_swap(event_t *self) 371static void perf_event__task_swap(union perf_event *event)
388{ 372{
389 self->fork.pid = bswap_32(self->fork.pid); 373 event->fork.pid = bswap_32(event->fork.pid);
390 self->fork.tid = bswap_32(self->fork.tid); 374 event->fork.tid = bswap_32(event->fork.tid);
391 self->fork.ppid = bswap_32(self->fork.ppid); 375 event->fork.ppid = bswap_32(event->fork.ppid);
392 self->fork.ptid = bswap_32(self->fork.ptid); 376 event->fork.ptid = bswap_32(event->fork.ptid);
393 self->fork.time = bswap_64(self->fork.time); 377 event->fork.time = bswap_64(event->fork.time);
394} 378}
395 379
396static void event__read_swap(event_t *self) 380static void perf_event__read_swap(union perf_event *event)
397{ 381{
398 self->read.pid = bswap_32(self->read.pid); 382 event->read.pid = bswap_32(event->read.pid);
399 self->read.tid = bswap_32(self->read.tid); 383 event->read.tid = bswap_32(event->read.tid);
400 self->read.value = bswap_64(self->read.value); 384 event->read.value = bswap_64(event->read.value);
401 self->read.time_enabled = bswap_64(self->read.time_enabled); 385 event->read.time_enabled = bswap_64(event->read.time_enabled);
402 self->read.time_running = bswap_64(self->read.time_running); 386 event->read.time_running = bswap_64(event->read.time_running);
403 self->read.id = bswap_64(self->read.id); 387 event->read.id = bswap_64(event->read.id);
404} 388}
405 389
406static void event__attr_swap(event_t *self) 390static void perf_event__attr_swap(union perf_event *event)
407{ 391{
408 size_t size; 392 size_t size;
409 393
410 self->attr.attr.type = bswap_32(self->attr.attr.type); 394 event->attr.attr.type = bswap_32(event->attr.attr.type);
411 self->attr.attr.size = bswap_32(self->attr.attr.size); 395 event->attr.attr.size = bswap_32(event->attr.attr.size);
412 self->attr.attr.config = bswap_64(self->attr.attr.config); 396 event->attr.attr.config = bswap_64(event->attr.attr.config);
413 self->attr.attr.sample_period = bswap_64(self->attr.attr.sample_period); 397 event->attr.attr.sample_period = bswap_64(event->attr.attr.sample_period);
414 self->attr.attr.sample_type = bswap_64(self->attr.attr.sample_type); 398 event->attr.attr.sample_type = bswap_64(event->attr.attr.sample_type);
415 self->attr.attr.read_format = bswap_64(self->attr.attr.read_format); 399 event->attr.attr.read_format = bswap_64(event->attr.attr.read_format);
416 self->attr.attr.wakeup_events = bswap_32(self->attr.attr.wakeup_events); 400 event->attr.attr.wakeup_events = bswap_32(event->attr.attr.wakeup_events);
417 self->attr.attr.bp_type = bswap_32(self->attr.attr.bp_type); 401 event->attr.attr.bp_type = bswap_32(event->attr.attr.bp_type);
418 self->attr.attr.bp_addr = bswap_64(self->attr.attr.bp_addr); 402 event->attr.attr.bp_addr = bswap_64(event->attr.attr.bp_addr);
419 self->attr.attr.bp_len = bswap_64(self->attr.attr.bp_len); 403 event->attr.attr.bp_len = bswap_64(event->attr.attr.bp_len);
420 404
421 size = self->header.size; 405 size = event->header.size;
422 size -= (void *)&self->attr.id - (void *)self; 406 size -= (void *)&event->attr.id - (void *)event;
423 mem_bswap_64(self->attr.id, size); 407 mem_bswap_64(event->attr.id, size);
424} 408}
425 409
426static void event__event_type_swap(event_t *self) 410static void perf_event__event_type_swap(union perf_event *event)
427{ 411{
428 self->event_type.event_type.event_id = 412 event->event_type.event_type.event_id =
429 bswap_64(self->event_type.event_type.event_id); 413 bswap_64(event->event_type.event_type.event_id);
430} 414}
431 415
432static void event__tracing_data_swap(event_t *self) 416static void perf_event__tracing_data_swap(union perf_event *event)
433{ 417{
434 self->tracing_data.size = bswap_32(self->tracing_data.size); 418 event->tracing_data.size = bswap_32(event->tracing_data.size);
435} 419}
436 420
437typedef void (*event__swap_op)(event_t *self); 421typedef void (*perf_event__swap_op)(union perf_event *event);
438 422
439static event__swap_op event__swap_ops[] = { 423static perf_event__swap_op perf_event__swap_ops[] = {
440 [PERF_RECORD_MMAP] = event__mmap_swap, 424 [PERF_RECORD_MMAP] = perf_event__mmap_swap,
441 [PERF_RECORD_COMM] = event__comm_swap, 425 [PERF_RECORD_COMM] = perf_event__comm_swap,
442 [PERF_RECORD_FORK] = event__task_swap, 426 [PERF_RECORD_FORK] = perf_event__task_swap,
443 [PERF_RECORD_EXIT] = event__task_swap, 427 [PERF_RECORD_EXIT] = perf_event__task_swap,
444 [PERF_RECORD_LOST] = event__all64_swap, 428 [PERF_RECORD_LOST] = perf_event__all64_swap,
445 [PERF_RECORD_READ] = event__read_swap, 429 [PERF_RECORD_READ] = perf_event__read_swap,
446 [PERF_RECORD_SAMPLE] = event__all64_swap, 430 [PERF_RECORD_SAMPLE] = perf_event__all64_swap,
447 [PERF_RECORD_HEADER_ATTR] = event__attr_swap, 431 [PERF_RECORD_HEADER_ATTR] = perf_event__attr_swap,
448 [PERF_RECORD_HEADER_EVENT_TYPE] = event__event_type_swap, 432 [PERF_RECORD_HEADER_EVENT_TYPE] = perf_event__event_type_swap,
449 [PERF_RECORD_HEADER_TRACING_DATA] = event__tracing_data_swap, 433 [PERF_RECORD_HEADER_TRACING_DATA] = perf_event__tracing_data_swap,
450 [PERF_RECORD_HEADER_BUILD_ID] = NULL, 434 [PERF_RECORD_HEADER_BUILD_ID] = NULL,
451 [PERF_RECORD_HEADER_MAX] = NULL, 435 [PERF_RECORD_HEADER_MAX] = NULL,
452}; 436};
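
The dispatch table above is essentially the whole of perf's cross-endian handling for these records: each PERF_RECORD_* type indexes a handler that byte-swaps exactly the fields that record type carries, and a NULL slot means no swap is defined for that type. A minimal standalone sketch of the same table-driven pattern — every type and field name below is illustrative, not taken from the perf sources:

	#include <byteswap.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Illustrative record types, not the real PERF_RECORD_* values. */
	enum { REC_COMM, REC_MMAP, REC_MAX };

	struct rec {
		uint32_t type;		/* assumed already readable */
		uint32_t pid;
		uint64_t start;
	};

	typedef void (*swap_op)(struct rec *r);

	static void comm_swap(struct rec *r) { r->pid = bswap_32(r->pid); }

	static void mmap_swap(struct rec *r)
	{
		r->pid = bswap_32(r->pid);
		r->start = bswap_64(r->start);
	}

	/* NULL entries mean "nothing to swap", as in perf_event__swap_ops. */
	static swap_op swap_ops[REC_MAX] = {
		[REC_COMM] = comm_swap,
		[REC_MMAP] = mmap_swap,
	};

	int main(void)
	{
		/* Pretend the record arrived from an opposite-endian file. */
		struct rec r = { REC_MMAP, bswap_32(42), bswap_64(0x1000) };

		if (r.type < REC_MAX && swap_ops[r.type])
			swap_ops[r.type](&r);	/* dispatch on record type */

		printf("pid=%u start=%#lx\n", r.pid, (unsigned long)r.start);
		return 0;
	}

perf_session__process_event below applies the same guard — swap only when the file header says the endianness differs and the table slot is non-NULL — before it touches any payload field.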
453 437
454struct sample_queue { 438struct sample_queue {
455 u64 timestamp; 439 u64 timestamp;
456 u64 file_offset; 440 u64 file_offset;
457 event_t *event; 441 union perf_event *event;
458 struct list_head list; 442 struct list_head list;
459}; 443};
460 444
@@ -472,8 +456,8 @@ static void perf_session_free_sample_buffers(struct perf_session *session)
472} 456}
473 457
474static int perf_session_deliver_event(struct perf_session *session, 458static int perf_session_deliver_event(struct perf_session *session,
475 event_t *event, 459 union perf_event *event,
476 struct sample_data *sample, 460 struct perf_sample *sample,
477 struct perf_event_ops *ops, 461 struct perf_event_ops *ops,
478 u64 file_offset); 462 u64 file_offset);
479 463
@@ -483,7 +467,7 @@ static void flush_sample_queue(struct perf_session *s,
483 struct ordered_samples *os = &s->ordered_samples; 467 struct ordered_samples *os = &s->ordered_samples;
484 struct list_head *head = &os->samples; 468 struct list_head *head = &os->samples;
485 struct sample_queue *tmp, *iter; 469 struct sample_queue *tmp, *iter;
486 struct sample_data sample; 470 struct perf_sample sample;
487 u64 limit = os->next_flush; 471 u64 limit = os->next_flush;
488 u64 last_ts = os->last_sample ? os->last_sample->timestamp : 0ULL; 472 u64 last_ts = os->last_sample ? os->last_sample->timestamp : 0ULL;
489 473
@@ -494,7 +478,7 @@ static void flush_sample_queue(struct perf_session *s,
494 if (iter->timestamp > limit) 478 if (iter->timestamp > limit)
495 break; 479 break;
496 480
497 event__parse_sample(iter->event, s, &sample); 481 perf_session__parse_sample(s, iter->event, &sample);
498 perf_session_deliver_event(s, iter->event, &sample, ops, 482 perf_session_deliver_event(s, iter->event, &sample, ops,
499 iter->file_offset); 483 iter->file_offset);
500 484
@@ -550,7 +534,7 @@ static void flush_sample_queue(struct perf_session *s,
550 * Flush all events below timestamp 7 534 * Flush all events below timestamp 7
551 * etc... 535 * etc...
552 */ 536 */
553static int process_finished_round(event_t *event __used, 537static int process_finished_round(union perf_event *event __used,
554 struct perf_session *session, 538 struct perf_session *session,
555 struct perf_event_ops *ops) 539 struct perf_event_ops *ops)
556{ 540{
@@ -607,12 +591,12 @@ static void __queue_event(struct sample_queue *new, struct perf_session *s)
607 591
608#define MAX_SAMPLE_BUFFER (64 * 1024 / sizeof(struct sample_queue)) 592#define MAX_SAMPLE_BUFFER (64 * 1024 / sizeof(struct sample_queue))
609 593
610static int perf_session_queue_event(struct perf_session *s, event_t *event, 594static int perf_session_queue_event(struct perf_session *s, union perf_event *event,
611 struct sample_data *data, u64 file_offset) 595 struct perf_sample *sample, u64 file_offset)
612{ 596{
613 struct ordered_samples *os = &s->ordered_samples; 597 struct ordered_samples *os = &s->ordered_samples;
614 struct list_head *sc = &os->sample_cache; 598 struct list_head *sc = &os->sample_cache;
615 u64 timestamp = data->time; 599 u64 timestamp = sample->time;
616 struct sample_queue *new; 600 struct sample_queue *new;
617 601
618 if (!timestamp || timestamp == ~0ULL) 602 if (!timestamp || timestamp == ~0ULL)
@@ -648,7 +632,7 @@ static int perf_session_queue_event(struct perf_session *s, event_t *event,
648 return 0; 632 return 0;
649} 633}
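
perf_session_queue_event inserts each timestamped event into the ordered_samples list (rejecting events whose time is 0 or ~0ULL), and flush_sample_queue above then delivers everything at or below os->next_flush in timestamp order. A standalone sketch of that ordered-insert / flush-below-limit scheme, with a plain singly linked list standing in for the real sample_queue:

	#include <stdio.h>
	#include <stdint.h>
	#include <stdlib.h>

	struct sq {			/* stand-in for struct sample_queue */
		uint64_t ts;
		struct sq *next;
	};

	/* Insert keeping the list sorted by timestamp, ascending. */
	static void queue(struct sq **head, uint64_t ts)
	{
		struct sq *n = malloc(sizeof(*n)), **p = head;

		if (!n)
			abort();
		n->ts = ts;
		while (*p && (*p)->ts <= ts)
			p = &(*p)->next;
		n->next = *p;
		*p = n;
	}

	/* Deliver (here: print) every queued entry up to 'limit'. */
	static void flush(struct sq **head, uint64_t limit)
	{
		while (*head && (*head)->ts <= limit) {
			struct sq *n = *head;
			*head = n->next;
			printf("deliver ts=%llu\n", (unsigned long long)n->ts);
			free(n);
		}
	}

	int main(void)
	{
		struct sq *head = NULL;

		queue(&head, 4); queue(&head, 1); queue(&head, 7);
		flush(&head, 4);	/* delivers 1 and 4, keeps 7 queued */
		flush(&head, ~0ULL);	/* final flush: delivers 7 */
		return 0;
	}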
650 634
651static void callchain__printf(struct sample_data *sample) 635static void callchain__printf(struct perf_sample *sample)
652{ 636{
653 unsigned int i; 637 unsigned int i;
654 638
@@ -660,8 +644,8 @@ static void callchain__printf(struct sample_data *sample)
660} 644}
661 645
662static void perf_session__print_tstamp(struct perf_session *session, 646static void perf_session__print_tstamp(struct perf_session *session,
663 event_t *event, 647 union perf_event *event,
664 struct sample_data *sample) 648 struct perf_sample *sample)
665{ 649{
666 if (event->header.type != PERF_RECORD_SAMPLE && 650 if (event->header.type != PERF_RECORD_SAMPLE &&
667 !session->sample_id_all) { 651 !session->sample_id_all) {
@@ -676,8 +660,8 @@ static void perf_session__print_tstamp(struct perf_session *session,
676 printf("%" PRIu64 " ", sample->time); 660 printf("%" PRIu64 " ", sample->time);
677} 661}
678 662
679static void dump_event(struct perf_session *session, event_t *event, 663static void dump_event(struct perf_session *session, union perf_event *event,
680 u64 file_offset, struct sample_data *sample) 664 u64 file_offset, struct perf_sample *sample)
681{ 665{
682 if (!dump_trace) 666 if (!dump_trace)
683 return; 667 return;
@@ -691,11 +675,11 @@ static void dump_event(struct perf_session *session, event_t *event,
691 perf_session__print_tstamp(session, event, sample); 675 perf_session__print_tstamp(session, event, sample);
692 676
693 printf("%#" PRIx64 " [%#x]: PERF_RECORD_%s", file_offset, 677 printf("%#" PRIx64 " [%#x]: PERF_RECORD_%s", file_offset,
694 event->header.size, event__get_event_name(event->header.type)); 678 event->header.size, perf_event__name(event->header.type));
695} 679}
696 680
697static void dump_sample(struct perf_session *session, event_t *event, 681static void dump_sample(struct perf_session *session, union perf_event *event,
698 struct sample_data *sample) 682 struct perf_sample *sample)
699{ 683{
700 if (!dump_trace) 684 if (!dump_trace)
701 return; 685 return;
@@ -709,8 +693,8 @@ static void dump_sample(struct perf_session *session, event_t *event,
709} 693}
710 694
711static int perf_session_deliver_event(struct perf_session *session, 695static int perf_session_deliver_event(struct perf_session *session,
712 event_t *event, 696 union perf_event *event,
713 struct sample_data *sample, 697 struct perf_sample *sample,
714 struct perf_event_ops *ops, 698 struct perf_event_ops *ops,
715 u64 file_offset) 699 u64 file_offset)
716{ 700{
@@ -743,7 +727,7 @@ static int perf_session_deliver_event(struct perf_session *session,
743} 727}
744 728
745static int perf_session__preprocess_sample(struct perf_session *session, 729static int perf_session__preprocess_sample(struct perf_session *session,
746 event_t *event, struct sample_data *sample) 730 union perf_event *event, struct perf_sample *sample)
747{ 731{
748 if (event->header.type != PERF_RECORD_SAMPLE || 732 if (event->header.type != PERF_RECORD_SAMPLE ||
749 !(session->sample_type & PERF_SAMPLE_CALLCHAIN)) 733 !(session->sample_type & PERF_SAMPLE_CALLCHAIN))
@@ -758,7 +742,7 @@ static int perf_session__preprocess_sample(struct perf_session *session,
758 return 0; 742 return 0;
759} 743}
760 744
761static int perf_session__process_user_event(struct perf_session *session, event_t *event, 745static int perf_session__process_user_event(struct perf_session *session, union perf_event *event,
762 struct perf_event_ops *ops, u64 file_offset) 746 struct perf_event_ops *ops, u64 file_offset)
763{ 747{
764 dump_event(session, event, file_offset, NULL); 748 dump_event(session, event, file_offset, NULL);
@@ -783,15 +767,16 @@ static int perf_session__process_user_event(struct perf_session *session, event_
783} 767}
784 768
785static int perf_session__process_event(struct perf_session *session, 769static int perf_session__process_event(struct perf_session *session,
786 event_t *event, 770 union perf_event *event,
787 struct perf_event_ops *ops, 771 struct perf_event_ops *ops,
788 u64 file_offset) 772 u64 file_offset)
789{ 773{
790 struct sample_data sample; 774 struct perf_sample sample;
791 int ret; 775 int ret;
792 776
793 if (session->header.needs_swap && event__swap_ops[event->header.type]) 777 if (session->header.needs_swap &&
794 event__swap_ops[event->header.type](event); 778 perf_event__swap_ops[event->header.type])
779 perf_event__swap_ops[event->header.type](event);
795 780
796 if (event->header.type >= PERF_RECORD_HEADER_MAX) 781 if (event->header.type >= PERF_RECORD_HEADER_MAX)
797 return -EINVAL; 782 return -EINVAL;
@@ -804,7 +789,7 @@ static int perf_session__process_event(struct perf_session *session,
804 /* 789 /*
805 * For all kernel events we get the sample data 790 * For all kernel events we get the sample data
806 */ 791 */
807 event__parse_sample(event, session, &sample); 792 perf_session__parse_sample(session, event, &sample);
808 793
809 /* Preprocess sample records - precheck callchains */ 794 /* Preprocess sample records - precheck callchains */
810 if (perf_session__preprocess_sample(session, event, &sample)) 795 if (perf_session__preprocess_sample(session, event, &sample))
@@ -843,7 +828,7 @@ static struct thread *perf_session__register_idle_thread(struct perf_session *se
843static void perf_session__warn_about_errors(const struct perf_session *session, 828static void perf_session__warn_about_errors(const struct perf_session *session,
844 const struct perf_event_ops *ops) 829 const struct perf_event_ops *ops)
845{ 830{
846 if (ops->lost == event__process_lost && 831 if (ops->lost == perf_event__process_lost &&
847 session->hists.stats.total_lost != 0) { 832 session->hists.stats.total_lost != 0) {
848 ui__warning("Processed %" PRIu64 " events and LOST %" PRIu64 833 ui__warning("Processed %" PRIu64 " events and LOST %" PRIu64
849 "!\n\nCheck IO/CPU overload!\n\n", 834 "!\n\nCheck IO/CPU overload!\n\n",
@@ -875,7 +860,7 @@ volatile int session_done;
875static int __perf_session__process_pipe_events(struct perf_session *self, 860static int __perf_session__process_pipe_events(struct perf_session *self,
876 struct perf_event_ops *ops) 861 struct perf_event_ops *ops)
877{ 862{
878 event_t event; 863 union perf_event event;
879 uint32_t size; 864 uint32_t size;
880 int skip = 0; 865 int skip = 0;
881 u64 head; 866 u64 head;
@@ -956,7 +941,7 @@ int __perf_session__process_events(struct perf_session *session,
956 struct ui_progress *progress; 941 struct ui_progress *progress;
957 size_t page_size, mmap_size; 942 size_t page_size, mmap_size;
958 char *buf, *mmaps[8]; 943 char *buf, *mmaps[8];
959 event_t *event; 944 union perf_event *event;
960 uint32_t size; 945 uint32_t size;
961 946
962 perf_event_ops__fill_defaults(ops); 947 perf_event_ops__fill_defaults(ops);
@@ -1001,7 +986,7 @@ remap:
1001 file_pos = file_offset + head; 986 file_pos = file_offset + head;
1002 987
1003more: 988more:
1004 event = (event_t *)(buf + head); 989 event = (union perf_event *)(buf + head);
1005 990
1006 if (session->header.needs_swap) 991 if (session->header.needs_swap)
1007 perf_event_header__bswap(&event->header); 992 perf_event_header__bswap(&event->header);
@@ -1134,3 +1119,18 @@ size_t perf_session__fprintf_dsos_buildid(struct perf_session *self, FILE *fp,
1134 size_t ret = machine__fprintf_dsos_buildid(&self->host_machine, fp, with_hits); 1119 size_t ret = machine__fprintf_dsos_buildid(&self->host_machine, fp, with_hits);
1135 return ret + machines__fprintf_dsos_buildid(&self->machines, fp, with_hits); 1120 return ret + machines__fprintf_dsos_buildid(&self->machines, fp, with_hits);
1136} 1121}
1122
1123size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp)
1124{
1125 struct perf_evsel *pos;
1126 size_t ret = fprintf(fp, "Aggregated stats:\n");
1127
1128 ret += hists__fprintf_nr_events(&session->hists, fp);
1129
1130 list_for_each_entry(pos, &session->evlist->entries, node) {
1131 ret += fprintf(fp, "%s stats:\n", event_name(pos));
1132 ret += hists__fprintf_nr_events(&pos->hists, fp);
1133 }
1134
1135 return ret;
1136}
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index decd83f274f..b5b148b0aac 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -34,12 +34,12 @@ struct perf_session {
34 struct thread *last_match; 34 struct thread *last_match;
35 struct machine host_machine; 35 struct machine host_machine;
36 struct rb_root machines; 36 struct rb_root machines;
37 struct rb_root hists_tree; 37 struct perf_evlist *evlist;
38 /* 38 /*
39 * FIXME: should point to the first entry in hists_tree and 39 * FIXME: Need to split this up further, we need global
40 * be a hists instance. Right now its only 'report' 40 * stats + per event stats. 'perf diff' also needs
41 * that is using ->hists_tree while all the rest use 41 * to properly support multiple events in a single
42 * ->hists. 42 * perf.data file.
43 */ 43 */
44 struct hists hists; 44 struct hists hists;
45 u64 sample_type; 45 u64 sample_type;
@@ -51,15 +51,17 @@ struct perf_session {
51 int cwdlen; 51 int cwdlen;
52 char *cwd; 52 char *cwd;
53 struct ordered_samples ordered_samples; 53 struct ordered_samples ordered_samples;
54 char filename[0]; 54 struct callchain_cursor callchain_cursor;
55 char filename[0];
55}; 56};
56 57
57struct perf_event_ops; 58struct perf_event_ops;
58 59
59typedef int (*event_op)(event_t *self, struct sample_data *sample, 60typedef int (*event_op)(union perf_event *self, struct perf_sample *sample,
60 struct perf_session *session); 61 struct perf_session *session);
61typedef int (*event_synth_op)(event_t *self, struct perf_session *session); 62typedef int (*event_synth_op)(union perf_event *self,
62typedef int (*event_op2)(event_t *self, struct perf_session *session, 63 struct perf_session *session);
64typedef int (*event_op2)(union perf_event *self, struct perf_session *session,
63 struct perf_event_ops *ops); 65 struct perf_event_ops *ops);
64 66
65struct perf_event_ops { 67struct perf_event_ops {
@@ -94,10 +96,10 @@ int __perf_session__process_events(struct perf_session *self,
94int perf_session__process_events(struct perf_session *self, 96int perf_session__process_events(struct perf_session *self,
95 struct perf_event_ops *event_ops); 97 struct perf_event_ops *event_ops);
96 98
97struct map_symbol *perf_session__resolve_callchain(struct perf_session *self, 99int perf_session__resolve_callchain(struct perf_session *self,
98 struct thread *thread, 100 struct thread *thread,
99 struct ip_callchain *chain, 101 struct ip_callchain *chain,
100 struct symbol **parent); 102 struct symbol **parent);
101 103
102bool perf_session__has_traces(struct perf_session *self, const char *msg); 104bool perf_session__has_traces(struct perf_session *self, const char *msg);
103 105
@@ -110,8 +112,6 @@ void mem_bswap_64(void *src, int byte_size);
110int perf_session__create_kernel_maps(struct perf_session *self); 112int perf_session__create_kernel_maps(struct perf_session *self);
111 113
112void perf_session__update_sample_type(struct perf_session *self); 114void perf_session__update_sample_type(struct perf_session *self);
113void perf_session__set_sample_id_all(struct perf_session *session, bool value);
114void perf_session__set_sample_type(struct perf_session *session, u64 type);
115void perf_session__remove_thread(struct perf_session *self, struct thread *th); 115void perf_session__remove_thread(struct perf_session *self, struct thread *th);
116 116
117static inline 117static inline
@@ -149,9 +149,14 @@ size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp);
149size_t perf_session__fprintf_dsos_buildid(struct perf_session *self, 149size_t perf_session__fprintf_dsos_buildid(struct perf_session *self,
150 FILE *fp, bool with_hits); 150 FILE *fp, bool with_hits);
151 151
152static inline 152size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp);
153size_t perf_session__fprintf_nr_events(struct perf_session *self, FILE *fp) 153
154static inline int perf_session__parse_sample(struct perf_session *session,
155 const union perf_event *event,
156 struct perf_sample *sample)
154{ 157{
155 return hists__fprintf_nr_events(&self->hists, fp); 158 return perf_event__parse_sample(event, session->sample_type,
159 session->sample_id_all, sample);
156} 160}
161
157#endif /* __PERF_SESSION_H */ 162#endif /* __PERF_SESSION_H */
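
The event_op typedefs and struct perf_event_ops reworked above form a callback table: each tool fills in only the handlers it cares about, and perf_event_ops__fill_defaults() (called from __perf_session__process_events in session.c) plugs stubs into the rest. A self-contained sketch of that pattern — the types and names below are stand-ins, not the perf definitions:

	#include <stdio.h>

	/* Illustrative stand-ins for union perf_event / struct perf_sample. */
	struct event  { int type; };
	struct sample { unsigned long long time; };

	typedef int (*event_op)(struct event *ev, struct sample *s);

	struct event_ops {
		event_op sample;
		event_op lost;
	};

	static int process_stub(struct event *ev, struct sample *s)
	{
		(void)ev; (void)s;
		return 0;	/* default: silently accept the event */
	}

	/* Mirrors the fill-defaults idea: plug stubs into NULL slots. */
	static void ops_fill_defaults(struct event_ops *ops)
	{
		if (!ops->sample)
			ops->sample = process_stub;
		if (!ops->lost)
			ops->lost = process_stub;
	}

	static int print_sample(struct event *ev, struct sample *s)
	{
		(void)ev;
		printf("sample at %llu\n", s->time);
		return 0;
	}

	int main(void)
	{
		struct event_ops ops = { .sample = print_sample }; /* .lost NULL */
		struct event ev = { 0 };
		struct sample s = { 1234 };

		ops_fill_defaults(&ops);
		ops.sample(&ev, &s);	/* tool-provided handler */
		ops.lost(&ev, &s);	/* default stub */
		return 0;
	}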
diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py
new file mode 100644
index 00000000000..e24ffadb20b
--- /dev/null
+++ b/tools/perf/util/setup.py
@@ -0,0 +1,19 @@
1#!/usr/bin/python2
2
3from distutils.core import setup, Extension
4
5perf = Extension('perf',
6 sources = ['util/python.c', 'util/ctype.c', 'util/evlist.c',
7 'util/evsel.c', 'util/cpumap.c', 'util/thread_map.c',
8 'util/util.c', 'util/xyarray.c', 'util/cgroup.c'],
9 include_dirs = ['util/include'],
10 extra_compile_args = ['-fno-strict-aliasing', '-Wno-write-strings'])
11
12setup(name='perf',
13 version='0.1',
14 description='Interface with the Linux profiling infrastructure',
15 author='Arnaldo Carvalho de Melo',
16 author_email='acme@redhat.com',
17 license='GPLv2',
18 url='http://perf.wiki.kernel.org',
19 ext_modules=[perf])
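
This distutils script is what lets the new util/python.c bindings build as a 'perf' extension module. Presumably it is driven from the perf Makefile; a by-hand build would look something like 'python2 util/setup.py build' run from tools/perf — that invocation is an assumption here, not something the patch itself shows.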
diff --git a/tools/perf/util/strfilter.c b/tools/perf/util/strfilter.c
new file mode 100644
index 00000000000..834c8ebfe38
--- /dev/null
+++ b/tools/perf/util/strfilter.c
@@ -0,0 +1,199 @@
1#include "util.h"
2#include "string.h"
3#include "strfilter.h"
4
5/* Operators */
6static const char *OP_and = "&"; /* Logical AND */
7static const char *OP_or = "|"; /* Logical OR */
8static const char *OP_not = "!"; /* Logical NOT */
9
10#define is_operator(c) ((c) == '|' || (c) == '&' || (c) == '!')
11#define is_separator(c) (is_operator(c) || (c) == '(' || (c) == ')')
12
13static void strfilter_node__delete(struct strfilter_node *self)
14{
15 if (self) {
16 if (self->p && !is_operator(*self->p))
17 free((char *)self->p);
18 strfilter_node__delete(self->l);
19 strfilter_node__delete(self->r);
20 free(self);
21 }
22}
23
24void strfilter__delete(struct strfilter *self)
25{
26 if (self) {
27 strfilter_node__delete(self->root);
28 free(self);
29 }
30}
31
32static const char *get_token(const char *s, const char **e)
33{
34 const char *p;
35
36 while (isspace(*s)) /* Skip spaces */
37 s++;
38
39 if (*s == '\0') {
40 p = s;
41 goto end;
42 }
43
44 p = s + 1;
45 if (!is_separator(*s)) {
46 /* End search */
47retry:
48 while (*p && !is_separator(*p) && !isspace(*p))
49 p++;
50 /* Escape and special case: '!' is also used in glob pattern */
51 if (*(p - 1) == '\\' || (*p == '!' && *(p - 1) == '[')) {
52 p++;
53 goto retry;
54 }
55 }
56end:
57 *e = p;
58 return s;
59}
60
61static struct strfilter_node *strfilter_node__alloc(const char *op,
62 struct strfilter_node *l,
63 struct strfilter_node *r)
64{
65 struct strfilter_node *ret = zalloc(sizeof(struct strfilter_node));
66
67 if (ret) {
68 ret->p = op;
69 ret->l = l;
70 ret->r = r;
71 }
72
73 return ret;
74}
75
76static struct strfilter_node *strfilter_node__new(const char *s,
77 const char **ep)
78{
79 struct strfilter_node root, *cur, *last_op;
80 const char *e;
81
82 if (!s)
83 return NULL;
84
85 memset(&root, 0, sizeof(root));
86 last_op = cur = &root;
87
88 s = get_token(s, &e);
89 while (*s != '\0' && *s != ')') {
90 switch (*s) {
91 case '&': /* Exchg last OP->r with AND */
92 if (!cur->r || !last_op->r)
93 goto error;
94 cur = strfilter_node__alloc(OP_and, last_op->r, NULL);
95 if (!cur)
96 goto nomem;
97 last_op->r = cur;
98 last_op = cur;
99 break;
100 case '|': /* Exchg the root with OR */
101 if (!cur->r || !root.r)
102 goto error;
103 cur = strfilter_node__alloc(OP_or, root.r, NULL);
104 if (!cur)
105 goto nomem;
106 root.r = cur;
107 last_op = cur;
108 break;
109 case '!': /* Add NOT as a leaf node */
110 if (cur->r)
111 goto error;
112 cur->r = strfilter_node__alloc(OP_not, NULL, NULL);
113 if (!cur->r)
114 goto nomem;
115 cur = cur->r;
116 break;
117 case '(': /* Recursively parse inside the parentheses */
118 if (cur->r)
119 goto error;
120 cur->r = strfilter_node__new(s + 1, &s);
121 if (!s)
122 goto nomem;
123 if (!cur->r || *s != ')')
124 goto error;
125 e = s + 1;
126 break;
127 default:
128 if (cur->r)
129 goto error;
130 cur->r = strfilter_node__alloc(NULL, NULL, NULL);
131 if (!cur->r)
132 goto nomem;
133 cur->r->p = strndup(s, e - s);
134 if (!cur->r->p)
135 goto nomem;
136 }
137 s = get_token(e, &e);
138 }
139 if (!cur->r)
140 goto error;
141 *ep = s;
142 return root.r;
143nomem:
144 s = NULL;
145error:
146 *ep = s;
147 strfilter_node__delete(root.r);
148 return NULL;
149}
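
Note the precedence the two operator cases implement: '&' splices a new node in place of last_op->r only, while '|' replaces root.r wholesale, so a rule such as "a & b | c" parses as "(a & b) | c"; parentheses recurse through the '(' case when that default grouping is not wanted.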
150
151/*
152 * Parse a filter rule and return a new strfilter.
153 * Return NULL on failure; *ep == NULL means that a memory allocation failed.
154 */
155struct strfilter *strfilter__new(const char *rules, const char **err)
156{
157 struct strfilter *ret = zalloc(sizeof(struct strfilter));
158 const char *ep = NULL;
159
160 if (ret)
161 ret->root = strfilter_node__new(rules, &ep);
162
163 if (!ret || !ret->root || *ep != '\0') {
164 if (err)
165 *err = ep;
166 strfilter__delete(ret);
167 ret = NULL;
168 }
169
170 return ret;
171}
172
173static bool strfilter_node__compare(struct strfilter_node *self,
174 const char *str)
175{
176 if (!self || !self->p)
177 return false;
178
179 switch (*self->p) {
180 case '|': /* OR */
181 return strfilter_node__compare(self->l, str) ||
182 strfilter_node__compare(self->r, str);
183 case '&': /* AND */
184 return strfilter_node__compare(self->l, str) &&
185 strfilter_node__compare(self->r, str);
186 case '!': /* NOT */
187 return !strfilter_node__compare(self->r, str);
188 default:
189 return strglobmatch(str, self->p);
190 }
191}
192
193/* Return true if STR matches the filter rules */
194bool strfilter__compare(struct strfilter *self, const char *str)
195{
196 if (!self)
197 return false;
198 return strfilter_node__compare(self->root, str);
199}
diff --git a/tools/perf/util/strfilter.h b/tools/perf/util/strfilter.h
new file mode 100644
index 00000000000..00f58a7506d
--- /dev/null
+++ b/tools/perf/util/strfilter.h
@@ -0,0 +1,48 @@
1#ifndef __PERF_STRFILTER_H
2#define __PERF_STRFILTER_H
3/* General purpose glob matching filter */
4
5#include <linux/list.h>
6#include <stdbool.h>
7
8/* A node of string filter */
9struct strfilter_node {
10 struct strfilter_node *l; /* Tree left branch (for &,|) */
11 struct strfilter_node *r; /* Tree right branch (for !,&,|) */
12 const char *p; /* Operator or rule */
13};
14
15/* String filter */
16struct strfilter {
17 struct strfilter_node *root;
18};
19
20/**
21 * strfilter__new - Create a new string filter
22 * @rules: Filter rule, which is a combination of glob expressions.
23 * @err: Pointer through which an error detected in @rules is reported
24 *
25 * Parse @rules and return a new strfilter. Return NULL if an error is
26 * detected. In that case, *@err indicates where the error was found, and
27 * *@err is NULL if a memory allocation failed.
28 */
29struct strfilter *strfilter__new(const char *rules, const char **err);
30
31/**
32 * strfilter__compare - compare given string and a string filter
33 * @self: String filter
34 * @str: target string
35 *
36 * Compare @str against @self. Return true if @str matches the rules.
37 */
38bool strfilter__compare(struct strfilter *self, const char *str);
39
40/**
41 * strfilter__delete - delete a string filter
42 * @self: String filter to delete
43 *
44 * Delete @self.
45 */
46void strfilter__delete(struct strfilter *self);
47
48#endif
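
Taken together with strfilter.c above, the API reads naturally from these kernel-doc comments. A hypothetical caller — it assumes being compiled inside tools/perf so that strfilter.c and the strglobmatch() helper it depends on get linked in:

	#include <stdio.h>
	#include "strfilter.h"

	int main(void)
	{
		const char *err = NULL;
		/* Match names starting with "sys_" but not ending in "64". */
		struct strfilter *f = strfilter__new("sys_* & !*64", &err);

		if (f == NULL) {
			fprintf(stderr, "bad rule near: %s\n",
				err ? err : "(out of memory)");
			return 1;
		}

		printf("%d\n", strfilter__compare(f, "sys_open"));   /* 1 */
		printf("%d\n", strfilter__compare(f, "sys_mmap64")); /* 0 */

		strfilter__delete(f);
		return 0;
	}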
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index b1bf490aff8..00014e32c28 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -207,7 +207,6 @@ struct dso *dso__new(const char *name)
207 dso__set_short_name(self, self->name); 207 dso__set_short_name(self, self->name);
208 for (i = 0; i < MAP__NR_TYPES; ++i) 208 for (i = 0; i < MAP__NR_TYPES; ++i)
209 self->symbols[i] = self->symbol_names[i] = RB_ROOT; 209 self->symbols[i] = self->symbol_names[i] = RB_ROOT;
210 self->slen_calculated = 0;
211 self->origin = DSO__ORIG_NOT_FOUND; 210 self->origin = DSO__ORIG_NOT_FOUND;
212 self->loaded = 0; 211 self->loaded = 0;
213 self->sorted_by_name = 0; 212 self->sorted_by_name = 0;
@@ -1525,8 +1524,8 @@ int dso__load(struct dso *self, struct map *map, symbol_filter_t filter)
1525 symbol_conf.symfs, self->long_name); 1524 symbol_conf.symfs, self->long_name);
1526 break; 1525 break;
1527 case DSO__ORIG_GUEST_KMODULE: 1526 case DSO__ORIG_GUEST_KMODULE:
1528 if (map->groups && map->groups->machine) 1527 if (map->groups && machine)
1529 root_dir = map->groups->machine->root_dir; 1528 root_dir = machine->root_dir;
1530 else 1529 else
1531 root_dir = ""; 1530 root_dir = "";
1532 snprintf(name, size, "%s%s%s", symbol_conf.symfs, 1531 snprintf(name, size, "%s%s%s", symbol_conf.symfs,
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 670cd1c88f5..4d7ed09fe33 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -132,7 +132,6 @@ struct dso {
132 struct rb_root symbol_names[MAP__NR_TYPES]; 132 struct rb_root symbol_names[MAP__NR_TYPES];
133 enum dso_kernel_type kernel; 133 enum dso_kernel_type kernel;
134 u8 adjust_symbols:1; 134 u8 adjust_symbols:1;
135 u8 slen_calculated:1;
136 u8 has_build_id:1; 135 u8 has_build_id:1;
137 u8 hit:1; 136 u8 hit:1;
138 u8 annotate_warned:1; 137 u8 annotate_warned:1;
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 00f4eade2e3..d5d3b22250f 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -7,61 +7,6 @@
7#include "util.h" 7#include "util.h"
8#include "debug.h" 8#include "debug.h"
9 9
10/* Skip "." and ".." directories */
11static int filter(const struct dirent *dir)
12{
13 if (dir->d_name[0] == '.')
14 return 0;
15 else
16 return 1;
17}
18
19struct thread_map *thread_map__new_by_pid(pid_t pid)
20{
21 struct thread_map *threads;
22 char name[256];
23 int items;
24 struct dirent **namelist = NULL;
25 int i;
26
27 sprintf(name, "/proc/%d/task", pid);
28 items = scandir(name, &namelist, filter, NULL);
29 if (items <= 0)
30 return NULL;
31
32 threads = malloc(sizeof(*threads) + sizeof(pid_t) * items);
33 if (threads != NULL) {
34 for (i = 0; i < items; i++)
35 threads->map[i] = atoi(namelist[i]->d_name);
36 threads->nr = items;
37 }
38
39 for (i=0; i<items; i++)
40 free(namelist[i]);
41 free(namelist);
42
43 return threads;
44}
45
46struct thread_map *thread_map__new_by_tid(pid_t tid)
47{
48 struct thread_map *threads = malloc(sizeof(*threads) + sizeof(pid_t));
49
50 if (threads != NULL) {
51 threads->map[0] = tid;
52 threads->nr = 1;
53 }
54
55 return threads;
56}
57
58struct thread_map *thread_map__new(pid_t pid, pid_t tid)
59{
60 if (pid != -1)
61 return thread_map__new_by_pid(pid);
62 return thread_map__new_by_tid(tid);
63}
64
65static struct thread *thread__new(pid_t pid) 10static struct thread *thread__new(pid_t pid)
66{ 11{
67 struct thread *self = zalloc(sizeof(*self)); 12 struct thread *self = zalloc(sizeof(*self));
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index d7574101054..e5f2401c1b5 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -18,24 +18,10 @@ struct thread {
18 int comm_len; 18 int comm_len;
19}; 19};
20 20
21struct thread_map {
22 int nr;
23 int map[];
24};
25
26struct perf_session; 21struct perf_session;
27 22
28void thread__delete(struct thread *self); 23void thread__delete(struct thread *self);
29 24
30struct thread_map *thread_map__new_by_pid(pid_t pid);
31struct thread_map *thread_map__new_by_tid(pid_t tid);
32struct thread_map *thread_map__new(pid_t pid, pid_t tid);
33
34static inline void thread_map__delete(struct thread_map *threads)
35{
36 free(threads);
37}
38
39int thread__set_comm(struct thread *self, const char *comm); 25int thread__set_comm(struct thread *self, const char *comm);
40int thread__comm_len(struct thread *self); 26int thread__comm_len(struct thread *self);
41struct thread *perf_session__findnew(struct perf_session *self, pid_t pid); 27struct thread *perf_session__findnew(struct perf_session *self, pid_t pid);
diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c
new file mode 100644
index 00000000000..a5df131b77c
--- /dev/null
+++ b/tools/perf/util/thread_map.c
@@ -0,0 +1,64 @@
1#include <dirent.h>
2#include <stdlib.h>
3#include <stdio.h>
4#include "thread_map.h"
5
6/* Skip "." and ".." directories */
7static int filter(const struct dirent *dir)
8{
9 if (dir->d_name[0] == '.')
10 return 0;
11 else
12 return 1;
13}
14
15struct thread_map *thread_map__new_by_pid(pid_t pid)
16{
17 struct thread_map *threads;
18 char name[256];
19 int items;
20 struct dirent **namelist = NULL;
21 int i;
22
23 sprintf(name, "/proc/%d/task", pid);
24 items = scandir(name, &namelist, filter, NULL);
25 if (items <= 0)
26 return NULL;
27
28 threads = malloc(sizeof(*threads) + sizeof(pid_t) * items);
29 if (threads != NULL) {
30 for (i = 0; i < items; i++)
31 threads->map[i] = atoi(namelist[i]->d_name);
32 threads->nr = items;
33 }
34
35 for (i=0; i<items; i++)
36 free(namelist[i]);
37 free(namelist);
38
39 return threads;
40}
41
42struct thread_map *thread_map__new_by_tid(pid_t tid)
43{
44 struct thread_map *threads = malloc(sizeof(*threads) + sizeof(pid_t));
45
46 if (threads != NULL) {
47 threads->map[0] = tid;
48 threads->nr = 1;
49 }
50
51 return threads;
52}
53
54struct thread_map *thread_map__new(pid_t pid, pid_t tid)
55{
56 if (pid != -1)
57 return thread_map__new_by_pid(pid);
58 return thread_map__new_by_tid(tid);
59}
60
61void thread_map__delete(struct thread_map *threads)
62{
63 free(threads);
64}
diff --git a/tools/perf/util/thread_map.h b/tools/perf/util/thread_map.h
new file mode 100644
index 00000000000..3cb90731140
--- /dev/null
+++ b/tools/perf/util/thread_map.h
@@ -0,0 +1,15 @@
1#ifndef __PERF_THREAD_MAP_H
2#define __PERF_THREAD_MAP_H
3
4#include <sys/types.h>
5
6struct thread_map {
7 int nr;
8 int map[];
9};
10
11struct thread_map *thread_map__new_by_pid(pid_t pid);
12struct thread_map *thread_map__new_by_tid(pid_t tid);
13struct thread_map *thread_map__new(pid_t pid, pid_t tid);
14void thread_map__delete(struct thread_map *threads);
15#endif /* __PERF_THREAD_MAP_H */
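
A short sketch of how the relocated API is used — assuming this file is compiled and linked together with thread_map.c above, it lists the threads of the calling process by scanning /proc/<pid>/task:

	#include <stdio.h>
	#include <unistd.h>
	#include "thread_map.h"

	int main(void)
	{
		struct thread_map *threads = thread_map__new(getpid(), -1);
		int i;

		if (threads == NULL)
			return 1;

		for (i = 0; i < threads->nr; i++)
			printf("tid %d\n", threads->map[i]);

		thread_map__delete(threads);
		return 0;
	}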
diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c
new file mode 100644
index 00000000000..75cfe4d4511
--- /dev/null
+++ b/tools/perf/util/top.c
@@ -0,0 +1,238 @@
1/*
2 * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
3 *
4 * Refactored from builtin-top.c, see that file for further copyright notes.
5 *
6 * Released under the GPL v2. (and only v2, not any later version)
7 */
8
9#include "cpumap.h"
10#include "event.h"
11#include "evlist.h"
12#include "evsel.h"
13#include "parse-events.h"
14#include "symbol.h"
15#include "top.h"
16#include <inttypes.h>
17
18/*
19 * Ordering weight: count-1 * count-2 * ... / count-n
20 */
21static double sym_weight(const struct sym_entry *sym, struct perf_top *top)
22{
23 double weight = sym->snap_count;
24 int counter;
25
26 if (!top->display_weighted)
27 return weight;
28
29 for (counter = 1; counter < top->evlist->nr_entries - 1; counter++)
30 weight *= sym->count[counter];
31
32 weight /= (sym->count[counter] + 1);
33
34 return weight;
35}
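
Worked through with numbers: with three events, display_weighted set, snap_count = 100, count[1] = 10 and count[2] = 4, the loop runs once and the weight comes out as 100 * 10 / (4 + 1) = 200; the + 1 in the divisor keeps a last-event count of zero from dividing by zero.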
36
37static void perf_top__remove_active_sym(struct perf_top *top, struct sym_entry *syme)
38{
39 pthread_mutex_lock(&top->active_symbols_lock);
40 list_del_init(&syme->node);
41 pthread_mutex_unlock(&top->active_symbols_lock);
42}
43
44static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
45{
46 struct rb_node **p = &tree->rb_node;
47 struct rb_node *parent = NULL;
48 struct sym_entry *iter;
49
50 while (*p != NULL) {
51 parent = *p;
52 iter = rb_entry(parent, struct sym_entry, rb_node);
53
54 if (se->weight > iter->weight)
55 p = &(*p)->rb_left;
56 else
57 p = &(*p)->rb_right;
58 }
59
60 rb_link_node(&se->rb_node, parent, p);
61 rb_insert_color(&se->rb_node, tree);
62}
63
64#define SNPRINTF(buf, size, fmt, args...) \
65({ \
66 size_t r = snprintf(buf, size, fmt, ## args); \
67 r > size ? size : r; \
68})
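
snprintf() returns the length it would have produced, which can exceed the buffer, so naive "bf + ret" chaining in the header routine below could walk past the end; the macro clamps the return to size. A standalone demonstration of the clamp (GNU C, which the statement-expression macro already requires):

	#include <stdio.h>

	#define SNPRINTF(buf, size, fmt, args...)		\
	({							\
		size_t r = snprintf(buf, size, fmt, ## args);	\
		r > size ? size : r;				\
	})

	int main(void)
	{
		char bf[8];
		/* Plain snprintf would return 12 here; SNPRINTF clamps to 8,
		 * so "bf + ret" can never point past the buffer. */
		size_t ret = SNPRINTF(bf, sizeof(bf), "hello world!");

		printf("ret=%zu buf=\"%s\"\n", ret, bf); /* ret=8 buf="hello w" */
		return 0;
	}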
69
70size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size)
71{
72 struct perf_evsel *counter;
73 float samples_per_sec = top->samples / top->delay_secs;
74 float ksamples_per_sec = top->kernel_samples / top->delay_secs;
75 float esamples_percent = (100.0 * top->exact_samples) / top->samples;
76 size_t ret = 0;
77
78 if (!perf_guest) {
79 ret = SNPRINTF(bf, size,
80 " PerfTop:%8.0f irqs/sec kernel:%4.1f%%"
81 " exact: %4.1f%% [", samples_per_sec,
82 100.0 - (100.0 * ((samples_per_sec - ksamples_per_sec) /
83 samples_per_sec)),
84 esamples_percent);
85 } else {
86 float us_samples_per_sec = top->us_samples / top->delay_secs;
87 float guest_kernel_samples_per_sec = top->guest_kernel_samples / top->delay_secs;
88 float guest_us_samples_per_sec = top->guest_us_samples / top->delay_secs;
89
90 ret = SNPRINTF(bf, size,
91 " PerfTop:%8.0f irqs/sec kernel:%4.1f%% us:%4.1f%%"
92 " guest kernel:%4.1f%% guest us:%4.1f%%"
93 " exact: %4.1f%% [", samples_per_sec,
94 100.0 - (100.0 * ((samples_per_sec - ksamples_per_sec) /
95 samples_per_sec)),
96 100.0 - (100.0 * ((samples_per_sec - us_samples_per_sec) /
97 samples_per_sec)),
98 100.0 - (100.0 * ((samples_per_sec -
99 guest_kernel_samples_per_sec) /
100 samples_per_sec)),
101 100.0 - (100.0 * ((samples_per_sec -
102 guest_us_samples_per_sec) /
103 samples_per_sec)),
104 esamples_percent);
105 }
106
107 if (top->evlist->nr_entries == 1 || !top->display_weighted) {
108 struct perf_evsel *first;
109 first = list_entry(top->evlist->entries.next, struct perf_evsel, node);
110 ret += SNPRINTF(bf + ret, size - ret, "%" PRIu64 "%s ",
111 (uint64_t)first->attr.sample_period,
112 top->freq ? "Hz" : "");
113 }
114
115 if (!top->display_weighted) {
116 ret += SNPRINTF(bf + ret, size - ret, "%s",
117 event_name(top->sym_evsel));
118 } else {
119 /*
120 * Don't let events eat all the space. Leaving 30 bytes
121 * for the rest should be enough.
122 */
123 size_t last_pos = size - 30;
124
125 list_for_each_entry(counter, &top->evlist->entries, node) {
126 ret += SNPRINTF(bf + ret, size - ret, "%s%s",
127 counter->idx ? "/" : "",
128 event_name(counter));
129 if (ret > last_pos) {
130 sprintf(bf + last_pos - 3, "..");
131 ret = last_pos - 1;
132 break;
133 }
134 }
135 }
136
137 ret += SNPRINTF(bf + ret, size - ret, "], ");
138
139 if (top->target_pid != -1)
140 ret += SNPRINTF(bf + ret, size - ret, " (target_pid: %d",
141 top->target_pid);
142 else if (top->target_tid != -1)
143 ret += SNPRINTF(bf + ret, size - ret, " (target_tid: %d",
144 top->target_tid);
145 else
146 ret += SNPRINTF(bf + ret, size - ret, " (all");
147
148 if (top->cpu_list)
149 ret += SNPRINTF(bf + ret, size - ret, ", CPU%s: %s)",
150 top->evlist->cpus->nr > 1 ? "s" : "", top->cpu_list);
151 else {
152 if (top->target_tid != -1)
153 ret += SNPRINTF(bf + ret, size - ret, ")");
154 else
155 ret += SNPRINTF(bf + ret, size - ret, ", %d CPU%s)",
156 top->evlist->cpus->nr,
157 top->evlist->cpus->nr > 1 ? "s" : "");
158 }
159
160 return ret;
161}
162
163void perf_top__reset_sample_counters(struct perf_top *top)
164{
165 top->samples = top->us_samples = top->kernel_samples =
166 top->exact_samples = top->guest_kernel_samples =
167 top->guest_us_samples = 0;
168}
169
170float perf_top__decay_samples(struct perf_top *top, struct rb_root *root)
171{
172 struct sym_entry *syme, *n;
173 float sum_ksamples = 0.0;
174 int snap = !top->display_weighted ? top->sym_counter : 0, j;
175
176 /* Sort the active symbols */
177 pthread_mutex_lock(&top->active_symbols_lock);
178 syme = list_entry(top->active_symbols.next, struct sym_entry, node);
179 pthread_mutex_unlock(&top->active_symbols_lock);
180
181 top->rb_entries = 0;
182 list_for_each_entry_safe_from(syme, n, &top->active_symbols, node) {
183 syme->snap_count = syme->count[snap];
184 if (syme->snap_count != 0) {
185
186 if ((top->hide_user_symbols &&
187 syme->origin == PERF_RECORD_MISC_USER) ||
188 (top->hide_kernel_symbols &&
189 syme->origin == PERF_RECORD_MISC_KERNEL)) {
190 perf_top__remove_active_sym(top, syme);
191 continue;
192 }
193 syme->weight = sym_weight(syme, top);
194
195 if ((int)syme->snap_count >= top->count_filter) {
196 rb_insert_active_sym(root, syme);
197 ++top->rb_entries;
198 }
199 sum_ksamples += syme->snap_count;
200
201 for (j = 0; j < top->evlist->nr_entries; j++)
202 syme->count[j] = top->zero ? 0 : syme->count[j] * 7 / 8;
203 } else
204 perf_top__remove_active_sym(top, syme);
205 }
206
207 return sum_ksamples;
208}
209
210/*
211 * Find the longest symbol name that will be displayed
212 */
213void perf_top__find_widths(struct perf_top *top, struct rb_root *root,
214 int *dso_width, int *dso_short_width, int *sym_width)
215{
216 struct rb_node *nd;
217 int printed = 0;
218
219 *sym_width = *dso_width = *dso_short_width = 0;
220
221 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
222 struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node);
223 struct symbol *sym = sym_entry__symbol(syme);
224
225 if (++printed > top->print_entries ||
226 (int)syme->snap_count < top->count_filter)
227 continue;
228
229 if (syme->map->dso->long_name_len > *dso_width)
230 *dso_width = syme->map->dso->long_name_len;
231
232 if (syme->map->dso->short_name_len > *dso_short_width)
233 *dso_short_width = syme->map->dso->short_name_len;
234
235 if (sym->namelen > *sym_width)
236 *sym_width = sym->namelen;
237 }
238}
diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
new file mode 100644
index 00000000000..96d1cb78af0
--- /dev/null
+++ b/tools/perf/util/top.h
@@ -0,0 +1,66 @@
1#ifndef __PERF_TOP_H
2#define __PERF_TOP_H 1
3
4#include "types.h"
5#include "../perf.h"
6#include <stddef.h>
7#include <pthread.h>
8#include <linux/list.h>
9#include <linux/rbtree.h>
10
11struct perf_evlist;
12struct perf_evsel;
13
14struct sym_entry {
15 struct rb_node rb_node;
16 struct list_head node;
17 unsigned long snap_count;
18 double weight;
19 int skip;
20 u8 origin;
21 struct map *map;
22 unsigned long count[0];
23};
24
25static inline struct symbol *sym_entry__symbol(struct sym_entry *self)
26{
27 return ((void *)self) + symbol_conf.priv_size;
28}
29
30struct perf_top {
31 struct perf_evlist *evlist;
32 /*
33 * Symbols will be added here in perf_event__process_sample and will
34 * be removed once they have decayed.
35 */
36 struct list_head active_symbols;
37 pthread_mutex_t active_symbols_lock;
38 pthread_cond_t active_symbols_cond;
39 u64 samples;
40 u64 kernel_samples, us_samples;
41 u64 exact_samples;
42 u64 guest_us_samples, guest_kernel_samples;
43 int print_entries, count_filter, delay_secs;
44 int display_weighted, freq, rb_entries, sym_counter;
45 pid_t target_pid, target_tid;
46 bool hide_kernel_symbols, hide_user_symbols, zero;
47 const char *cpu_list;
48 struct sym_entry *sym_filter_entry;
49 struct perf_evsel *sym_evsel;
50};
51
52size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size);
53void perf_top__reset_sample_counters(struct perf_top *top);
54float perf_top__decay_samples(struct perf_top *top, struct rb_root *root);
55void perf_top__find_widths(struct perf_top *top, struct rb_root *root,
56 int *dso_width, int *dso_short_width, int *sym_width);
57
58#ifdef NO_NEWT_SUPPORT
59static inline int perf_top__tui_browser(struct perf_top *top __used)
60{
61 return 0;
62}
63#else
64int perf_top__tui_browser(struct perf_top *top);
65#endif
66#endif /* __PERF_TOP_H */
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 73a02223c62..d8e622dd738 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -153,7 +153,7 @@ void parse_proc_kallsyms(char *file, unsigned int size __unused)
153 char *next = NULL; 153 char *next = NULL;
154 char *addr_str; 154 char *addr_str;
155 char ch; 155 char ch;
156 int ret; 156 int ret __used;
157 int i; 157 int i;
158 158
159 line = strtok_r(file, "\n", &next); 159 line = strtok_r(file, "\n", &next);
diff --git a/tools/perf/util/ui/browser.c b/tools/perf/util/ui/browser.c
index 8bc010edca2..611219f8068 100644
--- a/tools/perf/util/ui/browser.c
+++ b/tools/perf/util/ui/browser.c
@@ -1,4 +1,5 @@
1#include "libslang.h" 1#include "libslang.h"
2#include "ui.h"
2#include <linux/compiler.h> 3#include <linux/compiler.h>
3#include <linux/list.h> 4#include <linux/list.h>
4#include <linux/rbtree.h> 5#include <linux/rbtree.h>
@@ -156,6 +157,20 @@ void ui_browser__add_exit_keys(struct ui_browser *self, int keys[])
156 } 157 }
157} 158}
158 159
160void __ui_browser__show_title(struct ui_browser *browser, const char *title)
161{
162 SLsmg_gotorc(0, 0);
163 ui_browser__set_color(browser, NEWT_COLORSET_ROOT);
164 slsmg_write_nstring(title, browser->width);
165}
166
167void ui_browser__show_title(struct ui_browser *browser, const char *title)
168{
169 pthread_mutex_lock(&ui__lock);
170 __ui_browser__show_title(browser, title);
171 pthread_mutex_unlock(&ui__lock);
172}
173
159int ui_browser__show(struct ui_browser *self, const char *title, 174int ui_browser__show(struct ui_browser *self, const char *title,
160 const char *helpline, ...) 175 const char *helpline, ...)
161{ 176{
@@ -178,9 +193,8 @@ int ui_browser__show(struct ui_browser *self, const char *title,
178 if (self->sb == NULL) 193 if (self->sb == NULL)
179 return -1; 194 return -1;
180 195
181 SLsmg_gotorc(0, 0); 196 pthread_mutex_lock(&ui__lock);
182 ui_browser__set_color(self, NEWT_COLORSET_ROOT); 197 __ui_browser__show_title(self, title);
183 slsmg_write_nstring(title, self->width);
184 198
185 ui_browser__add_exit_keys(self, keys); 199 ui_browser__add_exit_keys(self, keys);
186 newtFormAddComponent(self->form, self->sb); 200 newtFormAddComponent(self->form, self->sb);
@@ -188,25 +202,30 @@ int ui_browser__show(struct ui_browser *self, const char *title,
188 va_start(ap, helpline); 202 va_start(ap, helpline);
189 ui_helpline__vpush(helpline, ap); 203 ui_helpline__vpush(helpline, ap);
190 va_end(ap); 204 va_end(ap);
205 pthread_mutex_unlock(&ui__lock);
191 return 0; 206 return 0;
192} 207}
193 208
194void ui_browser__hide(struct ui_browser *self) 209void ui_browser__hide(struct ui_browser *self)
195{ 210{
211 pthread_mutex_lock(&ui__lock);
196 newtFormDestroy(self->form); 212 newtFormDestroy(self->form);
197 self->form = NULL; 213 self->form = NULL;
198 ui_helpline__pop(); 214 ui_helpline__pop();
215 pthread_mutex_unlock(&ui__lock);
199} 216}
200 217
201int ui_browser__refresh(struct ui_browser *self) 218int ui_browser__refresh(struct ui_browser *self)
202{ 219{
203 int row; 220 int row;
204 221
222 pthread_mutex_lock(&ui__lock);
205 newtScrollbarSet(self->sb, self->index, self->nr_entries - 1); 223 newtScrollbarSet(self->sb, self->index, self->nr_entries - 1);
206 row = self->refresh(self); 224 row = self->refresh(self);
207 ui_browser__set_color(self, HE_COLORSET_NORMAL); 225 ui_browser__set_color(self, HE_COLORSET_NORMAL);
208 SLsmg_fill_region(self->y + row, self->x, 226 SLsmg_fill_region(self->y + row, self->x,
209 self->height - row, self->width, ' '); 227 self->height - row, self->width, ' ');
228 pthread_mutex_unlock(&ui__lock);
210 229
211 return 0; 230 return 0;
212} 231}
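
Every drawing path in browser.c now takes the new ui__lock, since the underlying newt/SLang calls are not thread-safe and the refresh timer can trigger a redraw while another thread is on screen. A minimal standalone sketch of the pattern — the names are illustrative, and a printf stands in for the screen calls (link with -lpthread):

	#include <pthread.h>
	#include <stdio.h>

	/* One process-wide lock serializing all screen output, like ui__lock. */
	static pthread_mutex_t ui_lock = PTHREAD_MUTEX_INITIALIZER;

	static void draw(const char *who)
	{
		pthread_mutex_lock(&ui_lock);
		/* In perf this region holds the newt/SLang calls; here a
		 * printf stands in for them. */
		printf("[%s] redraw\n", who);
		pthread_mutex_unlock(&ui_lock);
	}

	static void *timer_thread(void *arg)
	{
		(void)arg;
		draw("timer");
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, timer_thread, NULL);
		draw("main");
		pthread_join(t, NULL);
		return 0;
	}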
diff --git a/tools/perf/util/ui/browser.h b/tools/perf/util/ui/browser.h
index 0dc7e4da36f..fc63dda1091 100644
--- a/tools/perf/util/ui/browser.h
+++ b/tools/perf/util/ui/browser.h
@@ -24,7 +24,6 @@ struct ui_browser {
24 u32 nr_entries; 24 u32 nr_entries;
25}; 25};
26 26
27
28void ui_browser__set_color(struct ui_browser *self, int color); 27void ui_browser__set_color(struct ui_browser *self, int color);
29void ui_browser__set_percent_color(struct ui_browser *self, 28void ui_browser__set_percent_color(struct ui_browser *self,
30 double percent, bool current); 29 double percent, bool current);
@@ -35,6 +34,8 @@ void ui_browser__reset_index(struct ui_browser *self);
35void ui_browser__gotorc(struct ui_browser *self, int y, int x); 34void ui_browser__gotorc(struct ui_browser *self, int y, int x);
36void ui_browser__add_exit_key(struct ui_browser *self, int key); 35void ui_browser__add_exit_key(struct ui_browser *self, int key);
37void ui_browser__add_exit_keys(struct ui_browser *self, int keys[]); 36void ui_browser__add_exit_keys(struct ui_browser *self, int keys[]);
37void __ui_browser__show_title(struct ui_browser *browser, const char *title);
38void ui_browser__show_title(struct ui_browser *browser, const char *title);
38int ui_browser__show(struct ui_browser *self, const char *title, 39int ui_browser__show(struct ui_browser *self, const char *title,
39 const char *helpline, ...); 40 const char *helpline, ...);
40void ui_browser__hide(struct ui_browser *self); 41void ui_browser__hide(struct ui_browser *self);
diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c
index 82b78f99251..8c17a8730e4 100644
--- a/tools/perf/util/ui/browsers/annotate.c
+++ b/tools/perf/util/ui/browsers/annotate.c
@@ -1,9 +1,12 @@
1#include "../browser.h" 1#include "../browser.h"
2#include "../helpline.h" 2#include "../helpline.h"
3#include "../libslang.h" 3#include "../libslang.h"
4#include "../../annotate.h"
4#include "../../hist.h" 5#include "../../hist.h"
5#include "../../sort.h" 6#include "../../sort.h"
6#include "../../symbol.h" 7#include "../../symbol.h"
8#include "../../annotate.h"
9#include <pthread.h>
7 10
8static void ui__error_window(const char *fmt, ...) 11static void ui__error_window(const char *fmt, ...)
9{ 12{
@@ -42,8 +45,6 @@ static void annotate_browser__write(struct ui_browser *self, void *entry, int ro
42 struct objdump_line_rb_node *olrb = objdump_line__rb(ol); 45 struct objdump_line_rb_node *olrb = objdump_line__rb(ol);
43 ui_browser__set_percent_color(self, olrb->percent, current_entry); 46 ui_browser__set_percent_color(self, olrb->percent, current_entry);
44 slsmg_printf(" %7.2f ", olrb->percent); 47 slsmg_printf(" %7.2f ", olrb->percent);
45 if (!current_entry)
46 ui_browser__set_color(self, HE_COLORSET_CODE);
47 } else { 48 } else {
48 ui_browser__set_percent_color(self, 0, current_entry); 49 ui_browser__set_percent_color(self, 0, current_entry);
49 slsmg_write_nstring(" ", 9); 50 slsmg_write_nstring(" ", 9);
@@ -55,35 +56,40 @@ static void annotate_browser__write(struct ui_browser *self, void *entry, int ro
55 slsmg_write_nstring(" ", width - 18); 56 slsmg_write_nstring(" ", width - 18);
56 else 57 else
57 slsmg_write_nstring(ol->line, width - 18); 58 slsmg_write_nstring(ol->line, width - 18);
59
60 if (!current_entry)
61 ui_browser__set_color(self, HE_COLORSET_CODE);
58} 62}
59 63
60static double objdump_line__calc_percent(struct objdump_line *self, 64static double objdump_line__calc_percent(struct objdump_line *self,
61 struct list_head *head, 65 struct symbol *sym, int evidx)
62 struct symbol *sym)
63{ 66{
64 double percent = 0.0; 67 double percent = 0.0;
65 68
66 if (self->offset != -1) { 69 if (self->offset != -1) {
67 int len = sym->end - sym->start; 70 int len = sym->end - sym->start;
68 unsigned int hits = 0; 71 unsigned int hits = 0;
69 struct sym_priv *priv = symbol__priv(sym); 72 struct annotation *notes = symbol__annotation(sym);
70 struct sym_ext *sym_ext = priv->ext; 73 struct source_line *src_line = notes->src->lines;
71 struct sym_hist *h = priv->hist; 74 struct sym_hist *h = annotation__histogram(notes, evidx);
72 s64 offset = self->offset; 75 s64 offset = self->offset;
73 struct objdump_line *next = objdump__get_next_ip_line(head, self); 76 struct objdump_line *next;
74
75 77
78 next = objdump__get_next_ip_line(&notes->src->source, self);
76 while (offset < (s64)len && 79 while (offset < (s64)len &&
77 (next == NULL || offset < next->offset)) { 80 (next == NULL || offset < next->offset)) {
78 if (sym_ext) { 81 if (src_line) {
79 percent += sym_ext[offset].percent; 82 percent += src_line[offset].percent;
80 } else 83 } else
81 hits += h->ip[offset]; 84 hits += h->addr[offset];
82 85
83 ++offset; 86 ++offset;
84 } 87 }
85 88 /*
86 if (sym_ext == NULL && h->sum) 89 * If the percentage wasn't already calculated in
90 * symbol__get_source_line, do it now:
91 */
92 if (src_line == NULL && h->sum)
87 percent = 100.0 * hits / h->sum; 93 percent = 100.0 * hits / h->sum;
88 } 94 }
89 95
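
In concrete terms: if the histogram buckets between this line's offset and the next line's offset sum to hits = 25 and the symbol-wide total is h->sum = 500, the line is annotated with 100.0 * 25 / 500 = 5.00%.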
@@ -133,103 +139,161 @@ static void annotate_browser__set_top(struct annotate_browser *self,
133 self->curr_hot = nd; 139 self->curr_hot = nd;
134} 140}
135 141
136static int annotate_browser__run(struct annotate_browser *self) 142static void annotate_browser__calc_percent(struct annotate_browser *browser,
143 int evidx)
137{ 144{
138 struct rb_node *nd; 145 struct symbol *sym = browser->b.priv;
139 struct hist_entry *he = self->b.priv; 146 struct annotation *notes = symbol__annotation(sym);
140 int key; 147 struct objdump_line *pos;
141 148
142 if (ui_browser__show(&self->b, he->ms.sym->name, 149 browser->entries = RB_ROOT;
143 "<-, -> or ESC: exit, TAB/shift+TAB: cycle thru samples") < 0) 150
144 return -1; 151 pthread_mutex_lock(&notes->lock);
152
153 list_for_each_entry(pos, &notes->src->source, node) {
154 struct objdump_line_rb_node *rbpos = objdump_line__rb(pos);
155 rbpos->percent = objdump_line__calc_percent(pos, sym, evidx);
156 if (rbpos->percent < 0.01) {
157 RB_CLEAR_NODE(&rbpos->rb_node);
158 continue;
159 }
160 objdump__insert_line(&browser->entries, rbpos);
161 }
162 pthread_mutex_unlock(&notes->lock);
163
164 browser->curr_hot = rb_last(&browser->entries);
165}
166
167static int annotate_browser__run(struct annotate_browser *self, int evidx,
168 int refresh)
169{
170 struct rb_node *nd = NULL;
171 struct symbol *sym = self->b.priv;
145 /* 172 /*
146 * To allow builtin-annotate to cycle thru multiple symbols by 173 * RIGHT To allow builtin-annotate to cycle thru multiple symbols by
147 * examining the exit key for this function. 174 * examining the exit key for this function.
148 */ 175 */
149 ui_browser__add_exit_key(&self->b, NEWT_KEY_RIGHT); 176 int exit_keys[] = { 'H', NEWT_KEY_TAB, NEWT_KEY_UNTAB,
177 NEWT_KEY_RIGHT, 0 };
178 int key;
179
180 if (ui_browser__show(&self->b, sym->name,
181 "<-, -> or ESC: exit, TAB/shift+TAB: "
182 "cycle hottest lines, H: Hottest") < 0)
183 return -1;
184
185 ui_browser__add_exit_keys(&self->b, exit_keys);
186 annotate_browser__calc_percent(self, evidx);
187
188 if (self->curr_hot)
189 annotate_browser__set_top(self, self->curr_hot);
150 190
151 nd = self->curr_hot; 191 nd = self->curr_hot;
152 if (nd) { 192
153 int tabs[] = { NEWT_KEY_TAB, NEWT_KEY_UNTAB, 0 }; 193 if (refresh != 0)
154 ui_browser__add_exit_keys(&self->b, tabs); 194 newtFormSetTimer(self->b.form, refresh);
155 }
156 195
157 while (1) { 196 while (1) {
158 key = ui_browser__run(&self->b); 197 key = ui_browser__run(&self->b);
159 198
199 if (refresh != 0) {
200 annotate_browser__calc_percent(self, evidx);
201 /*
202 * Current line focus got out of the list of most active
203 * lines, NULL it so that if TAB|UNTAB is pressed, we
204 * move to curr_hot (current hottest line).
205 */
206 if (nd != NULL && RB_EMPTY_NODE(nd))
207 nd = NULL;
+		}
+
 		switch (key) {
+		case -1:
+			/*
+			 * FIXME we need to check if it was
+			 * es.reason == NEWT_EXIT_TIMER
+			 */
+			if (refresh != 0)
+				symbol__annotate_decay_histogram(sym, evidx);
+			continue;
 		case NEWT_KEY_TAB:
-			nd = rb_prev(nd);
-			if (nd == NULL)
-				nd = rb_last(&self->entries);
-			annotate_browser__set_top(self, nd);
+			if (nd != NULL) {
+				nd = rb_prev(nd);
+				if (nd == NULL)
+					nd = rb_last(&self->entries);
+			} else
+				nd = self->curr_hot;
 			break;
 		case NEWT_KEY_UNTAB:
-			nd = rb_next(nd);
-			if (nd == NULL)
-				nd = rb_first(&self->entries);
-			annotate_browser__set_top(self, nd);
+			if (nd != NULL)
+				nd = rb_next(nd);
+			if (nd == NULL)
+				nd = rb_first(&self->entries);
+			else
+				nd = self->curr_hot;
+			break;
+		case 'H':
+			nd = self->curr_hot;
 			break;
 		default:
 			goto out;
 		}
+
+		if (nd != NULL)
+			annotate_browser__set_top(self, nd);
 	}
out:
 	ui_browser__hide(&self->b);
 	return key;
 }
 
-int hist_entry__tui_annotate(struct hist_entry *self)
+int hist_entry__tui_annotate(struct hist_entry *he, int evidx)
+{
+	return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx, 0);
+}
+
+int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx,
+			 int refresh)
 {
 	struct objdump_line *pos, *n;
-	struct objdump_line_rb_node *rbpos;
-	LIST_HEAD(head);
+	struct annotation *notes = symbol__annotation(sym);
 	struct annotate_browser browser = {
 		.b = {
-			.entries = &head,
+			.entries = &notes->src->source,
 			.refresh = ui_browser__list_head_refresh,
 			.seek = ui_browser__list_head_seek,
 			.write = annotate_browser__write,
-			.priv = self,
+			.priv = sym,
 		},
 	};
 	int ret;
 
-	if (self->ms.sym == NULL)
+	if (sym == NULL)
 		return -1;
 
-	if (self->ms.map->dso->annotate_warned)
+	if (map->dso->annotate_warned)
 		return -1;
 
-	if (hist_entry__annotate(self, &head, sizeof(*rbpos)) < 0) {
+	if (symbol__annotate(sym, map, sizeof(struct objdump_line_rb_node)) < 0) {
 		ui__error_window(ui_helpline__last_msg);
 		return -1;
 	}
 
 	ui_helpline__push("Press <- or ESC to exit");
 
-	list_for_each_entry(pos, &head, node) {
+	list_for_each_entry(pos, &notes->src->source, node) {
+		struct objdump_line_rb_node *rbpos;
 		size_t line_len = strlen(pos->line);
+
 		if (browser.b.width < line_len)
 			browser.b.width = line_len;
 		rbpos = objdump_line__rb(pos);
 		rbpos->idx = browser.b.nr_entries++;
-		rbpos->percent = objdump_line__calc_percent(pos, &head, self->ms.sym);
-		if (rbpos->percent < 0.01)
-			continue;
-		objdump__insert_line(&browser.entries, rbpos);
 	}
 
-	/*
-	 * Position the browser at the hottest line.
-	 */
-	browser.curr_hot = rb_last(&browser.entries);
-	if (browser.curr_hot)
-		annotate_browser__set_top(&browser, browser.curr_hot);
-
 	browser.b.width += 18; /* Percentage */
-	ret = annotate_browser__run(&browser);
-	list_for_each_entry_safe(pos, n, &head, node) {
+	ret = annotate_browser__run(&browser, evidx, refresh);
+	list_for_each_entry_safe(pos, n, &notes->src->source, node) {
 		list_del(&pos->node);
 		objdump_line__free(pos);
 	}
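
The case -1 arm added above is what makes live annotation work: when
symbol__tui_annotate() is entered with a non-zero refresh interval,
newtFormSetTimer() makes the browser loop wake up periodically (newt
reports the timer as key == -1), and symbol__annotate_decay_histogram()
ages the per-line sample counts so stale lines fade while recently hot
ones stay on top. A minimal stand-alone sketch of that aging step; the
struct and function names here are illustrative, not the perf ones:

    #include <stddef.h>
    #include <stdio.h>

    struct line_samples {
        unsigned int nr_samples; /* hits attributed to one disassembly line */
    };

    /* Halve every counter on each timer tick: exponential decay that keeps
     * currently hot lines prominent while old activity drops toward zero. */
    static void decay_histogram(struct line_samples *lines, size_t nr_lines)
    {
        for (size_t i = 0; i < nr_lines; i++)
            lines[i].nr_samples /= 2;
    }

    int main(void)
    {
        struct line_samples lines[] = { { 8 }, { 3 }, { 1 } };

        decay_histogram(lines, 3);
        printf("%u %u %u\n", lines[0].nr_samples,
               lines[1].nr_samples, lines[2].nr_samples); /* 4 1 0 */
        return 0;
    }
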
diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c
index 60c463c1602..798efdca3ea 100644
--- a/tools/perf/util/ui/browsers/hists.c
+++ b/tools/perf/util/ui/browsers/hists.c
@@ -7,6 +7,8 @@
 #include <newt.h>
 #include <linux/rbtree.h>
 
+#include "../../evsel.h"
+#include "../../evlist.h"
 #include "../../hist.h"
 #include "../../pstack.h"
 #include "../../sort.h"
@@ -292,7 +294,8 @@ static int hist_browser__run(struct hist_browser *self, const char *title)
 {
 	int key;
 	int exit_keys[] = { 'a', '?', 'h', 'C', 'd', 'D', 'E', 't',
-			    NEWT_KEY_ENTER, NEWT_KEY_RIGHT, NEWT_KEY_LEFT, 0, };
+			    NEWT_KEY_ENTER, NEWT_KEY_RIGHT, NEWT_KEY_LEFT,
+			    NEWT_KEY_TAB, NEWT_KEY_UNTAB, 0, };
 
 	self->b.entries = &self->hists->entries;
 	self->b.nr_entries = self->hists->nr_entries;
@@ -377,7 +380,7 @@ static int hist_browser__show_callchain_node_rb_tree(struct hist_browser *self,
 	while (node) {
 		struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
 		struct rb_node *next = rb_next(node);
-		u64 cumul = cumul_hits(child);
+		u64 cumul = callchain_cumul_hits(child);
 		struct callchain_list *chain;
 		char folded_sign = ' ';
 		int first = true;
@@ -638,6 +641,9 @@ static void ui_browser__hists_seek(struct ui_browser *self,
 	struct rb_node *nd;
 	bool first = true;
 
+	if (self->nr_entries == 0)
+		return;
+
 	switch (whence) {
 	case SEEK_SET:
 		nd = hists__filter_entries(rb_first(self->entries));
@@ -797,8 +803,11 @@ static int hists__browser_title(struct hists *self, char *bf, size_t size,
 	return printed;
 }
 
-int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
+static int perf_evsel__hists_browse(struct perf_evsel *evsel,
+				    const char *helpline, const char *ev_name,
+				    bool left_exits)
 {
+	struct hists *self = &evsel->hists;
 	struct hist_browser *browser = hist_browser__new(self);
 	struct pstack *fstack;
 	const struct thread *thread_filter = NULL;
@@ -818,8 +827,8 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
 	hists__browser_title(self, msg, sizeof(msg), ev_name,
 			     dso_filter, thread_filter);
 	while (1) {
-		const struct thread *thread;
-		const struct dso *dso;
+		const struct thread *thread = NULL;
+		const struct dso *dso = NULL;
 		char *options[16];
 		int nr_options = 0, choice = 0, i,
 		    annotate = -2, zoom_dso = -2, zoom_thread = -2,
@@ -827,8 +836,10 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
 
 		key = hist_browser__run(browser, msg);
 
-		thread = hist_browser__selected_thread(browser);
-		dso = browser->selection->map ? browser->selection->map->dso : NULL;
+		if (browser->he_selection != NULL) {
+			thread = hist_browser__selected_thread(browser);
+			dso = browser->selection->map ? browser->selection->map->dso : NULL;
+		}
 
 		switch (key) {
 		case NEWT_KEY_TAB:
@@ -839,7 +850,8 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
 			 */
 			goto out_free_stack;
 		case 'a':
-			if (browser->selection->map == NULL &&
+			if (browser->selection == NULL ||
+			    browser->selection->map == NULL ||
 			    browser->selection->map->dso->annotate_warned)
 				continue;
 			goto do_annotate;
@@ -858,6 +870,7 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
 					"E         Expand all callchains\n"
 					"d         Zoom into current DSO\n"
 					"t         Zoom into current Thread\n"
+					"TAB/UNTAB Switch events\n"
 					"q/CTRL+C  Exit browser");
 			continue;
 		case NEWT_KEY_ENTER:
@@ -867,8 +880,14 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
 		case NEWT_KEY_LEFT: {
 			const void *top;
 
-			if (pstack__empty(fstack))
+			if (pstack__empty(fstack)) {
+				/*
+				 * Go back to the perf_evsel_menu__run or other user
+				 */
+				if (left_exits)
+					goto out_free_stack;
 				continue;
+			}
 			top = pstack__pop(fstack);
 			if (top == &dso_filter)
 				goto zoom_out_dso;
@@ -877,14 +896,16 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
 			continue;
 		}
 		case NEWT_KEY_ESCAPE:
-			if (!ui__dialog_yesno("Do you really want to exit?"))
+			if (!left_exits &&
+			    !ui__dialog_yesno("Do you really want to exit?"))
 				continue;
 			/* Fall thru */
 		default:
 			goto out_free_stack;
 		}
 
-		if (browser->selection->sym != NULL &&
+		if (browser->selection != NULL &&
+		    browser->selection->sym != NULL &&
 		    !browser->selection->map->dso->annotate_warned &&
 		    asprintf(&options[nr_options], "Annotate %s",
 			     browser->selection->sym->name) > 0)
@@ -903,7 +924,8 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
 			     (dso->kernel ? "the Kernel" : dso->short_name)) > 0)
 			zoom_dso = nr_options++;
 
-		if (browser->selection->map != NULL &&
+		if (browser->selection != NULL &&
+		    browser->selection->map != NULL &&
 		    asprintf(&options[nr_options], "Browse map details") > 0)
 			browse_map = nr_options++;
 
@@ -923,19 +945,11 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
 		if (choice == annotate) {
 			struct hist_entry *he;
do_annotate:
-			if (browser->selection->map->dso->origin == DSO__ORIG_KERNEL) {
-				browser->selection->map->dso->annotate_warned = 1;
-				ui_helpline__puts("No vmlinux file found, can't "
-						  "annotate with just a "
-						  "kallsyms file");
-				continue;
-			}
-
 			he = hist_browser__selected_entry(browser);
 			if (he == NULL)
 				continue;
 
-			hist_entry__tui_annotate(he);
+			hist_entry__tui_annotate(he, evsel->idx);
 		} else if (choice == browse_map)
 			map__browse(browser->selection->map);
 		else if (choice == zoom_dso) {
@@ -984,30 +998,141 @@ out:
 	return key;
 }
 
-int hists__tui_browse_tree(struct rb_root *self, const char *help)
+struct perf_evsel_menu {
+	struct ui_browser b;
+	struct perf_evsel *selection;
+};
+
+static void perf_evsel_menu__write(struct ui_browser *browser,
+				   void *entry, int row)
+{
+	struct perf_evsel_menu *menu = container_of(browser,
+						    struct perf_evsel_menu, b);
+	struct perf_evsel *evsel = list_entry(entry, struct perf_evsel, node);
+	bool current_entry = ui_browser__is_current_entry(browser, row);
+	unsigned long nr_events = evsel->hists.stats.nr_events[PERF_RECORD_SAMPLE];
+	const char *ev_name = event_name(evsel);
+	char bf[256], unit;
+
+	ui_browser__set_color(browser, current_entry ? HE_COLORSET_SELECTED :
+						       HE_COLORSET_NORMAL);
+
+	nr_events = convert_unit(nr_events, &unit);
+	snprintf(bf, sizeof(bf), "%lu%c%s%s", nr_events,
+		 unit, unit == ' ' ? "" : " ", ev_name);
+	slsmg_write_nstring(bf, browser->width);
+
+	if (current_entry)
+		menu->selection = evsel;
+}
+
+static int perf_evsel_menu__run(struct perf_evsel_menu *menu, const char *help)
 {
-	struct rb_node *first = rb_first(self), *nd = first, *next;
-	int key = 0;
+	int exit_keys[] = { NEWT_KEY_ENTER, NEWT_KEY_RIGHT, 0, };
+	struct perf_evlist *evlist = menu->b.priv;
+	struct perf_evsel *pos;
+	const char *ev_name, *title = "Available samples";
+	int key;
+
+	if (ui_browser__show(&menu->b, title,
+			     "ESC: exit, ENTER|->: Browse histograms") < 0)
+		return -1;
+
+	ui_browser__add_exit_keys(&menu->b, exit_keys);
 
-	while (nd) {
-		struct hists *hists = rb_entry(nd, struct hists, rb_node);
-		const char *ev_name = __event_name(hists->type, hists->config);
+	while (1) {
+		key = ui_browser__run(&menu->b);
 
-		key = hists__browse(hists, help, ev_name);
 		switch (key) {
-		case NEWT_KEY_TAB:
-			next = rb_next(nd);
-			if (next)
-				nd = next;
+		case NEWT_KEY_RIGHT:
+		case NEWT_KEY_ENTER:
+			if (!menu->selection)
+				continue;
+			pos = menu->selection;
+browse_hists:
+			ev_name = event_name(pos);
+			key = perf_evsel__hists_browse(pos, help, ev_name, true);
+			ui_browser__show_title(&menu->b, title);
 			break;
-		case NEWT_KEY_UNTAB:
-			if (nd == first)
+		case NEWT_KEY_LEFT:
+			continue;
+		case NEWT_KEY_ESCAPE:
+			if (!ui__dialog_yesno("Do you really want to exit?"))
 				continue;
-			nd = rb_prev(nd);
+			/* Fall thru */
+		default:
+			goto out;
+		}
+
+		switch (key) {
+		case NEWT_KEY_TAB:
+			if (pos->node.next == &evlist->entries)
+				pos = list_entry(evlist->entries.next, struct perf_evsel, node);
+			else
+				pos = list_entry(pos->node.next, struct perf_evsel, node);
+			goto browse_hists;
+		case NEWT_KEY_UNTAB:
+			if (pos->node.prev == &evlist->entries)
+				pos = list_entry(evlist->entries.prev, struct perf_evsel, node);
+			else
+				pos = list_entry(pos->node.prev, struct perf_evsel, node);
+			goto browse_hists;
+		case 'q':
+		case CTRL('c'):
+			goto out;
 		default:
-			return key;
+			break;
 		}
 	}
 
+out:
+	ui_browser__hide(&menu->b);
 	return key;
 }
+
+static int __perf_evlist__tui_browse_hists(struct perf_evlist *evlist,
+					   const char *help)
+{
+	struct perf_evsel *pos;
+	struct perf_evsel_menu menu = {
+		.b = {
+			.entries = &evlist->entries,
+			.refresh = ui_browser__list_head_refresh,
+			.seek = ui_browser__list_head_seek,
+			.write = perf_evsel_menu__write,
+			.nr_entries = evlist->nr_entries,
+			.priv = evlist,
+		},
+	};
+
+	ui_helpline__push("Press ESC to exit");
+
+	list_for_each_entry(pos, &evlist->entries, node) {
+		const char *ev_name = event_name(pos);
+		size_t line_len = strlen(ev_name) + 7;
+
+		if (menu.b.width < line_len)
+			menu.b.width = line_len;
+		/*
+		 * Cache the evsel name, tracepoints have a _high_ cost per
+		 * event_name() call.
+		 */
+		if (pos->name == NULL)
+			pos->name = strdup(ev_name);
+	}
+
+	return perf_evsel_menu__run(&menu, help);
+}
+
+int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help)
+{
+
+	if (evlist->nr_entries == 1) {
+		struct perf_evsel *first = list_entry(evlist->entries.next,
						      struct perf_evsel, node);
+		const char *ev_name = event_name(first);
+		return perf_evsel__hists_browse(first, help, ev_name, false);
+	}
+
+	return __perf_evlist__tui_browse_hists(evlist, help);
+}
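
The TAB/UNTAB cases in perf_evsel_menu__run() above cycle through the
evsel list circularly: in a kernel-style list the head is a sentinel
with no payload, so stepping past the last element lands on the head
and must be redirected to the first real entry (and symmetrically for
prev). A self-contained sketch of the same traversal with a tiny
list_head clone; the types and names are illustrative, not the perf
ones:

    #include <stdio.h>
    #include <stddef.h>

    struct list_head { struct list_head *next, *prev; };

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct evsel { const char *name; struct list_head node; };

    static struct evsel *evsel_next(struct list_head *head, struct evsel *pos)
    {
        struct list_head *n = pos->node.next;

        if (n == head)      /* wrapped past the sentinel, skip it */
            n = head->next;
        return container_of(n, struct evsel, node);
    }

    int main(void)
    {
        struct list_head head = { &head, &head };
        struct evsel a = { "cycles", { NULL, NULL } };
        struct evsel b = { "instructions", { NULL, NULL } };

        /* wire up head <-> a <-> b <-> head by hand */
        head.next = &a.node; a.node.prev = &head;
        a.node.next = &b.node; b.node.prev = &a.node;
        b.node.next = &head; head.prev = &b.node;

        struct evsel *p = &a;
        for (int i = 0; i < 4; i++) {   /* prints cycles, instructions, ... */
            printf("%s\n", p->name);
            p = evsel_next(&head, p);
        }
        return 0;
    }
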
diff --git a/tools/perf/util/ui/browsers/map.c b/tools/perf/util/ui/browsers/map.c
index e5158369106..8462bffe20b 100644
--- a/tools/perf/util/ui/browsers/map.c
+++ b/tools/perf/util/ui/browsers/map.c
@@ -41,7 +41,7 @@ static int ui_entry__read(const char *title, char *bf, size_t size, int width)
out_free_form:
 	newtPopWindow();
 	newtFormDestroy(form);
-	return 0;
+	return err;
 }
 
 struct map_browser {
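
The map.c hunk is a one-line bug fix: ui_entry__read() computed an
error status but its common exit path returned a hard-coded 0, so
callers could never see a failure. The general shape of the fix,
sketched with hypothetical helpers:

    #include <stdio.h>

    static int do_read(void)  { return -1; } /* pretend the read failed */
    static void cleanup(void) { }

    static int read_entry(void)
    {
        int err = do_read();

        cleanup();  /* teardown runs on success and failure alike */
        return err; /* propagate the status; "return 0" here hid failures */
    }

    int main(void)
    {
        printf("read_entry() = %d\n", read_entry()); /* -1, not 0 */
        return 0;
    }
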
diff --git a/tools/perf/util/ui/browsers/top.c b/tools/perf/util/ui/browsers/top.c
new file mode 100644
index 00000000000..5a06538532a
--- /dev/null
+++ b/tools/perf/util/ui/browsers/top.c
@@ -0,0 +1,213 @@
+/*
+ * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Parts came from builtin-{top,stat,record}.c, see those files for further
+ * copyright notes.
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+#include "../browser.h"
+#include "../../annotate.h"
+#include "../helpline.h"
+#include "../libslang.h"
+#include "../util.h"
+#include "../../evlist.h"
+#include "../../hist.h"
+#include "../../sort.h"
+#include "../../symbol.h"
+#include "../../top.h"
+
+struct perf_top_browser {
+	struct ui_browser b;
+	struct rb_root root;
+	struct sym_entry *selection;
+	float sum_ksamples;
+	int dso_width;
+	int dso_short_width;
+	int sym_width;
+};
+
+static void perf_top_browser__write(struct ui_browser *browser, void *entry, int row)
+{
+	struct perf_top_browser *top_browser = container_of(browser, struct perf_top_browser, b);
+	struct sym_entry *syme = rb_entry(entry, struct sym_entry, rb_node);
+	bool current_entry = ui_browser__is_current_entry(browser, row);
+	struct symbol *symbol = sym_entry__symbol(syme);
+	struct perf_top *top = browser->priv;
+	int width = browser->width;
+	double pcnt;
+
+	pcnt = 100.0 - (100.0 * ((top_browser->sum_ksamples - syme->snap_count) /
+				 top_browser->sum_ksamples));
+	ui_browser__set_percent_color(browser, pcnt, current_entry);
+
+	if (top->evlist->nr_entries == 1 || !top->display_weighted) {
+		slsmg_printf("%20.2f ", syme->weight);
+		width -= 24;
+	} else {
+		slsmg_printf("%9.1f %10ld ", syme->weight, syme->snap_count);
+		width -= 23;
+	}
+
+	slsmg_printf("%4.1f%%", pcnt);
+	width -= 7;
+
+	if (verbose) {
+		slsmg_printf(" %016" PRIx64, symbol->start);
+		width -= 17;
+	}
+
+	slsmg_printf(" %-*.*s ", top_browser->sym_width, top_browser->sym_width,
+		     symbol->name);
+	width -= top_browser->sym_width;
+	slsmg_write_nstring(width >= syme->map->dso->long_name_len ?
+				syme->map->dso->long_name :
+				syme->map->dso->short_name, width);
+
+	if (current_entry)
+		top_browser->selection = syme;
+}
+
+static void perf_top_browser__update_rb_tree(struct perf_top_browser *browser)
+{
+	struct perf_top *top = browser->b.priv;
+	u64 top_idx = browser->b.top_idx;
+
+	browser->root = RB_ROOT;
+	browser->b.top = NULL;
+	browser->sum_ksamples = perf_top__decay_samples(top, &browser->root);
+	/*
+	 * No active symbols
+	 */
+	if (top->rb_entries == 0)
+		return;
+
+	perf_top__find_widths(top, &browser->root, &browser->dso_width,
+			      &browser->dso_short_width,
+			      &browser->sym_width);
+	if (browser->sym_width + browser->dso_width > browser->b.width - 29) {
+		browser->dso_width = browser->dso_short_width;
+		if (browser->sym_width + browser->dso_width > browser->b.width - 29)
+			browser->sym_width = browser->b.width - browser->dso_width - 29;
+	}
+
+	/*
+	 * Adjust the ui_browser indexes since the entries in the browser->root
+	 * rb_tree may have changed, then seek it from start, so that we get a
+	 * possible new top of the screen.
+	 */
+	browser->b.nr_entries = top->rb_entries;
+
+	if (top_idx >= browser->b.nr_entries) {
+		if (browser->b.height >= browser->b.nr_entries)
+			top_idx = browser->b.nr_entries - browser->b.height;
+		else
+			top_idx = 0;
+	}
+
+	if (browser->b.index >= top_idx + browser->b.height)
+		browser->b.index = top_idx + browser->b.index - browser->b.top_idx;
+
+	if (browser->b.index >= browser->b.nr_entries)
+		browser->b.index = browser->b.nr_entries - 1;
+
+	browser->b.top_idx = top_idx;
+	browser->b.seek(&browser->b, top_idx, SEEK_SET);
+}
+
+static void perf_top_browser__annotate(struct perf_top_browser *browser)
+{
+	struct sym_entry *syme = browser->selection;
+	struct symbol *sym = sym_entry__symbol(syme);
+	struct annotation *notes = symbol__annotation(sym);
+	struct perf_top *top = browser->b.priv;
+
+	if (notes->src != NULL)
+		goto do_annotation;
+
+	pthread_mutex_lock(&notes->lock);
+
+	top->sym_filter_entry = NULL;
+
+	if (symbol__alloc_hist(sym, top->evlist->nr_entries) < 0) {
+		pr_err("Not enough memory for annotating '%s' symbol!\n",
+		       sym->name);
+		pthread_mutex_unlock(&notes->lock);
+		return;
+	}
+
+	top->sym_filter_entry = syme;
+
+	pthread_mutex_unlock(&notes->lock);
+do_annotation:
+	symbol__tui_annotate(sym, syme->map, 0, top->delay_secs * 1000);
+}
+
+static int perf_top_browser__run(struct perf_top_browser *browser)
+{
+	int key;
+	char title[160];
+	struct perf_top *top = browser->b.priv;
+	int delay_msecs = top->delay_secs * 1000;
+	int exit_keys[] = { 'a', NEWT_KEY_ENTER, NEWT_KEY_RIGHT, 0, };
+
+	perf_top_browser__update_rb_tree(browser);
+	perf_top__header_snprintf(top, title, sizeof(title));
+	perf_top__reset_sample_counters(top);
+
+	if (ui_browser__show(&browser->b, title,
+			     "ESC: exit, ENTER|->|a: Live Annotate") < 0)
+		return -1;
+
+	newtFormSetTimer(browser->b.form, delay_msecs);
+	ui_browser__add_exit_keys(&browser->b, exit_keys);
+
+	while (1) {
+		key = ui_browser__run(&browser->b);
+
+		switch (key) {
+		case -1:
+			/* FIXME we need to check if it was es.reason == NEWT_EXIT_TIMER */
+			perf_top_browser__update_rb_tree(browser);
+			perf_top__header_snprintf(top, title, sizeof(title));
+			perf_top__reset_sample_counters(top);
+			ui_browser__set_color(&browser->b, NEWT_COLORSET_ROOT);
+			SLsmg_gotorc(0, 0);
+			slsmg_write_nstring(title, browser->b.width);
+			break;
+		case 'a':
+		case NEWT_KEY_RIGHT:
+		case NEWT_KEY_ENTER:
+			if (browser->selection)
+				perf_top_browser__annotate(browser);
+			break;
+		case NEWT_KEY_LEFT:
+			continue;
+		case NEWT_KEY_ESCAPE:
+			if (!ui__dialog_yesno("Do you really want to exit?"))
+				continue;
+			/* Fall thru */
+		default:
+			goto out;
+		}
+	}
+out:
+	ui_browser__hide(&browser->b);
+	return key;
+}
+
+int perf_top__tui_browser(struct perf_top *top)
+{
+	struct perf_top_browser browser = {
+		.b = {
+			.entries = &browser.root,
+			.refresh = ui_browser__rb_tree_refresh,
+			.seek = ui_browser__rb_tree_seek,
+			.write = perf_top_browser__write,
+			.priv = top,
+		},
+	};
+
+	ui_helpline__push("Press <- or ESC to exit");
+	return perf_top_browser__run(&browser);
+}
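
Each browser in the new top.c (and in hists.c above) embeds a struct
ui_browser as a member and recovers its own type inside the write
callback with container_of(), so the generic browser core never needs
to know about concrete browser types. A stand-alone sketch of that
pattern; the structs here are invented for illustration:

    #include <stdio.h>
    #include <stddef.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct ui_browser {
        void (*write)(struct ui_browser *self, int row);
        int width;
    };

    struct top_browser {
        struct ui_browser b; /* embedded base object, not a pointer */
        double sum_samples;
    };

    static void top_browser__write(struct ui_browser *self, int row)
    {
        struct top_browser *tb = container_of(self, struct top_browser, b);

        printf("row %d, width %d, sum %.1f\n", row, self->width,
               tb->sum_samples);
    }

    int main(void)
    {
        struct top_browser tb = {
            .b = { .write = top_browser__write, .width = 80 },
            .sum_samples = 42.0,
        };

        tb.b.write(&tb.b, 0); /* the core only ever sees &tb.b */
        return 0;
    }
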
diff --git a/tools/perf/util/ui/helpline.c b/tools/perf/util/ui/helpline.c
index 8d79daa4458..f36d2ff509e 100644
--- a/tools/perf/util/ui/helpline.c
+++ b/tools/perf/util/ui/helpline.c
@@ -5,6 +5,7 @@
 
 #include "../debug.h"
 #include "helpline.h"
+#include "ui.h"
 
 void ui_helpline__pop(void)
 {
@@ -55,7 +56,8 @@ int ui_helpline__show_help(const char *format, va_list ap)
 	int ret;
 	static int backlog;
 
-	ret = vsnprintf(ui_helpline__last_msg + backlog,
+	pthread_mutex_lock(&ui__lock);
+	ret = vsnprintf(ui_helpline__last_msg + backlog,
 			sizeof(ui_helpline__last_msg) - backlog, format, ap);
 	backlog += ret;
 
@@ -64,6 +66,7 @@ int ui_helpline__show_help(const char *format, va_list ap)
 		newtRefresh();
 		backlog = 0;
 	}
+	pthread_mutex_unlock(&ui__lock);
 
 	return ret;
 }
diff --git a/tools/perf/util/ui/libslang.h b/tools/perf/util/ui/libslang.h
index 5623da8e808..2b63e1c9b18 100644
--- a/tools/perf/util/ui/libslang.h
+++ b/tools/perf/util/ui/libslang.h
@@ -13,11 +13,11 @@
 
 #if SLANG_VERSION < 20104
 #define slsmg_printf(msg, args...) \
-	SLsmg_printf((char *)msg, ##args)
+	SLsmg_printf((char *)(msg), ##args)
 #define slsmg_write_nstring(msg, len) \
-	SLsmg_write_nstring((char *)msg, len)
+	SLsmg_write_nstring((char *)(msg), len)
 #define sltt_set_color(obj, name, fg, bg) \
-	SLtt_set_color(obj,(char *)name, (char *)fg, (char *)bg)
+	SLtt_set_color(obj,(char *)(name), (char *)(fg), (char *)(bg))
 #else
 #define slsmg_printf SLsmg_printf
 #define slsmg_write_nstring SLsmg_write_nstring
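
The libslang.h change just adds parentheses around the macro arguments,
which matters as soon as a caller passes a compound expression: a cast
binds tighter than ?:, so without the parentheses it applies to the
first token only. A small illustrative macro pair (not the slang
wrappers themselves) showing the difference:

    #include <stdio.h>

    #define CAST_FIRST(x) ((char *)x)   /* old form: cast binds too tightly */
    #define CAST_WHOLE(x) ((char *)(x)) /* fixed form from the patch */

    int main(void)
    {
        const void *a = "hello", *b = "world";
        int use_b = 1;

        /* CAST_FIRST(use_b ? b : a) would expand to (char *)use_b ? b : a,
         * casting the int condition to a pointer instead of casting the
         * selected string, silently changing the meaning. */
        printf("%s\n", CAST_WHOLE(use_b ? b : a)); /* prints "world" */
        return 0;
    }
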
diff --git a/tools/perf/util/ui/setup.c b/tools/perf/util/ui/setup.c
index 662085032eb..ee46d671db5 100644
--- a/tools/perf/util/ui/setup.c
+++ b/tools/perf/util/ui/setup.c
@@ -6,6 +6,9 @@
 #include "../debug.h"
 #include "browser.h"
 #include "helpline.h"
+#include "ui.h"
+
+pthread_mutex_t ui__lock = PTHREAD_MUTEX_INITIALIZER;
 
 static void newt_suspend(void *d __used)
 {
@@ -14,11 +17,12 @@ static void newt_suspend(void *d __used)
 	newtResume();
 }
 
-void setup_browser(void)
+void setup_browser(bool fallback_to_pager)
 {
 	if (!isatty(1) || !use_browser || dump_trace) {
 		use_browser = 0;
-		setup_pager();
+		if (fallback_to_pager)
+			setup_pager();
 		return;
 	}
 
diff --git a/tools/perf/util/ui/ui.h b/tools/perf/util/ui/ui.h
new file mode 100644
index 00000000000..d264e059c82
--- /dev/null
+++ b/tools/perf/util/ui/ui.h
@@ -0,0 +1,8 @@
+#ifndef _PERF_UI_H_
+#define _PERF_UI_H_ 1
+
+#include <pthread.h>
+
+extern pthread_mutex_t ui__lock;
+
+#endif /* _PERF_UI_H_ */
diff --git a/tools/perf/util/ui/util.c b/tools/perf/util/ui/util.c
index 7b5a8926624..fdf1fc8f08b 100644
--- a/tools/perf/util/ui/util.c
+++ b/tools/perf/util/ui/util.c
@@ -9,6 +9,7 @@
 #include "../debug.h"
 #include "browser.h"
 #include "helpline.h"
+#include "ui.h"
 #include "util.h"
 
 static void newt_form__set_exit_keys(newtComponent self)
@@ -118,10 +119,12 @@ void ui__warning(const char *format, ...)
 	va_list args;
 
 	va_start(args, format);
-	if (use_browser > 0)
+	if (use_browser > 0) {
+		pthread_mutex_lock(&ui__lock);
 		newtWinMessagev((char *)warning_str, (char *)ok,
 				(char *)format, args);
-	else
+		pthread_mutex_unlock(&ui__lock);
+	} else
 		vfprintf(stderr, format, args);
 	va_end(args);
 }
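
The helpline.c, setup.c and util.c hunks together introduce one global
ui__lock that every screen-touching path (helpline updates, warning
popups) takes, so newt/S-Lang state is never mutated from two threads
at once. That matters now that perf top refreshes the TUI on a timer
while other threads may emit warnings. A stand-alone sketch of the
discipline, with a plain buffer standing in for the screen:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t ui_lock = PTHREAD_MUTEX_INITIALIZER;
    static char screen[64]; /* stand-in for the shared newt screen state */

    static void draw_status(const char *msg)
    {
        pthread_mutex_lock(&ui_lock);
        /* without the lock, concurrent callers could interleave these
         * two steps and corrupt what ends up on screen */
        snprintf(screen, sizeof(screen), "%s", msg);
        printf("redraw: %s\n", screen);
        pthread_mutex_unlock(&ui_lock);
    }

    int main(void)
    {
        draw_status("Press ESC to exit"); /* safe from any thread */
        return 0;
    }
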
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index e833f26f3bf..fc784284ac8 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -70,9 +70,7 @@
 #include <sys/poll.h>
 #include <sys/socket.h>
 #include <sys/ioctl.h>
-#ifndef NO_SYS_SELECT_H
 #include <sys/select.h>
-#endif
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <arpa/inet.h>
@@ -83,10 +81,6 @@
 #include "types.h"
 #include <sys/ttydefaults.h>
 
-#ifndef NO_ICONV
-#include <iconv.h>
-#endif
-
 extern const char *graph_line;
 extern const char *graph_dotted_line;
 extern char buildid_dir[];
@@ -236,26 +230,6 @@ static inline int sane_case(int x, int high)
 	return x;
 }
 
-#ifndef DIR_HAS_BSD_GROUP_SEMANTICS
-# define FORCE_DIR_SET_GID S_ISGID
-#else
-# define FORCE_DIR_SET_GID 0
-#endif
-
-#ifdef NO_NSEC
-#undef USE_NSEC
-#define ST_CTIME_NSEC(st) 0
-#define ST_MTIME_NSEC(st) 0
-#else
-#ifdef USE_ST_TIMESPEC
-#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctimespec.tv_nsec))
-#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtimespec.tv_nsec))
-#else
-#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctim.tv_nsec))
-#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtim.tv_nsec))
-#endif
-#endif
-
 int mkdir_p(char *path, mode_t mode);
 int copyfile(const char *from, const char *to);
 
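
The util.h cleanup drops portability switches (NO_ICONV,
NO_SYS_SELECT_H, the ST_*_NSEC wrappers) that appear to be leftovers
inherited from git's util.h; the ST_*_NSEC wrappers in particular
papered over BSD's st_mtimespec naming versus the POSIX.1-2008 st_mtim
field. Assuming Linux/glibc, as perf does, the field can simply be used
directly, e.g.:

    #include <stdio.h>
    #include <sys/stat.h>

    int main(void)
    {
        struct stat st;

        if (stat("/etc/hostname", &st) == 0)
            printf("mtime: %ld.%09ld\n", /* seconds and nanoseconds */
                   (long)st.st_mtime, (long)st.st_mtim.tv_nsec);
        return 0;
    }
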
diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index e1c62eeb88f..ba7c63af6f3 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -1,6 +1,6 @@
 #!/usr/bin/perl -w
 #
-# Copywrite 2010 - Steven Rostedt <srostedt@redhat.com>, Red Hat Inc.
+# Copyright 2010 - Steven Rostedt <srostedt@redhat.com>, Red Hat Inc.
 # Licensed under the terms of the GNU GPL License version 2
 #
 